Repository: funstory-ai/BabelDOC Branch: main Commit: 34739ea88118 Files: 156 Total size: 1.9 MB Directory structure: gitextract_4xv94fs_/ ├── .cursorignore ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yaml │ │ └── feature_request.yaml │ ├── PULL_REQUEST_TEMPLATE/ │ │ └── pr_form.yml │ ├── PULL_REQUEST_TEMPLATE.md │ ├── dependabot.yml │ ├── labels.yml │ ├── release-drafter.yml │ └── workflows/ │ ├── codeql.yml │ ├── docs.yml │ ├── labeler.yml │ ├── lint.yml │ ├── pr-lint.yml │ ├── publish-to-pypi.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── babeldoc/ │ ├── __init__.py │ ├── assets/ │ │ ├── assets.py │ │ └── embedding_assets_metadata.py │ ├── asynchronize/ │ │ └── __init__.py │ ├── babeldoc_exception/ │ │ ├── BabelDOCException.py │ │ └── __init__.py │ ├── const.py │ ├── docvision/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── base_doclayout.py │ │ ├── doclayout.py │ │ ├── rpc_doclayout.py │ │ ├── rpc_doclayout2.py │ │ ├── rpc_doclayout3.py │ │ ├── rpc_doclayout4.py │ │ ├── rpc_doclayout5.py │ │ ├── rpc_doclayout6.py │ │ ├── rpc_doclayout7.py │ │ └── table_detection/ │ │ └── rapidocr.py │ ├── format/ │ │ ├── __init__.py │ │ └── pdf/ │ │ ├── __init__.py │ │ ├── babelpdf/ │ │ │ ├── base14.py │ │ │ ├── cidfont.py │ │ │ ├── cmap.py │ │ │ ├── encoding.py │ │ │ ├── type3.py │ │ │ ├── utils.py │ │ │ └── win_core.py │ │ ├── converter.py │ │ ├── document_il/ │ │ │ ├── __init__.py │ │ │ ├── backend/ │ │ │ │ ├── __init__.py │ │ │ │ └── pdf_creater.py │ │ │ ├── frontend/ │ │ │ │ ├── __init__.py │ │ │ │ └── il_creater.py │ │ │ ├── il_version_1.py │ │ │ ├── il_version_1.rnc │ │ │ ├── il_version_1.rng │ │ │ ├── il_version_1.xsd │ │ │ ├── midend/ │ │ │ │ ├── __init__.py │ │ │ │ ├── add_debug_information.py │ │ │ │ ├── automatic_term_extractor.py │ │ │ │ ├── detect_scanned_file.py │ │ │ │ ├── il_translator.py │ │ │ │ ├── il_translator_llm_only.py │ │ │ │ ├── layout_parser.py │ │ │ │ ├── paragraph_finder.py │ │ │ │ ├── 
remove_descent.py │ │ │ │ ├── styles_and_formulas.py │ │ │ │ ├── table_parser.py │ │ │ │ └── typesetting.py │ │ │ ├── utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── extract_char.py │ │ │ │ ├── fontmap.py │ │ │ │ ├── formular_helper.py │ │ │ │ ├── layout_helper.py │ │ │ │ ├── matrix_helper.py │ │ │ │ ├── mupdf_helper.py │ │ │ │ ├── paragraph_helper.py │ │ │ │ ├── spatial_analyzer.py │ │ │ │ ├── style_helper.py │ │ │ │ └── zstd_helper.py │ │ │ └── xml_converter.py │ │ ├── high_level.py │ │ ├── pdfinterp.py │ │ ├── result_merger.py │ │ ├── split_manager.py │ │ └── translation_config.py │ ├── glossary.py │ ├── main.py │ ├── pdfminer/ │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── _saslprep.py │ │ ├── arcfour.py │ │ ├── ascii85.py │ │ ├── casting.py │ │ ├── ccitt.py │ │ ├── cmap/ │ │ │ └── README.txt │ │ ├── cmapdb.py │ │ ├── converter.py │ │ ├── data_structures.py │ │ ├── encodingdb.py │ │ ├── fontmetrics.py │ │ ├── glyphlist.py │ │ ├── high_level.py │ │ ├── image.py │ │ ├── jbig2.py │ │ ├── latin_enc.py │ │ ├── layout.py │ │ ├── lzw.py │ │ ├── pdfcolor.py │ │ ├── pdfdevice.py │ │ ├── pdfdocument.py │ │ ├── pdfexceptions.py │ │ ├── pdffont.py │ │ ├── pdfinterp.py │ │ ├── pdfpage.py │ │ ├── pdfparser.py │ │ ├── pdftypes.py │ │ ├── psexceptions.py │ │ ├── psparser.py │ │ ├── py.typed │ │ ├── runlength.py │ │ ├── settings.py │ │ └── utils.py │ ├── progress_monitor.py │ ├── tools/ │ │ ├── generate_cmap_metadata.py │ │ ├── generate_font_metadata.py │ │ ├── italic_assistance.py │ │ └── italic_recognize_tool.py │ ├── translator/ │ │ ├── __init__.py │ │ ├── cache.py │ │ └── translator.py │ └── utils/ │ ├── __init__.py │ ├── atomic_integer.py │ ├── memory.py │ └── priority_thread_pool_executor.py ├── docs/ │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── CONTRIBUTOR_REWARD.md │ ├── ImplementationDetails/ │ │ ├── AsyncTranslate/ │ │ │ └── AsyncTranslate.md │ │ ├── ILTranslator/ │ │ │ └── ILTranslator.md │ │ ├── PDFCreation/ │ │ │ └── PDFCreation.md │ │ ├── PDFParsing/ │ │ │ └── 
PDFParsing.md │ │ ├── ParagraphFinding/ │ │ │ └── ParagraphFinding.md │ │ ├── README.md │ │ ├── StylesAndFormulas/ │ │ │ └── StylesAndFormulas.md │ │ └── Typesetting/ │ │ └── Typesetting.md │ ├── README.md │ ├── deploy.sh │ ├── example/ │ │ └── demo_glossary.csv │ ├── index.md │ ├── intro-to-pdf-object.md │ ├── requirements.txt │ └── supported_languages.md ├── mkdocs.yml ├── pyproject.toml └── tests/ └── test_translation_cache_cleanup.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .cursorignore ================================================ # Project notes and templates xnotes/ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yaml ================================================ name: "🐞 Bug Report" description: Create a report to help us improve labels: ['bug'] body: - type: checkboxes id: checks attributes: label: Before you submit options: - label: I have searched existing issues required: true - label: I spent at least 5 minutes investigating and preparing this report required: true - label: I confirmed this is not caused by a network issue required: true - label: I have fully read and understood the [README](https://github.com/funstory-ai/BabelDOC/blob/main/README.md) required: true - label: I am certain that this issue is with BabelDOC itself and can be reproduced through the BabelDOC cli required: true - label: I have uploaded the original file, or confirmed that this issue is unrelated to the original file required: true - label: I have uploaded the log. required: true - label: I confirm that the latest version of BabelDOC is being used. required: true - label: I am aware that the issue section of this project is only for submitting bugs that are clearly related to the BabelDOC core, with complete reproduction steps and relevant logs attached. Otherwise, issues will be closed directly. 
required: true - type: markdown attributes: value: | Thank you for using **BabelDOC** and helping us improve it! 🙏 Please confirm again that the above checklist items have been carefully executed! (If you have not carefully executed them, the issue will be closed directly without any response) Please also note: - If you are using a downstream project like pdf2zh-next, please submit an issue directly to the downstream application. Only when you confirm that this issue is a problem with the core library of BabelDOC, submit this issue. - The CLI is only used for debugging purposes, we do not provide any technical support for CLI usage. - type: markdown attributes: value: | Please note! Users of immersive translate online services should contact customer service and provide their translation ID. **Feedback related to online services is not handled here.** - type: textarea id: environment attributes: label: Environment description: Provide your system details (required) value: | - OS: - Python: - BabelDOC: render: markdown validations: required: true - type: textarea id: describe attributes: label: Describe the bug description: A clear and concise description of what the bug is. validations: required: true - type: textarea id: reproduce attributes: label: Steps to Reproduce description: Help us reproduce the issue. Issues that do not provide reproduction steps will be closed directly. value: | 1. Go to '...' 2. Click on '...' 3. See error validations: required: false - type: textarea id: expected attributes: label: Expected Behavior description: What did you expect to happen? validations: required: false - type: textarea id: logs attributes: label: Relevant Log Output or Screenshots description: Copy and paste any logs or attach screenshots. This will be formatted automatically. render: text validations: required: false - type: textarea id: pdf attributes: label: Original PDF File description: Upload the input PDF if applicable. 
(Issues related to specific PDFs but without uploaded files will be closed directly.) validations: required: false - type: textarea id: others attributes: label: Additional Context description: Anything else we should know? validations: required: false ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.yaml ================================================ name: "✨ Feature Request" description: Suggest a new idea or improvement for BabelDOC labels: ['enhancement'] body: - type: markdown attributes: value: | Thank you for helping improve **BabelDOC**! Please fill out the form below to suggest a feature. - type: checkboxes id: checks attributes: label: Before you submit options: - label: I have searched existing issues required: true - label: I have fully read and understood the [README](https://github.com/funstory-ai/BabelDOC/blob/main/README.md) required: true - label: This feature is not related to BabelDOC CLI. The CLI is only used for debugging purposes, we do not accept any feature requests related to the CLI. required: true - type: markdown attributes: value: | 如果您想自部署 BabelDOC,请使用 [PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next) 代替。若其功能无法满足,请向 [PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next) 提交功能请求。 If you wish to self-host BabelDOC, please use [PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next) instead. If its features do not meet your needs, please submit a feature request to [PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next). - type: textarea id: describe attributes: label: Is your feature request related to a problem? description: If applicable, describe what problem this feature would solve. placeholder: Ex. I'm always frustrated when ... 
validations: required: false - type: textarea id: solution attributes: label: Describe the solution you'd like description: What would you like to see happen? validations: required: true - type: textarea id: alternatives attributes: label: Describe alternatives you've considered description: Have you thought of other ways to solve this? validations: required: false - type: textarea id: additional attributes: label: Additional context description: Any other context, examples, or screenshots? validations: required: false ================================================ FILE: .github/PULL_REQUEST_TEMPLATE/pr_form.yml ================================================ name: Pull Request description: Submit a pull request to contribute to BabelDOC title: "[PR] " labels: - needs triage body: - type: markdown attributes: value: | ## 👋 Thanks for contributing to **BabelDOC**! Please fill out this form to help us review your pull request effectively. - type: input id: issue attributes: label: Related Issue(s) description: If this pull request closes or is related to one or more issues, list them here (e.g., #37) placeholder: "#37" validations: required: false - type: textarea id: summary attributes: label: Description description: Describe the purpose of this pull request and what was changed. placeholder: | - What does this PR introduce or fix? - What is the motivation behind it? validations: required: true - type: dropdown id: pr_type attributes: label: PR Type description: What kind of change is this? 
multiple: true options: - enhancement - bug - documentation - refactor - test - chore validations: required: true - type: checkboxes id: checklist attributes: label: Contributor Checklist options: - label: I’ve fully read and understood the **[CONTRIBUTING.md](https://funstory-ai.github.io/BabelDOC/CONTRIBUTING/)** guide required: true - label: My changes follow the project’s code style and guidelines required: true - label: I’ve linked the related issue(s) in the description above - label: I’ve updated relevant documentation (if applicable) - label: I’ve added necessary tests (if applicable) - label: All new and existing tests passed locally - label: I understand that due to limited maintainer resources, only small pull requests are accepted. Suggestions with proof-of-concept patches are appreciated, and my patch may be rewritten if necessary. - type: textarea id: testing attributes: label: Testing Instructions description: Provide step-by-step instructions on how to test your changes placeholder: | 1. Run `...` 2. Visit `...` 3. Click `...` 4. Verify `...` validations: required: false - type: textarea id: screenshots attributes: label: Screenshots (if applicable) description: If UI changes were made, please attach before/after screenshots. validations: required: false - type: textarea id: notes attributes: label: Additional Notes description: Anything else the reviewer should know? validations: required: false ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ### PR Title [PR] ### Related Issue(s) ### Motivation and Context ### Summary of Changes ### PR Type - [ ] ✨ Enhancement - [ ] 🐛 Bug Fix - [ ] 📚 Documentation - [ ] 🏗️ Refactor - [ ] 🧪 Test - [ ] 🧹 Chore ### Breaking Changes ### Contributor Checklist - [ ] I have fully read and understood the **[CONTRIBUTING.md](https://funstory-ai.github.io/BabelDOC/CONTRIBUTING/)** guide. 
- [ ] I have performed a self-review of my own code. - [ ] My changes follow the project's code style and guidelines - [ ] I have linked the related issue(s) in the description above (if applicable) - [ ] I have updated relevant documentation (if applicable) - [ ] I have added necessary tests that prove my fix is effective or that my feature works (if applicable) - [ ] All new and existing tests passed locally with my changes - [ ] My changes generate no new warnings or errors - [ ] I understand that due to limited maintainer resources, only small PRs are accepted. Suggestions with proof-of-concept patches are appreciated, and my patch may be rewritten if necessary. ### Testing Instructions ### Screenshots (if applicable) ### Additional Notes ================================================ FILE: .github/dependabot.yml ================================================ version: 2 updates: - package-ecosystem: github-actions directory: "/" schedule: interval: weekly # - package-ecosystem: pip # directory: "/.github/workflows" # schedule: # interval: weekly # - package-ecosystem: pip # directory: "/docs" # schedule: # interval: weekly - package-ecosystem: pip directory: "/" schedule: interval: weekly versioning-strategy: lockfile-only allow: - dependency-type: "all" ================================================ FILE: .github/labels.yml ================================================ --- # Label names are important as they are used by Release Drafter to decide # where to record them in the changelog or whether to skip them. # # The repository labels will be automatically configured using this file and # the GitHub Action https://github.com/marketplace/actions/github-labeler. 
- name: breaking description: Breaking Changes color: "bfd4f2" - name: bug description: Something isn't working color: "d73a4a" - name: build description: Build System and Dependencies color: "bfdadc" - name: ci description: Continuous Integration color: "4a97d6" - name: dependencies description: Pull requests that update a dependency file color: "0366d6" - name: documentation description: Improvements or additions to documentation color: "0075ca" - name: duplicate description: This issue or pull request already exists color: "cfd3d7" - name: enhancement description: New feature or request color: "a2eeef" - name: github_actions description: Pull requests that update Github_actions code color: "000000" - name: good first issue description: Good for newcomers color: "7057ff" - name: help wanted description: Extra attention is needed color: "008672" - name: invalid description: This doesn't seem right color: "e4e669" - name: performance description: Performance color: "016175" - name: python description: Pull requests that update Python code color: "2b67c6" - name: question description: Further information is requested color: "d876e3" - name: refactoring description: Refactoring color: "ef67c4" - name: removal description: Removals and Deprecations color: "9ae7ea" - name: style description: Style color: "c120e5" - name: testing description: Testing color: "b1fc6f" - name: wontfix description: This will not be worked on color: "ffffff" ================================================ FILE: .github/release-drafter.yml ================================================ name-template: 'v$RESOLVED_VERSION' tag-template: 'v$RESOLVED_VERSION' categories: - title: '🚀 Features' labels: - 'feature' - 'enhancement' - title: '🐛 Bug Fixes' labels: - 'fix' - 'bugfix' - 'bug' - title: '🧰 Maintenance' labels: - 'chore' - 'maintenance' - 'refactor' - title: '📝 Documentation' labels: - 'docs' - 'documentation' change-template: '- $TITLE @$AUTHOR (#$NUMBER)' change-title-escapes: '\<*_&' 
# You can add # and @ to disable mentions version-resolver: major: labels: - 'major' minor: labels: - 'minor' patch: labels: - 'patch' default: patch template: | ## Changes $CHANGES ## Contributors $CONTRIBUTORS ================================================ FILE: .github/workflows/codeql.yml ================================================ # For most projects, this workflow file will not need changing; you simply need # to commit it to your repository. # # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. # # ******** NOTE ******** # We have attempted to detect the languages in your repository. Please check # the `language` matrix defined below to confirm you have the correct set of # supported CodeQL languages. # name: "CodeQL Advanced" on: push: pull_request: branches: [ "main" ] schedule: - cron: '36 14 * * 1' jobs: analyze: name: Analyze (${{ matrix.language }}) # Runner size impacts CodeQL analysis time. To learn more, please see: # - https://gh.io/recommended-hardware-resources-for-running-codeql # - https://gh.io/supported-runners-and-hardware-resources # - https://gh.io/using-larger-runners (GitHub.com only) # Consider using larger runners or machines with greater resources for possible analysis time improvements. 
runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} permissions: # required for all workflows security-events: write # required to fetch internal or private CodeQL packs packages: read # only required for workflows in private repositories actions: read contents: read strategy: fail-fast: false matrix: include: - language: python build-mode: none - language: actions # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' # Use `c-cpp` to analyze code written in C, C++ or both # Use 'java-kotlin' to analyze code written in Java, Kotlin or both # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages steps: - name: Checkout repository uses: actions/checkout@v5 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} build-mode: ${{ matrix.build-mode }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. 
# For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs # queries: security-extended,security-and-quality # If the analyze step fails for one of the languages you are analyzing with # "We were unable to automatically build your code", modify the matrix above # to set the build mode to "manual" for that language. Then modify this step # to build your code. # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - if: matrix.build-mode == 'manual' shell: bash run: | echo 'If you are using a "manual" build mode for one or more of the' \ 'languages you are analyzing, replace this with the commands to build' \ 'your code, for example:' echo ' make bootstrap' echo ' make release' exit 1 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v4 with: category: "/language:${{matrix.language}}" ================================================ FILE: .github/workflows/docs.yml ================================================ name: docs on: push: branches: - main permissions: contents: write jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 with: fetch-depth: 0 - name: Configure Git Credentials run: | git config user.name github-actions[bot] git config user.email 41898282+github-actions[bot]@users.noreply.github.com - name: Setup uv with Python 3.12 uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2 with: python-version: "3.12" enable-cache: true cache-dependency-glob: "uv.lock" activate-environment: true - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - uses: actions/cache@v4 with: key: mkdocs-material-${{ env.cache_id }} path: .cache restore-keys: | mkdocs-material- - run: uv sync - run: uv run mkdocs gh-deploy --force 
================================================ FILE: .github/workflows/labeler.yml ================================================ name: Labeler on: push: branches: - 'main' paths: - '.github/labels.yml' - '.github/workflows/labels.yml' pull_request: paths: - '.github/labels.yml' - '.github/workflows/labels.yml' permissions: contents: read issues: write pull-requests: write jobs: labeler: runs-on: ubuntu-latest steps: - name: Check out the repository uses: actions/checkout@v5 - name: Run Labeler uses: crazy-max/ghaction-github-labeler@24d110aa46a59976b8a7f35518cb7f14f434c916 # v5.3.0 with: skip-delete: true dry-run: ${{ github.event_name == 'pull_request' }} github-token: ${{ secrets.GITHUB_TOKEN }} yaml-file: .github/labels.yml exclude: | help* *issue ================================================ FILE: .github/workflows/lint.yml ================================================ name: Lint Code permissions: contents: read pull-requests: write on: [push] jobs: lint: strategy: fail-fast: false runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 - name: Ruff uses: astral-sh/ruff-action@v3 - name: AutoCorrect uses: huacnlee/autocorrect-action@main ================================================ FILE: .github/workflows/pr-lint.yml ================================================ name: Lint Code and Review Dog Report on: [pull_request] permissions: contents: read pull-requests: write jobs: ruff: name: runner / ruff runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 - name: Install Python uses: actions/setup-python@v6 with: python-version: '3.11' - name: Install ruff run: pip install ruff - name: Install reviewdog uses: reviewdog/action-setup@d8edfce3dd5e1ec6978745e801f9c50b5ef80252 # v1.4.0 with: reviewdog_version: latest - name: Run ruff with reviewdog env: REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | ruff check . 
--output-format=rdjson | reviewdog -f=rdjson -reporter=github-pr-review -fail-on-error autocorrect: name: runner / autocorrect runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 - name: AutoCorrect uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3 - name: Report ReviewDog if: failure() uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3 env: REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: reviewdog: true ================================================ FILE: .github/workflows/publish-to-pypi.yml ================================================ name: Release on: push: branches: - main - master permissions: id-token: write contents: write pull-requests: write jobs: check-repository: name: Check if running in main repository runs-on: ubuntu-latest outputs: is_main_repo: ${{ github.repository == 'funstory-ai/BabelDOC' }} steps: - run: echo "Running repository check" build: name: Build distribution 📦 needs: check-repository if: needs.check-repository.outputs.is_main_repo == 'true' runs-on: ubuntu-latest outputs: is_release: ${{ steps.check-version.outputs.tag }} steps: - uses: actions/checkout@v5 with: persist-credentials: true fetch-depth: 2 token: ${{ secrets.GITHUB_TOKEN }} - name: Setup uv with Python 3.12 uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2 with: python-version: "3.12" enable-cache: true cache-dependency-glob: "uv.lock" activate-environment: true - name: Check if there is a parent commit id: check-parent-commit run: | echo "sha=$(git rev-parse --verify --quiet HEAD^)" >> $GITHUB_OUTPUT - name: Detect and tag new version id: check-version if: steps.check-parent-commit.outputs.sha uses: salsify/action-detect-and-tag-new-version@b1778166f13188a9d478e2d1198f993011ba9864 # v2.0.3 with: version-command: | cat pyproject.toml | grep "version = " | head -n 1 | awk -F'"' '{print $2}' - name: Install Dependencies run: | uv sync - name: Bump version 
for developmental release if: "! steps.check-version.outputs.tag" run: | version=$(uv run bumpver update --patch --tag=final --dry 2>&1 | grep "New Version" | awk '{print $NF}') && uv run bumpver update --set-version $version.dev$(date +%s) - name: Build package run: "uv build" - name: Store the distribution packages uses: actions/upload-artifact@v4.6.2 with: name: python-package-distributions path: dist/ publish-to-pypi: name: Publish Python 🐍 distribution 📦 to PyPI if: needs.build.outputs.is_release != '' needs: - check-repository - build runs-on: ubuntu-latest environment: name: pypi url: https://pypi.org/p/BabelDOC permissions: id-token: write steps: - name: Download all the dists uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: python-package-distributions path: dist/ - name: Publish distribution 📦 to PyPI uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 publish-to-testpypi: name: Publish Python 🐍 distribution 📦 to TestPyPI if: needs.build.outputs.is_release == '' needs: - check-repository - build runs-on: ubuntu-latest environment: name: testpypi url: https://test.pypi.org/p/BabelDOC permissions: id-token: write steps: - name: Download all the dists uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: python-package-distributions path: dist/ - name: Publish distribution 📦 to TestPyPI uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 with: repository-url: https://test.pypi.org/legacy/ post-release: name: Post Release Tasks needs: - check-repository - build - publish-to-pypi - publish-to-testpypi if: | always() && needs.check-repository.outputs.is_main_repo == 'true' && (needs.publish-to-pypi.result == 'success' || needs.publish-to-testpypi.result == 'success') runs-on: ubuntu-latest permissions: contents: write pull-requests: write steps: - uses: actions/checkout@v5 with: persist-credentials: true 
fetch-depth: 2 token: ${{ secrets.GITHUB_TOKEN }} - name: Publish the release notes uses: release-drafter/release-drafter@b1476f6e6eb133afa41ed8589daba6dc69b4d3f5 # v6.1.0 with: publish: ${{ needs.build.outputs.is_release != '' }} tag: ${{ needs.build.outputs.is_release }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/test.yml ================================================ name: Run Tests 🧪 on: push: pull_request: branches: ["main"] permissions: contents: read pull-requests: read jobs: test: name: Run Python Tests runs-on: ubuntu-latest strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v5 with: persist-credentials: false - name: Cached Assets id: cache-assets uses: actions/cache@v4.2.0 with: path: ~/.cache/babeldoc key: babeldoc-assets-${{ hashFiles('babeldoc/assets/embedding_assets_metadata.py') }} - name: Setup uv with Python ${{ matrix.python-version }} uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2 with: python-version: ${{ matrix.python-version }} enable-cache: true cache-dependency-glob: "uv.lock" activate-environment: true - name: Warm up cache run: | uv run babeldoc --warmup - name: Run tests env: OPENAI_API_KEY: ${{ secrets.OPENAIAPIKEY }} OPENAI_BASE_URL: ${{ secrets.OPENAIAPIURL }} OPENAI_MODEL: ${{ secrets.OPENAIMODEL }} run: | uv run babeldoc --help uv run babeldoc --openai --files examples/ci/test.pdf --openai-api-key ${{ env.OPENAI_API_KEY }} --openai-base-url ${{ env.OPENAI_BASE_URL }} --openai-model ${{ env.OPENAI_MODEL }} - name: Generate offline assets package run: | uv run babeldoc --generate-offline-assets /tmp/offline_assets - name: Restore offline assets package run: | rm -rf ~/.cache/babeldoc uv run babeldoc --restore-offline-assets /tmp/offline_assets - name: Clean up run: | rm -rf /tmp/offline_assets rm -rf ~/.cache/babeldoc/cache.v1.db rm -rf ~/.cache/babeldoc/working 
================================================ FILE: .gitignore ================================================ # Logs web/logs web/*.log web/npm-debug.log* web/yarn-debug.log* web/yarn-error.log* web/pnpm-debug.log* web/lerna-debug.log* web/node_modules web/dist web/dist-ssr web/*.local memray* **/*.so *.pdf *.docx *.json **/*.pyc .venv .idea *.egg-info .DS_Store .vscode __pycache__ .ruff_cache yadt.toml examples/ /make_gif.py /dist .cache .cursor/rules/_*.mdc /.cursor /xnotes /docs/workflow-rules.md babeldoc/format/txt /profile.svg # uv uv.lock # Claude Code memory file CLAUDE.md /.claude babeldoc/format/playground temp.jpg AGENTS.md ================================================ FILE: .pre-commit-config.yaml ================================================ files: '^.*\.py$' repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.9.5 hooks: # Run the linter. - id: ruff args: [ "--fix", "--ignore=E203,E261,E501,E741,F841" ] # Run the formatter. - id: ruff-format ================================================ FILE: LICENSE ================================================ GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software. A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate. Many developers of free software are heartened and encouraged by the resulting cooperation. However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public. The GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community. It requires the operator of a network server to provide the source code of the modified version running there to the users of that server. Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version. An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals. This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. 
"This License" refers to version 3 of the GNU Affero General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. 
"Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. 
Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. 
This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Remote Network Interaction; Use with the GNU General Public License. Notwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software. This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time. 
Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. BabelDOC is a library for the ultimate document translation solution. Copyright (C) 2024 This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source. For example, if your program is a web application, its interface could display a "Source" link that leads users to an archive of the code. There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements. You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see <https://www.gnu.org/licenses/>. ================================================ FILE: README.md ================================================ PDF scientific paper translation and bilingual comparison library. - **Online Service**: Beta version launched [Immersive Translate - BabelDOC](https://app.immersivetranslate.com/babel-doc/) Free usage quota is available; please refer to the FAQ section on the page for details. - **Self-deployment**: [PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next) support for BabelDOC, available for self-deployment + WebUI with more translation services. - Provides a simple [command line interface](#getting-started). - Provides a [Python API](#python-api). - Mainly designed to be embedded into other programs, but can also be used directly for simple translation tasks. > [!TIP] > > How to use BabelDOC in Zotero > > 1. 
Immersive Translate Pro members can use the [immersive-translate/zotero-immersivetranslate](https://github.com/immersive-translate/zotero-immersivetranslate) plugin > > 2. PDFMathTranslate self-deployed users can use the [guaguastandup/zotero-pdf2zh](https://github.com/guaguastandup/zotero-pdf2zh) plugin [Supported Language](https://funstory-ai.github.io/BabelDOC/supported_languages/) ## Preview
## We are hiring See details: [EN](https://github.com/funstory-ai/jobs) | [ZH](https://github.com/funstory-ai/jobs/blob/main/README_ZH.md) ## Getting Started ### Install from PyPI We recommend using the Tool feature of [uv](https://github.com/astral-sh/uv) to install BabelDOC. 1. First, you need to refer to [uv installation](https://github.com/astral-sh/uv#installation) to install uv and set up the `PATH` environment variable as prompted. 2. Use the following command to install BabelDOC: ```bash uv tool install --python 3.12 BabelDOC babeldoc --help ``` 3. Use the `babeldoc` command. For example: ```bash babeldoc --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here" --files example.pdf # multiple files babeldoc --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here" --files example1.pdf --files example2.pdf ``` ### Install from Source We still recommend using [uv](https://github.com/astral-sh/uv) to manage virtual environments. 1. First, you need to refer to [uv installation](https://github.com/astral-sh/uv#installation) to install uv and set up the `PATH` environment variable as prompted. 2. Use the following command to install BabelDOC: ```bash # clone the project git clone https://github.com/funstory-ai/BabelDOC # enter the project directory cd BabelDOC # install dependencies and run babeldoc uv run babeldoc --help ``` 3. Use the `uv run babeldoc` command. For example: ```bash uv run babeldoc --files example.pdf --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here" # multiple files uv run babeldoc --files example.pdf --files example2.pdf --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here" ``` > [!TIP] > The absolute path is recommended. ## Advanced Options > [!NOTE] > This CLI is mainly for debugging purposes.
Although end users can use this CLI to translate files, we do not provide any technical support for this purpose. > > End users should directly use **Online Service**: Beta version launched [Immersive Translate - BabelDOC](https://app.immersivetranslate.com/babel-doc/) 1000 free pages per month. > > End users who need self-deployment should use [PDFMathTranslate 2.0](https://github.com/PDFMathTranslate/PDFMathTranslate-next) > > If you find that an option is not listed below, it means that this option is a debugging option for maintainers. Please do not use these options. ### Language Options - `--lang-in`, `-li`: Source language code (default: en) - `--lang-out`, `-lo`: Target language code (default: zh) > [!TIP] > Currently, this project mainly focuses on English-to-Chinese translation, and other scenarios have not been tested yet. > > (2025.3.1 update): Basic English target language support has been added, primarily to minimize line breaks within words([0-9A-Za-z]+). > > [HELP WANTED: Collecting word regular expressions for more languages](https://github.com/funstory-ai/BabelDOC/issues/129) ### PDF Processing Options - `--files`: One or more file paths to input PDF documents. - `--pages`, `-p`: Specify pages to translate (e.g., "1,2,1-,-3,3-5"). If not set, translate all pages - `--split-short-lines`: Force split short lines into different paragraphs (may cause poor typesetting & bugs) - `--short-line-split-factor`: Split threshold factor (default: 0.8). 
The actual threshold is the median length of all lines on the current page \* this factor - `--skip-clean`: Skip PDF cleaning step - `--dual-translate-first`: Put translated pages first in dual PDF mode (default: original pages first) - `--disable-rich-text-translate`: Disable rich text translation (may help improve compatibility with some PDFs) - `--enhance-compatibility`: Enable all compatibility enhancement options (equivalent to --skip-clean --dual-translate-first --disable-rich-text-translate) - `--use-alternating-pages-dual`: Use alternating pages mode for dual PDF. When enabled, original and translated pages are arranged in alternate order. When disabled (default), original and translated pages are shown side by side on the same page. - `--watermark-output-mode`: Control watermark output mode: 'watermarked' (default) adds watermark to translated PDF, 'no_watermark' doesn't add watermark, 'both' outputs both versions. - `--max-pages-per-part`: Maximum number of pages per part for split translation. If not set, no splitting will be performed. - `--no-watermark`: [DEPRECATED] Use --watermark-output-mode=no_watermark instead. - `--translate-table-text`: Translate table text (experimental, default: False) - `--formular-font-pattern`: Font pattern to identify formula text (default: None) - `--formular-char-pattern`: Character pattern to identify formula text (default: None) - `--show-char-box`: Show character bounding boxes (debug only, default: False) - `--skip-scanned-detection`: Skip scanned document detection (default: False). When using split translation, only the first part performs detection if not skipped. - `--ocr-workaround`: Use OCR workaround (default: False). Only suitable for documents with black text on white background. When enabled, white rectangular blocks will be added below the translation to cover the original text content, and all text will be forced to black color. 
- `--auto-enable-ocr-workaround`: Enable automatic OCR workaround (default: False). If a document is detected as heavily scanned, this will attempt to enable OCR processing and skip further scan detection. See "Important Interaction Note" below for crucial details on how this interacts with `--ocr-workaround` and `--skip-scanned-detection`. - `--primary-font-family`: Override primary font family for translated text. Choices: 'serif' for serif fonts, 'sans-serif' for sans-serif fonts, 'script' for script/italic fonts. If not specified, uses automatic font selection based on original text properties. - `--only-include-translated-page`: Only include translated pages in the output PDF. This option is only effective when `--pages` is used. (default: False) - `--merge-alternating-line-numbers`: Enable post-processing to merge alternating line-number layouts (keep the number paragraph as an independent paragraph b; merge adjacent text paragraphs a and c across it when `layout_id` and `xobj_id` match, digits are ASCII and spaces only). Default: off. - `--skip-form-render`: Skip form rendering (default: False). When enabled, PDF forms will not be rendered in the output. - `--skip-curve-render`: Skip curve rendering (default: False). When enabled, PDF curves will not be rendered in the output. - `--only-parse-generate-pdf`: Only parse PDF and generate output PDF without translation (default: False). This skips all translation-related processing including layout analysis, paragraph finding, style processing, and translation itself. Useful for testing PDF parsing and reconstruction functionality. - `--remove-non-formula-lines`: Remove non-formula lines from paragraph areas (default: False). This removes decorative lines that are not part of formulas, while protecting lines in figure/table areas. Useful for cleaning up documents with decorative elements that interfere with text flow. 
- `--non-formula-line-iou-threshold`: IoU threshold for detecting paragraph overlap when removing non-formula lines (default: 0.9). Higher values are more conservative and will remove fewer lines. - `--figure-table-protection-threshold`: IoU threshold for protecting lines in figure/table areas when removing non-formula lines (default: 0.9). Higher values provide more protection for structural elements in figures and tables. - `--rpc-doclayout`: RPC service host address for document layout analysis (default: None) - `--working-dir`: Working directory for translation. If not set, use temp directory. - `--no-auto-extract-glossary`: Disable automatic term extraction. If this flag is present, the step is skipped. Defaults to enabled. - `--save-auto-extracted-glossary`: Save automatically extracted glossary to the specified file. If not set, the glossary will not be saved. > [!TIP] > - Both `--skip-clean` and `--dual-translate-first` may help improve compatibility with some PDF readers > - `--disable-rich-text-translate` can also help with compatibility by simplifying translation input > - However, using `--skip-clean` will result in larger file sizes > - If you encounter any compatibility issues, try using `--enhance-compatibility` first > - Use `--max-pages-per-part` for large documents to split them into smaller parts for translation and automatically merge them back. > - Use `--skip-scanned-detection` to speed up processing when you know your document is not a scanned PDF. > - Use `--ocr-workaround` to fill background for scanned PDF. 
(Current assumption: background is pure white, text is pure black, this option will also auto enable `--skip-scanned-detection`) ### Translation Service Options - `--qps`: QPS (Queries Per Second) limit for translation service (default: 4) - `--ignore-cache`: Ignore translation cache and force retranslation - `--no-dual`: Do not output bilingual PDF files - `--no-mono`: Do not output monolingual PDF files - `--min-text-length`: Minimum text length to translate (default: 5) - `--openai`: Use OpenAI for translation (default: False) - `--custom-system-prompt`: Custom system prompt for translation. - `--add-formula-placehold-hint`: Add formula placeholder hint for translation. (Currently not recommended, it may affect translation quality, default: False) - `--disable-same-text-fallback`: Disable fallback translation when LLM output matches input text. (default: False) - `--pool-max-workers`: Maximum number of worker threads for internal task processing pools. If not specified, defaults to QPS value. This parameter directly sets the worker count, replacing previous QPS-based dynamic calculations. - `--no-auto-extract-glossary`: Disable automatic term extraction. If this flag is present, the step is skipped. Defaults to enabled. > [!TIP] > > 1. Currently, only OpenAI-compatible LLM is supported. For more translator support, please use [PDFMathTranslate 2.0](https://github.com/PDFMathTranslate/PDFMathTranslate-next). > 2. It is recommended to use models with strong compatibility with OpenAI, such as: `glm-4-flash`, `deepseek-chat`, etc. > 3. Currently, it has not been optimized for traditional translation engines like Bing/Google, it is recommended to use LLMs. > 4. You can use [litellm](https://github.com/BerriAI/litellm) to access multiple models. > 5. `--custom-system-prompt`: It is mainly used to add the `/no_think` instruction of Qwen 3 in the prompt. 
For example: `--custom-system-prompt "/no_think You are a professional, authentic machine translation engine."` ### OpenAI Specific Options - `--openai-model`: OpenAI model to use (default: gpt-4o-mini) - `--openai-base-url`: Base URL for OpenAI API - `--openai-api-key`: API key for OpenAI service - `--enable-json-mode-if-requested`: Enable JSON mode for OpenAI requests (default: False) - `--term-pool-max-workers`: Maximum number of worker threads dedicated to automatic term extraction. If not specified, this defaults to the value of `--pool-max-workers`, which itself defaults to the QPS value when unset. > [!TIP] > > 1. This tool supports any OpenAI-compatible API endpoints. Just set the correct base URL and API key. (e.g. `https://xxx.custom.xxx/v1`) > 2. For local models like Ollama, you can use any value as the API key (e.g. `--openai-api-key a`). ### Glossary Options - `--glossary-files`: Comma-separated paths to glossary CSV files. - Each CSV file should have the columns: `source`, `target`, and an optional `tgt_lng`. - The `source` column contains the term in the original language. - The `target` column contains the term in the target language. - The `tgt_lng` column (optional) specifies the target language for that specific entry (e.g., "zh-CN", "en-US"). - If `tgt_lng` is provided for an entry, that entry will only be loaded and used if its (normalized) `tgt_lng` matches the (normalized) overall target language specified by `--lang-out`. Normalization involves lowercasing and replacing hyphens (`-`) with underscores (`_`). - If `tgt_lng` is omitted for an entry, that entry is considered applicable for any `--lang-out`. - The name of each glossary (used in LLM prompts) is derived from its filename (without the .csv extension). - During translation, the system will check the input text against the loaded glossaries. 
If terms from a glossary are found in the current text segment, that glossary (with the relevant terms) will be included in the prompt to the language model, along with an instruction to adhere to it. ### Output Control - `--output`, `-o`: Output directory for translated files. If not set, use current working directory. - `--debug`: Enable debug logging level and export detailed intermediate results in `~/.cache/yadt/working`. - `--report-interval`: Progress report interval in seconds (default: 0.1). ### General Options - `--warmup`: Only download and verify required assets then exit (default: False) ### Offline Assets Management - `--generate-offline-assets`: Generate an offline assets package in the specified directory. This creates a zip file containing all required models and fonts. - `--restore-offline-assets`: Restore an offline assets package from the specified file. This extracts models and fonts from a previously generated package. > [!TIP] > > 1. Offline assets packages are useful for environments without internet access or to speed up installation on multiple machines. > 2. Generate a package once with `babeldoc --generate-offline-assets /path/to/output/dir` and then distribute it. > 3. Restore the package on target machines with `babeldoc --restore-offline-assets /path/to/offline_assets_*.zip`. > 4. The offline assets package name cannot be modified because the file list hash is encoded in the name. > 5. If you provide a directory path to `--restore-offline-assets`, the tool will automatically look for the correct offline assets package file in that directory. > 6. The package contains all necessary fonts and models required for document processing, ensuring consistent results across different environments. > 7. The integrity of all assets is verified using SHA3-256 hashes during both packaging and restoration. > 8. If you're deploying in an air-gapped environment, make sure to generate the package on a machine with internet access first. 
### Configuration File - `--config`, `-c`: Configuration file path. Use the TOML format. Example Configuration: ```toml [babeldoc] # Basic settings debug = true lang-in = "en-US" lang-out = "zh-CN" qps = 10 output = "/path/to/output/dir" # PDF processing options split-short-lines = false short-line-split-factor = 0.8 skip-clean = false dual-translate-first = false disable-rich-text-translate = false use-alternating-pages-dual = false watermark-output-mode = "watermarked" # Choices: "watermarked", "no_watermark", "both" max-pages-per-part = 50 # Automatically split the document for translation and merge it back. only_include_translated_page = false # Only include translated pages in the output PDF. Effective only when `pages` is used. # no-watermark = false # DEPRECATED: Use watermark-output-mode instead skip-scanned-detection = false # Skip scanned document detection for faster processing auto_extract_glossary = true # Set to false to disable automatic term extraction formular_font_pattern = "" # Font pattern for formula text formular_char_pattern = "" # Character pattern for formula text show_char_box = false # Show character bounding boxes (debug) ocr_workaround = false # Use OCR workaround for scanned PDFs rpc_doclayout = "" # RPC service host for document layout analysis working_dir = "" # Working directory for translation auto_enable_ocr_workaround = false # Enable automatic OCR workaround for scanned PDFs. See docs for interaction with ocr_workaround and skip_scanned_detection. 
skip_form_render = false # Skip form rendering (default: False) skip_curve_render = false # Skip curve rendering (default: False) only_parse_generate_pdf = false # Only parse PDF and generate output PDF without translation (default: False) remove_non_formula_lines = false # Remove non-formula lines from paragraph areas (default: False) non_formula_line_iou_threshold = 0.2 # IoU threshold for paragraph overlap detection (default: 0.2) figure_table_protection_threshold = 0.3 # IoU threshold for figure/table protection (default: 0.3) # Translation service openai = true openai-model = "gpt-4o-mini" openai-base-url = "https://api.openai.com/v1" openai-api-key = "your-api-key-here" enable-json-mode-if-requested = false # Enable JSON mode when requested (default: false) disable_same_text_fallback = false # Disable fallback translation when LLM output matches input text (default: false) pool-max-workers = 8 # Maximum worker threads for task processing (defaults to QPS value if not set) # Glossary Options (Optional) # glossary-files = "/path/to/glossary1.csv,/path/to/glossary2.csv" # Output control no-dual = false no-mono = false min-text-length = 5 report-interval = 0.5 # Offline assets management # Uncomment one of these options as needed: # generate-offline-assets = "/path/to/output/dir" # restore-offline-assets = "/path/to/offline_assets_package.zip" ``` ## Python API The current recommended way to call BabelDOC in Python is to call the `high_level.do_translate_async_stream` function of [pdf2zh next](https://github.com/PDFMathTranslate/PDFMathTranslate-next). 
> [!WARNING] > **All APIs of BabelDOC should be considered as internal APIs, and any direct use of BabelDOC is not supported.** ## Background There are a lot of projects and teams working to make document editing and translation easier, such as: - [mathpix](https://mathpix.com/) - [Doc2X](https://doc2x.noedgeai.com/) - [minerU](https://github.com/opendatalab/MinerU) - [PDFMathTranslate](https://github.com/funstory-ai/yadt) There are also some solutions that solve specific parts of the problem, such as: - [layoutreader](https://github.com/microsoft/unilm/tree/master/layoutreader): the reading order of the text blocks in a PDF - [Surya](https://github.com/surya-is/surya): the structure of the PDF This project hopes to promote a standard pipeline and interface to solve the problem. In fact, there are two main stages of a PDF parser or translator: - **Parsing**: The parsing stage extracts the structure of the PDF, such as text blocks, images, tables, etc. - **Rendering**: The rendering stage renders that structure into a new PDF or another format. A service like mathpix parses the PDF into a structure (possibly in an XML format) and then renders it in a single-column reading order, as [layoutreader](https://github.com/microsoft/unilm/tree/master/layoutreader) does. The bad news is that the original structure is lost. Some people use Adobe PDF Parser because it generates a Word document and keeps the original structure, but it is somewhat expensive. And, as you know, a PDF or Word document is not a good format for reading on mobile devices. We offer an intermediate representation of the parser's results that can be rendered into a new PDF or another format. The pipeline is also a plugin-based system to which everybody can add their own model, OCR, renderer, etc. ## Roadmap - [ ] Add line support - [ ] Add table support - [ ] Add cross-page/cross-column paragraph support - [ ] More advanced typesetting features - [ ] Outline support - [ ] ...
Our first 1.0 version goal is to finish a translation from [PDF Reference, Version 1.7](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf) to the following language version: - Simplified Chinese - Traditional Chinese - Japanese - Spanish And meet the following requirements: - layout error less than 1% - content loss less than 1% ## Version Number Explanation This project uses a combination of [Semantic Versioning](https://semver.org/) and [Pride Versioning](https://pridever.org/). The version number format is: "0.MAJOR.MINOR". > [!NOTE] > > The API compatibility here mainly refers to the compatibility with [pdf2zh_next](https://github.com/PDFMathTranslate/PDFMathTranslate-next). - MAJOR: Incremented by 1 when API incompatible changes are made or when proud improvements are implemented. - MINOR: Incremented by 1 when any API compatible changes are made. ## Known Issues 1. Parsing errors in the author and reference sections; they get merged into one paragraph after translation. 2. Lines are not supported. 3. Does not support drop caps. 4. Large pages will be skipped. ## How to Contribute We encourage you to contribute to YADT! Please check out the [CONTRIBUTING](https://github.com/funstory-ai/yadt/blob/main/docs/CONTRIBUTING.md) guide. Everyone interacting in YADT and its sub-projects' codebases, issue trackers, chat rooms, and mailing lists is expected to follow the YADT [Code of Conduct](https://github.com/funstory-ai/yadt/blob/main/docs/CODE_OF_CONDUCT.md). 
[Immersive Translation](https://immersivetranslate.com) sponsors monthly Pro membership redemption codes for active contributors to this project, see details at: [CONTRIBUTOR_REWARD.md](https://github.com/funstory-ai/BabelDOC/blob/main/docs/CONTRIBUTOR_REWARD.md) ## Acknowledgements - [PDFMathTranslate](https://github.com/Byaidu/PDFMathTranslate) - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) - [pdfminer](https://github.com/pdfminer/pdfminer.six) - [PyMuPDF](https://github.com/pymupdf/PyMuPDF) - [Asynchronize](https://github.com/multimeric/Asynchronize/tree/master?tab=readme-ov-file) - [PriorityThreadPoolExecutor](https://github.com/oleglpts/PriorityThreadPoolExecutor)

Star History

Star History Chart > [!WARNING] > **Important Interaction Note for `--auto-enable-ocr-workaround`:** > > When `--auto-enable-ocr-workaround` is set to `true` (either via command line or config file): > > 1. During the initial setup, the values for `ocr_workaround` and `skip_scanned_detection` will be forced to `false` by `TranslationConfig`, regardless of whether you also set `--ocr-workaround` or `--skip-scanned-detection` flags. > 2. Then, during the scanned document detection phase (`DetectScannedFile` stage): > * If the document is identified as heavily scanned (e.g., >80% scanned pages) AND `auto_enable_ocr_workaround` is `true` (i.e., `translation_config.auto_enable_ocr_workaround` is true), the system will then attempt to set both `ocr_workaround` to `true` and `skip_scanned_detection` to `true`. > > This means that `--auto-enable-ocr-workaround` effectively gives the system control to enable OCR processing for scanned documents, potentially overriding manual settings for `--ocr-workaround` and `--skip-scanned-detection` based on its detection results. If the document is *not* detected as heavily scanned, then the initial `false` values for `ocr_workaround` and `skip_scanned_detection` (forced by `--auto-enable-ocr-workaround` at the `TranslationConfig` initialization stage) will remain in effect unless changed by other logic.
================================================ FILE: babeldoc/__init__.py ================================================ __version__ = "0.5.23" ================================================ FILE: babeldoc/assets/assets.py ================================================ import asyncio import hashlib import json import logging import threading import zipfile from pathlib import Path import httpx from babeldoc.assets import embedding_assets_metadata from babeldoc.assets.embedding_assets_metadata import CMAP_METADATA from babeldoc.assets.embedding_assets_metadata import CMAP_URL_BY_UPSTREAM from babeldoc.assets.embedding_assets_metadata import DOC_LAYOUT_ONNX_MODEL_URL from babeldoc.assets.embedding_assets_metadata import ( DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256, ) from babeldoc.assets.embedding_assets_metadata import EMBEDDING_FONT_METADATA from babeldoc.assets.embedding_assets_metadata import FONT_METADATA_URL from babeldoc.assets.embedding_assets_metadata import FONT_URL_BY_UPSTREAM from babeldoc.assets.embedding_assets_metadata import ( TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256, ) from babeldoc.assets.embedding_assets_metadata import TABLE_DETECTION_RAPIDOCR_MODEL_URL from babeldoc.assets.embedding_assets_metadata import TIKTOKEN_CACHES from babeldoc.const import get_cache_file_path from tenacity import retry from tenacity import stop_after_attempt from tenacity import wait_exponential logger = logging.getLogger(__name__) _FASTEST_FONT_UPSTREAM_LOCK = asyncio.Lock() _FASTEST_FONT_UPSTREAM: str | None = None _FASTEST_FONT_METADATA: dict | None = None class ResultContainer: def __init__(self): self.result = None def set_result(self, result): self.result = result def run_in_another_thread(coro): result_container = ResultContainer() def _wrapper(): result_container.set_result(asyncio.run(coro)) thread = threading.Thread(target=_wrapper) thread.start() thread.join() return result_container.result def run_coro(coro): return run_in_another_thread(coro) 
def _retry_if_not_cancelled_and_failed(retry_state): """Only retry if the exception is not CancelledError and the attempt failed.""" if retry_state.outcome.failed: exception = retry_state.outcome.exception() # Don't retry on CancelledError if isinstance(exception, asyncio.CancelledError): logger.debug("Operation was cancelled, not retrying") return False # Retry on network related errors if isinstance( exception, httpx.HTTPError | ConnectionError | ValueError | TimeoutError ): logger.warning(f"Network error occurred: {exception}, will retry") return True # Don't retry on success return False def verify_file(path: Path, sha3_256: str): if not path.exists(): return False hash_ = hashlib.sha3_256() with path.open("rb") as f: while True: chunk = f.read(1024 * 1024) if not chunk: break hash_.update(chunk) return hash_.hexdigest() == sha3_256 @retry( retry=_retry_if_not_cancelled_and_failed, stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=15), before_sleep=lambda retry_state: logger.warning( f"Download file failed, retrying in {retry_state.next_action.sleep} seconds... " f"(Attempt {retry_state.attempt_number}/3)" ), ) async def download_file( client: httpx.AsyncClient | None = None, url: str = None, path: Path = None, sha3_256: str = None, ): if client is None: async with httpx.AsyncClient() as client: response = await client.get(url, follow_redirects=True) else: response = await client.get(url, follow_redirects=True) response.raise_for_status() with path.open("wb") as f: f.write(response.content) if not verify_file(path, sha3_256): path.unlink(missing_ok=True) raise ValueError(f"File {path} is corrupted") @retry( retry=_retry_if_not_cancelled_and_failed, stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=15), before_sleep=lambda retry_state: logger.warning( f"Get font metadata failed, retrying in {retry_state.next_action.sleep} seconds... 
" f"(Attempt {retry_state.attempt_number}/3)" ), ) async def get_font_metadata( client: httpx.AsyncClient | None = None, upstream: str = None ): if upstream not in FONT_METADATA_URL: logger.critical(f"Invalid upstream: {upstream}") exit(1) if client is None: async with httpx.AsyncClient() as client: response = await client.get( FONT_METADATA_URL[upstream], follow_redirects=True ) else: response = await client.get(FONT_METADATA_URL[upstream], follow_redirects=True) response.raise_for_status() logger.debug(f"Get font metadata from {upstream} success") return upstream, response.json() async def _get_fastest_upstream_for_font_internal( client: httpx.AsyncClient | None = None, exclude_upstream: list[str] | None = None ) -> tuple[str | None, dict | None]: """Find the fastest upstream for font metadata without using cached result.""" tasks: list[asyncio.Task[tuple[str, dict]]] = [] for upstream in FONT_METADATA_URL: if exclude_upstream and upstream in exclude_upstream: continue tasks.append(asyncio.create_task(get_font_metadata(client, upstream))) for future in asyncio.as_completed(tasks): try: result = await future for task in tasks: if not task.done(): task.cancel() return result except Exception as e: logger.exception(f"Error getting font metadata: {e}") logger.error("All upstreams failed") return None, None async def get_fastest_upstream_for_font( client: httpx.AsyncClient | None = None, exclude_upstream: list[str] | None = None ) -> tuple[str | None, dict | None]: """Get the fastest upstream for font metadata with cached result. The cached upstream is only used when exclude_upstream is None. """ global _FASTEST_FONT_UPSTREAM, _FASTEST_FONT_METADATA if exclude_upstream is None and _FASTEST_FONT_UPSTREAM is not None: return _FASTEST_FONT_UPSTREAM, _FASTEST_FONT_METADATA if exclude_upstream is not None: # Do not use or update cache when exclude_upstream is provided. 
return await _get_fastest_upstream_for_font_internal(client, exclude_upstream) async with _FASTEST_FONT_UPSTREAM_LOCK: if _FASTEST_FONT_UPSTREAM is not None: return _FASTEST_FONT_UPSTREAM, _FASTEST_FONT_METADATA upstream, metadata = await _get_fastest_upstream_for_font_internal(client) if upstream is not None: _FASTEST_FONT_UPSTREAM = upstream _FASTEST_FONT_METADATA = metadata logger.info(f"Fastest font upstream determined: {upstream}") return upstream, metadata async def get_fastest_upstream_for_model(client: httpx.AsyncClient | None = None): return await get_fastest_upstream_for_font(client, exclude_upstream=["github"]) async def get_fastest_upstream(client: httpx.AsyncClient | None = None): ( fastest_upstream_for_font, online_font_metadata, ) = await get_fastest_upstream_for_font(client) if fastest_upstream_for_font is None: logger.error("Failed to get fastest upstream") exit(1) if fastest_upstream_for_font == "github": # since github is only store font, we need to get the fastest upstream for model fastest_upstream_for_model, _ = await get_fastest_upstream_for_model(client) if fastest_upstream_for_model is None: logger.error("Failed to get fastest upstream") exit(1) else: fastest_upstream_for_model = fastest_upstream_for_font return online_font_metadata, fastest_upstream_for_font, fastest_upstream_for_model async def get_doclayout_onnx_model_path_async(client: httpx.AsyncClient | None = None): onnx_path = get_cache_file_path( "doclayout_yolo_docstructbench_imgsz1024.onnx", "models" ) if verify_file(onnx_path, DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256): return onnx_path logger.info("doclayout onnx model not found or corrupted, downloading...") fastest_upstream, _ = await get_fastest_upstream_for_model(client) if fastest_upstream is None: logger.error("Failed to get fastest upstream") exit(1) url = DOC_LAYOUT_ONNX_MODEL_URL[fastest_upstream] await download_file( client, url, onnx_path, DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256 ) 
async def get_table_detection_rapidocr_model_path_async(
    client: httpx.AsyncClient | None = None,
):
    """Return the cached RapidOCR table-detection model, downloading it if needed.

    Args:
        client: Optional shared HTTP client used for the upstream probe and
            the download.

    Returns:
        Path of ``ch_PP-OCRv4_det_infer.onnx`` inside the ``models`` cache.

    Raises:
        SystemExit: With status 1 when no upstream could be determined.
    """
    onnx_path = get_cache_file_path("ch_PP-OCRv4_det_infer.onnx", "models")
    if verify_file(onnx_path, TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256):
        return onnx_path

    logger.info("table detection rapidocr model not found or corrupted, downloading...")
    fastest_upstream, _ = await get_fastest_upstream_for_model(client)
    if fastest_upstream is None:
        logger.error("Failed to get fastest upstream")
        # ``exit`` comes from the ``site`` module and may be missing; raise directly.
        raise SystemExit(1)

    url = TABLE_DETECTION_RAPIDOCR_MODEL_URL[fastest_upstream]
    await download_file(client, url, onnx_path, TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256)
    logger.info(
        f"Download table detection rapidocr model from {fastest_upstream} success"
    )
    return onnx_path


def get_doclayout_onnx_model_path():
    """Synchronous wrapper around ``get_doclayout_onnx_model_path_async``."""
    return run_coro(get_doclayout_onnx_model_path_async())


def get_table_detection_rapidocr_model_path():
    """Synchronous wrapper around ``get_table_detection_rapidocr_model_path_async``."""
    return run_coro(get_table_detection_rapidocr_model_path_async())


def get_font_url_by_name_and_upstream(font_file_name: str, upstream: str):
    """Build the download URL of a font file on the given upstream.

    Args:
        font_file_name: File name of the font.
        upstream: Key into ``FONT_URL_BY_UPSTREAM``.

    Returns:
        The URL produced by the upstream's URL builder.

    Raises:
        SystemExit: With status 1 when ``upstream`` is not a known upstream.
    """
    if upstream not in FONT_URL_BY_UPSTREAM:
        logger.critical(f"Invalid upstream: {upstream}")
        raise SystemExit(1)

    return FONT_URL_BY_UPSTREAM[upstream](font_file_name)


async def get_font_and_metadata_async(
    font_file_name: str,
    client: httpx.AsyncClient | None = None,
    fastest_upstream: str | None = None,
    font_metadata: dict | None = None,
):
    """Return ``(cache_file_path, metadata)`` for a font, downloading it if needed.

    The cached file is first checked against the embedded metadata. If that
    fails, the online metadata is consulted (fetched here unless the caller
    supplied ``fastest_upstream`` together with ``font_metadata``), the cache
    is re-checked against the online hash, and only then is the font
    downloaded.

    Args:
        font_file_name: File name of the font.
        client: Optional shared HTTP client.
        fastest_upstream: Upstream to download from; resolved here when None.
        font_metadata: Online font metadata keyed by file name. Replaced by
            the freshly fetched metadata when ``fastest_upstream`` is None.

    Returns:
        Tuple of the cached font path and that font's metadata entry.

    Raises:
        SystemExit: With status 1 when no upstream or metadata is available,
            or when the font (or its ``sha3_256``) is missing from the
            metadata.
    """
    cache_file_path = get_cache_file_path(font_file_name, "fonts")
    if font_file_name in EMBEDDING_FONT_METADATA and verify_file(
        cache_file_path, EMBEDDING_FONT_METADATA[font_file_name]["sha3_256"]
    ):
        return cache_file_path, EMBEDDING_FONT_METADATA[font_file_name]

    logger.info(f"Font {cache_file_path} not found or corrupted, downloading...")
    if fastest_upstream is None:
        fastest_upstream, font_metadata = await get_fastest_upstream_for_font(client)
        if fastest_upstream is None:
            logger.critical("Failed to get fastest upstream")
            raise SystemExit(1)

    # Validate explicitly: an ``assert`` is stripped under ``python -O``.
    if font_metadata is None:
        logger.critical(f"No font metadata available for upstream {fastest_upstream}")
        raise SystemExit(1)

    # Check the entry and its hash before indexing them, for caller-supplied
    # metadata as well, so a missing font is reported instead of a KeyError.
    if (
        font_file_name not in font_metadata
        or "sha3_256" not in font_metadata[font_file_name]
    ):
        logger.critical(f"Font {font_file_name} not found in {font_metadata}")
        raise SystemExit(1)

    font_entry = font_metadata[font_file_name]
    if verify_file(cache_file_path, font_entry["sha3_256"]):
        return cache_file_path, font_entry

    logger.info(f"download {font_file_name} from {fastest_upstream}")
    url = get_font_url_by_name_and_upstream(font_file_name, fastest_upstream)
    await download_file(client, url, cache_file_path, font_entry["sha3_256"])
    return cache_file_path, font_entry


def get_font_and_metadata(font_file_name: str):
    """Synchronous wrapper around ``get_font_and_metadata_async``."""
    return run_coro(get_font_and_metadata_async(font_file_name))


async def get_cmap_file_path_async(
    name: str, client: httpx.AsyncClient | None = None
) -> Path:
    """Get cached cmap file path, downloading it if necessary.

    Args:
        name: CMap name, with or without the ``.json`` suffix.
        client: Optional shared HTTP client.

    Returns:
        Path of the verified cmap file inside the ``cmap`` cache.

    Raises:
        SystemExit: With status 1 when the cmap is unknown or the downloaded
            file fails verification.
    """
    file_name = name if name.endswith(".json") else f"{name}.json"

    if file_name not in CMAP_METADATA:
        logger.critical(f"CMap {file_name} not found in CMAP_METADATA")
        raise SystemExit(1)

    meta = CMAP_METADATA[file_name]
    cache_file_path = get_cache_file_path(file_name, "cmap")
    if verify_file(cache_file_path, meta["sha3_256"]):
        return cache_file_path

    logger.info(f"CMap {cache_file_path} not found or corrupted, downloading...")
    await download_cmap_file_async(file_name, client)
    if not verify_file(cache_file_path, meta["sha3_256"]):
        logger.critical(f"Failed to verify downloaded cmap file: {cache_file_path}")
        raise SystemExit(1)

    return cache_file_path
async def get_cmap_data_async(
    name: str, client: httpx.AsyncClient | None = None
) -> dict:
    """Load cmap json data from cached file, downloading it if necessary."""
    path = await get_cmap_file_path_async(name, client)
    # Decode as UTF-8 explicitly; the default depends on the platform locale.
    return json.loads(path.read_text(encoding="utf-8"))


def get_cmap_file_path(name: str):
    """Synchronous wrapper around ``get_cmap_file_path_async``."""
    return run_coro(get_cmap_file_path_async(name))


def get_cmap_data(name: str):
    """Synchronous wrapper around ``get_cmap_data_async``."""
    return run_coro(get_cmap_data_async(name))


def get_font_family(lang_code: str):
    """Return ``embedding_assets_metadata.get_font_family(lang_code)``."""
    return embedding_assets_metadata.get_font_family(lang_code)


async def download_all_fonts_async(client: httpx.AsyncClient | None = None):
    """Download every font listed in ``EMBEDDING_FONT_METADATA``.

    Returns immediately when all cached fonts already verify. Otherwise one
    ``get_font_and_metadata_async`` task is started per font and awaited
    together.

    Raises:
        SystemExit: With status 1 when no upstream could be determined.
    """
    # ``all`` stops at the first font that fails verification.
    all_cached = all(
        verify_file(get_cache_file_path(font_file_name, "fonts"), meta["sha3_256"])
        for font_file_name, meta in EMBEDDING_FONT_METADATA.items()
    )
    if all_cached:
        logger.debug("All fonts are already downloaded")
        return

    fastest_upstream, font_metadata = await get_fastest_upstream_for_font(client)
    if fastest_upstream is None:
        logger.error("Failed to get fastest upstream")
        # ``exit`` comes from the ``site`` module and may be missing; raise directly.
        raise SystemExit(1)
    logger.info(f"Downloading fonts from {fastest_upstream}")

    font_tasks = [
        asyncio.create_task(
            get_font_and_metadata_async(
                font_file_name, client, fastest_upstream, font_metadata
            )
        )
        for font_file_name in EMBEDDING_FONT_METADATA
    ]
    await asyncio.gather(*font_tasks)


async def download_all_cmaps_async(client: httpx.AsyncClient | None = None):
    """Download all cmap files defined in CMAP_METADATA.

    Returns immediately when all cached cmaps already verify.

    Raises:
        SystemExit: With status 1 when no upstream could be determined.
    """
    all_cached = all(
        verify_file(get_cache_file_path(cmap_file_name, "cmap"), meta["sha3_256"])
        for cmap_file_name, meta in CMAP_METADATA.items()
    )
    if all_cached:
        logger.debug("All cmaps are already downloaded")
        return

    # Resolved here only to fail early and to log the source; each task below
    # resolves the upstream again through ``get_cmap_file_path_async``.
    fastest_upstream, _ = await get_fastest_upstream_for_font(client)
    if fastest_upstream is None:
        logger.error("Failed to get fastest upstream for cmap")
        raise SystemExit(1)
    logger.info(f"Downloading cmaps from {fastest_upstream}")

    cmap_tasks = [
        asyncio.create_task(get_cmap_file_path_async(cmap_file_name, client))
        for cmap_file_name in CMAP_METADATA
    ]
    await asyncio.gather(*cmap_tasks)


async def async_warmup():
    """Fetch every asset (tiktoken encoding, models, fonts, cmaps) concurrently."""
    logger.info("Downloading all assets...")
    from tiktoken import encoding_for_model

    # Result is discarded; presumably this call makes tiktoken populate its
    # own cache - TODO confirm.
    _ = encoding_for_model("gpt-4o")
    async with httpx.AsyncClient() as client:
        onnx_task = asyncio.create_task(get_doclayout_onnx_model_path_async(client))
        onnx_task2 = asyncio.create_task(
            get_table_detection_rapidocr_model_path_async(client)
        )
        font_tasks = asyncio.create_task(download_all_fonts_async(client))
        cmap_tasks = asyncio.create_task(download_all_cmaps_async(client))
        await asyncio.gather(onnx_task, onnx_task2, font_tasks, cmap_tasks)


def warmup():
    """Synchronous wrapper around ``async_warmup``."""
    run_coro(async_warmup())


def generate_all_assets_file_list() -> dict[str, list[dict[str, str]]]:
    """List every managed asset with its expected SHA3-256 hash.

    Returns:
        A dict with the keys ``fonts``, ``models``, ``tiktoken`` and ``cmap``
        (inserted in that order); each value is a list of
        ``{"name": ..., "sha3_256": ...}`` entries.
    """
    return {
        "fonts": [
            {"name": font_file_name, "sha3_256": meta["sha3_256"]}
            for font_file_name, meta in EMBEDDING_FONT_METADATA.items()
        ],
        "models": [
            {
                "name": "doclayout_yolo_docstructbench_imgsz1024.onnx",
                "sha3_256": DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256,
            },
            {
                "name": "ch_PP-OCRv4_det_infer.onnx",
                "sha3_256": TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256,
            },
        ],
        "tiktoken": [
            {"name": tiktoken_file, "sha3_256": sha3_256}
            for tiktoken_file, sha3_256 in TIKTOKEN_CACHES.items()
        ],
        "cmap": [
            {"name": cmap_file_name, "sha3_256": meta["sha3_256"]}
            for cmap_file_name, meta in CMAP_METADATA.items()
        ],
    }
async def restore_offline_assets_package_async(input_path: Path | None = None):
    """Restore cached assets from an offline assets zip package.

    Every asset from ``generate_all_assets_file_list`` whose cached copy does
    not verify is rewritten from the archive and verified again.

    Args:
        input_path: Package file, or a directory containing the package under
            its canonical name ``offline_assets_<tag>.zip``. When None, the
            canonical file in the ``assets`` cache directory is used.

    Raises:
        SystemExit: With status 1 when an explicitly given package is missing,
            when its file name does not carry the expected tag, or when the
            archive is incomplete or an extracted file fails verification.
    """
    file_list = generate_all_assets_file_list()
    offline_assets_tag = get_offline_assets_tag(file_list)
    if input_path is None:
        input_path = get_cache_file_path(
            f"offline_assets_{offline_assets_tag}.zip", "assets"
        )
    else:
        if input_path.exists() and input_path.is_dir():
            input_path = input_path / f"offline_assets_{offline_assets_tag}.zip"
        if not input_path.exists():
            logger.critical(f"Offline assets package not found: {input_path}")
            # ``exit`` comes from the ``site`` module and may be missing; raise directly.
            raise SystemExit(1)
        import re

        tag_match = re.match(r"offline_assets_(.*)\.zip", input_path.name)
        # A renamed package has no recoverable tag: report it as a mismatch
        # instead of failing on ``None.group``.
        offline_assets_tag_from_input_path = (
            tag_match.group(1) if tag_match is not None else None
        )
        if offline_assets_tag != offline_assets_tag_from_input_path:
            logger.critical(
                f"Offline assets tag mismatch: {offline_assets_tag} != {offline_assets_tag_from_input_path}"
            )
            raise SystemExit(1)

    nothing_changed = True
    with zipfile.ZipFile(input_path, "r") as zipf:
        for file_type, file_descs in file_list.items():
            for file_desc in file_descs:
                file_name = file_desc["name"]
                file_path = get_cache_file_path(file_name, file_type)
                if verify_file(file_path, file_desc["sha3_256"]):
                    continue
                nothing_changed = False
                try:
                    # Read the member first so a missing entry cannot
                    # truncate the cached file.
                    payload = zipf.read(f"{file_type}/{file_name}")
                except KeyError:
                    logger.critical(
                        "Offline assets package is corrupted, please delete it and try again"
                    )
                    raise SystemExit(1) from None
                file_path.write_bytes(payload)
                if not verify_file(file_path, file_desc["sha3_256"]):
                    logger.critical(
                        "Offline assets package is corrupted, please delete it and try again"
                    )
                    raise SystemExit(1)
    if not nothing_changed:
        logger.info(f"Offline assets package restored from {input_path}")


def get_offline_assets_tag(file_list: dict | None = None):
    """Return the tag identifying a set of offline assets.

    Args:
        file_list: Asset listing as produced by
            ``generate_all_assets_file_list``; generated when None.

    Returns:
        Hex SHA3-256 digest of ``file_list`` serialized by orjson with sorted
        keys, two-space indentation and a trailing newline.
    """
    if file_list is None:
        file_list = generate_all_assets_file_list()
    import orjson

    # noinspection PyTypeChecker
    offline_assets_tag = hashlib.sha3_256(
        orjson.dumps(
            file_list,
            option=orjson.OPT_APPEND_NEWLINE
            | orjson.OPT_INDENT_2
            | orjson.OPT_SORT_KEYS,
        )
    ).hexdigest()
    return offline_assets_tag


def generate_offline_assets_package(output_directory: Path | None = None):
    """Synchronous wrapper around ``generate_offline_assets_package_async``."""
    return run_coro(generate_offline_assets_package_async(output_directory))


def restore_offline_assets_package(input_path: Path | None = None):
    """Synchronous wrapper around ``restore_offline_assets_package_async``."""
    return run_coro(restore_offline_assets_package_async(input_path))
"fb374d419588a4632f3f557e76b4b70aebbca790": "cb04bcda5782cfbbe77f2f991d92c0ea785d9496ef1137c91dfc3c8c324528d6" } FONT_METADATA_URL = { "github": "https://raw.githubusercontent.com/funstory-ai/BabelDOC-Assets/refs/heads/main/font_metadata.json", "huggingface": "https://huggingface.co/datasets/awwaawwa/BabelDOC-Assets/resolve/main/font_metadata.json?download=true", # "hf-mirror": "https://hf-mirror.com/datasets/awwaawwa/BabelDOC-Assets/resolve/main/font_metadata.json?download=true", "modelscope": "https://www.modelscope.cn/datasets/awwaawwa/BabelDOCAssets/resolve/master/font_metadata.json", } FONT_URL_BY_UPSTREAM = { "github": lambda name: f"https://raw.githubusercontent.com/funstory-ai/BabelDOC-Assets/refs/heads/main/fonts/{name}", "huggingface": lambda name: f"https://huggingface.co/datasets/awwaawwa/BabelDOC-Assets/resolve/main/fonts/{name}?download=true", "hf-mirror": lambda name: f"https://hf-mirror.com/datasets/awwaawwa/BabelDOC-Assets/resolve/main/fonts/{name}?download=true", "modelscope": lambda name: f"https://www.modelscope.cn/datasets/awwaawwa/BabelDOCAssets/resolve/master/fonts/{name}", } CMAP_URL_BY_UPSTREAM = { "github": lambda name: f"https://raw.githubusercontent.com/funstory-ai/BabelDOC-Assets/refs/heads/main/cmap/{name}", "huggingface": lambda name: f"https://huggingface.co/datasets/awwaawwa/BabelDOC-Assets/resolve/main/cmap/{name}?download=true", "hf-mirror": lambda name: f"https://hf-mirror.com/datasets/awwaawwa/BabelDOC-Assets/resolve/main/cmap/{name}?download=true", "modelscope": lambda name: f"https://www.modelscope.cn/datasets/awwaawwa/BabelDOCAssets/resolve/master/cmap/{name}", } DOC_LAYOUT_ONNX_MODEL_URL = { "huggingface": "https://huggingface.co/wybxc/DocLayout-YOLO-DocStructBench-onnx/resolve/main/doclayout_yolo_docstructbench_imgsz1024.onnx?download=true", "hf-mirror": "https://hf-mirror.com/wybxc/DocLayout-YOLO-DocStructBench-onnx/resolve/main/doclayout_yolo_docstructbench_imgsz1024.onnx?download=true", "modelscope": 
"https://www.modelscope.cn/models/AI-ModelScope/DocLayout-YOLO-DocStructBench-onnx/resolve/master/doclayout_yolo_docstructbench_imgsz1024.onnx", } TABLE_DETECTION_RAPIDOCR_MODEL_URL = { "huggingface": "https://huggingface.co/spaces/RapidAI/RapidOCR/resolve/main/models/text_det/ch_PP-OCRv4_det_infer.onnx", "hf-mirror": "https://hf-mirror.com/spaces/RapidAI/RapidOCR/resolve/main/models/text_det/ch_PP-OCRv4_det_infer.onnx", "modelscope": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/master/onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx", } # from https://github.com/funstory-ai/BabelDOC-Assets/blob/main/font_metadata.json EMBEDDING_FONT_METADATA = { "GoNotoKurrent-Bold.ttf": { "ascent": 1069, "bold": 1, "descent": -293, "encoding_length": 2, "file_name": "GoNotoKurrent-Bold.ttf", "font_name": "Go Noto Kurrent-Bold Bold", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "000b37f592477945b27b7702dcad39f73e23e140e66ddff9847eb34f32389566", "size": 15303772, }, "GoNotoKurrent-Regular.ttf": { "ascent": 1069, "bold": 0, "descent": -293, "encoding_length": 2, "file_name": "GoNotoKurrent-Regular.ttf", "font_name": "Go Noto Kurrent-Regular Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "4324a60d507c691e6efc97420647f4d2c2d86d9de35009d1c769861b76074ae6", "size": 15515760, }, "KleeOne-Regular.ttf": { "ascent": 1160, "bold": 0, "descent": -288, "encoding_length": 2, "file_name": "KleeOne-Regular.ttf", "font_name": "Klee One Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "8585c29f89b322d937f83739f61ede5d84297873e1465cad9a120a208ac55ce0", "size": 8724704, }, "LXGWWenKai-Regular.1.520.ttf": { "ascent": 928, "bold": 0, "descent": -256, "encoding_length": 2, "file_name": "LXGWWenKai-Regular.1.520.ttf", "font_name": "LXGW WenKai Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "708b4fd6cfae62a26f71016724d38e862210732f101b9225225a1d5e8205f94d", "size": 24744500, }, "LXGWWenKaiGB-Regular.1.520.ttf": { "ascent": 928, "bold": 0, 
"descent": -256, "encoding_length": 2, "file_name": "LXGWWenKaiGB-Regular.1.520.ttf", "font_name": "LXGW WenKai GB Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "0671656b00992e317f9e20610e7145b024e664ada9f272d4f8e497196af98005", "size": 24903712, }, "LXGWWenKaiGB-Regular.ttf": { "ascent": 928, "bold": 0, "descent": -256, "encoding_length": 2, "file_name": "LXGWWenKaiGB-Regular.ttf", "font_name": "LXGW WenKai GB Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "b563a5e8d9db4cd15602a3a3700b01925e80a21f99fb88e1b763b1fb8685f8ee", "size": 19558756, }, "LXGWWenKaiMonoTC-Regular.ttf": { "ascent": 928, "bold": 0, "descent": -241, "encoding_length": 2, "file_name": "LXGWWenKaiMonoTC-Regular.ttf", "font_name": "LXGW WenKai Mono TC Regular", "italic": 0, "monospace": 1, "serif": 0, "sha3_256": "596b278d11418d374a1cfa3a50cbfb82b31db82d3650cfacae8f94311b27fdc5", "size": 13115416, }, "LXGWWenKaiTC-Regular.1.520.ttf": { "ascent": 928, "bold": 0, "descent": -256, "encoding_length": 2, "file_name": "LXGWWenKaiTC-Regular.1.520.ttf", "font_name": "LXGW WenKai TC Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "347d3d4bd88c2afcb194eba186d2c6c0b95d18b2145220feb1c88abf761f1398", "size": 15348376, }, "LXGWWenKaiTC-Regular.ttf": { "ascent": 928, "bold": 0, "descent": -256, "encoding_length": 2, "file_name": "LXGWWenKaiTC-Regular.ttf", "font_name": "LXGW WenKai TC Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "66ccd0ffe8e56cd585dabde8d1292c3f551b390d8ed85f81d7a844825f9c2379", "size": 13100328, }, "MaruBuri-Regular.ttf": { "ascent": 800, "bold": 0, "descent": -200, "encoding_length": 2, "file_name": "MaruBuri-Regular.ttf", "font_name": "MaruBuri Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "abb672dde7b89e06914ce27c59159b7a2933f26207bfcc47981c67c11c41e6d1", "size": 3268988, }, "NotoSans-Bold.ttf": { "ascent": 1069, "bold": 1, "descent": -293, "encoding_length": 2, "file_name": "NotoSans-Bold.ttf", "font_name": 
"Noto Sans Bold", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "ecd38d472c1cad07d8a5dffd2b5a0f72edcd40fff2b4e68d770da8f2ef343a82", "size": 630964, }, "NotoSans-BoldItalic.ttf": { "ascent": 1069, "bold": 1, "descent": -293, "encoding_length": 2, "file_name": "NotoSans-BoldItalic.ttf", "font_name": "Noto Sans Bold Italic", "italic": 1, "monospace": 0, "serif": 0, "sha3_256": "0b6c690a4a6b7d605b2ecbde00c7ac1a23e60feb17fa30d8b972d61ec3ff732b", "size": 644340, }, "NotoSans-Italic.ttf": { "ascent": 1069, "bold": 0, "descent": -293, "encoding_length": 2, "file_name": "NotoSans-Italic.ttf", "font_name": "Noto Sans Italic", "italic": 1, "monospace": 0, "serif": 0, "sha3_256": "830652f61724c017e5a29a96225b484a2ccbd25f69a1b3f47e5f466a2dbed1ad", "size": 642344, }, "NotoSans-Regular.ttf": { "ascent": 1069, "bold": 0, "descent": -293, "encoding_length": 2, "file_name": "NotoSans-Regular.ttf", "font_name": "Noto Sans Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "7dfe2bbf97dc04c852d1223b220b63430e6ad03b0dbb28ebe6328a20a2d45eb8", "size": 629024, }, "NotoSerif-Bold.ttf": { "ascent": 1069, "bold": 1, "descent": -293, "encoding_length": 2, "file_name": "NotoSerif-Bold.ttf", "font_name": "Noto Serif Bold", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "28d88d924285eadb9f9ce49f2d2b95473f89a307b226c5f6ebed87a654898312", "size": 506864, }, "NotoSerif-BoldItalic.ttf": { "ascent": 1069, "bold": 1, "descent": -293, "encoding_length": 2, "file_name": "NotoSerif-BoldItalic.ttf", "font_name": "Noto Serif Bold Italic", "italic": 1, "monospace": 0, "serif": 1, "sha3_256": "b69ee56af6351b2fb4fbce623f8e1c1f9fb19170686a9e5db2cf260b8cf24ac7", "size": 535724, }, "NotoSerif-Italic.ttf": { "ascent": 1069, "bold": 0, "descent": -293, "encoding_length": 2, "file_name": "NotoSerif-Italic.ttf", "font_name": "Noto Serif Italic", "italic": 1, "monospace": 0, "serif": 1, "sha3_256": "9b7773c24ab8a29e3c1c03efa4ab652d051e4c209134431953463aa946d62868", "size": 535340, }, 
"NotoSerif-Regular.ttf": { "ascent": 1069, "bold": 0, "descent": -293, "encoding_length": 2, "file_name": "NotoSerif-Regular.ttf", "font_name": "Noto Serif Regular", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "c2bbe984e65bafd3bcd38b3cb1e1344f3b7b79d6beffc7a3d883b57f8358559d", "size": 504932, }, "SourceHanSansCN-Bold.ttf": { "ascent": 1160, "bold": 1, "descent": -288, "encoding_length": 2, "file_name": "SourceHanSansCN-Bold.ttf", "font_name": "Source Han Sans CN Bold", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "82314c11016a04ef03e7afd00abe0ccc8df54b922dee79abf6424f3002a31825", "size": 10174460, }, "SourceHanSansCN-Regular.ttf": { "ascent": 1160, "bold": 0, "descent": -288, "encoding_length": 2, "file_name": "SourceHanSansCN-Regular.ttf", "font_name": "Source Han Sans CN Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "b45a80cf3650bfc62aa014e58243c6325e182c4b0c5819e41a583c699cce9a8f", "size": 10397552, }, "SourceHanSansHK-Bold.ttf": { "ascent": 1160, "bold": 1, "descent": -288, "encoding_length": 2, "file_name": "SourceHanSansHK-Bold.ttf", "font_name": "Source Han Sans HK Bold", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "3eecd57457ba9a0fbad6c794f40e7ae704c4f825091aef2ac18902ffdde50608", "size": 6856692, }, "SourceHanSansHK-Regular.ttf": { "ascent": 1160, "bold": 0, "descent": -288, "encoding_length": 2, "file_name": "SourceHanSansHK-Regular.ttf", "font_name": "Source Han Sans HK Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "5fe4141f9164c03616323400b2936ee4c8265314492e2b822c3a6fbfb63ffe08", "size": 6999792, }, "SourceHanSansJP-Bold.ttf": { "ascent": 1160, "bold": 1, "descent": -288, "encoding_length": 2, "file_name": "SourceHanSansJP-Bold.ttf", "font_name": "Source Han Sans JP Bold", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "fb05bd84d62e8064117ee357ab6a4481e1cde931e8e984c0553c8c4b09dc3938", "size": 5603068, }, "SourceHanSansJP-Regular.ttf": { "ascent": 1160, "bold": 0, "descent": -288, 
"encoding_length": 2, "file_name": "SourceHanSansJP-Regular.ttf", "font_name": "Source Han Sans JP Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "722cfbdcc0fd83fe07a3d1b10e9e64343c924a351d02cfe8dbb6ec4c6bc38230", "size": 5723960, }, "SourceHanSansKR-Bold.ttf": { "ascent": 1160, "bold": 1, "descent": -288, "encoding_length": 2, "file_name": "SourceHanSansKR-Bold.ttf", "font_name": "Source Han Sans KR Bold", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "02959eb2c1eea0786a736aeb50b6e61f2ab873cd69c659389b7511f80f734838", "size": 5858892, }, "SourceHanSansKR-Regular.ttf": { "ascent": 1160, "bold": 0, "descent": -288, "encoding_length": 2, "file_name": "SourceHanSansKR-Regular.ttf", "font_name": "Source Han Sans KR Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "aba70109eff718e8f796f0185f8dca38026c1661b43c195883c84577e501adf2", "size": 5961704, }, "SourceHanSansTW-Bold.ttf": { "ascent": 1160, "bold": 1, "descent": -288, "encoding_length": 2, "file_name": "SourceHanSansTW-Bold.ttf", "font_name": "Source Han Sans TW Bold", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "4a92730e644a1348e87bba7c77e9b462f257f381bd6abbeac5860d8f8306aee6", "size": 6883224, }, "SourceHanSansTW-Regular.ttf": { "ascent": 1160, "bold": 0, "descent": -288, "encoding_length": 2, "file_name": "SourceHanSansTW-Regular.ttf", "font_name": "Source Han Sans TW Regular", "italic": 0, "monospace": 0, "serif": 0, "sha3_256": "6129b68ff4b0814624cac7edca61fbacf8f4d79db6f4c3cfc46b1c48ea2f81ac", "size": 7024812, }, "SourceHanSerifCN-Bold.ttf": { "ascent": 1150, "bold": 1, "descent": -286, "encoding_length": 2, "file_name": "SourceHanSerifCN-Bold.ttf", "font_name": "Source Han Serif CN Bold", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "77816a54957616e140e25a36a41fc061ddb505a1107de4e6a65f561e5dcf8310", "size": 14134156, }, "SourceHanSerifCN-Regular.ttf": { "ascent": 1150, "bold": 0, "descent": -286, "encoding_length": 2, "file_name": 
"SourceHanSerifCN-Regular.ttf", "font_name": "Source Han Serif CN Regular", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "c8bf74da2c3b7457c9d887465b42fb6f80d3d84f361cfe5b0673a317fb1f85ad", "size": 14047768, }, "SourceHanSerifHK-Bold.ttf": { "ascent": 1150, "bold": 1, "descent": -286, "encoding_length": 2, "file_name": "SourceHanSerifHK-Bold.ttf", "font_name": "Source Han Serif HK Bold", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "0f81296f22846b622a26f7342433d6c5038af708a32fc4b892420c150227f4bb", "size": 9532580, }, "SourceHanSerifHK-Regular.ttf": { "ascent": 1150, "bold": 0, "descent": -286, "encoding_length": 2, "file_name": "SourceHanSerifHK-Regular.ttf", "font_name": "Source Han Serif HK Regular", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "d5232ec3adf4fb8604bb4779091169ec9bd9d574b513e4a75752e614193afebe", "size": 9467292, }, "SourceHanSerifJP-Bold.ttf": { "ascent": 1150, "bold": 1, "descent": -286, "encoding_length": 2, "file_name": "SourceHanSerifJP-Bold.ttf", "font_name": "Source Han Serif JP Bold", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "a4a8c22e8ec7bb6e66b9caaff1e12c7a52b5a4201eec3d074b35957c0126faef", "size": 7811832, }, "SourceHanSerifJP-Regular.ttf": { "ascent": 1150, "bold": 0, "descent": -286, "encoding_length": 2, "file_name": "SourceHanSerifJP-Regular.ttf", "font_name": "Source Han Serif JP Regular", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "3d1f9933c7f3abc8c285e317119a533e6dcfe6027d1f5f066ba71b3eb9161e9c", "size": 7748816, }, "SourceHanSerifKR-Bold.ttf": { "ascent": 1150, "bold": 1, "descent": -286, "encoding_length": 2, "file_name": "SourceHanSerifKR-Bold.ttf", "font_name": "Source Han Serif KR Bold", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "b071b1aecb042aa779e1198767048438dc756d0da8f90660408abb421393f5cb", "size": 12387920, }, "SourceHanSerifKR-Regular.ttf": { "ascent": 1150, "bold": 0, "descent": -286, "encoding_length": 2, "file_name": "SourceHanSerifKR-Regular.ttf", "font_name": 
"Source Han Serif KR Regular", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "a85913439f0a49024ca77c02dfede4318e503ee6b2b7d8fef01eb42435f27b61", "size": 12459924, }, "SourceHanSerifTW-Bold.ttf": { "ascent": 1150, "bold": 1, "descent": -286, "encoding_length": 2, "file_name": "SourceHanSerifTW-Bold.ttf", "font_name": "Source Han Serif TW Bold", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "562eea88895ab79ffefab7eabb4d322352a7b1963764c524c6d5242ca456bb6e", "size": 9551724, }, "SourceHanSerifTW-Regular.ttf": { "ascent": 1150, "bold": 0, "descent": -286, "encoding_length": 2, "file_name": "SourceHanSerifTW-Regular.ttf", "font_name": "Source Han Serif TW Regular", "italic": 0, "monospace": 0, "serif": 1, "sha3_256": "85c1d6460b2e169b3d53ac60f6fb7a219fb99923027d78fb64b679475e2ddae4", "size": 9486772, }, } CMAP_METADATA = { "78-EUC-H.json": { "file_name": "78-EUC-H.json", "sha3_256": "657006ae4360ac584316dbda94f2223d7dd4cf7c721021b78b470ed712d22a3d", "size": 15035, }, "78-EUC-V.json": { "file_name": "78-EUC-V.json", "sha3_256": "ffd0610937d3893cd6b9f10007033dab4c846d6a50914b3e0b5b1a1d5a446483", "size": 704, }, "78-H.json": { "file_name": "78-H.json", "sha3_256": "07960a71bd7f2dc8501bfff6ebacb5d179961accbb8d043837d6d213d4e7c43f", "size": 14993, }, "78-RKSJ-H.json": { "file_name": "78-RKSJ-H.json", "sha3_256": "2cea4cbf474c08d99420790509473f48960d14df27e37155c0833150eff0310c", "size": 15054, }, "78-RKSJ-V.json": { "file_name": "78-RKSJ-V.json", "sha3_256": "0005485dc7cb41b9911d651a31a008ff4d8f707f3a271f5eb900640415255f58", "size": 705, }, "78-V.json": { "file_name": "78-V.json", "sha3_256": "6ec527dfdd6f8176719db47aea208d96c8427ff2c44bb6d6adcf215e3599c7dd", "size": 700, }, "78ms-RKSJ-H.json": { "file_name": "78ms-RKSJ-H.json", "sha3_256": "781802e72f8e79d599d58a81445333d005df5117b10c9b8392459729e51bbec7", "size": 17125, }, "78ms-RKSJ-V.json": { "file_name": "78ms-RKSJ-V.json", "sha3_256": "1854ff118f30bdee044813bf764f44123697cb2c2dfcfacb10e1aa161d7db16b", 
"size": 1928, }, "83pv-RKSJ-H.json": { "file_name": "83pv-RKSJ-H.json", "sha3_256": "2b6dd0a63fc97f3b33767a1b16a49b30ba0cb97a1ff01deb6ca5592d90e79815", "size": 5277, }, "90ms-RKSJ-H.json": { "file_name": "90ms-RKSJ-H.json", "sha3_256": "ebacf23e35e924a65b45afb6276f645289f68b122f1b32ab4dbc64f9c7903ccf", "size": 4117, }, "90ms-RKSJ-V.json": { "file_name": "90ms-RKSJ-V.json", "sha3_256": "0e08ffc0c46d93912870ad12a863081bcea12db09038e3929e1e015cfc1663da", "size": 1928, }, "90msp-RKSJ-H.json": { "file_name": "90msp-RKSJ-H.json", "sha3_256": "3098d897f17b1723d5915518d281d3c5d4f46f0b83dbde8b8001073e0f882d32", "size": 4096, }, "90msp-RKSJ-V.json": { "file_name": "90msp-RKSJ-V.json", "sha3_256": "a7ad430c32de4dbce2667fff874efc5d4114c685107f026788eee4ec83992fc8", "size": 1929, }, "90pv-RKSJ-H.json": { "file_name": "90pv-RKSJ-H.json", "sha3_256": "2c1720cc7343f95ccb87e073df0c7788d33bc8811b703b709a0230e79ecb2341", "size": 6314, }, "90pv-RKSJ-V.json": { "file_name": "90pv-RKSJ-V.json", "sha3_256": "487bf100397d4f0bcfa86dbfea149cac54faa59c0b449d65284cc43123d99023", "size": 1283, }, "Add-H.json": { "file_name": "Add-H.json", "sha3_256": "3bd6fbbe961dffa3a6395d1e3823da665efc74363f44ff6083d98fc5ae22433a", "size": 15174, }, "Add-RKSJ-H.json": { "file_name": "Add-RKSJ-H.json", "sha3_256": "bde048bae5dc9c43570bff29ff4691e03372e029dde66edc5e8de64a891dd53b", "size": 15259, }, "Add-RKSJ-V.json": { "file_name": "Add-RKSJ-V.json", "sha3_256": "1a81852c30ebf3101e1e0b0b5eff2e4f19211373c513d7c42b0933ded6b6e59b", "size": 1426, }, "Add-V.json": { "file_name": "Add-V.json", "sha3_256": "6a4f7a4ee2d7a04ce0500b93453859faf3fc3f11b3f55cb61753ef79846b419b", "size": 1421, }, "B5-H.json": { "file_name": "B5-H.json", "sha3_256": "f1b984aa231df737628663a56d380c93fe3172a243792db6d36921b964a118db", "size": 5960, }, "B5-V.json": { "file_name": "B5-V.json", "sha3_256": "0fafc3f78a34f2bf2377a89b2679469505a35ae42df95bf6765f743344f9a94c", "size": 334, }, "B5pc-H.json": { "file_name": "B5pc-H.json", "sha3_256": 
"07f0c25086768b9731971ba164d88cb10202a9d36e79a076c43233351f61c52f", "size": 6015, }, "B5pc-V.json": { "file_name": "B5pc-V.json", "sha3_256": "f5e44d8eeeda40e8c3a81858dfb823eeed3f5e834e985544d1e56fb79260b8f8", "size": 336, }, "CNS-EUC-H.json": { "file_name": "CNS-EUC-H.json", "sha3_256": "2add6b8cd4750db8bf6b029595232fecb8f1e54a0bad56590d4aa46401085e44", "size": 11342, }, "CNS-EUC-V.json": { "file_name": "CNS-EUC-V.json", "sha3_256": "1ff26a35f10467a99957886c482de267658b9132a704b547381c90fc37c90820", "size": 12592, }, "CNS1-H.json": { "file_name": "CNS1-H.json", "sha3_256": "e64c524f07718603b6bd84fd6799f875cc13c00137fbaa2b41215d518e96c87a", "size": 3728, }, "CNS1-V.json": { "file_name": "CNS1-V.json", "sha3_256": "57a1d2aabe6ab9db9a323ab43c37e3aa1ba9b3eb71841dfec4d8568d657d503a", "size": 332, }, "CNS2-H.json": { "file_name": "CNS2-H.json", "sha3_256": "90831af5d65fae9565d705fc8f1fccd091e33a67a1e544552410e39d7558daed", "size": 2053, }, "CNS2-V.json": { "file_name": "CNS2-V.json", "sha3_256": "c4d2aae661b26120030754901abced51766fa4bce638433a7aa7130a3d5eabb0", "size": 54, }, "ETHK-B5-H.json": { "file_name": "ETHK-B5-H.json", "sha3_256": "3ef2e9ef0364675c2fb9ccbfd37ed9227d416457ee8cadb9e59b2db4354d88ea", "size": 25660, }, "ETHK-B5-V.json": { "file_name": "ETHK-B5-V.json", "sha3_256": "a12c5917b6f3400793e7d6ea2e217e9af05a28621a937cfef4da9f5184a03578", "size": 364, }, "ETen-B5-H.json": { "file_name": "ETen-B5-H.json", "sha3_256": "57f29290c730277b221ad074709d4f76c429d5410931131c9da7157ebae76951", "size": 6205, }, "ETen-B5-V.json": { "file_name": "ETen-B5-V.json", "sha3_256": "d07d9af9e30a8fc3ca7e52158f854226b831ab9ef552cda46219819e47950680", "size": 364, }, "ETenms-B5-H.json": { "file_name": "ETenms-B5-H.json", "sha3_256": "0659f282182ebdaa6abb38062bc3428a3b7b5907513fd499980d1b49930a9b9e", "size": 72, }, "ETenms-B5-V.json": { "file_name": "ETenms-B5-V.json", "sha3_256": "74b107f8950456b2df294a089091837bf802892c1bc3136c403da2a427130c33", "size": 429, }, "EUC-H.json": { 
"file_name": "EUC-H.json", "sha3_256": "b6df6e254254eb5a2254b0d581f4820d2b3553cd372136ec88f605521683c44a", "size": 2910, }, "EUC-V.json": { "file_name": "EUC-V.json", "sha3_256": "e81c0f409365f2fd60232f6e5c84bf52c8a6b9c6336d4c96fb554f213dbdfaf6", "size": 701, }, "Ext-H.json": { "file_name": "Ext-H.json", "sha3_256": "629359cf115575acb68b59c82373a1a3958001212a854d0a5b98e6fe1efe81db", "size": 15891, }, "Ext-RKSJ-H.json": { "file_name": "Ext-RKSJ-H.json", "sha3_256": "3336a4a77a75924588f13c5a24157680c9c5b6a46298063dcdb461b90bb55da0", "size": 15975, }, "Ext-RKSJ-V.json": { "file_name": "Ext-RKSJ-V.json", "sha3_256": "f2915039ff32992094ff6521fa24c3f41c27f55f3f071730eea732e261a2a553", "size": 994, }, "Ext-V.json": { "file_name": "Ext-V.json", "sha3_256": "e2fb58ec483aee0910b0733dcb6220f10f9f4d2553c8c139a523e3992363f93e", "size": 989, }, "GB-EUC-H.json": { "file_name": "GB-EUC-H.json", "sha3_256": "4a0b5fda367993409663ec1d4be57c207a3500d778373546b729d143d789c191", "size": 2178, }, "GB-EUC-V.json": { "file_name": "GB-EUC-V.json", "sha3_256": "b45a8a562304c2c388fd1574c3a1a0af6f49e4849f7904ba07d57967d9625917", "size": 520, }, "GB-H.json": { "file_name": "GB-H.json", "sha3_256": "a50b5d6461c95a667ccbc44c507ff5e6686e4f1bbd8bfae69486396b4ed03510", "size": 2139, }, "GB-V.json": { "file_name": "GB-V.json", "sha3_256": "1f043042065f2df4590ebbd27fbc8f93802ea66caeb0b8ba92823575842743e5", "size": 516, }, "GBK-EUC-H.json": { "file_name": "GBK-EUC-H.json", "sha3_256": "4502e7abe2edfb6256b5a4308dfca940aaa92a2d951c4b44942ce7bdb9eda877", "size": 99532, }, "GBK-EUC-V.json": { "file_name": "GBK-EUC-V.json", "sha3_256": "c71f6281bb59897dcf48f587136d002d5caa8a0ed89f9b490a6a288765ec674d", "size": 521, }, "GBK2K-H.json": { "file_name": "GBK2K-H.json", "sha3_256": "0a2a975da25641067ea2743f15407df20895b28804a1e64c12cd9fd0f306b1a9", "size": 109298, }, "GBK2K-V.json": { "file_name": "GBK2K-V.json", "sha3_256": "0febb4a13f8f73dc949d159b4f37e886d1c3d1514aaf53d3492e0b5e21523f52", "size": 1044, }, 
"GBKp-EUC-H.json": { "file_name": "GBKp-EUC-H.json", "sha3_256": "50d628304aff1f13ded3790cc3b8bd48502267768cac5e72cb3be8a46f9a5436", "size": 99510, }, "GBKp-EUC-V.json": { "file_name": "GBKp-EUC-V.json", "sha3_256": "8c540fc12dfed309896544f8153fa52b793708a85e3882985567dcae86fb1732", "size": 522, }, "GBT-EUC-H.json": { "file_name": "GBT-EUC-H.json", "sha3_256": "5fbe99ec7638de5216ea452788d3ef40cfd8c110c8b8ae936b57db6221d9b9d9", "size": 54802, }, "GBT-EUC-V.json": { "file_name": "GBT-EUC-V.json", "sha3_256": "4cc3a48b1f7c8ab088391aa78131289da3d68e2fe0071b380a10c19757356ab5", "size": 521, }, "GBT-H.json": { "file_name": "GBT-H.json", "sha3_256": "8bbbbbdee2722751708dd66a7ed12fa54a08bbf0dcfaefca2b87f305ca591f32", "size": 54763, }, "GBT-V.json": { "file_name": "GBT-V.json", "sha3_256": "32e4457c8b0edbeeec9445465ec40106603ad50003e1af98994c02020df1c59f", "size": 517, }, "GBTpc-EUC-H.json": { "file_name": "GBTpc-EUC-H.json", "sha3_256": "7f7faa903850fc471948e284853a81ee2f4a32693e14131f3ab1fbc490c5695b", "size": 54820, }, "GBTpc-EUC-V.json": { "file_name": "GBTpc-EUC-V.json", "sha3_256": "3cf85a97171567e08d0112b71ca4a0aef68c52918b7c635669ef7e25e1bcb818", "size": 523, }, "GBpc-EUC-H.json": { "file_name": "GBpc-EUC-H.json", "sha3_256": "38332ce5be0b82e4010fbd05ceac92e9f05a784ccacf6a4f004cd8da734c47de", "size": 2196, }, "GBpc-EUC-V.json": { "file_name": "GBpc-EUC-V.json", "sha3_256": "5a0b4e7db0aedd6b27f84b191791b527da3ea27ea1ca42460086cb0d294418bf", "size": 522, }, "H.json": { "file_name": "H.json", "sha3_256": "5ee11fcc99897b769fd62238967954e957bb8079353abba815792aab6f3e329c", "size": 2868, }, "HKdla-B5-H.json": { "file_name": "HKdla-B5-H.json", "sha3_256": "8f24808486e1d5363a66981021f3f8b136f1ec6231d48bda76344e1f7f1695aa", "size": 25384, }, "HKdla-B5-V.json": { "file_name": "HKdla-B5-V.json", "sha3_256": "1e686a7f69d6b7a3c05a4be9e7e396cf81498ef48299341616e76805c1092733", "size": 340, }, "HKdlb-B5-H.json": { "file_name": "HKdlb-B5-H.json", "sha3_256": 
"0ccae437017107059630d56c7e0e2d6f086d5fb512c9e60b1bd48c4a04b6652d", "size": 22501, }, "HKdlb-B5-V.json": { "file_name": "HKdlb-B5-V.json", "sha3_256": "dad584337fd6e5e6ab5e1e30dc9b8cc1013985a04a159b3c108c4dfb5c10fb55", "size": 340, }, "HKgccs-B5-H.json": { "file_name": "HKgccs-B5-H.json", "sha3_256": "f7da0854c355c51957de6e71ffa33fbc69414d52dcfc5a5cb50c8f8c6c6bd9c6", "size": 13642, }, "HKgccs-B5-V.json": { "file_name": "HKgccs-B5-V.json", "sha3_256": "d7f89dc24162b624bc4d682484da315a4d39eaf9a8f63c1392e06d2aa46f015a", "size": 341, }, "HKm314-B5-H.json": { "file_name": "HKm314-B5-H.json", "sha3_256": "febd4cb78048e012478df9fc91aa23e946304d63c5f7c64ea8e16277b64a359b", "size": 13405, }, "HKm314-B5-V.json": { "file_name": "HKm314-B5-V.json", "sha3_256": "d310bbf5a975fe8e1f8bb4523b0db8e792043578f0c2a12735bbc24fc4a3721f", "size": 341, }, "HKm471-B5-H.json": { "file_name": "HKm471-B5-H.json", "sha3_256": "fdb1368b1a6f4df20ab87e2a1045a579088645828d1168e39d6aa5b52c09bd8e", "size": 17079, }, "HKm471-B5-V.json": { "file_name": "HKm471-B5-V.json", "sha3_256": "34c40c1bb1409942f12f66f1bcbc2be73406b4c5e626ea7a4ab7f73160ba2a88", "size": 341, }, "HKscs-B5-H.json": { "file_name": "HKscs-B5-H.json", "sha3_256": "63fe2b09c05c8ef70fb937aad49698d4154e1d7bb75f94344fea4db522b87a88", "size": 25722, }, "HKscs-B5-V.json": { "file_name": "HKscs-B5-V.json", "sha3_256": "14c864025ffca616fc173458162efe190bdace4700e2a7ad4869c66476534223", "size": 365, }, "Hankaku.json": { "file_name": "Hankaku.json", "sha3_256": "befe81a2bbe191bcb8e0ff23706a51cb6a41a60f6bc508d5c0c19040c14afc06", "size": 238, }, "Hiragana.json": { "file_name": "Hiragana.json", "sha3_256": "0e8ce0a48ec8c05f4c65d23ada539c4a2a236fcb7dd46e20874acd9362394525", "size": 200, }, "Identity-H.json": { "file_name": "Identity-H.json", "sha3_256": "77cc630138b29b5acd4ab216cb1d173bb3e7b994ab932a4f3d8a9121be91fbab", "size": 6404, }, "Identity-V.json": { "file_name": "Identity-V.json", "sha3_256": 
"067a8d390f2d99dfa94ff19009925e5815c8b54b65b39314a244cbbace494679", "size": 62, }, "KSC-EUC-H.json": { "file_name": "KSC-EUC-H.json", "sha3_256": "79fb3c0bd9d2ce6b80da98d6f1ef4fd2776dfc3fb78c5ee4d6ee3a06aebc9fd0", "size": 11234, }, "KSC-EUC-V.json": { "file_name": "KSC-EUC-V.json", "sha3_256": "a541a285c966105a92dba6939401ac8aaeb057e5200bdbf8c874ceecb9f37b01", "size": 441, }, "KSC-H.json": { "file_name": "KSC-H.json", "sha3_256": "a0a20bce98ffe98036aa748d46c2921e17247827a22298edb59c778b8b776f24", "size": 11214, }, "KSC-Johab-H.json": { "file_name": "KSC-Johab-H.json", "sha3_256": "3d7cd1473ddcf7c3bfb80c7eadf45a365389759b1df1f53e0bd5f31e31125e96", "size": 100922, }, "KSC-Johab-V.json": { "file_name": "KSC-Johab-V.json", "sha3_256": "2f7cf1d05bd82d65e488fc3297aefc1c1f48f2c6972b01304c4be5f260fae86e", "size": 443, }, "KSC-V.json": { "file_name": "KSC-V.json", "sha3_256": "f6f09bab60f802d61c22368ca8650cefa08851c2039c5825e37404c7047eb496", "size": 437, }, "KSCms-UHC-H.json": { "file_name": "KSCms-UHC-H.json", "sha3_256": "6df55fd679239f3a6642c7690e89a85525fa6a8a3cf748aef247b2d06fdc1aca", "size": 16419, }, "KSCms-UHC-HW-H.json": { "file_name": "KSCms-UHC-HW-H.json", "sha3_256": "a05183c5d7b6b6f62d11f8175e5749d5ad2913d469403905c8f01a403d715583", "size": 16422, }, "KSCms-UHC-HW-V.json": { "file_name": "KSCms-UHC-HW-V.json", "sha3_256": "e2586795b094fade7e385ff1ce5570232edc791c456acf4c6e1c11bc501f82a4", "size": 446, }, "KSCms-UHC-V.json": { "file_name": "KSCms-UHC-V.json", "sha3_256": "c09dc49c1afea5a5dc01bd6ac672d2af83b4821d74de7df71d4da3233513cefb", "size": 443, }, "KSCpc-EUC-H.json": { "file_name": "KSCpc-EUC-H.json", "sha3_256": "b43448cb510c7f952a6affd0950db58063719f7499309c64f78fea6b2778fa11", "size": 12226, }, "KSCpc-EUC-V.json": { "file_name": "KSCpc-EUC-V.json", "sha3_256": "1f4889c2e7278085738257e8097382ef5ac40b543b71751b75b155b056a46db2", "size": 443, }, "Katakana.json": { "file_name": "Katakana.json", "sha3_256": 
"524b659bd0acc0fb4baa7633c3250683d6b3ba1685caadc9739240ccdbfd2ce2", "size": 86, }, "NWP-H.json": { "file_name": "NWP-H.json", "sha3_256": "6c067655436fe89fb21a26e258973313bfe7cd5fbab3a2857b00ea92cc82c25d", "size": 18143, }, "NWP-V.json": { "file_name": "NWP-V.json", "sha3_256": "b494038c72c63c6917ab3ed3f83a8b6bf21c65ba9ea47a4887833fffcc434763", "size": 1205, }, "RKSJ-H.json": { "file_name": "RKSJ-H.json", "sha3_256": "eff868636f960b80d6923b77eb59d76acf6d7297bc74e1b7f3a13ff92a71c1cb", "size": 2953, }, "RKSJ-V.json": { "file_name": "RKSJ-V.json", "sha3_256": "f3827bc17eb1172a5713d2d5c83a9b60f965894e3f2cb8dcb731b6f151abaa10", "size": 702, }, "Roman.json": { "file_name": "Roman.json", "sha3_256": "620ab6ac0f4b487f19d44397b49612db57d164ddbff8e7d52fb5fd7e969e0cb9", "size": 67, }, "UniAKR-UTF16-H.json": { "file_name": "UniAKR-UTF16-H.json", "sha3_256": "1204af593c62e5d10ace0db3b5ca0caecc80240f1c866bf1585fad405c204a54", "size": 232741, }, "UniAKR-UTF32-H.json": { "file_name": "UniAKR-UTF32-H.json", "sha3_256": "cbbebc4b9b018109612dcfc0798f5c164d739a8b202017580301e0f27f76c35d", "size": 296773, }, "UniAKR-UTF8-H.json": { "file_name": "UniAKR-UTF8-H.json", "sha3_256": "e08da06fc02a877abb02205fe0db3b61566d9ac41511a735ef2f12b5741d069a", "size": 266575, }, "UniCNS-UCS2-H.json": { "file_name": "UniCNS-UCS2-H.json", "sha3_256": "48a0840498b90cf597c05ad2f63e26aaea778a49171f821d4b87b94424d7e640", "size": 400654, }, "UniCNS-UCS2-V.json": { "file_name": "UniCNS-UCS2-V.json", "sha3_256": "014f9d86baea5fd13e460dd3735eab98dbbacf126922826ef0be9d7c8c605418", "size": 360, }, "UniCNS-UTF16-H.json": { "file_name": "UniCNS-UTF16-H.json", "sha3_256": "c67980ebfb0d525365d0b5421548cc64ce9fb89afca1a0f6d04972f1e39b7f9c", "size": 320254, }, "UniCNS-UTF16-V.json": { "file_name": "UniCNS-UTF16-V.json", "sha3_256": "98bd35d76997c0f3c443f130d44e814997cb0277183b7bf6571f92206d9a85a0", "size": 311, }, "UniCNS-UTF32-H.json": { "file_name": "UniCNS-UTF32-H.json", "sha3_256": 
"6ab73cc531843f9bef915a949a0b79de1df288bb7ed6026db782ac446ed36c94", "size": 391690, }, "UniCNS-UTF32-V.json": { "file_name": "UniCNS-UTF32-V.json", "sha3_256": "d94f8c3d7fe834d34f746b9404a4bb5dd8479353e3b9f95b308642a8be793a44", "size": 391, }, "UniCNS-UTF8-H.json": { "file_name": "UniCNS-UTF8-H.json", "sha3_256": "3666cbe4d00de4038120c98472137857c93d44735c3a5def8c4ac7f84a59aa72", "size": 357287, }, "UniCNS-UTF8-V.json": { "file_name": "UniCNS-UTF8-V.json", "sha3_256": "e410ed491c0e2f31ba30cfd60eb4e21c40d3ee82e2be1c06c7adb8772b175f10", "size": 350, }, "UniGB-UCS2-H.json": { "file_name": "UniGB-UCS2-H.json", "sha3_256": "42a8e01b690cf2cd6b137c1eb94e7668899f0041b6e43b921252fe453486a96e", "size": 336533, }, "UniGB-UCS2-V.json": { "file_name": "UniGB-UCS2-V.json", "sha3_256": "0a0aaf21f823546faf0971b7926724cc95b53b3da3f42a22ec0526ca8de1b237", "size": 617, }, "UniGB-UTF16-H.json": { "file_name": "UniGB-UTF16-H.json", "sha3_256": "c306f093839fffe81e0c8597a24be508a64aa2a9c3e9b9eee858d55059530c0d", "size": 251806, }, "UniGB-UTF16-V.json": { "file_name": "UniGB-UTF16-V.json", "sha3_256": "bd283b8c7e145e340db39868ec1a3b0a08d89acc2bfac672d41008a8195c7bb3", "size": 456, }, "UniGB-UTF32-H.json": { "file_name": "UniGB-UTF32-H.json", "sha3_256": "a01a6a8b4b715f27c7e1866894240b0e1fd61a4eaca1c91df80c1f256ad06f72", "size": 319766, }, "UniGB-UTF32-V.json": { "file_name": "UniGB-UTF32-V.json", "sha3_256": "8b31bba8b852a2c6c1f6d92aea633285e2f75237fbe87ecadff9f9312a0bfaa9", "size": 572, }, "UniGB-UTF8-H.json": { "file_name": "UniGB-UTF8-H.json", "sha3_256": "87f7a6b0360d0f9bd0658cb7a67587e86c604be44292214622d972d85a474dbf", "size": 290481, }, "UniGB-UTF8-V.json": { "file_name": "UniGB-UTF8-V.json", "sha3_256": "1378adf3ecd0bfbdb11dabbf2118cbb968a03aa2215780b77b07459e3b1df6e7", "size": 513, }, "UniJIS-UCS2-H.json": { "file_name": "UniJIS-UCS2-H.json", "sha3_256": "a73e449136b46240ef86c9fb2b614e7d290b814130e9beb4b987c52fd7eda575", "size": 205924, }, "UniJIS-UCS2-HW-H.json": { "file_name": 
"UniJIS-UCS2-HW-H.json", "sha3_256": "e58ec4fd06677ecfcef12d25f6456b7f80da706b2ac6ef915239e0b780b775a0", "size": 154, }, "UniJIS-UCS2-HW-V.json": { "file_name": "UniJIS-UCS2-HW-V.json", "sha3_256": "bc3c81dbd6329d83cd71743a6985ed0cf516b0aa97a1c58c3cc3940e280b1e8e", "size": 4868, }, "UniJIS-UCS2-V.json": { "file_name": "UniJIS-UCS2-V.json", "sha3_256": "276712ac66416538e859ad28e9f5b685fbc71e5d7d91e905a3489f03667ae4bc", "size": 4775, }, "UniJIS-UTF16-H.json": { "file_name": "UniJIS-UTF16-H.json", "sha3_256": "afc923e268f22dcf09e0871ce0060c7588aa1304d4b26e781a261c14566f7642", "size": 238042, }, "UniJIS-UTF16-V.json": { "file_name": "UniJIS-UTF16-V.json", "sha3_256": "0a044ab7015485c3b0f7f9e4d883a1d9e9f1d04235b13e2a17687e878ce3e9f0", "size": 3951, }, "UniJIS-UTF32-H.json": { "file_name": "UniJIS-UTF32-H.json", "sha3_256": "1c27e2e595d659073e37e5ee22a9b39abe30af1483de33e1078ed174abdc723c", "size": 295294, }, "UniJIS-UTF32-V.json": { "file_name": "UniJIS-UTF32-V.json", "sha3_256": "aa7a475ce5f85f79d73e17355c08e6aee21a949b596f2efe359913489a22117f", "size": 4983, }, "UniJIS-UTF8-H.json": { "file_name": "UniJIS-UTF8-H.json", "sha3_256": "d91079b3f1671a7f4ace8b8f89478558f43f7782e666064ce1b53af563a87306", "size": 266367, }, "UniJIS-UTF8-V.json": { "file_name": "UniJIS-UTF8-V.json", "sha3_256": "d0c8c94f7d54dafa40876ce7eb28845d8ac00b688cf4bac255694cb2f086d109", "size": 4483, }, "UniJIS2004-UTF16-H.json": { "file_name": "UniJIS2004-UTF16-H.json", "sha3_256": "336660e87fc57ad166258d22f09690fcebb546840faee1e1b3f6cad3556bcf80", "size": 238119, }, "UniJIS2004-UTF16-V.json": { "file_name": "UniJIS2004-UTF16-V.json", "sha3_256": "f6619a74b62f9986e9a74620b28e726b927dde5cd6184742f368ef4d686fe55c", "size": 3955, }, "UniJIS2004-UTF32-H.json": { "file_name": "UniJIS2004-UTF32-H.json", "sha3_256": "2512690db880e0663f8208d22acda8daa98f1240ff14a038bf02e57c4908afb5", "size": 295371, }, "UniJIS2004-UTF32-V.json": { "file_name": "UniJIS2004-UTF32-V.json", "sha3_256": 
"da1728a91845f1654457eaf0f15b75d1ace5cbf75486bca8523bd5edf20a8010", "size": 4987, }, "UniJIS2004-UTF8-H.json": { "file_name": "UniJIS2004-UTF8-H.json", "sha3_256": "af36b0255a1ed15966670703ba8a48987a1cf7e43f5c94a4e86a41e5ee26b940", "size": 266444, }, "UniJIS2004-UTF8-V.json": { "file_name": "UniJIS2004-UTF8-V.json", "sha3_256": "28bebdf1581c45f2e9b38caa2ff643abd561321bab45febb0f90d802d2290faa", "size": 4487, }, "UniJISPro-UCS2-HW-V.json": { "file_name": "UniJISPro-UCS2-HW-V.json", "sha3_256": "21fd353a062b6c415389d6fde11718488f765ca31fd4ca481050c89633568009", "size": 4994, }, "UniJISPro-UCS2-V.json": { "file_name": "UniJISPro-UCS2-V.json", "sha3_256": "8daa155869a35f3f629abb042790c59eb5cff342b83573c2ae4c87b3e865dc27", "size": 4901, }, "UniJISPro-UTF8-V.json": { "file_name": "UniJISPro-UTF8-V.json", "sha3_256": "19b9a6d908f9fb7413d778c9cc912072314864225c38a3f5c345936fabcea650", "size": 5726, }, "UniJISX0213-UTF32-H.json": { "file_name": "UniJISX0213-UTF32-H.json", "sha3_256": "e6a07453703f5070bf567c9d67aa20bc4b404bd311413fed45d9ba8c297a91d9", "size": 295246, }, "UniJISX0213-UTF32-V.json": { "file_name": "UniJISX0213-UTF32-V.json", "sha3_256": "5f2dd4ff8045b2308a707e3d4ffb73e1ba7f5a1c1fdb43b17c5a322109897b9c", "size": 4908, }, "UniJISX02132004-UTF32-H.json": { "file_name": "UniJISX02132004-UTF32-H.json", "sha3_256": "81427dc73cf9392c0c3e8eeeb1dedbc797b123059714bfcdcd1ecffec9f341e3", "size": 295323, }, "UniJISX02132004-UTF32-V.json": { "file_name": "UniJISX02132004-UTF32-V.json", "sha3_256": "c0721298f3449f0c6f48ada1200ebcadbfc4020b10333871f6c0eea0be9f13ac", "size": 4912, }, "UniKS-UCS2-H.json": { "file_name": "UniKS-UCS2-H.json", "sha3_256": "3a1c10535982d06dde447764f8e3dd82c6c87bec6c4272eaf449f67db6d50ab8", "size": 202706, }, "UniKS-UCS2-V.json": { "file_name": "UniKS-UCS2-V.json", "sha3_256": "b915820ff4639f837e4d3b7e5a7c0810c26af1dcf3df9e56ed9a0a69e3cdba9d", "size": 492, }, "UniKS-UTF16-H.json": { "file_name": "UniKS-UTF16-H.json", "sha3_256": 
# Font files used for language codes that resolve to "CN", grouped by role.
# Every list holds file names that are expected to be keys of
# EMBEDDING_FONT_METADATA (checked by verify_font_family).
CN_FONT_FAMILY = {
    # Handwriting / script style
    "script": [
        "LXGWWenKaiGB-Regular.1.520.ttf",
    ],
    # Body text fonts
    "normal": [
        "SourceHanSerifCN-Bold.ttf",
        "SourceHanSerifCN-Regular.ttf",
        "SourceHanSansCN-Bold.ttf",
        "SourceHanSansCN-Regular.ttf",
    ],
    # Fallback fonts
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}
def __add_fallback_to_font_family():
    """Extend each family in ALL_FONT_FAMILY with fonts from the other families.

    For every category of a family, fonts listed under the same category of
    any other language entry are appended in place, unless the font already
    appears anywhere in the receiving family (in any category).
    """
    for lang, family in ALL_FONT_FAMILY.items():
        # Every font the receiving family already references, across categories.
        seen = set(itertools.chain.from_iterable(family.values()))
        for other_lang, other_family in ALL_FONT_FAMILY.items():
            if other_lang == lang:
                continue
            for category in family:
                target = family[category]
                for font in other_family[category]:
                    if font in seen:
                        continue
                    target.append(font)
                    seen.add(font)
fonts from EMBEDDING_FONT_METADATA unused_fonts = set(EMBEDDING_FONT_METADATA.keys()) - referenced_fonts for font_name in unused_fonts: del EMBEDDING_FONT_METADATA[font_name] __add_fallback_to_font_family() __cleanup_unused_font_metadata() def get_font_family(lang_code: str): lang_code = lang_code.upper() if "KR" in lang_code: font_family = KR_FONT_FAMILY elif "JP" in lang_code or "JA" in lang_code: font_family = JP_FONT_FAMILY elif "HK" in lang_code: font_family = HK_FONT_FAMILY elif "TW" in lang_code: font_family = TW_FONT_FAMILY elif "EN" in lang_code: font_family = EN_FONT_FAMILY elif "CN" in lang_code: font_family = CN_FONT_FAMILY else: font_family = EN_FONT_FAMILY verify_font_family(font_family) return font_family def verify_font_family(font_family: str | dict): if isinstance(font_family, str): font_family = ALL_FONT_FAMILY[font_family] for k in font_family: if k not in ["script", "normal", "fallback", "base"]: raise ValueError(f"Invalid font family: {font_family}") for font_file_name in font_family[k]: if font_file_name not in EMBEDDING_FONT_METADATA: raise ValueError(f"Invalid font file: {font_file_name}") if __name__ == "__main__": for k in ALL_FONT_FAMILY: verify_font_family(k) ================================================ FILE: babeldoc/asynchronize/__init__.py ================================================ import asyncio import time class Args: def __init__(self, args, kwargs): self.args = args self.kwargs = kwargs class AsyncCallback: def __init__(self): self.queue = asyncio.Queue() self.finished = False self.loop = asyncio.get_event_loop() def step_callback(self, *args, **kwargs): # Whenever a step is called, add to the queue but don't set finished to True, so __anext__ will continue args = Args(args, kwargs) # We have to use the threadsafe call so that it wakes up the event loop, in case it's sleeping: # https://stackoverflow.com/a/49912853/2148718 self.loop.call_soon_threadsafe(self.queue.put_nowait, args) # Add a small delay to release the 
class ScannedPDFError(Exception):
    """Error whose name refers to scanned PDFs; built from one message."""

    def __init__(self, message):
        super().__init__(message)


class ExtractTextError(Exception):
    """Error whose name refers to text extraction; built from one message."""

    def __init__(self, message):
        super().__init__(message)


class InputFileGeneratedByBabelDOCError(Exception):
    """Error whose name refers to BabelDOC-generated input; built from one message."""

    def __init__(self, message):
        super().__init__(message)


class ContentFilterError(Exception):
    """Error whose name refers to content filtering.

    Unlike the sibling exception types, the message is also kept on the
    ``message`` attribute.
    """

    def __init__(self, message):
        self.message = message
        super().__init__(message)
def batched(iterable, n, *, strict=False):
    """Yield successive tuples of up to *n* items taken from *iterable*.

    Args:
        iterable: any iterable; it is consumed lazily.
        n: batch size, at least 1.
        strict: when true, a final batch shorter than *n* raises instead of
            being yielded.

    Raises:
        ValueError: if *n* is below 1, or if *strict* is set and the last
            batch is incomplete. Being a generator, both are raised on
            iteration rather than at call time.
    """
    # batched('ABCDEFG', 3) → ABC DEF G
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(source, n))
        if not chunk:
            return
        if strict and len(chunk) != n:
            raise ValueError("batched(): incomplete batch")
        yield chunk
class YoloResult:
    """Detection result: a list of boxes ordered by confidence, plus class names."""

    def __init__(self, names, boxes=None, boxes_data=None):
        """Store *names* and the boxes, sorted by descending ``conf``.

        Args:
            names: class-name lookup, stored as given.
            boxes: ready-made box objects exposing ``conf``; the list is kept
                by reference and sorted in place.
            boxes_data: raw rows, each turned into a ``YoloBox``; required
                when *boxes* is None.
        """
        if boxes is None:
            assert boxes_data is not None
            boxes = [YoloBox(data=row) for row in boxes_data]
        self.boxes = boxes

        def confidence(box):
            return box.conf

        self.boxes.sort(key=confidence, reverse=True)
        self.names = names


class YoloBox:
    """Single detection: coordinates, confidence and class id."""

    def __init__(self, data=None, xyxy=None, conf=None, cls=None):
        """Build a box from a raw row or from explicit fields.

        When *data* is given, its first four entries become ``xyxy``, the
        second-to-last entry ``conf`` and the last entry ``cls``; the other
        arguments are ignored. Otherwise all of *xyxy*, *conf* and *cls*
        must be provided.
        """
        if data is None:
            assert xyxy is not None and conf is not None and cls is not None
        else:
            xyxy, conf, cls = data[:4], data[-2], data[-1]
        self.xyxy = xyxy
        self.conf = conf
        self.cls = cls
def __init__(self, model_path: str):
    """Load the ONNX model at *model_path* and open an inference session.

    The ``stride`` and ``names`` entries of the model metadata are parsed
    with ``ast.literal_eval``. Only execution providers whose name starts
    with "cpu" (case-insensitive) are passed to the session.
    """
    self.model_path = model_path

    onnx_model = onnx.load(model_path)
    props = {entry.key: entry.value for entry in onnx_model.metadata_props}
    self._stride = ast.literal_eval(props["stride"])
    self._names = ast.literal_eval(props["names"])

    # disable dml|cuda|
    # directml/cuda may encounter problems under special circumstances
    providers = []
    for provider in onnxruntime.get_available_providers():
        if not re.match(r"cpu", provider, re.IGNORECASE):
            continue
        logger.info(f"Available Provider: {provider}")
        providers.append(provider)

    self.model = onnxruntime.InferenceSession(
        onnx_model.SerializeToString(),
        providers=providers,
    )
    self.lock = threading.Lock()
def predict(self, image, imgsz=800, batch_size=16, **kwargs):
    """
    Predict the layout of document pages.

    Args:
        image: A single image (3-D array, height x width x channels) or a
            sequence of such images.
        imgsz: Accepted for interface compatibility but currently ignored:
            every image is resized and padded against a fixed target of 1024.
        batch_size: Accepted for interface compatibility but currently
            ignored: images are always run through the model one at a time.
        **kwargs: Additional arguments (unused).

    Returns:
        A list of YoloResult objects, one for each input image.
    """
    # Handle single image input
    if isinstance(image, np.ndarray) and len(image.shape) == 3:
        image = [image]

    total_images = len(image)
    results = []
    # The caller's batch_size is deliberately overridden.
    # NOTE(review): presumably because resize_and_pad_image pads each image to
    # its own stride-aligned shape, so differently sized pages could not be
    # stacked into one batch - confirm before lifting this.
    batch_size = 1
    # Fixed model input size; loop-invariant, so set once.
    target_imgsz = 1024

    # Process images in batches
    for i in range(0, total_images, batch_size):
        batch_images = image[i : i + batch_size]
        batch_size_actual = len(batch_images)

        # Preprocess batch
        processed_batch = []
        orig_shapes = []
        for img in batch_images:
            orig_h, orig_w = img.shape[:2]
            orig_shapes.append((orig_h, orig_w))

            pix = self.resize_and_pad_image(img, new_shape=target_imgsz)
            pix = np.transpose(pix, (2, 0, 1))  # CHW
            pix = pix.astype(np.float32) / 255.0  # Normalize to [0, 1]
            processed_batch.append(pix)

        # Stack batch
        batch_input = np.stack(processed_batch, axis=0)  # BCHW
        new_h, new_w = batch_input.shape[2:]

        # Run inference
        batch_preds = self.model.run(None, {"images": batch_input})[0]

        # Process each prediction in the batch
        for j in range(batch_size_actual):
            preds = batch_preds[j]
            # Keep detections whose value at index 4 exceeds 0.25.
            preds = preds[preds[..., 4] > 0.25]
            if len(preds) > 0:
                # Map boxes from the padded model input back to the page.
                preds[..., :4] = self.scale_boxes(
                    (new_h, new_w),
                    preds[..., :4],
                    orig_shapes[j],
                )
            results.append(YoloResult(boxes_data=preds, names=self._names))

    return results
@retry(
    stop=stop_after_attempt(3),  # retry at most 3 times
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # exponential backoff: starts at 1 second, capped at 10 seconds
    # Which exceptions trigger a retry.
    # NOTE(review): Exception already covers httpx.HTTPError, so every
    # exception raised below is retried - confirm that is intended.
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {retry_state.next_action.sleep} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    imgsz: int = 1024,
):
    """
    Predict document layout using the MOSEC service

    Args:
        image: A single image or a list of images; each one is either a file
            path (str) or a numpy array, as accepted by ``encode_image``
        host: Service host URL; the request goes to ``{host}/inference``
        imgsz: Image size forwarded to the service in the request payload

    Returns:
        The msgpack-decoded response body

    Raises:
        Exception: when the service answers with a status other than 200;
            decoding errors are logged and re-raised unchanged
    """
    images = image if isinstance(image, list) else [image]

    # Encode every image and pack the request with msgpack.
    payload = msgpack.packb(
        {
            "image": [encode_image(item) for item in images],
            "imgsz": imgsz,
        },
        use_bin_type=True,
    )

    response = httpx.post(
        f"{host}/inference",
        data=payload,
        headers={
            "Content-Type": "application/msgpack",
            "Accept": "application/msgpack",
        },
        timeout=300,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        return msgpack.unpackb(response.content, raw=False)
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise
def resize_and_pad_image(self, image, new_shape):
    """Letterbox an image into a canvas of the requested size.

    The image is scaled uniformly so that it fits inside ``new_shape`` and is
    then centred on the canvas; the remaining border is filled with the
    constant colour (114, 114, 114).

    Parameters:
    - image: Input image array (height and width are its first two axes)
    - new_shape: Target size, either one integer (square canvas) or a
      (height, width) pair

    Returns:
    - The resized and padded image
    """
    if isinstance(new_shape, int):
        target_h, target_w = new_shape, new_shape
    else:
        target_h, target_w = new_shape

    src_h, src_w = image.shape[:2]

    # One common factor for both axes keeps the aspect ratio intact.
    scale = min(target_h / src_h, target_w / src_w)
    scaled_h = int(round(src_h * scale))
    scaled_w = int(round(src_w * scale))

    scaled = cv2.resize(
        image, (scaled_w, scaled_h), interpolation=cv2.INTER_LINEAR
    )

    # Split the leftover space between both sides; an odd remainder goes to
    # the bottom/right edge.
    extra_h = target_h - scaled_h
    extra_w = target_w - scaled_w
    pad_top = extra_h // 2
    pad_left = extra_w // 2

    return cv2.copyMakeBorder(
        scaled,
        pad_top,
        extra_h - pad_top,
        pad_left,
        extra_w - pad_left,
        cv2.BORDER_CONSTANT,
        value=(114, 114, 114),
    )
def predict(self, image, imgsz=1024, **kwargs) -> list[YoloResult]:
    """Predict the layout of one or more page images using the RPC service.

    Args:
        image: A single image array with three dimensions, or a sequence of
            such images.
        imgsz: Accepted for interface compatibility; not used here (every
            image is forwarded to ``predict_image`` with the fixed value 800).
        **kwargs: Accepted for interface compatibility; ignored.

    Returns:
        One ``YoloResult`` per input image, in input order. An image whose
        request failed keeps the empty default result of its container; the
        failure is logged instead of being dropped silently.
    """
    # Handle single image input
    if isinstance(image, np.ndarray) and len(image.shape) == 3:
        image = [image]

    # ThreadPoolExecutor rejects max_workers=0, so answer an empty batch
    # directly instead of raising ValueError.
    if len(image) == 0:
        return []

    result_containers = [ResultContainer() for _ in image]

    # The context manager waits for all workers and always releases the pool.
    with ThreadPoolExecutor(max_workers=len(image)) as pool:
        futures = [
            pool.submit(self.predict_image, img, self.host, container, 800)
            for img, container in zip(image, result_containers, strict=True)
        ]

    # All futures are finished here; surface worker errors in the log while
    # keeping the best-effort behaviour of returning an empty result.
    for position, future in enumerate(futures):
        failure = future.exception()
        if failure is not None:
            logger.error(
                "Layout prediction failed for image %d; "
                "returning an empty result for it",
                position,
                exc_info=failure,
            )

    return [container.result for container in result_containers]
def handle_document(
    self,
    pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
    mupdf_doc: pymupdf.Document,
    translate_config,
    save_debug_image,
):
    """Run ``predict_page`` for every page on a pool of 16 worker threads.

    Args:
        pages: Pages to analyse.
        mupdf_doc: Document handed unchanged to every ``predict_page`` call.
        translate_config: Configuration handed unchanged to every call.
        save_debug_image: Callback handed unchanged to every call.

    Yields:
        The return value of ``predict_page`` for each page, in page order.
    """
    with ThreadPoolExecutor(max_workers=16) as pool:
        page_count = len(pages)
        # Every call receives the same three shared arguments.
        outcomes = pool.map(
            self.predict_page,
            pages,
            [mupdf_doc] * page_count,
            [translate_config] * page_count,
            [save_debug_image] * page_count,
        )
        yield from outcomes
@retry(
    stop=stop_after_attempt(3),  # give up after three attempts
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # exponential backoff, starting at 1 second and capped at 10 seconds
    # ``Exception`` already subsumes ``httpx.HTTPError``: every failure is retried.
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """
    Predict document layout by posting images to ``{host}/inference``.

    Args:
        image: A file path (str), an image array, or a list of either
        host: Service host URL
        _imgsz: Unused; kept for signature compatibility

    Returns:
        When the service answers with a dict: a one-element list holding that
        dict, whose "boxes" are reduced to those with a score of at least 0.7
        and extended with "xyxy", "conf", "cls_id" and "cls" keys, and whose
        "names" maps class id to label (generated here unless the service
        supplied one). Any other decoded payload is returned unchanged.

    Raises:
        Exception: If the service answers with a status other than 200, or
            if decoding/normalising the response fails (re-raised after
            logging).
    """
    # Prepare request data
    if not isinstance(image, list):
        image = [image]
    payload = {
        "image": [encode_image(item) for item in image],
    }

    # Pack data using msgpack
    packed_data = msgpack.packb(payload, use_bin_type=True)

    # Send request. Raw bytes go through ``content=``; httpx deprecates
    # passing bytes via ``data=``, which is meant for form fields.
    response = httpx.post(
        f"{host}/inference",
        content=packed_data,
        headers={
            "Content-Type": "application/msgpack",
            "Accept": "application/msgpack",
        },
        timeout=480,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = msgpack.unpackb(response.content, raw=False)
        if isinstance(result, dict):
            next_id = 0
            id_lookup = {}  # label -> class id
            names = {}  # class id -> label
            useful_result = []
            for box in result["boxes"]:
                if box["score"] < 0.7:
                    continue
                label = box["label"]
                # Look the label up by label: testing it against ``names``
                # (keyed by integer id) never matched, so every box used to
                # receive its own class id.
                cls_id = id_lookup.get(label)
                if cls_id is None:
                    next_id += 1
                    cls_id = next_id
                    id_lookup[label] = cls_id
                    names[cls_id] = label
                box["xyxy"] = box["coordinate"]
                box["conf"] = box["score"]
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise
def predict_image(
    self,
    image,
    host: str | None = None,
    result_container: ResultContainer | None = None,
    imgsz: int = 1024,
) -> YoloResult:
    """Predict the layout of a single page image using the RPC service.

    Args:
        image: Page image array; its first two axes are height and width.
        host: Accepted for interface compatibility; the request always goes
            to ``self.host``.
        result_container: Container that receives the result. A new one is
            created when omitted.
        imgsz: Accepted for interface compatibility; not used.

    Returns:
        The ``YoloResult`` stored in the container. If the service returns no
        predictions the container's existing result is returned unchanged;
        if it returns several, the last one wins.
    """
    if result_container is None:
        result_container = ResultContainer()

    # The image is sent at its own resolution, so no resize/pad step is
    # needed (the former 800x800 target was overwritten before use).
    pixel_h, pixel_w = image.shape[:2]
    pixel_shape = (pixel_h, pixel_w)
    preds = predict_layout(image, host=self.host)

    # Boxes are rescaled from the pixel grid to a grid of 72 units per inch,
    # assuming the image was rendered at ``DPI`` dots per inch.
    point_shape = (pixel_h / DPI * 72, pixel_w / DPI * 72)

    for pred in preds:
        boxes = [
            YoloBox(
                None,
                self.scale_boxes(pixel_shape, np.array(x["xyxy"]), point_shape),
                np.array(x["conf"]),
                x["cls"],
            )
            for x in pred["boxes"]
        ]
        result_container.result = YoloResult(
            boxes=boxes,
            names={int(k): v for k, v in pred["names"].items()},
        )
    return result_container.result
def handle_document(
    self,
    pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
    mupdf_doc: pymupdf.Document,
    translate_config,
    save_debug_image,
):
    """Run ``predict_page`` for every page on a pool of 16 worker threads.

    Args:
        pages: Pages to analyse.
        mupdf_doc: Document handed unchanged to every ``predict_page`` call.
        translate_config: Configuration handed unchanged to every call.
        save_debug_image: Callback handed unchanged to every call.

    Yields:
        The return value of ``predict_page`` for each page, in page order.
    """
    with ThreadPoolExecutor(max_workers=16) as pool:
        page_count = len(pages)
        # Every call receives the same three shared arguments.
        outcomes = pool.map(
            self.predict_page,
            pages,
            [mupdf_doc] * page_count,
            [translate_config] * page_count,
            [save_debug_image] * page_count,
        )
        yield from outcomes
@retry(
    stop=stop_after_attempt(3),  # give up after three attempts
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # exponential backoff, starting at 1 second and capped at 10 seconds
    # ``Exception`` already subsumes ``httpx.HTTPError``: every failure is retried.
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """
    Predict document layout by uploading one image to ``{host}/analyze``.

    Args:
        image: A file path (str) or an image array
        host: Service host URL
        _imgsz: Unused; kept for signature compatibility

    Returns:
        When the service answers with a JSON object: a one-element list
        holding that dict, whose "boxes" are reduced to those with an
        "ocr_match_score" of at least 0.7 and extended with "xyxy", "conf",
        "cls_id" and "cls" keys, and whose "names" maps class id to label
        (generated here unless the service supplied one). Any other decoded
        payload is returned unchanged.

    Raises:
        Exception: If the service answers with a status other than 200, or
            if decoding/normalising the response fails (re-raised after
            logging).
    """
    # Prepare request data
    image_data = encode_image(image)

    # Send the JPEG as a multipart file upload
    response = httpx.post(
        f"{host}/analyze?min_sim=0.7&early_stop=0.99&timeout=1800",
        files={"file": ("image.jpg", image_data, "image/jpeg")},
        headers={
            "Accept": "application/json",
        },
        timeout=1800,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = json.loads(response.text)
        if isinstance(result, dict):
            next_id = 0
            id_lookup = {}  # label -> class id
            names = {}  # class id -> label
            useful_result = []
            for box in result["boxes"]:
                if box["ocr_match_score"] < 0.7:
                    continue
                label = box["label"]
                # Look the label up by label: testing it against ``names``
                # (keyed by integer id) never matched, so every box used to
                # receive its own class id.
                cls_id = id_lookup.get(label)
                if cls_id is None:
                    next_id += 1
                    cls_id = next_id
                    id_lookup[label] = cls_id
                    names[cls_id] = label
                box["xyxy"] = box["coords"]
                box["conf"] = box["ocr_match_score"]
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise
def scale_boxes(self, img1_shape, boxes, img0_shape):
    """
    Map xyxy boxes from one image's coordinate frame into another's.

    The boxes are assumed to live in an image of shape ``img1_shape`` that
    contains a uniformly scaled, centred copy of an image of shape
    ``img0_shape``; the centring offset is subtracted and the scale undone.

    Args:
        img1_shape (tuple): (height, width) of the image the boxes refer to.
        boxes: Box coordinates as (x1, y1, x2, y2); must support elementwise
            arithmetic with a 4-element list (e.g. a numpy array).
        img0_shape (tuple): (height, width) of the target image.

    Returns:
        The rescaled boxes as (x1, y1, x2, y2).
    """
    src_h, src_w = img1_shape[0], img1_shape[1]
    dst_h, dst_w = img0_shape[0], img0_shape[1]

    # Uniform factor by which the target image was scaled to fit the source.
    ratio = min(src_h / dst_h, src_w / dst_w)

    # Border added on the left/top when the scaled image was centred.
    offset_x = round((src_w - dst_w * ratio) / 2 - 0.1)
    offset_y = round((src_h - dst_h * ratio) / 2 - 0.1)

    offsets = [offset_x, offset_y, offset_x, offset_y]
    return (boxes - offsets) / ratio
def handle_document(
    self,
    pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
    mupdf_doc: pymupdf.Document,
    translate_config,
    save_debug_image,
):
    """Run ``predict_page`` for every page on a pool of 4 worker threads.

    Args:
        pages: Pages to analyse.
        mupdf_doc: Document handed unchanged to every ``predict_page`` call.
        translate_config: Configuration handed unchanged to every call.
        save_debug_image: Callback handed unchanged to every call.

    Yields:
        The return value of ``predict_page`` for each page, in page order.
    """
    with ThreadPoolExecutor(max_workers=4) as pool:
        page_count = len(pages)
        # Every call receives the same three shared arguments.
        outcomes = pool.map(
            self.predict_page,
            pages,
            [mupdf_doc] * page_count,
            [translate_config] * page_count,
            [save_debug_image] * page_count,
        )
        yield from outcomes
@retry(
    stop=stop_after_attempt(3),  # give up after three attempts
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # exponential backoff, starting at 1 second and capped at 10 seconds
    # ``Exception`` already subsumes ``httpx.HTTPError``: every failure is retried.
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """
    Predict document layout by posting images to ``{host}/inference``.

    Args:
        image: A file path (str), an image array, or a list of either
        host: Service host URL
        _imgsz: Unused; kept for signature compatibility

    Returns:
        When the service answers with a dict: a one-element list holding that
        dict, whose "boxes" are reduced to those with a score of at least 0.7
        and extended with "xyxy", "conf", "cls_id" and "cls" keys, and whose
        "names" maps class id to label (generated here unless the service
        supplied one). Any other decoded payload is returned unchanged.

    Raises:
        Exception: If the service answers with a status other than 200, or
            if decoding/normalising the response fails (re-raised after
            logging).
    """
    # Prepare request data
    if not isinstance(image, list):
        image = [image]
    payload = {
        "image": [encode_image(item) for item in image],
    }

    # Pack data using msgpack
    packed_data = msgpack.packb(payload, use_bin_type=True)

    # Send request. Raw bytes go through ``content=``; httpx deprecates
    # passing bytes via ``data=``, which is meant for form fields.
    response = httpx.post(
        f"{host}/inference",
        content=packed_data,
        headers={
            "Content-Type": "application/msgpack",
            "Accept": "application/msgpack",
        },
        timeout=480,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = msgpack.unpackb(response.content, raw=False)
        if isinstance(result, dict):
            next_id = 0
            id_lookup = {}  # label -> class id
            names = {}  # class id -> label
            useful_result = []
            for box in result["boxes"]:
                if box["score"] < 0.7:
                    continue
                label = box["label"]
                # Look the label up by label: testing it against ``names``
                # (keyed by integer id) never matched, so every box used to
                # receive its own class id.
                cls_id = id_lookup.get(label)
                if cls_id is None:
                    next_id += 1
                    cls_id = next_id
                    id_lookup[label] = cls_id
                    names[cls_id] = label
                box["xyxy"] = box["coordinate"]
                box["conf"] = box["score"]
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise
def predict_image(
    self,
    image,
    host: str | None = None,
    result_container: ResultContainer | None = None,
    imgsz: int = 1024,
) -> YoloResult:
    """Predict the layout of a single page image using the RPC service.

    Args:
        image: Page image array; its first two axes are height and width.
        host: Accepted for interface compatibility; the request always goes
            to ``self.host``.
        result_container: Container that receives the result. A new one is
            created when omitted.
        imgsz: Accepted for interface compatibility; not used.

    Returns:
        The ``YoloResult`` stored in the container. If the service returns no
        predictions the container's existing result is returned unchanged;
        if it returns several, the last one wins.
    """
    if result_container is None:
        result_container = ResultContainer()

    # The image is sent at its own resolution, so no resize/pad step is
    # needed (the former 800x800 target was overwritten before use).
    pixel_h, pixel_w = image.shape[:2]
    pixel_shape = (pixel_h, pixel_w)
    preds = predict_layout(image, host=self.host)

    # Boxes are rescaled from the pixel grid to a grid of 72 units per inch,
    # assuming the image was rendered at ``DPI`` dots per inch.
    point_shape = (pixel_h / DPI * 72, pixel_w / DPI * 72)

    for pred in preds:
        boxes = [
            YoloBox(
                None,
                self.scale_boxes(pixel_shape, np.array(x["xyxy"]), point_shape),
                np.array(x["conf"]),
                x["cls"],
            )
            for x in pred["boxes"]
        ]
        result_container.result = YoloResult(
            boxes=boxes,
            names={int(k): v for k, v in pred["names"].items()},
        )
    return result_container.result
def handle_document(
    self,
    pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
    mupdf_doc: pymupdf.Document,
    translate_config,
    save_debug_image,
):
    """Run ``predict_page`` for every page on a single worker thread.

    Args:
        pages: Pages to analyse.
        mupdf_doc: Document handed unchanged to every ``predict_page`` call.
        translate_config: Configuration handed unchanged to every call.
        save_debug_image: Callback handed unchanged to every call.

    Yields:
        The return value of ``predict_page`` for each page, in page order.
    """
    with ThreadPoolExecutor(max_workers=1) as pool:
        page_count = len(pages)
        # Every call receives the same three shared arguments.
        outcomes = pool.map(
            self.predict_page,
            pages,
            [mupdf_doc] * page_count,
            [translate_config] * page_count,
            [save_debug_image] * page_count,
        )
        yield from outcomes
@retry(
    stop=stop_after_attempt(3),  # Give up after three attempts in total
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # Exponential backoff, waiting between 1 and 10 seconds
    # ``Exception`` already covers ``httpx.HTTPError``: every failure is retried
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """Request a layout prediction for one image from the RPC service.

    The image is JPEG-encoded by ``encode_image`` and posted as the ``file``
    upload to ``{host}/analyze_hybrid``.

    Args:
        image: File path (str) or numpy array, as accepted by
            ``encode_image``.
        host: Base URL of the service.
        _imgsz: Not read by this function.

    Returns:
        When the response is a JSON object: a one-element list holding that
        object, with ``"boxes"`` set to its clusters (each extended with
        ``xyxy``, ``conf``, ``cls_id`` and ``cls``) and ``"names"`` mapping
        class id to label if the service did not provide one. Any other
        JSON payload is returned unchanged.

    Raises:
        Exception: On a non-200 response or an unparsable payload (the
            latter is logged and re-raised); the retry decorator then
            decides whether another attempt is made.
    """
    image_data = encode_image(image)

    response = httpx.post(
        f"{host}/analyze_hybrid?min_sim=0.7&early_stop=0.99&timeout=1800",
        files={"file": ("image.jpg", image_data, "image/jpeg")},
        headers={
            "Accept": "application/json",
        },
        timeout=1800,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.text}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = json.loads(response.text)
        if isinstance(result, dict):
            useful_result = []
            names = {}
            # Look labels up by label, not in ``names`` (whose keys are the
            # integer ids), so equal labels share one class id.
            id_lookup = {}
            for box in result["clusters"]:
                label = box["label"]
                if label not in id_lookup:
                    id_lookup[label] = len(id_lookup) + 1  # ids start at 1
                cls_id = id_lookup[label]
                names[cls_id] = label
                box["xyxy"] = box["box"]
                box["conf"] = 1  # the service reports no confidence
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise
def scale_boxes(self, img1_shape, boxes, img0_shape):
    """Map xyxy boxes from one image shape onto another.

    The boxes are assumed to live in an image of ``img1_shape`` that was
    produced from an image of ``img0_shape`` by uniform scaling plus
    centred padding; the padding is removed and the scale undone.

    Args:
        img1_shape (tuple): (height, width) of the image the boxes refer to.
        boxes (array-like): Box coordinates as (x1, y1, x2, y2); a numpy
            array or anything ``numpy.asarray`` accepts (e.g. a list).
        img0_shape (tuple): (height, width) of the target image.

    Returns:
        numpy.ndarray: The boxes expressed in ``img0_shape`` coordinates,
        in (x1, y1, x2, y2) order.
    """
    # Single scale factor that fits img0 inside img1.
    gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

    # Padding added on the left/top; the -0.1 biases exact .5 values downwards.
    pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
    pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

    # Coerce first so plain Python sequences work as well as arrays.
    offsets = np.array([pad_x, pad_y, pad_x, pad_y])
    return (np.asarray(boxes) - offsets) / gain
def handle_document(
    self,
    pages: "list[babeldoc.format.pdf.document_il.il_version_1.Page]",
    mupdf_doc: "pymupdf.Document",
    translate_config,
    save_debug_image,
):
    """Yield the ``predict_page`` result for every page, in page order.

    A single worker thread is used, so pages are predicted one at a time.

    Args:
        pages: Pages to analyse; any iterable is accepted.
        mupdf_doc: Document handed to ``predict_page`` for every page.
        translate_config: Configuration handed to ``predict_page``.
        save_debug_image: Callback handed to ``predict_page``.

    Yields:
        Whatever ``self.predict_page`` returns for each page.
    """
    from itertools import repeat

    with ThreadPoolExecutor(max_workers=1) as executor:
        # ``map`` ends with the shortest iterable (``pages``), so the
        # constant arguments can be supplied by endless ``repeat`` iterators.
        yield from executor.map(
            self.predict_page,
            pages,
            repeat(mupdf_doc),
            repeat(translate_config),
            repeat(save_debug_image),
        )
def clip_num(num: float, min_value: float, max_value: float) -> float:
    """Clamp ``num`` to the range ``[min_value, max_value]``.

    The lower bound is tested first, so it wins if the bounds are inverted.
    """
    if num < min_value:
        return min_value
    return max_value if num > max_value else num
" f"(Attempt {retry_state.attempt_number}/5)" ), ) def predict_layout( image, host: str = "http://localhost:8000", _imgsz: int = 1024, lines=None, font_mapper: FontMapper | None = None, ): """Predict document layout using OCR line information (RPC service).""" if lines is None: lines = [] image_data = encode_image(image) def convert_line(line): if not line.text: return None boxes = [c[0] for c in line.chars] min_x = min(b.x for b in boxes) max_x = max(b.x2 for b in boxes) min_y = min(b.y for b in boxes) max_y = max(b.y2 for b in boxes) image_height, image_width = image.shape[:2] # Transform to image pixel coordinates min_x = min_x / 72 * DPI max_x = max_x / 72 * DPI min_y = min_y / 72 * DPI max_y = max_y / 72 * DPI min_y, max_y = image_height - max_y, image_height - min_y box_volume = (max_x - min_x) * (max_y - min_y) if box_volume < 1: return None min_x = clip_num(min_x, 0, image_width - 1) max_x = clip_num(max_x, 0, image_width - 1) min_y = clip_num(min_y, 0, image_height - 1) max_y = clip_num(max_y, 0, image_height - 1) filtered_text = filter_text(line.text, font_mapper) if not filtered_text: return None return {"box": [min_x, min_y, max_x, max_y], "text": filtered_text} formatted_results = [convert_line(l) for l in lines] formatted_results = [r for r in formatted_results if r is not None] if not formatted_results: return None image_b64 = base64.b64encode(image_data).decode("utf-8") request_data = { "image": image_b64, "ocr_results": formatted_results, "image_size": list(image.shape[:2])[::-1], # (height, width) } response = httpx.post( f"{host}/inference", json=request_data, headers={"Accept": "application/json", "Content-Type": "application/json"}, timeout=30, follow_redirects=True, ) idx = 0 id_lookup = {} if response.status_code == 200: try: result = json.loads(response.text) useful_result = [] if isinstance(result, dict): names = {} clusters = result["clusters"] for box in clusters: box["xyxy"] = box["box"] box["conf"] = 1 if box["label"] not in names: idx 
@retry(
    stop=stop_after_attempt(5),  # Give up after five attempts in total
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # Exponential backoff, waiting between 1 and 10 seconds
    # ``Exception`` already covers ``httpx.HTTPError``: every failure is retried
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed PADDLE, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/5)"
    ),
)
def predict_layout2(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """Request a layout prediction from the msgpack-based RPC service.

    Each image is JPEG-encoded by ``encode_image``; the list of encoded
    images is msgpack-packed and posted to ``{host}/inference``.

    Args:
        image: A single image or a list of images, each a file path (str)
            or numpy array as accepted by ``encode_image``.
        host: Base URL of the service.
        _imgsz: Not read by this function.

    Returns:
        When the unpacked response is a dict: a one-element list holding
        it, with ``"boxes"`` reduced to the boxes scoring at least 0.7
        (each extended with ``xyxy``, ``conf``, ``cls_id`` and ``cls``) and
        ``"names"`` mapping class id to label if the service did not
        provide one. Any other payload is returned unchanged.

    Raises:
        Exception: On a non-200 response or an unparsable payload (the
            latter is logged and re-raised); the retry decorator then
            decides whether another attempt is made.
    """
    images = image if isinstance(image, list) else [image]
    data = {
        "image": [encode_image(single_image) for single_image in images],
    }
    packed_data = msgpack.packb(data, use_bin_type=True)

    response = httpx.post(
        f"{host}/inference",
        # Raw bytes belong in ``content``; ``data`` is meant for form fields.
        content=packed_data,
        headers={
            "Content-Type": "application/msgpack",
            "Accept": "application/msgpack",
        },
        timeout=30,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = msgpack.unpackb(response.content, raw=False)
        if isinstance(result, dict):
            useful_result = []
            names = {}
            # Look labels up by label, not in ``names`` (whose keys are the
            # integer ids), so equal labels share one class id.
            id_lookup = {}
            for box in result["boxes"]:
                if box["score"] < 0.7:
                    continue  # drop low-confidence detections
                label = box["label"]
                if label not in id_lookup:
                    id_lookup[label] = len(id_lookup) + 1  # ids start at 1
                cls_id = id_lookup[label]
                names[cls_id] = label
                box["xyxy"] = box["coordinate"]
                box["conf"] = box["score"]
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise
def resize_and_pad_image(self, image, new_shape):
    """Fit ``image`` into ``new_shape`` and pad the remainder.

    The image is scaled by one factor for both axes so that it fits inside
    the target, then a constant (114, 114, 114) border is added so the
    result has exactly the target size; the padding is split between the
    two sides with the extra pixel, if any, going to the bottom/right.

    Parameters:
    - image: Input image
    - new_shape: Target size (integer or (height, width) tuple)

    Returns:
    - Processed image
    """
    target_h, target_w = (
        (new_shape, new_shape) if isinstance(new_shape, int) else new_shape
    )
    src_h, src_w = image.shape[:2]

    # Largest uniform scale that still fits inside the target
    scale = min(target_h / src_h, target_w / src_w)
    scaled_h = int(round(src_h * scale))
    scaled_w = int(round(src_w * scale))

    resized = cv2.resize(
        image, (scaled_w, scaled_h), interpolation=cv2.INTER_LINEAR
    )

    # Distribute the leftover rows/columns around the resized image
    spare_h = target_h - scaled_h
    spare_w = target_w - scaled_w
    pad_top = spare_h // 2
    pad_left = spare_w // 2

    return cv2.copyMakeBorder(
        resized,
        pad_top,
        spare_h - pad_top,
        pad_left,
        spare_w - pad_left,
        cv2.BORDER_CONSTANT,
        value=(114, 114, 114),
    )
def calculate_iou(self, box1, box2):
    """Return the intersection-over-union of two xyxy boxes.

    Boxes that do not overlap, or only touch along an edge, yield 0.0, as
    does a pair whose union area is not positive.
    """
    a_left, a_top, a_right, a_bottom = box1
    b_left, b_top, b_right, b_bottom = box2

    # Corners of the overlapping rectangle
    left = max(a_left, b_left)
    top = max(a_top, b_top)
    right = min(a_right, b_right)
    bottom = min(a_bottom, b_bottom)

    if right <= left or bottom <= top:
        return 0.0

    overlap = (right - left) * (bottom - top)
    area_a = (a_right - a_left) * (a_bottom - a_top)
    area_b = (b_right - b_left) * (b_bottom - b_top)
    total = area_a + area_b - overlap

    if total > 0:
        return overlap / total
    return 0.0
def predict_image(
    self,
    image,
    imgsz: int = 1024,
    lines=None,
) -> YoloResult:
    """Predict the layout of a single page and fuse results from two RPC services.

    ``predict_layout`` (host1) is only queried when ``lines`` is non-empty;
    ``predict_layout2`` (host2) is always queried. Both requests run in
    parallel and any exception raised by either is propagated.

    Args:
        image: Page image as an array whose first two dimensions are
            (height, width); it is sent to the services unscaled.
        imgsz: Forwarded to both service calls.
        lines: Text lines forwarded to ``predict_layout``.

    Returns:
        A ``YoloResult`` with the boxes of both services, rescaled by
        ``72 / DPI``. Class ids from host1 are offset by 1000 and their
        labels get a ``"_hybrid"`` suffix; class ids from host2 are offset
        by 2000.
    """
    # The image is used at its own size, so no resize/pad step is needed
    # and box coordinates are relative to this shape.
    orig_h, orig_w = image.shape[:2]
    target_imgsz = (orig_h, orig_w)

    # Parallel calls to both services; exceptions propagate if either fails
    with ThreadPoolExecutor(max_workers=2) as ex:
        future1 = (
            ex.submit(
                predict_layout,
                image,
                self.host1,
                imgsz,
                lines,
                self.font_mapper,
            )
            if lines
            else None
        )
        future2 = ex.submit(predict_layout2, image, self.host2, imgsz)

        # .result() re-raises any exception that occurred in the worker thread
        preds1 = future1.result() if future1 is not None else None
        preds2 = future2.result()

    # Convert DPI pixels to PDF points (72 dpi)
    pdf_h, pdf_w = orig_h / DPI * 72, orig_w / DPI * 72

    merged_boxes: list[YoloBox] = []
    names: dict[int, str] = {}

    def _process_preds(preds, id_offset: int, label_suffix: str | None):
        """Append the rescaled boxes of ``preds`` and register their labels."""
        for pred in preds or []:
            for box in pred["boxes"]:
                # scale coords back to PDF space
                scaled_xyxy = self.scale_boxes(
                    target_imgsz, np.array(box["xyxy"]), (pdf_h, pdf_w)
                )
                new_cls_id = box["cls"] + id_offset
                # derive label – fall back gracefully if missing
                label = pred["names"].get(box["cls"], str(box["cls"]))
                if label_suffix:
                    label = f"{label}{label_suffix}"
                names[new_cls_id] = label
                merged_boxes.append(
                    YoloBox(
                        None,
                        scaled_xyxy,
                        np.array(box.get("conf", box.get("score", 1.0))),
                        new_cls_id,
                    )
                )

    # service-1: +1000 id, add "_hybrid" suffix
    if preds1:
        _process_preds(preds1, 1000, "_hybrid")
    # service-2: +2000 id, label unchanged
    _process_preds(preds2, 2000, None)

    # Sort boxes by confidence, highest first
    merged_boxes.sort(key=lambda b: b.conf, reverse=True)

    # Post-process boxes to handle text and paragraph_hybrid overlaps
    self.post_process_boxes(merged_boxes, names)

    return YoloResult(boxes=merged_boxes, names=names)
def handle_document(  # type: ignore[override]
    self,
    pages: list["babeldoc.format.pdf.document_il.il_version_1.Page"],
    mupdf_doc: "pymupdf.Document",
    translate_config,
    save_debug_image,
):
    """Yield the ``predict_page`` result for every page, in page order.

    The document is first saved to the working file ``layout.temp.pdf``;
    ``predict_page`` receives that path instead of the open document. Up
    to 32 pages are processed concurrently.

    Args:
        pages: Pages to analyse; any iterable is accepted.
        mupdf_doc: Document to save to the temporary file.
        translate_config: Supplies the working file path and is handed to
            ``predict_page``.
        save_debug_image: Callback handed to ``predict_page``.

    Yields:
        Whatever ``self.predict_page`` returns for each page.
    """
    from itertools import repeat

    layout_temp_path = translate_config.get_working_file_path("layout.temp.pdf")
    mupdf_doc.save(layout_temp_path.as_posix())

    with ThreadPoolExecutor(max_workers=32) as executor:
        # ``map`` ends with the shortest iterable (``pages``), so the
        # constant arguments can be supplied by endless ``repeat`` iterators.
        yield from executor.map(
            self.predict_page,
            pages,
            repeat(layout_temp_path),
            repeat(translate_config),
            repeat(save_debug_image),
        )
@retry(
    stop=stop_after_attempt(3),  # Give up after three attempts in total
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # Exponential backoff, waiting between 1 and 10 seconds
    # ``Exception`` already covers ``httpx.HTTPError``: every failure is retried
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
    lines: list[babeldoc.format.pdf.document_il.utils.extract_char.Line] | None = None,
):
    """Request a layout prediction for a page image and its text lines.

    The JPEG-encoded image (base64) and one ``{"box", "text"}`` entry per
    line are posted as JSON to ``{host}/inference``.

    Args:
        image: Page image. NOTE(review): ``encode_image`` also accepts a
            file path, but this function reads ``image.shape``, so an
            array is required here.
        host: Base URL of the service.
        _imgsz: Not read by this function.
        lines: Text lines of the page; ``None`` is treated as no lines.

    Returns:
        When the response is a JSON object: a one-element list holding that
        object, with ``"boxes"`` set to its clusters (each extended with
        ``xyxy``, ``conf``, ``cls_id`` and ``cls``) and ``"names"`` mapping
        class id to label if the service did not provide one. Any other
        JSON payload is returned unchanged.

    Raises:
        Exception: On a non-200 response or an unparsable payload (the
            latter is logged and re-raised); the retry decorator then
            decides whether another attempt is made.
    """
    image_data = encode_image(image)
    image_height = image.shape[0]

    def convert_line(line: babeldoc.format.pdf.document_il.utils.extract_char.Line):
        """Return the line's text and bounding box in image pixel coordinates."""
        boxes = [c[0] for c in line.chars]
        # Character boxes are scaled from 72-per-inch units to DPI pixels.
        min_x = min(b.x for b in boxes) / 72 * DPI
        max_x = max(b.x2 for b in boxes) / 72 * DPI
        min_y = min(b.y for b in boxes) / 72 * DPI
        max_y = max(b.y2 for b in boxes) / 72 * DPI
        # Flip the y axis so coordinates are measured from the image top.
        min_y, max_y = image_height - max_y, image_height - min_y
        return {"box": [min_x, min_y, max_x, max_y], "text": line.text}

    # ``lines`` defaults to None; iterate an empty list in that case.
    formatted_results = [convert_line(line) for line in lines or []]

    image_b64 = base64.b64encode(image_data).decode("utf-8")
    request_data = {
        "image": image_b64,
        "ocr_results": formatted_results,
        "image_size": list(image.shape[:2])[::-1],  # reversed shape: (width, height)
    }

    response = httpx.post(
        f"{host}/inference",
        json=request_data,
        headers={"Accept": "application/json", "Content-Type": "application/json"},
        timeout=1800,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.text}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = json.loads(response.text)
        if isinstance(result, dict):
            useful_result = []
            names = {}
            # Look labels up by label, not in ``names`` (whose keys are the
            # integer ids), so equal labels share one class id.
            id_lookup = {}
            for box in result["clusters"]:
                label = box["label"]
                if label not in id_lookup:
                    id_lookup[label] = len(id_lookup) + 1  # ids start at 1
                cls_id = id_lookup[label]
                names[cls_id] = label
                box["xyxy"] = box["box"]
                box["conf"] = 1  # the service reports no confidence
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise
def scale_boxes(self, img1_shape, boxes, img0_shape):
    """Map xyxy boxes from one image shape onto another.

    The boxes are assumed to live in an image of ``img1_shape`` that was
    produced from an image of ``img0_shape`` by uniform scaling plus
    centred padding; the padding is removed and the scale undone.

    Args:
        img1_shape (tuple): (height, width) of the image the boxes refer to.
        boxes (array-like): Box coordinates as (x1, y1, x2, y2); a numpy
            array or anything ``numpy.asarray`` accepts (e.g. a list).
        img0_shape (tuple): (height, width) of the target image.

    Returns:
        numpy.ndarray: The boxes expressed in ``img0_shape`` coordinates,
        in (x1, y1, x2, y2) order.
    """
    height_gain = img1_shape[0] / img0_shape[0]
    width_gain = img1_shape[1] / img0_shape[1]
    gain = min(height_gain, width_gain)

    # Padding added on the left/top; the -0.1 biases exact .5 values downwards.
    pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
    pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

    # Coerce first so plain Python sequences work as well as arrays.
    unpadded = np.asarray(boxes) - np.array([pad_x, pad_y, pad_x, pad_y])
    return unpadded / gain
def handle_document(
    self,
    pages: list[il_version_1.Page],
    mupdf_doc: pymupdf.Document,
    translate_config,
    save_debug_image,
):
    """Run ``predict_page`` over every page and yield the results in order.

    Each page is submitted to a single-worker thread pool together with the
    same ``mupdf_doc``, ``translate_config`` and ``save_debug_image``
    arguments; whatever ``predict_page`` returns is yielded in page order.
    """
    page_count = len(pages)
    # One copy of each shared argument per page so the four sequences line
    # up element-for-element when handed to ``executor.map``.
    docs = [mupdf_doc] * page_count
    configs = [translate_config] * page_count
    debug_savers = [save_debug_image] * page_count

    with ThreadPoolExecutor(max_workers=1) as executor:
        results = executor.map(
            self.predict_page,
            pages,
            docs,
            configs,
            debug_savers,
        )
        yield from results
def create_yolo_result_from_nested_coords(nested_coords: np.ndarray, names: dict):
    """
    Build a YoloResult from an array of (x1, y1, x2, y2) rows.

    Args:
        nested_coords: Array whose rows each hold four coordinates; rows that
            do not contain exactly four entries are skipped.
        names: Class-id to class-name mapping, passed through to the result
            unchanged.

    Returns:
        YoloResult whose boxes all carry confidence 1.0 and class id 0.
    """
    # Keep only rows that really are 4-element boxes.
    valid_rows = (row for row in nested_coords.tolist() if len(row) == 4)

    boxes = [
        YoloBox(
            xyxy=[float(value) for value in row],
            conf=np.array(1.0),
            cls=0,
        )
        for row in valid_rows
    ]
    return YoloResult(names=names, boxes=boxes)
def resize_and_pad_image(self, image, new_shape):
    """
    Scale an image to fit inside ``new_shape`` and pad it with grey borders.

    The aspect ratio is preserved. The padding added on each axis is the
    remaining space modulo ``self.stride``, split between both sides (the
    extra pixel, if any, goes to the bottom/right).

    Parameters:
    - image: Input image (array with height and width as its first two axes)
    - new_shape: Target size (integer or (height, width) tuple)

    Returns:
    - Processed image
    """
    target_h, target_w = (
        (new_shape, new_shape) if isinstance(new_shape, int) else new_shape
    )
    src_h, src_w = image.shape[:2]

    # Largest uniform scale at which the image still fits in the target.
    scale = min(target_h / src_h, target_w / src_w)
    scaled_h = int(round(src_h * scale))
    scaled_w = int(round(src_w * scale))

    image = cv2.resize(
        image,
        (scaled_w, scaled_h),
        interpolation=cv2.INTER_LINEAR,
    )

    # Only the remainder modulo the stride is padded, not the full gap.
    extra_h = (target_h - scaled_h) % self.stride
    extra_w = (target_w - scaled_w) % self.stride
    top = extra_h // 2
    bottom = extra_h - top
    left = extra_w // 2
    right = extra_w - left

    return cv2.copyMakeBorder(
        image,
        top,
        bottom,
        left,
        right,
        cv2.BORDER_CONSTANT,
        value=(114, 114, 114),
    )
def predict(self, image, imgsz=800, batch_size=16, **kwargs):
    """
    Detect text boxes on a single page image.

    Args:
        image: A single page image as a 3-dimensional NumPy array.
        imgsz: Accepted for interface compatibility; not consulted by this
            implementation, which always works at a fixed size of 1024.
        batch_size: Accepted for interface compatibility; unused.
        **kwargs: Additional arguments; unused.

    Returns:
        A YoloResult containing the detected boxes, expressed in the
        coordinate system of the input image. The result is empty when the
        detector reports nothing.
    """
    # Only a single image is supported.
    assert isinstance(image, np.ndarray) and len(image.shape) == 3

    # NOTE: the working resolution is hard-coded; ``imgsz`` is ignored.
    target_imgsz = 1024
    orig_shape = (image.shape[0], image.shape[1])

    input_ = self.resize_and_pad_image(image, new_shape=target_imgsz)
    new_h, new_w = input_.shape[:2]

    # Run detection only (no classification / recognition).
    preds = self.model(input_, use_det=True, use_cls=False, use_rec=False)

    # The first element carries the detected polygons. It may be None (or
    # empty) when nothing was found; indexing it blindly would raise, so
    # treat that case as "no boxes".
    detections = preds[0] if len(preds) > 0 else None
    if detections is None or len(detections) == 0:
        return YoloResult(names=self.names, boxes=[])

    # Keep corner points 0 and 2 of every polygon and flatten them into
    # (x1, y1, x2, y2) rows.
    preds_np = np.array(detections)[:, [0, 2], :].reshape([-1, 4])

    # Map the boxes from the resized/padded image back to the input image.
    preds_np[..., :4] = self.scale_boxes(
        (new_h, new_w),
        preds_np[..., :4],
        orig_shape,
    )
    return create_yolo_result_from_nested_coords(preds_np, self.names)
tuple[babeldoc.format.pdf.document_il.il_version_1.Page, YoloResult], None, None ]: for page in pages: translate_config.raise_if_cancelled() with self.lock: # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72) pix = get_no_rotation_img(mupdf_doc[page.page_number]) image = np.frombuffer(pix.samples, np.uint8).reshape( pix.height, pix.width, 3, )[:, :, ::-1] table_boxes = [] for layout in page.page_layout: if layout.class_name == "table": table_boxes.append(layout.box) predict_result = self.predict(image) ok_boxes = [] for box in predict_result.boxes: # Convert the box coordinates to float for proper comparison box_xyxy = [float(coord) for coord in box.xyxy] # Check if this box is inside any of the table boxes for table_box in table_boxes: # Determine if box is inside or overlapping with table_box with image dimensions if self._is_box_in_table( box_xyxy, table_box, page, image.shape[1], image.shape[0] ): ok_boxes.append(box) break yolo_result = YoloResult(names=self.names, boxes=ok_boxes) save_debug_image( image, yolo_result, page.page_number + 1, ) yield page, yolo_result def _is_box_in_table(self, box_xyxy, table_box, page, img_width, img_height): """ Check if a box from image coordinates is inside a table box from PDF coordinates. 
Args: box_xyxy (list): Box coordinates in image coordinate system [x1, y1, x2, y2] table_box (Box): Table box in PDF coordinate system page: The page object containing information for coordinate conversion img_width: Width of the image img_height: Height of the image Returns: bool: True if the box is inside or significantly overlapping with the table box """ # Get table box coordinates in PDF coordinate system table_pdf_x1 = table_box.x table_pdf_y1 = table_box.y table_pdf_x2 = table_box.x2 table_pdf_y2 = table_box.y2 # Convert table box to image coordinates table_img_x1 = table_pdf_x1 table_img_y1 = img_height - table_pdf_y2 table_img_x2 = table_pdf_x2 table_img_y2 = img_height - table_pdf_y1 # Now check for overlap between the boxes # Calculate the area of overlap x_overlap = max( 0, min(box_xyxy[2], table_img_x2) - max(box_xyxy[0], table_img_x1) ) y_overlap = max( 0, min(box_xyxy[3], table_img_y2) - max(box_xyxy[1], table_img_y1) ) overlap_area = x_overlap * y_overlap # Calculate area of the detected box box_area = (box_xyxy[2] - box_xyxy[0]) * (box_xyxy[3] - box_xyxy[1]) # If overlap area is significant relative to the box area, consider it inside if box_area > 0 and overlap_area / box_area > 0.5: return True return False ================================================ FILE: babeldoc/format/__init__.py ================================================ ================================================ FILE: babeldoc/format/pdf/__init__.py ================================================ ================================================ FILE: babeldoc/format/pdf/babelpdf/base14.py ================================================ from .encoding import get_type1_encoding from .win_core import win_core base14_bbox = { "Courier-BoldOblique": { ".notdef": (0, 0, 0, 0), "exclam": (216, -15, 495, 572), "quotedbl": (212, 277, 584, 562), "numbersign": (88, -45, 640, 651), "dollar": (87, -126, 629, 666), "percent": (102, -15, 624, 616), "ampersand": (62, -15, 594, 543), 
"quoteright": (230, 277, 542, 562), "parenleft": (266, -102, 592, 616), "parenright": (117, -102, 443, 616), "asterisk": (179, 219, 597, 601), "plus": (114, 39, 596, 478), "comma": (99, -111, 430, 174), "hyphen": (143, 203, 567, 313), "period": (207, -15, 426, 171), "slash": (91, -77, 626, 626), "zero": (137, -15, 591, 616), "one": (93, 0, 561, 616), "two": (61, 0, 593, 616), "three": (72, -15, 570, 616), "four": (82, 0, 558, 616), "five": (77, -15, 621, 601), "six": (136, -15, 652, 616), "seven": (147, 0, 622, 601), "eight": (116, -15, 603, 616), "nine": (76, -15, 591, 616), "colon": (206, -15, 479, 425), "semicolon": (99, -111, 480, 425), "less": (121, 15, 612, 501), "equal": (96, 118, 614, 398), "greater": (97, 15, 589, 501), "question": (183, -14, 591, 580), "at": (67, -15, 641, 616), "A": (-9, 0, 631, 562), "B": (30, 0, 628, 562), "C": (75, -18, 674, 580), "D": (30, 0, 663, 562), "E": (25, 0, 669, 562), "F": (39, 0, 683, 562), "G": (75, -18, 674, 580), "H": (20, 0, 699, 562), "I": (77, 0, 642, 562), "J": (59, -18, 720, 562), "K": (21, 0, 691, 562), "L": (39, 0, 635, 562), "M": (-2, 0, 721, 562), "N": (8, -12, 729, 562), "O": (75, -18, 645, 580), "P": (48, 0, 642, 562), "Q": (84, -138, 635, 580), "R": (24, 0, 617, 562), "S": (54, -22, 672, 582), "T": (86, 0, 678, 562), "U": (101, -18, 715, 562), "V": (84, 0, 732, 562), "W": (84, 0, 737, 562), "X": (12, 0, 689, 562), "Y": (109, 0, 708, 562), "Z": (62, 0, 636, 562), "bracketleft": (223, -102, 606, 616), "backslash": (223, -77, 496, 626), "bracketright": (103, -102, 486, 616), "asciicircum": (171, 250, 555, 616), "underscore": (-27, -125, 584, -75), "quoteleft": (297, 277, 487, 562), "a": (62, -15, 592, 454), "b": (13, -15, 635, 626), "c": (82, -15, 631, 459), "d": (61, -15, 644, 626), "e": (82, -15, 604, 454), "f": (83, 0, 677, 626), "g": (41, -146, 673, 454), "h": (18, 0, 614, 626), "i": (77, 0, 545, 658), "j": (37, -146, 580, 658), "k": (33, 0, 642, 626), "l": (77, 0, 545, 626), "m": (-22, 0, 648, 454), "n": 
(18, 0, 614, 454), "o": (72, -15, 622, 454), "p": (-31, -142, 621, 454), "q": (61, -142, 684, 454), "r": (47, 0, 654, 454), "s": (67, -17, 607, 459), "t": (118, -15, 566, 562), "u": (70, -15, 591, 439), "v": (70, 0, 694, 439), "w": (53, 0, 711, 439), "x": (6, 0, 670, 439), "y": (-20, -142, 694, 439), "z": (81, 0, 613, 439), "braceleft": (204, -102, 595, 616), "bar": (202, -250, 504, 750), "braceright": (114, -102, 506, 616), "asciitilde": (120, 153, 589, 356), "exclamdown": (197, -146, 476, 449), "cent": (122, -49, 604, 614), "sterling": (107, -28, 650, 611), "fraction": (22, -60, 707, 661), "yen": (98, 0, 709, 562), "florin": (-56, -131, 701, 616), "section": (74, -70, 619, 580), "currency": (77, 49, 643, 517), "quotesingle": (304, 277, 492, 562), "quotedblleft": (190, 277, 594, 562), "guillemotleft": (63, 70, 638, 446), "guilsinglleft": (196, 70, 544, 446), "guilsinglright": (166, 70, 514, 446), "fi": (12, 0, 643, 626), "fl": (12, 0, 643, 626), "endash": (108, 203, 602, 313), "dagger": (176, -70, 586, 580), "daggerdbl": (122, -70, 586, 580), "periodcentered": (250, 165, 460, 351), "paragraph": (61, -70, 699, 580), "bullet": (197, 132, 523, 430), "quotesinglbase": (145, -142, 457, 143), "quotedblbase": (35, -142, 559, 143), "quotedblright": (120, 277, 644, 562), "guillemotright": (72, 70, 647, 446), "ellipsis": (36, -15, 586, 116), "perthousand": (-44, -15, 742, 616), "questiondown": (102, -146, 509, 449), "grave": (272, 508, 503, 661), "acute": (313, 508, 608, 661), "circumflex": (212, 483, 606, 657), "tilde": (200, 493, 642, 636), "macron": (195, 505, 636, 585), "breve": (217, 468, 651, 631), "dotaccent": (347, 485, 489, 625), "dieresis": (245, 485, 591, 625), "ring": (319, 481, 527, 678), "cedilla": (169, -206, 366, 0), "hungarumlaut": (172, 488, 728, 661), "ogonek": (144, -199, 350, 0), "caron": (238, 493, 632, 667), "emdash": (33, 203, 677, 313), "AE": (-29, 0, 707, 562), "ordfeminine": (189, 196, 526, 580), "Lslash": (39, 0, 635, 562), "Oslash": (48, -22, 
672, 584), "OE": (27, 0, 700, 562), "ordmasculine": (189, 196, 542, 580), "ae": (22, -15, 651, 454), "dotlessi": (77, 0, 545, 439), "lslash": (77, 0, 578, 626), "oslash": (55, -24, 637, 463), "oe": (19, -15, 661, 454), "germandbls": (22, -15, 628, 626), "Scedilla": (54, -206, 672, 582), "multiply": (105, 39, 606, 478), "logicalnot": (135, 103, 617, 413), "format": (-26, -146, 243, 601), "tab": (19, 0, 641, 562), "overscore": (123, 579, 734, 629), "IJ": (-8, -18, 741, 562), "trademark": (86, 230, 868, 562), "onequarter": (14, -60, 706, 661), "mu": (50, -142, 591, 439), "minus": (114, 203, 596, 313), "brokenbar": (218, -175, 488, 675), "arrowleft": (40, 143, 708, 455), "LL": (-45, 0, 694, 562), "arrowright": (20, 143, 688, 455), "thorn": (-31, -142, 621, 626), "lira": (107, -28, 650, 611), "arrowboth": (40, 143, 688, 455), "indent": (99, 45, 579, 372), "threesuperior": (193, 222, 525, 616), "onehalf": (23, -60, 715, 661), "graybox": (76, 0, 652, 599), "Idot": (77, 0, 642, 748), "ll": (1, 0, 653, 626), "Thorn": (48, 0, 619, 562), "Ccedilla": (75, -206, 674, 580), "notegraphic": (91, -15, 619, 572), "arrowup": (244, 3, 556, 626), "down": (168, -15, 496, 439), "plusminus": (76, 24, 614, 515), "threequarters": (8, -60, 698, 661), "scedilla": (67, -206, 607, 459), "ij": (6, -146, 714, 658), "eth": (94, -27, 661, 626), "merge": (168, -15, 533, 487), "twosuperior": (192, 230, 540, 616), "arrowdown": (174, -15, 486, 608), "left": (109, 44, 589, 371), "return": (79, 0, 700, 562), "Eth": (30, 0, 663, 562), "up": (196, 0, 523, 447), "divide": (114, 16, 596, 500), "prescription": (24, -15, 632, 562), "square": (19, 0, 700, 562), "stop": (19, 0, 700, 562), "degree": (174, 243, 569, 616), "ccedilla": (82, -206, 631, 459), "onesuperior": (213, 230, 514, 616), "largebullet": (307, 229, 413, 333), "center": (103, 14, 623, 580), "registered": (54, -18, 666, 580), "copyright": (54, -18, 666, 580), "dectab": (8, 0, 615, 320), "space": (0, 0, 0, 0), "Aacute": (-9, 0, 665, 784), 
"Acircumflex": (-9, 0, 631, 780), "Adieresis": (-9, 0, 631, 748), "Agrave": (-9, 0, 631, 784), "Aring": (-9, 0, 631, 801), "Atilde": (-9, 0, 638, 759), "Eacute": (25, 0, 669, 784), "Ecircumflex": (25, 0, 669, 780), "Edieresis": (25, 0, 669, 748), "Egrave": (25, 0, 669, 784), "Gcaron": (75, -18, 674, 790), "Iacute": (77, 0, 642, 784), "Icircumflex": (77, 0, 642, 780), "Idieresis": (77, 0, 642, 748), "Igrave": (77, 0, 642, 784), "Ntilde": (8, -12, 729, 759), "Oacute": (75, -18, 645, 784), "Ocircumflex": (75, -18, 645, 780), "Odieresis": (75, -18, 645, 748), "Ograve": (75, -18, 645, 784), "Otilde": (75, -18, 668, 759), "Scaron": (54, -22, 672, 790), "Uacute": (101, -18, 715, 784), "Ucircumflex": (101, -18, 715, 780), "Udieresis": (101, -18, 715, 748), "Ugrave": (101, -18, 715, 784), "Yacute": (109, 0, 708, 784), "Ydieresis": (109, 0, 708, 748), "Zcaron": (62, 0, 659, 790), "aacute": (62, -15, 608, 661), "acircumflex": (62, -15, 592, 657), "adieresis": (62, -15, 592, 625), "agrave": (62, -15, 592, 661), "aring": (62, -15, 592, 678), "atilde": (62, -15, 642, 636), "eacute": (82, -15, 608, 661), "ecircumflex": (82, -15, 606, 657), "edieresis": (82, -15, 604, 625), "egrave": (82, -15, 604, 661), "gcaron": (41, -146, 673, 667), "iacute": (77, 0, 608, 661), "icircumflex": (77, 0, 566, 657), "idieresis": (77, 0, 551, 625), "igrave": (77, 0, 545, 661), "ntilde": (18, 0, 642, 636), "oacute": (72, -15, 622, 661), "ocircumflex": (72, -15, 622, 657), "odieresis": (72, -15, 622, 625), "ograve": (72, -15, 622, 661), "otilde": (72, -15, 642, 636), "scaron": (67, -17, 632, 667), "uacute": (70, -15, 608, 661), "ucircumflex": (70, -15, 591, 657), "udieresis": (70, -15, 591, 625), "ugrave": (70, -15, 591, 661), "yacute": (-20, -142, 694, 661), "ydieresis": (-20, -142, 694, 625), "zcaron": (81, 0, 632, 667), }, "Courier-Bold": { ".notdef": (0, 0, 0, 0), "exclam": (202, -15, 398, 572), "quotedbl": (135, 277, 465, 562), "numbersign": (56, -45, 544, 651), "dollar": (82, -126, 519, 666), 
"percent": (5, -15, 595, 616), "ampersand": (36, -15, 546, 543), "quoteright": (171, 277, 423, 562), "parenleft": (219, -102, 461, 616), "parenright": (139, -102, 381, 616), "asterisk": (91, 219, 509, 601), "plus": (71, 39, 529, 478), "comma": (123, -111, 393, 174), "hyphen": (100, 203, 500, 313), "period": (192, -15, 408, 171), "slash": (98, -77, 502, 626), "zero": (87, -15, 513, 616), "one": (81, 0, 539, 616), "two": (61, 0, 499, 616), "three": (63, -15, 501, 616), "four": (53, 0, 507, 616), "five": (70, -15, 521, 601), "six": (90, -15, 521, 616), "seven": (55, 0, 494, 601), "eight": (83, -15, 517, 616), "nine": (79, -15, 510, 616), "colon": (191, -15, 407, 425), "semicolon": (123, -111, 408, 425), "less": (66, 15, 523, 501), "equal": (71, 118, 529, 398), "greater": (77, 15, 534, 501), "question": (98, -14, 501, 580), "at": (16, -15, 584, 616), "A": (-9, 0, 609, 562), "B": (30, 0, 573, 562), "C": (22, -18, 560, 580), "D": (30, 0, 594, 562), "E": (25, 0, 560, 562), "F": (39, 0, 570, 562), "G": (22, -18, 594, 580), "H": (20, 0, 580, 562), "I": (77, 0, 523, 562), "J": (37, -18, 601, 562), "K": (21, 0, 599, 562), "L": (39, 0, 578, 562), "M": (-2, 0, 602, 562), "N": (8, -12, 610, 562), "O": (22, -18, 578, 580), "P": (48, 0, 559, 562), "Q": (32, -138, 578, 580), "R": (24, 0, 599, 562), "S": (47, -22, 553, 582), "T": (21, 0, 579, 562), "U": (4, -18, 596, 562), "V": (-13, 0, 613, 562), "W": (-18, 0, 618, 562), "X": (12, 0, 588, 562), "Y": (12, 0, 589, 562), "Z": (62, 0, 539, 562), "bracketleft": (245, -102, 475, 616), "backslash": (99, -77, 503, 626), "bracketright": (125, -102, 355, 616), "asciicircum": (108, 250, 492, 616), "underscore": (0, -125, 600, -75), "quoteleft": (178, 277, 428, 562), "a": (35, -15, 570, 454), "b": (0, -15, 584, 626), "c": (40, -15, 545, 459), "d": (20, -15, 591, 626), "e": (40, -15, 563, 454), "f": (83, 0, 547, 626), "g": (30, -146, 580, 454), "h": (5, 0, 592, 626), "i": (77, 0, 523, 658), "j": (63, -146, 440, 658), "k": (20, 0, 585, 626), 
"l": (77, 0, 523, 626), "m": (-22, 0, 626, 454), "n": (18, 0, 592, 454), "o": (30, -15, 570, 454), "p": (-1, -142, 570, 454), "q": (20, -142, 591, 454), "r": (47, 0, 580, 454), "s": (68, -17, 535, 459), "t": (47, -15, 532, 562), "u": (-1, -15, 569, 439), "v": (-1, 0, 601, 439), "w": (-18, 0, 618, 439), "x": (6, 0, 594, 439), "y": (-4, -142, 601, 439), "z": (81, 0, 520, 439), "braceleft": (160, -102, 464, 616), "bar": (255, -250, 345, 750), "braceright": (136, -102, 440, 616), "asciitilde": (71, 153, 530, 356), "exclamdown": (202, -146, 398, 449), "cent": (66, -49, 518, 614), "sterling": (72, -28, 558, 611), "fraction": (25, -60, 576, 661), "yen": (10, 0, 590, 562), "florin": (-30, -131, 572, 616), "section": (83, -70, 517, 580), "currency": (54, 49, 546, 517), "quotesingle": (227, 277, 373, 562), "quotedblleft": (71, 277, 535, 562), "guillemotleft": (8, 70, 553, 446), "guilsinglleft": (141, 70, 459, 446), "guilsinglright": (141, 70, 459, 446), "fi": (12, 0, 593, 626), "fl": (12, 0, 593, 626), "endash": (65, 203, 535, 313), "dagger": (106, -70, 494, 580), "daggerdbl": (106, -70, 494, 580), "periodcentered": (196, 165, 404, 351), "paragraph": (6, -70, 576, 580), "bullet": (140, 132, 460, 430), "quotesinglbase": (175, -142, 427, 143), "quotedblbase": (65, -142, 529, 143), "quotedblright": (61, 277, 525, 562), "guillemotright": (47, 70, 592, 446), "ellipsis": (26, -15, 574, 116), "perthousand": (-113, -15, 713, 616), "questiondown": (99, -146, 502, 449), "grave": (132, 508, 395, 661), "acute": (205, 508, 468, 661), "circumflex": (103, 483, 497, 657), "tilde": (89, 493, 512, 636), "macron": (88, 505, 512, 585), "breve": (83, 468, 517, 631), "dotaccent": (230, 485, 370, 625), "dieresis": (128, 485, 472, 625), "ring": (198, 481, 402, 678), "cedilla": (205, -206, 387, 0), "hungarumlaut": (68, 488, 588, 661), "ogonek": (169, -199, 367, 0), "caron": (103, 493, 497, 667), "emdash": (-10, 203, 610, 313), "AE": (-29, 0, 602, 562), "ordfeminine": (147, 196, 453, 580), "Lslash": 
(39, 0, 578, 562), "Oslash": (22, -22, 578, 584), "OE": (-25, 0, 595, 562), "ordmasculine": (147, 196, 453, 580), "ae": (-4, -15, 601, 454), "dotlessi": (77, 0, 523, 439), "lslash": (77, 0, 523, 626), "oslash": (30, -24, 570, 463), "oe": (-18, -15, 611, 454), "germandbls": (22, -15, 596, 626), "Scedilla": (47, -206, 553, 582), "multiply": (81, 39, 520, 478), "logicalnot": (71, 103, 529, 413), "format": (5, -146, 115, 601), "tab": (19, 0, 581, 562), "overscore": (0, 579, 600, 629), "IJ": (-8, -18, 622, 562), "trademark": (-9, 230, 749, 562), "onequarter": (-56, -60, 656, 661), "mu": (-1, -142, 569, 439), "minus": (71, 203, 529, 313), "brokenbar": (255, -175, 345, 675), "arrowleft": (-24, 143, 634, 455), "LL": (-45, 0, 645, 562), "arrowright": (-34, 143, 624, 455), "thorn": (-14, -142, 570, 626), "lira": (72, -28, 558, 611), "arrowboth": (-24, 143, 624, 455), "indent": (65, 45, 535, 372), "threesuperior": (138, 222, 433, 616), "onehalf": (-47, -60, 648, 661), "graybox": (76, 0, 525, 599), "Idot": (77, 0, 523, 748), "ll": (-12, 0, 600, 626), "Thorn": (48, 0, 557, 562), "Ccedilla": (22, -206, 560, 580), "notegraphic": (77, -15, 523, 572), "arrowup": (144, 3, 456, 626), "down": (137, -15, 464, 439), "plusminus": (71, 24, 529, 515), "threequarters": (-47, -60, 648, 661), "scedilla": (68, -206, 535, 459), "ij": (6, -146, 574, 658), "eth": (58, -27, 543, 626), "merge": (137, -15, 464, 487), "twosuperior": (143, 230, 436, 616), "arrowdown": (144, -15, 456, 608), "left": (65, 44, 535, 371), "return": (19, 0, 581, 562), "Eth": (30, 0, 594, 562), "up": (136, 0, 463, 447), "divide": (71, 16, 529, 500), "prescription": (24, -15, 599, 562), "square": (19, 0, 581, 562), "stop": (19, 0, 581, 562), "degree": (86, 243, 474, 616), "ccedilla": (40, -206, 545, 459), "onesuperior": (153, 230, 447, 616), "largebullet": (248, 229, 352, 333), "center": (40, 14, 560, 580), "registered": (0, -18, 600, 580), "copyright": (0, -18, 600, 580), "dectab": (8, 0, 592, 320), "space": (0, 0, 0, 0), 
"Aacute": (-9, 0, 609, 784), "Acircumflex": (-9, 0, 609, 780), "Adieresis": (-9, 0, 609, 748), "Agrave": (-9, 0, 609, 784), "Aring": (-9, 0, 609, 801), "Atilde": (-9, 0, 609, 759), "Eacute": (25, 0, 560, 784), "Ecircumflex": (25, 0, 560, 780), "Edieresis": (25, 0, 560, 748), "Egrave": (25, 0, 560, 784), "Gcaron": (22, -18, 594, 790), "Iacute": (77, 0, 523, 784), "Icircumflex": (77, 0, 523, 780), "Idieresis": (77, 0, 523, 748), "Igrave": (77, 0, 523, 784), "Ntilde": (8, -12, 610, 759), "Oacute": (22, -18, 578, 784), "Ocircumflex": (22, -18, 578, 780), "Odieresis": (22, -18, 578, 748), "Ograve": (22, -18, 578, 784), "Otilde": (22, -18, 578, 759), "Scaron": (47, -22, 553, 790), "Uacute": (4, -18, 596, 784), "Ucircumflex": (4, -18, 596, 780), "Udieresis": (4, -18, 596, 748), "Ugrave": (4, -18, 596, 784), "Yacute": (12, 0, 589, 784), "Ydieresis": (12, 0, 589, 748), "Zcaron": (62, 0, 539, 790), "aacute": (35, -15, 570, 661), "acircumflex": (35, -15, 570, 657), "adieresis": (35, -15, 570, 625), "agrave": (35, -15, 570, 661), "aring": (35, -15, 570, 678), "atilde": (35, -15, 570, 636), "eacute": (40, -15, 563, 661), "ecircumflex": (40, -15, 563, 657), "edieresis": (40, -15, 563, 625), "egrave": (40, -15, 563, 661), "gcaron": (30, -146, 580, 667), "iacute": (77, 0, 523, 661), "icircumflex": (63, 0, 523, 657), "idieresis": (77, 0, 523, 625), "igrave": (77, 0, 523, 661), "ntilde": (18, 0, 592, 636), "oacute": (30, -15, 570, 661), "ocircumflex": (30, -15, 570, 657), "odieresis": (30, -15, 570, 625), "ograve": (30, -15, 570, 661), "otilde": (30, -15, 570, 636), "scaron": (68, -17, 535, 667), "uacute": (-1, -15, 569, 661), "ucircumflex": (-1, -15, 569, 657), "udieresis": (-1, -15, 569, 625), "ugrave": (-1, -15, 569, 661), "yacute": (-4, -142, 601, 661), "ydieresis": (-4, -142, 601, 625), "zcaron": (81, 0, 520, 667), }, "Courier": { ".notdef": (0, 0, 0, 0), "exclam": (236, -15, 364, 572), "quotedbl": (187, 328, 413, 562), "numbersign": (93, -32, 507, 639), "dollar": (105, -126, 
496, 662), "percent": (81, -15, 518, 622), "ampersand": (63, -15, 538, 543), "quoteright": (213, 328, 376, 562), "parenleft": (269, -108, 440, 622), "parenright": (160, -108, 331, 622), "asterisk": (116, 257, 484, 607), "plus": (80, 44, 520, 470), "comma": (181, -112, 344, 122), "hyphen": (103, 231, 497, 285), "period": (229, -15, 371, 109), "slash": (125, -80, 475, 629), "zero": (106, -15, 494, 622), "one": (96, 0, 505, 622), "two": (70, 0, 471, 622), "three": (75, -15, 466, 622), "four": (78, 0, 500, 622), "five": (92, -15, 497, 607), "six": (111, -15, 497, 622), "seven": (82, 0, 483, 607), "eight": (102, -15, 498, 622), "nine": (96, -15, 489, 622), "colon": (229, -15, 371, 385), "semicolon": (181, -112, 371, 385), "less": (41, 42, 519, 472), "equal": (80, 138, 520, 376), "greater": (66, 42, 544, 472), "question": (129, -15, 492, 572), "at": (77, -15, 533, 622), "A": (3, 0, 597, 562), "B": (43, 0, 559, 562), "C": (41, -18, 540, 580), "D": (43, 0, 574, 562), "E": (53, 0, 550, 562), "F": (53, 0, 545, 562), "G": (31, -18, 575, 580), "H": (32, 0, 568, 562), "I": (96, 0, 504, 562), "J": (34, -18, 566, 562), "K": (38, 0, 582, 562), "L": (47, 0, 554, 562), "M": (4, 0, 596, 562), "N": (7, -13, 593, 562), "O": (43, -18, 557, 580), "P": (79, 0, 558, 562), "Q": (43, -138, 557, 580), "R": (38, 0, 588, 562), "S": (72, -20, 529, 580), "T": (38, 0, 563, 562), "U": (17, -18, 583, 562), "V": (-4, -13, 604, 562), "W": (-3, -13, 603, 562), "X": (23, 0, 577, 562), "Y": (24, 0, 576, 562), "Z": (86, 0, 514, 562), "bracketleft": (269, -108, 442, 622), "backslash": (118, -80, 482, 629), "bracketright": (158, -108, 331, 622), "asciicircum": (94, 354, 506, 622), "underscore": (0, -125, 600, -75), "quoteleft": (224, 328, 387, 562), "a": (53, -15, 559, 441), "b": (14, -15, 575, 629), "c": (66, -15, 529, 441), "d": (45, -15, 591, 629), "e": (66, -15, 548, 441), "f": (114, 0, 531, 629), "g": (45, -157, 566, 441), "h": (18, 0, 582, 629), "i": (95, 0, 505, 657), "j": (82, -157, 410, 657), "k": 
(43, 0, 580, 629), "l": (95, 0, 505, 629), "m": (-5, 0, 605, 441), "n": (26, 0, 575, 441), "o": (62, -15, 538, 441), "p": (9, -157, 555, 441), "q": (45, -157, 591, 441), "r": (60, 0, 559, 441), "s": (80, -15, 513, 441), "t": (87, -15, 530, 561), "u": (21, -15, 562, 426), "v": (10, -10, 590, 426), "w": (-4, -10, 604, 426), "x": (20, 0, 580, 426), "y": (7, -157, 592, 426), "z": (99, 0, 502, 426), "braceleft": (182, -108, 437, 622), "bar": (275, -250, 326, 750), "braceright": (163, -108, 418, 622), "asciitilde": (63, 197, 540, 320), "exclamdown": (236, -157, 364, 430), "cent": (96, -49, 500, 614), "sterling": (84, -21, 521, 611), "fraction": (92, -57, 509, 665), "yen": (26, 0, 574, 562), "florin": (4, -143, 539, 622), "section": (113, -78, 488, 580), "currency": (73, 58, 527, 506), "quotesingle": (259, 328, 341, 562), "quotedblleft": (143, 328, 471, 562), "guillemotleft": (37, 70, 563, 446), "guilsinglleft": (149, 70, 451, 446), "guilsinglright": (149, 70, 451, 446), "fi": (3, 0, 597, 629), "fl": (3, 0, 597, 629), "endash": (75, 231, 525, 285), "dagger": (141, -78, 459, 580), "daggerdbl": (141, -78, 459, 580), "periodcentered": (222, 189, 378, 327), "paragraph": (50, -78, 511, 562), "bullet": (172, 130, 428, 383), "quotesinglbase": (213, -134, 376, 100), "quotedblbase": (143, -134, 457, 100), "quotedblright": (143, 328, 457, 562), "guillemotright": (37, 70, 563, 446), "ellipsis": (37, -15, 563, 111), "perthousand": (3, -15, 600, 622), "questiondown": (108, -157, 471, 430), "grave": (151, 497, 378, 672), "acute": (242, 497, 469, 672), "circumflex": (124, 477, 476, 654), "tilde": (105, 489, 503, 606), "macron": (120, 525, 480, 565), "breve": (153, 501, 447, 609), "dotaccent": (249, 477, 352, 580), "dieresis": (148, 492, 453, 595), "ring": (218, 463, 382, 627), "cedilla": (224, -151, 362, 10), "hungarumlaut": (133, 497, 540, 672), "ogonek": (227, -151, 370, 0), "caron": (124, 492, 476, 669), "emdash": (0, 231, 600, 285), "AE": (3, 0, 550, 562), "ordfeminine": (156, 249, 
442, 580), "Lslash": (47, 0, 554, 562), "Oslash": (43, -80, 557, 629), "OE": (7, 0, 567, 562), "ordmasculine": (157, 249, 443, 580), "ae": (19, -15, 570, 441), "dotlessi": (95, 0, 505, 426), "lslash": (95, 0, 505, 629), "oslash": (62, -80, 538, 506), "oe": (19, -15, 559, 441), "germandbls": (48, -15, 588, 629), "Scedilla": (72, -151, 529, 580), "multiply": (87, 43, 515, 470), "logicalnot": (87, 108, 513, 369), "format": (5, -157, 56, 607), "tab": (19, 0, 581, 562), "overscore": (0, 579, 600, 629), "IJ": (32, -18, 583, 562), "trademark": (-23, 263, 623, 562), "onequarter": (0, -57, 600, 665), "mu": (21, -157, 562, 426), "minus": (80, 232, 520, 283), "brokenbar": (275, -175, 326, 675), "arrowleft": (-24, 115, 624, 483), "LL": (8, 0, 592, 562), "arrowright": (-24, 115, 624, 483), "thorn": (-6, -157, 555, 629), "lira": (73, -21, 521, 611), "arrowboth": (-28, 115, 628, 483), "indent": (70, 68, 530, 348), "threesuperior": (155, 240, 406, 622), "onehalf": (0, -57, 611, 665), "graybox": (76, 0, 525, 599), "Idot": (96, 0, 504, 716), "ll": (18, 0, 567, 629), "Thorn": (79, 0, 538, 562), "Ccedilla": (41, -151, 540, 580), "notegraphic": (136, -15, 464, 572), "arrowup": (116, 0, 484, 623), "down": (160, -15, 440, 426), "plusminus": (87, 44, 513, 558), "threequarters": (8, -56, 593, 666), "scedilla": (80, -151, 513, 441), "ij": (37, -157, 490, 657), "eth": (62, -15, 538, 629), "merge": (160, -15, 440, 436), "twosuperior": (177, 249, 424, 622), "arrowdown": (116, -15, 484, 608), "left": (70, 68, 530, 348), "return": (19, 0, 581, 562), "Eth": (30, 0, 574, 562), "up": (160, 0, 440, 437), "divide": (87, 48, 513, 467), "prescription": (27, -15, 577, 562), "square": (19, 0, 581, 562), "stop": (19, 0, 581, 562), "degree": (123, 269, 477, 622), "ccedilla": (66, -151, 529, 441), "onesuperior": (172, 249, 428, 622), "largebullet": (261, 220, 339, 297), "center": (40, 14, 560, 580), "registered": (0, -18, 600, 580), "copyright": (0, -18, 600, 580), "dectab": (18, 0, 582, 227), "space": (0, 
0, 0, 0), "Aacute": (3, 0, 597, 793), "Acircumflex": (3, 0, 597, 775), "Adieresis": (3, 0, 597, 731), "Agrave": (3, 0, 597, 793), "Aring": (3, 0, 597, 753), "Atilde": (3, 0, 597, 732), "Eacute": (53, 0, 550, 793), "Ecircumflex": (53, 0, 550, 775), "Edieresis": (53, 0, 550, 731), "Egrave": (53, 0, 550, 793), "Gcaron": (31, -18, 575, 805), "Iacute": (96, 0, 504, 793), "Icircumflex": (96, 0, 504, 775), "Idieresis": (96, 0, 504, 731), "Igrave": (96, 0, 504, 793), "Ntilde": (7, -13, 593, 732), "Oacute": (43, -18, 557, 793), "Ocircumflex": (43, -18, 557, 775), "Odieresis": (43, -18, 557, 731), "Ograve": (43, -18, 557, 793), "Otilde": (43, -18, 557, 732), "Scaron": (72, -20, 529, 805), "Uacute": (17, -18, 583, 793), "Ucircumflex": (17, -18, 583, 775), "Udieresis": (17, -18, 583, 731), "Ugrave": (17, -18, 583, 793), "Yacute": (24, 0, 576, 793), "Ydieresis": (24, 0, 576, 731), "Zcaron": (86, 0, 514, 805), "aacute": (53, -15, 559, 672), "acircumflex": (53, -15, 559, 654), "adieresis": (53, -15, 559, 595), "agrave": (53, -15, 559, 672), "aring": (53, -15, 559, 627), "atilde": (53, -15, 559, 606), "eacute": (66, -15, 548, 672), "ecircumflex": (66, -15, 548, 654), "edieresis": (66, -15, 548, 595), "egrave": (66, -15, 548, 672), "gcaron": (45, -157, 566, 669), "iacute": (95, 0, 505, 672), "icircumflex": (94, 0, 505, 654), "idieresis": (95, 0, 505, 595), "igrave": (95, 0, 505, 672), "ntilde": (26, 0, 575, 606), "oacute": (62, -15, 538, 672), "ocircumflex": (62, -15, 538, 654), "odieresis": (62, -15, 538, 595), "ograve": (62, -15, 538, 672), "otilde": (62, -15, 538, 606), "scaron": (80, -15, 513, 669), "uacute": (21, -15, 562, 672), "ucircumflex": (21, -15, 562, 654), "udieresis": (21, -15, 562, 595), "ugrave": (21, -15, 562, 672), "yacute": (7, -157, 592, 672), "ydieresis": (7, -157, 592, 595), "zcaron": (99, 0, 502, 669), }, "Courier-Oblique": { ".notdef": (0, 0, 0, 0), "exclam": (244, -15, 464, 572), "quotedbl": (273, 328, 532, 562), "numbersign": (133, -32, 596, 639), 
"dollar": (108, -126, 596, 662), "percent": (134, -15, 599, 622), "ampersand": (87, -15, 580, 543), "quoteright": (283, 328, 495, 562), "parenleft": (314, -108, 572, 622), "parenright": (137, -108, 396, 622), "asterisk": (212, 257, 580, 607), "plus": (129, 44, 580, 470), "comma": (157, -112, 370, 122), "hyphen": (152, 231, 558, 285), "period": (238, -15, 382, 109), "slash": (112, -80, 604, 629), "zero": (155, -15, 574, 622), "one": (98, 0, 515, 622), "two": (70, 0, 568, 622), "three": (82, -15, 537, 622), "four": (108, 0, 541, 622), "five": (99, -15, 589, 607), "six": (155, -15, 629, 622), "seven": (182, 0, 612, 607), "eight": (133, -15, 588, 622), "nine": (93, -15, 574, 622), "colon": (238, -15, 441, 385), "semicolon": (157, -112, 441, 385), "less": (96, 42, 610, 472), "equal": (109, 138, 600, 376), "greater": (85, 42, 599, 472), "question": (222, -15, 583, 572), "at": (127, -15, 582, 622), "A": (3, 0, 607, 562), "B": (43, 0, 615, 562), "C": (94, -18, 655, 580), "D": (43, 0, 645, 562), "E": (53, 0, 660, 562), "F": (53, 0, 660, 562), "G": (84, -18, 645, 580), "H": (32, 0, 687, 562), "I": (96, 0, 623, 562), "J": (52, -18, 685, 562), "K": (38, 0, 671, 562), "L": (47, 0, 607, 562), "M": (4, 0, 715, 562), "N": (7, -13, 712, 562), "O": (95, -18, 625, 580), "P": (79, 0, 643, 562), "Q": (95, -138, 625, 580), "R": (38, 0, 598, 562), "S": (76, -20, 650, 580), "T": (108, 0, 665, 562), "U": (125, -18, 702, 562), "V": (105, -13, 723, 562), "W": (106, -13, 722, 562), "X": (23, 0, 675, 562), "Y": (133, 0, 695, 562), "Z": (86, 0, 610, 562), "bracketleft": (246, -108, 574, 622), "backslash": (249, -80, 468, 629), "bracketright": (135, -108, 463, 622), "asciicircum": (175, 354, 587, 622), "underscore": (-27, -125, 584, -75), "quoteleft": (343, 328, 457, 562), "a": (77, -15, 569, 441), "b": (29, -15, 625, 629), "c": (106, -15, 608, 441), "d": (86, -15, 640, 629), "e": (107, -15, 597, 441), "f": (114, 0, 662, 629), "g": (61, -157, 657, 441), "h": (33, 0, 592, 629), "i": (95, 0, 515, 
657), "j": (52, -157, 550, 657), "k": (58, 0, 633, 629), "l": (95, 0, 515, 629), "m": (-5, 0, 615, 441), "n": (26, 0, 585, 441), "o": (102, -15, 588, 441), "p": (-24, -157, 605, 441), "q": (86, -157, 682, 441), "r": (60, 0, 636, 441), "s": (78, -15, 584, 441), "t": (167, -15, 561, 561), "u": (101, -15, 572, 426), "v": (90, -10, 681, 426), "w": (76, -10, 695, 426), "x": (20, 0, 655, 426), "y": (-4, -157, 683, 426), "z": (99, 0, 593, 426), "braceleft": (233, -108, 569, 622), "bar": (222, -250, 485, 750), "braceright": (140, -108, 477, 622), "asciitilde": (116, 197, 600, 320), "exclamdown": (225, -157, 445, 430), "cent": (152, -49, 588, 614), "sterling": (124, -21, 621, 611), "fraction": (84, -57, 646, 665), "yen": (120, 0, 693, 562), "florin": (-26, -143, 671, 622), "section": (104, -78, 590, 580), "currency": (94, 58, 628, 506), "quotesingle": (345, 328, 460, 562), "quotedblleft": (262, 328, 541, 562), "guillemotleft": (92, 70, 652, 446), "guilsinglleft": (204, 70, 540, 446), "guilsinglright": (170, 70, 506, 446), "fi": (3, 0, 619, 629), "fl": (3, 0, 619, 629), "endash": (124, 231, 586, 285), "dagger": (217, -78, 546, 580), "daggerdbl": (163, -78, 546, 580), "periodcentered": (276, 189, 434, 327), "paragraph": (100, -78, 630, 562), "bullet": (225, 130, 485, 383), "quotesinglbase": (185, -134, 397, 100), "quotedblbase": (115, -134, 478, 100), "quotedblright": (213, 328, 576, 562), "guillemotright": (58, 70, 618, 446), "ellipsis": (46, -15, 574, 111), "perthousand": (59, -15, 626, 622), "questiondown": (106, -157, 466, 430), "grave": (294, 497, 484, 672), "acute": (348, 497, 612, 672), "circumflex": (229, 477, 581, 654), "tilde": (212, 489, 629, 606), "macron": (232, 525, 600, 565), "breve": (279, 501, 576, 609), "dotaccent": (360, 477, 465, 580), "dieresis": (263, 492, 570, 595), "ring": (333, 463, 499, 627), "cedilla": (197, -151, 344, 10), "hungarumlaut": (239, 497, 683, 672), "ogonek": (207, -151, 348, 0), "caron": (262, 492, 614, 669), "emdash": (49, 231, 661, 
285), "AE": (3, 0, 655, 562), "ordfeminine": (209, 249, 512, 580), "Lslash": (47, 0, 607, 562), "Oslash": (95, -80, 625, 629), "OE": (60, 0, 672, 562), "ordmasculine": (210, 249, 534, 580), "ae": (42, -15, 626, 441), "dotlessi": (95, 0, 515, 426), "lslash": (95, 0, 583, 629), "oslash": (102, -80, 588, 506), "oe": (55, -15, 615, 441), "germandbls": (48, -15, 617, 629), "Scedilla": (76, -151, 650, 580), "multiply": (103, 43, 607, 470), "logicalnot": (155, 108, 591, 369), "format": (-28, -157, 185, 607), "tab": (19, 0, 641, 562), "overscore": (123, 579, 734, 629), "IJ": (32, -18, 702, 562), "trademark": (75, 263, 742, 562), "onequarter": (65, -57, 674, 665), "mu": (72, -157, 572, 426), "minus": (129, 232, 580, 283), "brokenbar": (238, -175, 469, 675), "arrowleft": (40, 115, 693, 483), "LL": (8, 0, 647, 562), "arrowright": (34, 115, 688, 483), "thorn": (-24, -157, 605, 629), "lira": (118, -21, 621, 611), "arrowboth": (36, 115, 692, 483), "indent": (108, 68, 574, 348), "threesuperior": (213, 240, 500, 622), "onehalf": (65, -57, 669, 665), "graybox": (76, 0, 652, 599), "Idot": (96, 0, 623, 716), "ll": (33, 0, 616, 629), "Thorn": (79, 0, 605, 562), "Ccedilla": (94, -151, 658, 580), "notegraphic": (144, -15, 564, 572), "arrowup": (209, 0, 577, 623), "down": (187, -15, 467, 426), "plusminus": (96, 44, 594, 558), "threequarters": (73, -56, 659, 666), "scedilla": (78, -151, 584, 441), "ij": (37, -157, 630, 657), "eth": (102, -15, 639, 629), "merge": (187, -15, 503, 436), "twosuperior": (230, 249, 534, 622), "arrowdown": (152, -15, 520, 608), "left": (114, 68, 580, 348), "return": (79, 0, 700, 562), "Eth": (43, 0, 645, 562), "up": (223, 0, 503, 437), "divide": (136, 48, 573, 467), "prescription": (27, -15, 617, 562), "square": (19, 0, 700, 562), "stop": (19, 0, 700, 562), "degree": (214, 269, 575, 622), "ccedilla": (106, -151, 614, 441), "onesuperior": (231, 249, 491, 622), "largebullet": (316, 220, 394, 297), "center": (103, 14, 623, 580), "registered": (54, -18, 666, 580), 
"copyright": (54, -18, 666, 580), "dectab": (18, 0, 593, 227), "space": (0, 0, 0, 0), "Aacute": (3, 0, 658, 793), "Acircumflex": (3, 0, 607, 775), "Adieresis": (3, 0, 607, 731), "Agrave": (3, 0, 607, 793), "Aring": (3, 0, 607, 753), "Atilde": (3, 0, 656, 732), "Eacute": (53, 0, 668, 793), "Ecircumflex": (53, 0, 660, 775), "Edieresis": (53, 0, 660, 731), "Egrave": (53, 0, 660, 793), "Gcaron": (84, -18, 645, 805), "Iacute": (96, 0, 638, 793), "Icircumflex": (96, 0, 623, 775), "Idieresis": (96, 0, 623, 731), "Igrave": (96, 0, 623, 793), "Ntilde": (7, -13, 712, 732), "Oacute": (95, -18, 638, 793), "Ocircumflex": (95, -18, 625, 775), "Odieresis": (95, -18, 625, 731), "Ograve": (95, -18, 625, 793), "Otilde": (95, -18, 656, 732), "Scaron": (76, -20, 673, 805), "Uacute": (125, -18, 702, 793), "Ucircumflex": (125, -18, 702, 775), "Udieresis": (125, -18, 702, 731), "Ugrave": (125, -18, 702, 793), "Yacute": (133, 0, 695, 793), "Ydieresis": (133, 0, 695, 731), "Zcaron": (86, 0, 643, 805), "aacute": (77, -15, 612, 672), "acircumflex": (77, -15, 581, 654), "adieresis": (77, -15, 570, 595), "agrave": (77, -15, 569, 672), "aring": (77, -15, 569, 627), "atilde": (77, -15, 629, 606), "eacute": (107, -15, 612, 672), "ecircumflex": (107, -15, 597, 654), "edieresis": (107, -15, 597, 595), "egrave": (107, -15, 597, 672), "gcaron": (61, -157, 657, 669), "iacute": (95, 0, 612, 672), "icircumflex": (95, 0, 551, 654), "idieresis": (95, 0, 540, 595), "igrave": (95, 0, 515, 672), "ntilde": (26, 0, 629, 606), "oacute": (102, -15, 612, 672), "ocircumflex": (102, -15, 588, 654), "odieresis": (102, -15, 588, 595), "ograve": (102, -15, 588, 672), "otilde": (102, -15, 629, 606), "scaron": (78, -15, 614, 669), "uacute": (101, -15, 602, 672), "ucircumflex": (101, -15, 572, 654), "udieresis": (101, -15, 572, 595), "ugrave": (101, -15, 572, 672), "yacute": (-4, -157, 683, 672), "ydieresis": (-4, -157, 683, 595), "zcaron": (99, 0, 624, 669), }, "Helvetica-BoldOblique": { ".notdef": (0, 0, 0, 0), 
"exclam": (94, 0, 397, 718), "quotedbl": (193, 447, 529, 718), "numbersign": (60, 0, 644, 698), "dollar": (67, -115, 621, 775), "percent": (137, -19, 900, 710), "ampersand": (89, -19, 732, 718), "quoteright": (167, 445, 362, 718), "parenleft": (76, -208, 470, 734), "parenright": (-25, -208, 368, 734), "asterisk": (146, 387, 481, 718), "plus": (82, 0, 610, 506), "comma": (28, -168, 245, 146), "hyphen": (73, 215, 379, 345), "period": (64, 0, 245, 146), "slash": (-37, -19, 468, 737), "zero": (87, -19, 617, 710), "one": (173, 0, 529, 710), "two": (26, 0, 619, 710), "three": (66, -19, 608, 710), "four": (60, 0, 598, 710), "five": (64, -19, 636, 698), "six": (86, -19, 619, 710), "seven": (125, 0, 676, 698), "eight": (70, -19, 615, 710), "nine": (78, -19, 615, 710), "colon": (92, 0, 351, 512), "semicolon": (56, -168, 351, 512), "less": (82, -8, 655, 514), "equal": (58, 87, 633, 419), "greater": (36, -8, 609, 514), "question": (165, 0, 670, 727), "at": (186, -19, 953, 737), "A": (20, 0, 702, 718), "B": (76, 0, 763, 718), "C": (107, -19, 788, 737), "D": (76, 0, 777, 718), "E": (76, 0, 757, 718), "F": (76, 0, 740, 718), "G": (108, -19, 816, 737), "H": (71, 0, 804, 718), "I": (64, 0, 367, 718), "J": (60, -18, 637, 718), "K": (87, 0, 858, 718), "L": (76, 0, 611, 718), "M": (69, 0, 918, 718), "N": (69, 0, 807, 718), "O": (108, -19, 823, 737), "P": (76, 0, 737, 718), "Q": (108, -52, 823, 737), "R": (76, 0, 778, 718), "S": (81, -19, 717, 737), "T": (140, 0, 751, 718), "U": (116, -19, 804, 718), "V": (172, 0, 801, 718), "W": (169, 0, 1082, 718), "X": (14, 0, 791, 718), "Y": (168, 0, 806, 718), "Z": (25, 0, 737, 718), "bracketleft": (21, -196, 462, 722), "backslash": (124, -19, 307, 737), "bracketright": (-18, -196, 423, 722), "asciicircum": (131, 323, 591, 698), "underscore": (-27, -125, 540, -75), "quoteleft": (165, 454, 361, 727), "a": (55, -14, 582, 546), "b": (61, -14, 645, 718), "c": (79, -14, 599, 546), "d": (83, -14, 704, 718), "e": (71, -14, 592, 546), "f": (87, 0, 469, 
727), "g": (39, -217, 666, 546), "h": (65, 0, 629, 718), "i": (69, 0, 363, 725), "j": (-42, -214, 363, 725), "k": (69, 0, 670, 718), "l": (69, 0, 362, 718), "m": (64, 0, 909, 546), "n": (65, 0, 629, 546), "o": (83, -14, 643, 546), "p": (18, -207, 645, 546), "q": (81, -207, 665, 546), "r": (64, 0, 489, 546), "s": (63, -14, 584, 546), "t": (101, -6, 422, 676), "u": (99, -14, 658, 532), "v": (126, 0, 656, 532), "w": (123, 0, 882, 532), "x": (15, 0, 648, 532), "y": (42, -214, 652, 532), "z": (20, 0, 583, 532), "braceleft": (94, -196, 518, 722), "bar": (80, -19, 353, 737), "braceright": (-18, -196, 407, 722), "asciitilde": (115, 163, 577, 343), "exclamdown": (50, -186, 353, 532), "cent": (79, -118, 599, 628), "sterling": (50, -16, 635, 718), "fraction": (-174, -19, 487, 710), "yen": (60, 0, 713, 698), "florin": (-50, -210, 669, 737), "section": (61, -184, 598, 727), "currency": (27, 76, 680, 636), "quotesingle": (165, 447, 321, 718), "quotedblleft": (160, 454, 588, 727), "guillemotleft": (135, 76, 571, 484), "guilsinglleft": (130, 76, 353, 484), "guilsinglright": (99, 76, 322, 484), "fi": (87, 0, 696, 727), "fl": (87, 0, 695, 727), "endash": (48, 227, 627, 333), "dagger": (118, -171, 626, 718), "daggerdbl": (46, -171, 628, 718), "periodcentered": (111, 172, 275, 334), "paragraph": (99, -191, 688, 700), "bullet": (84, 194, 420, 524), "quotesinglbase": (41, -146, 236, 127), "quotedblbase": (36, -146, 463, 127), "quotedblright": (162, 445, 589, 718), "guillemotright": (104, 76, 540, 484), "ellipsis": (92, 0, 939, 146), "perthousand": (76, -19, 1038, 710), "questiondown": (54, -195, 559, 532), "grave": (136, 604, 353, 750), "acute": (236, 604, 515, 750), "circumflex": (118, 604, 471, 750), "tilde": (113, 610, 507, 737), "macron": (122, 604, 483, 678), "breve": (156, 604, 494, 750), "dotaccent": (235, 614, 385, 729), "dieresis": (137, 614, 482, 729), "ring": (200, 568, 420, 776), "cedilla": (-37, -228, 219, 0), "hungarumlaut": (137, 604, 645, 750), "ogonek": (41, -228, 264, 
0), "caron": (149, 604, 502, 750), "emdash": (48, 227, 1071, 333), "AE": (5, 0, 1100, 718), "ordfeminine": (92, 276, 464, 737), "Lslash": (34, 0, 611, 718), "Oslash": (35, -27, 894, 745), "OE": (99, -19, 1114, 737), "ordmasculine": (92, 276, 484, 737), "ae": (56, -14, 922, 546), "dotlessi": (69, 0, 322, 532), "lslash": (40, 0, 407, 718), "oslash": (22, -29, 701, 560), "oe": (83, -14, 976, 546), "germandbls": (69, -14, 657, 731), "onesuperior": (148, 283, 388, 710), "logicalnot": (105, 108, 633, 419), "mu": (22, -207, 658, 532), "trademark": (179, 306, 1109, 718), "Eth": (62, 0, 777, 718), "onehalf": (132, -19, 858, 710), "plusminus": (40, 0, 625, 506), "Thorn": (76, 0, 715, 718), "onequarter": (132, -19, 806, 710), "divide": (82, -42, 610, 548), "brokenbar": (80, -19, 353, 737), "degree": (175, 426, 467, 712), "thorn": (18, -208, 645, 718), "threequarters": (100, -19, 839, 710), "twosuperior": (69, 283, 448, 710), "registered": (56, -19, 834, 737), "minus": (82, 197, 610, 309), "eth": (82, -14, 670, 737), "multiply": (57, 1, 635, 505), "threesuperior": (92, 271, 440, 710), "copyright": (57, -19, 835, 737), "space": (0, 0, 0, 0), "Aacute": (20, 0, 750, 936), "Acircumflex": (20, 0, 706, 936), "Adieresis": (20, 0, 716, 915), "Agrave": (20, 0, 702, 936), "Aring": (20, 0, 702, 962), "Atilde": (20, 0, 741, 923), "Ccedilla": (107, -228, 788, 737), "Eacute": (76, 0, 757, 936), "Ecircumflex": (76, 0, 757, 936), "Edieresis": (76, 0, 757, 915), "Egrave": (76, 0, 757, 936), "Iacute": (64, 0, 528, 936), "Icircumflex": (64, 0, 484, 936), "Idieresis": (64, 0, 494, 915), "Igrave": (64, 0, 367, 936), "Ntilde": (69, 0, 807, 923), "Oacute": (108, -19, 823, 936), "Ocircumflex": (108, -19, 823, 936), "Odieresis": (108, -19, 823, 915), "Ograve": (108, -19, 823, 936), "Otilde": (108, -19, 823, 923), "Scaron": (81, -19, 717, 936), "Uacute": (116, -19, 804, 936), "Ucircumflex": (116, -19, 804, 936), "Udieresis": (116, -19, 804, 915), "Ugrave": (116, -19, 804, 936), "Yacute": (168, 0, 806, 
936), "Ydieresis": (168, 0, 806, 915), "Zcaron": (25, 0, 737, 936), "aacute": (55, -14, 627, 750), "acircumflex": (55, -14, 583, 750), "adieresis": (55, -14, 594, 729), "agrave": (55, -14, 582, 750), "aring": (55, -14, 582, 776), "atilde": (55, -14, 619, 737), "ccedilla": (79, -228, 599, 546), "eacute": (71, -14, 627, 750), "ecircumflex": (71, -14, 592, 750), "edieresis": (71, -14, 594, 729), "egrave": (71, -14, 592, 750), "iacute": (69, 0, 488, 750), "icircumflex": (69, 0, 444, 750), "idieresis": (69, 0, 455, 729), "igrave": (69, 0, 326, 750), "ntilde": (65, 0, 646, 737), "oacute": (83, -14, 654, 750), "ocircumflex": (83, -14, 643, 750), "odieresis": (83, -14, 643, 729), "ograve": (83, -14, 643, 750), "otilde": (83, -14, 646, 737), "scaron": (63, -14, 614, 750), "uacute": (99, -14, 658, 750), "ucircumflex": (99, -14, 658, 750), "udieresis": (99, -14, 658, 729), "ugrave": (99, -14, 658, 750), "yacute": (42, -214, 652, 750), "ydieresis": (42, -214, 652, 729), "zcaron": (20, 0, 586, 750), }, "Helvetica-Bold": { ".notdef": (0, 0, 0, 0), "exclam": (90, 0, 244, 718), "quotedbl": (98, 447, 376, 718), "numbersign": (18, 0, 538, 698), "dollar": (30, -115, 523, 775), "percent": (28, -19, 861, 710), "ampersand": (54, -19, 701, 718), "quoteright": (69, 445, 209, 718), "parenleft": (35, -208, 314, 734), "parenright": (19, -208, 298, 734), "asterisk": (27, 387, 362, 718), "plus": (40, 0, 544, 506), "comma": (64, -168, 214, 146), "hyphen": (27, 215, 306, 345), "period": (64, 0, 214, 146), "slash": (-33, -19, 311, 737), "zero": (32, -19, 524, 710), "one": (69, 0, 378, 710), "two": (26, 0, 511, 710), "three": (27, -19, 516, 710), "four": (27, 0, 526, 710), "five": (27, -19, 516, 698), "six": (31, -19, 520, 710), "seven": (25, 0, 528, 698), "eight": (32, -19, 524, 710), "nine": (30, -19, 522, 710), "colon": (92, 0, 242, 512), "semicolon": (92, -168, 242, 512), "less": (38, -8, 546, 514), "equal": (40, 87, 544, 419), "greater": (38, -8, 546, 514), "question": (60, 0, 556, 727), 
"at": (118, -19, 856, 737), "A": (20, 0, 702, 718), "B": (76, 0, 669, 718), "C": (44, -19, 684, 737), "D": (76, 0, 685, 718), "E": (76, 0, 621, 718), "F": (76, 0, 587, 718), "G": (44, -19, 713, 737), "H": (71, 0, 651, 718), "I": (64, 0, 214, 718), "J": (22, -18, 484, 718), "K": (87, 0, 722, 718), "L": (76, 0, 583, 718), "M": (69, 0, 765, 718), "N": (69, 0, 654, 718), "O": (44, -19, 734, 737), "P": (76, 0, 627, 718), "Q": (44, -52, 737, 737), "R": (76, 0, 677, 718), "S": (39, -19, 629, 737), "T": (14, 0, 598, 718), "U": (72, -19, 651, 718), "V": (19, 0, 648, 718), "W": (16, 0, 929, 718), "X": (14, 0, 653, 718), "Y": (15, 0, 653, 718), "Z": (25, 0, 586, 718), "bracketleft": (63, -196, 309, 722), "backslash": (-33, -19, 311, 737), "bracketright": (24, -196, 270, 722), "asciicircum": (62, 323, 522, 698), "underscore": (0, -125, 556, -75), "quoteleft": (69, 454, 209, 727), "a": (29, -14, 527, 546), "b": (61, -14, 578, 718), "c": (34, -14, 524, 546), "d": (34, -14, 551, 718), "e": (23, -14, 528, 546), "f": (10, 0, 318, 727), "g": (40, -217, 553, 546), "h": (65, 0, 546, 718), "i": (69, 0, 209, 725), "j": (3, -214, 209, 725), "k": (69, 0, 562, 718), "l": (69, 0, 209, 718), "m": (64, 0, 826, 546), "n": (65, 0, 546, 546), "o": (34, -14, 578, 546), "p": (62, -207, 578, 546), "q": (34, -207, 552, 546), "r": (64, 0, 373, 546), "s": (30, -14, 519, 546), "t": (10, -6, 309, 676), "u": (66, -14, 545, 532), "v": (13, 0, 543, 532), "w": (10, 0, 769, 532), "x": (15, 0, 541, 532), "y": (10, -214, 539, 532), "z": (20, 0, 480, 532), "braceleft": (48, -196, 365, 722), "bar": (84, -19, 196, 737), "braceright": (24, -196, 341, 722), "asciitilde": (61, 163, 523, 343), "exclamdown": (90, -186, 244, 532), "cent": (34, -118, 524, 628), "sterling": (28, -16, 541, 718), "fraction": (-170, -19, 336, 710), "yen": (-9, 0, 565, 698), "florin": (-10, -210, 516, 737), "section": (34, -184, 522, 727), "currency": (-3, 76, 559, 636), "quotesingle": (70, 447, 168, 718), "quotedblleft": (64, 454, 436, 
727), "guillemotleft": (88, 76, 468, 484), "guilsinglleft": (83, 76, 250, 484), "guilsinglright": (83, 76, 250, 484), "fi": (10, 0, 542, 727), "fl": (10, 0, 542, 727), "endash": (0, 227, 556, 333), "dagger": (36, -171, 520, 718), "daggerdbl": (36, -171, 520, 718), "periodcentered": (58, 172, 220, 334), "paragraph": (-8, -191, 539, 700), "bullet": (10, 194, 340, 524), "quotesinglbase": (69, -146, 209, 127), "quotedblbase": (64, -146, 436, 127), "quotedblright": (64, 445, 436, 718), "guillemotright": (88, 76, 468, 484), "ellipsis": (92, 0, 908, 146), "perthousand": (-3, -19, 1003, 710), "questiondown": (55, -195, 551, 532), "grave": (-23, 604, 225, 750), "acute": (108, 604, 356, 750), "circumflex": (-10, 604, 343, 750), "tilde": (-17, 610, 350, 737), "macron": (-6, 604, 339, 678), "breve": (-2, 604, 335, 750), "dotaccent": (104, 614, 230, 729), "dieresis": (6, 614, 327, 729), "ring": (59, 568, 275, 776), "cedilla": (6, -228, 245, 0), "hungarumlaut": (9, 604, 486, 750), "ogonek": (71, -228, 304, 0), "caron": (-10, 604, 343, 750), "emdash": (0, 227, 1000, 333), "AE": (5, 0, 954, 718), "ordfeminine": (22, 276, 347, 737), "Lslash": (-20, 0, 583, 718), "Oslash": (33, -27, 744, 745), "OE": (37, -19, 961, 737), "ordmasculine": (6, 276, 360, 737), "ae": (29, -14, 858, 546), "dotlessi": (69, 0, 209, 532), "lslash": (-18, 0, 296, 718), "oslash": (22, -29, 589, 560), "oe": (34, -14, 912, 546), "germandbls": (69, -14, 579, 731), "onesuperior": (26, 283, 237, 710), "logicalnot": (40, 108, 544, 419), "mu": (66, -207, 545, 532), "trademark": (44, 306, 956, 718), "Eth": (-5, 0, 685, 718), "onehalf": (26, -19, 794, 710), "plusminus": (40, 0, 544, 506), "Thorn": (76, 0, 627, 718), "onequarter": (26, -19, 766, 710), "divide": (40, -42, 544, 548), "brokenbar": (84, -19, 196, 737), "degree": (57, 426, 343, 712), "thorn": (62, -208, 578, 718), "threequarters": (16, -19, 799, 710), "twosuperior": (9, 283, 324, 710), "registered": (-11, -19, 748, 737), "minus": (40, 197, 544, 309), "eth": 
(34, -14, 578, 737), "multiply": (40, 1, 545, 505), "threesuperior": (8, 271, 326, 710), "copyright": (-11, -19, 749, 737), "space": (0, 0, 0, 0), "Aacute": (20, 0, 702, 936), "Acircumflex": (20, 0, 702, 936), "Adieresis": (20, 0, 702, 915), "Agrave": (20, 0, 702, 936), "Aring": (20, 0, 702, 962), "Atilde": (20, 0, 702, 923), "Ccedilla": (44, -228, 684, 737), "Eacute": (76, 0, 621, 936), "Ecircumflex": (76, 0, 621, 936), "Edieresis": (76, 0, 621, 915), "Egrave": (76, 0, 621, 936), "Iacute": (64, 0, 329, 936), "Icircumflex": (-37, 0, 316, 936), "Idieresis": (-21, 0, 300, 915), "Igrave": (-50, 0, 214, 936), "Ntilde": (69, 0, 654, 923), "Oacute": (44, -19, 734, 936), "Ocircumflex": (44, -19, 734, 936), "Odieresis": (44, -19, 734, 915), "Ograve": (44, -19, 734, 936), "Otilde": (44, -19, 734, 923), "Scaron": (39, -19, 629, 936), "Uacute": (72, -19, 651, 936), "Ucircumflex": (72, -19, 651, 936), "Udieresis": (72, -19, 651, 915), "Ugrave": (72, -19, 651, 936), "Yacute": (15, 0, 653, 936), "Ydieresis": (15, 0, 653, 915), "Zcaron": (25, 0, 586, 936), "aacute": (29, -14, 527, 750), "acircumflex": (29, -14, 527, 750), "adieresis": (29, -14, 527, 729), "agrave": (29, -14, 527, 750), "aring": (29, -14, 527, 776), "atilde": (29, -14, 527, 737), "ccedilla": (34, -228, 524, 546), "eacute": (23, -14, 528, 750), "ecircumflex": (23, -14, 528, 750), "edieresis": (23, -14, 528, 729), "egrave": (23, -14, 528, 750), "iacute": (69, 0, 329, 750), "icircumflex": (-37, 0, 316, 750), "idieresis": (-21, 0, 300, 729), "igrave": (-50, 0, 209, 750), "ntilde": (65, 0, 546, 737), "oacute": (34, -14, 578, 750), "ocircumflex": (34, -14, 578, 750), "odieresis": (34, -14, 578, 729), "ograve": (34, -14, 578, 750), "otilde": (34, -14, 578, 737), "scaron": (30, -14, 519, 750), "uacute": (66, -14, 545, 750), "ucircumflex": (66, -14, 545, 750), "udieresis": (66, -14, 545, 729), "ugrave": (66, -14, 545, 750), "yacute": (10, -214, 539, 750), "ydieresis": (10, -214, 539, 729), "zcaron": (20, 0, 480, 750), }, 
"Helvetica-Oblique": { ".notdef": (0, 0, 0, 0), "exclam": (90, 0, 340, 718), "quotedbl": (168, 463, 438, 718), "numbersign": (73, 0, 631, 688), "dollar": (69, -115, 617, 775), "percent": (147, -19, 888, 703), "ampersand": (78, -15, 647, 718), "quoteright": (151, 463, 310, 718), "parenleft": (108, -207, 454, 733), "parenright": (-9, -207, 336, 733), "asterisk": (165, 431, 475, 718), "plus": (85, 0, 606, 505), "comma": (56, -147, 214, 106), "hyphen": (93, 232, 357, 322), "period": (87, 0, 214, 106), "slash": (-21, -19, 452, 737), "zero": (94, -19, 607, 703), "one": (207, 0, 508, 703), "two": (26, 0, 617, 703), "three": (75, -19, 609, 703), "four": (61, 0, 576, 703), "five": (68, -19, 621, 688), "six": (91, -19, 615, 703), "seven": (137, 0, 669, 688), "eight": (74, -19, 606, 703), "nine": (83, -19, 608, 703), "colon": (87, 0, 301, 516), "semicolon": (56, -147, 301, 516), "less": (94, 11, 641, 495), "equal": (63, 115, 628, 390), "greater": (50, 11, 597, 495), "question": (161, 0, 610, 727), "at": (215, -19, 964, 737), "A": (14, 0, 654, 718), "B": (74, 0, 711, 718), "C": (108, -19, 781, 737), "D": (81, 0, 763, 718), "E": (86, 0, 762, 718), "F": (86, 0, 736, 718), "G": (111, -19, 798, 737), "H": (77, 0, 799, 718), "I": (91, 0, 341, 718), "J": (47, -19, 581, 718), "K": (76, 0, 808, 718), "L": (76, 0, 555, 718), "M": (73, 0, 914, 718), "N": (76, 0, 799, 718), "O": (105, -19, 825, 737), "P": (86, 0, 736, 718), "Q": (105, -56, 825, 737), "R": (88, 0, 773, 718), "S": (90, -19, 712, 737), "T": (148, 0, 750, 718), "U": (124, -19, 797, 718), "V": (173, 0, 800, 718), "W": (169, 0, 1081, 718), "X": (19, 0, 790, 718), "Y": (167, 0, 806, 718), "Z": (23, 0, 741, 718), "bracketleft": (21, -196, 403, 722), "backslash": (140, -19, 291, 737), "bracketright": (-14, -196, 368, 722), "asciicircum": (42, 264, 539, 688), "underscore": (-27, -125, 540, -75), "quoteleft": (165, 470, 323, 725), "a": (62, -15, 558, 538), "b": (58, -15, 584, 718), "c": (75, -15, 553, 538), "d": (84, -15, 652, 
718), "e": (85, -15, 578, 538), "f": (86, 0, 416, 728), "g": (42, -220, 610, 538), "h": (65, 0, 572, 718), "i": (67, 0, 308, 718), "j": (-60, -210, 308, 718), "k": (67, 0, 600, 718), "l": (67, 0, 308, 718), "m": (65, 0, 851, 538), "n": (65, 0, 572, 538), "o": (84, -14, 584, 538), "p": (14, -207, 584, 538), "q": (84, -207, 605, 538), "r": (77, 0, 446, 538), "s": (64, -15, 529, 538), "t": (103, -7, 368, 669), "u": (95, -15, 600, 523), "v": (119, 0, 603, 523), "w": (125, 0, 820, 523), "x": (11, 0, 594, 523), "y": (15, -214, 600, 523), "z": (31, 0, 571, 523), "braceleft": (92, -196, 445, 722), "bar": (90, -19, 324, 737), "braceright": (0, -196, 354, 722), "asciitilde": (111, 180, 580, 326), "exclamdown": (77, -195, 326, 523), "cent": (96, -115, 583, 623), "sterling": (49, -16, 633, 718), "fraction": (-170, -19, 482, 703), "yen": (81, 0, 699, 688), "florin": (-52, -207, 654, 737), "section": (77, -191, 583, 737), "currency": (60, 99, 646, 603), "quotesingle": (157, 463, 285, 718), "quotedblleft": (138, 470, 461, 725), "guillemotleft": (146, 108, 554, 446), "guilsinglleft": (137, 108, 340, 446), "guilsinglright": (111, 108, 314, 446), "fi": (86, 0, 587, 728), "fl": (86, 0, 585, 728), "endash": (51, 240, 623, 313), "dagger": (135, -159, 622, 718), "daggerdbl": (52, -159, 623, 718), "periodcentered": (130, 190, 257, 315), "paragraph": (126, -173, 650, 718), "bullet": (91, 202, 412, 517), "quotesinglbase": (21, -149, 180, 106), "quotedblbase": (-6, -149, 318, 106), "quotedblright": (124, 463, 448, 718), "guillemotright": (120, 108, 528, 446), "ellipsis": (115, 0, 908, 106), "perthousand": (88, -19, 1029, 703), "questiondown": (85, -201, 534, 525), "grave": (170, 593, 337, 734), "acute": (248, 593, 475, 734), "circumflex": (147, 593, 438, 734), "tilde": (125, 606, 490, 722), "macron": (143, 627, 468, 684), "breve": (167, 595, 476, 731), "dotaccent": (249, 604, 362, 706), "dieresis": (168, 604, 443, 706), "ring": (214, 572, 402, 756), "cedilla": (2, -225, 232, 0), 
"hungarumlaut": (157, 593, 565, 734), "ogonek": (44, -225, 249, 0), "caron": (177, 593, 468, 734), "emdash": (51, 240, 1067, 313), "AE": (8, 0, 1097, 718), "ordfeminine": (100, 304, 448, 737), "Lslash": (41, 0, 555, 718), "Oslash": (43, -19, 890, 737), "OE": (99, -19, 1116, 737), "ordmasculine": (100, 304, 467, 737), "ae": (62, -15, 909, 538), "dotlessi": (95, 0, 294, 523), "lslash": (41, 0, 347, 718), "oslash": (29, -22, 647, 545), "oe": (84, -15, 964, 538), "germandbls": (67, -15, 657, 728), "onesuperior": (166, 281, 371, 703), "logicalnot": (106, 108, 628, 390), "mu": (24, -207, 600, 523), "trademark": (186, 306, 1056, 718), "Eth": (69, 0, 763, 718), "onehalf": (114, -19, 838, 703), "plusminus": (39, 0, 618, 506), "Thorn": (86, 0, 711, 718), "onequarter": (150, -19, 802, 703), "divide": (85, -19, 606, 524), "brokenbar": (90, -19, 324, 737), "degree": (169, 411, 467, 703), "thorn": (14, -207, 584, 718), "threequarters": (130, -19, 861, 703), "twosuperior": (64, 281, 448, 703), "registered": (55, -19, 837, 737), "minus": (85, 216, 606, 289), "eth": (82, -15, 617, 737), "multiply": (50, 0, 642, 506), "threesuperior": (90, 270, 436, 703), "copyright": (55, -19, 837, 737), "space": (0, 0, 0, 0), "Aacute": (14, 0, 683, 929), "Acircumflex": (14, 0, 654, 929), "Adieresis": (14, 0, 654, 901), "Agrave": (14, 0, 654, 929), "Aring": (14, 0, 654, 931), "Atilde": (14, 0, 699, 917), "Ccedilla": (108, -225, 781, 737), "Eacute": (86, 0, 762, 929), "Ecircumflex": (86, 0, 762, 929), "Edieresis": (86, 0, 762, 901), "Egrave": (86, 0, 762, 929), "Iacute": (91, 0, 489, 929), "Icircumflex": (91, 0, 452, 929), "Idieresis": (91, 0, 458, 901), "Igrave": (91, 0, 351, 929), "Ntilde": (76, 0, 799, 917), "Oacute": (105, -19, 825, 929), "Ocircumflex": (105, -19, 825, 929), "Odieresis": (105, -19, 825, 901), "Ograve": (105, -19, 825, 929), "Otilde": (105, -19, 825, 917), "Scaron": (90, -19, 712, 929), "Uacute": (124, -19, 797, 929), "Ucircumflex": (124, -19, 797, 929), "Udieresis": (124, -19, 
797, 901), "Ugrave": (124, -19, 797, 929), "Yacute": (167, 0, 806, 929), "Ydieresis": (167, 0, 806, 901), "Zcaron": (23, 0, 741, 929), "aacute": (62, -15, 587, 734), "acircumflex": (62, -15, 558, 734), "adieresis": (62, -15, 558, 706), "agrave": (62, -15, 558, 734), "aring": (62, -15, 558, 756), "atilde": (62, -15, 592, 722), "ccedilla": (75, -225, 553, 538), "eacute": (85, -15, 587, 734), "ecircumflex": (85, -15, 578, 734), "edieresis": (85, -15, 578, 706), "egrave": (85, -15, 578, 734), "iacute": (95, 0, 448, 734), "icircumflex": (95, 0, 411, 734), "idieresis": (95, 0, 416, 706), "igrave": (95, 0, 310, 734), "ntilde": (65, 0, 592, 722), "oacute": (84, -14, 587, 734), "ocircumflex": (84, -14, 584, 734), "odieresis": (84, -14, 584, 706), "ograve": (84, -14, 584, 734), "otilde": (84, -14, 602, 722), "scaron": (64, -15, 552, 734), "uacute": (95, -15, 600, 734), "ucircumflex": (95, -15, 600, 734), "udieresis": (95, -15, 600, 706), "ugrave": (95, -15, 600, 734), "yacute": (15, -214, 600, 734), "ydieresis": (15, -214, 600, 706), "zcaron": (31, 0, 571, 734), }, "Helvetica": { ".notdef": (0, 0, 0, 0), "exclam": (90, 0, 187, 718), "quotedbl": (70, 463, 285, 718), "numbersign": (28, 0, 529, 688), "dollar": (32, -115, 520, 775), "percent": (39, -19, 850, 703), "ampersand": (44, -15, 645, 718), "quoteright": (53, 463, 157, 718), "parenleft": (68, -207, 299, 733), "parenright": (34, -207, 265, 733), "asterisk": (39, 431, 349, 718), "plus": (39, 0, 545, 505), "comma": (87, -147, 191, 106), "hyphen": (44, 232, 289, 322), "period": (87, 0, 191, 106), "slash": (-17, -19, 295, 737), "zero": (37, -19, 519, 703), "one": (101, 0, 359, 703), "two": (26, 0, 507, 703), "three": (34, -19, 522, 703), "four": (25, 0, 523, 703), "five": (32, -19, 514, 688), "six": (38, -19, 518, 703), "seven": (37, 0, 523, 688), "eight": (38, -19, 517, 703), "nine": (42, -19, 514, 703), "colon": (87, 0, 191, 516), "semicolon": (87, -147, 191, 516), "less": (48, 11, 536, 495), "equal": (39, 115, 545, 390), 
"greater": (48, 11, 536, 495), "question": (56, 0, 492, 727), "at": (147, -19, 868, 737), "A": (14, 0, 654, 718), "B": (74, 0, 627, 718), "C": (44, -19, 681, 737), "D": (81, 0, 674, 718), "E": (86, 0, 616, 718), "F": (86, 0, 583, 718), "G": (48, -19, 704, 737), "H": (77, 0, 646, 718), "I": (91, 0, 188, 718), "J": (17, -19, 428, 718), "K": (76, 0, 663, 718), "L": (76, 0, 537, 718), "M": (73, 0, 761, 718), "N": (76, 0, 646, 718), "O": (39, -19, 739, 737), "P": (86, 0, 622, 718), "Q": (39, -56, 739, 737), "R": (88, 0, 684, 718), "S": (49, -19, 620, 737), "T": (14, 0, 597, 718), "U": (79, -19, 644, 718), "V": (20, 0, 647, 718), "W": (16, 0, 928, 718), "X": (19, 0, 648, 718), "Y": (14, 0, 653, 718), "Z": (23, 0, 588, 718), "bracketleft": (63, -196, 250, 722), "backslash": (-17, -19, 295, 737), "bracketright": (28, -196, 215, 722), "asciicircum": (-14, 264, 483, 688), "underscore": (0, -125, 556, -75), "quoteleft": (65, 470, 169, 725), "a": (36, -15, 530, 538), "b": (58, -15, 517, 718), "c": (30, -15, 477, 538), "d": (35, -15, 499, 718), "e": (40, -15, 516, 538), "f": (14, 0, 262, 728), "g": (40, -220, 499, 538), "h": (65, 0, 491, 718), "i": (67, 0, 155, 718), "j": (-16, -210, 155, 718), "k": (67, 0, 501, 718), "l": (67, 0, 155, 718), "m": (65, 0, 769, 538), "n": (65, 0, 491, 538), "o": (35, -14, 521, 538), "p": (58, -207, 517, 538), "q": (35, -207, 494, 538), "r": (77, 0, 332, 538), "s": (32, -15, 464, 538), "t": (14, -7, 257, 669), "u": (68, -15, 489, 523), "v": (8, 0, 492, 523), "w": (14, 0, 709, 523), "x": (11, 0, 490, 523), "y": (11, -214, 489, 523), "z": (31, 0, 469, 523), "braceleft": (42, -196, 292, 722), "bar": (94, -19, 167, 737), "braceright": (42, -196, 292, 722), "asciitilde": (61, 180, 523, 326), "exclamdown": (118, -195, 215, 523), "cent": (51, -115, 513, 623), "sterling": (33, -16, 539, 718), "fraction": (-166, -19, 333, 703), "yen": (3, 0, 553, 688), "florin": (-11, -207, 501, 737), "section": (43, -191, 512, 737), "currency": (28, 99, 528, 603), 
"quotesingle": (59, 463, 132, 718), "quotedblleft": (38, 470, 307, 725), "guillemotleft": (97, 108, 459, 446), "guilsinglleft": (88, 108, 245, 446), "guilsinglright": (88, 108, 245, 446), "fi": (14, 0, 434, 728), "fl": (14, 0, 432, 728), "endash": (0, 240, 556, 313), "dagger": (43, -159, 514, 718), "daggerdbl": (43, -159, 514, 718), "periodcentered": (77, 190, 202, 315), "paragraph": (18, -173, 497, 718), "bullet": (18, 202, 333, 517), "quotesinglbase": (53, -149, 157, 106), "quotedblbase": (26, -149, 295, 106), "quotedblright": (26, 463, 295, 718), "guillemotright": (97, 108, 459, 446), "ellipsis": (115, 0, 885, 106), "perthousand": (7, -19, 994, 703), "questiondown": (91, -201, 527, 525), "grave": (14, 593, 211, 734), "acute": (122, 593, 319, 734), "circumflex": (21, 593, 312, 734), "tilde": (-4, 606, 337, 722), "macron": (10, 627, 323, 684), "breve": (13, 595, 321, 731), "dotaccent": (121, 604, 212, 706), "dieresis": (40, 604, 293, 706), "ring": (75, 572, 259, 756), "cedilla": (45, -225, 259, 0), "hungarumlaut": (31, 593, 409, 734), "ogonek": (73, -225, 287, 0), "caron": (21, 593, 312, 734), "emdash": (0, 240, 1000, 313), "AE": (8, 0, 951, 718), "ordfeminine": (24, 304, 346, 737), "Lslash": (-20, 0, 537, 718), "Oslash": (39, -19, 740, 737), "OE": (36, -19, 965, 737), "ordmasculine": (25, 304, 341, 737), "ae": (36, -15, 847, 538), "dotlessi": (95, 0, 183, 523), "lslash": (-20, 0, 242, 718), "oslash": (28, -22, 537, 545), "oe": (35, -15, 902, 538), "germandbls": (67, -15, 571, 728), "onesuperior": (43, 281, 222, 703), "logicalnot": (39, 108, 545, 390), "mu": (68, -207, 489, 523), "trademark": (46, 306, 903, 718), "Eth": (0, 0, 674, 718), "onehalf": (43, -19, 773, 703), "plusminus": (39, 0, 545, 506), "Thorn": (86, 0, 622, 718), "onequarter": (73, -19, 756, 703), "divide": (39, -19, 545, 524), "brokenbar": (94, -19, 167, 737), "degree": (54, 411, 346, 703), "thorn": (58, -207, 517, 718), "threequarters": (45, -19, 810, 703), "twosuperior": (4, 281, 323, 703), 
"registered": (-14, -19, 752, 737), "minus": (39, 216, 545, 289), "eth": (35, -15, 522, 737), "multiply": (39, 0, 545, 506), "threesuperior": (5, 270, 325, 703), "copyright": (-14, -19, 752, 737), "space": (0, 0, 0, 0), "Aacute": (14, 0, 654, 929), "Acircumflex": (14, 0, 654, 929), "Adieresis": (14, 0, 654, 901), "Agrave": (14, 0, 654, 929), "Aring": (14, 0, 654, 931), "Atilde": (14, 0, 654, 917), "Ccedilla": (44, -225, 681, 737), "Eacute": (86, 0, 616, 929), "Ecircumflex": (86, 0, 616, 929), "Edieresis": (86, 0, 616, 901), "Egrave": (86, 0, 616, 929), "Iacute": (91, 0, 292, 929), "Icircumflex": (-6, 0, 285, 929), "Idieresis": (13, 0, 266, 901), "Igrave": (-13, 0, 188, 929), "Ntilde": (76, 0, 646, 917), "Oacute": (39, -19, 739, 929), "Ocircumflex": (39, -19, 739, 929), "Odieresis": (39, -19, 739, 901), "Ograve": (39, -19, 739, 929), "Otilde": (39, -19, 739, 917), "Scaron": (49, -19, 620, 929), "Uacute": (79, -19, 644, 929), "Ucircumflex": (79, -19, 644, 929), "Udieresis": (79, -19, 644, 901), "Ugrave": (79, -19, 644, 929), "Yacute": (14, 0, 653, 929), "Ydieresis": (14, 0, 653, 901), "Zcaron": (23, 0, 588, 929), "aacute": (36, -15, 530, 734), "acircumflex": (36, -15, 530, 734), "adieresis": (36, -15, 530, 706), "agrave": (36, -15, 530, 734), "aring": (36, -15, 530, 756), "atilde": (36, -15, 530, 722), "ccedilla": (30, -225, 477, 538), "eacute": (40, -15, 516, 734), "ecircumflex": (40, -15, 516, 734), "edieresis": (40, -15, 516, 706), "egrave": (40, -15, 516, 734), "iacute": (95, 0, 292, 734), "icircumflex": (-6, 0, 285, 734), "idieresis": (13, 0, 266, 706), "igrave": (-13, 0, 184, 734), "ntilde": (65, 0, 491, 722), "oacute": (35, -14, 521, 734), "ocircumflex": (35, -14, 521, 734), "odieresis": (35, -14, 521, 706), "ograve": (35, -14, 521, 734), "otilde": (35, -14, 521, 722), "scaron": (32, -15, 464, 734), "uacute": (68, -15, 489, 734), "ucircumflex": (68, -15, 489, 734), "udieresis": (68, -15, 489, 706), "ugrave": (68, -15, 489, 734), "yacute": (11, -214, 489, 734), 
"ydieresis": (11, -214, 489, 706), "zcaron": (31, 0, 469, 734), }, "Symbol": { ".notdef": (0, 0, 0, 0), "exclam": (128, -17, 240, 672), "universal": (31, 0, 681, 705), "numbersign": (20, -16, 481, 673), "existential": (25, 0, 478, 707), "percent": (64, -35, 771, 655), "ampersand": (42, -17, 750, 661), "suchthat": (48, -17, 414, 499), "parenleft": (53, -191, 300, 673), "parenright": (30, -191, 277, 673), "asteriskmath": (65, 134, 427, 551), "plus": (10, 0, 539, 533), "comma": (56, -152, 194, 104), "minus": (11, 233, 535, 288), "period": (69, -17, 181, 95), "slash": (0, -18, 254, 646), "zero": (24, -17, 470, 685), "one": (117, 0, 390, 673), "two": (25, 0, 475, 685), "three": (39, -17, 435, 685), "four": (16, 0, 469, 685), "five": (29, -17, 443, 685), "six": (36, -17, 467, 685), "seven": (24, -16, 448, 673), "eight": (55, -17, 440, 684), "nine": (32, -18, 459, 684), "colon": (81, -17, 193, 460), "semicolon": (83, -152, 221, 460), "less": (26, 0, 523, 522), "equal": (11, 141, 537, 390), "greater": (26, 0, 523, 522), "question": (71, -17, 411, 686), "congruent": (11, 0, 537, 475), "Alpha": (4, 0, 684, 673), "Beta": (29, 0, 592, 673), "Chi": (-9, 0, 704, 673), "Delta": (6, 0, 608, 688), "Epsilon": (32, 0, 617, 673), "Phi": (26, 0, 741, 673), "Gamma": (24, 0, 609, 673), "Eta": (39, 0, 729, 673), "Iota": (32, 0, 316, 673), "theta1": (18, -17, 623, 689), "Kappa": (35, 0, 722, 673), "Lambda": (6, 0, 680, 688), "Mu": (28, 0, 887, 673), "Nu": (29, -8, 720, 673), "Omicron": (41, -17, 715, 685), "Pi": (25, 0, 745, 673), "Theta": (41, -17, 715, 685), "Rho": (28, 0, 562, 673), "Sigma": (5, 0, 589, 673), "Tau": (33, 0, 607, 673), "Upsilon": (-8, 0, 694, 673), "sigma1": (40, -233, 436, 500), "Omega": (34, 0, 736, 688), "Xi": (40, 0, 599, 673), "Psi": (15, 0, 781, 684), "Zeta": (44, 0, 636, 673), "bracketleft": (86, -155, 299, 674), "therefore": (163, 0, 701, 478), "bracketright": (33, -155, 246, 674), "perpendicular": (15, 0, 652, 674), "underscore": (-2, -252, 502, -206), 
"radicalex": (480, 881, 1090, 917), "alpha": (41, -18, 622, 500), "beta": (61, -223, 515, 740), "chi": (12, -231, 522, 499), "delta": (40, -18, 481, 739), "epsilon": (22, -19, 427, 501), "phi": (28, -224, 490, 671), "gamma": (6, -225, 484, 498), "eta": (0, -202, 527, 513), "iota": (0, -17, 301, 503), "phi1": (37, -224, 587, 499), "kappa": (33, 0, 558, 501), "lambda": (24, -17, 548, 739), "mu": (33, -223, 567, 500), "nu": (-9, -16, 474, 507), "omicron": (35, -18, 501, 498), "pi": (10, -19, 530, 487), "theta": (43, -17, 485, 690), "rho": (50, -230, 490, 498), "sigma": (31, -21, 588, 500), "tau": (10, -18, 418, 500), "upsilon": (7, -18, 535, 507), "omega1": (12, -17, 671, 583), "omega": (43, -17, 683, 500), "xi": (28, -224, 469, 765), "psi": (12, -228, 701, 500), "zeta": (60, -225, 467, 756), "braceleft": (58, -183, 397, 673), "bar": (65, -177, 135, 673), "braceright": (79, -183, 418, 673), "similar": (17, 203, 529, 307), "Upsilon1": (-1, 0, 610, 685), "minute": (27, 459, 228, 734), "lessequal": (29, 0, 526, 639), "fraction": (-180, -12, 340, 677), "infinity": (26, 125, 688, 404), "florin": (2, -193, 494, 686), "club": (86, -26, 660, 533), "diamond": (142, -36, 600, 550), "heart": (117, -33, 631, 532), "spade": (114, -36, 628, 548), "arrowboth": (24, -15, 1024, 511), "arrowleft": (32, -15, 942, 511), "arrowup": (45, 0, 571, 910), "arrowright": (49, -15, 959, 511), "arrowdown": (45, -22, 571, 888), "degree": (50, 385, 350, 685), "plusminus": (10, 0, 539, 645), "second": (20, 459, 413, 736), "greaterequal": (29, 0, 526, 639), "multiply": (17, 8, 533, 524), "proportional": (27, 124, 639, 404), "partialdiff": (27, -20, 462, 745), "bullet": (50, 113, 410, 473), "divide": (10, 71, 536, 456), "notequal": (15, -25, 540, 549), "equivalence": (14, 82, 538, 443), "approxequal": (14, 135, 527, 394), "ellipsis": (111, -17, 889, 95), "arrowvertex": (280, -120, 336, 1010), "arrowhorizex": (-60, 220, 1050, 276), "carriagereturn": (15, -16, 602, 629), "aleph": (175, -18, 661, 658), 
"Ifraktur": (10, -53, 578, 740), "Rfraktur": (26, -15, 759, 733), "weierstrass": (159, -211, 870, 573), "circlemultiply": (43, -17, 733, 673), "circleplus": (43, -15, 733, 675), "emptyset": (39, -24, 781, 719), "intersection": (40, 0, 732, 509), "union": (40, -17, 732, 492), "propersuperset": (20, 0, 673, 470), "reflexsuperset": (20, -125, 673, 470), "notsubset": (36, -70, 690, 540), "propersubset": (37, 0, 690, 470), "reflexsubset": (37, -125, 690, 470), "element": (45, 0, 505, 468), "notelement": (45, -58, 505, 555), "angle": (26, 0, 738, 673), "gradient": (36, -19, 681, 718), "registerserif": (50, -17, 740, 673), "copyrightserif": (51, -15, 741, 675), "trademarkserif": (18, 293, 855, 673), "product": (25, -101, 803, 751), "radical": (10, -38, 515, 917), "dotmath": (69, 210, 169, 310), "logicalnot": (15, 0, 680, 288), "logicaland": (23, 0, 583, 454), "logicalor": (30, 0, 578, 477), "arrowdblboth": (27, -20, 1023, 510), "arrowdblleft": (30, -15, 939, 513), "arrowdblup": (39, 2, 567, 911), "arrowdblright": (45, -20, 954, 508), "arrowdbldown": (44, -19, 572, 890), "lozenge": (18, 0, 466, 745), "angleleft": (25, -198, 306, 746), "registersans": (50, -20, 740, 670), "copyrightsans": (49, -15, 739, 675), "trademarksans": (5, 293, 725, 673), "summation": (14, -108, 695, 752), "parenlefttp": (40, -293, 436, 926), "parenleftex": (40, -85, 92, 925), "parenleftbt": (40, -293, 436, 926), "bracketlefttp": (0, -80, 341, 926), "bracketleftex": (0, -79, 55, 925), "bracketleftbt": (0, -80, 340, 926), "bracelefttp": (201, -75, 439, 926), "braceleftmid": (14, -85, 255, 935), "braceleftbt": (201, -70, 439, 926), "braceex": (201, -80, 255, 935), "angleright": (21, -198, 302, 746), "integral": (2, -107, 290, 915), "integraltp": (332, -83, 715, 921), "integralex": (332, -88, 415, 975), "integralbt": (39, -81, 415, 921), "parenrighttp": (54, -293, 450, 926), "parenrightex": (398, -85, 450, 925), "parenrightbt": (54, -293, 450, 926), "bracketrighttp": (22, -80, 360, 926), 
"bracketrightex": (305, -79, 360, 925), "bracketrightbt": (20, -80, 360, 926), "bracerighttp": (17, -75, 255, 926), "bracerightmid": (201, -85, 442, 935), "bracerightbt": (17, -70, 255, 926), "apple": (56, -2, 733, 808), "space": (0, 0, 0, 0), }, "Times-BoldItalic": { ".notdef": (0, 0, 0, 0), "exclam": (67, -13, 370, 684), "quotedbl": (136, 398, 536, 685), "numbersign": (-33, 0, 533, 700), "dollar": (-20, -100, 497, 733), "percent": (39, -10, 793, 692), "ampersand": (5, -19, 699, 682), "quoteright": (98, 369, 302, 685), "parenleft": (28, -179, 344, 685), "parenright": (-44, -179, 271, 685), "asterisk": (65, 249, 456, 685), "plus": (33, 0, 537, 506), "comma": (-60, -182, 144, 134), "hyphen": (2, 166, 271, 282), "period": (-9, -13, 139, 135), "slash": (-64, -18, 342, 685), "zero": (17, -14, 477, 683), "one": (5, 0, 419, 683), "two": (-27, 0, 446, 683), "three": (-15, -13, 450, 683), "four": (-15, 0, 503, 683), "five": (-11, -13, 487, 669), "six": (23, -15, 509, 679), "seven": (52, 0, 525, 669), "eight": (3, -13, 476, 683), "nine": (-12, -10, 475, 683), "colon": (23, -13, 264, 459), "semicolon": (-25, -183, 264, 459), "less": (31, -8, 539, 514), "equal": (33, 107, 537, 399), "greater": (31, -8, 539, 514), "question": (79, -13, 470, 684), "at": (63, -18, 770, 685), "A": (-67, 0, 593, 683), "B": (-24, 0, 624, 669), "C": (32, -18, 677, 685), "D": (-46, 0, 685, 669), "E": (-27, 0, 653, 669), "F": (-13, 0, 660, 669), "G": (21, -18, 706, 685), "H": (-24, 0, 799, 669), "I": (-32, 0, 406, 669), "J": (-46, -99, 524, 669), "K": (-21, 0, 702, 669), "L": (-22, 0, 590, 669), "M": (-29, -12, 917, 669), "N": (-27, -15, 748, 669), "O": (27, -18, 691, 685), "P": (-27, 0, 613, 669), "Q": (27, -208, 691, 685), "R": (-29, 0, 623, 669), "S": (2, -18, 526, 685), "T": (50, 0, 650, 669), "U": (67, -18, 744, 669), "V": (65, -18, 715, 669), "W": (65, -18, 940, 669), "X": (-24, 0, 694, 669), "Y": (73, 0, 659, 669), "Z": (-11, 0, 590, 669), "bracketleft": (-37, -159, 362, 674), "backslash": (-1, 
-18, 279, 685), "bracketright": (-56, -157, 343, 674), "asciicircum": (67, 304, 503, 669), "underscore": (0, -125, 500, -75), "quoteleft": (128, 369, 332, 685), "a": (-21, -14, 455, 462), "b": (-14, -13, 444, 699), "c": (-5, -13, 392, 462), "d": (-21, -13, 517, 699), "e": (5, -13, 398, 462), "f": (-169, -205, 446, 698), "g": (-52, -203, 478, 462), "h": (-13, -9, 498, 699), "i": (2, -9, 263, 684), "j": (-189, -207, 279, 684), "k": (-23, -8, 483, 699), "l": (2, -9, 290, 699), "m": (-14, -9, 722, 462), "n": (-6, -9, 493, 462), "o": (-3, -13, 441, 462), "p": (-120, -205, 446, 462), "q": (1, -205, 471, 462), "r": (-21, 0, 389, 462), "s": (-19, -13, 333, 462), "t": (-11, -9, 281, 594), "u": (15, -9, 492, 462), "v": (16, -13, 401, 462), "w": (16, -13, 614, 462), "x": (-46, -13, 469, 462), "y": (-94, -205, 392, 462), "z": (-43, -78, 368, 449), "braceleft": (5, -187, 436, 686), "bar": (66, -18, 154, 685), "braceright": (-129, -187, 302, 686), "asciitilde": (54, 173, 516, 333), "exclamdown": (19, -205, 322, 492), "cent": (42, -143, 439, 576), "sterling": (-32, -12, 510, 683), "fraction": (-169, -14, 324, 683), "yen": (33, 0, 628, 669), "florin": (-87, -156, 537, 707), "section": (36, -143, 459, 685), "currency": (-26, 34, 526, 586), "quotesingle": (128, 398, 268, 685), "quotedblleft": (53, 369, 513, 685), "guillemotleft": (12, 32, 468, 415), "guilsinglleft": (32, 32, 303, 415), "guilsinglright": (10, 32, 281, 415), "fi": (-188, -205, 514, 703), "fl": (-186, -205, 553, 704), "endash": (-40, 178, 477, 269), "dagger": (91, -145, 494, 685), "daggerdbl": (10, -139, 493, 685), "periodcentered": (51, 257, 199, 405), "paragraph": (-57, -193, 562, 669), "bullet": (0, 175, 350, 525), "quotesinglbase": (-5, -182, 199, 134), "quotedblbase": (-57, -182, 403, 134), "quotedblright": (53, 369, 513, 685), "guillemotright": (12, 32, 468, 415), "ellipsis": (40, -13, 852, 135), "perthousand": (7, -29, 996, 706), "questiondown": (30, -205, 421, 492), "grave": (85, 516, 297, 697), "acute": (139, 
516, 379, 697), "circumflex": (40, 516, 367, 690), "tilde": (48, 536, 407, 655), "macron": (51, 553, 393, 623), "breve": (71, 516, 387, 678), "dotaccent": (163, 525, 293, 655), "dieresis": (55, 525, 397, 655), "ring": (127, 516, 340, 729), "cedilla": (-80, -218, 156, 5), "hungarumlaut": (69, 516, 498, 697), "ogonek": (-40, -173, 189, 44), "caron": (79, 516, 411, 690), "emdash": (-40, 178, 977, 269), "AE": (-64, 0, 918, 669), "ordfeminine": (16, 399, 330, 685), "Lslash": (-22, 0, 590, 669), "Oslash": (27, -125, 691, 764), "OE": (23, -8, 946, 677), "ordmasculine": (56, 400, 347, 685), "ae": (-5, -13, 673, 462), "dotlessi": (2, -9, 238, 462), "lslash": (-13, -9, 301, 699), "oslash": (-3, -119, 441, 560), "oe": (6, -13, 674, 462), "germandbls": (-200, -200, 473, 705), "onesuperior": (30, 274, 301, 683), "logicalnot": (51, 108, 555, 399), "mu": (-60, -207, 516, 449), "trademark": (32, 263, 968, 669), "Eth": (-31, 0, 700, 669), "onehalf": (-9, -14, 723, 683), "plusminus": (33, 0, 537, 506), "Thorn": (-27, 0, 573, 669), "onequarter": (7, -14, 721, 683), "divide": (33, -29, 537, 535), "brokenbar": (66, -18, 154, 685), "degree": (83, 397, 369, 683), "thorn": (-120, -205, 446, 699), "threequarters": (7, -14, 726, 683), "twosuperior": (2, 274, 313, 683), "registered": (30, -18, 718, 685), "minus": (51, 209, 555, 297), "eth": (-3, -13, 454, 699), "multiply": (48, 16, 522, 490), "threesuperior": (17, 265, 321, 683), "copyright": (30, -18, 718, 685), "space": (0, 0, 0, 0), "Aacute": (-67, 0, 593, 904), "Acircumflex": (-67, 0, 593, 897), "Adieresis": (-67, 0, 593, 862), "Agrave": (-67, 0, 593, 904), "Aring": (-67, 0, 593, 921), "Atilde": (-67, 0, 593, 862), "Ccedilla": (32, -218, 677, 685), "Eacute": (-27, 0, 653, 904), "Ecircumflex": (-27, 0, 653, 897), "Edieresis": (-27, 0, 653, 862), "Egrave": (-27, 0, 653, 904), "Iacute": (-32, 0, 412, 904), "Icircumflex": (-32, 0, 420, 897), "Idieresis": (-32, 0, 445, 862), "Igrave": (-32, 0, 406, 904), "Ntilde": (-27, -15, 748, 862), 
"Oacute": (27, -18, 691, 904), "Ocircumflex": (27, -18, 691, 897), "Odieresis": (27, -18, 691, 862), "Ograve": (27, -18, 691, 904), "Otilde": (27, -18, 691, 862), "Scaron": (2, -18, 526, 897), "Uacute": (67, -18, 744, 904), "Ucircumflex": (67, -18, 744, 897), "Udieresis": (67, -18, 744, 862), "Ugrave": (67, -18, 744, 904), "Yacute": (73, 0, 659, 904), "Ydieresis": (73, 0, 659, 862), "Zcaron": (-11, 0, 590, 897), "aacute": (-21, -14, 463, 697), "acircumflex": (-21, -14, 455, 690), "adieresis": (-21, -14, 471, 655), "agrave": (-21, -14, 455, 697), "aring": (-21, -14, 455, 729), "atilde": (-21, -14, 491, 655), "ccedilla": (-24, -218, 392, 462), "eacute": (5, -13, 435, 697), "ecircumflex": (5, -13, 423, 690), "edieresis": (5, -13, 443, 655), "egrave": (5, -13, 398, 697), "iacute": (2, -9, 352, 697), "icircumflex": (-2, -9, 325, 690), "idieresis": (2, -9, 360, 655), "igrave": (2, -9, 260, 697), "ntilde": (-6, -9, 504, 655), "oacute": (-3, -13, 463, 697), "ocircumflex": (-3, -13, 451, 690), "odieresis": (-3, -13, 466, 655), "ograve": (-3, -13, 441, 697), "otilde": (-3, -13, 491, 655), "scaron": (-19, -13, 439, 690), "uacute": (15, -9, 492, 697), "ucircumflex": (15, -9, 492, 690), "udieresis": (15, -9, 494, 655), "ugrave": (15, -9, 492, 697), "yacute": (-94, -205, 435, 697), "ydieresis": (-94, -205, 438, 655), "zcaron": (-43, -78, 424, 690), }, "Times-Bold": { ".notdef": (0, 0, 0, 0), "exclam": (81, -13, 251, 691), "quotedbl": (83, 404, 472, 691), "numbersign": (4, 0, 496, 700), "dollar": (29, -99, 472, 750), "percent": (124, -14, 877, 692), "ampersand": (62, -16, 787, 691), "quoteright": (79, 356, 263, 691), "parenleft": (46, -168, 306, 694), "parenright": (27, -168, 287, 694), "asterisk": (56, 255, 447, 691), "plus": (33, 0, 537, 506), "comma": (39, -180, 223, 155), "hyphen": (44, 171, 287, 287), "period": (41, -13, 210, 156), "slash": (-24, -19, 302, 691), "zero": (24, -13, 476, 688), "one": (65, 0, 442, 688), "two": (17, 0, 478, 688), "three": (16, -14, 468, 688), 
"four": (19, 0, 475, 688), "five": (22, -8, 470, 676), "six": (28, -13, 475, 688), "seven": (17, 0, 477, 676), "eight": (28, -13, 472, 688), "nine": (26, -13, 473, 688), "colon": (82, -13, 251, 472), "semicolon": (82, -180, 266, 472), "less": (31, -8, 539, 514), "equal": (33, 107, 537, 399), "greater": (31, -8, 539, 514), "question": (57, -13, 445, 689), "at": (108, -19, 822, 691), "A": (9, 0, 689, 690), "B": (16, 0, 619, 676), "C": (49, -19, 687, 691), "D": (14, 0, 690, 676), "E": (16, 0, 641, 676), "F": (16, 0, 583, 676), "G": (37, -19, 755, 691), "H": (21, 0, 759, 676), "I": (20, 0, 370, 676), "J": (3, -96, 479, 676), "K": (30, 0, 769, 676), "L": (19, 0, 638, 676), "M": (14, 0, 921, 676), "N": (16, -18, 701, 676), "O": (35, -19, 743, 691), "P": (16, 0, 600, 676), "Q": (35, -176, 743, 691), "R": (26, 0, 715, 676), "S": (35, -19, 513, 692), "T": (31, 0, 636, 676), "U": (16, -19, 701, 676), "V": (16, -18, 701, 676), "W": (19, -15, 981, 676), "X": (16, 0, 699, 676), "Y": (15, 0, 699, 676), "Z": (28, 0, 634, 676), "bracketleft": (67, -149, 301, 678), "backslash": (-25, -19, 303, 691), "bracketright": (32, -149, 266, 678), "asciicircum": (73, 311, 509, 676), "underscore": (0, -125, 500, -75), "quoteleft": (70, 356, 254, 691), "a": (25, -14, 488, 473), "b": (17, -14, 521, 676), "c": (25, -14, 430, 473), "d": (25, -14, 534, 676), "e": (25, -14, 426, 473), "f": (14, 0, 389, 691), "g": (28, -206, 483, 473), "h": (16, 0, 534, 676), "i": (16, 0, 255, 691), "j": (-57, -203, 263, 691), "k": (22, 0, 543, 676), "l": (16, 0, 255, 676), "m": (16, 0, 814, 473), "n": (21, 0, 539, 473), "o": (25, -14, 476, 473), "p": (19, -205, 524, 473), "q": (34, -205, 536, 473), "r": (29, 0, 434, 473), "s": (25, -14, 361, 473), "t": (20, -12, 332, 630), "u": (16, -14, 537, 461), "v": (21, -14, 485, 461), "w": (23, -14, 707, 461), "x": (12, 0, 484, 461), "y": (16, -205, 480, 461), "z": (21, 0, 420, 461), "braceleft": (22, -175, 340, 698), "bar": (66, -19, 154, 691), "braceright": (54, -175, 372, 
698), "asciitilde": (29, 173, 491, 333), "exclamdown": (82, -203, 252, 501), "cent": (53, -140, 458, 588), "sterling": (21, -14, 477, 684), "fraction": (-168, -12, 329, 688), "yen": (-64, 0, 547, 676), "florin": (0, -155, 498, 706), "section": (57, -132, 443, 691), "currency": (-26, 61, 526, 613), "quotesingle": (75, 404, 204, 691), "quotedblleft": (32, 356, 486, 691), "guillemotleft": (23, 36, 473, 415), "guilsinglleft": (51, 36, 305, 415), "guilsinglright": (28, 36, 282, 415), "fi": (14, 0, 536, 691), "fl": (14, 0, 536, 691), "endash": (0, 181, 500, 271), "dagger": (47, -134, 453, 691), "daggerdbl": (45, -132, 456, 691), "periodcentered": (41, 248, 210, 417), "paragraph": (0, -186, 519, 676), "bullet": (35, 198, 315, 478), "quotesinglbase": (79, -180, 263, 155), "quotedblbase": (14, -180, 468, 155), "quotedblright": (14, 356, 468, 691), "guillemotright": (27, 36, 477, 415), "ellipsis": (82, -13, 917, 156), "perthousand": (7, -29, 995, 706), "questiondown": (55, -201, 443, 501), "grave": (8, 528, 246, 713), "acute": (86, 528, 324, 713), "circumflex": (-2, 528, 335, 704), "tilde": (-16, 547, 349, 674), "macron": (1, 565, 331, 637), "breve": (15, 528, 318, 691), "dotaccent": (103, 537, 230, 667), "dieresis": (-2, 537, 335, 667), "ring": (60, 527, 273, 740), "cedilla": (68, -218, 294, 0), "hungarumlaut": (-13, 528, 425, 713), "ogonek": (90, -173, 319, 44), "caron": (-2, 528, 335, 704), "emdash": (0, 181, 1000, 271), "AE": (4, 0, 951, 676), "ordfeminine": (-1, 397, 301, 688), "Lslash": (19, 0, 638, 676), "Oslash": (35, -74, 743, 737), "OE": (22, -5, 981, 684), "ordmasculine": (18, 397, 312, 688), "ae": (33, -14, 693, 473), "dotlessi": (16, 0, 255, 461), "lslash": (-22, 0, 303, 676), "oslash": (25, -92, 476, 549), "oe": (22, -14, 696, 473), "germandbls": (19, -12, 517, 691), "onesuperior": (28, 275, 273, 688), "logicalnot": (33, 108, 537, 399), "mu": (33, -206, 536, 461), "trademark": (24, 271, 977, 676), "Eth": (6, 0, 690, 676), "onehalf": (-7, -12, 775, 688), 
"plusminus": (33, 0, 537, 506), "Thorn": (16, 0, 600, 676), "onequarter": (28, -12, 743, 688), "divide": (33, -31, 537, 537), "brokenbar": (66, -19, 154, 691), "degree": (57, 402, 343, 688), "thorn": (19, -205, 524, 676), "threequarters": (23, -12, 733, 688), "twosuperior": (0, 275, 300, 688), "registered": (26, -19, 721, 691), "minus": (33, 209, 537, 297), "eth": (25, -14, 476, 691), "multiply": (48, 16, 522, 490), "threesuperior": (3, 268, 297, 688), "copyright": (26, -19, 721, 691), "space": (0, 0, 0, 0), "Aacute": (9, 0, 689, 923), "Acircumflex": (9, 0, 689, 914), "Adieresis": (9, 0, 689, 877), "Agrave": (9, 0, 689, 923), "Aring": (9, 0, 689, 935), "Atilde": (9, 0, 689, 884), "Ccedilla": (49, -218, 687, 691), "Eacute": (16, 0, 641, 923), "Ecircumflex": (16, 0, 641, 914), "Edieresis": (16, 0, 641, 877), "Egrave": (16, 0, 641, 923), "Iacute": (20, 0, 370, 923), "Icircumflex": (20, 0, 370, 914), "Idieresis": (20, 0, 370, 877), "Igrave": (20, 0, 370, 923), "Ntilde": (16, -18, 701, 884), "Oacute": (35, -19, 743, 923), "Ocircumflex": (35, -19, 743, 914), "Odieresis": (35, -19, 743, 877), "Ograve": (35, -19, 743, 923), "Otilde": (35, -19, 743, 884), "Scaron": (35, -19, 513, 914), "Uacute": (16, -19, 701, 923), "Ucircumflex": (16, -19, 701, 914), "Udieresis": (16, -19, 701, 877), "Ugrave": (16, -19, 701, 923), "Yacute": (15, 0, 699, 928), "Ydieresis": (15, 0, 699, 877), "Zcaron": (28, 0, 634, 914), "aacute": (25, -14, 488, 713), "acircumflex": (25, -14, 488, 704), "adieresis": (25, -14, 488, 667), "agrave": (25, -14, 488, 713), "aring": (25, -14, 488, 740), "atilde": (25, -14, 488, 674), "ccedilla": (25, -218, 430, 473), "eacute": (25, -14, 426, 713), "ecircumflex": (25, -14, 426, 704), "edieresis": (25, -14, 426, 667), "egrave": (25, -14, 426, 713), "iacute": (16, 0, 290, 713), "icircumflex": (-36, 0, 301, 704), "idieresis": (-36, 0, 301, 667), "igrave": (-26, 0, 255, 713), "ntilde": (21, 0, 539, 674), "oacute": (25, -14, 476, 713), "ocircumflex": (25, -14, 476, 704), 
"odieresis": (25, -14, 476, 667), "ograve": (25, -14, 476, 713), "otilde": (25, -14, 476, 674), "scaron": (25, -14, 363, 704), "uacute": (16, -14, 537, 713), "ucircumflex": (16, -14, 537, 704), "udieresis": (16, -14, 537, 667), "ugrave": (16, -14, 537, 713), "yacute": (16, -205, 480, 713), "ydieresis": (16, -205, 480, 667), "zcaron": (21, 0, 420, 704), }, "Times-Italic": { ".notdef": (0, 0, 0, 0), "exclam": (39, -11, 302, 667), "quotedbl": (144, 421, 432, 666), "numbersign": (2, 0, 540, 676), "dollar": (31, -89, 497, 731), "percent": (79, -13, 790, 676), "ampersand": (76, -18, 723, 666), "quoteright": (151, 436, 290, 666), "parenleft": (42, -181, 315, 669), "parenright": (16, -180, 289, 669), "asterisk": (128, 255, 492, 666), "plus": (86, 0, 590, 506), "comma": (-4, -129, 135, 101), "hyphen": (49, 192, 282, 255), "period": (27, -11, 138, 100), "slash": (-65, -18, 386, 666), "zero": (32, -7, 497, 676), "one": (49, 0, 409, 676), "two": (12, 0, 452, 676), "three": (15, -7, 465, 676), "four": (1, 0, 479, 676), "five": (15, -7, 491, 666), "six": (30, -7, 521, 686), "seven": (75, -8, 537, 666), "eight": (30, -7, 493, 676), "nine": (23, -17, 492, 676), "colon": (50, -11, 261, 441), "semicolon": (27, -129, 261, 441), "less": (84, -8, 592, 514), "equal": (86, 120, 590, 386), "greater": (84, -8, 592, 514), "question": (132, -12, 472, 664), "at": (118, -18, 806, 666), "A": (-51, 0, 564, 668), "B": (-8, 0, 588, 653), "C": (66, -18, 689, 666), "D": (-8, 0, 700, 653), "E": (-1, 0, 634, 653), "F": (8, 0, 645, 653), "G": (52, -18, 722, 666), "H": (-8, 0, 767, 653), "I": (-8, 0, 384, 653), "J": (-6, -18, 491, 653), "K": (7, 0, 722, 653), "L": (-8, 0, 559, 653), "M": (-18, 0, 873, 653), "N": (-20, -15, 727, 653), "O": (60, -18, 699, 666), "P": (0, 0, 605, 653), "Q": (59, -182, 699, 666), "R": (-13, 0, 588, 653), "S": (17, -18, 508, 667), "T": (59, 0, 633, 653), "U": (102, -18, 765, 653), "V": (76, -18, 688, 653), "W": (71, -18, 906, 653), "X": (-29, 0, 655, 653), "Y": (78, 0, 633, 
653), "Z": (-6, 0, 606, 653), "bracketleft": (21, -153, 391, 663), "backslash": (-41, -18, 319, 666), "bracketright": (12, -153, 382, 663), "asciicircum": (0, 301, 422, 666), "underscore": (0, -125, 500, -75), "quoteleft": (171, 436, 310, 666), "a": (17, -11, 476, 441), "b": (23, -11, 473, 683), "c": (30, -11, 425, 441), "d": (15, -13, 527, 683), "e": (31, -11, 412, 441), "f": (-147, -207, 424, 678), "g": (8, -206, 472, 441), "h": (19, -9, 478, 683), "i": (49, -11, 264, 654), "j": (-124, -207, 276, 654), "k": (14, -11, 461, 683), "l": (41, -11, 279, 683), "m": (12, -9, 704, 441), "n": (14, -9, 474, 441), "o": (27, -11, 468, 441), "p": (-75, -205, 469, 441), "q": (25, -209, 483, 441), "r": (45, 0, 412, 441), "s": (16, -13, 366, 442), "t": (37, -11, 296, 546), "u": (42, -11, 475, 441), "v": (21, -18, 426, 441), "w": (16, -18, 648, 441), "x": (-27, -11, 447, 441), "y": (-24, -206, 426, 441), "z": (-2, -81, 380, 428), "braceleft": (51, -177, 407, 687), "bar": (105, -18, 171, 666), "braceright": (-7, -177, 349, 687), "asciitilde": (40, 183, 502, 323), "exclamdown": (59, -205, 322, 473), "cent": (77, -143, 472, 560), "sterling": (10, -6, 517, 670), "fraction": (-169, -10, 337, 676), "yen": (27, 0, 603, 653), "florin": (25, -182, 507, 682), "section": (53, -162, 461, 666), "currency": (-22, 53, 522, 597), "quotesingle": (132, 421, 241, 666), "quotedblleft": (166, 436, 514, 666), "guillemotleft": (53, 37, 445, 403), "guilsinglleft": (51, 37, 281, 403), "guilsinglright": (52, 37, 282, 403), "fi": (-141, -207, 481, 681), "fl": (-141, -204, 517, 682), "endash": (-6, 197, 505, 243), "dagger": (101, -159, 488, 666), "daggerdbl": (22, -143, 491, 666), "periodcentered": (70, 199, 181, 310), "paragraph": (55, -123, 616, 653), "bullet": (40, 191, 310, 461), "quotesinglbase": (44, -129, 183, 101), "quotedblbase": (57, -129, 405, 101), "quotedblright": (151, 436, 499, 666), "guillemotright": (55, 37, 447, 403), "ellipsis": (57, -11, 762, 100), "perthousand": (25, -19, 1010, 706), 
"questiondown": (28, -205, 368, 471), "grave": (121, 492, 311, 664), "acute": (180, 494, 403, 664), "circumflex": (91, 492, 385, 661), "tilde": (100, 517, 427, 624), "macron": (99, 532, 411, 583), "breve": (117, 492, 418, 650), "dotaccent": (207, 508, 305, 606), "dieresis": (107, 508, 405, 606), "ring": (155, 492, 355, 691), "cedilla": (-30, -217, 182, 0), "hungarumlaut": (93, 494, 486, 664), "ogonek": (-20, -169, 200, 40), "caron": (121, 492, 426, 661), "emdash": (-6, 197, 894, 243), "AE": (-27, 0, 911, 653), "ordfeminine": (42, 406, 352, 676), "Lslash": (-8, 0, 559, 653), "Oslash": (60, -105, 699, 722), "OE": (49, -8, 964, 666), "ordmasculine": (67, 406, 362, 676), "ae": (23, -11, 640, 441), "dotlessi": (49, -11, 235, 441), "lslash": (37, -11, 307, 683), "oslash": (28, -135, 469, 554), "oe": (20, -12, 646, 441), "germandbls": (-168, -207, 493, 679), "onesuperior": (43, 271, 283, 676), "logicalnot": (86, 108, 590, 386), "mu": (-30, -209, 497, 428), "trademark": (30, 247, 957, 653), "Eth": (-8, 0, 700, 653), "onehalf": (34, -10, 749, 676), "plusminus": (86, 0, 590, 506), "Thorn": (0, 0, 569, 653), "onequarter": (33, -10, 736, 676), "divide": (86, -11, 590, 517), "brokenbar": (105, -18, 171, 666), "degree": (101, 390, 387, 676), "thorn": (-75, -205, 469, 683), "threequarters": (23, -10, 736, 676), "twosuperior": (33, 271, 324, 676), "registered": (41, -18, 719, 666), "minus": (86, 220, 590, 286), "eth": (27, -11, 482, 683), "multiply": (93, 8, 582, 497), "threesuperior": (43, 268, 339, 676), "copyright": (41, -18, 719, 666), "space": (0, 0, 0, 0), "Aacute": (-51, 0, 564, 876), "Acircumflex": (-51, 0, 564, 873), "Adieresis": (-51, 0, 564, 818), "Agrave": (-51, 0, 564, 876), "Aring": (-51, 0, 564, 883), "Atilde": (-51, 0, 566, 836), "Ccedilla": (66, -217, 689, 666), "Eacute": (-1, 0, 634, 876), "Ecircumflex": (-1, 0, 634, 873), "Edieresis": (-1, 0, 634, 818), "Egrave": (-1, 0, 634, 876), "Iacute": (-8, 0, 413, 876), "Icircumflex": (-8, 0, 425, 873), "Idieresis": (-8, 
0, 435, 818), "Igrave": (-8, 0, 384, 876), "Ntilde": (-20, -15, 727, 836), "Oacute": (60, -18, 699, 876), "Ocircumflex": (60, -18, 699, 873), "Odieresis": (60, -18, 699, 818), "Ograve": (60, -18, 699, 876), "Otilde": (60, -18, 699, 836), "Scaron": (17, -18, 520, 873), "Uacute": (102, -18, 765, 876), "Ucircumflex": (102, -18, 765, 873), "Udieresis": (102, -18, 765, 818), "Ugrave": (102, -18, 765, 876), "Yacute": (78, 0, 633, 876), "Ydieresis": (78, 0, 633, 818), "Zcaron": (-6, 0, 606, 873), "aacute": (17, -11, 487, 664), "acircumflex": (17, -11, 476, 661), "adieresis": (17, -11, 489, 606), "agrave": (17, -11, 476, 664), "aring": (17, -11, 476, 691), "atilde": (17, -11, 511, 624), "ccedilla": (26, -217, 425, 441), "eacute": (31, -11, 459, 664), "ecircumflex": (31, -11, 441, 661), "edieresis": (31, -11, 451, 606), "egrave": (31, -11, 412, 664), "iacute": (49, -11, 356, 664), "icircumflex": (34, -11, 328, 661), "idieresis": (49, -11, 353, 606), "igrave": (49, -11, 284, 664), "ntilde": (14, -9, 476, 624), "oacute": (27, -11, 487, 664), "ocircumflex": (27, -11, 468, 661), "odieresis": (27, -11, 489, 606), "ograve": (27, -11, 468, 664), "otilde": (27, -11, 496, 624), "scaron": (16, -13, 454, 661), "uacute": (42, -11, 477, 664), "ucircumflex": (42, -11, 475, 661), "udieresis": (42, -11, 479, 606), "ugrave": (42, -11, 475, 664), "yacute": (-24, -206, 459, 664), "ydieresis": (-24, -206, 441, 606), "zcaron": (-2, -81, 434, 661), }, "Times-Roman": { ".notdef": (0, 0, 0, 0), "exclam": (130, -9, 238, 676), "quotedbl": (77, 431, 331, 676), "numbersign": (5, 0, 496, 662), "dollar": (44, -87, 457, 727), "percent": (61, -13, 772, 676), "ampersand": (42, -13, 750, 676), "quoteright": (79, 433, 218, 676), "parenleft": (48, -177, 304, 676), "parenright": (29, -177, 285, 676), "asterisk": (69, 265, 432, 676), "plus": (30, 0, 534, 506), "comma": (56, -141, 195, 102), "hyphen": (39, 194, 285, 257), "period": (70, -11, 181, 100), "slash": (-9, -14, 287, 676), "zero": (24, -14, 476, 676), 
"one": (111, 0, 394, 676), "two": (30, 0, 475, 676), "three": (43, -14, 431, 676), "four": (12, 0, 472, 676), "five": (32, -14, 438, 688), "six": (34, -14, 468, 684), "seven": (20, -8, 449, 662), "eight": (56, -14, 445, 676), "nine": (30, -22, 459, 676), "colon": (81, -11, 192, 459), "semicolon": (80, -141, 219, 459), "less": (28, -8, 536, 514), "equal": (30, 120, 534, 386), "greater": (28, -8, 536, 514), "question": (68, -8, 414, 676), "at": (116, -14, 809, 676), "A": (15, 0, 706, 674), "B": (17, 0, 593, 662), "C": (28, -14, 633, 676), "D": (16, 0, 685, 662), "E": (12, 0, 597, 662), "F": (12, 0, 546, 662), "G": (32, -14, 709, 676), "H": (19, 0, 702, 662), "I": (18, 0, 315, 662), "J": (10, -14, 370, 662), "K": (34, 0, 723, 662), "L": (12, 0, 598, 662), "M": (12, 0, 863, 662), "N": (12, -11, 707, 662), "O": (34, -14, 688, 676), "P": (16, 0, 542, 662), "Q": (34, -178, 701, 676), "R": (17, 0, 659, 662), "S": (42, -14, 491, 676), "T": (17, 0, 593, 662), "U": (14, -14, 705, 662), "V": (16, -11, 697, 662), "W": (5, -11, 932, 662), "X": (10, 0, 704, 662), "Y": (22, 0, 703, 662), "Z": (9, 0, 597, 662), "bracketleft": (88, -156, 299, 662), "backslash": (-9, -14, 287, 676), "bracketright": (34, -156, 245, 662), "asciicircum": (24, 297, 446, 662), "underscore": (0, -125, 500, -75), "quoteleft": (115, 433, 254, 676), "a": (37, -10, 442, 460), "b": (3, -10, 468, 683), "c": (25, -10, 412, 460), "d": (27, -10, 491, 683), "e": (25, -10, 424, 460), "f": (20, 0, 383, 683), "g": (28, -218, 470, 460), "h": (9, 0, 487, 683), "i": (16, 0, 253, 683), "j": (-70, -218, 194, 683), "k": (7, 0, 505, 683), "l": (19, 0, 257, 683), "m": (16, 0, 775, 460), "n": (16, 0, 485, 460), "o": (29, -10, 470, 460), "p": (5, -217, 470, 460), "q": (24, -217, 488, 460), "r": (5, 0, 335, 460), "s": (51, -10, 348, 460), "t": (13, -10, 279, 579), "u": (9, -10, 479, 450), "v": (19, -14, 477, 450), "w": (21, -14, 694, 450), "x": (17, 0, 479, 450), "y": (14, -218, 475, 450), "z": (27, 0, 418, 450), "braceleft": 
(100, -181, 350, 680), "bar": (67, -14, 133, 676), "braceright": (130, -181, 380, 680), "asciitilde": (40, 183, 502, 323), "exclamdown": (97, -218, 205, 467), "cent": (53, -138, 448, 579), "sterling": (12, -8, 490, 676), "fraction": (-168, -14, 331, 676), "yen": (-53, 0, 512, 662), "florin": (7, -189, 490, 676), "section": (70, -148, 426, 676), "currency": (-22, 58, 522, 602), "quotesingle": (48, 431, 133, 676), "quotedblleft": (43, 433, 414, 676), "guillemotleft": (42, 33, 456, 416), "guilsinglleft": (63, 33, 285, 416), "guilsinglright": (48, 33, 270, 416), "fi": (31, 0, 521, 683), "fl": (32, 0, 521, 683), "endash": (0, 201, 500, 250), "dagger": (59, -149, 442, 676), "daggerdbl": (58, -153, 442, 676), "periodcentered": (70, 199, 181, 310), "paragraph": (-22, -154, 450, 662), "bullet": (40, 196, 310, 466), "quotesinglbase": (79, -141, 218, 102), "quotedblbase": (45, -141, 416, 102), "quotedblright": (30, 433, 401, 676), "guillemotright": (44, 33, 458, 416), "ellipsis": (111, -11, 888, 100), "perthousand": (7, -19, 994, 706), "questiondown": (30, -218, 376, 466), "grave": (19, 507, 242, 678), "acute": (93, 507, 317, 678), "circumflex": (11, 507, 322, 674), "tilde": (1, 532, 331, 638), "macron": (11, 547, 322, 601), "breve": (26, 507, 307, 664), "dotaccent": (118, 523, 216, 623), "dieresis": (18, 523, 315, 623), "ring": (67, 512, 266, 711), "cedilla": (52, -215, 261, 0), "hungarumlaut": (-3, 507, 377, 678), "ogonek": (64, -165, 249, 0), "caron": (11, 507, 322, 674), "emdash": (0, 201, 1000, 250), "AE": (0, 0, 863, 662), "ordfeminine": (4, 394, 270, 676), "Lslash": (12, 0, 598, 662), "Oslash": (34, -80, 688, 734), "OE": (30, -6, 885, 668), "ordmasculine": (6, 394, 304, 676), "ae": (38, -10, 632, 460), "dotlessi": (16, 0, 253, 460), "lslash": (19, 0, 259, 683), "oslash": (29, -112, 470, 551), "oe": (30, -10, 690, 460), "germandbls": (12, -9, 468, 683), "onesuperior": (57, 270, 248, 676), "logicalnot": (30, 108, 534, 386), "mu": (36, -218, 512, 450), "trademark": (30, 
256, 957, 662), "Eth": (16, 0, 685, 662), "onehalf": (31, -14, 746, 676), "plusminus": (30, 0, 534, 506), "Thorn": (16, 0, 542, 662), "onequarter": (37, -14, 718, 676), "divide": (30, -10, 534, 516), "brokenbar": (67, -14, 133, 676), "degree": (57, 390, 343, 676), "thorn": (5, -217, 470, 683), "threequarters": (15, -14, 718, 676), "twosuperior": (1, 270, 296, 676), "registered": (38, -14, 722, 676), "minus": (30, 220, 534, 286), "eth": (29, -10, 471, 686), "multiply": (38, 8, 527, 497), "threesuperior": (15, 262, 291, 676), "copyright": (38, -14, 722, 676), "space": (0, 0, 0, 0), "Aacute": (15, 0, 706, 890), "Acircumflex": (15, 0, 706, 886), "Adieresis": (15, 0, 706, 835), "Agrave": (15, 0, 706, 890), "Aring": (15, 0, 706, 898), "Atilde": (15, 0, 706, 850), "Ccedilla": (28, -215, 633, 676), "Eacute": (12, 0, 597, 890), "Ecircumflex": (12, 0, 597, 886), "Edieresis": (12, 0, 597, 835), "Egrave": (12, 0, 597, 890), "Iacute": (18, 0, 317, 890), "Icircumflex": (11, 0, 322, 886), "Idieresis": (18, 0, 315, 835), "Igrave": (18, 0, 315, 890), "Ntilde": (12, -11, 707, 850), "Oacute": (34, -14, 688, 890), "Ocircumflex": (34, -14, 688, 886), "Odieresis": (34, -14, 688, 835), "Ograve": (34, -14, 688, 890), "Otilde": (34, -14, 688, 850), "Scaron": (42, -14, 491, 886), "Uacute": (14, -14, 705, 890), "Ucircumflex": (14, -14, 705, 886), "Udieresis": (14, -14, 705, 835), "Ugrave": (14, -14, 705, 890), "Yacute": (22, 0, 703, 890), "Ydieresis": (22, 0, 703, 835), "Zcaron": (9, 0, 597, 886), "aacute": (37, -10, 442, 678), "acircumflex": (37, -10, 442, 674), "adieresis": (37, -10, 442, 623), "agrave": (37, -10, 442, 678), "aring": (37, -10, 442, 711), "atilde": (37, -10, 442, 638), "ccedilla": (25, -215, 412, 460), "eacute": (25, -10, 424, 678), "ecircumflex": (25, -10, 424, 674), "edieresis": (25, -10, 424, 623), "egrave": (25, -10, 424, 678), "iacute": (16, 0, 290, 678), "icircumflex": (-16, 0, 295, 674), "idieresis": (-9, 0, 288, 623), "igrave": (-8, 0, 253, 678), "ntilde": (16, 0, 
485, 638), "oacute": (29, -10, 470, 678), "ocircumflex": (29, -10, 470, 674), "odieresis": (29, -10, 470, 623), "ograve": (29, -10, 470, 678), "otilde": (29, -10, 470, 638), "scaron": (39, -10, 350, 674), "uacute": (9, -10, 479, 678), "ucircumflex": (9, -10, 479, 674), "udieresis": (9, -10, 479, 623), "ugrave": (9, -10, 479, 678), "yacute": (14, -218, 475, 678), "ydieresis": (14, -218, 475, 623), "zcaron": (27, 0, 418, 674), }, "ZapfDingbats": { ".notdef": (0, 0, 0, 0), "a1": (35, 72, 939, 621), "a2": (35, 81, 927, 611), "a202": (35, 72, 939, 621), "a3": (35, 0, 945, 692), "a4": (34, 139, 685, 566), "a5": (35, -14, 755, 705), "a119": (35, -14, 755, 705), "a118": (35, -13, 761, 705), "a117": (35, 138, 655, 553), "a11": (35, 123, 925, 568), "a12": (35, 134, 904, 559), "a13": (29, -11, 516, 705), "a14": (34, 59, 820, 632), "a15": (35, 50, 876, 642), "a16": (35, 139, 899, 550), "a105": (35, 50, 876, 642), "a17": (35, 139, 909, 553), "a18": (35, 104, 938, 587), "a19": (34, -13, 721, 705), "a20": (36, -14, 811, 705), "a21": (35, 0, 727, 692), "a22": (35, 0, 727, 692), "a23": (-1, -68, 571, 661), "a24": (36, -13, 642, 705), "a25": (35, 0, 728, 692), "a26": (35, 0, 726, 692), "a27": (35, 0, 725, 692), "a28": (35, 0, 720, 692), "a6": (35, 0, 460, 692), "a7": (35, 0, 517, 692), "a8": (35, 0, 503, 692), "a9": (35, 96, 542, 596), "a10": (35, -14, 657, 705), "a29": (35, -14, 751, 705), "a30": (35, -14, 752, 705), "a31": (35, -14, 753, 705), "a32": (35, -14, 756, 705), "a33": (35, -13, 759, 705), "a34": (35, -13, 759, 705), "a35": (35, -14, 782, 705), "a36": (35, -14, 787, 705), "a37": (35, -14, 754, 705), "a38": (35, -14, 807, 705), "a39": (35, -14, 789, 705), "a40": (35, -14, 798, 705), "a41": (35, -13, 782, 705), "a42": (35, -14, 796, 705), "a43": (35, -14, 888, 705), "a44": (35, 0, 710, 692), "a45": (35, 0, 688, 692), "a46": (35, 0, 714, 692), "a47": (34, -14, 756, 705), "a48": (35, -14, 758, 705), "a49": (35, -14, 661, 706), "a50": (35, -6, 741, 699), "a51": (35, -7, 734, 
699), "a52": (35, -14, 757, 705), "a53": (35, 0, 725, 692), "a54": (35, -13, 672, 704), "a55": (35, -14, 672, 705), "a56": (35, -14, 647, 705), "a57": (35, -14, 666, 705), "a58": (35, -14, 791, 705), "a59": (35, -14, 780, 705), "a60": (35, -14, 754, 705), "a61": (35, -14, 754, 705), "a62": (34, -14, 673, 705), "a63": (36, 0, 651, 692), "a64": (35, 1, 661, 690), "a65": (35, 0, 655, 692), "a66": (34, -14, 751, 705), "a67": (35, -14, 752, 705), "a68": (35, -14, 678, 705), "a69": (35, -14, 756, 705), "a70": (36, -14, 751, 705), "a71": (35, -14, 757, 705), "a72": (35, -14, 838, 705), "a73": (35, 0, 726, 692), "a74": (35, 0, 727, 692), "a203": (35, 0, 727, 692), "a75": (35, 0, 725, 692), "a204": (35, 0, 725, 692), "a76": (35, 0, 858, 705), "a77": (35, -14, 858, 692), "a78": (35, -14, 754, 705), "a79": (35, -14, 749, 705), "a81": (35, -14, 403, 705), "a82": (35, 0, 104, 692), "a83": (35, 0, 242, 692), "a84": (35, 0, 380, 692), "a97": (35, 263, 357, 705), "a98": (34, 263, 357, 705), "a99": (35, 263, 633, 705), "a100": (36, 263, 634, 705), "a101": (35, -143, 697, 806), "a102": (56, -14, 488, 706), "a103": (34, -14, 508, 705), "a104": (35, 40, 875, 651), "a106": (35, -14, 633, 705), "a107": (35, -14, 726, 705), "a108": (0, 121, 758, 569), "a112": (35, 0, 741, 705), "a111": (34, -14, 560, 705), "a110": (35, -14, 659, 705), "a109": (34, 0, 591, 705), "a120": (35, -14, 754, 705), "a121": (35, -14, 754, 705), "a122": (35, -14, 754, 705), "a123": (35, -14, 754, 705), "a124": (35, -14, 754, 705), "a125": (35, -14, 754, 705), "a126": (35, -14, 754, 705), "a127": (35, -14, 754, 705), "a128": (35, -14, 754, 705), "a129": (35, -14, 754, 705), "a130": (35, -14, 754, 705), "a131": (35, -14, 754, 705), "a132": (35, -14, 754, 705), "a133": (35, -14, 754, 705), "a134": (35, -14, 754, 705), "a135": (35, -14, 754, 705), "a136": (35, -14, 754, 705), "a137": (35, -14, 754, 705), "a138": (35, -14, 754, 705), "a139": (35, -14, 754, 705), "a140": (35, -14, 754, 705), "a141": (35, -14, 754, 705), 
"a142": (35, -14, 754, 705), "a143": (35, -14, 754, 705), "a144": (35, -14, 754, 705), "a145": (35, -14, 754, 705), "a146": (35, -14, 754, 705), "a147": (35, -14, 754, 705), "a148": (35, -14, 754, 705), "a149": (35, -14, 754, 705), "a150": (35, -14, 754, 705), "a151": (35, -14, 754, 705), "a152": (35, -14, 754, 705), "a153": (35, -14, 754, 705), "a154": (35, -14, 754, 705), "a155": (35, -14, 754, 705), "a156": (35, -14, 754, 705), "a157": (35, -14, 754, 705), "a158": (35, -14, 754, 705), "a159": (35, -14, 754, 705), "a160": (35, 58, 860, 634), "a161": (35, 152, 803, 540), "a163": (34, 152, 981, 540), "a164": (35, -127, 422, 820), "a196": (35, 94, 698, 597), "a165": (35, 140, 890, 552), "a192": (35, 94, 698, 597), "a166": (35, 166, 884, 526), "a167": (35, 32, 892, 660), "a168": (35, 129, 891, 562), "a169": (35, 128, 893, 563), "a170": (35, 155, 799, 537), "a171": (35, 93, 838, 599), "a172": (35, 104, 791, 588), "a173": (35, 98, 889, 594), "a162": (35, 98, 889, 594), "a174": (35, 0, 882, 692), "a175": (35, 84, 896, 608), "a176": (35, 84, 896, 608), "a177": (35, -99, 429, 791), "a178": (35, 71, 848, 623), "a179": (35, 44, 802, 648), "a193": (35, 44, 802, 648), "a180": (35, 101, 832, 591), "a199": (35, 101, 832, 591), "a181": (35, 44, 661, 648), "a200": (35, 44, 661, 648), "a182": (35, 77, 840, 619), "a201": (35, 73, 840, 615), "a183": (35, 0, 725, 692), "a184": (35, 160, 911, 533), "a197": (34, 37, 736, 655), "a185": (35, 207, 830, 481), "a194": (34, 37, 736, 655), "a198": (34, -19, 853, 712), "a186": (35, 124, 932, 568), "a195": (34, -19, 853, 712), "a187": (35, 113, 796, 579), "a188": (36, 118, 838, 578), "a189": (35, 150, 891, 542), "a190": (35, 76, 931, 616), "a191": (34, 99, 884, 593), "a86": (35, 0, 375, 692), "a85": (35, 0, 475, 692), "a95": (35, 0, 299, 692), "a205": (35, 0, 475, 692), "a89": (35, -14, 356, 705), "a87": (35, -14, 199, 705), "a91": (35, 0, 242, 692), "a90": (35, -14, 355, 705), "a206": (35, 0, 375, 692), "a94": (35, 0, 283, 692), "a93": (35, 0, 
283, 692), "a92": (35, 0, 242, 692), "a96": (35, 0, 299, 692), "a88": (35, -14, 199, 705), "space": (0, 0, 0, 0), }, } base14_alias = { "Times New Roman": "Times-Roman", "Times New Roman,Bold": "Times-Bold", "Times New Roman,Italic": "Times-Italic", } def get_cached_bbox(database, family, encoding): bbox = [(0, 0, 0, 0)] * 256 base_font = database[family] for index, name in enumerate(encoding): if name: if cur_bbox := base_font.get(name, None): bbox[index] = cur_bbox return bbox def get_base14_bbox(family, encoding_name="WinAnsiEncoding"): bbox = [(0, 0, 0, 0)] * 256 encoding = get_type1_encoding(encoding_name) if not encoding: return [(0, 0, 0, 0)] * 256 if family in base14_alias: family = base14_alias[family] if family in base14_bbox: bbox = get_cached_bbox(base14_bbox, family, encoding) if family in win_core: bbox = get_cached_bbox(win_core, family, encoding) return bbox ================================================ FILE: babeldoc/format/pdf/babelpdf/cidfont.py ================================================ import re from io import BytesIO import freetype def indirect(obj): if isinstance(obj, tuple) and obj[0] == "xref": return int(obj[1].split(" ")[0]) def get_xref(doc, xref, key): obj = doc.xref_get_key(xref, key) if obj[0] == "xref": return indirect(obj) def get_font_file(doc, xref): if idx := get_xref(doc, xref, "FontFile"): return doc.xref_stream(idx) if idx := get_xref(doc, xref, "FontFile2"): return doc.xref_stream(idx) if idx := get_xref(doc, xref, "FontFile3"): return doc.xref_stream(idx) def get_font_descriptor(doc, xref): if idx := get_xref(doc, xref, "FontDescriptor"): return get_font_file(doc, idx) def get_descendant_fonts(doc, xref): obj = doc.xref_get_key(xref, "DescendantFonts") array_text = "" if obj[0] == "xref": array_text = doc.xref_object(indirect(obj)) elif obj[0] == "array": array_text = obj[1] if m := re.search(r"\d+", array_text): return get_font_descriptor(doc, int(m.group(0))) def get_glyph_bbox(face, g): try: face.load_glyph(g, 
freetype.FT_LOAD_NO_SCALE) outline = face.glyph.outline if outline.contours: cbox = outline.get_bbox() return cbox.xMin, cbox.yMin, cbox.xMax, cbox.yMax else: return 0, 0, 0, 0 except Exception: return 0, 0, 0, 0 def get_face_bbox(blob): face = freetype.Face(BytesIO(blob)) scale = 1000 / face.units_per_EM bbox_list = [get_glyph_bbox(face, code) for code in range(face.num_glyphs)] bbox_list = [[v * scale for v in bbox] for bbox in bbox_list] return bbox_list def get_cidfont_bbox(doc, xref): if doc.xref_get_key(xref, "Subtype")[1] == "/Type0": if blob := get_descendant_fonts(doc, xref): return get_face_bbox(blob) ================================================ FILE: babeldoc/format/pdf/babelpdf/cmap.py ================================================ import re import struct pattern_map_r = ( r"\s+begincidrange\s*" r"(?P(<[a-fA-F0-9]+>\s*<[a-fA-F0-9]+>\s*\d+\s*)+)" r"\s+endcidrange\s+" ) pattern_map_c = ( r"\s+begincidchar\s*" r"(?P(<[a-fA-F0-9]+>\s*\d+\s*)+)" r"\s+endcidchar\s+" ) pattern_one_c = ( r"<(?P[a-fA-F0-9]+)>" r"\s*" r"(?P\d+)" ) pattern_one_r = ( r"<(?P[a-fA-F0-9]+)>" r"\s*" r"<(?P[a-fA-F0-9]+)>" r"\s*" r"(?P\d+)" ) def parse_blob_value(text): return int(text, 16), len(text) // 2 def parse_cmap_char(text, store): for m in re.finditer(pattern_one_c, text): pat = m["pat"] val = m["val"] store.append((pat, int(val))) def parse_cmap_range(text, store): for m in re.finditer(pattern_one_r, text): pat = m["pat"] end = m["end"] val = m["val"] store.append((pat, end, int(val))) def parse_cmap(text): usecmap = "" if m := re.search(r"/(?P[a-zA-Z0-9-]+)\s+usecmap\s+", text): usecmap = m["usecmap"] cidrange = [] for m in re.finditer(pattern_map_r, text): parse_cmap_range(m["cidrange"], cidrange) cidchar = [] for m in re.finditer(pattern_map_c, text): parse_cmap_char(m["cidchar"], cidchar) return usecmap, cidrange, cidchar _CMAP_CACHE: dict[str, tuple[list, list]] = {} def _normalize_cmap_name(name: str) -> str: """Normalize cmap name for internal cache key.""" if 
name.endswith(".json"): return name[: -len(".json")] return name def use_cmap(name: str): key = _normalize_cmap_name(name) if key in _CMAP_CACHE: return _CMAP_CACHE[key] # Lazy import to avoid circular dependency at import time. from babeldoc.assets.assets import get_cmap_data data = get_cmap_data(key) if not isinstance(data, dict): raise TypeError(f"Invalid cmap data type for {key}: {type(data)!r}") cid_u = data.get("u") or "" cid_r = data.get("r") or [] cid_c = data.get("c") or [] store_r: list = [] store_c: list = [] if cid_u: use_r, use_c = use_cmap(cid_u) store_r += use_r store_c += use_c store_r += cid_r store_c += cid_c _CMAP_CACHE[key] = (store_r, store_c) return store_r, store_c def propagation(r, c): encoding = {} len_set = set() for one_r in r: val_l, len_l = parse_blob_value(one_r[0]) val_r, len_r = parse_blob_value(one_r[1]) if len_l != len_r: continue len_set.add(len_l) for i, v in enumerate(range(val_l, val_r + 1)): val_b = struct.pack(">L", v) fin_b = val_b[4 - len_l :] encoding[fin_b] = one_r[2] + i for one_c in c: encoding[one_c[0]] = one_c[1] len_list = list(len_set) len_list.sort(reverse=True) return encoding, len_list class CharacterMap: def __init__(self, text): cid_r = [] cid_c = [] usecmap, cidrange, cidchar = parse_cmap(text) if usecmap: use_r, use_c = use_cmap(usecmap) cid_r += use_r cid_c += use_c cid_r += cidrange cid_c += cidchar self.encoding, self.len_list = propagation(cid_r, cid_c) def decode_one(self, text): for l in self.len_list: pat = text[:l] if pat in self.encoding: return self.encoding[pat], l return 0, 1 def decode(self, text): index = 0 size = len(text) gstr = [] while index < size: g, l = self.decode_one(text[index:]) gstr.append(g) index += l return gstr ================================================ FILE: babeldoc/format/pdf/babelpdf/encoding.py ================================================ adobe_standard = [ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 
None, None, None, None, None, None, None, None, None, None, None, None, None, None, "space", "exclam", "quotedbl", "numbersign", "dollar", "percent", "ampersand", "quoteright", "parenleft", "parenright", "asterisk", "plus", "comma", "hyphen", "period", "slash", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "colon", "semicolon", "less", "equal", "greater", "question", "at", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "bracketleft", "backslash", "bracketright", "asciicircum", "underscore", "quoteleft", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "braceleft", "bar", "braceright", "asciitilde", None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, "exclamdown", "cent", "sterling", "fraction", "yen", "florin", "section", "currency", "quotesingle", "quotedblleft", "guillemotleft", "guilsinglleft", "guilsinglright", "fi", "fl", None, "endash", "dagger", "daggerdbl", "periodcentered", None, "paragraph", "bullet", "quotesinglbase", "quotedblbase", "quotedblright", "guillemotright", "ellipsis", "perthousand", None, "questiondown", None, "grave", "acute", "circumflex", "tilde", "macron", "breve", "dotaccent", "dieresis", None, "ring", "cedilla", None, "hungarumlaut", "ogonek", "caron", "emdash", None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, "AE", None, "ordfeminine", None, None, None, None, "Lslash", "Oslash", "OE", "ordmasculine", None, None, None, None, None, "ae", None, None, None, "dotlessi", None, None, "lslash", "oslash", "oe", "germandbls", None, None, None, None, ] mac_expert = [ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, "space", "exclamsmall", "Hungarumlautsmall", "centoldstyle", "dollaroldstyle", "dollarsuperior", "ampersandsmall", "Acutesmall", "parenleftsuperior", "parenrightsuperior", "twodotenleader", "onedotenleader", "comma", "hyphen", "period", "fraction", "zerooldstyle", "oneoldstyle", "twooldstyle", "threeoldstyle", "fouroldstyle", "fiveoldstyle", "sixoldstyle", "sevenoldstyle", "eightoldstyle", "nineoldstyle", "colon", "semicolon", None, "threequartersemdash", None, "questionsmall", None, None, None, None, "Ethsmall", None, None, "onequarter", "onehalf", "threequarters", "oneeighth", "threeeighths", "fiveeighths", "seveneighths", "onethird", "twothirds", None, None, None, None, None, None, "ff", "fi", "fl", "ffi", "ffl", "parenleftinferior", None, "parenrightinferior", "Circumflexsmall", "hypheninferior", "Gravesmall", "Asmall", "Bsmall", "Csmall", "Dsmall", "Esmall", "Fsmall", "Gsmall", "Hsmall", "Ismall", "Jsmall", "Ksmall", "Lsmall", "Msmall", "Nsmall", "Osmall", "Psmall", "Qsmall", "Rsmall", "Ssmall", "Tsmall", "Usmall", "Vsmall", "Wsmall", "Xsmall", "Ysmall", "Zsmall", "colonmonetary", "onefitted", "rupiah", "Tildesmall", None, None, "asuperior", "centsuperior", None, None, None, None, "Aacutesmall", "Agravesmall", "Acircumflexsmall", "Adieresissmall", "Atildesmall", "Aringsmall", "Ccedillasmall", "Eacutesmall", "Egravesmall", "Ecircumflexsmall", "Edieresissmall", "Iacutesmall", "Igravesmall", "Icircumflexsmall", "Idieresissmall", "Ntildesmall", "Oacutesmall", "Ogravesmall", "Ocircumflexsmall", "Odieresissmall", "Otildesmall", "Uacutesmall", "Ugravesmall", "Ucircumflexsmall", "Udieresissmall", None, "eightsuperior", "fourinferior", "threeinferior", "sixinferior", "eightinferior", "seveninferior", "Scaronsmall", None, "centinferior", "twoinferior", None, "Dieresissmall", None, "Caronsmall", "osuperior", "fiveinferior", None, "commainferior", "periodinferior", 
"Yacutesmall", None, "dollarinferior", None, None, "Thornsmall", None, "nineinferior", "zeroinferior", "Zcaronsmall", "AEsmall", "Oslashsmall", "questiondownsmall", "oneinferior", "Lslashsmall", None, None, None, None, None, None, "Cedillasmall", None, None, None, None, None, "OEsmall", "figuredash", "hyphensuperior", None, None, None, None, "exclamdownsmall", None, "Ydieresissmall", None, "onesuperior", "twosuperior", "threesuperior", "foursuperior", "fivesuperior", "sixsuperior", "sevensuperior", "ninesuperior", "zerosuperior", None, "esuperior", "rsuperior", "tsuperior", None, None, "isuperior", "ssuperior", "dsuperior", None, None, None, None, None, "lsuperior", "Ogoneksmall", "Brevesmall", "Macronsmall", "bsuperior", "nsuperior", "msuperior", "commasuperior", "periodsuperior", "Dotaccentsmall", "Ringsmall", None, None, None, None, ] mac_roman = [ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, "space", "exclamsmall", "Hungarumlautsmall", "centoldstyle", "dollaroldstyle", "dollarsuperior", "ampersandsmall", "Acutesmall", "parenleftsuperior", "parenrightsuperior", "twodotenleader", "onedotenleader", "comma", "hyphen", "period", "fraction", "zerooldstyle", "oneoldstyle", "twooldstyle", "threeoldstyle", "fouroldstyle", "fiveoldstyle", "sixoldstyle", "sevenoldstyle", "eightoldstyle", "nineoldstyle", "colon", "semicolon", None, "threequartersemdash", None, "questionsmall", None, None, None, None, "Ethsmall", None, None, "onequarter", "onehalf", "threequarters", "oneeighth", "threeeighths", "fiveeighths", "seveneighths", "onethird", "twothirds", None, None, None, None, None, None, "ff", "fi", "fl", "ffi", "ffl", "parenleftinferior", None, "parenrightinferior", "Circumflexsmall", "hypheninferior", "Gravesmall", "Asmall", "Bsmall", "Csmall", "Dsmall", "Esmall", "Fsmall", "Gsmall", "Hsmall", "Ismall", "Jsmall", "Ksmall", 
"Lsmall", "Msmall", "Nsmall", "Osmall", "Psmall", "Qsmall", "Rsmall", "Ssmall", "Tsmall", "Usmall", "Vsmall", "Wsmall", "Xsmall", "Ysmall", "Zsmall", "colonmonetary", "onefitted", "rupiah", "Tildesmall", None, None, "asuperior", "centsuperior", None, None, None, None, "Aacutesmall", "Agravesmall", "Acircumflexsmall", "Adieresissmall", "Atildesmall", "Aringsmall", "Ccedillasmall", "Eacutesmall", "Egravesmall", "Ecircumflexsmall", "Edieresissmall", "Iacutesmall", "Igravesmall", "Icircumflexsmall", "Idieresissmall", "Ntildesmall", "Oacutesmall", "Ogravesmall", "Ocircumflexsmall", "Odieresissmall", "Otildesmall", "Uacutesmall", "Ugravesmall", "Ucircumflexsmall", "Udieresissmall", None, "eightsuperior", "fourinferior", "threeinferior", "sixinferior", "eightinferior", "seveninferior", "Scaronsmall", None, "centinferior", "twoinferior", None, "Dieresissmall", None, "Caronsmall", "osuperior", "fiveinferior", None, "commainferior", "periodinferior", "Yacutesmall", None, "dollarinferior", None, None, "Thornsmall", None, "nineinferior", "zeroinferior", "Zcaronsmall", "AEsmall", "Oslashsmall", "questiondownsmall", "oneinferior", "Lslashsmall", None, None, None, None, None, None, "Cedillasmall", None, None, None, None, None, "OEsmall", "figuredash", "hyphensuperior", None, None, None, None, "exclamdownsmall", None, "Ydieresissmall", None, "onesuperior", "twosuperior", "threesuperior", "foursuperior", "fivesuperior", "sixsuperior", "sevensuperior", "ninesuperior", "zerosuperior", None, "esuperior", "rsuperior", "tsuperior", None, None, "isuperior", "ssuperior", "dsuperior", None, None, None, None, None, "lsuperior", "Ogoneksmall", "Brevesmall", "Macronsmall", "bsuperior", "nsuperior", "msuperior", "commasuperior", "periodsuperior", "Dotaccentsmall", "Ringsmall", None, None, None, None, ] win_ansi = [ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 
None, None, "space", "exclam", "quotedbl", "numbersign", "dollar", "percent", "ampersand", "quotesingle", "parenleft", "parenright", "asterisk", "plus", "comma", "hyphen", "period", "slash", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "colon", "semicolon", "less", "equal", "greater", "question", "at", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "bracketleft", "backslash", "bracketright", "asciicircum", "underscore", "grave", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "braceleft", "bar", "braceright", "asciitilde", "bullet", "Euro", "bullet", "quotesinglbase", "florin", "quotedblbase", "ellipsis", "dagger", "daggerdbl", "circumflex", "perthousand", "Scaron", "guilsinglleft", "OE", "bullet", "Zcaron", "bullet", "bullet", "quoteleft", "quoteright", "quotedblleft", "quotedblright", "bullet", "endash", "emdash", "tilde", "trademark", "scaron", "guilsinglright", "oe", "bullet", "zcaron", "Ydieresis", "space", "exclamdown", "cent", "sterling", "currency", "yen", "brokenbar", "section", "dieresis", "copyright", "ordfeminine", "guillemotleft", "logicalnot", "hyphen", "registered", "macron", "degree", "plusminus", "twosuperior", "threesuperior", "acute", "mu", "paragraph", "periodcentered", "cedilla", "onesuperior", "ordmasculine", "guillemotright", "onequarter", "onehalf", "threequarters", "questiondown", "Agrave", "Aacute", "Acircumflex", "Atilde", "Adieresis", "Aring", "AE", "Ccedilla", "Egrave", "Eacute", "Ecircumflex", "Edieresis", "Igrave", "Iacute", "Icircumflex", "Idieresis", "Eth", "Ntilde", "Ograve", "Oacute", "Ocircumflex", "Otilde", "Odieresis", "multiply", "Oslash", "Ugrave", "Uacute", "Ucircumflex", "Udieresis", "Yacute", "Thorn", "germandbls", "agrave", "aacute", "acircumflex", "atilde", "adieresis", "aring", "ae", "ccedilla", "egrave", "eacute", 
"ecircumflex", "edieresis", "igrave", "iacute", "icircumflex", "idieresis", "eth", "ntilde", "ograve", "oacute", "ocircumflex", "otilde", "odieresis", "divide", "oslash", "ugrave", "uacute", "ucircumflex", "udieresis", "yacute", "thorn", "ydieresis", ] def get_type1_encoding(name): match name: case "StandardEncoding": return adobe_standard case "MacRomanEncoding": return mac_roman case "WinAnsiEncoding": return win_ansi case "MacExpertEncoding": return mac_expert WinAnsiEncoding = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 8364, 0, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 0, 381, 0, 0, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 0, 382, 376, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ] ================================================ FILE: babeldoc/format/pdf/babelpdf/type3.py ================================================ import io import re import pymupdf def merge_bbox(bbox_list, factor=1): if bbox_list: base = bbox_list[0] for bbox in bbox_list[1:]: base.include_rect(bbox) x0, y0, x1, y1 = [v / factor for 
v in tuple(base)] return x0, -y1, x1, -y0 def get_type3_bbox(doc, obj): bbox_list = [(0, 0, 0, 0)] * 256 first = int(doc.xref_get_key(obj, "FirstChar")[1]) last = int(doc.xref_get_key(obj, "LastChar")[1]) factor_text = doc.xref_get_key(obj, "FontMatrix")[1] factor = 1 if factor_m := re.search(r"(\d+)?\.\d+", factor_text): factor = float(factor_m.group(0)) page = doc.new_page(width=10, height=10) doc.xref_set_key(page.xref, "Resources", "<<>>") doc.xref_set_key(page.xref, "Resources/Font", f"<>") text = doc.get_new_xref() doc.update_object(text, "<<>>") for x in range(first, last + 1): doc.update_stream(text, b"1 0 0 1 0 10 cm BT /T0 1 Tf <%02X> Tj ET" % x) doc.xref_set_key(page.xref, "Contents", f"{text} 0 R") char_data = page.get_svg_image(text_as_path=True) char_doc = pymupdf.Document(stream=io.BytesIO(char_data.encode("U8"))) char_bbox = [] for element in char_doc: for item in element.get_drawings(): char_bbox.append(item["rect"]) if char_bbox_merged := merge_bbox(char_bbox, factor): bbox_list[x] = char_bbox_merged doc.delete_page(-1) return bbox_list ================================================ FILE: babeldoc/format/pdf/babelpdf/utils.py ================================================ from babeldoc.pdfminer.pdftypes import PDFObjRef def guarded_bbox(bbox): bbox_guarded = [] for v in bbox: u = v if isinstance(v, PDFObjRef): u = v.resolve() if isinstance(u, int) or isinstance(u, float): bbox_guarded.append(u) else: bbox_guarded.append(u) return bbox_guarded ================================================ FILE: babeldoc/format/pdf/babelpdf/win_core.py ================================================ win_core = { "Arial": { "space": (0, 0, 0, 0), "exclam": (85, 0, 194, 715), "quotedbl": (45, 462, 308, 715), "numbersign": (10, -12, 543, 728), "dollar": (35, -103, 509, 781), "percent": (58, -26, 827, 728), "ampersand": (42, -16, 644, 728), "quotesingle": (43, 462, 144, 715), "parenleft": (60, -210, 296, 728), "parenright": (60, -210, 296, 728), "asterisk": (31, 
423, 354, 728), "plus": (55, 115, 528, 588), "comma": (83, -141, 188, 100), "hyphen": (31, 214, 301, 303), "period": (90, 0, 190, 100), "slash": (0, -12, 277, 728), "zero": (41, -12, 508, 718), "one": (108, 0, 372, 718), "two": (30, 0, 503, 718), "three": (41, -12, 510, 718), "four": (12, 0, 507, 715), "five": (41, -12, 516, 706), "six": (37, -12, 510, 718), "seven": (47, 0, 510, 706), "eight": (40, -12, 512, 718), "nine": (41, -12, 512, 718), "colon": (90, 0, 190, 518), "semicolon": (83, -141, 188, 518), "less": (54, 110, 528, 595), "equal": (55, 203, 528, 502), "greater": (54, 110, 528, 595), "question": (43, 0, 505, 728), "at": (54, -210, 979, 729), "A": (-1, 0, 668, 715), "B": (73, 0, 613, 715), "C": (49, -12, 682, 728), "D": (77, 0, 668, 715), "E": (79, 0, 613, 715), "F": (82, 0, 564, 715), "G": (53, -12, 715, 728), "H": (80, 0, 641, 715), "I": (93, 0, 187, 715), "J": (28, -12, 422, 715), "K": (73, 0, 665, 715), "L": (73, 0, 520, 715), "M": (74, 0, 757, 715), "N": (76, 0, 640, 715), "O": (48, -12, 732, 728), "P": (77, 0, 623, 715), "Q": (42, -55, 741, 728), "R": (78, 0, 709, 715), "S": (44, -12, 614, 728), "T": (23, 0, 590, 715), "U": (78, -12, 641, 715), "V": (4, 0, 659, 715), "W": (12, 0, 932, 715), "X": (4, 0, 660, 715), "Y": (2, 0, 659, 715), "Z": (20, 0, 585, 715), "bracketleft": (67, -198, 261, 715), "backslash": (0, -12, 277, 728), "bracketright": (19, -198, 212, 715), "asciicircum": (26, 336, 442, 728), "underscore": (-15, -198, 567, -135), "grave": (43, 583, 227, 719), "a": (36, -11, 513, 530), "b": (65, -11, 515, 715), "c": (39, -11, 490, 530), "d": (34, -11, 483, 715), "e": (36, -11, 514, 530), "f": (9, 0, 312, 728), "g": (32, -210, 489, 530), "h": (65, 0, 488, 715), "i": (66, 0, 154, 715), "j": (-45, -210, 153, 715), "k": (66, 0, 496, 715), "l": (63, 0, 151, 715), "m": (65, 0, 768, 530), "n": (65, 0, 487, 530), "o": (33, -11, 519, 530), "p": (65, -198, 516, 530), "q": (35, -198, 484, 530), "r": (64, 0, 346, 530), "s": (30, -11, 461, 530), "t": (17, 
-6, 270, 699), "u": (63, -11, 484, 518), "v": (12, 0, 488, 518), "w": (2, 0, 714, 518), "x": (7, 0, 492, 518), "y": (16, -210, 491, 518), "z": (19, 0, 478, 518), "braceleft": (27, -210, 310, 728), "bar": (91, -210, 168, 728), "braceright": (22, -210, 305, 728), "asciitilde": (42, 271, 541, 432), "bullet": (53, 226, 300, 474), "Euro": (-13, -12, 540, 728), "quotesinglbase": (52, -132, 154, 102), "florin": (22, -210, 529, 728), "quotedblbase": (34, -132, 288, 102), "ellipsis": (116, 0, 883, 100), "dagger": (35, -168, 514, 699), "daggerdbl": (35, -168, 516, 706), "circumflex": (12, 583, 321, 719), "perthousand": (18, -26, 981, 728), "Scaron": (44, -12, 614, 893), "guilsinglleft": (44, 35, 271, 480), "OE": (62, -12, 968, 728), "Zcaron": (20, 0, 585, 893), "quoteleft": (62, 493, 164, 728), "quoteright": (52, 488, 154, 723), "quotedblleft": (40, 493, 293, 728), "quotedblright": (34, 488, 288, 723), "endash": (-1, 223, 554, 294), "emdash": (0, 223, 1000, 294), "tilde": (3, 595, 330, 708), "trademark": (109, 317, 870, 715), "scaron": (30, -11, 461, 719), "guilsinglright": (44, 35, 266, 480), "oe": (40, -11, 906, 530), "zcaron": (19, 0, 478, 719), "Ydieresis": (2, 0, 659, 859), "exclamdown": (113, -197, 222, 518), "cent": (52, -199, 504, 715), "sterling": (13, -13, 528, 728), "currency": (36, 114, 516, 593), "yen": (-1, 0, 553, 715), "brokenbar": (91, -210, 168, 728), "section": (39, -210, 510, 728), "dieresis": (29, 620, 303, 720), "copyright": (1, -8, 738, 728), "ordfeminine": (22, 364, 350, 728), "guillemotleft": (65, 35, 483, 480), "logicalnot": (55, 207, 528, 502), "registered": (1, -8, 738, 728), "macron": (-15, 764, 567, 827), "degree": (62, 457, 333, 728), "plusminus": (38, 0, 510, 600), "twosuperior": (12, 357, 316, 724), "threesuperior": (16, 349, 315, 724), "acute": (108, 583, 288, 719), "mu": (78, -198, 497, 518), "paragraph": (0, -198, 540, 715), "periodcentered": (116, 311, 216, 411), "cedilla": (52, -205, 263, 11), "onesuperior": (52, 357, 232, 724), 
"ordmasculine": (21, 361, 342, 728), "guillemotright": (68, 35, 486, 480), "onequarter": (52, -27, 819, 728), "onehalf": (52, -27, 816, 728), "threequarters": (16, -27, 819, 728), "questiondown": (77, -209, 538, 518), "Agrave": (-1, 0, 668, 896), "Aacute": (-1, 0, 668, 896), "Acircumflex": (-1, 0, 668, 896), "Atilde": (-1, 0, 668, 872), "Adieresis": (-1, 0, 668, 859), "Aring": (-1, 0, 668, 869), "AE": (0, 0, 945, 715), "Ccedilla": (49, -205, 682, 728), "Egrave": (79, 0, 613, 896), "Eacute": (79, 0, 613, 896), "Ecircumflex": (79, 0, 613, 896), "Edieresis": (79, 0, 613, 859), "Igrave": (26, 0, 209, 896), "Iacute": (68, 0, 249, 896), "Icircumflex": (-15, 0, 293, 896), "Idieresis": (1, 0, 275, 859), "Eth": (-1, 0, 668, 715), "Ntilde": (76, 0, 640, 872), "Ograve": (48, -12, 732, 896), "Oacute": (48, -12, 732, 896), "Ocircumflex": (48, -12, 732, 896), "Otilde": (48, -12, 732, 872), "Odieresis": (48, -12, 732, 859), "multiply": (78, 140, 504, 566), "Oslash": (40, -28, 740, 742), "Ugrave": (78, -12, 641, 896), "Uacute": (78, -12, 641, 896), "Ucircumflex": (78, -12, 641, 896), "Udieresis": (78, -12, 641, 859), "Yacute": (2, 0, 659, 896), "Thorn": (77, 0, 623, 715), "germandbls": (74, -12, 579, 728), "agrave": (36, -11, 513, 719), "aacute": (36, -11, 513, 719), "acircumflex": (36, -11, 513, 719), "atilde": (36, -11, 513, 708), "adieresis": (36, -11, 513, 720), "aring": (36, -11, 513, 740), "ae": (33, -11, 848, 530), "ccedilla": (39, -195, 490, 530), "egrave": (36, -11, 514, 719), "eacute": (36, -11, 514, 719), "ecircumflex": (36, -11, 514, 719), "edieresis": (36, -11, 514, 720), "igrave": (17, 0, 200, 719), "iacute": (92, 0, 272, 719), "icircumflex": (-8, 0, 300, 719), "idieresis": (4, 0, 278, 720), "eth": (35, -12, 516, 715), "ntilde": (65, 0, 487, 708), "ograve": (33, -11, 519, 719), "oacute": (33, -11, 519, 719), "ocircumflex": (33, -11, 519, 719), "otilde": (33, -11, 519, 708), "odieresis": (33, -11, 519, 720), "divide": (38, 155, 510, 550), "oslash": (62, -38, 548, 
550), "ugrave": (63, -11, 484, 719), "uacute": (63, -11, 484, 719), "ucircumflex": (63, -11, 484, 719), "udieresis": (63, -11, 484, 720), "yacute": (16, -210, 491, 719), "thorn": (65, -198, 516, 715), "ydieresis": (16, -210, 491, 720), }, "Arial,Bold": { "space": (0, 0, 0, 0), "exclam": (89, 0, 238, 715), "quotedbl": (54, 461, 424, 715), "numbersign": (8, -12, 544, 728), "dollar": (34, -100, 511, 773), "percent": (43, -28, 842, 728), "ampersand": (43, -18, 706, 728), "quotesingle": (44, 461, 194, 715), "parenleft": (52, -210, 300, 728), "parenright": (32, -210, 281, 728), "asterisk": (13, 386, 367, 728), "plus": (41, 103, 541, 603), "comma": (57, -159, 205, 137), "hyphen": (56, 190, 325, 328), "period": (71, 0, 208, 137), "slash": (-1, -12, 278, 728), "zero": (41, -12, 506, 718), "one": (79, 0, 393, 718), "two": (24, 0, 505, 718), "three": (37, -12, 513, 718), "four": (18, 0, 533, 718), "five": (44, -12, 525, 706), "six": (42, -12, 520, 718), "seven": (42, 0, 511, 706), "eight": (40, -12, 511, 718), "nine": (31, -12, 509, 718), "colon": (98, 0, 235, 518), "semicolon": (83, -159, 231, 518), "less": (46, 81, 537, 625), "equal": (41, 181, 541, 524), "greater": (46, 81, 537, 624), "question": (51, 0, 565, 723), "at": (29, -210, 971, 728), "A": (0, 0, 718, 715), "B": (73, 0, 672, 715), "C": (47, -12, 670, 728), "D": (72, 0, 672, 715), "E": (72, 0, 617, 715), "F": (73, 0, 564, 715), "G": (47, -12, 717, 728), "H": (73, 0, 645, 715), "I": (68, 0, 212, 715), "J": (17, -12, 475, 715), "K": (74, 0, 720, 715), "L": (76, 0, 580, 709), "M": (70, 0, 762, 715), "N": (74, 0, 642, 715), "O": (43, -12, 737, 728), "P": (72, 0, 621, 715), "Q": (43, -71, 764, 728), "R": (73, 0, 716, 715), "S": (36, -12, 618, 728), "T": (21, 0, 590, 715), "U": (71, -12, 642, 715), "V": (0, 0, 666, 715), "W": (3, 0, 942, 715), "X": (0, 0, 665, 715), "Y": (-1, 0, 667, 715), "Z": (10, 0, 592, 715), "bracketleft": (71, -201, 314, 715), "backslash": (-1, -12, 278, 728), "bracketright": (18, -201, 261, 715), 
"asciicircum": (56, 337, 527, 728), "underscore": (-9, -197, 561, -108), "grave": (20, 582, 241, 728), "a": (35, -11, 522, 530), "b": (65, -11, 572, 715), "c": (41, -11, 530, 530), "d": (41, -11, 547, 715), "e": (31, -11, 516, 530), "f": (11, 0, 362, 728), "g": (41, -210, 546, 530), "h": (71, 0, 543, 715), "i": (71, 0, 208, 715), "j": (-45, -210, 206, 715), "k": (66, 0, 546, 715), "l": (71, 0, 208, 715), "m": (61, 0, 824, 530), "n": (70, 0, 543, 530), "o": (40, -11, 575, 530), "p": (67, -197, 573, 530), "q": (44, -197, 547, 530), "r": (65, 0, 401, 530), "s": (23, -11, 507, 530), "t": (15, -11, 320, 701), "u": (68, -11, 540, 518), "v": (5, 0, 543, 518), "w": (4, 0, 777, 518), "x": (5, 0, 546, 518), "y": (6, -210, 540, 518), "z": (16, 0, 479, 518), "braceleft": (29, -210, 363, 728), "bar": (85, -210, 194, 728), "braceright": (21, -210, 355, 728), "asciitilde": (32, 253, 551, 451), "bullet": (32, 208, 320, 497), "Euro": (-15, -12, 524, 728), "quotesinglbase": (57, -159, 205, 137), "florin": (-9, -210, 557, 728), "quotedblbase": (51, -160, 430, 137), "ellipsis": (98, 0, 902, 137), "dagger": (33, -170, 517, 707), "daggerdbl": (33, -170, 517, 707), "circumflex": (1, 583, 332, 728), "perthousand": (0, -28, 999, 728), "Scaron": (36, -12, 618, 903), "guilsinglleft": (36, 34, 298, 479), "OE": (35, -12, 969, 728), "Zcaron": (10, 0, 592, 903), "quoteleft": (74, 425, 222, 722), "quoteright": (57, 416, 205, 713), "quotedblleft": (64, 425, 441, 722), "quotedblright": (51, 418, 430, 715), "endash": (-1, 208, 554, 310), "emdash": (0, 208, 1000, 310), "tilde": (-6, 588, 331, 712), "trademark": (105, 315, 877, 715), "scaron": (23, -11, 507, 728), "guilsinglright": (36, 34, 298, 479), "oe": (42, -11, 902, 530), "zcaron": (16, 0, 479, 728), "Ydieresis": (-1, 0, 667, 874), "exclamdown": (95, -198, 243, 518), "cent": (41, -196, 530, 710), "sterling": (6, -12, 540, 728), "currency": (21, 100, 530, 610), "yen": (0, 0, 551, 715), "brokenbar": (85, -210, 194, 728), "section": (28, -210, 521, 
728), "dieresis": (2, 610, 330, 728), "copyright": (-4, -17, 743, 730), "ordfeminine": (18, 362, 345, 728), "guillemotleft": (46, 34, 500, 479), "logicalnot": (41, 183, 541, 524), "registered": (-4, -17, 743, 730), "macron": (-9, 757, 561, 847), "degree": (41, 416, 353, 728), "plusminus": (24, 0, 524, 674), "twosuperior": (12, 354, 308, 724), "threesuperior": (19, 349, 312, 724), "acute": (91, 582, 312, 728), "mu": (54, -198, 525, 518), "paragraph": (0, -196, 551, 715), "periodcentered": (97, 279, 234, 416), "cedilla": (18, -204, 284, -5), "onesuperior": (44, 354, 241, 724), "ordmasculine": (12, 361, 351, 728), "guillemotright": (51, 34, 505, 479), "onequarter": (44, -26, 824, 724), "onehalf": (44, -26, 808, 724), "threequarters": (19, -26, 824, 724), "questiondown": (49, -205, 563, 518), "Agrave": (0, 0, 718, 902), "Aacute": (0, 0, 718, 902), "Acircumflex": (0, 0, 718, 900), "Atilde": (0, 0, 718, 879), "Adieresis": (0, 0, 718, 874), "Aring": (0, 0, 718, 858), "AE": (-41, 0, 951, 715), "Ccedilla": (47, -204, 670, 728), "Egrave": (72, 0, 617, 902), "Eacute": (72, 0, 617, 902), "Ecircumflex": (72, 0, 617, 900), "Edieresis": (72, 0, 617, 874), "Igrave": (-4, 0, 216, 902), "Iacute": (51, 0, 272, 902), "Icircumflex": (-20, 0, 310, 900), "Idieresis": (-21, 0, 306, 874), "Eth": (-1, 0, 672, 715), "Ntilde": (74, 0, 642, 879), "Ograve": (43, -12, 737, 902), "Oacute": (43, -12, 737, 902), "Ocircumflex": (43, -12, 737, 900), "Otilde": (43, -12, 737, 879), "Odieresis": (43, -12, 737, 874), "multiply": (53, 114, 529, 591), "Oslash": (30, -40, 750, 750), "Ugrave": (71, -12, 642, 902), "Uacute": (71, -12, 642, 902), "Ucircumflex": (71, -12, 642, 900), "Udieresis": (71, -12, 642, 874), "Yacute": (-1, 0, 667, 902), "Thorn": (72, 0, 621, 715), "germandbls": (67, -11, 575, 728), "agrave": (35, -11, 522, 728), "aacute": (35, -11, 522, 728), "acircumflex": (35, -11, 522, 728), "atilde": (35, -11, 522, 712), "adieresis": (35, -11, 522, 728), "aring": (35, -11, 522, 750), "ae": (42, -11, 
841, 530), "ccedilla": (41, -204, 530, 530), "egrave": (31, -11, 516, 728), "eacute": (31, -11, 516, 728), "ecircumflex": (31, -11, 516, 728), "edieresis": (31, -11, 516, 728), "igrave": (-11, 0, 209, 728), "iacute": (61, 0, 282, 728), "icircumflex": (-24, 0, 305, 728), "idieresis": (-23, 0, 304, 728), "eth": (40, -12, 573, 715), "ntilde": (70, 0, 543, 712), "ograve": (40, -11, 575, 728), "oacute": (40, -11, 575, 728), "ocircumflex": (40, -11, 575, 728), "otilde": (40, -11, 575, 712), "odieresis": (40, -11, 575, 728), "divide": (23, 90, 524, 616), "oslash": (42, -35, 577, 546), "ugrave": (68, -11, 540, 728), "uacute": (68, -11, 540, 728), "ucircumflex": (68, -11, 540, 728), "udieresis": (68, -11, 540, 728), "yacute": (6, -210, 540, 728), "thorn": (67, -197, 573, 715), "ydieresis": (6, -210, 540, 728), }, "Arial,BoldItalic": { "space": (0, 0, 0, 0), "exclam": (61, 0, 353, 715), "quotedbl": (151, 461, 506, 715), "numbersign": (47, -12, 583, 728), "dollar": (43, -99, 576, 770), "percent": (90, -30, 864, 728), "ampersand": (83, -16, 706, 728), "quotesingle": (151, 461, 329, 715), "parenleft": (65, -210, 435, 728), "parenright": (-78, -210, 291, 728), "asterisk": (98, 386, 452, 728), "plus": (80, 103, 581, 603), "comma": (10, -155, 212, 135), "hyphen": (38, 190, 338, 325), "period": (43, 0, 210, 135), "slash": (-43, -12, 408, 728), "zero": (64, -12, 571, 718), "one": (118, 0, 510, 720), "two": (60, 0, 570, 718), "three": (50, -12, 560, 718), "four": (27, 0, 560, 715), "five": (63, -12, 577, 706), "six": (81, -12, 575, 718), "seven": (103, 0, 602, 706), "eight": (65, -12, 566, 718), "nine": (63, -12, 558, 718), "colon": (70, 0, 316, 518), "semicolon": (40, -155, 319, 518), "less": (85, 81, 576, 625), "equal": (80, 181, 581, 524), "greater": (85, 81, 576, 624), "question": (123, 0, 618, 728), "at": (64, -210, 1006, 728), "A": (-11, 0, 673, 715), "B": (40, 0, 709, 715), "C": (94, -12, 745, 728), "D": (43, 0, 724, 715), "E": (41, 0, 721, 715), "F": (39, 0, 689, 715), "G": 
(88, -12, 785, 728), "H": (43, 0, 764, 715), "I": (34, 0, 331, 715), "J": (28, -12, 599, 715), "K": (39, 0, 801, 715), "L": (44, 0, 581, 715), "M": (40, 0, 878, 715), "N": (44, 0, 762, 715), "O": (87, -12, 784, 728), "P": (40, 0, 702, 715), "Q": (87, -95, 783, 728), "R": (43, 0, 741, 715), "S": (63, -12, 676, 728), "T": (120, 0, 708, 715), "U": (91, -12, 765, 715), "V": (113, 0, 793, 715), "W": (117, 0, 1067, 715), "X": (-30, 0, 783, 715), "Y": (114, 0, 784, 715), "Z": (24, 0, 667, 715), "bracketleft": (9, -197, 438, 715), "backslash": (78, -12, 287, 728), "bracketright": (-55, -197, 375, 715), "asciicircum": (104, 337, 576, 728), "underscore": (-9, -197, 561, -108), "grave": (133, 585, 331, 731), "a": (44, -12, 533, 530), "b": (36, -12, 601, 715), "c": (60, -12, 564, 530), "d": (59, -12, 668, 715), "e": (58, -12, 554, 530), "f": (53, 0, 470, 728), "g": (31, -210, 622, 530), "h": (41, 0, 590, 715), "i": (40, 0, 329, 715), "j": (-109, -210, 331, 715), "k": (37, 0, 614, 715), "l": (39, 0, 328, 715), "m": (35, 0, 868, 530), "n": (41, 0, 591, 530), "o": (60, -12, 599, 530), "p": (-5, -197, 605, 530), "q": (59, -197, 625, 530), "r": (32, 0, 474, 530), "s": (21, -12, 551, 530), "t": (75, -12, 390, 698), "u": (70, -12, 619, 518), "v": (74, 0, 618, 518), "w": (71, 0, 840, 518), "x": (-21, 0, 612, 518), "y": (6, -210, 620, 518), "z": (16, 0, 518, 518), "braceleft": (41, -210, 490, 728), "bar": (85, -210, 194, 728), "braceright": (-84, -210, 363, 728), "asciitilde": (66, 253, 585, 451), "bullet": (81, 208, 369, 497), "Euro": (26, -12, 639, 728), "quotesinglbase": (10, -155, 212, 135), "florin": (-9, -210, 557, 728), "quotedblbase": (3, -155, 441, 135), "ellipsis": (92, 0, 907, 135), "dagger": (84, -170, 594, 706), "daggerdbl": (0, -170, 599, 706), "circumflex": (56, 584, 391, 731), "perthousand": (67, -28, 1021, 728), "Scaron": (63, -12, 676, 905), "guilsinglleft": (59, 34, 378, 477), "OE": (68, -12, 1078, 728), "Zcaron": (24, 0, 667, 905), "quoteleft": (108, 433, 311, 724), 
"quoteright": (123, 424, 325, 715), "quotedblleft": (125, 433, 562, 724), "quotedblright": (128, 424, 566, 715), "endash": (-1, 208, 554, 310), "emdash": (0, 208, 1000, 310), "tilde": (92, 592, 428, 710), "trademark": (144, 315, 916, 715), "scaron": (21, -12, 551, 731), "guilsinglright": (9, 34, 318, 477), "oe": (58, -12, 943, 530), "zcaron": (16, 0, 527, 731), "Ydieresis": (114, 0, 784, 875), "exclamdown": (11, -197, 304, 518), "cent": (58, -192, 562, 713), "sterling": (20, -18, 610, 728), "currency": (65, 100, 574, 610), "yen": (23, 0, 666, 715), "brokenbar": (85, -210, 194, 728), "section": (21, -211, 560, 728), "dieresis": (84, 597, 435, 716), "copyright": (43, -17, 791, 730), "ordfeminine": (82, 362, 412, 728), "guillemotleft": (82, 34, 590, 477), "logicalnot": (80, 183, 581, 524), "registered": (43, -17, 791, 730), "macron": (68, 757, 638, 847), "degree": (109, 416, 421, 728), "plusminus": (63, 0, 563, 674), "twosuperior": (82, 354, 395, 724), "threesuperior": (76, 349, 389, 724), "acute": (183, 583, 435, 730), "mu": (-37, -200, 584, 518), "paragraph": (43, -196, 596, 715), "periodcentered": (136, 290, 303, 425), "cedilla": (6, -207, 267, -12), "onesuperior": (114, 354, 361, 725), "ordmasculine": (72, 362, 414, 728), "guillemotright": (22, 34, 531, 477), "onequarter": (99, -29, 839, 724), "onehalf": (84, -29, 835, 724), "threequarters": (75, -29, 851, 724), "questiondown": (26, -209, 521, 518), "Agrave": (-11, 0, 673, 905), "Aacute": (-11, 0, 686, 903), "Acircumflex": (-11, 0, 673, 905), "Atilde": (-11, 0, 673, 874), "Adieresis": (-11, 0, 680, 875), "Aring": (-11, -9, 673, 854), "AE": (-32, 0, 1059, 715), "Ccedilla": (94, -204, 745, 728), "Egrave": (41, 0, 721, 905), "Eacute": (41, 0, 721, 903), "Ecircumflex": (41, 0, 721, 905), "Edieresis": (41, 0, 721, 875), "Igrave": (34, 0, 382, 905), "Iacute": (34, 0, 451, 903), "Icircumflex": (34, 0, 426, 905), "Idieresis": (34, 0, 452, 875), "Eth": (36, 0, 725, 715), "Ntilde": (44, 0, 762, 874), "Ograve": (87, -12, 
784, 905), "Oacute": (87, -12, 784, 903), "Ocircumflex": (87, -12, 784, 905), "Otilde": (87, -12, 784, 874), "Odieresis": (87, -12, 784, 875), "multiply": (92, 114, 568, 591), "Oslash": (77, -59, 786, 766), "Ugrave": (91, -12, 765, 905), "Uacute": (91, -12, 765, 903), "Ucircumflex": (91, -12, 765, 905), "Udieresis": (91, -12, 765, 875), "Yacute": (114, 0, 784, 903), "Thorn": (40, 0, 673, 715), "germandbls": (35, -12, 581, 728), "agrave": (44, -12, 533, 731), "aacute": (44, -12, 567, 730), "acircumflex": (44, -12, 533, 731), "atilde": (44, -12, 549, 710), "adieresis": (44, -12, 553, 716), "aring": (44, -12, 533, 753), "ae": (30, -12, 865, 530), "ccedilla": (60, -203, 564, 530), "egrave": (58, -12, 554, 731), "eacute": (58, -12, 562, 730), "ecircumflex": (58, -12, 554, 731), "edieresis": (58, -12, 554, 716), "igrave": (40, 0, 347, 731), "iacute": (40, 0, 413, 730), "icircumflex": (40, 0, 389, 731), "idieresis": (40, 0, 417, 716), "eth": (60, -12, 607, 715), "ntilde": (41, 0, 591, 710), "ograve": (60, -12, 599, 731), "oacute": (60, -12, 599, 730), "ocircumflex": (60, -12, 599, 731), "otilde": (60, -12, 599, 710), "odieresis": (60, -12, 599, 716), "divide": (63, 90, 563, 616), "oslash": (52, -52, 604, 571), "ugrave": (70, -12, 619, 731), "uacute": (70, -12, 619, 730), "ucircumflex": (70, -12, 619, 731), "udieresis": (70, -12, 619, 716), "yacute": (6, -210, 620, 730), "thorn": (-9, -197, 602, 715), "ydieresis": (6, -210, 620, 716), }, "Arial,Italic": { "space": (0, 0, 0, 0), "exclam": (56, 0, 303, 715), "quotedbl": (135, 462, 428, 715), "numbersign": (46, -12, 579, 728), "dollar": (51, -95, 572, 763), "percent": (97, -26, 852, 728), "ampersand": (78, -17, 651, 728), "quotesingle": (126, 462, 258, 715), "parenleft": (84, -210, 413, 728), "parenright": (-53, -210, 275, 728), "asterisk": (115, 423, 437, 728), "plus": (89, 115, 562, 588), "comma": (23, -144, 175, 100), "hyphen": (46, 214, 334, 303), "period": (57, 0, 178, 100), "slash": (-50, -11, 410, 728), "zero": (70, 
-12, 565, 718), "one": (147, 0, 479, 718), "two": (58, 0, 562, 718), "three": (54, -12, 557, 718), "four": (45, 0, 542, 715), "five": (69, -12, 572, 706), "six": (83, -12, 567, 718), "seven": (121, 0, 595, 706), "eight": (74, -12, 564, 718), "nine": (67, -12, 551, 718), "colon": (57, 0, 265, 518), "semicolon": (23, -144, 262, 518), "less": (89, 110, 563, 595), "equal": (89, 203, 562, 502), "greater": (89, 110, 563, 595), "question": (126, 0, 560, 728), "at": (54, -210, 979, 729), "A": (-20, 0, 616, 715), "B": (43, 0, 654, 715), "C": (90, -12, 730, 728), "D": (44, 0, 711, 715), "E": (44, 0, 711, 715), "F": (45, 0, 660, 715), "G": (97, -12, 766, 728), "H": (41, 0, 753, 715), "I": (57, 0, 302, 715), "J": (33, -12, 535, 715), "K": (44, 0, 741, 715), "L": (40, 0, 524, 715), "M": (43, 0, 872, 715), "N": (48, 0, 756, 715), "O": (91, -12, 772, 728), "P": (42, 0, 697, 715), "Q": (92, -82, 773, 728), "R": (46, 0, 729, 715), "S": (70, -12, 671, 728), "T": (124, 0, 705, 715), "U": (96, -12, 754, 715), "V": (124, 0, 756, 715), "W": (125, 0, 1061, 715), "X": (-31, 0, 769, 715), "Y": (116, 0, 772, 715), "Z": (24, 0, 636, 715), "bracketleft": (6, -195, 391, 715), "backslash": (84, -11, 273, 728), "bracketright": (-58, -195, 329, 715), "asciicircum": (70, 336, 486, 728), "underscore": (-63, -198, 519, -135), "grave": (145, 581, 309, 715), "a": (43, -11, 526, 530), "b": (33, -11, 535, 715), "c": (56, -11, 510, 530), "d": (52, -11, 598, 715), "e": (51, -11, 531, 530), "f": (45, 0, 407, 728), "g": (25, -207, 564, 530), "h": (33, 0, 528, 715), "i": (29, 0, 267, 715), "j": (-121, -207, 267, 715), "k": (34, 0, 553, 715), "l": (26, 0, 264, 715), "m": (32, 0, 812, 530), "n": (33, 0, 527, 530), "o": (48, -11, 540, 530), "p": (-10, -198, 535, 530), "q": (51, -198, 552, 530), "r": (33, 0, 419, 530), "s": (41, -11, 501, 530), "t": (56, -8, 321, 707), "u": (62, -11, 557, 518), "v": (79, 0, 559, 518), "w": (77, 0, 776, 518), "x": (-1, 0, 537, 518), "y": (0, -210, 561, 518), "z": (19, 0, 512, 
518), "braceleft": (52, -210, 445, 728), "bar": (91, -210, 168, 728), "braceright": (-83, -210, 309, 728), "asciitilde": (80, 271, 579, 432), "bullet": (53, 226, 300, 474), "Euro": (39, -12, 645, 728), "quotesinglbase": (-7, -144, 144, 100), "florin": (22, -210, 529, 728), "quotedblbase": (-19, -144, 291, 100), "ellipsis": (143, 0, 932, 100), "dagger": (90, -170, 583, 706), "daggerdbl": (5, -170, 588, 706), "circumflex": (100, 581, 387, 715), "perthousand": (66, -26, 1003, 728), "Scaron": (70, -12, 671, 894), "guilsinglleft": (47, 35, 313, 478), "OE": (80, -12, 1043, 728), "Zcaron": (24, 0, 636, 894), "quoteleft": (128, 482, 280, 728), "quoteright": (125, 467, 276, 712), "quotedblleft": (105, 482, 413, 728), "quotedblright": (104, 467, 417, 712), "endash": (-1, 223, 554, 294), "emdash": (0, 223, 1000, 294), "tilde": (93, 596, 423, 706), "trademark": (136, 317, 897, 715), "scaron": (41, -11, 503, 715), "guilsinglright": (16, 35, 288, 478), "oe": (62, -11, 918, 530), "zcaron": (19, 0, 512, 715), "Ydieresis": (116, 0, 772, 858), "exclamdown": (57, -197, 305, 518), "cent": (75, -198, 529, 725), "sterling": (31, -12, 607, 728), "currency": (80, 114, 560, 593), "yen": (36, 0, 666, 715), "brokenbar": (91, -210, 168, 728), "section": (30, -210, 555, 728), "dieresis": (115, 599, 408, 699), "copyright": (40, -8, 777, 728), "ordfeminine": (81, 359, 409, 728), "guillemotleft": (78, 35, 537, 478), "logicalnot": (89, 207, 562, 502), "registered": (40, -8, 777, 728), "macron": (88, 764, 670, 827), "degree": (133, 457, 404, 728), "plusminus": (60, 0, 533, 600), "twosuperior": (74, 357, 400, 724), "threesuperior": (82, 349, 399, 724), "acute": (168, 581, 372, 715), "mu": (5, -200, 571, 518), "paragraph": (69, -198, 609, 715), "periodcentered": (151, 307, 272, 407), "cedilla": (37, -207, 287, 5), "onesuperior": (136, 357, 354, 724), "ordmasculine": (69, 360, 411, 728), "guillemotright": (40, 35, 504, 478), "onequarter": (83, -29, 850, 728), "onehalf": (60, -29, 827, 728), 
"threequarters": (82, -29, 865, 728), "questiondown": (83, -209, 517, 518), "Agrave": (-20, 0, 616, 894), "Aacute": (-20, 0, 616, 894), "Acircumflex": (-20, 0, 616, 894), "Atilde": (-20, 0, 616, 867), "Adieresis": (-20, 0, 616, 859), "Aring": (-20, 0, 616, 863), "AE": (-40, 0, 1043, 715), "Ccedilla": (90, -210, 730, 728), "Egrave": (44, 0, 711, 894), "Eacute": (44, 0, 711, 894), "Ecircumflex": (44, 0, 711, 894), "Edieresis": (44, 0, 711, 858), "Igrave": (57, 0, 340, 894), "Iacute": (57, 0, 389, 894), "Icircumflex": (57, 0, 407, 894), "Idieresis": (57, 0, 413, 859), "Eth": (44, 0, 720, 715), "Ntilde": (48, 0, 756, 867), "Ograve": (91, -12, 772, 894), "Oacute": (91, -12, 772, 894), "Ocircumflex": (91, -12, 772, 894), "Otilde": (91, -12, 772, 867), "Odieresis": (91, -12, 772, 859), "multiply": (127, 140, 553, 566), "Oslash": (84, -50, 776, 764), "Ugrave": (96, -12, 754, 894), "Uacute": (96, -12, 754, 894), "Ucircumflex": (96, -12, 754, 894), "Udieresis": (96, -12, 754, 859), "Yacute": (116, 0, 772, 894), "Thorn": (42, 0, 666, 715), "germandbls": (36, -12, 567, 728), "agrave": (43, -11, 526, 715), "aacute": (43, -11, 526, 715), "acircumflex": (43, -11, 526, 715), "atilde": (43, -11, 540, 706), "adieresis": (43, -11, 526, 699), "aring": (43, -11, 526, 733), "ae": (42, -12, 865, 530), "ccedilla": (56, -198, 510, 530), "egrave": (51, -11, 531, 715), "eacute": (51, -11, 531, 715), "ecircumflex": (51, -11, 531, 715), "edieresis": (51, -11, 531, 699), "igrave": (61, 0, 310, 715), "iacute": (61, 0, 349, 715), "icircumflex": (61, 0, 361, 715), "idieresis": (61, 0, 377, 699), "eth": (48, -12, 545, 715), "ntilde": (33, 0, 532, 706), "ograve": (48, -11, 540, 715), "oacute": (48, -11, 540, 715), "ocircumflex": (48, -11, 540, 715), "otilde": (48, -11, 540, 706), "odieresis": (48, -11, 540, 699), "divide": (62, 155, 535, 550), "oslash": (74, -49, 583, 565), "ugrave": (62, -11, 557, 715), "uacute": (62, -11, 557, 715), "ucircumflex": (62, -11, 557, 715), "udieresis": (62, -11, 557, 
699), "yacute": (0, -210, 561, 715), "thorn": (-10, -198, 535, 715), "ydieresis": (0, -210, 561, 699), }, "ArialNarrow": { "space": (0, 0, 0, 0), "exclam": (72, 0, 161, 715), "quotedbl": (37, 462, 252, 715), "numbersign": (7, -12, 444, 728), "dollar": (27, -103, 416, 781), "percent": (45, -26, 676, 728), "ampersand": (35, -16, 528, 728), "quotesingle": (34, 462, 116, 715), "parenleft": (49, -210, 243, 728), "parenright": (29, -210, 223, 728), "asterisk": (24, 423, 289, 728), "plus": (44, 115, 432, 588), "comma": (69, -141, 156, 100), "hyphen": (25, 214, 247, 303), "period": (75, 0, 157, 100), "slash": (0, -12, 228, 728), "zero": (32, -12, 415, 718), "one": (87, 0, 304, 718), "two": (24, 0, 412, 718), "three": (33, -12, 417, 718), "four": (9, 0, 415, 715), "five": (32, -12, 421, 706), "six": (29, -12, 416, 718), "seven": (37, 0, 417, 706), "eight": (32, -12, 418, 718), "nine": (32, -12, 418, 718), "colon": (75, 0, 157, 518), "semicolon": (69, -141, 156, 518), "less": (43, 110, 432, 595), "equal": (44, 203, 432, 502), "greater": (43, 110, 432, 595), "question": (34, 0, 413, 728), "at": (43, -210, 801, 729), "A": (0, 0, 548, 715), "B": (60, 0, 503, 715), "C": (39, -12, 558, 728), "D": (61, 0, 546, 715), "E": (64, 0, 502, 715), "F": (68, 0, 464, 715), "G": (45, -12, 588, 728), "H": (62, 0, 523, 715), "I": (78, 0, 155, 715), "J": (21, -12, 344, 715), "K": (60, 0, 545, 715), "L": (58, 0, 425, 715), "M": (61, 0, 621, 715), "N": (61, 0, 523, 715), "O": (41, -12, 603, 728), "P": (63, 0, 511, 715), "Q": (37, -55, 609, 728), "R": (62, 0, 580, 715), "S": (37, -12, 504, 728), "T": (20, 0, 485, 715), "U": (62, -12, 524, 715), "V": (3, 0, 540, 715), "W": (11, 0, 766, 715), "X": (3, 0, 541, 715), "Y": (2, 0, 540, 715), "Z": (17, 0, 481, 715), "bracketleft": (57, -198, 216, 715), "backslash": (0, -12, 228, 728), "bracketright": (17, -198, 176, 715), "asciicircum": (21, 336, 363, 728), "underscore": (-5, -125, 460, -75), "grave": (35, 583, 186, 719), "a": (28, -11, 419, 530), "b": 
(52, -11, 420, 715), "c": (31, -11, 402, 530), "d": (26, -11, 395, 715), "e": (28, -11, 420, 530), "f": (9, 0, 257, 728), "g": (24, -210, 399, 530), "h": (52, 0, 398, 715), "i": (52, 0, 124, 715), "j": (-39, -210, 124, 715), "k": (54, 0, 406, 715), "l": (50, 0, 122, 715), "m": (53, 0, 629, 530), "n": (52, 0, 398, 530), "o": (25, -11, 424, 530), "p": (52, -198, 421, 530), "q": (27, -198, 395, 530), "r": (52, 0, 283, 530), "s": (25, -12, 378, 530), "t": (16, -6, 223, 699), "u": (51, -11, 395, 518), "v": (10, 0, 400, 518), "w": (0, 0, 584, 518), "x": (5, 0, 403, 518), "y": (13, -210, 402, 518), "z": (16, 0, 392, 518), "braceleft": (22, -210, 254, 728), "bar": (76, -210, 139, 728), "braceright": (18, -210, 250, 728), "asciitilde": (35, 271, 444, 432), "bullet": (44, 226, 247, 474), "Euro": (-11, -12, 443, 728), "quotesinglbase": (41, -132, 125, 102), "florin": (17, -210, 433, 728), "quotedblbase": (28, -132, 236, 102), "ellipsis": (95, 0, 724, 100), "dagger": (27, -168, 420, 699), "daggerdbl": (27, -168, 422, 706), "circumflex": (9, 583, 263, 719), "perthousand": (14, -26, 805, 728), "Scaron": (37, -12, 504, 901), "guilsinglleft": (36, 35, 222, 480), "OE": (51, -12, 793, 728), "Zcaron": (17, 0, 481, 901), "quoteleft": (49, 481, 133, 715), "quoteright": (41, 481, 125, 715), "quotedblleft": (33, 481, 241, 715), "quotedblright": (28, 481, 236, 715), "endash": (-2, 223, 453, 294), "emdash": (0, 223, 819, 294), "tilde": (2, 595, 270, 708), "trademark": (90, 317, 713, 715), "scaron": (25, -12, 378, 719), "guilsinglright": (50, 35, 235, 480), "oe": (34, -11, 744, 530), "zcaron": (16, 0, 392, 719), "Ydieresis": (2, 0, 540, 901), "exclamdown": (91, -197, 181, 518), "cent": (41, -199, 413, 715), "sterling": (9, -13, 432, 728), "currency": (28, 114, 421, 593), "yen": (-2, 0, 452, 715), "brokenbar": (76, -210, 139, 728), "section": (31, -210, 417, 728), "dieresis": (24, 620, 249, 720), "copyright": (1, -8, 606, 728), "ordfeminine": (20, 364, 289, 728), "guillemotleft": (52, 35, 
395, 480), "logicalnot": (44, 207, 432, 502), "registered": (1, -8, 606, 728), "macron": (-5, 790, 505, 840), "degree": (62, 457, 333, 728), "plusminus": (38, 0, 510, 600), "twosuperior": (9, 357, 259, 724), "threesuperior": (12, 349, 258, 724), "acute": (88, 583, 236, 719), "mu": (78, -198, 497, 518), "paragraph": (2, -198, 444, 715), "periodcentered": (95, 311, 177, 411), "cedilla": (42, -205, 216, 11), "onesuperior": (41, 357, 189, 724), "ordmasculine": (18, 361, 280, 728), "guillemotright": (54, 35, 397, 480), "onequarter": (41, -27, 671, 728), "onehalf": (41, -27, 669, 728), "threequarters": (12, -27, 669, 728), "questiondown": (64, -209, 443, 518), "Agrave": (0, 0, 548, 901), "Aacute": (0, 0, 548, 901), "Acircumflex": (0, 0, 548, 901), "Atilde": (0, 0, 548, 878), "Adieresis": (0, 0, 548, 901), "Aring": (0, 0, 548, 921), "AE": (0, 0, 775, 715), "Ccedilla": (39, -205, 558, 728), "Egrave": (64, 0, 502, 901), "Eacute": (64, 0, 502, 901), "Ecircumflex": (64, 0, 502, 901), "Edieresis": (64, 0, 502, 901), "Igrave": (23, 0, 174, 901), "Iacute": (73, 0, 220, 901), "Icircumflex": (-11, 0, 241, 901), "Idieresis": (4, 0, 229, 901), "Eth": (-2, 0, 546, 715), "Ntilde": (61, 0, 523, 878), "Ograve": (41, -12, 603, 901), "Oacute": (41, -12, 603, 901), "Ocircumflex": (41, -12, 603, 901), "Otilde": (41, -12, 603, 878), "Odieresis": (41, -12, 603, 901), "multiply": (63, 140, 412, 566), "Oslash": (35, -28, 609, 742), "Ugrave": (62, -12, 524, 901), "Uacute": (62, -12, 524, 901), "Ucircumflex": (62, -12, 524, 901), "Udieresis": (62, -12, 524, 901), "Yacute": (2, 0, 540, 901), "Thorn": (63, 0, 511, 715), "germandbls": (62, -12, 476, 728), "agrave": (28, -11, 419, 719), "aacute": (28, -11, 419, 719), "acircumflex": (28, -11, 419, 719), "atilde": (28, -11, 419, 696), "adieresis": (28, -11, 419, 720), "aring": (28, -11, 419, 762), "ae": (25, -11, 694, 530), "ccedilla": (31, -205, 402, 530), "egrave": (28, -11, 420, 719), "eacute": (28, -11, 420, 719), "ecircumflex": (28, -11, 420, 
719), "edieresis": (28, -11, 420, 720), "igrave": (9, 0, 160, 719), "iacute": (62, 0, 210, 719), "icircumflex": (-6, 0, 246, 719), "idieresis": (1, 0, 226, 720), "eth": (27, -12, 421, 715), "ntilde": (52, 0, 398, 696), "ograve": (25, -11, 424, 719), "oacute": (25, -11, 424, 719), "ocircumflex": (25, -11, 424, 719), "otilde": (25, -11, 424, 696), "odieresis": (25, -11, 424, 720), "divide": (38, 155, 510, 550), "oslash": (55, -38, 453, 550), "ugrave": (51, -11, 395, 719), "uacute": (51, -11, 395, 719), "ucircumflex": (51, -11, 395, 719), "udieresis": (51, -11, 395, 720), "yacute": (13, -210, 402, 719), "thorn": (52, -198, 421, 715), "ydieresis": (13, -210, 402, 720), }, "ArialNarrow,Bold": { "space": (0, 0, 0, 0), "exclam": (73, 0, 194, 715), "quotedbl": (44, 461, 348, 715), "numbersign": (7, -12, 446, 728), "dollar": (28, -100, 419, 773), "percent": (35, -28, 690, 728), "ampersand": (36, -18, 579, 728), "quotesingle": (36, 461, 159, 715), "parenleft": (42, -210, 246, 728), "parenright": (26, -210, 230, 728), "asterisk": (11, 386, 301, 728), "plus": (33, 103, 444, 603), "comma": (46, -159, 168, 137), "hyphen": (25, 190, 247, 328), "period": (59, 0, 171, 137), "slash": (0, -12, 229, 728), "zero": (34, -12, 416, 718), "one": (64, 0, 322, 718), "two": (20, 0, 415, 718), "three": (30, -12, 420, 718), "four": (15, 0, 437, 718), "five": (36, -12, 431, 706), "six": (35, -12, 427, 718), "seven": (34, 0, 419, 706), "eight": (33, -12, 419, 718), "nine": (25, -12, 417, 718), "colon": (80, 0, 192, 518), "semicolon": (67, -159, 189, 518), "less": (38, 81, 440, 625), "equal": (33, 181, 444, 524), "greater": (37, 81, 440, 624), "question": (42, 0, 463, 723), "at": (24, -210, 796, 728), "A": (0, 0, 588, 715), "B": (59, 0, 551, 715), "C": (39, -12, 550, 728), "D": (59, 0, 551, 715), "E": (60, 0, 506, 715), "F": (60, 0, 462, 715), "G": (39, -12, 588, 728), "H": (60, 0, 529, 715), "I": (56, 0, 174, 715), "J": (14, -12, 390, 715), "K": (61, 0, 590, 715), "L": (62, 0, 476, 709), "M": 
(58, 0, 625, 715), "N": (61, 0, 526, 715), "O": (36, -12, 605, 728), "P": (59, 0, 509, 715), "Q": (35, -71, 626, 728), "R": (60, 0, 587, 715), "S": (29, -12, 506, 728), "T": (17, 0, 483, 715), "U": (58, -12, 526, 715), "V": (0, 0, 546, 715), "W": (2, 0, 772, 715), "X": (0, 0, 546, 715), "Y": (0, 0, 547, 715), "Z": (8, 0, 485, 715), "bracketleft": (58, -201, 257, 715), "backslash": (0, -12, 229, 728), "bracketright": (15, -201, 214, 715), "asciicircum": (46, 337, 433, 728), "underscore": (-5, -125, 462, -75), "grave": (17, 582, 198, 728), "a": (29, -11, 428, 530), "b": (53, -11, 469, 715), "c": (34, -11, 435, 530), "d": (33, -11, 449, 715), "e": (26, -11, 423, 530), "f": (9, 0, 296, 728), "g": (33, -210, 448, 530), "h": (58, 0, 445, 715), "i": (59, 0, 171, 715), "j": (-37, -210, 169, 715), "k": (55, 0, 448, 715), "l": (59, 0, 171, 715), "m": (50, 0, 675, 530), "n": (58, 0, 445, 530), "o": (32, -11, 471, 530), "p": (55, -197, 470, 530), "q": (36, -197, 449, 530), "r": (54, 0, 329, 530), "s": (19, -11, 416, 530), "t": (12, -11, 262, 701), "u": (56, -11, 442, 518), "v": (4, 0, 445, 518), "w": (3, 0, 637, 518), "x": (4, 0, 448, 518), "y": (5, -210, 442, 518), "z": (13, 0, 393, 518), "braceleft": (23, -210, 297, 728), "bar": (70, -210, 160, 728), "braceright": (18, -210, 291, 728), "asciitilde": (26, 253, 452, 451), "bullet": (26, 208, 263, 497), "Euro": (-13, -12, 431, 728), "quotesinglbase": (46, -159, 168, 137), "florin": (-7, -210, 457, 728), "quotedblbase": (44, -159, 355, 137), "ellipsis": (80, 0, 739, 137), "dagger": (27, -170, 423, 707), "daggerdbl": (27, -170, 423, 707), "circumflex": (0, 583, 271, 728), "perthousand": (0, -28, 819, 728), "Scaron": (29, -12, 506, 909), "guilsinglleft": (30, 34, 245, 479), "OE": (28, -12, 794, 728), "Zcaron": (8, 0, 485, 909), "quoteleft": (61, 418, 182, 715), "quoteright": (45, 418, 166, 715), "quotedblleft": (52, 418, 362, 715), "quotedblright": (41, 418, 352, 715), "endash": (-1, 208, 454, 310), "emdash": (0, 208, 819, 310), 
"tilde": (-5, 588, 271, 712), "trademark": (86, 315, 719, 715), "scaron": (19, -11, 416, 728), "guilsinglright": (29, 34, 244, 479), "oe": (35, -11, 740, 530), "zcaron": (13, 0, 393, 728), "Ydieresis": (0, 0, 547, 909), "exclamdown": (78, -198, 199, 518), "cent": (33, -196, 434, 710), "sterling": (5, -12, 443, 728), "currency": (18, 100, 435, 610), "yen": (0, 0, 452, 715), "brokenbar": (70, -210, 160, 728), "section": (23, -210, 427, 728), "dieresis": (1, 610, 270, 728), "copyright": (-3, -17, 609, 730), "ordfeminine": (15, 362, 283, 728), "guillemotleft": (38, 34, 410, 479), "logicalnot": (33, 183, 444, 524), "registered": (-3, -17, 609, 730), "macron": (-5, 790, 505, 840), "degree": (41, 416, 353, 728), "plusminus": (24, 0, 524, 674), "twosuperior": (9, 354, 252, 724), "threesuperior": (15, 349, 255, 724), "acute": (74, 582, 256, 728), "mu": (54, -198, 525, 518), "paragraph": (0, -196, 452, 715), "periodcentered": (80, 279, 192, 416), "cedilla": (15, -204, 233, -5), "onesuperior": (36, 354, 198, 724), "ordmasculine": (10, 361, 288, 728), "guillemotright": (42, 34, 414, 479), "onequarter": (36, -26, 675, 724), "onehalf": (36, -26, 663, 724), "threequarters": (16, -26, 676, 724), "questiondown": (40, -205, 462, 518), "Agrave": (0, 0, 588, 909), "Aacute": (0, 0, 588, 909), "Acircumflex": (0, 0, 588, 909), "Atilde": (0, 0, 588, 894), "Adieresis": (0, 0, 588, 909), "Aring": (0, 0, 588, 932), "AE": (-34, 0, 780, 715), "Ccedilla": (39, -210, 550, 728), "Egrave": (60, 0, 506, 909), "Eacute": (60, 0, 506, 909), "Ecircumflex": (60, 0, 506, 909), "Edieresis": (60, 0, 506, 909), "Igrave": (-3, 0, 177, 909), "Iacute": (53, 0, 235, 909), "Icircumflex": (-20, 0, 250, 909), "Idieresis": (-19, 0, 250, 909), "Eth": (-1, 0, 551, 715), "Ntilde": (61, 0, 526, 894), "Ograve": (36, -12, 605, 909), "Oacute": (36, -12, 605, 909), "Ocircumflex": (36, -12, 605, 909), "Otilde": (36, -12, 605, 894), "Odieresis": (36, -12, 605, 909), "multiply": (43, 114, 434, 591), "Oslash": (25, -40, 615, 
750), "Ugrave": (58, -12, 526, 909), "Uacute": (58, -12, 526, 909), "Ucircumflex": (58, -12, 526, 909), "Udieresis": (58, -12, 526, 909), "Yacute": (0, 0, 547, 909), "Thorn": (59, 0, 509, 715), "germandbls": (55, -11, 472, 728), "agrave": (29, -11, 428, 728), "aacute": (29, -11, 428, 728), "acircumflex": (29, -11, 428, 728), "atilde": (29, -11, 428, 712), "adieresis": (29, -11, 428, 728), "aring": (29, -11, 428, 750), "ae": (35, -11, 690, 530), "ccedilla": (34, -204, 435, 530), "egrave": (26, -11, 423, 728), "eacute": (26, -11, 423, 728), "ecircumflex": (26, -11, 423, 728), "edieresis": (26, -11, 423, 728), "igrave": (-9, 0, 172, 728), "iacute": (58, 0, 240, 728), "icircumflex": (-20, 0, 250, 728), "idieresis": (-19, 0, 250, 728), "eth": (33, -12, 470, 715), "ntilde": (58, 0, 445, 712), "ograve": (32, -11, 471, 728), "oacute": (32, -11, 471, 728), "ocircumflex": (32, -11, 471, 728), "otilde": (32, -11, 471, 712), "odieresis": (32, -11, 471, 728), "divide": (23, 90, 524, 616), "oslash": (35, -35, 474, 546), "ugrave": (56, -11, 442, 728), "uacute": (56, -11, 442, 728), "ucircumflex": (56, -11, 442, 728), "udieresis": (56, -11, 442, 728), "yacute": (5, -210, 442, 728), "thorn": (55, -197, 470, 715), "ydieresis": (5, -210, 442, 728), }, "ArialNarrow,BoldItalic": { "space": (0, 0, 0, 0), "exclam": (50, 0, 289, 715), "quotedbl": (121, 461, 447, 715), "numbersign": (7, -12, 446, 728), "dollar": (36, -99, 472, 770), "percent": (74, -30, 708, 728), "ampersand": (67, -16, 578, 728), "quotesingle": (124, 461, 269, 715), "parenleft": (53, -210, 356, 728), "parenright": (-64, -210, 238, 728), "asterisk": (78, 382, 368, 721), "plus": (33, 103, 444, 603), "comma": (8, -155, 173, 135), "hyphen": (31, 190, 277, 325), "period": (36, 0, 172, 135), "slash": (-35, -12, 335, 728), "zero": (52, -12, 468, 718), "one": (97, 0, 418, 720), "two": (49, 0, 468, 718), "three": (41, -12, 459, 718), "four": (22, 0, 458, 715), "five": (52, -12, 474, 706), "six": (66, -12, 471, 718), "seven": (84, 
0, 494, 706), "eight": (54, -12, 464, 718), "nine": (52, -12, 457, 718), "colon": (57, 0, 259, 518), "semicolon": (33, -155, 262, 518), "less": (38, 81, 440, 625), "equal": (33, 181, 444, 524), "greater": (37, 81, 440, 624), "question": (100, 0, 506, 728), "at": (24, -210, 796, 728), "A": (-9, 0, 551, 715), "B": (32, 0, 582, 715), "C": (77, -12, 611, 728), "D": (35, 0, 594, 715), "E": (33, 0, 591, 715), "F": (31, 0, 565, 715), "G": (72, -12, 644, 728), "H": (35, 0, 626, 715), "I": (28, 0, 271, 715), "J": (23, -12, 492, 715), "K": (32, 0, 657, 715), "L": (37, 0, 477, 715), "M": (33, 0, 720, 715), "N": (36, 0, 625, 715), "O": (71, -12, 643, 728), "P": (33, 0, 576, 715), "Q": (72, -95, 643, 728), "R": (36, 0, 607, 715), "S": (51, -12, 554, 728), "T": (98, 0, 581, 715), "U": (74, -12, 626, 715), "V": (93, 0, 650, 715), "W": (96, 0, 875, 715), "X": (-24, 0, 642, 715), "Y": (94, 0, 643, 715), "Z": (20, 0, 547, 715), "bracketleft": (7, -197, 359, 715), "backslash": (63, -12, 235, 728), "bracketright": (-45, -197, 307, 715), "asciicircum": (46, 337, 433, 728), "underscore": (-5, -125, 462, -75), "grave": (109, 585, 271, 731), "a": (37, -11, 437, 530), "b": (29, -11, 493, 715), "c": (49, -11, 462, 530), "d": (48, -11, 548, 715), "e": (47, -11, 454, 530), "f": (43, 0, 385, 728), "g": (25, -210, 510, 530), "h": (34, 0, 484, 715), "i": (33, 0, 270, 715), "j": (-89, -210, 271, 715), "k": (31, 0, 503, 715), "l": (32, 0, 270, 715), "m": (29, 0, 712, 530), "n": (34, 0, 484, 530), "o": (49, -11, 491, 530), "p": (-4, -197, 496, 530), "q": (48, -197, 512, 530), "r": (26, 0, 388, 530), "s": (18, -11, 452, 530), "t": (61, -11, 320, 698), "u": (57, -11, 507, 518), "v": (61, 0, 506, 518), "w": (59, 0, 689, 518), "x": (-18, 0, 501, 518), "y": (5, -210, 509, 518), "z": (13, 0, 425, 518), "braceleft": (44, -210, 412, 728), "bar": (70, -210, 160, 728), "braceright": (-70, -210, 296, 728), "asciitilde": (26, 253, 452, 451), "bullet": (26, 208, 263, 497), "Euro": (21, -12, 523, 728), 
"quotesinglbase": (8, -155, 173, 135), "florin": (-7, -210, 457, 728), "quotedblbase": (2, -155, 361, 135), "ellipsis": (76, 0, 744, 135), "dagger": (69, -170, 487, 706), "daggerdbl": (0, -170, 491, 706), "circumflex": (45, 584, 320, 731), "perthousand": (55, -28, 837, 728), "Scaron": (51, -12, 554, 912), "guilsinglleft": (48, 34, 309, 477), "OE": (56, -12, 884, 728), "Zcaron": (20, 0, 547, 912), "quoteleft": (87, 424, 253, 715), "quoteright": (101, 424, 267, 715), "quotedblleft": (101, 424, 459, 715), "quotedblright": (104, 424, 463, 715), "endash": (-1, 208, 454, 310), "emdash": (0, 208, 819, 310), "tilde": (76, 592, 351, 710), "trademark": (86, 315, 719, 715), "scaron": (18, -11, 452, 731), "guilsinglright": (7, 34, 261, 477), "oe": (47, -11, 773, 530), "zcaron": (13, 0, 433, 731), "Ydieresis": (94, 0, 643, 898), "exclamdown": (9, -197, 250, 518), "cent": (48, -192, 461, 713), "sterling": (17, -18, 500, 728), "currency": (18, 100, 435, 610), "yen": (20, 0, 546, 715), "brokenbar": (70, -210, 160, 728), "section": (18, -211, 459, 728), "dieresis": (69, 597, 356, 716), "copyright": (-3, -17, 609, 730), "ordfeminine": (66, 362, 337, 728), "guillemotleft": (43, 34, 460, 477), "logicalnot": (33, 183, 444, 524), "registered": (-3, -17, 609, 730), "macron": (94, 790, 605, 840), "degree": (41, 416, 353, 728), "plusminus": (24, 0, 524, 674), "twosuperior": (66, 354, 324, 724), "threesuperior": (62, 349, 319, 724), "acute": (150, 583, 356, 730), "mu": (-37, -200, 584, 518), "paragraph": (0, -196, 452, 715), "periodcentered": (108, 290, 245, 425), "cedilla": (5, -207, 218, -12), "onesuperior": (93, 354, 296, 725), "ordmasculine": (59, 362, 339, 728), "guillemotright": (18, 34, 435, 477), "onequarter": (81, -29, 688, 725), "onehalf": (69, -29, 684, 725), "threequarters": (62, -29, 698, 724), "questiondown": (21, -209, 428, 518), "Agrave": (-9, 0, 551, 913), "Aacute": (-9, 0, 562, 912), "Acircumflex": (-9, 0, 551, 912), "Atilde": (-9, 0, 556, 892), "Adieresis": (-9, 0, 562, 
898), "Aring": (-9, 0, 551, 935), "AE": (-26, 0, 868, 715), "Ccedilla": (77, -204, 611, 728), "Egrave": (33, 0, 591, 913), "Eacute": (33, 0, 591, 912), "Ecircumflex": (33, 0, 591, 912), "Edieresis": (33, 0, 591, 898), "Igrave": (28, 0, 297, 913), "Iacute": (28, 0, 368, 912), "Icircumflex": (28, 0, 347, 912), "Idieresis": (28, 0, 383, 898), "Eth": (29, 0, 594, 715), "Ntilde": (36, 0, 625, 892), "Ograve": (71, -12, 643, 913), "Oacute": (71, -12, 643, 912), "Ocircumflex": (71, -12, 643, 912), "Otilde": (71, -12, 643, 892), "Odieresis": (71, -12, 643, 898), "multiply": (43, 114, 434, 591), "Oslash": (63, -59, 645, 766), "Ugrave": (74, -12, 626, 913), "Uacute": (74, -12, 626, 912), "Ucircumflex": (74, -12, 626, 912), "Udieresis": (74, -12, 626, 898), "Yacute": (94, 0, 643, 912), "Thorn": (33, 0, 552, 715), "germandbls": (28, -11, 476, 728), "agrave": (37, -11, 437, 731), "aacute": (37, -11, 437, 730), "acircumflex": (37, -11, 437, 731), "atilde": (37, -11, 447, 710), "adieresis": (37, -11, 454, 716), "aring": (37, -11, 437, 753), "ae": (25, -11, 709, 530), "ccedilla": (49, -203, 462, 530), "egrave": (47, -11, 454, 731), "eacute": (47, -11, 454, 730), "ecircumflex": (47, -11, 454, 731), "edieresis": (47, -11, 454, 716), "igrave": (33, 0, 258, 731), "iacute": (33, 0, 319, 730), "icircumflex": (33, 0, 319, 731), "idieresis": (33, 0, 342, 716), "eth": (49, -11, 498, 715), "ntilde": (34, 0, 484, 710), "ograve": (49, -11, 491, 731), "oacute": (49, -11, 491, 730), "ocircumflex": (49, -11, 491, 731), "otilde": (49, -11, 491, 710), "odieresis": (49, -11, 491, 716), "divide": (23, 90, 524, 616), "oslash": (42, -52, 495, 571), "ugrave": (57, -11, 507, 731), "uacute": (57, -11, 507, 730), "ucircumflex": (57, -11, 507, 731), "udieresis": (57, -11, 507, 716), "yacute": (5, -210, 509, 730), "thorn": (-7, -197, 494, 715), "ydieresis": (5, -210, 509, 716), }, "ArialNarrow,Italic": { "space": (0, 0, 0, 0), "exclam": (46, 0, 249, 715), "quotedbl": (106, 462, 346, 715), "numbersign": (7, 
-12, 444, 728), "dollar": (41, -95, 469, 763), "percent": (79, -26, 698, 728), "ampersand": (64, -17, 534, 728), "quotesingle": (104, 462, 212, 715), "parenleft": (69, -210, 338, 728), "parenright": (-43, -210, 225, 728), "asterisk": (92, 422, 357, 727), "plus": (45, 115, 433, 588), "comma": (20, -144, 144, 100), "hyphen": (37, 214, 273, 303), "period": (47, 0, 146, 100), "slash": (-41, -11, 336, 728), "zero": (58, -12, 463, 718), "one": (121, 0, 393, 718), "two": (48, 0, 460, 718), "three": (44, -12, 457, 718), "four": (37, 0, 445, 715), "five": (57, -12, 469, 706), "six": (68, -12, 465, 718), "seven": (99, 0, 488, 706), "eight": (61, -12, 462, 718), "nine": (55, -12, 452, 718), "colon": (46, 0, 217, 518), "semicolon": (20, -144, 215, 518), "less": (44, 110, 433, 595), "equal": (45, 203, 433, 502), "greater": (44, 110, 433, 595), "question": (104, 0, 459, 728), "at": (44, -210, 803, 729), "A": (-16, 0, 505, 715), "B": (35, 0, 537, 715), "C": (74, -12, 598, 728), "D": (36, 0, 583, 715), "E": (37, 0, 583, 715), "F": (37, 0, 541, 715), "G": (79, -12, 628, 728), "H": (34, 0, 618, 715), "I": (46, 0, 248, 715), "J": (26, -12, 438, 715), "K": (36, 0, 607, 715), "L": (32, 0, 429, 715), "M": (36, 0, 715, 715), "N": (39, 0, 620, 715), "O": (75, -12, 633, 728), "P": (35, 0, 572, 715), "Q": (76, -82, 634, 728), "R": (38, 0, 599, 715), "S": (58, -12, 551, 728), "T": (102, 0, 578, 715), "U": (79, -12, 618, 715), "V": (101, 0, 620, 715), "W": (102, 0, 870, 715), "X": (-25, 0, 630, 715), "Y": (96, 0, 634, 715), "Z": (20, 0, 521, 715), "bracketleft": (5, -195, 320, 715), "backslash": (69, -11, 224, 728), "bracketright": (-47, -195, 270, 715), "asciicircum": (21, 336, 363, 728), "underscore": (-5, -125, 460, -75), "grave": (119, 581, 254, 715), "a": (36, -11, 431, 530), "b": (27, -11, 438, 715), "c": (45, -11, 418, 530), "d": (42, -11, 490, 715), "e": (42, -11, 436, 530), "f": (37, 0, 334, 728), "g": (21, -207, 462, 530), "h": (27, 0, 433, 715), "i": (24, 0, 219, 715), "j": (-99, 
-207, 218, 715), "k": (27, 0, 454, 715), "l": (21, 0, 216, 715), "m": (26, 0, 666, 530), "n": (27, 0, 433, 530), "o": (40, -11, 442, 530), "p": (-8, -198, 438, 530), "q": (42, -198, 453, 530), "r": (27, 0, 344, 530), "s": (31, -11, 408, 530), "t": (45, -8, 263, 707), "u": (51, -11, 457, 518), "v": (64, 0, 458, 518), "w": (63, 0, 636, 518), "x": (-1, 0, 440, 518), "y": (0, -210, 459, 518), "z": (16, 0, 419, 518), "braceleft": (55, -210, 376, 728), "bar": (75, -210, 138, 728), "braceright": (-68, -210, 253, 728), "asciitilde": (35, 271, 444, 432), "bullet": (43, 226, 246, 474), "Euro": (33, -12, 528, 728), "quotesinglbase": (-5, -144, 118, 100), "florin": (18, -210, 434, 728), "quotedblbase": (-16, -144, 238, 100), "ellipsis": (117, 0, 764, 100), "dagger": (74, -170, 478, 706), "daggerdbl": (4, -170, 482, 706), "circumflex": (82, 581, 317, 715), "perthousand": (54, -26, 822, 728), "Scaron": (58, -12, 551, 896), "guilsinglleft": (39, 35, 257, 478), "OE": (65, -12, 856, 728), "Zcaron": (20, 0, 521, 896), "quoteleft": (103, 470, 228, 715), "quoteright": (103, 470, 227, 715), "quotedblleft": (83, 470, 336, 715), "quotedblright": (85, 470, 342, 715), "endash": (-1, 223, 454, 294), "emdash": (0, 223, 819, 294), "tilde": (76, 596, 347, 706), "trademark": (90, 317, 713, 715), "scaron": (31, -11, 410, 715), "guilsinglright": (13, 35, 235, 478), "oe": (51, -11, 752, 530), "zcaron": (16, 0, 419, 715), "Ydieresis": (96, 0, 634, 880), "exclamdown": (24, -197, 228, 518), "cent": (62, -198, 434, 725), "sterling": (25, -12, 498, 728), "currency": (28, 114, 421, 593), "yen": (29, 0, 546, 715), "brokenbar": (75, -210, 138, 728), "section": (24, -210, 455, 728), "dieresis": (95, 599, 335, 699), "copyright": (0, -8, 605, 728), "ordfeminine": (66, 359, 335, 728), "guillemotleft": (64, 35, 440, 478), "logicalnot": (45, 207, 433, 502), "registered": (0, -8, 605, 728), "macron": (88, 790, 600, 840), "degree": (133, 457, 404, 728), "plusminus": (38, 0, 510, 600), "twosuperior": (61, 357, 
329, 724), "threesuperior": (67, 349, 327, 724), "acute": (138, 581, 304, 715), "mu": (5, -200, 571, 518), "paragraph": (2, -198, 444, 715), "periodcentered": (124, 307, 223, 407), "cedilla": (30, -207, 235, 5), "onesuperior": (111, 357, 290, 724), "ordmasculine": (57, 360, 337, 728), "guillemotright": (33, 35, 414, 478), "onequarter": (68, -29, 697, 728), "onehalf": (48, -29, 677, 728), "threequarters": (67, -29, 708, 728), "questiondown": (46, -209, 401, 518), "Agrave": (-16, 0, 505, 896), "Aacute": (-16, 0, 505, 896), "Acircumflex": (-16, 0, 505, 896), "Atilde": (-16, 0, 514, 887), "Adieresis": (-16, 0, 505, 880), "Aring": (-16, 0, 505, 914), "AE": (-33, 0, 855, 715), "Ccedilla": (74, -210, 598, 728), "Egrave": (37, 0, 583, 896), "Eacute": (37, 0, 583, 896), "Ecircumflex": (37, 0, 583, 896), "Edieresis": (37, 0, 583, 880), "Igrave": (46, 0, 262, 896), "Iacute": (46, 0, 312, 896), "Icircumflex": (46, 0, 326, 896), "Idieresis": (46, 0, 343, 880), "Eth": (29, 0, 583, 715), "Ntilde": (39, 0, 620, 887), "Ograve": (75, -12, 633, 896), "Oacute": (75, -12, 633, 896), "Ocircumflex": (75, -12, 633, 896), "Otilde": (75, -12, 633, 887), "Odieresis": (75, -12, 633, 880), "multiply": (63, 140, 412, 566), "Oslash": (69, -50, 636, 764), "Ugrave": (79, -12, 618, 896), "Uacute": (79, -12, 618, 896), "Ucircumflex": (79, -12, 618, 896), "Udieresis": (79, -12, 618, 880), "Yacute": (96, 0, 634, 896), "Thorn": (35, 0, 547, 715), "germandbls": (29, -12, 465, 728), "agrave": (36, -11, 431, 715), "aacute": (36, -11, 431, 715), "acircumflex": (36, -11, 431, 715), "atilde": (36, -11, 443, 706), "adieresis": (36, -11, 431, 699), "aring": (36, -11, 431, 733), "ae": (34, -12, 708, 530), "ccedilla": (45, -207, 418, 530), "egrave": (42, -11, 436, 715), "eacute": (42, -11, 436, 715), "ecircumflex": (42, -11, 436, 715), "edieresis": (42, -11, 436, 699), "igrave": (50, 0, 254, 715), "iacute": (50, 0, 270, 715), "icircumflex": (50, 0, 305, 715), "idieresis": (50, 0, 310, 699), "eth": (40, -11, 447, 
715), "ntilde": (27, 0, 436, 706), "ograve": (40, -11, 442, 715), "oacute": (40, -11, 442, 715), "ocircumflex": (40, -11, 442, 715), "otilde": (40, -11, 442, 706), "odieresis": (40, -11, 442, 699), "divide": (38, 155, 510, 550), "oslash": (58, -49, 476, 565), "ugrave": (51, -11, 457, 715), "uacute": (51, -11, 457, 715), "ucircumflex": (51, -11, 457, 715), "udieresis": (51, -11, 457, 699), "yacute": (0, -210, 459, 715), "thorn": (-8, -198, 438, 715), "ydieresis": (0, -210, 459, 699), }, "Arial,Black": { "space": (0, 0, 0, 0), "exclam": (60, 0, 272, 715), "quotedbl": (23, 452, 476, 715), "numbersign": (29, -11, 627, 728), "dollar": (26, -104, 631, 770), "percent": (48, -36, 951, 728), "ampersand": (74, -11, 848, 728), "quotesingle": (41, 452, 239, 715), "parenleft": (54, -210, 350, 728), "parenright": (39, -210, 334, 728), "asterisk": (86, 370, 465, 728), "plus": (62, 91, 594, 624), "comma": (60, -201, 272, 197), "hyphen": (21, 184, 311, 337), "period": (60, 0, 272, 199), "slash": (0, -11, 280, 728), "zero": (41, -12, 625, 728), "one": (81, 0, 491, 728), "two": (26, 0, 623, 728), "three": (35, -12, 626, 728), "four": (20, 0, 645, 728), "five": (32, -12, 627, 715), "six": (41, -12, 631, 728), "seven": (44, 0, 625, 715), "eight": (41, -12, 625, 728), "nine": (34, -12, 624, 728), "colon": (60, 0, 272, 518), "semicolon": (60, -201, 272, 518), "less": (52, 54, 607, 660), "equal": (61, 158, 594, 557), "greater": (52, 54, 607, 660), "question": (35, 0, 575, 728), "at": (-2, -113, 741, 728), "A": (0, 0, 780, 715), "B": (73, 0, 735, 715), "C": (47, -12, 743, 728), "D": (76, 0, 734, 715), "E": (72, 0, 676, 715), "F": (74, 0, 621, 715), "G": (45, -12, 774, 728), "H": (74, 0, 759, 715), "I": (82, 0, 303, 715), "J": (17, -12, 592, 715), "K": (74, 0, 833, 715), "L": (73, 0, 639, 715), "M": (70, 0, 875, 715), "N": (74, 0, 759, 715), "O": (45, -12, 787, 728), "P": (72, 0, 679, 715), "Q": (45, -80, 814, 728), "R": (76, 0, 780, 715), "S": (34, -12, 684, 728), "T": (22, 0, 695, 715), 
"U": (73, -12, 759, 715), "V": (2, 0, 778, 715), "W": (0, 0, 1000, 715), "X": (1, 0, 779, 715), "Y": (0, 0, 779, 715), "Z": (16, 0, 695, 715), "bracketleft": (65, -198, 366, 715), "backslash": (-2, -11, 277, 728), "bracketright": (22, -198, 323, 715), "asciicircum": (61, 331, 595, 728), "underscore": (-5, -125, 505, -75), "grave": (0, 582, 250, 728), "a": (35, -11, 632, 530), "b": (61, -11, 631, 715), "c": (36, -12, 635, 530), "d": (35, -11, 605, 715), "e": (35, -11, 635, 530), "f": (7, 0, 418, 728), "g": (35, -210, 607, 530), "h": (60, 0, 608, 715), "i": (67, 0, 266, 715), "j": (-48, -210, 267, 715), "k": (60, 0, 666, 715), "l": (66, 0, 266, 715), "m": (61, 0, 941, 530), "n": (60, 0, 608, 530), "o": (35, -11, 631, 530), "p": (61, -197, 631, 530), "q": (35, -197, 605, 530), "r": (62, 0, 470, 530), "s": (24, -12, 576, 530), "t": (27, -11, 416, 715), "u": (58, -11, 606, 518), "v": (0, 0, 613, 518), "w": (1, 0, 945, 518), "x": (5, 0, 661, 518), "y": (2, -210, 614, 518), "z": (18, 0, 534, 518), "braceleft": (12, -210, 377, 728), "bar": (78, -197, 202, 715), "braceright": (11, -210, 376, 728), "asciitilde": (48, 240, 608, 475), "bullet": (87, 189, 412, 514), "Euro": (8, -12, 641, 728), "quotesinglbase": (34, -201, 246, 197), "florin": (18, -210, 651, 728), "quotedblbase": (26, -201, 486, 197), "ellipsis": (60, 0, 939, 199), "dagger": (68, -198, 604, 715), "daggerdbl": (68, -198, 604, 715), "circumflex": (-13, 582, 347, 721), "perthousand": (0, -36, 1000, 728), "Scaron": (34, -12, 684, 898), "guilsinglleft": (11, 34, 319, 486), "OE": (34, -12, 968, 728), "Zcaron": (16, 0, 695, 898), "quoteleft": (34, 329, 246, 728), "quoteright": (34, 329, 246, 728), "quotedblleft": (26, 329, 486, 728), "quotedblright": (26, 329, 486, 728), "endash": (-5, 207, 505, 315), "emdash": (-5, 207, 1005, 315), "tilde": (-9, 580, 342, 715), "trademark": (17, 317, 910, 715), "scaron": (24, -12, 576, 721), "guilsinglright": (13, 34, 321, 486), "oe": (28, -11, 972, 530), "zcaron": (18, 0, 534, 721), 
"Ydieresis": (0, 0, 779, 883), "exclamdown": (60, -197, 272, 518), "cent": (36, -190, 635, 706), "sterling": (55, -12, 662, 728), "currency": (47, 0, 607, 560), "yen": (0, 0, 667, 715), "brokenbar": (78, -197, 202, 715), "section": (31, -210, 628, 728), "dieresis": (0, 583, 334, 706), "copyright": (28, -17, 773, 728), "ordfeminine": (16, 363, 371, 728), "guillemotleft": (46, 34, 607, 486), "logicalnot": (61, 154, 594, 553), "registered": (28, -17, 773, 728), "macron": (-5, 780, 505, 830), "degree": (58, 449, 337, 728), "plusminus": (62, 0, 594, 705), "twosuperior": (10, 361, 386, 728), "threesuperior": (15, 352, 384, 728), "acute": (79, 582, 332, 728), "mu": (58, -196, 607, 518), "paragraph": (65, -198, 789, 715), "periodcentered": (60, 258, 272, 457), "cedilla": (8, -210, 304, -11), "onesuperior": (68, 361, 306, 728), "ordmasculine": (11, 362, 384, 728), "guillemotright": (59, 34, 620, 486), "onequarter": (76, -25, 962, 728), "onehalf": (76, -25, 971, 728), "threequarters": (34, -25, 962, 728), "questiondown": (35, -209, 575, 518), "Agrave": (0, 0, 780, 905), "Aacute": (0, 0, 780, 905), "Acircumflex": (0, 0, 780, 898), "Atilde": (0, 0, 780, 893), "Adieresis": (0, 0, 780, 883), "Aring": (0, 0, 780, 892), "AE": (-37, 0, 964, 715), "Ccedilla": (47, -210, 743, 728), "Egrave": (72, 0, 676, 905), "Eacute": (72, 0, 676, 905), "Ecircumflex": (72, 0, 676, 898), "Edieresis": (72, 0, 676, 883), "Igrave": (28, 0, 303, 905), "Iacute": (82, 0, 360, 905), "Icircumflex": (14, 0, 375, 898), "Idieresis": (27, 0, 362, 883), "Eth": (0, 0, 734, 715), "Ntilde": (74, 0, 759, 893), "Ograve": (45, -12, 787, 905), "Oacute": (45, -12, 787, 905), "Ocircumflex": (45, -12, 787, 898), "Otilde": (45, -12, 787, 893), "Odieresis": (45, -12, 787, 883), "multiply": (61, 90, 595, 625), "Oslash": (17, -25, 815, 740), "Ugrave": (73, -12, 759, 905), "Uacute": (73, -12, 759, 905), "Ucircumflex": (73, -12, 759, 898), "Udieresis": (73, -12, 759, 883), "Yacute": (0, 0, 779, 905), "Thorn": (72, 0, 679, 715), 
"germandbls": (58, -11, 631, 728), "agrave": (35, -11, 632, 728), "aacute": (35, -11, 632, 728), "acircumflex": (35, -11, 632, 721), "atilde": (35, -11, 632, 715), "adieresis": (35, -11, 632, 706), "aring": (35, -11, 632, 802), "ae": (33, -11, 971, 530), "ccedilla": (36, -210, 635, 530), "egrave": (35, -11, 635, 728), "eacute": (35, -11, 635, 728), "ecircumflex": (35, -11, 635, 721), "edieresis": (35, -11, 635, 706), "igrave": (0, 0, 266, 728), "iacute": (67, 0, 332, 728), "icircumflex": (-13, 0, 347, 721), "idieresis": (0, 0, 334, 706), "eth": (36, -11, 629, 715), "ntilde": (60, 0, 608, 715), "ograve": (35, -11, 631, 728), "oacute": (35, -11, 631, 728), "ocircumflex": (35, -11, 631, 721), "otilde": (35, -11, 631, 715), "odieresis": (35, -11, 631, 706), "divide": (62, 51, 594, 662), "oslash": (35, -47, 630, 564), "ugrave": (58, -11, 606, 728), "uacute": (58, -11, 606, 728), "ucircumflex": (58, -11, 606, 721), "udieresis": (58, -11, 606, 706), "yacute": (2, -210, 614, 728), "thorn": (61, -197, 631, 715), "ydieresis": (2, -210, 614, 706), }, "Garamond": { "space": (0, 0, 0, 0), "exclam": (61, -12, 160, 638), "quotedbl": (64, 392, 341, 677), "numbersign": (45, -22, 620, 666), "dollar": (41, -133, 404, 655), "percent": (36, -32, 789, 637), "ampersand": (26, -14, 713, 594), "quotesingle": (39, 392, 137, 677), "parenleft": (76, -245, 309, 639), "parenright": (-21, -244, 213, 640), "asterisk": (28, 240, 393, 631), "plus": (70, 49, 595, 572), "comma": (41, -173, 189, 68), "hyphen": (37, 171, 275, 217), "period": (58, -14, 160, 93), "slash": (56, -135, 443, 696), "zero": (35, -14, 437, 636), "one": (75, 0, 354, 633), "two": (21, 0, 441, 633), "three": (38, -13, 424, 636), "four": (26, -11, 456, 636), "five": (51, -16, 418, 638), "six": (48, -13, 427, 639), "seven": (45, -12, 431, 619), "eight": (56, -13, 429, 633), "nine": (43, -14, 421, 638), "colon": (57, -13, 161, 387), "semicolon": (42, -156, 188, 391), "less": (71, 70, 594, 551), "equal": (71, 176, 595, 445), 
"greater": (71, 70, 594, 551), "question": (43, -14, 330, 640), "at": (47, -215, 896, 694), "A": (-7, 0, 669, 655), "B": (13, 0, 568, 633), "C": (43, -13, 601, 640), "D": (10, -8, 722, 635), "E": (23, -6, 632, 622), "F": (28, -9, 540, 631), "G": (46, -12, 758, 640), "H": (19, -10, 734, 629), "I": (20, 0, 324, 624), "J": (-84, -252, 277, 624), "K": (28, -8, 759, 625), "L": (5, -2, 574, 622), "M": (6, -4, 826, 629), "N": (12, -22, 732, 627), "O": (45, -9, 733, 630), "P": (18, -9, 536, 632), "Q": (47, -217, 748, 642), "R": (20, -2, 641, 629), "S": (37, -16, 437, 642), "T": (-1, -12, 602, 649), "U": (18, -16, 675, 627), "V": (-8, -19, 686, 628), "W": (-9, -27, 891, 624), "X": (4, -10, 707, 623), "Y": (-9, -6, 664, 629), "Z": (35, -7, 608, 657), "bracketleft": (101, -231, 295, 627), "backslash": (55, -135, 444, 696), "bracketright": (-20, -232, 174, 627), "asciicircum": (32, 382, 469, 670), "underscore": (-5, -125, 505, -75), "grave": (97, 479, 261, 631), "a": (32, -11, 399, 398), "b": (16, -20, 471, 658), "c": (38, -15, 390, 398), "d": (32, -18, 487, 658), "e": (38, -12, 392, 401), "f": (46, 0, 402, 653), "g": (6, -257, 460, 400), "h": (14, -3, 497, 650), "i": (0, -2, 221, 639), "j": (20, -263, 153, 634), "k": (25, 0, 477, 654), "l": (4, 0, 227, 648), "m": (17, 0, 753, 417), "n": (17, 0, 500, 411), "o": (35, -13, 474, 400), "p": (11, -256, 474, 434), "q": (34, -255, 498, 412), "r": (18, -1, 332, 422), "s": (55, -15, 321, 404), "t": (27, -10, 295, 482), "u": (16, -9, 483, 383), "v": (-5, -20, 477, 387), "w": (-10, -22, 675, 385), "x": (13, 0, 444, 385), "y": (3, -246, 430, 386), "z": (26, -2, 389, 422), "braceleft": (138, -215, 410, 694), "bar": (228, -257, 271, 653), "braceright": (86, -215, 358, 694), "asciitilde": (73, 243, 593, 378), "bullet": (54, 208, 299, 453), "Euro": (-13, -13, 454, 640), "quotesinglbase": (45, -173, 188, 68), "florin": (0, -256, 615, 642), "quotedblbase": (31, -172, 406, 71), "ellipsis": (114, -9, 885, 96), "dagger": (0, -243, 422, 640), 
"daggerdbl": (15, -240, 411, 643), "circumflex": (71, 477, 286, 650), "perthousand": (35, -32, 987, 637), "Scaron": (37, -16, 437, 859), "guilsinglleft": (6, 6, 190, 393), "OE": (46, -8, 909, 629), "Zcaron": (35, -7, 608, 859), "quoteleft": (51, 393, 199, 637), "quoteright": (49, 393, 193, 636), "quotedblleft": (43, 392, 418, 635), "quotedblright": (35, 395, 412, 643), "endash": (-5, 168, 505, 213), "emdash": (-5, 168, 1005, 213), "tilde": (42, 504, 322, 604), "trademark": (14, 268, 963, 662), "scaron": (55, -15, 321, 650), "guilsinglright": (8, 7, 190, 395), "oe": (38, -16, 666, 400), "zcaron": (26, -2, 389, 650), "Ydieresis": (-9, -6, 664, 770), "exclamdown": (59, -240, 159, 408), "cent": (38, -168, 389, 580), "sterling": (29, -235, 591, 633), "currency": (98, 89, 564, 555), "yen": (-9, -6, 664, 629), "brokenbar": (228, -257, 271, 653), "section": (56, -243, 369, 641), "dieresis": (64, 515, 316, 600), "copyright": (33, -15, 726, 677), "ordfeminine": (13, 377, 264, 630), "guillemotleft": (5, 5, 365, 390), "logicalnot": (71, 180, 595, 461), "registered": (33, -15, 726, 677), "macron": (-5, 743, 505, 793), "degree": (47, 376, 348, 676), "plusminus": (70, -18, 595, 660), "twosuperior": (24, 305, 284, 635), "threesuperior": (35, 297, 274, 636), "acute": (119, 479, 284, 630), "mu": (22, -216, 497, 383), "paragraph": (-6, -215, 454, 662), "periodcentered": (115, 284, 217, 391), "cedilla": (0, -210, 146, 6), "onesuperior": (56, 305, 231, 635), "ordmasculine": (18, 376, 314, 630), "guillemotright": (0, 5, 360, 390), "onequarter": (56, -34, 785, 635), "onehalf": (56, -32, 776, 637), "threequarters": (35, -32, 791, 637), "questiondown": (16, -245, 302, 408), "Agrave": (-7, 0, 669, 837), "Aacute": (-7, 0, 669, 836), "Acircumflex": (-7, 0, 669, 859), "Atilde": (-7, 0, 669, 785), "Adieresis": (-7, 0, 669, 770), "Aring": (-7, 0, 669, 807), "AE": (-62, -4, 828, 627), "Ccedilla": (43, -210, 601, 640), "Egrave": (23, -6, 632, 837), "Eacute": (23, -6, 632, 836), "Ecircumflex": (23, 
-6, 632, 859), "Edieresis": (23, -6, 632, 770), "Igrave": (20, 0, 324, 837), "Iacute": (20, 0, 324, 836), "Icircumflex": (20, 0, 324, 859), "Idieresis": (20, 0, 324, 770), "Eth": (7, -8, 722, 635), "Ntilde": (12, -22, 732, 785), "Ograve": (45, -9, 733, 837), "Oacute": (45, -9, 733, 836), "Ocircumflex": (45, -9, 733, 859), "Otilde": (45, -9, 733, 785), "Odieresis": (45, -9, 733, 770), "multiply": (96, 73, 571, 548), "Oslash": (45, -30, 733, 651), "Ugrave": (18, -16, 675, 837), "Uacute": (18, -16, 675, 836), "Ucircumflex": (18, -16, 675, 859), "Udieresis": (18, -16, 675, 770), "Yacute": (-9, -6, 664, 836), "Thorn": (18, -9, 536, 625), "germandbls": (7, -15, 469, 643), "agrave": (32, -11, 399, 631), "aacute": (32, -11, 399, 630), "acircumflex": (32, -11, 399, 650), "atilde": (32, -11, 399, 604), "adieresis": (32, -11, 399, 600), "aring": (32, -11, 399, 614), "ae": (36, -15, 561, 399), "ccedilla": (38, -210, 390, 398), "egrave": (38, -12, 392, 631), "eacute": (38, -12, 392, 630), "ecircumflex": (38, -12, 392, 650), "edieresis": (38, -12, 392, 600), "igrave": (-1, -2, 219, 631), "iacute": (-1, -2, 231, 630), "icircumflex": (-1, -2, 224, 650), "idieresis": (-1, -2, 250, 600), "eth": (44, -11, 485, 642), "ntilde": (17, 0, 500, 604), "ograve": (35, -13, 474, 631), "oacute": (35, -13, 474, 630), "ocircumflex": (35, -13, 474, 650), "otilde": (35, -13, 474, 604), "odieresis": (35, -13, 474, 600), "divide": (11, 136, 537, 524), "oslash": (38, -23, 476, 412), "ugrave": (16, -9, 483, 631), "uacute": (16, -9, 483, 630), "ucircumflex": (16, -9, 483, 650), "udieresis": (16, -9, 483, 600), "yacute": (3, -246, 430, 630), "thorn": (11, -256, 474, 648), "ydieresis": (3, -246, 430, 600), }, "Garamond,Bold": { "space": (0, 0, 0, 0), "exclam": (61, -8, 202, 649), "quotedbl": (85, 352, 465, 677), "numbersign": (41, -21, 625, 675), "dollar": (39, -94, 437, 635), "percent": (31, -12, 800, 653), "ampersand": (45, -10, 762, 613), "quotesingle": (68, 352, 212, 677), "parenleft": (68, -236, 350, 
647), "parenright": (11, -236, 294, 647), "asterisk": (32, 213, 457, 649), "plus": (65, 50, 601, 584), "comma": (45, -179, 221, 134), "hyphen": (34, 158, 302, 251), "period": (61, -8, 202, 132), "slash": (57, -135, 495, 696), "zero": (27, -10, 438, 645), "one": (25, 3, 368, 644), "two": (19, 1, 449, 642), "three": (14, -13, 437, 642), "four": (23, -10, 445, 644), "five": (31, -10, 428, 641), "six": (29, -10, 439, 648), "seven": (34, -10, 430, 628), "eight": (42, -10, 434, 641), "nine": (30, -14, 442, 644), "colon": (57, -8, 199, 423), "semicolon": (48, -178, 224, 424), "less": (66, 59, 600, 576), "equal": (66, 164, 600, 471), "greater": (66, 59, 600, 576), "question": (48, -9, 375, 650), "at": (44, -215, 908, 677), "A": (-12, 3, 676, 647), "B": (35, 0, 627, 639), "C": (45, -6, 645, 649), "D": (24, 3, 736, 645), "E": (17, 0, 670, 635), "F": (29, 0, 585, 638), "G": (45, -8, 711, 646), "H": (31, 4, 826, 639), "I": (40, 1, 352, 639), "J": (-58, -235, 345, 638), "K": (26, 2, 709, 639), "L": (19, 1, 632, 641), "M": (20, 0, 894, 637), "N": (3, -13, 814, 636), "O": (43, -5, 744, 647), "P": (23, 0, 587, 639), "Q": (43, -170, 750, 648), "R": (39, 1, 710, 640), "S": (49, -6, 476, 649), "T": (0, 1, 657, 664), "U": (17, -13, 718, 634), "V": (-11, -4, 675, 640), "W": (0, -14, 898, 633), "X": (4, 1, 687, 635), "Y": (-18, 2, 672, 635), "Z": (21, 1, 620, 660), "bracketleft": (122, -225, 340, 631), "backslash": (58, -135, 494, 696), "bracketright": (20, -224, 240, 633), "asciicircum": (73, 325, 511, 675), "underscore": (-5, -125, 505, -75), "grave": (59, 468, 242, 625), "a": (48, -2, 468, 415), "b": (20, -8, 516, 646), "c": (38, -7, 447, 419), "d": (38, -11, 543, 652), "e": (35, -8, 435, 418), "f": (26, 1, 393, 648), "g": (24, -250, 539, 415), "h": (18, 0, 540, 646), "i": (14, 2, 268, 645), "j": (21, -229, 199, 645), "k": (15, 0, 539, 647), "l": (3, 1, 260, 647), "m": (20, 3, 833, 434), "n": (19, 0, 539, 440), "o": (36, -8, 484, 418), "p": (-1, -246, 515, 447), "q": (38, -248, 545, 
443), "r": (17, 3, 343, 437), "s": (43, -8, 374, 417), "t": (27, -1, 301, 497), "u": (20, -8, 536, 401), "v": (-6, -6, 466, 402), "w": (-6, -6, 717, 400), "x": (9, 2, 485, 400), "y": (-7, -237, 471, 400), "z": (29, 3, 426, 447), "braceleft": (80, -202, 351, 677), "bar": (231, -249, 309, 644), "braceright": (44, -202, 315, 677), "asciitilde": (67, 238, 599, 396), "bullet": (37, 190, 316, 469), "Euro": (-17, -5, 448, 649), "quotesinglbase": (40, -179, 216, 134), "florin": (0, -236, 708, 645), "quotedblbase": (43, -177, 457, 134), "ellipsis": (94, -7, 904, 135), "dagger": (14, -236, 486, 648), "daggerdbl": (21, -232, 479, 652), "circumflex": (32, 460, 322, 633), "perthousand": (31, -12, 998, 653), "Scaron": (49, -6, 476, 848), "guilsinglleft": (11, 13, 251, 402), "OE": (50, 0, 943, 646), "Zcaron": (21, 1, 620, 848), "quoteleft": (45, 326, 223, 640), "quoteright": (34, 326, 210, 639), "quotedblleft": (46, 325, 461, 640), "quotedblright": (33, 326, 450, 639), "endash": (-5, 205, 505, 295), "emdash": (-5, 205, 1005, 295), "tilde": (10, 486, 334, 615), "trademark": (-1, 268, 1005, 662), "scaron": (43, -8, 374, 635), "guilsinglright": (22, 10, 262, 399), "oe": (36, -6, 699, 419), "zcaron": (29, 3, 428, 635), "Ydieresis": (-18, 2, 672, 822), "exclamdown": (58, -238, 199, 419), "cent": (27, -171, 436, 584), "sterling": (46, -229, 645, 647), "currency": (81, 78, 581, 578), "yen": (-18, 2, 672, 635), "brokenbar": (231, -249, 309, 644), "section": (41, -241, 463, 647), "dieresis": (33, 488, 319, 609), "copyright": (28, -15, 721, 677), "ordfeminine": (22, 393, 303, 645), "guillemotleft": (2, 12, 430, 396), "logicalnot": (65, 168, 601, 483), "registered": (28, -15, 721, 677), "macron": (-5, 682, 505, 732), "degree": (28, 337, 366, 675), "plusminus": (65, -23, 601, 676), "twosuperior": (23, 310, 287, 644), "threesuperior": (20, 302, 282, 644), "acute": (114, 467, 298, 625), "mu": (25, -186, 453, 401), "paragraph": (0, -215, 541, 662), "periodcentered": (96, 253, 237, 394), 
"cedilla": (43, -228, 291, 7), "onesuperior": (43, 311, 258, 645), "ordmasculine": (17, 389, 316, 647), "guillemotright": (17, 12, 444, 396), "onequarter": (46, -12, 804, 653), "onehalf": (46, -12, 805, 653), "threequarters": (23, -12, 804, 653), "questiondown": (42, -239, 369, 421), "Agrave": (-12, 3, 676, 837), "Aacute": (-12, 3, 676, 837), "Acircumflex": (-12, 3, 676, 846), "Atilde": (-12, 3, 676, 828), "Adieresis": (-12, 3, 676, 822), "Aring": (-12, 3, 676, 802), "AE": (-44, -2, 841, 633), "Ccedilla": (45, -228, 645, 649), "Egrave": (17, 0, 670, 837), "Eacute": (17, 0, 670, 837), "Ecircumflex": (17, 0, 670, 846), "Edieresis": (17, 0, 670, 822), "Igrave": (40, 1, 352, 837), "Iacute": (40, 1, 352, 837), "Icircumflex": (40, 1, 354, 846), "Idieresis": (40, 1, 352, 822), "Eth": (24, 3, 736, 645), "Ntilde": (3, -13, 814, 828), "Ograve": (43, -5, 744, 837), "Oacute": (43, -5, 744, 837), "Ocircumflex": (43, -5, 744, 846), "Otilde": (43, -5, 744, 828), "Odieresis": (43, -5, 744, 822), "multiply": (85, 70, 582, 565), "Oslash": (43, -7, 744, 650), "Ugrave": (17, -13, 718, 837), "Uacute": (17, -13, 718, 837), "Ucircumflex": (17, -13, 718, 846), "Udieresis": (17, -13, 718, 822), "Yacute": (-18, 2, 672, 837), "Thorn": (23, 0, 588, 639), "germandbls": (17, -1, 514, 647), "agrave": (48, -2, 468, 625), "aacute": (48, -2, 468, 625), "acircumflex": (48, -2, 468, 633), "atilde": (48, -2, 468, 615), "adieresis": (48, -2, 468, 609), "aring": (48, -2, 468, 629), "ae": (41, -8, 664, 416), "ccedilla": (38, -228, 447, 419), "egrave": (35, -8, 435, 625), "eacute": (35, -8, 435, 625), "ecircumflex": (35, -8, 435, 633), "edieresis": (35, -8, 435, 609), "igrave": (16, 2, 268, 625), "iacute": (16, 2, 271, 625), "icircumflex": (5, 2, 296, 633), "idieresis": (7, 2, 292, 609), "eth": (33, -8, 482, 648), "ntilde": (19, 0, 539, 615), "ograve": (36, -8, 484, 625), "oacute": (36, -8, 484, 625), "ocircumflex": (36, -8, 484, 633), "otilde": (36, -8, 484, 615), "odieresis": (36, -8, 484, 609), 
"divide": (65, 69, 601, 569), "oslash": (36, -38, 485, 449), "ugrave": (20, -8, 536, 625), "uacute": (20, -8, 536, 625), "ucircumflex": (20, -8, 536, 633), "udieresis": (20, -8, 536, 609), "yacute": (-7, -237, 471, 625), "thorn": (-1, -246, 515, 647), "ydieresis": (-7, -237, 471, 609), }, "Garamond,Italic": { "space": (0, 0, 0, 0), "exclam": (49, -11, 299, 623), "quotedbl": (124, 392, 465, 677), "numbersign": (81, -22, 656, 666), "dollar": (11, -105, 460, 629), "percent": (71, -32, 734, 633), "ampersand": (91, -9, 978, 655), "quotesingle": (131, 392, 261, 677), "parenleft": (95, -255, 428, 651), "parenright": (-78, -253, 257, 652), "asterisk": (95, 245, 490, 631), "plus": (105, 49, 630, 572), "comma": (-17, -160, 154, 119), "hyphen": (51, 169, 269, 219), "period": (41, -14, 142, 93), "slash": (56, -135, 443, 696), "zero": (52, -11, 471, 633), "one": (148, 0, 407, 631), "two": (16, 0, 485, 632), "three": (21, -11, 453, 632), "four": (16, 0, 443, 631), "five": (15, -11, 499, 640), "six": (56, -11, 505, 633), "seven": (81, -11, 518, 613), "eight": (45, -13, 475, 631), "nine": (28, -12, 478, 633), "colon": (42, -10, 238, 396), "semicolon": (0, -157, 251, 398), "less": (106, 69, 629, 551), "equal": (106, 175, 630, 445), "greater": (106, 69, 629, 551), "question": (110, -12, 416, 635), "at": (47, -215, 896, 694), "A": (-55, -8, 746, 641), "B": (12, -7, 544, 640), "C": (70, -15, 702, 646), "D": (18, -6, 734, 639), "E": (-2, -8, 673, 636), "F": (7, -8, 648, 640), "G": (70, -16, 708, 641), "H": (16, -7, 833, 639), "I": (7, -8, 393, 640), "J": (-117, -248, 390, 639), "K": (14, -8, 677, 637), "L": (1, -4, 674, 632), "M": (-25, -19, 883, 646), "N": (-9, -18, 865, 640), "O": (81, -13, 674, 648), "P": (12, -6, 574, 643), "Q": (-97, -235, 690, 643), "R": (30, -5, 673, 636), "S": (28, -15, 523, 645), "T": (69, -10, 682, 652), "U": (115, -15, 784, 641), "V": (118, -19, 925, 638), "W": (106, -18, 1003, 637), "X": (-10, -8, 826, 645), "Y": (71, -3, 760, 643), "Z": (41, 0, 631, 635), 
"bracketleft": (47, -229, 479, 625), "backslash": (55, -135, 444, 696), "bracketright": (-104, -229, 322, 625), "asciicircum": (67, 382, 504, 670), "underscore": (-5, -125, 505, -75), "grave": (194, 461, 357, 612), "a": (38, -12, 426, 387), "b": (66, -14, 429, 646), "c": (48, -10, 334, 400), "d": (44, -20, 509, 656), "e": (50, -16, 315, 395), "f": (-182, -256, 434, 642), "g": (-92, -246, 380, 400), "h": (35, -16, 422, 649), "i": (37, -11, 291, 621), "j": (-216, -245, 284, 606), "k": (32, -23, 512, 645), "l": (35, -13, 334, 649), "m": (24, -13, 649, 396), "n": (45, -14, 434, 403), "o": (55, -11, 354, 399), "p": (-141, -252, 409, 516), "q": (38, -252, 450, 402), "r": (55, -11, 397, 400), "s": (25, -8, 331, 399), "t": (38, -8, 335, 522), "u": (38, -12, 452, 400), "v": (52, -15, 379, 407), "w": (35, -18, 577, 401), "x": (8, -9, 556, 397), "y": (-215, -243, 350, 399), "z": (58, -253, 486, 399), "braceleft": (138, -215, 410, 694), "bar": (263, -246, 307, 641), "braceright": (133, -215, 406, 694), "asciitilde": (108, 243, 628, 377), "bullet": (102, 208, 347, 453), "Euro": (44, -16, 611, 645), "quotesinglbase": (7, -137, 151, 119), "florin": (0, -256, 615, 642), "quotedblbase": (6, -162, 357, 95), "ellipsis": (114, -9, 886, 96), "dagger": (84, -242, 499, 644), "daggerdbl": (-18, -254, 499, 654), "circumflex": (163, 439, 390, 622), "perthousand": (70, -32, 891, 633), "Scaron": (28, -15, 600, 856), "guilsinglleft": (61, -5, 317, 404), "OE": (80, -4, 963, 642), "Zcaron": (41, 0, 648, 853), "quoteleft": (177, 386, 326, 650), "quoteright": (152, 393, 297, 650), "quotedblleft": (188, 385, 536, 646), "quotedblright": (146, 388, 495, 645), "endash": (-5, 168, 505, 213), "emdash": (-5, 168, 1005, 213), "tilde": (158, 489, 437, 589), "trademark": (61, 268, 1010, 662), "scaron": (25, -8, 455, 624), "guilsinglright": (-19, -7, 236, 404), "oe": (52, -11, 493, 398), "zcaron": (58, -253, 522, 624), "Ydieresis": (71, -3, 760, 786), "exclamdown": (-17, -227, 232, 408), "cent": (-7, -121, 
351, 534), "sterling": (31, -235, 593, 633), "currency": (133, 89, 600, 555), "yen": (45, -9, 741, 638), "brokenbar": (263, -246, 307, 641), "section": (-4, -227, 464, 644), "dieresis": (179, 494, 422, 574), "copyright": (81, -15, 773, 677), "ordfeminine": (103, 392, 365, 638), "guillemotleft": (52, -7, 458, 403), "logicalnot": (106, 180, 630, 461), "registered": (81, -15, 773, 677), "macron": (80, 669, 591, 719), "degree": (104, 378, 404, 678), "plusminus": (105, -18, 630, 660), "twosuperior": (49, 303, 338, 632), "threesuperior": (52, 297, 319, 632), "acute": (242, 460, 404, 611), "mu": (-62, -215, 481, 383), "paragraph": (-6, -215, 454, 662), "periodcentered": (162, 263, 264, 371), "cedilla": (23, -223, 147, 7), "onesuperior": (127, 303, 293, 632), "ordmasculine": (115, 392, 321, 645), "guillemotright": (-12, -6, 394, 404), "onequarter": (127, -32, 729, 633), "onehalf": (127, -32, 754, 633), "threequarters": (52, -32, 729, 633), "questiondown": (-4, -237, 301, 409), "Agrave": (-55, -8, 762, 845), "Aacute": (-55, -8, 853, 845), "Acircumflex": (-55, -8, 827, 861), "Atilde": (-55, -8, 890, 801), "Adieresis": (-55, -8, 844, 786), "Aring": (-55, -8, 758, 791), "AE": (-32, -6, 869, 637), "Ccedilla": (70, -226, 702, 646), "Egrave": (-2, -8, 673, 845), "Eacute": (-2, -8, 673, 845), "Ecircumflex": (-2, -8, 673, 861), "Edieresis": (-2, -8, 673, 786), "Igrave": (7, -8, 393, 845), "Iacute": (7, -8, 408, 845), "Icircumflex": (7, -8, 393, 861), "Idieresis": (7, -8, 446, 786), "Eth": (33, -6, 750, 639), "Ntilde": (-9, -18, 865, 801), "Ograve": (81, -13, 674, 845), "Oacute": (81, -13, 674, 845), "Ocircumflex": (81, -13, 674, 861), "Otilde": (81, -13, 674, 801), "Odieresis": (81, -13, 674, 786), "multiply": (131, 73, 606, 548), "Oslash": (81, -16, 674, 650), "Ugrave": (115, -15, 784, 845), "Uacute": (115, -15, 784, 845), "Ucircumflex": (115, -15, 784, 861), "Udieresis": (115, -15, 784, 786), "Yacute": (71, -3, 760, 845), "Thorn": (22, -6, 556, 642), "germandbls": (-145, -250, 
538, 648), "agrave": (38, -12, 445, 612), "aacute": (38, -12, 444, 611), "acircumflex": (38, -12, 429, 622), "atilde": (38, -12, 476, 589), "adieresis": (38, -12, 495, 574), "aring": (38, -12, 426, 616), "ae": (26, -13, 514, 406), "ccedilla": (-7, -223, 334, 400), "egrave": (50, -16, 335, 612), "eacute": (50, -16, 382, 611), "ecircumflex": (50, -16, 367, 622), "edieresis": (50, -16, 399, 574), "igrave": (38, -9, 302, 612), "iacute": (38, -9, 349, 611), "icircumflex": (38, -9, 341, 622), "idieresis": (38, -9, 378, 574), "eth": (58, -13, 425, 642), "ntilde": (45, -14, 536, 589), "ograve": (55, -11, 369, 612), "oacute": (55, -11, 416, 611), "ocircumflex": (55, -11, 401, 622), "otilde": (55, -11, 448, 589), "odieresis": (55, -11, 433, 574), "divide": (106, 81, 630, 543), "oslash": (43, -10, 373, 400), "ugrave": (38, -12, 452, 612), "uacute": (38, -12, 455, 611), "ucircumflex": (38, -12, 452, 622), "udieresis": (38, -12, 472, 574), "yacute": (-215, -243, 404, 611), "thorn": (-141, -252, 409, 648), "ydieresis": (-215, -243, 363, 574), }, } ================================================ FILE: babeldoc/format/pdf/converter.py ================================================ import logging import re import unicodedata import numpy as np from pymupdf import Font from babeldoc.format.pdf.document_il.frontend.il_creater import ILCreater from babeldoc.pdfminer.converter import PDFConverter from babeldoc.pdfminer.layout import LTChar from babeldoc.pdfminer.layout import LTComponent from babeldoc.pdfminer.layout import LTCurve from babeldoc.pdfminer.layout import LTFigure from babeldoc.pdfminer.layout import LTLine from babeldoc.pdfminer.layout import LTPage from babeldoc.pdfminer.layout import LTText from babeldoc.pdfminer.pdfcolor import PDFColorSpace from babeldoc.pdfminer.pdffont import PDFCIDFont from babeldoc.pdfminer.pdffont import PDFFont from babeldoc.pdfminer.pdffont import PDFUnicodeNotDefined from babeldoc.pdfminer.pdfinterp import PDFGraphicState from 
class PDFConverterEx(PDFConverter):
    """PDFConverter variant that reports page, figure and glyph events to an ILCreater.

    Page geometry and page numbers are pushed to ``il_creater`` in
    ``begin_page``; every rendered glyph is wrapped in an ``AWLTChar`` that
    carries the font id, xobject id and render order taken from ``il_creater``.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        il_creater: ILCreater | None = None,
    ) -> None:
        PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)
        self.il_creater = il_creater

    def begin_page(self, page, ctm) -> None:
        """Start a page whose media box is the transformed crop box, moved to the origin."""
        left, bottom, right, top = page.cropbox
        left, bottom = apply_matrix_pt(ctm, (left, bottom))
        right, top = apply_matrix_pt(ctm, (right, top))
        # Only the extent of the transformed crop box is kept; the box is
        # re-anchored at (0, 0).
        mediabox = (0, 0, abs(left - right), abs(bottom - top))
        self.il_creater.on_page_media_box(*mediabox)
        self.il_creater.on_page_number(page.pageno)
        self.cur_item = LTPage(page.pageno, mediabox)

    def end_page(self, _page) -> None:
        """Finish the page by handing the collected layout to ``receive_layout``."""
        return self.receive_layout(self.cur_item)

    def begin_figure(self, name, bbox, matrix) -> None:
        """Open a nested figure that inherits the page id of its parent item."""
        parent = self.cur_item
        self._stack.append(parent)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        self.cur_item.pageid = parent.pageid

    def end_figure(self, _: str) -> None:
        """Close the current figure, attach it to its parent and process its layout.

        Raises:
            ValueError: if the item being closed is not an ``LTFigure``.
        """
        figure = self.cur_item
        if not isinstance(figure, LTFigure):
            raise ValueError(f"Unexpected item type: {type(figure)}")
        parent = self._stack.pop()
        self.cur_item = parent
        parent.add(figure)
        return self.receive_layout(figure)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one glyph to the current item and return its advance width.

        The created ``AWLTChar`` additionally gets the original ``cid`` and
        ``font`` object attached as attributes.
        """
        try:
            text = font.to_unichr(cid)
            if not isinstance(text, str):
                raise TypeError(f"Expected string, got {type(text)}")
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)

        # Prefer the id already stored on the font; otherwise resolve it
        # through the xobject-id -> font-id map of the current page.
        font_id = font.font_id_temp
        if font_id is None:
            if hasattr(font, "xobj_id"):
                font_id = self.il_creater.current_page_font_name_id_map.get(
                    font.xobj_id, None
                )
            else:
                log.debug(
                    f"Font {font.fontname} does not have xobj_id attribute.",
                )
                font_id = "UNKNOW"

        glyph = AWLTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
            self.il_creater.xobj_id,
            font_id,
            self.il_creater.get_render_order_and_increase(),
        )
        self.cur_item.add(glyph)
        glyph.cid = cid  # hack: keep the original character code
        glyph.font = font  # hack: keep the original font object
        return glyph.adv
class AWLTChar(LTChar):
    """Actual letter in the text as a Unicode string.

    Besides the glyph text and bounding box, the instance records the
    xobject id, the resolved font id (``aw_font_id``) and the render order
    it was created with.
    """

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
        xobj_id: int,
        font_id: str,
        render_order: int,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.xobj_id = xobj_id
        self.adv = textwidth * fontsize * scaling
        self.aw_font_id = font_id
        self.render_order = render_order

        # Glyph box in text space, before the text matrix is applied.
        if font.is_vertical():
            assert isinstance(textdisp, tuple)
            disp_x, disp_y = textdisp
            disp_x = fontsize * 0.5 if disp_x is None else disp_x * fontsize * 0.001
            disp_y = (1000 - disp_y) * fontsize * 0.001
            lower_left = (-disp_x, disp_y + rise + self.adv)
            upper_right = (-disp_x + fontsize, disp_y + rise)
        else:
            descent = font.get_descent() * fontsize
            lower_left = (0, descent + rise)
            upper_right = (self.adv, descent + rise + fontsize)

        a, b, c, d, _e, _f = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0

        # Map both corners to device space and normalise so (x0, y0) is the
        # lower-left corner.
        x0, y0 = apply_matrix_pt(self.matrix, lower_left)
        x1, y1 = apply_matrix_pt(self.matrix, upper_right)
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0
        LTComponent.__init__(self, (x0, y0, x1, y1))

        # Vertical fonts and matrices with a zero first entry report the box
        # width as the size; everything else uses the box height.
        use_width = font.is_vertical() or matrix[0] == 0
        self.size = self.width if use_width else self.height
        return

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"

    def get_text(self) -> str:
        """Return the Unicode text of this glyph."""
        return self._text
class Paragraph:
    """Mutable record describing where and how one paragraph is laid out."""

    def __init__(self, y, x, x0, x1, size, brk):
        # Starting position of the paragraph.
        self.y: float = y  # initial vertical coordinate
        self.x: float = x  # initial horizontal coordinate
        # Horizontal extent of the paragraph.
        self.x0: float = x0  # left boundary
        self.x1: float = x1  # right boundary
        # Typographic properties.
        self.size: float = size  # font size
        self.brk: bool = brk  # line-break marker
class TranslateConverter(PDFConverterEx):
    """Converter that feeds every layout item of a page into the IL creator.

    The constructor keeps the historical parameter list so existing callers
    continue to work; several parameters are accepted but not used.
    """

    def __init__(
        self,
        rsrcmgr,
        vfont: str | None = None,
        vchar: str | None = None,
        thread: int = 0,
        layout: dict | None = None,
        lang_in: str = "",  # kept for compatibility, unused
        _lang_out: str = "",  # unused
        _service: str = "",  # unused
        resfont: str = "",
        noto: Font | None = None,
        envs: dict | None = None,
        _prompt: list | None = None,  # unused
        il_creater: ILCreater | None = None,
    ):
        layout = layout or {}
        super().__init__(rsrcmgr, il_creater)
        self.vfont = vfont
        self.vchar = vchar
        self.thread = thread
        self.layout = layout
        self.resfont = resfont
        self.noto = noto

    def receive_layout(self, ltpage: LTPage):
        """Dispatch each child of ``ltpage`` to the matching IL creator hook.

        * ``LTChar``   -> ``il_creater.on_lt_char`` (errors are logged and the
          character is skipped, so one bad glyph does not abort the page)
        * ``LTFigure`` -> ``il_creater.on_pdf_figure``
        * ``LTCurve``  -> ``il_creater.on_lt_curve``

        Any other child type is ignored. Always returns ``None``.

        NOTE(review): this method used to carry a long paragraph/formula
        grouping pass and a content-stream typesetting pass. Both were
        unreachable: the character branch ``continue``d right after calling
        the IL creator, and a bare ``return`` preceded the typesetting code,
        which additionally read ``self.fontmap`` / ``self.fontid`` that
        ``__init__`` never assigns. None of their local state escaped the
        function, so removing them does not change what callers observe.
        Confirm against version control if the early exits were ever meant to
        be temporary.
        """
        for child in ltpage:
            # The isinstance order matters: characters first, then figures,
            # then curves.
            if isinstance(child, LTChar):
                try:
                    self.il_creater.on_lt_char(child)
                except Exception:
                    log.exception(
                        'Error processing LTChar',
                    )
            elif isinstance(child, LTFigure):
                self.il_creater.on_pdf_figure(child)
            elif isinstance(child, LTCurve):
                self.il_creater.on_lt_curve(child)
        return
================================================ from babeldoc.format.pdf.document_il.il_version_1 import BaseOperations from babeldoc.format.pdf.document_il.il_version_1 import Box from babeldoc.format.pdf.document_il.il_version_1 import Cropbox from babeldoc.format.pdf.document_il.il_version_1 import Document from babeldoc.format.pdf.document_il.il_version_1 import GraphicState from babeldoc.format.pdf.document_il.il_version_1 import Mediabox from babeldoc.format.pdf.document_il.il_version_1 import Page from babeldoc.format.pdf.document_il.il_version_1 import PageLayout from babeldoc.format.pdf.document_il.il_version_1 import PdfAffineTransform from babeldoc.format.pdf.document_il.il_version_1 import PdfCharacter from babeldoc.format.pdf.document_il.il_version_1 import PdfCurve from babeldoc.format.pdf.document_il.il_version_1 import PdfFigure from babeldoc.format.pdf.document_il.il_version_1 import PdfFont from babeldoc.format.pdf.document_il.il_version_1 import PdfFontCharBoundingBox from babeldoc.format.pdf.document_il.il_version_1 import PdfForm from babeldoc.format.pdf.document_il.il_version_1 import PdfFormSubtype from babeldoc.format.pdf.document_il.il_version_1 import PdfFormula from babeldoc.format.pdf.document_il.il_version_1 import PdfInlineForm from babeldoc.format.pdf.document_il.il_version_1 import PdfLine from babeldoc.format.pdf.document_il.il_version_1 import PdfMatrix from babeldoc.format.pdf.document_il.il_version_1 import PdfOriginalPath from babeldoc.format.pdf.document_il.il_version_1 import PdfParagraph from babeldoc.format.pdf.document_il.il_version_1 import PdfParagraphComposition from babeldoc.format.pdf.document_il.il_version_1 import PdfPath from babeldoc.format.pdf.document_il.il_version_1 import PdfRectangle from babeldoc.format.pdf.document_il.il_version_1 import PdfSameStyleCharacters from babeldoc.format.pdf.document_il.il_version_1 import PdfSameStyleUnicodeCharacters from babeldoc.format.pdf.document_il.il_version_1 import 
PdfStyle from babeldoc.format.pdf.document_il.il_version_1 import PdfXobject from babeldoc.format.pdf.document_il.il_version_1 import PdfXobjForm from babeldoc.format.pdf.document_il.il_version_1 import VisualBbox __all__ = [ "BaseOperations", "Box", "Cropbox", "Document", "GraphicState", "Mediabox", "Page", "PageLayout", "PdfAffineTransform", "PdfCharacter", "PdfCurve", "PdfFigure", "PdfFont", "PdfFontCharBoundingBox", "PdfForm", "PdfFormSubtype", "PdfFormula", "PdfInlineForm", "PdfLine", "PdfMatrix", "PdfOriginalPath", "PdfParagraph", "PdfParagraphComposition", "PdfPath", "PdfRectangle", "PdfSameStyleCharacters", "PdfSameStyleUnicodeCharacters", "PdfStyle", "PdfXobjForm", "PdfXobject", "VisualBbox", ] ================================================ FILE: babeldoc/format/pdf/document_il/backend/__init__.py ================================================ ================================================ FILE: babeldoc/format/pdf/document_il/backend/pdf_creater.py ================================================ import io import itertools import logging import os import re import time import unicodedata from abc import ABC from abc import abstractmethod from multiprocessing import Process from pathlib import Path import freetype import pymupdf from bitstring import BitStream from babeldoc.assets.embedding_assets_metadata import FONT_NAMES from babeldoc.format.pdf.document_il import PdfOriginalPath from babeldoc.format.pdf.document_il import il_version_1 from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper from babeldoc.format.pdf.document_il.utils.matrix_helper import matrix_to_bytes from babeldoc.format.pdf.document_il.utils.zstd_helper import zstd_decompress from babeldoc.format.pdf.translation_config import TranslateResult from babeldoc.format.pdf.translation_config import TranslationConfig from babeldoc.format.pdf.translation_config import WatermarkOutputMode logger = logging.getLogger(__name__) SUBSET_FONT_STAGE_NAME = "Subset font" 
class RenderUnit(ABC):
    """Abstract base class for all renderable units.

    A unit knows the xobject it belongs to (``xobj_id``) and a two-level
    ordering key (``render_order``, ``sub_render_order``). Passing ``None``
    for either order places the unit after every explicitly ordered unit.
    """

    def __init__(
        self,
        render_order: int,
        sub_render_order: int = 0,
        xobj_id: str | None = None,
    ):
        # Sentinel used for "no explicit order": sorts after any real order.
        unordered = 9999999999999999
        self.render_order = unordered if render_order is None else render_order
        self.sub_render_order = (
            unordered if sub_render_order is None else sub_render_order
        )
        self.xobj_id = xobj_id

    @abstractmethod
    def render(
        self,
        draw_op: BitStream,
        context: "RenderContext",
    ) -> None:
        """Render this unit to the draw_op BitStream."""
        pass

    def get_sort_key(self) -> tuple[int, int]:
        """Get the sort key for ordering render units."""
        return (self.render_order, self.sub_render_order)


class CharacterRenderUnit(RenderUnit):
    """Render unit for PDF characters."""

    def __init__(
        self,
        char: il_version_1.PdfCharacter,
        render_order: int,
        sub_render_order: int = 0,
    ):
        super().__init__(render_order, sub_render_order, char.xobj_id)
        self.char = char

    def render(self, draw_op: BitStream, context: "RenderContext") -> None:
        """Append the operators that draw this character to ``draw_op``.

        Nothing is written when the character is a newline, has no
        ``pdf_character_id``, uses a font that is not available (only checked
        when ``context.check_font_exists`` is set), or uses a font whose
        encoding length is unknown. Otherwise one balanced
        ``q … BT … Tj ET Q`` sequence is appended.
        """
        char = self.char
        if char.char_unicode == "\n":
            return
        if char.pdf_character_id is None:
            return

        char_size = char.pdf_style.font_size
        font_id = char.pdf_style.font_id

        # Get encoding length map based on xobj_id
        if self.xobj_id in context.xobj_encoding_length_map:
            encoding_length_map = context.xobj_encoding_length_map[self.xobj_id]
        else:
            encoding_length_map = context.page_encoding_length_map

        # Check font exists if needed
        if context.check_font_exists:
            if self.xobj_id in context.xobj_available_fonts:
                if font_id not in context.xobj_available_fonts[self.xobj_id]:
                    return
            elif font_id not in context.available_font_list:
                return

        # Resolve the encoding length BEFORE emitting any operator. Bailing
        # out after "q … BT … Tm" had been written would leave an unbalanced
        # graphics-state save and an unterminated text object in the stream.
        encoding_length = encoding_length_map.get(font_id, None)
        if encoding_length is None:
            if font_id in context.all_encoding_length_map:
                encoding_length = context.all_encoding_length_map[font_id]
            else:
                logger.debug(
                    f"Font {font_id} not found in encoding length map for page {context.page.page_number}"
                )
                return

        draw_op.append(b"q ")
        context.pdf_creator.render_graphic_state(draw_op, char.pdf_style.graphic_state)
        if char.vertical:
            # Rotated text matrix, anchored at the right edge of the box.
            draw_op.append(
                f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode(),
            )
        else:
            draw_op.append(
                f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode(),
            )
        # The character code is written as a hex string padded to the font's
        # encoding length (two hex digits per byte).
        draw_op.append(
            f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode(),
        )
        draw_op.append(b" Tj ET Q \n")
class FormRenderUnit(RenderUnit):
    """Render unit for PDF forms (form xobjects and inline images)."""

    def __init__(
        self,
        form: il_version_1.PdfForm,
        render_order: int,
        sub_render_order: int = 0,
    ):
        super().__init__(render_order, sub_render_order, form.xobj_id)
        self.form = form

    def render(self, draw_op: BitStream, context: "RenderContext") -> None:
        """Append a ``q … Q`` section that places and draws the form."""
        form = self.form
        emit = draw_op.append
        emit(b"q ")

        # The relocation transform goes first (before the passthrough
        # instructions) so masks in passthrough_per_char_instruction are
        # interpreted in the correct coordinate system.
        assert form.pdf_matrix is not None
        relocation = form.relocation_transform
        if relocation and len(relocation) == 6:
            try:
                emit(matrix_to_bytes(tuple(float(v) for v in relocation)))
            except (ValueError, TypeError):
                # Conversion failed: skip relocation, the original matrix
                # below is still applied.
                pass

        emit(matrix_to_bytes(form.pdf_matrix))
        emit(b" ")
        emit(form.graphic_state.passthrough_per_char_instruction.encode())
        emit(b" ")

        subtype = form.pdf_form_subtype
        assert subtype is not None
        if subtype.pdf_xobj_form:
            emit(f" /{subtype.pdf_xobj_form.do_args} Do ".encode())
        elif subtype.pdf_inline_form:
            self._emit_inline_image(draw_op, subtype.pdf_inline_form)

        emit(b" Q\n")

    @staticmethod
    def _emit_inline_image(draw_op, inline_form) -> None:
        """Append a ``BI … ID … EI`` inline image built from ``inline_form``."""
        draw_op.append(b" BI ")

        # Image parameters are stored as a JSON object; unparsable JSON is
        # ignored and no parameters are written.
        if inline_form.image_parameters:
            import json

            try:
                parameters = json.loads(inline_form.image_parameters)
                for key, value in parameters.items():
                    name = key.removeprefix("/")  # drop one leading slash
                    # PDF booleans are lowercase.
                    if value is True:
                        rendered = "true"
                    elif value is False:
                        rendered = "false"
                    elif isinstance(value, str) and value in ("True", "False"):
                        rendered = value.lower()
                    else:
                        rendered = value
                    draw_op.append(f"/{name} {rendered} ".encode())
            except json.JSONDecodeError:
                pass

        draw_op.append(b"ID ")

        # Image data is stored base64-encoded; on decode failure no data is
        # written.
        if inline_form.form_data:
            import base64

            try:
                draw_op.append(base64.b64decode(inline_form.form_data))
            except Exception:
                pass

        draw_op.append(b" EI ")
class RectangleRenderUnit(RenderUnit):
    """Render unit for PDF rectangles."""

    def __init__(
        self,
        rectangle: il_version_1.PdfRectangle,
        render_order: int,
        sub_render_order: int = 0,
        line_width: float = 0.4,
    ):
        super().__init__(render_order, sub_render_order, rectangle.xobj_id)
        self.rectangle = rectangle
        self.line_width = line_width

    def render(self, draw_op: BitStream, context: "RenderContext") -> None:
        """Append a ``q … Q`` section that fills or strokes the rectangle.

        The rectangle's own ``line_width`` wins over the unit's default; a
        width that is not positive writes no ``w`` operator.
        """
        rect = self.rectangle
        box = rect.box
        left = box.x
        bottom = box.y
        right = box.x2
        top = box.y2
        width = right - left
        height = top - bottom

        draw_op.append(b"q n ")
        draw_op.append(
            rect.graphic_state.passthrough_per_char_instruction.encode(),
        )

        stroke_width = rect.line_width
        if stroke_width is None:
            stroke_width = self.line_width
        if stroke_width > 0:
            draw_op.append(f" {stroke_width:.6f} w ".encode())

        draw_op.append(f"{left:.6f} {bottom:.6f} {width:.6f} {height:.6f} re ".encode())
        # Fill when flagged as a background, otherwise stroke the outline.
        draw_op.append(b" f " if rect.fill_background else b" S ")
        draw_op.append(b"Q\n")
curve = self.curve draw_op.append(b"q n ") # Apply relocation transform first if present (before passthrough instructions) # This ensures masks in passthrough_per_char_instruction use the correct coordinate system if curve.relocation_transform and len(curve.relocation_transform) == 6: try: relocation_matrix = tuple(float(x) for x in curve.relocation_transform) draw_op.append(matrix_to_bytes(relocation_matrix)) except (ValueError, TypeError): # If relocation transform conversion fails, skip it and use original CTM later pass draw_op.append(b" ") # Apply original CTM if present if curve.ctm and len(curve.ctm) == 6: ctm = curve.ctm draw_op.append( f"{ctm[0]:.6f} {ctm[1]:.6f} {ctm[2]:.6f} {ctm[3]:.6f} {ctm[4]:.6f} {ctm[5]:.6f} cm ".encode() ) draw_op.append(b" ") draw_op.append( curve.graphic_state.passthrough_per_char_instruction.encode(), ) draw_op.append(b" ") path_op = BitStream(b" ") # Use original path if available, otherwise fall back to transformed path path_to_use = ( curve.pdf_original_path if curve.pdf_original_path is not None else curve.pdf_path ) for path in path_to_use: if isinstance(path, PdfOriginalPath): path = path.pdf_path if path.has_xy: path_op.append(f"{path.x:F} {path.y:F} {path.op} ".encode()) else: path_op.append(f"{path.op} ".encode()) if curve.fill_background: draw_op.append(path_op) draw_op.append(b" f") if curve.evenodd: draw_op.append(b"* ") else: draw_op.append(b" ") if curve.stroke_path: draw_op.append(path_op) draw_op.append(b"S ") # final_op = b' B ' draw_op.append(b" n Q\n") class RenderContext: """Context object containing shared state for rendering.""" def __init__( self, pdf_creator: "PDFCreater", page: il_version_1.Page, available_font_list: set[str], page_encoding_length_map: dict[str, int], all_encoding_length_map: dict[str, int], xobj_available_fonts: dict[str, set[str]], xobj_encoding_length_map: dict[str, dict[str, int]], ctm_for_ops: bytes, check_font_exists: bool = False, ): self.pdf_creator = pdf_creator self.page = page 
# One hexadecimal token such as ``<00AF>``; the value is read via group "num".
_HEX_TOKEN_RE = re.compile(rb"<(?P<num>[a-fA-F0-9]+)>")
# Body of a ``beginbfrange ... endbfrange`` section (group "r").
_BFRANGE_RE = re.compile(
    rb"\s+beginbfrange\s*(?P<r>(<[0-9a-fA-F]+>\s*)+)endbfrange\s+"
)
# Body of a ``beginbfchar ... endbfchar`` section (group "c").
_BFCHAR_RE = re.compile(rb"\s+beginbfchar\s*(?P<c>(<[0-9a-fA-F]+>\s*)+)endbfchar")


def to_int(src):
    """Return the first run of decimal digits found in *src* as an int.

    Raises:
        AttributeError: If *src* contains no digit.
    """
    return int(re.search(r"\d+", src).group(0))


def parse_mapping(text):
    """Return every ``<hex>`` token of the bytes *text* as an int, in order."""
    return [int(token.group("num"), 16) for token in _HEX_TOKEN_RE.finditer(text)]


def apply_normalization(cmap, gid, code):
    """Store ``cmap[gid]``, NFD-normalizing selected CJK code points.

    Code points in the Kangxi Radicals (U+2F00-U+2FD5) and CJK Compatibility
    Ideographs (U+F900-U+FAFF) ranges go through ``unicodedata.normalize``;
    every other code point is stored unchanged.
    """
    # NOTE(review): NFD applies canonical decompositions only. Kangxi radicals
    # appear to carry compatibility mappings, so they may pass through
    # unchanged here - confirm whether NFKD was intended.
    if 0x2F00 <= code <= 0x2FD5 or 0xF900 <= code <= 0xFAFF:
        cmap[gid] = ord(unicodedata.normalize("NFD", chr(code)))
    else:
        cmap[gid] = code


def batched(iterable, n, *, strict=False):
    """Yield successive tuples of at most *n* items from *iterable*.

    ``batched('ABCDEFG', 3)`` yields ``ABC``, ``DEF``, ``G``.

    Raises:
        ValueError: If *n* is below one, or if *strict* is set and the last
            batch is shorter than *n*.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    while batch := tuple(itertools.islice(iterator, n)):
        if strict and len(batch) != n:
            raise ValueError("batched(): incomplete batch")
        yield batch


def update_tounicode_cmap_pair(cmap, data):
    """Apply ``(start, stop, value)`` bfrange triples from *data* to *cmap*."""
    for start, stop, value in batched(data, 3):
        for gid in range(start, stop + 1):
            # Consecutive glyph ids map to consecutive code points.
            apply_normalization(cmap, gid, value + gid - start)


def update_tounicode_cmap_code(cmap, data):
    """Apply ``(gid, code)`` bfchar pairs from *data* to *cmap*."""
    for gid, code in batched(data, 2):
        apply_normalization(cmap, gid, code)


def parse_tounicode_cmap(data):
    """Parse the bytes of a ToUnicode CMap stream into ``{gid: code point}``.

    bfrange sections are applied first, then bfchar sections, so a bfchar
    entry overrides a range entry for the same glyph id.
    """
    cmap = {}
    for section in _BFRANGE_RE.finditer(data):
        update_tounicode_cmap_pair(cmap, parse_mapping(section.group("r")))
    for section in _BFCHAR_RE.finditer(data):
        update_tounicode_cmap_code(cmap, parse_mapping(section.group("c")))
    return cmap
# Fixed prologue of a generated ToUnicode CMap: identity CIDSystemInfo and a
# single two-byte codespace range covering <0000>..<FFFF>.
TOUNICODE_HEAD = """\
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo <</Registry(Adobe)/Ordering(UCS)/Supplement 0>> def
/CMapName /Adobe-Identity-UCS def
/CMapType 2 def
1 begincodespacerange
<0000><FFFF>
endcodespacerange"""

# Fixed epilogue of a generated ToUnicode CMap.
TOUNICODE_TAIL = """\
endcmap
CMapName currentdict /CMap defineresource pop
end
end"""

# Entries are emitted in ``beginbfchar`` sections of at most this many pairs.
_MAX_BFCHAR_PER_SECTION = 100


def make_tounicode(cmap, used):
    """Build the text of a ToUnicode CMap for the glyphs in *used*.

    Args:
        cmap: Mapping of glyph id to Unicode code point.
        used: Iterable of glyph ids to include; ids missing from *cmap*
            are skipped.

    Returns:
        The CMap source as a newline-joined string.
    """
    pairs = [(gid, cmap[gid]) for gid in used if gid in cmap]
    lines = [TOUNICODE_HEAD]
    for offset in range(0, len(pairs), _MAX_BFCHAR_PER_SECTION):
        section = pairs[offset : offset + _MAX_BFCHAR_PER_SECTION]
        lines.append(f"{len(section)} beginbfchar")
        for glyph, code in section:
            if code < 0x10000:
                lines.append(f"<{glyph:04x}><{code:04x}>")
            else:
                # Code points beyond the BMP are written as a UTF-16
                # surrogate pair.
                code -= 0x10000
                high = 0xD800 + (code >> 10)
                low = 0xDC00 + (code & 0b1111111111)
                lines.append(f"<{glyph:04x}><{high:04x}{low:04x}>")
        lines.append("endbfchar")
    lines.append(TOUNICODE_TAIL)
    return "\n".join(lines)


def reproduce_one_font(doc, index):
    """Rewrite the ToUnicode stream of the font object at xref *index*.

    Only fonts whose ``ToUnicode`` entry is an indirect reference and whose
    ``DescendantFonts`` entry is an array are touched. The new CMap keeps
    only glyphs that have outline contours in the embedded FontFile2 data.
    """
    m = doc.xref_get_key(index, "ToUnicode")
    f = doc.xref_get_key(index, "DescendantFonts")
    if m[0] != "xref" or f[0] != "array":
        return
    mi = to_int(m[1])
    fi = to_int(f[1])
    ff = doc.xref_get_key(fi, "FontDescriptor/FontFile2")
    if ff[0] != "xref":
        # No embedded TrueType program to inspect; leave the font untouched
        # instead of failing while parsing a non-reference value.
        logger.debug(f"Font {index} has no FontFile2 reference, skip ToUnicode rebuild")
        return
    ms = doc.xref_stream(mi)
    fs = doc.xref_stream(to_int(ff[1]))
    cmap = parse_tounicode_cmap(ms)
    used = parse_truetype_data(fs)
    text = make_tounicode(cmap, used)
    doc.update_stream(mi, bytes(text, "U8"))


def reproduce_cmap(doc):
    """Rebuild the ToUnicode CMaps of the matching TrueType fonts in *doc*.

    A font qualifies when ``page.get_fonts()`` reports type ``ttf``, a name
    listed in ``FONT_NAMES`` and ``.ttf`` in its fifth field. Errors while
    listing a page's fonts are logged and that page is skipped.

    Returns:
        The same *doc* object.
    """
    assert doc
    font_set = set()
    for page in doc:
        try:
            font_list = page.get_fonts()
            for font in font_list:
                if font[1] == "ttf" and font[3] in FONT_NAMES and ".ttf" in font[4]:
                    font_set.add(font)
        except Exception as e:
            logger.error(f"Error in getting page fonts: {e}")
    for font in font_set:
        reproduce_one_font(doc, font[0])
    return doc
def _save_pdf_clean_process(
    pdf_path,
    output_path,
    garbage=1,
    deflate=True,
    clean=True,
    deflate_fonts=True,
    linear=False,
):
    """Subprocess entry point: re-save a PDF with the given save options.

    Saving with ``clean=True`` can be time-consuming, so the caller runs this
    in a separate process. The function never returns: it ends the process
    with exit code 0 on success and 1 on failure.

    Args:
        pdf_path: Path to the PDF file to save
        output_path: Path where to save the result
        garbage: Garbage collection level (0, 1, 2, 3, 4)
        deflate: Whether to deflate the PDF
        clean: Whether to clean the PDF
        deflate_fonts: Whether to deflate fonts
        linear: Whether to linearize the PDF
    """
    save_options = {
        "garbage": garbage,
        "deflate": deflate,
        "clean": clean,
        "deflate_fonts": deflate_fonts,
        "linear": linear,
    }
    try:
        document = pymupdf.open(pdf_path)
        document.save(output_path, **save_options)
    except Exception as e:
        logger.error(f"Error in save PDF with clean=True subprocess: {e}")
        # Exit code 1 signals failure to the parent process.
        os._exit(1)
    # Exit code 0 signals success to the parent process.
    os._exit(0)
def render_paragraph_to_char(
    self,
    paragraph: "il_version_1.PdfParagraph",
) -> "list[il_version_1.PdfCharacter]":
    """Flatten a paragraph into the list of characters to draw.

    Plain character compositions contribute their character; formula
    compositions contribute all characters of the formula. Any other
    composition type is logged as an error and skipped.

    Args:
        paragraph: Paragraph whose compositions are flattened.

    Returns:
        The characters in composition order (possibly empty).
    """
    chars = []
    for composition in paragraph.pdf_paragraph_composition:
        if composition.pdf_character:
            chars.append(composition.pdf_character)
        elif composition.pdf_formula:
            # Flatten formula: extract all characters from the formula
            chars.extend(composition.pdf_formula.pdf_character)
        else:
            logger.error(
                f"Unknown composition type. "
                f"This type only appears in the IL "
                f"after the translation is completed. "
                f"During pdf rendering, this type is not supported. "
                f"Composition: {composition}. "
                f"Paragraph: {paragraph}. ",
            )
    if not chars and paragraph.unicode and paragraph.debug_id:
        # The paragraph has text but produced no drawable characters.
        logger.error(
            f"Unable to export paragraphs that have "
            f"not yet been formatted: {paragraph}",
        )
    return chars
def render_units_to_stream(
    self,
    render_units: "list[RenderUnit]",
    context: "RenderContext",
    page_op: "BitStream",
    xobj_draw_ops: "dict[str, BitStream]",
) -> None:
    """Render every unit, in sort-key order, into its target stream.

    A unit whose ``xobj_id`` is a key of *xobj_draw_ops* is rendered into
    that XObject's stream; every other unit is rendered into *page_op*.
    """
    ordered_units = list(render_units)
    ordered_units.sort(key=lambda unit: unit.get_sort_key())
    for unit in ordered_units:
        # Fall back to the page stream when the unit belongs to no XObject.
        target_stream = xobj_draw_ops.get(unit.xobj_id, page_op)
        unit.render(target_stream, context)
xref_id = xref_id.group(1) font_dict = pdf.xref_object(int(xref_id)) else: search = re.search("/Font *<<(.+?)>>", r_id.replace("\n", " ")) if search is None: # Have resources but no fonts return set() font_dict = search.group(1) else: r_id = int(r_id.split(" ")[0]) _, font_dict = pdf.xref_get_key(r_id, "Font") fonts = re.findall("/([^ ]+?) ", font_dict) return set(fonts) except Exception: return set() def _render_rectangle( self, draw_op: BitStream, rectangle: il_version_1.PdfRectangle, line_width: float = 0.4, ): """Draw a rectangle in PDF for visualization purposes. Args: draw_op: BitStream to append PDF drawing operations rectangle: Rectangle object containing position information line_width: Line width """ x1 = rectangle.box.x y1 = rectangle.box.y x2 = rectangle.box.x2 y2 = rectangle.box.y2 width = x2 - x1 height = y2 - y1 # Save graphics state draw_op.append(b"q ") # Set green color for debug visibility draw_op.append( rectangle.graphic_state.passthrough_per_char_instruction.encode(), ) # Green stroke if rectangle.line_width is not None: line_width = rectangle.line_width if line_width > 0: draw_op.append(f" {line_width:.6f} w ".encode()) # Line width draw_op.append(f"{x1:.6f} {y1:.6f} {width:.6f} {height:.6f} re ".encode()) if rectangle.fill_background: draw_op.append(b" f ") else: draw_op.append(b" S ") # Restore graphics state draw_op.append(b" n Q\n") def create_side_by_side_dual_pdf( self, original_pdf: pymupdf.Document, translated_pdf: pymupdf.Document, dual_out_path: str, translation_config: TranslationConfig, ) -> pymupdf.Document: """Create a dual PDF with side-by-side pages (original and translation). 
Args: original_pdf: Original PDF document translated_pdf: Translated PDF document dual_out_path: Output path for the dual PDF translation_config: Translation configuration Returns: The created dual PDF document """ # Create a new PDF for side-by-side pages dual = pymupdf.open() page_count = min(original_pdf.page_count, translated_pdf.page_count) for page_id in range(page_count): # Get pages from both PDFs orig_page = original_pdf[page_id] trans_page = translated_pdf[page_id] rotate_angle = orig_page.rotation total_width = orig_page.rect.width + trans_page.rect.width max_height = max(orig_page.rect.height, trans_page.rect.height) left_width = ( orig_page.rect.width if not translation_config.dual_translate_first else trans_page.rect.width ) orig_page.set_rotation(0) trans_page.set_rotation(0) # Create new page with combined width dual_page = dual.new_page(width=total_width, height=max_height) # Define rectangles for left and right sides rect_left = pymupdf.Rect(0, 0, left_width, max_height) rect_right = pymupdf.Rect(left_width, 0, total_width, max_height) # Show pages according to dual_translate_first setting if translation_config.dual_translate_first: # Show translated page on left and original on right rect_left, rect_right = rect_right, rect_left try: # Show original page on left and translated on right (default) dual_page.show_pdf_page( rect_left, original_pdf, page_id, keep_proportion=True, rotate=-rotate_angle, ) except Exception as e: logger.warning( f"Failed to show original page on left and translated on right (default). " f"Page ID: {page_id}. " f"Original PDF: {self.original_pdf_path}. " f"Translated PDF: {translation_config.input_file}. ", exc_info=e, ) try: dual_page.show_pdf_page( rect_right, translated_pdf, page_id, keep_proportion=True, rotate=-rotate_angle, ) except Exception as e: logger.warning( f"Failed to show translated page on left and original on right. " f"Page ID: {page_id}. " f"Original PDF: {self.original_pdf_path}. 
def create_alternating_pages_dual_pdf(
    self,
    original_pdf: "pymupdf.Document",
    translated_pdf: "pymupdf.Document",
    translation_config: "TranslationConfig",
) -> "pymupdf.Document":
    """Create a dual PDF with alternating pages (original and translation).

    The translated pages are appended to *original_pdf* and then moved so
    that each one sits next to its original page. *original_pdf* is modified
    in place and is also the returned object.

    Args:
        original_pdf: Original PDF document; receives the translated pages
        translated_pdf: Translated PDF document
        translation_config: Translation configuration; its
            ``dual_translate_first`` flag puts the translated page before
            the original one

    Returns:
        The created dual PDF document
    """
    dual = original_pdf
    # Inserted pages start right after the ORIGINAL document's last page, so
    # the offset must be taken from the original count before inserting.
    first_translated_index = dual.page_count
    translated_count = translated_pdf.page_count
    dual.insert_file(translated_pdf)

    slot_shift = 0 if translation_config.dual_translate_first else 1
    for page_id in range(translated_count):
        source = first_translated_index + page_id
        target = page_id * 2 + slot_shift
        # When the documents differ in length the trailing pages are already
        # in place (or have no partner); moving them would be a no-op or
        # address a page number past the end.
        if target < source:
            dual.move_page(source, target)
    return dual
@staticmethod
def subset_fonts_in_subprocess(
    pdf: "pymupdf.Document",
    translation_config: "TranslationConfig",
    tag: str,
    timeout: float = 60,
) -> "pymupdf.Document":
    """Run font subsetting in a subprocess with timeout.

    The document is saved to a temporary working file, subsetted by
    ``_subset_fonts_process`` in a child process, and the result is reopened.

    Args:
        pdf: The PDF document object
        translation_config: Translation configuration (provides the working
            file paths)
        tag: Suffix that keeps the temporary file names of different callers
            apart
        timeout: Seconds to wait for the subprocess before giving up

    Returns:
        A document opened from the subsetted file, or the original *pdf*
        object if subsetting failed or timed out
    """
    original_pdf = pdf

    # Create temporary file paths
    temp_input = str(
        translation_config.get_working_file_path(f"temp_subset_input_{tag}.pdf")
    )
    temp_output = str(
        translation_config.get_working_file_path(f"temp_subset_output_{tag}.pdf")
    )

    # Save PDF to temporary file without subsetting
    pdf.save(temp_input)

    process = Process(target=_subset_fonts_process, args=(temp_input, temp_output))
    process.start()

    # join() returns as soon as the child exits or the timeout elapses.
    process.join(timeout)
    if process.is_alive():
        logger.warning(
            f"Font subsetting timeout after {timeout} seconds, terminating subprocess"
        )
        try:
            process.terminate()
            process.join(5)  # Give it 5 seconds to clean up
            if process.is_alive():
                logger.warning("Subprocess did not terminate, killing it")
                process.kill()
                process.join(5)  # reap the killed child
        except Exception as e:
            logger.error(f"Error terminating font subsetting process: {e}")
        return original_pdf

    # Process completed, check exit code
    exit_code = process.exitcode
    output_file = Path(temp_output)
    if exit_code == 0 and output_file.exists() and output_file.stat().st_size > 0:
        logger.info("Font subsetting completed successfully")
        return pymupdf.open(temp_output)

    logger.warning(
        f"Font subsetting failed with exit code {exit_code} or produced empty file"
    )
    return original_pdf
PDF document with a timeout for the clean=True operation. Args: pdf: The PDF document object output_path: Path where to save the PDF translation_config: Translation configuration garbage: Garbage collection level (0, 1, 2, 3, 4) deflate: Whether to deflate the PDF clean: Whether to clean the PDF deflate_fonts: Whether to deflate fonts linear: Whether to linearize the PDF timeout: Timeout in seconds (default: 2 minutes) Returns: True if saved with clean=True successfully, False if fallback to clean=False was used """ # Create temporary file paths temp_input = str( translation_config.get_working_file_path(f"temp_save_input_{tag}.pdf") ) temp_output = str( translation_config.get_working_file_path(f"temp_save_output_{tag}.pdf") ) # Save PDF to temporary file first pdf.save(temp_input) # Try to save with clean=True in a subprocess process = Process( target=_save_pdf_clean_process, args=( temp_input, temp_output, garbage, deflate, clean, deflate_fonts, linear, ), ) process.start() # Wait for subprocess with timeout start_time = time.time() while process.is_alive(): if time.time() - start_time > timeout: logger.warning( f"PDF save with clean={clean} timeout after {timeout} seconds, terminating subprocess" ) process.terminate() try: process.join(5) # Give it 5 seconds to clean up if process.is_alive(): logger.warning("Subprocess did not terminate, killing it") process.kill() process.terminate() process.kill() process.terminate() process.kill() process.terminate() except Exception as e: logger.error(f"Error terminating PDF save process: {e}") # Fallback to save without clean parameter logger.info("Falling back to save with clean=False") try: pdf.save( output_path, garbage=garbage, deflate=deflate, clean=False, deflate_fonts=deflate_fonts, linear=linear, ) return False except Exception as e: logger.error(f"Error in fallback save: {e}") # Last resort: basic save pdf.save(output_path) return False time.sleep(0.5) # Check every half second # Process completed, check exit code 
def restore_media_box(self, doc: "pymupdf.Document", mediabox_data: dict) -> None:
    """Write saved page-box entries back into *doc*.

    *mediabox_data* maps an xref to a ``{key name: value}`` dict; each pair
    is written with ``doc.xref_set_key``. A failing write is logged at debug
    level and the remaining entries are still processed.
    """
    pending = (
        (xref, name, box)
        for xref, page_box_data in mediabox_data.items()
        for name, box in page_box_data.items()
    )
    for xref, name, box in pending:
        try:
            doc.xref_set_key(xref, name, box)
        except Exception:
            logger.debug(f"Error restoring media box {name} from PDF")
pdf, translation_config ) pbar.advance() translation_config.raise_if_cancelled() gc_level = 1 if self.translation_config.ocr_workaround: gc_level = 4 with self.translation_config.progress_monitor.stage_start( SUBSET_FONT_STAGE_NAME, 1, ) as pbar: if not translation_config.skip_clean: pdf = self.subset_fonts_in_subprocess( pdf, translation_config, tag="mono" ) pbar.advance() try: self.restore_media_box(pdf, self.mediabox_data) except Exception: logger.exception("restore media box failed") if translation_config.only_include_translated_page: total_page = set(range(0, len(pdf))) pages_to_translate = { page.page_number for page in self.docs.page if self.translation_config.should_translate_page( page.page_number + 1 ) } should_removed_page = list(total_page - pages_to_translate) pdf.delete_pages(should_removed_page) with self.translation_config.progress_monitor.stage_start( SAVE_PDF_STAGE_NAME, 2, ) as pbar: if not translation_config.no_mono: if translation_config.debug: translation_config.raise_if_cancelled() pdf.save( f"{mono_out_path}.decompressed.pdf", expand=True, pretty=True, ) translation_config.raise_if_cancelled() self.save_pdf_with_timeout( pdf, mono_out_path, translation_config, garbage=gc_level, deflate=True, clean=not translation_config.skip_clean, deflate_fonts=True, linear=False, tag="mono", ) pbar.advance() dual_out_path = None if not translation_config.no_dual: dual_out_path = translation_config.get_output_file_path( f"{basename}{debug_suffix}.{translation_config.lang_out}.dual.pdf", ) translation_config.raise_if_cancelled() original_pdf = pymupdf.open(self.original_pdf_path) if translation_config.debug: translation_config.raise_if_cancelled() try: original_pdf = self.write_debug_info( original_pdf, translation_config ) except Exception: logger.warning( "Failed to write debug info to dual PDF", exc_info=True, ) if ( self.translation_config.only_include_translated_page and should_removed_page ): original_pdf.delete_pages(should_removed_page) 
def update_page_content_stream(
    self, check_font_exists, page, pdf, translation_config, skip_char: bool = False
):
    """Regenerate the content stream of one page and of its XObjects.

    Render units are created for the page, rendered into a fresh page stream
    or into the stream of the XObject they belong to, and the resulting
    streams are written into *pdf*.

    Args:
        check_font_exists: Passed through to the render context.
        page: IL page to render; must have a cropbox with a box.
        pdf: Document whose streams are updated.
        translation_config: Used for cancellation checks and render options.
        skip_char: When true, character render units are left out.
    """
    assert page.cropbox is not None and page.cropbox.box is not None
    page_crop_box = page.cropbox.box
    # Translation by the negated cropbox origin, emitted as a ``cm`` operator.
    ctm_for_ops = (
        1,
        0,
        0,
        1,
        -page_crop_box.x,
        -page_crop_box.y,
    )
    ctm_for_ops = f" {' '.join(f'{x:f}' for x in ctm_for_ops)} cm ".encode()
    translation_config.raise_if_cancelled()
    # Per-XObject state, keyed by xobj_id.
    xobj_available_fonts = {}
    xobj_draw_ops = {}
    xobj_encoding_length_map = {}
    available_font_list = self.get_available_font_list(pdf, page)

    page_encoding_length_map: dict[str | None, int | None] = {
        f.font_id: f.encoding_length for f in page.pdf_font
    }
    # Union of the page's and every XObject's encoding lengths.
    all_encoding_length_map = page_encoding_length_map.copy()

    for xobj in page.pdf_xobject:
        # An XObject starts from the page's fonts and adds its own.
        xobj_available_fonts[xobj.xobj_id] = available_font_list.copy()
        try:
            xobj_available_fonts[xobj.xobj_id].update(
                self.get_xobj_available_fonts(xobj.xref_id, pdf),
            )
        except Exception:
            # Best effort: on failure only the page-level fonts are used.
            pass
        xobj_encoding_length_map[xobj.xobj_id] = {
            f.font_id: f.encoding_length for f in xobj.pdf_font
        }
        all_encoding_length_map.update(xobj_encoding_length_map[xobj.xobj_id])
        # Page-level entries overwrite same-named XObject entries here.
        xobj_encoding_length_map[xobj.xobj_id].update(page_encoding_length_map)
        # Seed the XObject stream with its decompressed base operations.
        xobj_op = BitStream()
        base_op = xobj.base_operations.value
        base_op = zstd_decompress(base_op)
        xobj_op.append(base_op.encode())
        xobj_draw_ops[xobj.xobj_id] = xobj_op

    page_op = BitStream()
    # q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
    # page_op.append(b"q ")
    # base_op = page.base_operations.value
    # base_op = zstd_decompress(base_op)
    # page_op.append(base_op.encode())
    # page_op.append(b" \n")
    # The page stream begins with the cropbox translation only.
    page_op.append(ctm_for_ops)
    page_op.append(b" \n")

    # Create render context
    context = RenderContext(
        pdf_creator=self,
        page=page,
        available_font_list=available_font_list,
        page_encoding_length_map=page_encoding_length_map,
        all_encoding_length_map=all_encoding_length_map,
        xobj_available_fonts=xobj_available_fonts,
        xobj_encoding_length_map=xobj_encoding_length_map,
        ctm_for_ops=ctm_for_ops,
        check_font_exists=check_font_exists,
    )

    # Create render units for all renderable objects
    render_units = self.create_render_units_for_page(page, translation_config)

    if skip_char:
        render_units = [
            unit
            for unit in render_units
            if not isinstance(unit, CharacterRenderUnit)
        ]

    # Render all units to their appropriate streams
    self.render_units_to_stream(render_units, context, page_op, xobj_draw_ops)

    # Update xobject streams
    for xobj in page.pdf_xobject:
        draw_op = xobj_draw_ops[xobj.xobj_id]
        try:
            pdf.update_stream(xobj.xref_id, draw_op.tobytes())
        except Exception:
            logger.warning(f"update xref {xobj.xref_id} stream fail, continue")

    draw_op = page_op
    # Store the page stream in a newly allocated object and make it the
    # page's contents.
    op_container = pdf.get_new_xref()
    # Since this is a draw instruction container,
    # no additional information is needed
    pdf.update_object(op_container, "<<>>")
    pdf.update_stream(op_container, draw_op.tobytes())
    pdf[page.page_number].set_contents(op_container)
def invert_matrix(
    ctm: tuple[float, float, float, float, float, float],
) -> tuple[float, float, float, float, float, float]:
    """Return the inverse of a 2D affine transformation matrix.

    The six-tuple ``(a, b, c, d, e, f)`` stands for the matrix::

        [a c e]
        [b d f]
        [0 0 1]

    A matrix whose determinant has a magnitude below ``1e-10`` is treated as
    singular; the identity matrix is returned for it instead of raising.
    """
    a, b, c, d, e, f = ctm
    determinant = a * d - b * c

    if abs(determinant) < 1e-10:
        # Singular input: fall back to the identity transform.
        return (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)

    return (
        d / determinant,
        -b / determinant,
        -c / determinant,
        a / determinant,
        (c * f - d * e) / determinant,
        (b * e - a * f) / determinant,
    )
def parse_encoding(obj_str):
    """Parse the text of an encoding ``/Differences`` array.

    The text is tokenised into brackets, integers, ``/Name`` tokens and any
    other single character.  An integer resets the running character code;
    every name is recorded at the running code, which then advances by one.

    Returns:
        A list of ``(code, glyph_name)`` tuples, names without the leading
        slash, in the order they appear.
    """
    delta = []
    current = 0
    # The group names matter: ``lastgroup`` drives the dispatch below.  Only
    # "c" (code) and "n" (name) are consulted; "p" and "s" merely consume
    # brackets and stray characters.
    for x in re.finditer(
        r"(?P<p>[\[\]])|(?P<c>\d+)|(?P<n>/[^\s/\[\]()<>]+)|(?P<s>.)", obj_str
    ):
        key = x.lastgroup
        val = x.group()
        if key == "c":
            current = int(val)
        elif key == "n":
            delta.append((current, val[1:]))
            current += 1
    return delta


def parse_mapping(text):
    """Return the hex payload of every ``<...>`` token in *text*, in order."""
    return [x.group("num") for x in re.finditer(r"<(?P<num>[a-fA-F0-9]+)>", text)]


def update_cmap_pair(cmap, data):
    """Apply ``bfrange`` triples ``(start, stop, destination)`` to *cmap*.

    *data* is a flat sequence of hex strings.  Codes ``start..stop`` are
    mapped to consecutive destinations: the first code gets the decoded
    destination string and each following code gets the same string with its
    last character advanced by one.  Triples whose destination cannot be
    decoded as UTF-16-BE are skipped, and an incomplete trailing triple is
    ignored.
    """
    data = list(data)
    # zip() over strided slices drops an incomplete trailing triple.
    for start_str, stop_str, value_str in zip(data[0::3], data[1::3], data[2::3]):
        start = int(start_str, 16)
        stop = int(stop_str, 16)
        try:
            value = base64.b16decode(value_str, True).decode("UTF-16-BE")
        except ValueError:
            # binascii.Error (malformed hex) and UnicodeDecodeError (lone
            # surrogates, D800-DFFF) are both ValueError subclasses.
            continue
        if not value:
            continue
        prefix, base = value[:-1], ord(value[-1])
        for offset, code in enumerate(range(start, stop + 1)):
            code_point = base + offset
            if code_point > 0x10FFFF:
                break  # ran past the last valid code point
            if 0xD800 <= code_point <= 0xDFFF:
                continue  # never emit lone surrogates
            cmap[code] = prefix + chr(code_point)


def update_cmap_code(cmap, data):
    """Apply ``bfchar`` pairs ``(code, destination)`` to *cmap*.

    *data* is a flat sequence of hex strings.  Pairs whose destination cannot
    be decoded as UTF-16-BE are skipped, and an incomplete trailing pair is
    ignored.
    """
    data = list(data)
    for code_str, value_str in zip(data[0::2], data[1::2]):
        code = int(code_str, 16)
        try:
            cmap[code] = base64.b16decode(value_str, True).decode("UTF-16-BE")
        except ValueError:
            # Malformed hex or lone surrogates (D800-DFFF): skip this pair.
            continue


def parse_cmap(cmap_str):
    """Build a ``{code: unicode_text}`` dict from the text of a ToUnicode CMap.

    Only ``beginbfrange``/``endbfrange`` and ``beginbfchar``/``endbfchar``
    sections made up purely of ``<hex>`` tokens are recognised.  ``bfchar``
    entries are applied after ``bfrange`` entries and therefore win on
    conflicts.
    """
    cmap = {}
    for x in re.finditer(
        r"\s+beginbfrange\s*(?P<r>(<[0-9a-fA-F]+>\s*)+)endbfrange\s+", cmap_str
    ):
        update_cmap_pair(cmap, parse_mapping(x.group("r")))
    for x in re.finditer(
        r"\s+beginbfchar\s*(?P<c>(<[0-9a-fA-F]+>\s*)+)endbfchar", cmap_str
    ):
        update_cmap_code(cmap, parse_mapping(x.group("c")))
    return cmap
def get_rotation_angle(matrix):
    """Compute the rotation angle, in degrees, of a PDF character matrix.

    matrix: tuple/list in the form (a, b, c, d, e, f).  The angle is
    ``atan2(b, a)`` converted from radians to degrees.
    """
    a, b, _c, _d, _e, _f = matrix
    return math.degrees(math.atan2(b, a))
def is_graphic_operation(self, operator: str):
    """Tell whether *operator* is one of the graphic operators handled here.

    Recognised operators: ``m l c v y re h S s f f* F B B* b b* n Do``.

    Returns:
        ``False`` when graphic element processing is disabled; otherwise the
        ``re.Match`` for a recognised operator, or ``None``.
    """
    if not self.enable_graphic_element_process:
        return False
    # The asterisks are escaped so that "f*", "B*" and "b*" are matched
    # literally instead of acting as regex quantifiers.
    return re.match(
        r"^(m|l|c|v|y|re|h|S|s|f|f\*|F|B|B\*|b|b\*|n|Do)$",
        operator,
    )
def pop_passthrough_per_char_instruction(self):
    """Restore the instruction list and clip paths saved by the matching push.

    An empty instruction stack is reported through the module logger and the
    current instruction list is reset to empty.  An empty clip-path stack
    silently resets the current clip paths to empty.
    """
    if self.passthrough_per_char_instruction_stack:
        self.passthrough_per_char_instruction = (
            self.passthrough_per_char_instruction_stack.pop()
        )
    else:
        self.passthrough_per_char_instruction = []
        # Use the module logger like the rest of this file, and tolerate a
        # pop that happens before any page has been started.
        logger.error(
            "pop_passthrough_per_char_instruction error on page: %s",
            getattr(self.current_page, "page_number", None),
        )

    if self.clip_paths_stack:
        self.current_clip_paths = self.clip_paths_stack.pop()
    else:
        self.current_clip_paths = []
def pop_xobj(self):
    """Restore the state tuple most recently saved on ``xobj_stack``.

    The popped entry is ``(xobj_id, clip_paths, available_fonts)``; each item
    is written back to the matching attribute.  Raises ``IndexError`` when
    the stack is empty.
    """
    saved_id, saved_clip_paths, saved_fonts = self.xobj_stack.pop()
    self.xobj_id = saved_id
    self.current_clip_paths = saved_clip_paths
    self.current_available_fonts = saved_fonts
def on_page_number(self, page_number: int):
    """Store *page_number* on the page currently being built.

    The value must be a non-negative ``int``; this is enforced with an
    assertion.
    """
    assert isinstance(page_number, int) and page_number >= 0
    page = self.current_page
    page.page_number = page_number
int(to_unicode_id.split(" ")[0]), ) code_range = re.search( b"begincodespacerange\n?.*<(\\d+?)>.*", to_unicode_bytes, ).group(1) encoding_length = len(code_range) // 2 except Exception: if ( font.unicode_map and font.unicode_map.cid2unichr and max(font.unicode_map.cid2unichr.keys()) > 255 ): encoding_length = 2 else: encoding_length = 1 try: if xref_id in self.mupdf_font_map: mupdf_font = self.mupdf_font_map[xref_id] else: mupdf_font = pymupdf.Font( fontbuffer=self.mupdf.extract_font(xref_id)[3] ) mupdf_font.has_glyph = functools.lru_cache(maxsize=10240, typed=True)( mupdf_font.has_glyph, ) bold = mupdf_font.is_bold italic = mupdf_font.is_italic monospaced = mupdf_font.is_monospaced serif = mupdf_font.is_serif self.mupdf_font_map[xref_id] = mupdf_font except Exception: bold = None italic = None monospaced = None serif = None il_font_metadata = il_version_1.PdfFont( name=font_name, xref_id=xref_id, font_id=font_id, encoding_length=encoding_length, bold=bold, italic=italic, monospace=monospaced, serif=serif, ascent=font.ascent, descent=font.descent, pdf_font_char_bounding_box=[], ) try: if xref_id is None: logger.warning("xref_id is None for font %s", font_name) raise ValueError("xref_id is None for font %s", font_name) bbox_list, cmap = self.parse_font_xobj_id(xref_id) font_char_bounding_box_map = {} if not cmap: cmap = {x: x for x in range(257)} for char_id, char_bbox in enumerate(bbox_list): font_char_bounding_box_map[char_id] = char_bbox for char_id in cmap: if char_id < 0 or char_id >= len(bbox_list): continue bbox = bbox_list[char_id] x, y, x2, y2 = bbox if ( x == 0 and y == 0 and x2 == 500 and y2 == 698 or x == 0 and y == 0 and x2 == 0 and y2 == 0 ): # ignore default bounding box continue il_font_metadata.pdf_font_char_bounding_box.append( il_version_1.PdfFontCharBoundingBox( x=x, y=y, x2=x2, y2=y2, char_id=char_id, ) ) font_char_bounding_box_map[char_id] = bbox if self.xobj_id in self.xobj_map: if self.xobj_id not in 
self.current_page_font_char_bounding_box_map: self.current_page_font_char_bounding_box_map[self.xobj_id] = {} self.current_page_font_char_bounding_box_map[self.xobj_id][xref_id] = ( font_char_bounding_box_map ) else: self.current_page_font_char_bounding_box_map[xref_id] = ( font_char_bounding_box_map ) except Exception as e: if xref_id is None: logger.error("failed to parse font xobj id None: %s", e) else: logger.error("failed to parse font xobj id %d: %s", xref_id, e) self.current_page_font_name_id_map[xref_id] = font_id self.current_available_fonts[font_id] = il_font_metadata fonts = self.current_page.pdf_font if self.xobj_id in self.xobj_map: fonts = self.xobj_map[self.xobj_id].pdf_font should_remove = [] for f in fonts: if f.font_id == font_id: should_remove.append(f) for sr in should_remove: fonts.remove(sr) fonts.append(il_font_metadata) def parse_font_xobj_id(self, xobj_id: int): if xobj_id is None: return [], {} bbox_list = [] encoding = parse_font_encoding(self.mupdf, xobj_id) differences = [] font_differences = self.mupdf.xref_get_key(xobj_id, "Encoding/Differences") if font_differences: differences = parse_encoding(font_differences[1]) for file_key in ["FontFile", "FontFile2", "FontFile3"]: font_file = self.mupdf.xref_get_key(xobj_id, f"FontDescriptor/{file_key}") if file_idx := indirect(font_file): bbox_list = parse_font_file( self.mupdf, file_idx, encoding, differences, ) cmap = {} to_unicode = self.mupdf.xref_get_key(xobj_id, "ToUnicode") if to_unicode_idx := indirect(to_unicode): cmap = parse_cmap(self.mupdf.xref_stream(to_unicode_idx).decode("U8")) if not bbox_list: obj_type, obj_val = self.mupdf.xref_get_key(xobj_id, "BaseFont") if obj_type == "name": bbox_list = get_base14_bbox(obj_val[1:]) if cid_bbox := get_cidfont_bbox(self.mupdf, xobj_id): bbox_list = cid_bbox if self.mupdf.xref_get_key(xobj_id, "Subtype")[1] == "/Type3": bbox_list = get_type3_bbox(self.mupdf, xobj_id) return bbox_list, cmap def create_graphic_state( self, gs: 
babeldoc.pdfminer.pdfinterp.PDFGraphicState | list[tuple[str, str]], include_clipping: bool = False, target_ctm: tuple[float, float, float, float, float, float] = None, clip_paths=None, ): if clip_paths is None: clip_paths = self.current_clip_paths passthrough_instruction = getattr(gs, "passthrough_instruction", gs) def filter_clipping(op): return op not in ("W n", "W* n") def pass_all(_op): return True if include_clipping: filter_clipping = pass_all passthrough_per_char_instruction_parts = [ f"{arg} {op}" for op, arg in passthrough_instruction if filter_clipping(op) ] # Add transformed clipping paths if requested and target CTM is provided if include_clipping and target_ctm and clip_paths: for clip_path, source_ctm, evenodd in clip_paths: try: # Transform clip path from source CTM to target CTM transformed_path = self.transform_clip_path( clip_path, source_ctm, target_ctm ) # Generate clipping instruction op = "W* n" if evenodd else "W n" args = [] for p in transformed_path: if len(p) == 1: args.append(p[0]) elif len(p) > 1: args.extend([f"{x:F}" for x in p[1:]]) args.append(p[0]) if args: clipping_instruction = f"{' '.join(args)} {op}" passthrough_per_char_instruction_parts.append( clipping_instruction ) except Exception as e: logger.warning("Error transforming clip path: %s", e) passthrough_per_char_instruction = " ".join( passthrough_per_char_instruction_parts ) # 可能会影响部分 graphic state 准确度。不过 BabelDOC 仅使用 passthrough_per_char_instruction # 所以应该是没啥影响 # 但是池化 graphic state 后可以减少内存占用 if passthrough_per_char_instruction not in self.graphic_state_pool: self.graphic_state_pool[passthrough_per_char_instruction] = ( il_version_1.GraphicState( passthrough_per_char_instruction=passthrough_per_char_instruction ) ) graphic_state = self.graphic_state_pool[passthrough_per_char_instruction] return graphic_state def on_lt_char(self, char: LTChar): if char.aw_font_id is None: return try: rotation_angle = get_rotation_angle(char.matrix) if not (-0.1 <= rotation_angle <= 0.1 or 
89.9 <= rotation_angle <= 90.1): return except Exception: logger.warning( "Failed to get rotation angle for char %s", char.get_text(), ) # Collect valid characters for statistics try: self._collect_valid_char(char.get_text()) except Exception as e: logger.warning("Error collecting valid char: %s", e) gs = self.create_graphic_state(char.graphicstate) # Get font from current page or xobject font = None pdf_font = None for pdf_font in self.xobj_map.get(char.xobj_id, self.current_page).pdf_font: if pdf_font.font_id == char.aw_font_id: font = pdf_font break # Get descent from font descent = 0 if font and hasattr(font, "descent"): descent = font.descent * char.size / 1000 char_id = char.cid char_bounding_box = None try: if ( font_bounding_box_map := self.current_page_font_char_bounding_box_map.get( char.xobj_id, self.current_page_font_char_bounding_box_map ).get(font.xref_id) ): char_bounding_box = font_bounding_box_map.get(char_id, None) else: char_bounding_box = None except Exception: # logger.debug( # "Failed to get font bounding box for char %s", # char.get_text(), # ) char_bounding_box = None char_unicode = char.get_text() # if "(cid:" not in char_unicode and len(char_unicode) > 1: # return if space_regex.match(char_unicode): char_unicode = " " advance = char.adv bbox = il_version_1.Box( x=char.bbox[0], y=char.bbox[1], x2=char.bbox[2], y2=char.bbox[3], ) if bbox.x2 < bbox.x or bbox.y2 < bbox.y: logger.warning( "Invalid bounding box for character %s: %s", char_unicode, bbox, ) if char.matrix[0] == 0 and char.matrix[3] == 0: vertical = True visual_bbox = il_version_1.Box( x=char.bbox[0] - descent, y=char.bbox[1], x2=char.bbox[2] - descent, y2=char.bbox[3], ) else: vertical = False # Add descent to y coordinates visual_bbox = il_version_1.Box( x=char.bbox[0], y=char.bbox[1] + descent, x2=char.bbox[2], y2=char.bbox[3] + descent, ) visual_bbox = il_version_1.VisualBbox(box=visual_bbox) pdf_style = il_version_1.PdfStyle( font_id=char.aw_font_id, font_size=char.size, 
def _collect_valid_char(self, ch: str):
    """Append *ch* to the current page buffer when it counts as valid text.

    Rules:
    - Nothing is collected while the page buffer is ``None``.
    - Whitespace matched by ``space_regex`` is collected directly.
    - Text whose first character is in category Cc, Cs, Co or Cn is ignored.
    - Empty text and text containing ``"(cid:"`` are ignored.
    - Otherwise the text is collected when ``font_mapper.has_char`` accepts
      it as a whole, or when it is longer than one character and every
      character is accepted individually.  Any error raised by the font
      mapper makes the text invalid.
    """
    if self._page_valid_chars_buffer is None:
        return

    if space_regex.match(ch):
        self._page_valid_chars_buffer.append(ch)
        return

    try:
        category = unicodedata.category(ch[0]) if ch else None
    except Exception:
        category = None
    if category in {"Cc", "Cs", "Co", "Cn"}:
        return

    if not ch or "(cid:" in ch:
        return

    try:
        mapper = self.font_mapper
        is_valid = bool(mapper.has_char(ch)) or (
            len(ch) > 1 and all(mapper.has_char(x) for x in ch)
        )
    except Exception:
        is_valid = False

    if is_valid:
        self._page_valid_chars_buffer.append(ch)
for p in batched(path[1:-2], 2, strict=True): raw_pdf_paths.append( il_version_1.PdfOriginalPath( pdf_path=il_version_1.PdfPath( x=float(p[0]), y=float(p[1]), op="", has_xy=True, ) ) ) # Last point in the path raw_pdf_paths.append( il_version_1.PdfOriginalPath( pdf_path=il_version_1.PdfPath( x=float(path[-2]), y=float(path[-1]), op=path[0], has_xy=True, ) ) ) curve_obj = il_version_1.PdfCurve( box=bbox, graphic_state=gs, pdf_path=paths, fill_background=fill_background, stroke_path=stroke_path, evenodd=evenodd, debug_info="a", xobj_id=curve.xobj_id, render_order=curve.render_order, ctm=list(ctm) if ctm is not None else None, pdf_original_path=raw_pdf_paths, ) self.current_page.pdf_curve.append(curve_obj) pass def on_xobj_form( self, ctm: tuple[float, float, float, float, float, float], xobj_id: int, xref_id: int, form_type: Literal["image", "form"], do_args: str, bbox: tuple[float, float, float, float], matrix: tuple[float, float, float, float, float, float], ): logger.debug(f"on_xobj_form: {do_args}[{bbox}] @ {xref_id} in {self.xobj_id}") matrix = mult_matrix(matrix, ctm) (x, y, w, h) = guarded_bbox(bbox) bounds = ((x, y), (x + w, y), (x, y + h), (x + w, y + h)) bbox = get_bound(apply_matrix_pt(matrix, (p, q)) for (p, q) in bounds) gs = self.create_graphic_state( self.passthrough_per_char_instruction, include_clipping=True, target_ctm=ctm ) figure_bbox = il_version_1.Box( x=bbox[0], y=bbox[1], x2=bbox[2], y2=bbox[3], ) pdf_matrix = il_version_1.PdfMatrix( a=ctm[0], b=ctm[1], c=ctm[2], d=ctm[3], e=ctm[4], f=ctm[5], ) affine_transform = decompose_ctm(ctm) xobj_form = il_version_1.PdfXobjForm( xref_id=xref_id, do_args=do_args, ) pdf_form_subtype = il_version_1.PdfFormSubtype( pdf_xobj_form=xobj_form, ) new_form = il_version_1.PdfForm( xobj_id=xobj_id, box=figure_bbox, pdf_matrix=pdf_matrix, graphic_state=gs, pdf_affine_transform=affine_transform, render_order=self.get_render_order_and_increase(), form_type=form_type, pdf_form_subtype=pdf_form_subtype, ctm=list(ctm), ) 
def on_total_pages(self, total_pages: int):
    """Record the document page count and start the progress stage.

    *total_pages* must be a positive ``int`` (enforced with assertions).  The
    progress stage is sized by the number of 1-based page numbers for which
    ``should_translate_page`` does not return exactly ``False``.
    """
    assert isinstance(total_pages, int)
    assert total_pages > 0
    self.docs.total_pages = total_pages

    pages_to_process = sum(
        1
        for page_no in range(1, total_pages + 1)
        if self.translation_config.should_translate_page(page_no) is not False
    )

    monitor = self.translation_config.progress_monitor
    self.progress = monitor.stage_start(self.stage_name, pages_to_process)
image_data = base64.b64encode(stream_obj.data).decode("ascii") elif hasattr(stream_obj, "rawdata") and stream_obj.rawdata is not None: image_data = base64.b64encode(stream_obj.rawdata).decode("ascii") # Create inline form with parameters as JSON string inline_form = il_version_1.PdfInlineForm( form_data=image_data, image_parameters=json.dumps(parameters) ) # Calculate bounding box - inline images are typically 1x1 unit square in user space bbox = (0, 0, 1, 1) (x, y, w, h) = guarded_bbox(bbox) bounds = ((x, y), (x + w, y), (x, y + h), (x + w, y + h)) final_bbox = get_bound(apply_matrix_pt(ctm, (p, q)) for (p, q) in bounds) # Create graphics state gs = self.create_graphic_state( self.passthrough_per_char_instruction, include_clipping=True, target_ctm=ctm ) # Create PdfMatrix from CTM pdf_matrix = il_version_1.PdfMatrix( a=ctm[0], b=ctm[1], c=ctm[2], d=ctm[3], e=ctm[4], f=ctm[5] ) # Create affine transform affine_transform = decompose_ctm(ctm) # Create PdfFormSubtype with inline form pdf_form_subtype = il_version_1.PdfFormSubtype(pdf_inline_form=inline_form) # Create PdfForm for the inline image pdf_form = il_version_1.PdfForm( box=il_version_1.Box( x=final_bbox[0], y=final_bbox[1], x2=final_bbox[2], y2=final_bbox[3], ), graphic_state=gs, pdf_matrix=pdf_matrix, pdf_affine_transform=affine_transform, pdf_form_subtype=pdf_form_subtype, xobj_id=self.xobj_id, ctm=list(ctm), render_order=self.get_render_order_and_increase(), form_type="image", ) # Add to current page self.current_page.pdf_form.append(pdf_form) ================================================ FILE: babeldoc/format/pdf/document_il/il_version_1.py ================================================ from dataclasses import dataclass from dataclasses import field @dataclass(slots=True) class BaseOperations: class Meta: name = "baseOperations" value: str = field( default="", metadata={ "required": True, }, ) @dataclass(slots=True) class Box: class Meta: name = "box" x: float | None = field( default=None, metadata={ 
"type": "Attribute", "required": True, }, ) y: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) x2: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) y2: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) @dataclass(slots=True) class GraphicState: class Meta: name = "graphicState" passthrough_per_char_instruction: str | None = field( default=None, metadata={ "name": "passthroughPerCharInstruction", "type": "Attribute", }, ) @dataclass(slots=True) class PdfAffineTransform: class Meta: name = "pdfAffineTransform" translation_x: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) translation_y: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) rotation: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) scale_x: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) scale_y: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) shear: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) @dataclass(slots=True) class PdfFontCharBoundingBox: class Meta: name = "pdfFontCharBoundingBox" x: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) y: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) x2: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) y2: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) char_id: int | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) @dataclass(slots=True) class PdfInlineForm: class Meta: name = "pdfInlineForm" form_data: str | None = field( default=None, metadata={ "name": "formData", "type": "Attribute", 
}, ) image_parameters: str | None = field( default=None, metadata={ "name": "imageParameters", "type": "Attribute", }, ) @dataclass(slots=True) class PdfMatrix: class Meta: name = "pdfMatrix" a: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) b: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) c: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) d: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) e: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) f: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) @dataclass(slots=True) class PdfPath: class Meta: name = "pdfPath" x: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) y: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) op: str | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) has_xy: bool | None = field( default=None, metadata={ "type": "Attribute", }, ) @dataclass(slots=True) class PdfXobjForm: class Meta: name = "pdfXobjForm" xref_id: int | None = field( default=None, metadata={ "name": "xrefId", "type": "Attribute", "required": True, }, ) do_args: str | None = field( default=None, metadata={ "name": "doArgs", "type": "Attribute", "required": True, }, ) @dataclass(slots=True) class Cropbox: class Meta: name = "cropbox" box: Box | None = field( default=None, metadata={ "type": "Element", "required": True, }, ) @dataclass(slots=True) class Mediabox: class Meta: name = "mediabox" box: Box | None = field( default=None, metadata={ "type": "Element", "required": True, }, ) @dataclass(slots=True) class PageLayout: class Meta: name = "pageLayout" box: Box | None = field( default=None, metadata={ "type": "Element", "required": True, }, ) id: int | None = field( 
@dataclass(slots=True)
class PdfFormSubtype:
    """Container naming the concrete kind of a form.

    The IL schema declares this element as a choice between
    ``pdfInlineForm`` and ``pdfXobjForm``.  The dataclass keeps both slots
    optional (``None`` by default) and does not itself enforce the choice.
    """

    class Meta:
        name = "pdfFormSubtype"

    pdf_inline_form: PdfInlineForm | None = field(
        default=None,
        metadata=dict(name="pdfInlineForm", type="Element"),
    )
    pdf_xobj_form: PdfXobjForm | None = field(
        default=None,
        metadata=dict(name="pdfXobjForm", type="Element"),
    )
@dataclass(slots=True)
class PdfCharacter:
    """One character of the IL: its style, boxes and per-character attributes.

    Field order is significant because it defines the positional constructor
    signature.  ``pdf_style``, ``box`` and ``char_unicode`` are marked
    required in the schema metadata; everything else is optional and all
    fields default to ``None``.
    """

    class Meta:
        name = "pdfCharacter"

    # Child elements.
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata=dict(name="pdfStyle", type="Element", required=True),
    )
    box: Box | None = field(
        default=None,
        metadata=dict(type="Element", required=True),
    )
    visual_bbox: VisualBbox | None = field(
        default=None,
        metadata=dict(type="Element"),
    )

    # Attributes.
    vertical: bool | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    scale: float | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    pdf_character_id: int | None = field(
        default=None,
        metadata=dict(name="pdfCharacterId", type="Attribute"),
    )
    char_unicode: str | None = field(
        default=None,
        metadata=dict(type="Attribute", required=True),
    )
    advance: float | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    xobj_id: int | None = field(
        default=None,
        metadata=dict(name="xobjId", type="Attribute"),
    )
    debug_info: bool | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    formula_layout_id: int | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    render_order: int | None = field(
        default=None,
        metadata=dict(name="renderOrder", type="Attribute"),
    )
    sub_render_order: int | None = field(
        default=None,
        metadata=dict(name="subRenderOrder", type="Attribute"),
    )
"tokens": True, }, ) relocation_transform: list[object] = field( default_factory=list, metadata={ "type": "Attribute", "length": 6, "tokens": True, }, ) @dataclass(slots=True) class PdfForm: class Meta: name = "pdfForm" box: Box | None = field( default=None, metadata={ "type": "Element", "required": True, }, ) graphic_state: GraphicState | None = field( default=None, metadata={ "name": "graphicState", "type": "Element", "required": True, }, ) pdf_matrix: PdfMatrix | None = field( default=None, metadata={ "name": "pdfMatrix", "type": "Element", "required": True, }, ) pdf_affine_transform: PdfAffineTransform | None = field( default=None, metadata={ "name": "pdfAffineTransform", "type": "Element", "required": True, }, ) pdf_form_subtype: PdfFormSubtype | None = field( default=None, metadata={ "name": "pdfFormSubtype", "type": "Element", "required": True, }, ) xobj_id: int | None = field( default=None, metadata={ "name": "xobjId", "type": "Attribute", "required": True, }, ) ctm: list[object] = field( default_factory=list, metadata={ "type": "Attribute", "length": 6, "tokens": True, }, ) relocation_transform: list[object] = field( default_factory=list, metadata={ "type": "Attribute", "length": 6, "tokens": True, }, ) render_order: int | None = field( default=None, metadata={ "name": "renderOrder", "type": "Attribute", "required": True, }, ) form_type: str | None = field( default=None, metadata={ "name": "formType", "type": "Attribute", "required": True, }, ) @dataclass(slots=True) class PdfSameStyleUnicodeCharacters: class Meta: name = "pdfSameStyleUnicodeCharacters" pdf_style: PdfStyle | None = field( default=None, metadata={ "name": "pdfStyle", "type": "Element", }, ) unicode: str | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) debug_info: bool | None = field( default=None, metadata={ "type": "Attribute", }, ) @dataclass(slots=True) class PdfXobject: class Meta: name = "pdfXobject" box: Box | None = field( default=None, metadata={ 
"type": "Element", "required": True, }, ) pdf_font: list[PdfFont] = field( default_factory=list, metadata={ "name": "pdfFont", "type": "Element", }, ) base_operations: BaseOperations | None = field( default=None, metadata={ "name": "baseOperations", "type": "Element", "required": True, }, ) xobj_id: int | None = field( default=None, metadata={ "name": "xobjId", "type": "Attribute", "required": True, }, ) xref_id: int | None = field( default=None, metadata={ "name": "xrefId", "type": "Attribute", "required": True, }, ) @dataclass(slots=True) class PdfFormula: class Meta: name = "pdfFormula" box: Box | None = field( default=None, metadata={ "type": "Element", "required": True, }, ) pdf_character: list[PdfCharacter] = field( default_factory=list, metadata={ "name": "pdfCharacter", "type": "Element", "min_occurs": 1, }, ) pdf_curve: list[PdfCurve] = field( default_factory=list, metadata={ "name": "pdfCurve", "type": "Element", }, ) pdf_form: list[PdfForm] = field( default_factory=list, metadata={ "name": "pdfForm", "type": "Element", }, ) x_offset: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) y_offset: float | None = field( default=None, metadata={ "type": "Attribute", "required": True, }, ) x_advance: float | None = field( default=None, metadata={ "type": "Attribute", }, ) line_id: int | None = field( default=None, metadata={ "name": "lineId", "type": "Attribute", }, ) is_corner_mark: bool | None = field( default=None, metadata={ "type": "Attribute", }, ) @dataclass(slots=True) class PdfLine: class Meta: name = "pdfLine" box: Box | None = field( default=None, metadata={ "type": "Element", "required": True, }, ) pdf_character: list[PdfCharacter] = field( default_factory=list, metadata={ "name": "pdfCharacter", "type": "Element", "min_occurs": 1, }, ) render_order: int | None = field( default=None, metadata={ "name": "renderOrder", "type": "Attribute", }, ) @dataclass(slots=True) class PdfSameStyleCharacters: class Meta: 
@dataclass(slots=True)
class PdfParagraph:
    """A paragraph of the IL: box, style, compositions and text attributes.

    Field order is significant because it defines the positional constructor
    signature.  ``box``, ``pdf_style`` and ``unicode`` are marked required in
    the schema metadata.  ``pdf_paragraph_composition`` starts as a fresh
    empty list per instance; every other field defaults to ``None``.
    """

    class Meta:
        name = "pdfParagraph"

    # Child elements.
    box: Box | None = field(
        default=None,
        metadata=dict(type="Element", required=True),
    )
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata=dict(name="pdfStyle", type="Element", required=True),
    )
    pdf_paragraph_composition: list[PdfParagraphComposition] = field(
        default_factory=list,
        metadata=dict(name="pdfParagraphComposition", type="Element"),
    )

    # Attributes.
    xobj_id: int | None = field(
        default=None,
        metadata=dict(name="xobjId", type="Attribute"),
    )
    unicode: str | None = field(
        default=None,
        metadata=dict(type="Attribute", required=True),
    )
    scale: float | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    optimal_scale: float | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    vertical: bool | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    first_line_indent: bool | None = field(
        default=None,
        metadata=dict(name="FirstLineIndent", type="Attribute"),
    )
    debug_id: str | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    layout_label: str | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    layout_id: int | None = field(
        default=None,
        metadata=dict(type="Attribute"),
    )
    render_order: int | None = field(
        default=None,
        metadata=dict(name="renderOrder", type="Attribute"),
    )
@dataclass(slots=True)
class Document:
    """Root element of the IL: a list of pages plus the total page count.

    ``page`` starts as a fresh empty list per instance (the schema metadata
    asks for at least one entry); ``total_pages`` defaults to ``None``.
    """

    class Meta:
        name = "document"

    page: list[Page] = field(
        default_factory=list,
        metadata=dict(type="Element", min_occurs=1),
    )
    total_pages: int | None = field(
        default=None,
        metadata=dict(name="totalPages", type="Attribute", required=True),
    )
attribute vertical { xsd:boolean }?, attribute scale { xsd:float }?, attribute pdfCharacterId { xsd:int }?, attribute char_unicode { xsd:string }, attribute advance { xsd:float }?, # xobject nesting depth attribute xobjId { xsd:int }?, attribute debug_info { xsd:boolean }?, attribute formula_layout_id { xsd:int }?, attribute renderOrder { xsd:int }?, attribute subRenderOrder { xsd:int }?, PDFStyle, Box, element visual_bbox { Box }? } PageLayout = element pageLayout { attribute id { xsd:int }, attribute conf { xsd:float }, attribute class_name { xsd:string }, Box } GraphicState = element graphicState { attribute passthroughPerCharInstruction { xsd:string }? } PDFStyle = element pdfStyle { attribute font_id { xsd:string }, attribute font_size { xsd:float }, GraphicState } PDFParagraph = element pdfParagraph { attribute xobjId { xsd:int }?, attribute unicode { xsd:string }, attribute scale { xsd:float }?, attribute optimal_scale { xsd:float }?, attribute vertical { xsd:boolean }?, attribute FirstLineIndent { xsd:boolean }?, attribute debug_id { xsd:string }?, attribute layout_label { xsd:string }?, attribute layout_id { xsd:int }?, attribute renderOrder { xsd:int }?, Box, PDFStyle, PDFParagraphComposition* } PDFParagraphComposition = element pdfParagraphComposition { PDFLine | PDFFormula | PDFSameStyleCharacters | PDFCharacter | PDFSameStyleUnicodeCharacters } PDFLine = element pdfLine { Box, PDFCharacter+, attribute renderOrder { xsd:int }? } PDFSameStyleCharacters = element pdfSameStyleCharacters { Box, PDFStyle, PDFCharacter+ } PDFSameStyleUnicodeCharacters = element pdfSameStyleUnicodeCharacters { PDFStyle?, attribute unicode { xsd:string }, attribute debug_info { xsd:boolean }? } PDFFormula = element pdfFormula { Box, PDFCharacter+, PDFCurve*, PDFForm*, attribute x_offset { xsd:float }, attribute y_offset { xsd:float }, attribute x_advance { xsd:float }?, attribute lineId { xsd:int }?, attribute is_corner_mark { xsd:boolean }? 
} PDFFigure = element pdfFigure { Box } PDFRectangle = element pdfRectangle { Box, GraphicState, attribute debug_info { xsd:boolean }?, attribute fill_background { xsd:boolean }?, attribute xobjId { xsd:int }?, attribute lineWidth { xsd:float }?, attribute renderOrder { xsd:int }? } PDFCurve = element pdfCurve { Box, GraphicState, PDFPath*, PDFOriginalPath*, attribute debug_info { xsd:boolean }?, attribute fill_background { xsd:boolean }?, attribute stroke_path { xsd:boolean }?, attribute evenodd { xsd:boolean }?, attribute xobjId { xsd:int }?, attribute renderOrder { xsd:int }?, attribute ctm { list { xsd:float, xsd:float, xsd:float, xsd:float, xsd:float, xsd:float } }?, attribute relocation_transform { list { xsd:float, xsd:float, xsd:float, xsd:float, xsd:float, xsd:float } }? } PDFOriginalPath = element pdfOriginalPath { PDFPath } PDFPath = element pdfPath { attribute x { xsd:float }, attribute y { xsd:float }, attribute op { xsd:string }, attribute has_xy { xsd:boolean }? } PDFForm = element pdfForm { attribute xobjId { xsd:int }, Box, GraphicState, PDFMatrix, PDFAffineTransform, attribute ctm { list { xsd:float, xsd:float, xsd:float, xsd:float, xsd:float, xsd:float } }?, attribute relocation_transform { list { xsd:float, xsd:float, xsd:float, xsd:float, xsd:float, xsd:float } }?, attribute renderOrder { xsd:int }, attribute formType { xsd:string }, PDFFormSubtype } PDFFormSubtype = element pdfFormSubtype { PDFInlineForm | PDFXobjForm } PDFInlineForm = element pdfInlineForm { attribute formData { xsd:string }?, attribute imageParameters { xsd:string }? 
class AddDebugInformation:
    """Overlay debug rectangles and text labels on the pages of an IL document.

    The stage does nothing unless ``translation_config.debug`` is truthy.
    When enabled it outlines paragraphs, formulas (and each formula
    character), XObjects and forms, and adds small text labels above them.
    Rectangles and label compositions are created with ``debug_info=True``.
    """

    stage_name = "Add Debug Information"

    def __init__(self, translation_config: TranslationConfig):
        self.translation_config = translation_config
        self.model = translation_config.doc_layout_model

    def process(self, docs: il_version_1.Document):
        """Annotate every page of *docs* in place when debug mode is on."""
        if not self.translation_config.debug:
            return

        for page in docs.page:
            self.process_page(page)

    def _create_rectangle(
        self,
        box: il_version_1.Box,
        color: GraphicState,
        line_width: float | None = None,
    ):
        """Return a debug rectangle covering *box*, drawn with *color*."""
        return il_version_1.PdfRectangle(
            box=box,
            graphic_state=color,
            debug_info=True,
            line_width=line_width,
        )

    def _create_text(
        self,
        text: str,
        color: GraphicState,
        box: il_version_1.Box,
        font_size: float = 4,
    ):
        """Return a label paragraph showing *text* just above *box*.

        The label shares the horizontal extent of *box* and spans from its
        top edge ``y2`` to ``y2 + 5``.  It uses the font id ``"base"`` and
        carries a single unicode composition flagged ``debug_info=True``.
        """
        style = il_version_1.PdfStyle(
            font_id="base",
            font_size=font_size,
            graphic_state=color,
        )
        label_box = il_version_1.Box(
            x=box.x,
            y=box.y2,
            x2=box.x2,
            y2=box.y2 + 5,
        )
        label_characters = il_version_1.PdfSameStyleUnicodeCharacters(
            unicode=text,
            pdf_style=style,
            debug_info=True,
        )
        return il_version_1.PdfParagraph(
            first_line_indent=False,
            box=label_box,
            vertical=False,
            pdf_style=style,
            unicode=text,
            pdf_paragraph_composition=[
                il_version_1.PdfParagraphComposition(
                    pdf_same_style_unicode_characters=label_characters,
                ),
            ],
            xobj_id=-1,
        )

    def process_page(self, page: il_version_1.Page):
        """Append debug rectangles and labels to *page* in place.

        Paragraphs without compositions, and paragraphs that already contain
        a debug composition, are left unmarked.
        """
        # Page-number label, inset by 2% of the crop box width and height.
        crop = page.cropbox.box
        page_width = crop.x2 - crop.x
        page_height = crop.y2 - crop.y
        page_number_box = il_version_1.Box(
            x=crop.x + page_width * 0.02,
            y=crop.y,
            x2=crop.x2,
            y2=crop.y2 - page_height * 0.02,
        )
        page.pdf_paragraph.append(
            self._create_text(
                f"pagenumber: {page.page_number + 1}",
                BLUE,
                page_number_box,
            ),
        )

        # Labels are collected here and appended at the end so that
        # ``page.pdf_paragraph`` is not extended while it is being iterated.
        new_paragraphs = []

        for paragraph in page.pdf_paragraph:
            compositions = paragraph.pdf_paragraph_composition
            if not compositions:
                continue
            # Skip labels produced by this stage (e.g. the page number).
            if any(
                x.pdf_same_style_unicode_characters.debug_info
                for x in compositions
                if x.pdf_same_style_unicode_characters
            ):
                continue

            page.pdf_rectangle.append(self._create_rectangle(paragraph.box, BLUE))

            # The label is placed at the top edge (y2) of the paragraph box.
            debug_text = "paragraph"
            if getattr(paragraph, "debug_id", None):
                debug_text = (
                    f"paragraph[{paragraph.debug_id}]-[{paragraph.layout_label}]"
                )
            new_paragraphs.append(self._create_text(debug_text, BLUE, paragraph.box))

            for composition in compositions:
                formula = composition.pdf_formula
                if not formula:
                    continue
                new_paragraphs.append(
                    self._create_text("formula", ORANGE, formula.box),
                )
                page.pdf_rectangle.append(
                    self._create_rectangle(formula.box, ORANGE),
                )
                for char in formula.pdf_character:
                    # ``visual_bbox`` is optional in the IL schema; fall back
                    # to the character's own box rather than raising
                    # AttributeError on a character that lacks it.
                    char_box = char.visual_bbox.box if char.visual_bbox else char.box
                    if char_box is None:
                        continue
                    page.pdf_rectangle.append(
                        self._create_rectangle(char_box, TEAL, line_width=0.2),
                    )

        for xobj in page.pdf_xobject:
            page.pdf_rectangle.append(self._create_rectangle(xobj.box, YELLOW))

        for form in page.pdf_form:
            debug_text = "Form"
            subtype = form.pdf_form_subtype
            # A form without a subtype keeps the plain "Form" label.
            if subtype and subtype.pdf_xobj_form:
                debug_text += f"[{subtype.pdf_xobj_form.do_args}]"
            elif subtype and subtype.pdf_inline_form:
                debug_text += "[inline]"
            new_paragraphs.append(
                self._create_text(debug_text, PINK, form.box, font_size=0.4),
            )
            page.pdf_rectangle.append(self._create_rectangle(form.box, PINK))

        page.pdf_paragraph.extend(new_paragraphs)
from babeldoc.format.pdf.document_il.utils.paragraph_helper import is_cid_paragraph from babeldoc.format.pdf.document_il.utils.paragraph_helper import ( is_placeholder_only_paragraph, ) from babeldoc.format.pdf.document_il.utils.paragraph_helper import ( is_pure_numeric_paragraph, ) from babeldoc.utils.priority_thread_pool_executor import PriorityThreadPoolExecutor if TYPE_CHECKING: from babeldoc.format.pdf.translation_config import TranslationConfig from babeldoc.translator.translator import BaseTranslator logger = logging.getLogger(__name__) LLM_PROMPT_TEMPLATE: str = """ You are an expert multilingual terminologist. Extract key terms from the text and translate them into {target_language}. ### Extraction Rules 1. Include only: named entities (people, orgs, locations, theorem/algorithm names, dates) and domain-specific nouns/noun phrases essential to meaning. 2. No full sentences. Ignore function words. 3. Use minimal noun phrases (≤5 words unless a named entity). No generic academic nouns (e.g., model, case, property) unless part of a standard term. 4. No mathematical items: variables (X1, a, ε), symbols (=, +, →, ⊥⊥, ∈), subscripts/superscripts, formula fragments, mappings (T: H1→H2), etc. Keep only natural-language concepts. 5. Extract each term once. Keep order of first appearance. ### Translation Rules 1. Translate each term into {target_language}. 2. If in the reference glossary, use its translation exactly. 3. Keep proper names in original language unless a well-known translation exists. 4. Ensure consistent translations. {reference_glossary_section} ### Output Format - Return ONLY a valid JSON array. - Each element: {{"src": "...", "tgt": "..."}}. - No comments, no backticks, no extra text. - If no terms: []. ### Example For terms “LLM”, “GPT”: {example_output} Input Text: ``` {text_to_process} ``` Return JSON ONLY. NO OTHER TEXT. 
class AutomaticTermExtractor:
    """Extract key terms from a document with an LLM and record their translations.

    Paragraphs of every page are grouped into batches. Each batch is rendered
    into ``LLM_PROMPT_TEMPLATE`` and sent to ``translate_engine.llm_translate``;
    every ``{"src": ..., "tgt": ...}`` item of the JSON answer is forwarded to
    ``shared_context.add_raw_extracted_term_pair``.
    """

    stage_name = "Automatic Term Extraction"

    # A batch is submitted as soon as it exceeds either of these limits.
    _BATCH_MAX_TOKENS = 600
    _BATCH_MAX_PARAGRAPHS = 12
    # Task priority is computed as ``_PRIORITY_BASE - token_count``.
    _PRIORITY_BASE = 1048576
    # Source terms with this many characters or more are discarded.
    _MAX_SRC_TERM_LENGTH = 100

    def __init__(
        self,
        translate_engine: BaseTranslator,
        translation_config: TranslationConfig,
    ):
        """Store collaborators and verify that the engine can talk to an LLM.

        Args:
            translate_engine: Engine that must expose a callable ``llm_translate``.
            translation_config: Configuration; its
                ``shared_context_cross_split_part`` receives the extracted terms.

        Raises:
            ValueError: If ``translate_engine`` has no callable ``llm_translate``.
        """
        self.translate_engine = translate_engine
        self.translation_config = translation_config
        self.shared_context = translation_config.shared_context_cross_split_part
        self.tokenizer = tiktoken.encoding_for_model("gpt-4o")

        # Check if the translate_engine has llm_translate capability
        if not hasattr(self.translate_engine, "llm_translate") or not callable(
            self.translate_engine.llm_translate
        ):
            raise ValueError(
                "The provided translate_engine does not support LLM-based translation, which is required for AutomaticTermExtractor."
            )

    def calc_token_count(self, text: str) -> int:
        """Return the number of tokens in ``text``, or 0 if tokenizing fails."""
        try:
            return len(self.tokenizer.encode(text, disallowed_special=()))
        except Exception:
            # Best effort: the count is only used for batching and priorities.
            return 0

    def _snapshot_token_usage(self) -> tuple[int, int, int, int]:
        """Read the engine's token counters.

        Returns:
            ``(total, prompt, completion, cache_hit_prompt)``. A counter that is
            missing (or falsy) on the engine contributes 0; all four values are
            0 when there is no engine.
        """
        engine = self.translate_engine
        if not engine:
            return 0, 0, 0, 0
        counter_names = (
            "token_count",
            "prompt_token_count",
            "completion_token_count",
            "cache_hit_prompt_token_count",
        )
        total_tokens, prompt_tokens, completion_tokens, cache_hit_prompt_tokens = (
            counter.value if counter else 0
            for counter in (getattr(engine, name, None) for name in counter_names)
        )
        return total_tokens, prompt_tokens, completion_tokens, cache_hit_prompt_tokens

    @staticmethod
    def _advance(pbar, steps: int) -> None:
        """Advance ``pbar`` by ``steps`` when a progress bar was supplied."""
        if pbar is not None:
            pbar.advance(steps)

    def _clean_json_output(self, llm_output: str) -> str:
        """Strip wrappers that LLMs commonly put around a JSON payload.

        Removes an enclosing ``<json>``/``</json>`` tag pair and a Markdown code
        fence (three backticks, optionally followed by ``json``). Text without
        such wrappers is returned unchanged apart from surrounding whitespace.

        Args:
            llm_output: Raw text returned by the LLM.

        Returns:
            The payload with wrappers and surrounding whitespace removed.
        """
        open_tag, close_tag = "<json>", "</json>"
        fence = "```"

        text = llm_output.strip()
        # Only cut when the wrapper is really present. Testing against an empty
        # prefix/suffix is always true and would eat the first 6 and last 7
        # characters of every response.
        if text.startswith(open_tag):
            text = text[len(open_tag) :].lstrip()
        if text.endswith(close_tag):
            text = text[: -len(close_tag)].rstrip()
        if text.startswith(fence + "json"):
            text = text[len(fence) + len("json") :]
        if text.startswith(fence):
            text = text[len(fence) :]
        if text.endswith(fence):
            text = text[: -len(fence)]
        return text.strip()

    def _process_llm_response(self, llm_response_text: str, request_id: str):
        """Parse an LLM answer and register every valid term pair it contains.

        Parsing problems are logged and never raised.

        Args:
            llm_response_text: Raw LLM answer, expected to hold a JSON array.
            request_id: Identifier used only in log messages.
        """
        try:
            cleaned_response_text = self._clean_json_output(llm_response_text)
            extracted_data = json.loads(cleaned_response_text)

            if not isinstance(extracted_data, list):
                logger.warning(
                    f"Request ID {request_id}: LLM response was not a JSON list, but type: {type(extracted_data)}. Content: {cleaned_response_text[:200]}"
                )
                return

            for item in extracted_data:
                if isinstance(item, dict) and "src" in item and "tgt" in item:
                    src_term = str(item["src"]).strip()
                    tgt_term = str(item["tgt"]).strip()
                    if (
                        src_term
                        and tgt_term
                        and len(src_term) < self._MAX_SRC_TERM_LENGTH
                    ):  # Basic validation
                        self.shared_context.add_raw_extracted_term_pair(
                            src_term, tgt_term
                        )
                else:
                    logger.warning(
                        f"Request ID {request_id}: Skipping malformed item in LLM JSON response: {item}"
                    )
        except json.JSONDecodeError as e:
            logger.error(
                f"Request ID {request_id}: JSON Parsing Error: {e}. Problematic LLM Response after cleaning (start): {cleaned_response_text[:200]}..."
            )
        except Exception as e:
            logger.error(f"Request ID {request_id}: Error processing LLM response: {e}")

    def _submit_batch(self, executor, paragraphs, tracker, pbar, token_count):
        """Queue one batch of paragraphs for term extraction."""
        executor.submit(
            self.extract_terms_from_paragraphs,
            BatchParagraph(paragraphs, tracker),
            pbar,
            token_count,
            # Batches with fewer tokens get a larger priority value.
            priority=self._PRIORITY_BASE - token_count,
        )

    def process_page(
        self,
        page: Page,
        executor: PriorityThreadPoolExecutor,
        pbar: tqdm | None = None,
        tracker: PageTermExtractTracker = None,
    ):
        """Batch the usable paragraphs of ``page`` and submit them to ``executor``.

        Paragraphs without ``debug_id`` or ``unicode``, CID paragraphs, purely
        numeric paragraphs and placeholder-only paragraphs are skipped; the
        progress bar is advanced by one for each of them.

        Args:
            page: Page whose ``pdf_paragraph`` list is scanned.
            executor: Pool that runs ``extract_terms_from_paragraphs``.
            pbar: Optional progress reporter exposing ``advance``.
            tracker: Page tracker for debugging output; a throwaway tracker is
                created when omitted.
        """
        self.translation_config.raise_if_cancelled()
        if tracker is None:
            # BatchParagraph needs a tracker; fall back to one nobody reads.
            tracker = PageTermExtractTracker()

        batch = []
        batch_token_count = 0
        for paragraph in page.pdf_paragraph:
            if (
                paragraph.debug_id is None
                or paragraph.unicode is None
                or is_cid_paragraph(paragraph)
                or is_pure_numeric_paragraph(paragraph)
                or is_placeholder_only_paragraph(paragraph)
            ):
                self._advance(pbar, 1)
                continue

            batch_token_count += self.calc_token_count(paragraph.unicode)
            batch.append(paragraph)
            if (
                batch_token_count > self._BATCH_MAX_TOKENS
                or len(batch) > self._BATCH_MAX_PARAGRAPHS
            ):
                self._submit_batch(executor, batch, tracker, pbar, batch_token_count)
                batch = []
                batch_token_count = 0

        if batch:
            self._submit_batch(executor, batch, tracker, pbar, batch_token_count)

    def _build_reference_glossary_section(self, inputs: list[str]) -> str:
        """Render the user glossary entries that are active for ``inputs``.

        Returns:
            The prompt section listing matching entries grouped by glossary
            name, or an empty string when no user glossary entry applies.
        """
        user_glossaries = self.shared_context.user_glossaries
        if not user_glossaries:
            return ""

        text_for_glossary = "\n\n".join(inputs)
        # Group entries by glossary name
        glossary_entries = {}
        for glossary in user_glossaries:
            active_entries = glossary.get_active_entries_for_text(text_for_glossary)
            if active_entries:
                glossary_entries[glossary.name] = active_entries
        if not glossary_entries:
            return ""

        parts = ["Reference Glossaries (for consistency and quality):\n"]
        for glossary_name, entries in glossary_entries.items():
            parts.append(f"\n{glossary_name}:\n")
            parts.extend(f"- {src} → {tgt}\n" for src, tgt in sorted(set(entries)))
        parts.append(
            "\nPlease consider these existing translations for consistency when extracting new terms. IMPORTANT: You should also extract terms that appear in the reference glossaries above if they are found in the input text - don't skip them just because they already exist in the reference."
        )
        return "".join(parts)

    def extract_terms_from_paragraphs(
        self,
        paragraphs: BatchParagraph,
        pbar: tqdm | None = None,
        paragraph_token_count: int = 0,
    ):
        """Ask the LLM for the terms of one batch and record them.

        Any failure while building the prompt, calling the LLM or parsing its
        answer is logged as a warning and swallowed. The progress bar, when
        given, is always advanced by the number of paragraphs in the batch.

        Args:
            paragraphs: Batch to process together with its tracker.
            pbar: Optional progress reporter exposing ``advance``.
            paragraph_token_count: Token count of the batch, passed to the
                engine through ``rate_limit_params``.
        """
        self.translation_config.raise_if_cancelled()
        try:
            inputs = [p.unicode for p in paragraphs.paragraphs if p.unicode]
            tracker = paragraphs.tracker
            for u in inputs:
                tracker.append_paragraph_unicode(u)
            if not inputs:
                return

            prompt = LLM_PROMPT_TEMPLATE.format(
                target_language=self.translation_config.lang_out,
                text_to_process="\n\n".join(inputs),
                reference_glossary_section=self._build_reference_glossary_section(
                    inputs
                ),
                example_output="""[
  {"src": "LLM", "tgt": "大语言模型"},
  {"src": "GPT", "tgt": "GPT"}
]""",
            )
            tracker.set_input(prompt)
            output = self.translate_engine.llm_translate(
                prompt,
                rate_limit_params={
                    "paragraph_token_count": paragraph_token_count,
                    "request_json_mode": True,
                },
            )
            tracker.set_output(output)

            response = json.loads(self._clean_json_output(output))
            if not isinstance(response, list):
                response = [response]  # Ensure we have a list

            for term in response:
                if isinstance(term, dict) and "src" in term and "tgt" in term:
                    src_term = str(term["src"]).strip()
                    tgt_term = str(term["tgt"]).strip()
                    # Very short terms that map to themselves carry no information.
                    if src_term == tgt_term and len(src_term) < 3:
                        continue
                    if (
                        src_term
                        and tgt_term
                        and len(src_term) < self._MAX_SRC_TERM_LENGTH
                    ):
                        self.shared_context.add_raw_extracted_term_pair(
                            src_term, tgt_term
                        )
        except Exception as e:
            logger.warning(f"Error during automatic terms extract: {e}")
            return
        finally:
            self._advance(pbar, len(paragraphs.paragraphs))

    def procress(self, doc_il: ILDocument):
        """Run term extraction over the whole document.

        Submits every page to a ``PriorityThreadPoolExecutor``, finalizes the
        auto-extracted glossary, records the token usage consumed by this stage
        and, when debugging or a working directory is configured, writes the
        tracking, frequency and glossary files.

        Args:
            doc_il: Document whose pages are processed.
        """
        logger.info(f"{self.stage_name}: Starting term extraction for document.")
        start_total, start_prompt, start_completion, start_cache_hit_prompt = (
            self._snapshot_token_usage()
        )
        tracker = DocumentTermExtractTracker()
        total = sum(len(page.pdf_paragraph) for page in doc_il.page)
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            total,
        ) as pbar:
            max_workers = self.translation_config.term_pool_max_workers
            logger.info(
                f"Using {max_workers} worker threads for automatic term extraction."
            )
            with PriorityThreadPoolExecutor(
                max_workers=max_workers,
            ) as executor:
                for page in doc_il.page:
                    self.process_page(page, executor, pbar, tracker.new_page())

        self.shared_context.finalize_auto_extracted_glossary()
        end_total, end_prompt, end_completion, end_cache_hit_prompt = (
            self._snapshot_token_usage()
        )
        # Usage of this stage is the counter delta across the run.
        self.translation_config.record_term_extraction_usage(
            end_total - start_total,
            end_prompt - start_prompt,
            end_completion - start_completion,
            end_cache_hit_prompt - start_cache_hit_prompt,
        )

        if (
            self.translation_config.debug
            or self.translation_config.working_dir is not None
        ):
            path = self.translation_config.get_working_file_path(
                "term_extractor_tracking.json"
            )
            logger.debug(f"save translate tracking to {path}")
            with Path(path).open("w", encoding="utf-8") as f:
                f.write(tracker.to_json())

            path = self.translation_config.get_working_file_path(
                "term_extractor_freq.json"
            )
            logger.debug(f"save term frequency to {path}")
            with Path(path).open("w", encoding="utf-8") as f:
                json.dump(
                    self.shared_context.raw_extracted_terms,
                    f,
                    ensure_ascii=False,
                    indent=2,
                )

            path = self.translation_config.get_working_file_path(
                "auto_extractor_glossary.csv"
            )
            logger.debug(f"save auto extracted glossary to {path}")
            with Path(path).open("w", encoding="utf-8") as f:
                auto_extracted_glossary = self.shared_context.auto_extracted_glossary
                if auto_extracted_glossary:
                    f.write(auto_extracted_glossary.to_csv())
class DetectScannedFile:
    """Pipeline stage that decides whether the input PDF is a scanned document.

    A page is considered scanned when rendering it before and after
    ``PDFCreater.update_page_content_stream`` yields nearly identical images.
    """

    stage_name = "DetectScannedFile"

    def __init__(self, translation_config: TranslationConfig):
        self.translation_config = translation_config

    def _save_debug_box_to_page(self, page: il_version_1.Page, similarity: float):
        """Append a green "scanned score" label to ``page`` when debugging.

        Args:
            page: Page that receives the debug paragraph.
            similarity: Similarity score in ``[0, 1]``; shown as a percentage.
        """
        if not self.translation_config.debug:
            return

        cropbox = page.cropbox.box
        page_width = cropbox.x2 - cropbox.x
        page_height = cropbox.y2 - cropbox.y

        style = il_version_1.PdfStyle(
            font_id="base",
            font_size=4,
            graphic_state=GREEN,
        )
        unicode = f"scanned score: {similarity * 100:.2f} %"

        # Create text label at top-left corner.
        # Note: PDF coordinates are from bottom-left, so y2 is the top edge;
        # the label is inset by 3% of the page size.
        label_box = il_version_1.Box(
            x=cropbox.x + page_width * 0.03,
            y=cropbox.y,
            x2=cropbox.x2,
            y2=cropbox.y2 - page_height * 0.03,
        )
        page.pdf_paragraph.append(
            il_version_1.PdfParagraph(
                first_line_indent=False,
                box=label_box,
                vertical=False,
                pdf_style=style,
                unicode=unicode,
                pdf_paragraph_composition=[
                    il_version_1.PdfParagraphComposition(
                        pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
                            unicode=unicode,
                            pdf_style=style,
                            debug_info=True,
                        ),
                    ),
                ],
                xobj_id=-1,
            ),
        )

    def fast_check(self, doc: pymupdf.Document) -> bool:
        """Cheap heuristic based on raw content streams.

        Every content stream adds one hit to its page when it contains a
        ``/Artifact`` or ``/P`` marked-content operator and another hit when it
        sets text render mode 3 (``3 Tr``).

        Args:
            doc: Opened document; a falsy/empty document yields ``False``.

        Returns:
            ``True`` when the total number of hits exceeds 80% of the page count.
        """
        if not doc:
            return False

        hit_list = [0] * len(doc)
        for page in doc:
            for xref in page.get_contents():
                contents = doc.xref_stream(xref)
                if not contents:
                    # Nothing to scan (and searching ``None`` would raise).
                    continue
                if regex.search(
                    rb"(/Artifact|/P)(\s*\<\<\s*/MCID\s+|\s+BDC)", contents
                ):
                    hit_list[page.number] += 1
                if regex.search(rb"\s3\s+Tr\s", contents):
                    hit_list[page.number] += 1
        return bool(sum(hit_list) > len(doc) * 0.8)

    def process(
        self, docs: il_version_1.Document, original_pdf_path, mediabox_data: dict
    ):
        """Detect scanned pages and react when most of the document is scanned.

        Pages selected by ``should_translate_page`` are checked one by one
        until the outcome is certain. When at least 80% of them (and at least
        one) are scanned, either the OCR workaround is switched on or
        ``ScannedPDFError`` is raised, depending on
        ``auto_enable_ocr_workaround``.

        Args:
            docs: Intermediate-language document.
            original_pdf_path: Path forwarded to ``PDFCreater``.
            mediabox_data: Media box data forwarded to ``PDFCreater``.

        Raises:
            ScannedPDFError: If the document is scanned and the OCR workaround
                may not be enabled automatically.
        """
        pdf_creater = PDFCreater(
            original_pdf_path, docs, self.translation_config, mediabox_data
        )
        # Get pages that need to be translated
        pages_to_translate = [
            page
            for page in docs.page
            if self.translation_config.should_translate_page(page.page_number + 1)
        ]
        if not pages_to_translate:
            return

        total = len(pages_to_translate)
        threshold = max(0.8 * total, 1)
        non_scanned_threshold = total - threshold
        # NOTE(review): with a single page ``non_scanned_threshold`` is 0, so
        # the page is never rendered and never counted as scanned - confirm
        # that this is intended.
        scanned = 0
        non_scanned = 0

        mupdf = pymupdf.open(self.translation_config.get_working_file_path("input.pdf"))
        try:
            with self.translation_config.progress_monitor.stage_start(
                self.stage_name,
                total,
            ) as progress:
                for page in pages_to_translate:
                    if scanned < threshold and non_scanned < non_scanned_threshold:
                        # Only continue detection if both counts are below thresholds
                        if self.detect_page_is_scanned(page, mupdf, pdf_creater):
                            scanned += 1
                        else:
                            non_scanned += 1
                    else:
                        # We have enough information to determine document type
                        non_scanned += 1
                    progress.advance(1)
        finally:
            # The document is only needed for rendering; release its handle.
            mupdf.close()

        if scanned < threshold:
            return

        if self.translation_config.auto_enable_ocr_workaround:
            logger.warning(
                f"Detected {scanned} scanned pages, which is more than 80% of the total pages. "
                "Turning on OCR workaround.",
            )
            self.translation_config.shared_context_cross_split_part.auto_enabled_ocr_workaround = True
            self.translation_config.ocr_workaround = True
            self.translation_config.skip_scanned_detection = True
            self.translation_config.disable_rich_text_translate = True
            self.clean_render_order_for_chars(docs)
            self.translation_config.remove_non_formula_lines = False
        else:
            logger.warning(
                f"Detected {scanned} scanned pages, which is more than 80% of the total pages. "
                "Please check the input PDF file.",
            )
            raise ScannedPDFError("Scanned PDF detected.")

    def clean_render_order_for_chars(self, docs: il_version_1.Document):
        """Reset ``render_order`` of every character and paint non-debug ones black."""
        for page in docs.page:
            for char in page.pdf_character:
                char.render_order = None
                if not char.debug_info:
                    char.pdf_style.graphic_state = BLACK

    @staticmethod
    def _render_page(pdf, page_number):
        """Render a page and return its pixels with the channel order reversed."""
        pixmap = pdf[page_number].get_pixmap()
        return np.frombuffer(pixmap.samples, np.uint8).reshape(
            pixmap.height,
            pixmap.width,
            3,
        )[:, :, ::-1]

    def detect_page_is_scanned(
        self, page: il_version_1.Page, pdf: pymupdf.Document, pdf_creater: PDFCreater
    ) -> bool:
        """Tell whether rewriting the page's content stream leaves it visually unchanged.

        The page is rendered, ``pdf_creater.update_page_content_stream`` is
        applied to ``pdf`` (which modifies the document in place), and the page
        is rendered again.

        Returns:
            ``True`` when the structural similarity of the two grayscale
            renderings is above 0.95.
        """
        before_page_image = self._render_page(pdf, page.page_number)
        pdf_creater.update_page_content_stream(
            False, page, pdf, self.translation_config, True
        )
        after_page_image = self._render_page(pdf, page.page_number)

        before_page_image = cv2.cvtColor(before_page_image, cv2.COLOR_RGB2GRAY)
        after_page_image = cv2.cvtColor(after_page_image, cv2.COLOR_RGB2GRAY)
        similarity = structural_similarity(before_page_image, after_page_image)
        return similarity > 0.95
babeldoc.format.pdf.document_il import PdfParagraphComposition from babeldoc.format.pdf.document_il import PdfSameStyleCharacters from babeldoc.format.pdf.document_il import PdfSameStyleUnicodeCharacters from babeldoc.format.pdf.document_il import PdfStyle from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper from babeldoc.format.pdf.document_il.utils.layout_helper import get_char_unicode_string from babeldoc.format.pdf.document_il.utils.layout_helper import get_paragraph_unicode from babeldoc.format.pdf.document_il.utils.layout_helper import is_same_style from babeldoc.format.pdf.document_il.utils.layout_helper import ( is_same_style_except_font, ) from babeldoc.format.pdf.document_il.utils.layout_helper import ( is_same_style_except_size, ) from babeldoc.format.pdf.document_il.utils.paragraph_helper import ( is_placeholder_only_paragraph, ) from babeldoc.format.pdf.document_il.utils.paragraph_helper import ( is_pure_numeric_paragraph, ) from babeldoc.format.pdf.document_il.utils.style_helper import GRAY80 from babeldoc.format.pdf.translation_config import TranslationConfig from babeldoc.translator.translator import BaseTranslator from babeldoc.utils.priority_thread_pool_executor import PriorityThreadPoolExecutor logger = logging.getLogger(__name__) PROMPT_TEMPLATE = Template( """$role_block ## Rules 1. Keep the structure exactly unchanged: do NOT add/remove/reorder any tags, placeholders, or tokens. 2. Keep all tags unchanged (e.g., ). - Translate human-readable text inside tags. - Do NOT translate text inside . 3. Do NOT translate or alter placeholders: {v1}, {name}, %s, %d, [[...]], %%...%%. 4. If the entire input is pure code/identifiers, return it unchanged. 5. Translate ALL human-readable content into $lang_out. $glossary_block $context_block ## Output Output ONLY the translated $lang_out text. No explanations, no backticks, no extra text. 
class DocumentTranslateTracker:
    """Holds the page-level translation trackers of a document and dumps them as JSON."""

    def __init__(self):
        self.page = []
        self.cross_page = []
        # Track paragraphs that are combined due to cross-column detection within the same page
        self.cross_column = []

    @staticmethod
    def _register(bucket):
        """Create a page tracker, store it in ``bucket`` and hand it back."""
        page_tracker = PageTranslateTracker()
        bucket.append(page_tracker)
        return page_tracker

    def new_page(self):
        """Create and return a tracker for a regular page."""
        return self._register(self.page)

    def new_cross_page(self):
        """Create and return a tracker stored in the ``cross_page`` list."""
        return self._register(self.cross_page)

    def new_cross_column(self):
        """Create and return a new PageTranslateTracker dedicated to cross-column merging."""
        return self._register(self.cross_column)

    def to_json(self):
        """Serialize all trackers to an indented JSON string (non-ASCII kept as is)."""

        def dump(page_trackers):
            return [
                {"paragraph": self.convert_paragraph(page_tracker)}
                for page_tracker in page_trackers
            ]

        regular = dump(self.page)
        across_pages = dump(self.cross_page)
        across_columns = dump(self.cross_column)
        payload = {
            "cross_page": across_pages,
            "cross_column": across_columns,
            "page": regular,
        }
        return json.dumps(payload, ensure_ascii=False, indent=2)

    def convert_paragraph(self, page):
        """Convert the paragraph trackers of ``page`` into JSON-ready dicts.

        Paragraph trackers without ``pdf_unicode`` or without ``input`` are
        left out of the result.
        """
        converted = []
        for para in page.paragraph:
            source_text = getattr(para, "input", None)
            translated_text = getattr(para, "output", None)
            pdf_unicode = getattr(para, "pdf_unicode", None)
            llm_trackers = getattr(para, "llm_translate_trackers", None)
            placeholders = getattr(para, "placeholders", None)
            original_placeholders = getattr(para, "original_placeholders", None)
            removed_placeholders = getattr(
                para, "removed_hallucinated_placeholders", None
            )

            llm_trackers_json = [item.to_dict() for item in (llm_trackers or [])]
            placeholders_json = [item.to_dict() for item in (placeholders or [])]

            if pdf_unicode is not None and source_text is not None:
                converted.append(
                    {
                        "input": source_text,
                        "output": translated_text,
                        "pdf_unicode": pdf_unicode,
                        "llm_translate_trackers": llm_trackers_json,
                        "placeholders": placeholders_json,
                        "multi_paragraph_id": getattr(
                            para, "multi_paragraph_id", None
                        ),
                        "multi_paragraph_index": getattr(
                            para, "multi_paragraph_index", None
                        ),
                        "original_placeholders": original_placeholders,
                        "removed_hallucinated_placeholders": removed_placeholders,
                    }
                )
        return converted
self.has_error = True self.error_message = error_message def set_placeholder_full_match(self): self.placeholder_full_match = True def set_fallback_to_translate(self): self.fallback_to_translate = True def to_dict(self): return { "input": self.input, "output": self.output, "has_error": self.has_error, "error_message": self.error_message, "placeholder_full_match": self.placeholder_full_match, "fallback_to_translate": self.fallback_to_translate, } class ILTranslator: stage_name = "Translate Paragraphs" def __init__( self, translate_engine: BaseTranslator, translation_config: TranslationConfig, tokenizer=None, ): self.translate_engine = translate_engine self.translation_config = translation_config self.font_mapper = FontMapper(translation_config) self.shared_context_cross_split_part = ( translation_config.shared_context_cross_split_part ) if tokenizer is None: self.tokenizer = tiktoken.encoding_for_model("gpt-4o") else: self.tokenizer = tokenizer # Cache glossaries at initialization self._cached_glossaries = ( self.shared_context_cross_split_part.get_glossaries_for_translation( self.translation_config.auto_extract_glossary ) ) self.support_llm_translate = False try: if translate_engine and hasattr(translate_engine, "do_llm_translate"): translate_engine.do_llm_translate(None) self.support_llm_translate = True except NotImplementedError: self.support_llm_translate = False self.use_as_fallback = False self.add_content_filter_hint_lock = threading.Lock() self.docs = None # Pre-compile patterns for placeholder-like tokens that may be hallucinated by LLM. # We only consider the same shapes as our own formula & rich-text placeholders. 
self._formula_placeholder_pattern = re.compile( self.translate_engine.get_formular_placeholder(r"\d+")[1], re.IGNORECASE ) self._style_left_placeholder_pattern = re.compile( self.translate_engine.get_rich_text_left_placeholder(r"\d+")[1], re.IGNORECASE, ) self._style_right_placeholder_pattern = re.compile( self.translate_engine.get_rich_text_right_placeholder(r"\d+")[1], re.IGNORECASE, ) def calc_token_count(self, text: str) -> int: try: return len(self.tokenizer.encode(text, disallowed_special=())) except Exception: return 0 def translate(self, docs: Document): self.docs = docs tracker = DocumentTranslateTracker() if not self.translation_config.shared_context_cross_split_part.first_paragraph: # Try to find the first title paragraph title_paragraph = self.find_title_paragraph(docs) self.translation_config.shared_context_cross_split_part.first_paragraph = ( copy.deepcopy(title_paragraph) ) self.translation_config.shared_context_cross_split_part.recent_title_paragraph = copy.deepcopy( title_paragraph ) if title_paragraph: logger.info(f"Found first title paragraph: {title_paragraph.unicode}") # count total paragraph total = sum(len(page.pdf_paragraph) for page in docs.page) with self.translation_config.progress_monitor.stage_start( self.stage_name, total, ) as pbar: with PriorityThreadPoolExecutor( max_workers=self.translation_config.pool_max_workers, ) as executor: for page in docs.page: self.process_page(page, executor, pbar, tracker.new_page()) path = self.translation_config.get_working_file_path("translate_tracking.json") if ( self.translation_config.debug or self.translation_config.working_dir is not None ): logger.debug(f"save translate tracking to {path}") with Path(path).open("w", encoding="utf-8") as f: f.write(tracker.to_json()) def find_title_paragraph(self, docs: Document) -> PdfParagraph | None: """Find the first paragraph with layout_label 'title' in the document. 
Args: docs: The document to search in Returns: The first title paragraph found, or None if no title paragraph exists """ for page in docs.page: for paragraph in page.pdf_paragraph: if paragraph.layout_label == "title": logger.info(f"Found title paragraph: {paragraph.unicode}") return paragraph return None def process_page( self, page: Page, executor: PriorityThreadPoolExecutor, pbar: tqdm | None = None, tracker: PageTranslateTracker = None, ): self.translation_config.raise_if_cancelled() for paragraph in page.pdf_paragraph: page_font_map = {} for font in page.pdf_font: page_font_map[font.font_id] = font page_xobj_font_map = {} for xobj in page.pdf_xobject: page_xobj_font_map[xobj.xobj_id] = page_font_map.copy() for font in xobj.pdf_font: page_xobj_font_map[xobj.xobj_id][font.font_id] = font # self.translate_paragraph(paragraph, pbar,tracker.new_paragraph(), page_font_map, page_xobj_font_map) paragraph_token_count = self.calc_token_count(paragraph.unicode) if paragraph.layout_label == "title": self.shared_context_cross_split_part.recent_title_paragraph = ( copy.deepcopy(paragraph) ) executor.submit( self.translate_paragraph, paragraph, page, pbar, tracker.new_paragraph(), page_font_map, page_xobj_font_map, priority=1048576 - paragraph_token_count, paragraph_token_count=paragraph_token_count, title_paragraph=self.translation_config.shared_context_cross_split_part.first_paragraph, local_title_paragraph=self.translation_config.shared_context_cross_split_part.recent_title_paragraph, ) class TranslateInput: def __init__( self, unicode: str, placeholders: list[RichTextPlaceholder | FormulaPlaceholder], base_style: PdfStyle = None, ): self.unicode = unicode self.placeholders = placeholders self.base_style = base_style # Original placeholder-like tokens extracted from the source text. # Key: exact matched token string; Value: occurrence count. 
def create_formula_placeholder(
    self,
    formula: PdfFormula,
    formula_id: int,
    paragraph: PdfParagraph,
):
    """Create a formula placeholder that does not clash with the paragraph text.

    The translate engine is asked for the placeholder of ``formula_id``; it may
    return either the token alone or a ``(token, regex_pattern)`` tuple. When
    the pattern already occurs in ``paragraph.unicode`` the next id is tried.

    Args:
        formula: Formula the placeholder stands for.
        formula_id: First id to try.
        paragraph: Paragraph whose text must not already contain the token.

    Returns:
        A ``FormulaPlaceholder`` carrying the id that was finally accepted.
    """
    placeholder = self.translate_engine.get_formular_placeholder(formula_id)
    if isinstance(placeholder, tuple):
        placeholder, regex_pattern = placeholder
    else:
        regex_pattern = re.escape(placeholder)
    # re.search, not re.match: a clash anywhere in the text is a problem,
    # whereas re.match would only notice one at the very start.
    if re.search(regex_pattern, paragraph.unicode or "", re.IGNORECASE):
        return self.create_formula_placeholder(formula, formula_id + 1, paragraph)

    return FormulaPlaceholder(formula_id, formula, placeholder, regex_pattern)


def create_rich_text_placeholder(
    self,
    composition: PdfSameStyleCharacters,
    composition_id: int,
    paragraph: PdfParagraph,
):
    """Create a left/right rich-text placeholder pair free of clashes.

    Both tokens come from the translate engine, each either as a plain token or
    as a ``(token, regex_pattern)`` tuple. If either pattern already occurs in
    ``paragraph.unicode`` the next id is tried.

    Args:
        composition: Styled character run the pair will wrap.
        composition_id: First id to try.
        paragraph: Paragraph whose text must not already contain the tokens.

    Returns:
        A ``RichTextPlaceholder`` carrying the id that was finally accepted.
    """
    left_placeholder = self.translate_engine.get_rich_text_left_placeholder(
        composition_id,
    )
    right_placeholder = self.translate_engine.get_rich_text_right_placeholder(
        composition_id,
    )
    if isinstance(left_placeholder, tuple):
        left_placeholder, left_placeholder_regex_pattern = left_placeholder
    else:
        left_placeholder_regex_pattern = re.escape(left_placeholder)
    if isinstance(right_placeholder, tuple):
        right_placeholder, right_placeholder_regex_pattern = right_placeholder
    else:
        right_placeholder_regex_pattern = re.escape(right_placeholder)
    # Search the whole text; re.match would only detect a clash at position 0.
    if re.search(
        f"{left_placeholder_regex_pattern}|{right_placeholder_regex_pattern}",
        paragraph.unicode or "",
        re.IGNORECASE,
    ):
        return self.create_rich_text_placeholder(
            composition,
            composition_id + 1,
            paragraph,
        )

    return RichTextPlaceholder(
        composition_id,
        composition,
        left_placeholder,
        right_placeholder,
        left_placeholder_regex_pattern,
        right_placeholder_regex_pattern,
    )
", ) return None # 如果没有指定 disable_rich_text_translate,使用配置中的值 if disable_rich_text_translate is None: disable_rich_text_translate = ( self.translation_config.disable_rich_text_translate ) placeholder_id = 1 placeholders = [] chars = [] for composition in paragraph.pdf_paragraph_composition: if composition.pdf_line: chars.extend(composition.pdf_line.pdf_character) elif composition.pdf_formula: formula_placeholder = self.create_formula_placeholder( composition.pdf_formula, placeholder_id, paragraph, ) placeholders.append(formula_placeholder) # 公式只需要一个占位符,所以 id+1 placeholder_id = formula_placeholder.id + 1 chars.extend(formula_placeholder.placeholder) elif composition.pdf_character: chars.append(composition.pdf_character) elif composition.pdf_same_style_characters: if disable_rich_text_translate: # 如果禁用富文本翻译,直接添加字符 chars.extend(composition.pdf_same_style_characters.pdf_character) continue fonta = self.font_mapper.map( page_font_map[ composition.pdf_same_style_characters.pdf_style.font_id ], "1", ) fontb = self.font_mapper.map( page_font_map[paragraph.pdf_style.font_id], "1", ) if ( # 样式和段落基准样式一致,无需占位符 is_same_style( composition.pdf_same_style_characters.pdf_style, paragraph.pdf_style, ) # 字号差异在 0.7-1.3 之间,可能是首字母变大效果,无需占位符 or is_same_style_except_size( composition.pdf_same_style_characters.pdf_style, paragraph.pdf_style, ) or ( # 除了字体以外样式都和基准一样,并且字体都映射到同一个字体。无需占位符 is_same_style_except_font( composition.pdf_same_style_characters.pdf_style, paragraph.pdf_style, ) and fonta and fontb and fonta.font_id == fontb.font_id ) # or len(composition.pdf_same_style_characters.pdf_character) == 1 ): chars.extend(composition.pdf_same_style_characters.pdf_character) continue placeholder = self.create_rich_text_placeholder( composition.pdf_same_style_characters, placeholder_id, paragraph, ) placeholders.append(placeholder) # 样式需要一左一右两个占位符,所以 id+2 placeholder_id = placeholder.id + 2 chars.append(placeholder.left_placeholder) 
def parse_translate_output(
    self,
    input_text: TranslateInput,
    output: str,
    tracker: ParagraphTranslateTracker | None = None,
    llm_translate_tracker: LLMTranslateTracker | None = None,
) -> list[PdfParagraphComposition]:
    """Split translated text back into paragraph compositions.

    Formula placeholders found in ``output`` become formula compositions,
    left/right rich-text pairs become styled runs, and everything in between
    becomes text in the paragraph's base style.

    Args:
        input_text: The translate input that produced ``output``; supplies the
            placeholders, the base style and the original placeholder tokens.
        output: Translated text returned by the engine.
        tracker: If given, receives every placeholder-like token that was
            removed because it is not in the allowed set.
        llm_translate_tracker: If given, is marked as a full placeholder match
            when every placeholder pattern is found in ``output``.

    Returns:
        The compositions in the order they occur in ``output``.
    """

    def text_composition(unicode_text, style):
        # All plain/styled text runs are built the same way.
        comp = PdfParagraphComposition()
        comp.pdf_same_style_unicode_characters = PdfSameStyleUnicodeCharacters()
        comp.pdf_same_style_unicode_characters.unicode = unicode_text
        comp.pdf_same_style_unicode_characters.pdf_style = style
        return comp

    # Without placeholders the whole output is a single base-style run.
    if not input_text.placeholders:
        if llm_translate_tracker:
            llm_translate_tracker.set_placeholder_full_match()
        return [text_composition(output, input_text.base_style)]

    # patterns: one alternative per placeholder (a rich-text pair includes its
    # enclosed text). placeholder_patterns: the bare tokens only, used to
    # scrub leftovers out of ordinary text.
    patterns = []
    placeholder_patterns = []
    for placeholder in input_text.placeholders:
        if isinstance(placeholder, FormulaPlaceholder):
            pattern = placeholder.regex_pattern
            patterns.append(f"({pattern})")
            placeholder_patterns.append(f"({pattern})")
        else:
            left = placeholder.left_regex_pattern
            right = placeholder.right_regex_pattern
            patterns.append(f"({left}.*?{right})")
            placeholder_patterns.append(f"({left})")
            placeholder_patterns.append(f"({right})")

    if all(re.search(pattern, output, flags=re.IGNORECASE) for pattern in patterns):
        if llm_translate_tracker:
            llm_translate_tracker.set_placeholder_full_match()
    else:
        logger.debug(f"Failed to match all placeholder for {input_text.unicode}")

    combined_pattern = "|".join(patterns)
    combined_placeholder_pattern = "|".join(placeholder_patterns)

    # Build allowed placeholder tokens: originals from source + placeholders we injected.
    allowed_placeholder_tokens: set[str] = set()
    if getattr(input_text, "original_placeholder_tokens", None):
        allowed_placeholder_tokens.update(input_text.original_placeholder_tokens)
    for placeholder in input_text.placeholders:
        if isinstance(placeholder, FormulaPlaceholder):
            allowed_placeholder_tokens.add(placeholder.placeholder)
        else:
            allowed_placeholder_tokens.add(placeholder.left_placeholder)
            allowed_placeholder_tokens.add(placeholder.right_placeholder)

    def remove_placeholder(text: str):
        """Remove placeholder artifacts and hallucinated placeholder-like tokens."""
        # First, remove any leftover placeholders built from our own regex patterns.
        if combined_placeholder_pattern:
            text = re.sub(
                combined_placeholder_pattern,
                "",
                text,
                flags=re.IGNORECASE,
            )

        # Then, detect placeholder-like tokens of the same shapes as our own
        # formula and rich-text placeholders. Only keep those in the allowed set.
        def _replace_token(match: re.Match) -> str:
            token = match.group(0)
            if token in allowed_placeholder_tokens:
                return token
            if tracker is not None:
                tracker.record_removed_hallucinated_placeholder(token)
            return ""

        text = self._formula_placeholder_pattern.sub(_replace_token, text)
        text = self._style_left_placeholder_pattern.sub(_replace_token, text)
        text = self._style_right_placeholder_pattern.sub(_replace_token, text)
        return text

    result = []
    last_end = 0
    for match in re.finditer(combined_pattern, output, flags=re.IGNORECASE):
        # Ordinary text between the previous match and this one.
        if match.start() > last_end:
            text = output[last_end : match.start()]
            if text:
                result.append(
                    text_composition(remove_placeholder(text), input_text.base_style)
                )

        matched_text = match.group(0)

        # A formula placeholder must match the whole matched text.
        formula_placeholder = next(
            (
                p
                for p in input_text.placeholders
                if isinstance(p, FormulaPlaceholder)
                and re.match(f"^{p.regex_pattern}$", matched_text, re.IGNORECASE)
            ),
            None,
        )
        if formula_placeholder is not None:
            comp = PdfParagraphComposition()
            comp.pdf_formula = formula_placeholder.formula
            result.append(comp)
        else:
            # Rich-text pair: pick the placeholder whose left token opens the match.
            placeholder = next(
                p
                for p in input_text.placeholders
                if not isinstance(p, FormulaPlaceholder)
                and re.match(
                    f"^{p.left_regex_pattern}", matched_text, re.IGNORECASE
                )
            )
            text = re.match(
                f"^{placeholder.left_regex_pattern}(.*){placeholder.right_regex_pattern}$",
                matched_text,
                re.IGNORECASE,
            ).group(1)

            # If the enclosed text equals the original characters (spaces
            # ignored), reuse the original styled run unchanged.
            if isinstance(
                placeholder.composition,
                PdfSameStyleCharacters,
            ) and text.replace(" ", "") == "".join(
                x.char_unicode for x in placeholder.composition.pdf_character
            ).replace(
                " ",
                "",
            ):
                comp = PdfParagraphComposition(
                    pdf_same_style_characters=placeholder.composition,
                )
            else:
                comp = text_composition(
                    remove_placeholder(text),
                    placeholder.composition.pdf_style,
                )
            result.append(comp)

        last_end = match.end()

    # Ordinary text after the last match.
    if last_end < len(output):
        text = output[last_end:]
        if text:
            result.append(
                text_composition(remove_placeholder(text), input_text.base_style)
            )

    return result
def post_translate_paragraph(
    self,
    paragraph: PdfParagraph,
    tracker: ParagraphTranslateTracker,
    translate_input,
    translated_text: str,
):
    """Post-translation processing: update paragraph with translated text.

    Args:
        paragraph: Paragraph to update in place.
        tracker: Tracker that records the translated text.
        translate_input: The translate input built for this paragraph. Its
            ``unicode`` attribute is the text that was sent for translation;
            a plain string is accepted as well.
        translated_text: Text returned by the engine.

    Returns:
        ``False`` when the translation equals the source text (the paragraph is
        left untouched), ``True`` after the paragraph has been rewritten.
    """
    tracker.set_output(translated_text)
    # Compare against the source text. Comparing the string with the input
    # object itself is never equal, which made this early exit unreachable.
    source_text = getattr(translate_input, "unicode", translate_input)
    if translated_text == source_text:
        if llm_translate_tracker := tracker.last_llm_translate_tracker():
            llm_translate_tracker.set_placeholder_full_match()
        return False
    paragraph.unicode = translated_text
    paragraph.pdf_paragraph_composition = self.parse_translate_output(
        translate_input,
        translated_text,
        tracker,
        tracker.last_llm_translate_tracker(),
    )
    # Text runs that came back without a style inherit the paragraph style.
    for composition in paragraph.pdf_paragraph_composition:
        if (
            composition.pdf_same_style_unicode_characters
            and composition.pdf_same_style_unicode_characters.pdf_style is None
        ):
            composition.pdf_same_style_unicode_characters.pdf_style = (
                paragraph.pdf_style
            )
    return True
def _build_glossary_block(self, text: str) -> str:
    """Render the glossary section of the LLM prompt.

    Args:
        text: Text each cached glossary is asked to match its entries against.

    Returns:
        A markdown section with one table per glossary that has active
        entries for ``text``, or an empty string when there are none.
    """
    if not self._cached_glossaries:
        return ""

    # Glossary name -> sorted active entries; glossaries without hits are dropped.
    active_by_name: dict[str, list[tuple[str, str]]] = {
        glossary.name: sorted(hits)
        for glossary in self._cached_glossaries
        if (hits := glossary.get_active_entries_for_text(text))
    }
    if not active_by_name:
        return ""

    lines: list[str] = [
        "## Glossary",
        "",
        "Always use the glossary's **Target Term** for any occurrence of its **Source Term** "
        "(including variants, inside tags, or broken across lines).",
        "",
        "Unlisted terms are translated naturally.",
        "",
    ]
    for name, rows in active_by_name.items():
        lines.extend(
            [
                f"### Glossary: {name}",
                "",
                "| Source Term | Target Term |\n|-------------|-------------|",
            ]
        )
        lines.extend(f"| {source} | {target} |" for source, target in rows)
        lines.append("")

    return "\n".join(lines)
def translate_paragraph(
    self,
    paragraph: PdfParagraph,
    page: Page,
    pbar: tqdm | None = None,
    tracker: ParagraphTranslateTracker = None,
    page_font_map: dict[str, PdfFont] = None,
    xobj_font_map: dict[int, dict[str, PdfFont]] = None,
    paragraph_token_count: int = 0,
    title_paragraph: PdfParagraph | None = None,
    local_title_paragraph: PdfParagraph | None = None,
):
    """Translate one paragraph in place: prepare, call the engine, write back.

    Errors never propagate out of the ``try`` block: a ``ContentFilterError``
    adds a hint paragraph to ``page``; any other exception is logged and the
    paragraph is left as it is.

    Args:
        paragraph: Paragraph to translate.
        page: Page owning the paragraph; receives the content-filter hint.
        pbar: Progress bar handed to ``PbarContext``.
        tracker: Per-paragraph tracker for inputs and outputs.
        page_font_map: Font id -> font for the page.
        xobj_font_map: XObject id -> font map for that XObject.
        paragraph_token_count: Token count passed on as rate-limit parameter.
        title_paragraph: First title of the document, used as prompt context.
        local_title_paragraph: Most recent title, used as prompt context.
    """
    self.translation_config.raise_if_cancelled()
    with PbarContext(pbar):
        try:
            if self.use_as_fallback:
                # il translator llm only modifies unicode in some situations
                paragraph.unicode = get_paragraph_unicode(paragraph)

            text, translate_input = self.pre_translate_paragraph(
                paragraph, tracker, page_font_map, xobj_font_map
            )
            if text is None:
                # Nothing to translate for this paragraph.
                return

            llm_translate_tracker = tracker.new_llm_translate_tracker()
            rate_limit_params = {"paragraph_token_count": paragraph_token_count}

            if not self.support_llm_translate:
                translated_text = self.translate_engine.translate(
                    text,
                    rate_limit_params=rate_limit_params,
                )
            else:
                llm_prompt = self.generate_prompt_for_llm(
                    text,
                    title_paragraph,
                    local_title_paragraph,
                    translate_input,
                )
                llm_translate_tracker.set_input(llm_prompt)
                translated_text = self.translate_engine.llm_translate(
                    llm_prompt,
                    rate_limit_params=rate_limit_params,
                )
                llm_translate_tracker.set_output(translated_text)

            # Collapse runs of 20+ dots/spaces/ellipsis characters into one dot.
            translated_text = re.sub(r"[. 。…,]{20,}", ".", translated_text)

            self.post_translate_paragraph(
                paragraph, tracker, translate_input, translated_text
            )
        except ContentFilterError as e:
            logger.warning(f"ContentFilterError: {e.message}")
            self.add_content_filter_hint(page, paragraph)
        except Exception as e:
            # Deliberate best effort: log the failure and carry on.
            logger.exception(
                f"Error translating paragraph. Paragraph: {paragraph.debug_id} ({paragraph.unicode}). Error: {e}. ",
            )
class BatchParagraph:
    """A group of paragraphs translated together in one request.

    Attributes are set in ``__init__``: ``paragraphs`` and ``pages`` are stored
    as given, and ``trackers`` holds one tracker per paragraph obtained from
    ``page_tracker.new_paragraph()``.
    """

    def __init__(
        self,
        paragraphs: list[PdfParagraph],
        pages: list[Page],
        page_tracker: PageTranslateTracker,
    ):
        self.paragraphs = paragraphs
        self.pages = pages
        # One fresh tracker per paragraph, in paragraph order.
        paragraph_trackers = []
        for _paragraph in paragraphs:
            paragraph_trackers.append(page_tracker.new_paragraph())
        self.trackers = paragraph_trackers
def translate(self, docs: Document) -> None:
    """Translate every paragraph of ``docs`` in batches.

    Cross-page pairs are queued first, then cross-column pairs per page, then
    the remaining paragraphs of each page. Afterwards the tracking data is
    written to the working directory when debugging or when a working
    directory is configured.

    Args:
        docs: Document whose pages are translated in place.
    """
    self.il_translator.docs = docs
    tracker = DocumentTranslateTracker()
    self.mid = 0

    shared_context = self.translation_config.shared_context_cross_split_part
    if not shared_context.first_paragraph:
        # Try to find the first title paragraph
        title_paragraph = self.find_title_paragraph(docs)
        shared_context.first_paragraph = copy.deepcopy(title_paragraph)
        shared_context.recent_title_paragraph = copy.deepcopy(title_paragraph)
        if title_paragraph:
            logger.info(f"Found first title paragraph: {title_paragraph.unicode}")

    # Progress total: paragraphs that carry both a debug id and text.
    total = sum(
        1
        for page in docs.page
        for p in page.pdf_paragraph
        if p.debug_id is not None and p.unicode is not None
    )

    translated_ids = set()
    with self.translation_config.progress_monitor.stage_start(
        self.stage_name,
        total,
    ) as pbar:
        with PriorityThreadPoolExecutor(
            max_workers=self.translation_config.pool_max_workers,
        ) as executor2:
            with PriorityThreadPoolExecutor(
                max_workers=self.translation_config.pool_max_workers,
            ) as executor:
                self.process_cross_page_paragraph(
                    docs,
                    executor,
                    pbar,
                    tracker,
                    executor2,
                    translated_ids,
                )
                # Cross-column detection per page (after cross-page processing)
                for page in docs.page:
                    self.process_cross_column_paragraph(
                        page,
                        executor,
                        pbar,
                        tracker,
                        executor2,
                        translated_ids,
                    )
                for page in docs.page:
                    self.process_page(
                        page,
                        executor,
                        pbar,
                        tracker.new_page(),
                        executor2,
                        translated_ids,
                    )

    path = self.translation_config.get_working_file_path("translate_tracking.json")
    should_dump = (
        self.translation_config.debug
        or self.translation_config.working_dir is not None
    )
    if should_dump:
        logger.debug(f"save translate tracking to {path}")
        with Path(path).open("w", encoding="utf-8") as f:
            f.write(tracker.to_json())
    logger.info(
        f"Translation completed. Total: {self.total_count}, Successful: {self.ok_count}, Fallback: {self.fallback_count}"
    )
def process_cross_page_paragraph(
    self,
    docs: Document,
    executor: PriorityThreadPoolExecutor,
    pbar: tqdm | None = None,
    tracker: DocumentTranslateTracker | None = None,
    executor2: PriorityThreadPoolExecutor | None = None,
    translated_ids: set[int] | None = None,
):
    """Translate page-straddling paragraph pairs together.

    For each pair of adjacent pages, the last body-text paragraph of the first
    page and the first body-text paragraph of the second page are submitted as
    one batch, and both are recorded in ``translated_ids``.

    Args:
        docs: Document containing pages to process
        executor: Executor the batch translation tasks are submitted to
        pbar: Progress bar forwarded to the tasks
        tracker: Document tracker; a new one is created when omitted
        executor2: Secondary executor forwarded to the tasks
        translated_ids: ``id()`` values of paragraphs already queued; a new
            set is created when omitted
    """
    self.translation_config.raise_if_cancelled()

    tracker = DocumentTranslateTracker() if tracker is None else tracker
    translated_ids = set() if translated_ids is None else translated_ids

    # Walk adjacent page pairs.
    for page_curr, page_next in zip(docs.page, docs.page[1:]):
        tail_candidates = self._filter_paragraphs(
            page_curr, translated_ids, require_body_text=True
        )
        head_candidates = self._filter_paragraphs(
            page_next, translated_ids, require_body_text=True
        )
        if not (tail_candidates and head_candidates):
            continue

        last_curr_paragraph = tail_candidates[-1]
        first_next_paragraph = head_candidates[0]

        # Skip if either paragraph is already translated
        already_done = (
            id(last_curr_paragraph) in translated_ids
            or id(first_next_paragraph) in translated_ids
        )
        if already_done:
            continue

        # Fonts of both pages, with the next page winning on id clashes.
        curr_font_map, curr_xobj_font_map = self._build_font_maps(page_curr)
        next_font_map, next_xobj_font_map = self._build_font_maps(page_next)
        merged_font_map = curr_font_map | next_font_map
        merged_xobj_font_map = curr_xobj_font_map | next_xobj_font_map

        total_token_count = sum(
            self.calc_token_count(p.unicode)
            for p in (last_curr_paragraph, first_next_paragraph)
        )

        batch_paragraph = BatchParagraph(
            [last_curr_paragraph, first_next_paragraph],
            [page_curr, page_next],
            tracker.new_cross_page(),
        )
        self.mid += 1

        # Submit translation task (force submit regardless of token count)
        executor.submit(
            self.translate_paragraph,
            batch_paragraph,
            pbar,
            merged_font_map,
            merged_xobj_font_map,
            self.translation_config.shared_context_cross_split_part.first_paragraph,
            self.translation_config.shared_context_cross_split_part.recent_title_paragraph,
            executor2,
            priority=1048576 - total_token_count,
            paragraph_token_count=total_token_count,
            mp_id=self.mid,
        )

        # Mark paragraphs as translated
        translated_ids.update((id(last_curr_paragraph), id(first_next_paragraph)))
""" self.translation_config.raise_if_cancelled() if tracker is None: tracker = DocumentTranslateTracker() if translated_ids is None: translated_ids = set() # Filter body-text paragraphs maintaining original order body_paragraphs = self._filter_paragraphs( page, translated_ids, require_body_text=True ) if len(body_paragraphs) < 2: return # Build font maps once for the whole page page_font_map, page_xobj_font_map = self._build_font_maps(page) for idx in range(len(body_paragraphs) - 1): p1 = body_paragraphs[idx] p2 = body_paragraphs[idx + 1] # Skip already translated if id(p1) in translated_ids or id(p2) in translated_ids: continue # Safety checks for box information if not ( p1.box and p2.box and p1.box.y2 is not None and p2.box.y2 is not None ): continue if p2.box.y2 - p1.box.y2 <= 20: continue total_token_count = self.calc_token_count( p1.unicode ) + self.calc_token_count(p2.unicode) batch = BatchParagraph([p1, p2], [page, page], tracker.new_cross_column()) self.mid += 1 executor.submit( self.translate_paragraph, batch, pbar, page_font_map, page_xobj_font_map, self.translation_config.shared_context_cross_split_part.first_paragraph, self.translation_config.shared_context_cross_split_part.recent_title_paragraph, executor2, priority=1048576 - total_token_count, paragraph_token_count=total_token_count, mp_id=self.mid, ) translated_ids.add(id(p1)) translated_ids.add(id(p2)) def process_page( self, page: Page, executor: PriorityThreadPoolExecutor, pbar: tqdm | None = None, tracker: PageTranslateTracker = None, executor2: PriorityThreadPoolExecutor | None = None, translated_ids: set | None = None, ): self.translation_config.raise_if_cancelled() page_font_map = {} for font in page.pdf_font: page_font_map[font.font_id] = font page_xobj_font_map = {} for xobj in page.pdf_xobject: page_xobj_font_map[xobj.xobj_id] = page_font_map.copy() for font in xobj.pdf_font: page_xobj_font_map[xobj.xobj_id][font.font_id] = font paragraphs = [] total_token_count = 0 for paragraph in 
def process_page(
    self,
    page: Page,
    executor: PriorityThreadPoolExecutor,
    pbar: tqdm | None = None,
    tracker: PageTranslateTracker = None,
    executor2: PriorityThreadPoolExecutor | None = None,
    translated_ids: set | None = None,
):
    """Batch the translatable paragraphs of one page and submit them.

    Paragraphs are collected in page order. A batch is submitted as soon as
    it holds more than 200 tokens or more than 5 paragraphs; whatever is left
    at the end of the page goes out as a final batch.

    Args:
        page: Page whose ``pdf_paragraph`` entries are processed.
        executor: Pool that receives the batched translation tasks.
        pbar: Optional progress bar; advanced once per filtered-out paragraph.
        tracker: Tracker handed to every ``BatchParagraph`` built here.
        executor2: Pool forwarded to the task for its own fallback work.
        translated_ids: ``id()`` values of paragraphs already handled. A new
            set is used when omitted (the default used to raise ``TypeError``
            on the first membership test).
    """
    self.translation_config.raise_if_cancelled()
    if translated_ids is None:
        translated_ids = set()

    page_font_map = {font.font_id: font for font in page.pdf_font}
    # Each XObject sees the page fonts, overridden by its own fonts.
    page_xobj_font_map = {}
    for xobj in page.pdf_xobject:
        xobj_fonts = page_font_map.copy()
        for font in xobj.pdf_font:
            xobj_fonts[font.font_id] = font
        page_xobj_font_map[xobj.xobj_id] = xobj_fonts

    def submit_batch(batch, token_count):
        """Submit one batch; larger batches get a lower priority value."""
        self.mid += 1
        executor.submit(
            self.translate_paragraph,
            BatchParagraph(batch, [page] * len(batch), tracker),
            pbar,
            page_font_map,
            page_xobj_font_map,
            self.translation_config.shared_context_cross_split_part.first_paragraph,
            self.translation_config.shared_context_cross_split_part.recent_title_paragraph,
            executor2,
            priority=1048576 - token_count,
            paragraph_token_count=token_count,
            mp_id=self.mid,
        )

    paragraphs = []
    total_token_count = 0
    for paragraph in page.pdf_paragraph:
        # Already handled by an earlier pass.
        if id(paragraph) in translated_ids:
            continue
        # Nothing to translate; such paragraphs do not advance the bar.
        if paragraph.debug_id is None or paragraph.unicode is None:
            continue
        # Filtered-out paragraphs still count as progress.
        if (
            is_cid_paragraph(paragraph)
            or len(paragraph.unicode) < self.translation_config.min_text_length
            or is_pure_numeric_paragraph(paragraph)
            or is_placeholder_only_paragraph(paragraph)
        ):
            if pbar:
                pbar.advance(1)
            continue

        total_token_count += self.calc_token_count(paragraph.unicode)
        paragraphs.append(paragraph)
        translated_ids.add(id(paragraph))

        if paragraph.layout_label == "title":
            # NOTE(review): the title is written through
            # self.shared_context_cross_split_part while submit_batch reads
            # self.translation_config.shared_context_cross_split_part;
            # confirm both refer to the same object.
            self.shared_context_cross_split_part.recent_title_paragraph = (
                copy.deepcopy(paragraph)
            )

        if total_token_count > 200 or len(paragraphs) > 5:
            submit_batch(paragraphs, total_token_count)
            paragraphs = []
            total_token_count = 0

    if paragraphs:
        submit_batch(paragraphs, total_token_count)
def translate_paragraph(
    self,
    batch_paragraph: BatchParagraph,
    pbar: tqdm | None = None,
    page_font_map: dict[str, PdfFont] = None,
    xobj_font_map: dict[int, dict[str, PdfFont]] = None,
    title_paragraph: PdfParagraph | None = None,
    local_title_paragraph: PdfParagraph | None = None,
    executor: PriorityThreadPoolExecutor | None = None,
    paragraph_token_count: int = 0,
    mp_id: int = 0,
):
    """Translate a batch of paragraphs with one LLM request.

    Every paragraph is pre-processed, the resulting texts are sent as one
    JSON array, and each returned item is validated before being applied.
    A paragraph whose result is rejected is re-submitted on its own to
    ``self.il_translator.translate_paragraph`` via ``executor``. If the
    request or its parsing fails as a whole, every paragraph collected so far
    is re-submitted that way.

    Args:
        batch_paragraph: Paragraphs with their trackers and pages, index-aligned.
        pbar: Optional progress bar.
        page_font_map: Page-level fonts keyed by font id.
        xobj_font_map: Fonts per XObject id.
        title_paragraph: First title of the document, used as a prompt hint.
        local_title_paragraph: Most recent title, used as a prompt hint.
        executor: Pool that receives the per-paragraph fallback tasks.
        paragraph_token_count: Token count passed to the rate limiter.
        mp_id: Identifier of this multi-paragraph request.
    """
    self.translation_config.raise_if_cancelled()
    # Batch indexes of the paragraphs that produced an input, aligned with
    # ``inputs``: should_translate_paragraph[k] is the batch index of inputs[k].
    should_translate_paragraph = []
    try:
        inputs = []
        llm_translate_trackers = []
        paragraph_unicodes = []
        for i, paragraph in enumerate(batch_paragraph.paragraphs):
            tracker = batch_paragraph.trackers[i]
            text, translate_input = self.il_translator.pre_translate_paragraph(
                paragraph, tracker, page_font_map, xobj_font_map
            )
            if text is None:
                # pbar is optional; it used to be advanced unconditionally.
                if pbar:
                    pbar.advance(1)
                continue
            tracker.record_multi_paragraph_id(mp_id)
            llm_translate_tracker = tracker.new_llm_translate_tracker()
            should_translate_paragraph.append(i)
            llm_translate_trackers.append(llm_translate_tracker)
            inputs.append(
                (
                    text,
                    translate_input,
                    paragraph,
                    tracker,
                    llm_translate_tracker,
                    paragraph_unicodes,
                )
            )
            # Remember the original text so a fallback can restore it.
            paragraph_unicodes.append(paragraph.unicode)
        if not inputs:
            return

        json_format_input = []
        for id_, input_text in enumerate(inputs):
            ti: il_translator.ILTranslator.TranslateInput = input_text[1]
            tracker: ParagraphTranslateTracker = input_text[3]
            tracker.record_multi_paragraph_index(id_)
            placeholders_hint = ti.get_placeholders_hint()
            obj = {
                "id": id_,
                "input": input_text[0],
                "layout_label": input_text[2].layout_label,
            }
            if (
                placeholders_hint
                and self.translation_config.add_formula_placehold_hint
            ):
                obj["formula_placeholders_hint"] = placeholders_hint
            json_format_input.append(obj)

        json_format_input_str = json.dumps(
            json_format_input, ensure_ascii=False, indent=2
        )
        batch_text_for_glossary_matching = "\n".join(
            item.get("input", "") for item in json_format_input
        )
        final_input = self._build_llm_prompt(
            json_input_str=json_format_input_str,
            title_paragraph=title_paragraph,
            local_title_paragraph=local_title_paragraph,
            batch_text_for_glossary_matching=batch_text_for_glossary_matching,
        )
        for llm_translate_tracker in llm_translate_trackers:
            llm_translate_tracker.set_input(final_input)

        llm_output = self.translate_engine.llm_translate(
            final_input,
            rate_limit_params={
                "paragraph_token_count": paragraph_token_count,
                "request_json_mode": True,
            },
        )
        for llm_translate_tracker in llm_translate_trackers:
            llm_translate_tracker.set_output(llm_output)

        llm_output = llm_output.strip()
        llm_output = self._clean_json_output(llm_output)
        parsed_output = json.loads(llm_output)
        # A single object is accepted in place of a one-element array.
        if isinstance(parsed_output, dict) and parsed_output.get(
            "output", parsed_output.get("input", False)
        ):
            parsed_output = [parsed_output]

        # Normalise and validate every id BEFORE applying anything, so a bad
        # id sends the whole batch to the fallback while no paragraph has been
        # modified yet. Previously the check ran inside the loop and an
        # invalid id raised from the ``finally`` clause after earlier
        # paragraphs had already been applied.
        translation_results = {
            int(item["id"]): item.get("output", item.get("input"))
            for item in parsed_output
        }
        if len(translation_results) != len(inputs):
            raise Exception(
                f"Translation results length mismatch. Expected: {len(inputs)}, Got: {len(translation_results)}"
            )
        if any(not 0 <= key < len(inputs) for key in translation_results):
            raise Exception(
                f"Translation results contain invalid ids: {sorted(translation_results)}"
            )

        for id_, output in translation_results.items():
            should_fallback = True
            try:
                if not isinstance(output, str):
                    logger.warning(
                        f"Translation result is not a string. Output: {output}"
                    )
                    continue

                # Clean up any excessive punctuation in the translated text
                translated_text = re.sub(r"[. 。…,]{20,}", ".", output)

                # Get the original input for this translation
                translate_input = inputs[id_][1]
                llm_translate_tracker = inputs[id_][4]
                input_unicode = inputs[id_][0]
                output_unicode = translated_text

                trimed_input = re.sub(r"[. 。…,]{20,}", ".", input_unicode)
                input_token_count = self.calc_token_count(trimed_input)
                output_token_count = self.calc_token_count(output_unicode)

                same_as_input = trimed_input == output_unicode
                if (
                    same_as_input
                    and input_token_count > 10
                    and not self.translation_config.disable_same_text_fallback
                ):
                    llm_translate_tracker.set_error_message(
                        "Translation result is the same as input, fallback."
                    )
                    llm_translate_tracker.set_placeholder_full_match()
                    logger.warning(
                        "Translation result is the same as input, fallback."
                    )
                    continue

                if not (0.3 < output_token_count / input_token_count < 3):
                    llm_translate_tracker.set_error_message(
                        f"Translation result is too long or too short. Input: {input_token_count}, Output: {output_token_count}"
                    )
                    logger.warning(
                        f"Translation result is too long or too short. Input: {input_token_count}, Output: {output_token_count}"
                    )
                    llm_translate_tracker.set_placeholder_full_match()
                    continue

                if not self.translation_config.disable_same_text_fallback:
                    edit_distance = Levenshtein.distance(
                        input_unicode, output_unicode
                    )
                    if edit_distance < 5 and input_token_count > 20:
                        llm_translate_tracker.set_error_message(
                            f"Translation result edit distance is too small. distance: {edit_distance}, input: {input_unicode}, output: {output_unicode}"
                        )
                        logger.warning(
                            f"Translation result edit distance is too small. distance: {edit_distance}, input: {input_unicode}, output: {output_unicode}"
                        )
                        llm_translate_tracker.set_placeholder_full_match()
                        continue

                # Apply the translation to the paragraph
                self.il_translator.post_translate_paragraph(
                    inputs[id_][2],
                    inputs[id_][3],
                    translate_input,
                    translated_text,
                )
                should_fallback = False
                if pbar:
                    pbar.advance(1)
            except Exception as e:
                error_message = f"Error translating paragraph. Error: {e}."
                logger.exception(error_message)
                # Ignore error and continue
                for llm_translate_tracker in llm_translate_trackers:
                    llm_translate_tracker.set_error_message(error_message)
                continue
            finally:
                self.total_count += 1
                if should_fallback:
                    self.fallback_count += 1
                    inputs[id_][4].set_fallback_to_translate()
                    logger.warning(
                        f"Fallback to simple translation. paragraph id: {inputs[id_][2].debug_id}"
                    )
                    fallback_token_count = self.calc_token_count(
                        inputs[id_][2].unicode
                    )
                    # Restore the text captured before pre-processing.
                    inputs[id_][2].unicode = paragraph_unicodes[id_]
                    executor.submit(
                        self.il_translator.translate_paragraph,
                        inputs[id_][2],
                        # id_ indexes ``inputs``; map it back to the batch
                        # index, which differs once a paragraph was skipped.
                        batch_paragraph.pages[should_translate_paragraph[id_]],
                        pbar,
                        inputs[id_][3],
                        page_font_map,
                        xobj_font_map,
                        priority=1048576 - fallback_token_count,
                        paragraph_token_count=fallback_token_count,
                        title_paragraph=title_paragraph,
                        local_title_paragraph=local_title_paragraph,
                    )
                else:
                    self.ok_count += 1
    except Exception as e:
        error_message = f"Error {e} during translation. try fallback"
        logger.warning(error_message)
        for llm_translate_tracker in llm_translate_trackers:
            llm_translate_tracker.set_error_message(error_message)
            llm_translate_tracker.set_fallback_to_translate()
        self.total_count += len(llm_translate_trackers)
        self.fallback_count += len(llm_translate_trackers)

        # Restore each paragraph's own original text. The whole list used to
        # be assigned to ``unicode`` here.
        for input_, original_unicode in zip(inputs, paragraph_unicodes):
            input_[2].unicode = original_unicode

        if not should_translate_paragraph:
            should_translate_paragraph = list(
                range(len(batch_paragraph.paragraphs))
            )
        for i in should_translate_paragraph:
            paragraph = batch_paragraph.paragraphs[i]
            tracker = batch_paragraph.trackers[i]
            if paragraph.debug_id is None:
                continue
            fallback_token_count = self.calc_token_count(paragraph.unicode)
            executor.submit(
                self.il_translator.translate_paragraph,
                paragraph,
                batch_paragraph.pages[i],
                pbar,
                tracker,
                page_font_map,
                xobj_font_map,
                priority=1048576 - fallback_token_count,
                paragraph_token_count=fallback_token_count,
                title_paragraph=title_paragraph,
                local_title_paragraph=local_title_paragraph,
            )
def _build_llm_prompt(
    self,
    json_input_str: str,
    title_paragraph: PdfParagraph | None,
    local_title_paragraph: PdfParagraph | None,
    batch_text_for_glossary_matching: str,
) -> str:
    """Fill ``PROMPT_TEMPLATE`` with role, hints, glossary and JSON input.

    Args:
        json_input_str: Serialized JSON array of the paragraphs to translate.
        title_paragraph: First title of the document, if any.
        local_title_paragraph: Most recent title, if any; only mentioned when
            its ``debug_id`` differs from ``title_paragraph``'s.
        batch_text_for_glossary_matching: Text used to select glossary entries.

    Returns:
        The complete prompt string.
    """
    config = self.translation_config
    closing_rule = "Follow all rules strictly."

    # Role section: a custom system prompt wins, but always ends with the rule.
    user_prompt = getattr(config, "custom_system_prompt", None)
    if not user_prompt:
        role_block = (
            f"You are a professional {config.lang_out} native translator who needs to fluently translate text "
            f"into {config.lang_out}.\n\n"
            "Follow all rules strictly."
        )
    else:
        role_block = user_prompt.strip()
        if closing_rule not in role_block:
            separator = "" if role_block.endswith("\n") else "\n"
            role_block = f"{role_block}{separator}{closing_rule}"

    # Contextual hints section, numbered in the order they are added.
    hints: list[str] = []
    if title_paragraph:
        hints.append(
            f"{len(hints) + 1}. First title in full text: {title_paragraph.unicode}"
        )
    if local_title_paragraph:
        repeats_global_title = bool(title_paragraph) and (
            local_title_paragraph.debug_id == title_paragraph.debug_id
        )
        if not repeats_global_title:
            hints.append(
                f"{len(hints) + 1}. The most recent title is: {local_title_paragraph.unicode}"
            )
    contextual_hints_block = ""
    if hints:
        contextual_hints_block = (
            "## Contextual Hints for Better Translation\n" + "\n".join(hints) + "\n"
        )

    # Glossary section: keep only glossaries with entries active for this text.
    matched: dict[str, list[tuple[str, str]]] = {}
    for glossary in self._cached_glossaries or ():
        active = glossary.get_active_entries_for_text(
            batch_text_for_glossary_matching
        )
        if active:
            matched[glossary.name] = sorted(active)

    glossary_usage_rules_block = ""
    glossary_tables_block = ""
    if matched:
        glossary_usage_rules_block = (
            "## Glossary\n"
            "If a glossary is provided:\n"
            "- Always use the exact target term.\n"
            "- Apply glossary items even inside tags or when broken by hyphens/line breaks.\n"
            "- If glossary does NOT include a term, translate it naturally.\n\n"
        )
        table_lines: list[str] = ["## Glossary Tables", ""]
        for glossary_name, entries in matched.items():
            table_lines += [
                f"### Glossary: {glossary_name}",
                "",
                "| Source Term | Target Term |\n|-------------|-------------|",
            ]
            table_lines.extend(
                f"| {source_term} | {target_term} |"
                for source_term, target_term in entries
            )
            table_lines.append("")
        glossary_tables_block = "\n".join(table_lines)

    return PROMPT_TEMPLATE.substitute(
        role_block=role_block,
        glossary_usage_rules_block=glossary_usage_rules_block,
        contextual_hints_block=contextual_hints_block,
        json_input_str=json_input_str,
        glossary_tables_block=glossary_tables_block,
        lang_out=self.translation_config.lang_out,
    )
glossary_usage_rules_block = "" glossary_tables_block = "" glossary_entries_per_glossary: dict[str, list[tuple[str, str]]] = {} if self._cached_glossaries: for glossary in self._cached_glossaries: active_entries = glossary.get_active_entries_for_text( batch_text_for_glossary_matching ) if active_entries: glossary_entries_per_glossary[glossary.name] = sorted( active_entries ) if glossary_entries_per_glossary: glossary_usage_rules_block = ( "## Glossary\n" "If a glossary is provided:\n" "- Always use the exact target term.\n" "- Apply glossary items even inside tags or when broken by hyphens/line breaks.\n" "- If glossary does NOT include a term, translate it naturally.\n\n" ) glossary_table_lines: list[str] = ["## Glossary Tables", ""] for glossary_name, entries in glossary_entries_per_glossary.items(): glossary_table_lines.append(f"### Glossary: {glossary_name}") glossary_table_lines.append("") glossary_table_lines.append( "| Source Term | Target Term |\n|-------------|-------------|" ) for original_source, target_text in entries: glossary_table_lines.append( f"| {original_source} | {target_text} |" ) glossary_table_lines.append("") glossary_tables_block = "\n".join(glossary_table_lines) return PROMPT_TEMPLATE.substitute( role_block=role_block, glossary_usage_rules_block=glossary_usage_rules_block, contextual_hints_block=contextual_hints_block, json_input_str=json_input_str, glossary_tables_block=glossary_tables_block, lang_out=self.translation_config.lang_out, ) def _clean_json_output(self, llm_output: str) -> str: # Clean up JSON output by removing common wrapper tags llm_output = llm_output.strip() if llm_output.startswith(""): llm_output = llm_output[6:] if llm_output.endswith(""): llm_output = llm_output[:-7] if llm_output.startswith("```json"): llm_output = llm_output[7:] if llm_output.startswith("```"): llm_output = llm_output[3:] if llm_output.endswith("```"): llm_output = llm_output[:-3] return llm_output.strip() 
class LayoutParser:
    """Pipeline stage that attaches layout boxes to every page of the IL."""

    stage_name = "Parse Page Layout"

    def __init__(self, translation_config: TranslationConfig):
        self.translation_config = translation_config
        self.model = translation_config.doc_layout_model

    def _save_debug_image(self, image: np.ndarray, layout, page_number: int):
        """Save debug image with drawn boxes if debug mode is enabled."""
        if not self.translation_config.debug:
            return

        debug_dir = Path(self.translation_config.get_working_file_path("ocr-box-image"))
        debug_dir.mkdir(parents=True, exist_ok=True)

        # Draw boxes and their class labels on a copy of the image
        debug_image = image.copy()
        for box in layout.boxes:
            x0, y0, x1, y1 = box.xyxy
            cv2.rectangle(
                debug_image,
                (int(x0), int(y0)),
                (int(x1), int(y1)),
                (0, 255, 0),
                2,
            )
            cv2.putText(
                debug_image,
                layout.names[box.cls],
                (int(x0), int(y0) - 5),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (0, 255, 0),
                1,
            )

        # The image is converted from RGB to BGR before cv2 writes it
        img_bgr = cv2.cvtColor(debug_image, cv2.COLOR_RGB2BGR)
        output_path = debug_dir / f"{page_number}.jpg"
        cv2.imwrite(str(output_path), img_bgr)

    def _save_debug_box_to_page(self, page: il_version_1.Page):
        """Save debug boxes and text labels to the PDF page."""
        if not self.translation_config.debug:
            return

        color = GREEN
        for layout in page.page_layout:
            # Fallback lines are drawn with a much thinner line and label
            scale_factor = 1
            if layout.class_name == "fallback_line":
                scale_factor = 0.1

            rect = il_version_1.PdfRectangle(
                box=il_version_1.Box(
                    x=layout.box.x,
                    y=layout.box.y,
                    x2=layout.box.x2,
                    y2=layout.box.y2,
                ),
                graphic_state=color,
                debug_info=True,
                line_width=0.4 * scale_factor,
            )
            page.pdf_rectangle.append(rect)

            # Create text label at top-left corner
            # Note: PDF coordinates are from bottom-left,
            # so we use y2 for top position
            style = il_version_1.PdfStyle(
                font_id="base",
                font_size=4 * scale_factor,
                graphic_state=color,
            )
            page.pdf_paragraph.append(
                il_version_1.PdfParagraph(
                    first_line_indent=False,
                    box=il_version_1.Box(
                        x=layout.box.x,
                        y=layout.box.y2,
                        x2=layout.box.x2,
                        y2=layout.box.y2 + 5,
                    ),
                    vertical=False,
                    pdf_style=style,
                    unicode=layout.class_name,
                    pdf_paragraph_composition=[
                        il_version_1.PdfParagraphComposition(
                            pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
                                unicode=layout.class_name,
                                pdf_style=style,
                                debug_info=True,
                            ),
                        ),
                    ],
                    xobj_id=-1,
                ),
            )

    def process(self, docs: il_version_1.Document, mupdf_doc: Document):
        """Generate layouts for all pages that need to be translated.

        The stage has two halves, each advancing the progress once per page:
        model predictions are converted into ``PageLayout`` entries, then
        fallback line layouts are generated for every page on a thread pool.

        Returns:
            The same ``docs`` object, with ``page_layout`` filled in.
        """
        total = len(docs.page)
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            total * 2,
        ) as progress:
            for page, layouts in self.model.handle_document(
                docs.page,
                mupdf_doc,
                self.translation_config,
                self._save_debug_image,
            ):
                # The page size is the same for every box of the page, so it
                # is looked up once instead of once per box.
                mediabox = mupdf_doc[page.page_number].mediabox_size
                h = math.ceil(mediabox.y)
                w = math.ceil(mediabox.x)

                page_layouts = []
                for layout in layouts.boxes:
                    # Convert from the picture coordinate system to the IL
                    # coordinate system: flip the y axis, grow the box by one
                    # unit on each side and clamp it to the page.
                    x0, y0, x1, y1 = layout.xyxy
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    page_layout = il_version_1.PageLayout(
                        id=len(page_layouts) + 1,
                        box=il_version_1.Box(
                            x0.item(),
                            y0.item(),
                            x1.item(),
                            y1.item(),
                        ),
                        conf=layout.conf.item(),
                        class_name=layouts.names[layout.cls],
                    )
                    page_layouts.append(page_layout)
                page.page_layout = page_layouts
                progress.advance(1)

            with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
                futures = [
                    executor.submit(
                        self.generate_fallback_line_layout_for_page, page, progress
                    )
                    for page in docs.page
                ]
            # Leaving the ``with`` block waits for every task. A failure stays
            # non-fatal as before, but is now logged instead of being dropped
            # with the unused future.
            for future in futures:
                error = future.exception()
                if error is not None:
                    logger.warning(
                        "Failed to generate fallback line layout for a page.",
                        exc_info=error,
                    )
        return docs

    def generate_fallback_line_layout_for_page(self, page: il_version_1.Page, progress):
        """Append one ``fallback_line`` layout per detected text line.

        ``progress`` is advanced exactly once, whether the page yields lines,
        has no characters, or raises.
        """
        try:
            exists_page_layouts = page.page_layout
            char_boxes = babeldoc.format.pdf.document_il.utils.extract_char.convert_page_to_char_boxes(
                page
            )
            if not char_boxes:
                return

            clusters = babeldoc.format.pdf.document_il.utils.extract_char.process_page_chars_to_lines(
                char_boxes
            )
            for cluster in clusters:
                boxes = [c[0] for c in cluster.chars]
                min_x = min(b.x for b in boxes)
                max_x = max(b.x2 for b in boxes)
                min_y = min(b.y for b in boxes)
                max_y = max(b.y2 for b in boxes)
                # NOTE(review): ``chars`` is overwritten with the bounding
                # box of the cluster; kept as in the original.
                cluster.chars = il_version_1.Box(min_x, min_y, max_x, max_y)

                # Ids continue after the layouts already on the page.
                page_layout = il_version_1.PageLayout(
                    id=len(exists_page_layouts) + 1,
                    box=il_version_1.Box(
                        min_x,
                        min_y,
                        max_x,
                        max_y,
                    ),
                    conf=1,
                    class_name="fallback_line",
                )
                exists_page_layouts.append(page_layout)
            self._save_debug_box_to_page(page)
        finally:
            progress.advance(1)
# Base58 alphabet (Bitcoin style): digits and ASCII letters without the
# look-alike characters "0", "O", "I" and "l".
BASE58_ALPHABET = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"


def generate_base58_id(length: int = 5) -> str:
    """Return ``length`` characters drawn at random from ``BASE58_ALPHABET``."""
    picked = []
    for _ in range(length):
        picked.append(random.choice(BASE58_ALPHABET))
    return "".join(picked)
定义项目符号的正则表达式模式 def __init__(self, translation_config: TranslationConfig): self.translation_config = translation_config self.font_mapper = FontMapper(translation_config) def _preprocess_formula_layouts(self, page: Page): """ Identifies 'formula' layouts that do not significantly overlap with any text layouts and re-labels them as 'isolate_formula'. """ # Use a simplified Layout object for is_text_layout check text_layouts = [ layout for layout in page.page_layout if is_text_layout(Layout(layout.id, layout.class_name)) ] formula_layouts = [ layout for layout in page.page_layout if layout.class_name == "formula" ] if not text_layouts or not formula_layouts: return for formula_layout in formula_layouts: is_isolated = True for text_layout in text_layouts: iou = calculate_iou_for_boxes(formula_layout.box, text_layout.box) if iou >= 0.5: is_isolated = False break if is_isolated: formula_layout.class_name = "isolate_formula" def add_text_fill_background(self, page: Page): layout_map = {layout.id: layout for layout in page.page_layout} for paragraph in page.pdf_paragraph: layout_id = paragraph.layout_id if layout_id is None: continue layout = layout_map[layout_id] if paragraph.box is None: continue x1, y1, x2, y2 = ( paragraph.box.x, paragraph.box.y, paragraph.box.x2, paragraph.box.y2, ) layout_box = layout.box if layout_box.x < x1: x1 = layout_box.x if layout_box.y < y1: y1 = layout_box.y if layout_box.x2 > x2: x2 = layout_box.x2 if layout_box.y2 > y2: y2 = layout_box.y2 assert x2 > x1 and y2 > y1 page.pdf_rectangle.append( PdfRectangle( box=Box(x1, y1, x2, y2), fill_background=True, graphic_state=WHITE, debug_info=False, xobj_id=paragraph.xobj_id, ) ) def update_paragraph_data(self, paragraph: PdfParagraph, update_unicode=False): if not paragraph.pdf_paragraph_composition: return chars = [] for composition in paragraph.pdf_paragraph_composition: if composition.pdf_line: chars.extend(composition.pdf_line.pdf_character) elif composition.pdf_formula: 
def update_paragraph_data(self, paragraph: PdfParagraph, update_unicode=False):
    """Recompute box, orientation, xobj id and indent flag of a paragraph.

    Characters are gathered from the paragraph's line, formula and
    single-character compositions. When none are found the paragraph is left
    untouched. With ``update_unicode`` set, ``paragraph.unicode`` is rebuilt
    from those characters as well.
    """
    compositions = paragraph.pdf_paragraph_composition
    if not compositions:
        return

    chars = []
    for composition in compositions:
        if composition.pdf_line:
            chars.extend(composition.pdf_line.pdf_character)
        elif composition.pdf_formula:
            chars.extend(composition.pdf_formula.pdf_character)
        elif composition.pdf_character:
            chars.append(composition.pdf_character)
        elif not composition.pdf_same_style_unicode_characters:
            logger.error(
                "Unexpected composition type in PdfParagraphComposition. This type only appears in the IL after the translation is completed.",
            )

    if not chars:
        return
    if update_unicode:
        paragraph.unicode = get_char_unicode_string(chars)

    # Bounding box over the visual boxes of all characters.
    visual_boxes = [char.visual_bbox.box for char in chars]
    paragraph.box = Box(
        min(box.x for box in visual_boxes),
        min(box.y for box in visual_boxes),
        max(box.x2 for box in visual_boxes),
        max(box.y2 for box in visual_boxes),
    )

    first_char = chars[0]
    paragraph.vertical = first_char.vertical
    paragraph.xobj_id = first_char.xobj_id

    # Indented when the first line starts more than 1 unit right of the box.
    paragraph.first_line_indent = False
    opening = paragraph.pdf_paragraph_composition
    if (
        opening
        and opening[0].pdf_line
        and opening[0].pdf_line.pdf_character[0].visual_bbox.box.x - paragraph.box.x
        > 1
    ):
        paragraph.first_line_indent = True


def update_line_data(self, line: PdfLine):
    """Set ``line.box`` to the bounding box of its characters' visual boxes."""
    visual_boxes = [char.visual_bbox.box for char in line.pdf_character]
    left = min(box.x for box in visual_boxes)
    bottom = min(box.y for box in visual_boxes)
    right = max(box.x2 for box in visual_boxes)
    top = max(box.y2 for box in visual_boxes)
    line.box = Box(left, bottom, right, top)


def add_debug_info(self, page: Page):
    """In debug mode, outline every line of every paragraph on the page."""
    if not self.translation_config.debug:
        return

    for paragraph in page.pdf_paragraph:
        for composition in paragraph.pdf_paragraph_composition:
            line = composition.pdf_line
            if not line:
                continue
            outline = PdfRectangle(
                box=line.box,
                fill_background=False,
                graphic_state=INDIGO,
                debug_info=True,
                line_width=0.2,
            )
            page.pdf_rectangle.append(outline)
def process(self, document):
    """Find paragraphs on every page and reject documents without usable text.

    Raises:
        ExtractTextError: When no page yields a paragraph, or when more than
            80% of the paragraphs are CID paragraphs.
    """
    with self.translation_config.progress_monitor.stage_start(
        self.stage_name,
        len(document.page),
    ) as pbar:
        if not document.page:
            return
        for page in document.page:
            self.translation_config.raise_if_cancelled()
            self.process_page(page)
            pbar.advance()

        total_paragraph_count = sum(len(page.pdf_paragraph) for page in document.page)
        if total_paragraph_count == 0:
            raise ExtractTextError("The document contains no paragraphs.")

        if self.check_cid_paragraph(document):
            raise ExtractTextError("The document contains too many CID paragraphs.")


def check_cid_paragraph(self, doc: Document):
    """Return True when more than 80% of all paragraphs are CID paragraphs.

    A document without any paragraph returns False; the ratio used to raise
    ``ZeroDivisionError`` in that case.
    """
    cid_para_count = 0
    para_total = 0
    for page in doc.page:
        para_total += len(page.pdf_paragraph)
        cid_para_count += sum(1 for para in page.pdf_paragraph if is_cid_paragraph(para))
    if para_total == 0:
        return False
    return cid_para_count / para_total > 0.8


def bbox_overlap(self, bbox1: Box, bbox2: Box) -> bool:
    """Return True when the two boxes share interior area.

    Boxes that only touch along an edge or at a corner do not overlap.
    """
    overlaps_horizontally = bbox1.x < bbox2.x2 and bbox1.x2 > bbox2.x
    overlaps_vertically = bbox1.y < bbox2.y2 and bbox1.y2 > bbox2.y
    return overlaps_horizontally and overlaps_vertically
self.process_paragraph_spacing(paragraph) self.update_paragraph_data(paragraph) # 第四步:计算所有行宽度的中位数 median_width = self.calculate_median_line_width(paragraphs) # 第五步:处理独立段落 self.process_independent_paragraphs(paragraphs, median_width) # 新增后处理:合并带行号交替的正文段落(a 正文、b 行号、c 正文 -> 合并 a 与 c,保留 b) if getattr(self.translation_config, "merge_alternating_line_numbers", True): self.merge_alternating_line_number_paragraphs(paragraphs) for paragraph in paragraphs: self.update_paragraph_data(paragraph, update_unicode=True) if self.translation_config.ocr_workaround: self.add_text_fill_background(page) # since this is ocr file, # image characters are not needed page.pdf_character = [] self.fix_overlapping_paragraphs(page) # 第六步:对每一行的字符进行排序 # self._sort_characters_in_lines(page) self.add_debug_info(page) # 新阶段:设置段落的 renderorder 为所有组成部分中 renderorder 最小的 self._set_paragraph_render_order(page) def _set_paragraph_render_order(self, page: Page): """ 设置段落的 renderorder 为段落所有组成部分中 renderorder 最小的值 """ for paragraph in page.pdf_paragraph: min_render_order = 9999999999999999 # 遍历段落的所有组成部分 for composition in paragraph.pdf_paragraph_composition: # 检查 PdfLine 中的字符 if composition.pdf_line: for char in composition.pdf_line.pdf_character: if ( hasattr(char, "render_order") and char.render_order is not None ): min_render_order = min(min_render_order, char.render_order) # 检查单个字符 elif composition.pdf_character: char = composition.pdf_character if hasattr(char, "render_order") and char.render_order is not None: min_render_order = min(min_render_order, char.render_order) # 检查公式中的字符 elif composition.pdf_formula: for char in composition.pdf_formula.pdf_character: if ( hasattr(char, "render_order") and char.render_order is not None ): min_render_order = min(min_render_order, char.render_order) # 如果找到了有效的 renderorder,设置段落的 renderorder if min_render_order != 9999999999999999: paragraph.render_order = min_render_order def is_isolated_formula(self, char: PdfCharacter): return char.char_unicode in ( "(cid:122)", 
"(cid:123)", "(cid:124)", "(cid:125)", ) def _paragraph_text_ascii(self, p: PdfParagraph) -> str: parts: list[str] = [] for comp in p.pdf_paragraph_composition or []: if comp.pdf_line: for ch in comp.pdf_line.pdf_character or []: if ch.char_unicode is not None: parts.append(ch.char_unicode) elif comp.pdf_character and comp.pdf_character.char_unicode is not None: parts.append(comp.pdf_character.char_unicode) return "".join(parts) def _is_ascii_digit_or_space_paragraph(self, p: PdfParagraph) -> bool: text = self._paragraph_text_ascii(p) if not text: return True has_digit = False for c in text: if c.isdigit() and ord(c) < 128: has_digit = True continue if c.isspace(): continue return False return True if has_digit or text.strip() == "" else False @staticmethod def _same_layout_and_xobj(a: PdfParagraph, c: PdfParagraph) -> bool: return ( a.layout_id is not None and c.layout_id is not None and a.layout_id == c.layout_id and a.xobj_id is not None and c.xobj_id is not None and a.xobj_id == c.xobj_id ) def merge_alternating_line_number_paragraphs(self, paragraphs: list[PdfParagraph]): # a 代表正文 # l 代表行号 if not paragraphs or len(paragraphs) < 3: return i = 0 while i < len(paragraphs) - 2: a = paragraphs[i] # 吞掉一个或多个连续的行号段 l j = i + 1 saw_l = False while j < len(paragraphs) and self._is_ascii_digit_or_space_paragraph( paragraphs[j] ): saw_l = True j += 1 # 现在 j 指向候选的 c if saw_l and j < len(paragraphs): c = paragraphs[j] if self._same_layout_and_xobj(a, c): a.pdf_paragraph_composition.extend(c.pdf_paragraph_composition) self.update_paragraph_data(a) del paragraphs[j] # 不移动 i,继续尝试把更多正文接到 a,实现 a l+ a l+ a ... 
def merge_alternating_line_number_paragraphs(self, paragraphs: list[PdfParagraph]):
    """Merge body paragraphs that are separated only by line-number paragraphs.

    For an anchor paragraph, the run of digit/whitespace-only paragraphs that
    follows it is skipped. If the paragraph after that run shares the anchor's
    layout and xobj, its compositions are appended to the anchor and it is
    removed from ``paragraphs``; the line-number paragraphs stay in place.
    The list is modified in place.
    """
    if not paragraphs or len(paragraphs) < 3:
        return

    anchor_idx = 0
    while anchor_idx < len(paragraphs) - 2:
        anchor = paragraphs[anchor_idx]

        # Skip over one or more consecutive line-number paragraphs.
        probe = anchor_idx + 1
        while probe < len(paragraphs) and self._is_ascii_digit_or_space_paragraph(
            paragraphs[probe]
        ):
            probe += 1
        skipped_line_numbers = probe > anchor_idx + 1

        # ``probe`` now points at the candidate continuation, if any.
        if skipped_line_numbers and probe < len(paragraphs):
            follower = paragraphs[probe]
            if self._same_layout_and_xobj(anchor, follower):
                anchor.pdf_paragraph_composition.extend(
                    follower.pdf_paragraph_composition
                )
                self.update_paragraph_data(anchor)
                del paragraphs[probe]
                # Stay on the same anchor so further continuations can be
                # chained onto it.
                continue

        anchor_idx += 1
char_layout current_paragraph = PdfParagraph( pdf_paragraph_composition=[], layout_id=current_layout.id, debug_id=generate_base58_id(), layout_label=current_layout.name, ) paragraphs.append(current_paragraph) current_paragraph.pdf_paragraph_composition.append( PdfParagraphComposition(pdf_character=char) ) page.pdf_character = skip_chars for para in paragraphs: self.update_paragraph_data(para) return paragraphs def _merge_overlapping_clusters( self, lines: dict[int, list[PdfCharacter]], char_height_average: float ) -> dict[int, list[PdfCharacter]]: """ Merge clusters that have significant y-axis overlap. If y_intersection / min_height > 0.5 or the distance between y-midlines is less than char_height_average, merge the two clusters. """ if len(lines) <= 1: return lines # Calculate y-axis ranges for each cluster cluster_ranges = {} cluster_midlines = {} for label, chars in lines.items(): y_values = [char.visual_bbox.box.y for char in chars] + [ char.visual_bbox.box.y2 for char in chars ] y_min, y_max = min(y_values), max(y_values) cluster_ranges[label] = (y_min, y_max) cluster_midlines[label] = (y_min + y_max) / 2 # Keep merging until no more merges are possible changed = True while changed: changed = False labels_to_check = list(lines.keys()) for i in range(len(labels_to_check)): if not changed: # Only continue if no merge happened in this iteration for j in range(i + 1, len(labels_to_check)): label1, label2 = labels_to_check[i], labels_to_check[j] # Skip if either label has been merged away if label1 not in lines or label2 not in lines: continue y1_min, y1_max = cluster_ranges[label1] y2_min, y2_max = cluster_ranges[label2] # Calculate intersection intersection_start = max(y1_min, y2_min) intersection_end = min(y1_max, y2_max) # Calculate midline distance midline_distance = abs( cluster_midlines[label1] - cluster_midlines[label2] ) should_merge = False if ( intersection_end > intersection_start ): # There is intersection intersection_height = intersection_end - 
intersection_start height1 = y1_max - y1_min height2 = y2_max - y2_min min_height = min(height1, height2) # Check if intersection ratio exceeds threshold if ( min_height > 0 and intersection_height / min_height > 0.3 ): should_merge = True # Check if midline distance is less than char_height_average if midline_distance < char_height_average: should_merge = True if should_merge: # Merge label2 into label1 lines[label1].extend(lines[label2]) del lines[label2] # Update cluster range and midline for the merged cluster new_y_min = min(y1_min, y2_min) new_y_max = max(y1_max, y2_max) cluster_ranges[label1] = (new_y_min, new_y_max) cluster_midlines[label1] = (new_y_min + new_y_max) / 2 del cluster_ranges[label2] del cluster_midlines[label2] changed = True break return lines def _get_effective_y_bounds(self, char: PdfCharacter) -> tuple[float, float]: """ Determines the effective vertical boundaries (y1, y2) for a character. It prioritizes the visual bounding box if its Intersection over Union (IoU) with the PDF bounding box is high (>= 0.5), otherwise, it falls back to the PDF bounding box. This helps use more accurate layout information when available. """ visual_box = char.visual_bbox.box return visual_box.y, visual_box.y2 pdf_box = char.box if calculate_iou_for_boxes(visual_box, pdf_box) >= 0.5: return visual_box.y, visual_box.y2 return pdf_box.y, pdf_box.y2 @staticmethod def _compute_collision_counts_histogram( y1_arr: np.ndarray, y2_arr: np.ndarray, para_y_min: float, para_y_max: float, step: float, ) -> np.ndarray: """Compute overlap counts at each scan line using a difference-array histogram. Args: y1_arr: 1-D array with lower y bounds of characters (inclusive). y2_arr: 1-D array with upper y bounds of characters (exclusive). para_y_min: Minimum y of the paragraph. para_y_max: Maximum y of the paragraph. step: Scan step size. Returns: 1-D NumPy int32 array where index i corresponds to y = para_y_max - i × step. 
""" # Number of scan positions m = int(np.ceil((para_y_max - para_y_min) / step)) if m <= 0: return np.array([], dtype=np.int32) # Map character bounds to discrete indices (top inclusive, bottom exclusive) starts = np.floor((para_y_max - y2_arr) / step).astype(np.int32) ends = np.floor((para_y_max - y1_arr) / step).astype(np.int32) + 1 # Clip ends to the valid range [0, m] np.clip(ends, 0, m, out=ends) hist = np.zeros(m + 1, dtype=np.int32) np.add.at(hist, starts, 1) np.add.at(hist, ends, -1) return np.cumsum(hist[:-1]) def _split_paragraph_into_lines( self, paragraph: PdfParagraph, formula_font_ids: set[str] ): """ Splits a paragraph into lines using a "line-threading" method. This method works by scanning vertically across the paragraph's bounding box and counting how many characters intersect with a horizontal line at each y-coordinate. The regions with a low number of intersections (less than 2) are identified as gaps between lines. The characters are then partitioned into lines based on these identified gaps. """ if not paragraph.pdf_paragraph_composition: return # 1. Extract all characters and other compositions from the paragraph. all_chars: list[PdfCharacter] = [] other_compositions: list[PdfParagraphComposition] = [] for comp in paragraph.pdf_paragraph_composition: if comp.pdf_character: all_chars.append(comp.pdf_character) else: other_compositions.append(comp) if not all_chars: return # 2. Determine effective y-bounds for each character and the paragraph's total vertical range. char_y_bounds = [ {"char": char, "y1": y1, "y2": y2} for char in all_chars for y1, y2 in [self._get_effective_y_bounds(char)] ] if not char_y_bounds: paragraph.pdf_paragraph_composition = other_compositions self.update_paragraph_data(paragraph) return para_y_min = min(b["y1"] for b in char_y_bounds) para_y_max = max(b["y2"] for b in char_y_bounds) # If the paragraph is vertically flat, treat it as a single line. 
if (para_y_max - para_y_min) < 5: # Using a small threshold # all_chars.sort(key=lambda c: c.visual_bbox.box.x) single_line_composition = self.create_line(all_chars) paragraph.pdf_paragraph_composition = [ single_line_composition ] + other_compositions self.update_paragraph_data(paragraph) return # 3. Perform "threading" scan to create a collision histogram. # Scan from top (max y) to bottom (min y) with a step of 0.5. scan_y_min = para_y_min scan_y_max = para_y_max step = 0.25 y_coordinates = np.arange(scan_y_max, scan_y_min, -step) # Compute collision counts using NumPy histogram (O(m + n)) y1_arr = np.array([b["y1"] for b in char_y_bounds], dtype=np.float32) y2_arr = np.array([b["y2"] for b in char_y_bounds], dtype=np.float32) collision_counts = self._compute_collision_counts_histogram( y1_arr, y2_arr, scan_y_min, scan_y_max, step, ) # 4. Find gaps (regions with low collision count) from the histogram. gaps = [] in_gap = False for i, count in enumerate(collision_counts): if count < 1 and not in_gap: in_gap = True gap_start_index = i elif count >= 1 and in_gap: in_gap = False gaps.append((gap_start_index, i - 1)) if in_gap: gaps.append((gap_start_index, len(collision_counts) - 1)) # If no significant gaps are found, treat it as a single line. if not gaps: # all_chars.sort(key=lambda c: c.visual_bbox.box.x) single_line_composition = self.create_line(all_chars) paragraph.pdf_paragraph_composition = [ single_line_composition ] + other_compositions self.update_paragraph_data(paragraph) return # 5. Assign characters to lines based on the identified gaps. # Calculate separator y-coordinates from the midpoints of the gaps. separator_y_coords = sorted( [y_coordinates[start_idx] for start_idx, end_idx in gaps], reverse=True, ) lines: list[list[PdfCharacter]] = [ [] for _ in range(len(separator_y_coords) + 1) ] for b in char_y_bounds: char_y_center = (b["y1"] + b["y2"]) / 2 line_idx = 0 # Find which line bucket the character belongs to. 
def process_paragraph_spacing(self, paragraph: PdfParagraph):
    """Drop blank lines and trim edge whitespace from every line of a paragraph.

    Compositions that are not lines are kept untouched. A line whose joined
    text is empty after ``str.strip()`` is removed entirely. For the remaining
    lines, leading and trailing whitespace characters are discarded (interior
    whitespace is preserved) and the line is rebuilt with ``self.create_line``.
    The paragraph's composition list is replaced and its derived data is
    refreshed through ``self.update_paragraph_data``.

    Args:
        paragraph: Paragraph to clean up (mutated in place). Nothing happens
            when it has no compositions.
    """
    if not paragraph.pdf_paragraph_composition:
        return

    cleaned = []
    for composition in paragraph.pdf_paragraph_composition:
        line = composition.pdf_line
        if not line:
            # Not a line composition: keep as-is.
            cleaned.append(composition)
            continue

        chars = line.pdf_character
        if not "".join(c.char_unicode for c in chars).strip():
            # Skip lines that contain nothing but whitespace.
            continue

        # Trim by moving two indices inwards instead of rebuilding the list
        # for every character (the previous `lst = lst + [char]` was
        # quadratic in the line length).
        start, end = 0, len(chars)
        while start < end and chars[start].char_unicode.isspace():
            start += 1
        while end > start and chars[end - 1].char_unicode.isspace():
            end -= 1

        kept = chars[start:end]
        if kept:
            # create_line returns a PdfParagraphComposition wrapping the line.
            cleaned.append(self.create_line(kept))

    paragraph.pdf_paragraph_composition = cleaned
    self.update_paragraph_data(paragraph)
def process_independent_paragraphs(
    self,
    paragraphs: list[PdfParagraph],
    median_width: float,
):
    """Split multi-line paragraphs at lines that end an independent unit.

    Walks ``paragraphs`` in order. Inside a paragraph, line ``j`` and everything
    after it are moved into a new paragraph, inserted right after the current
    one, as soon as the line before it (``j - 1``):

    * contains a run of at least 20 dots (table-of-contents entry), or
    * is narrower than ``median_width * short_line_split_factor`` while
      ``split_short_lines`` is enabled in the translation config,

    or line ``j`` itself starts with a bullet point. Only the first split point
    of a paragraph is handled per visit; the newly inserted paragraph is
    examined on the next iteration, so chains of splits still happen.

    Args:
        paragraphs: Paragraph list, modified in place.
        median_width: Median line width used for the short-line test.
    """

    def leads_with_bullet(composition) -> bool:
        # True when the composition is a line whose first character is a bullet.
        if not composition:
            return False
        line = composition.pdf_line
        if not line:
            return False
        chars = line.pdf_character
        if not chars:
            return False
        first = chars[0]
        return bool(first and is_bullet_point(first))

    idx = 0
    while idx < len(paragraphs):
        paragraph = paragraphs[idx]
        compositions = paragraph.pdf_paragraph_composition

        # range() is empty for paragraphs with zero or one composition, so
        # single-line paragraphs are skipped automatically.
        for j in range(1, len(compositions)):
            previous = compositions[j - 1].pdf_line
            if not previous:
                continue

            prev_width = previous.box.x2 - previous.box.x
            prev_text = "".join(c.char_unicode for c in previous.pdf_character)

            split_here = (
                # At least 20 consecutive dots marks a table-of-contents entry.
                re.search(r"\.{20,}", prev_text)
                or (
                    self.translation_config.split_short_lines
                    and prev_width
                    < median_width
                    * self.translation_config.short_line_split_factor
                )
                or leads_with_bullet(compositions[j])
            )
            if not split_here:
                continue

            # Move line j and everything after it into a new paragraph.
            tail = PdfParagraph(
                box=Box(0, 0, 0, 0),  # placeholder, recomputed below
                pdf_paragraph_composition=compositions[j:],
                unicode="",
                debug_id=generate_base58_id(),
                layout_label=paragraph.layout_label,
                layout_id=paragraph.layout_id,
            )
            paragraph.pdf_paragraph_composition = compositions[:j]

            self.update_paragraph_data(paragraph)
            self.update_paragraph_data(tail)

            paragraphs.insert(idx + 1, tail)
            break

        idx += 1
adjustment if overlap_height > 1e-6 and overlap_width > 1e-6: overlap_found_in_pass = True # Determine which paragraph is visually higher if para1.box.y2 > para2.box.y and para1.box.y < para2.box.y: lower_para = para1 higher_para = para2 # Handle cases where y values are identical (or very close) # Prefer the one with smaller y2 as the higher one, or break tie arbitrarily elif para1.box.y2 < para2.box.y2: lower_para = para1 higher_para = para2 else: lower_para = para2 higher_para = para1 # Calculate the midpoint of the vertical overlap mid_y = overlap_y_start + overlap_height / 2 # Adjust boxes, ensuring they remain valid (y2 > y) if mid_y > higher_para.box.y and mid_y < lower_para.box.y2: higher_para.box.y = mid_y + 1 lower_para.box.y2 = mid_y - 1 else: # This might happen if one box is fully contained vertically # within another, or due to floating point issues. # Log a warning and skip adjustment for this pair in this iteration. # A more complex strategy might be needed for full containment. logger.warning( "Could not resolve overlap between paragraphs" f" {higher_para.debug_id} and {lower_para.debug_id}" " using simple midpoint strategy." f" Midpoint: {mid_y}," f" Higher Box: {higher_para.box}," f" Lower Box: {lower_para.box}" ) # If no overlaps were found and adjusted in this pass, we're done. if not overlap_found_in_pass: break if iterations == max_iterations: logger.warning( f"Maximum iterations ({max_iterations}) reached in" f" fix_overlapping_paragraphs for page {page.page_number}." " Some overlaps might remain." 
class RemoveDescent:
    """Pipeline stage that shifts character and paragraph boxes by the font descent."""

    stage_name = "Remove Char Descent"

    def __init__(self, translation_config: TranslationConfig):
        self.translation_config = translation_config

    def _remove_char_descent(
        self,
        char: il_version_1.PdfCharacter,
        font: il_version_1.PdfFont,
    ) -> float | None:
        """Remove descent from a single character and return the descent value.

        The descent is ``font.descent * font_size / 1000``. Vertical characters
        are shifted along x (both ``x`` and ``x2`` increased); horizontal ones
        along y (both ``y`` and ``y2`` decreased).

        Args:
            char: The character to process (its box is mutated in place)
            font: The font used by this character

        Returns:
            The descent value if it was removed, None otherwise
        """
        if not (
            char.box
            and char.box.y is not None
            and char.box.y2 is not None
            and font
        ):
            return None

        # `hasattr` alone cannot tell a missing metric from one set to None,
        # and multiplying None would raise TypeError, so check the values.
        font_descent = getattr(font, "descent", None)
        font_size = char.pdf_style.font_size
        if font_descent is None or font_size is None:
            return None

        descent = font_descent * font_size / 1000
        if char.vertical:
            # For vertical text, remove descent from x coordinates
            char.box.x += descent
            char.box.x2 += descent
        else:
            # For horizontal text, remove descent from y coordinates
            char.box.y -= descent
            char.box.y2 -= descent
        return descent

    @staticmethod
    def _composition_chars(
        comp: il_version_1.PdfParagraphComposition,
    ) -> list[il_version_1.PdfCharacter]:
        """Return the characters held by a composition, whatever its kind."""
        if comp.pdf_character:
            return [comp.pdf_character]
        if comp.pdf_line:
            return comp.pdf_line.pdf_character
        if comp.pdf_formula:
            return comp.pdf_formula.pdf_character
        if comp.pdf_same_style_characters:
            return comp.pdf_same_style_characters.pdf_character
        return []

    def process(self, document: il_version_1.Document):
        """Process the document to remove descent adjustments from character boxes.

        Args:
            document: The document to process
        """
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            len(document.page),
        ) as pbar:
            for page in document.page:
                self.translation_config.raise_if_cancelled()
                self.process_page(page)
                pbar.advance()

    def process_page(self, page: il_version_1.Page):
        """Process a single page to remove descent adjustments.

        Standalone page characters and every character inside a paragraph are
        shifted individually. Each paragraph box is then shifted by the most
        common descent among its characters, along x when all of its adjusted
        characters are vertical and along y otherwise.

        Args:
            page: The page to process
        """
        # Page-level fonts and per-XObject fonts live in separate maps so a
        # font id can never be confused with an XObject id.
        page_fonts = {f.font_id: f for f in page.pdf_font}
        xobj_fonts: dict[int, dict[str, il_version_1.PdfFont]] = {}
        for xobj in page.pdf_xobject:
            # An XObject sees the page fonts, overridden by its own.
            font_map = page_fonts.copy()
            for font in xobj.pdf_font:
                font_map[font.font_id] = font
            xobj_fonts[xobj.xobj_id] = font_map

        @cache
        def get_font(
            font_id: str,
            xobj_id: int | None = None,
        ) -> il_version_1.PdfFont | None:
            font_map = xobj_fonts.get(xobj_id) if xobj_id is not None else None
            if font_map is not None and font_id in font_map:
                return font_map[font_id]
            return page_fonts.get(font_id)

        # Process all standalone characters in the page
        for char in page.pdf_character:
            if font := get_font(char.pdf_style.font_id, char.xobj_id):
                self._remove_char_descent(char, font)

        # Process all paragraphs
        for paragraph in page.pdf_paragraph:
            descent_values = []
            vertical_chars = []

            for comp in paragraph.pdf_paragraph_composition:
                for char in self._composition_chars(comp):
                    font = get_font(char.pdf_style.font_id, char.xobj_id)
                    if not font:
                        continue
                    descent = self._remove_char_descent(char, font)
                    if descent is not None:
                        descent_values.append(descent)
                        vertical_chars.append(char.vertical)

            # Adjust paragraph box based on most common descent value
            if not (descent_values and paragraph.box):
                continue
            if paragraph.box.y is None or paragraph.box.y2 is None:
                continue

            most_common_descent = Counter(descent_values).most_common(1)[0][0]
            # The paragraph counts as vertical only if every adjusted
            # character is vertical.
            if all(vertical_chars):
                paragraph.box.x += most_common_descent
                paragraph.box.x2 += most_common_descent
            else:
                paragraph.box.y -= most_common_descent
                paragraph.box.y2 -= most_common_descent
babeldoc.format.pdf.document_il.utils.fontmap import FontMapper from babeldoc.format.pdf.document_il.utils.formular_helper import ( collect_page_formula_font_ids, ) from babeldoc.format.pdf.document_il.utils.formular_helper import ( is_formulas_middle_char, ) from babeldoc.format.pdf.document_il.utils.formular_helper import is_formulas_start_char from babeldoc.format.pdf.document_il.utils.formular_helper import update_formula_data from babeldoc.format.pdf.document_il.utils.layout_helper import LEFT_BRACKET from babeldoc.format.pdf.document_il.utils.layout_helper import RIGHT_BRACKET from babeldoc.format.pdf.document_il.utils.layout_helper import build_layout_index from babeldoc.format.pdf.document_il.utils.layout_helper import calculate_iou_for_boxes from babeldoc.format.pdf.document_il.utils.layout_helper import ( calculate_y_true_iou_for_boxes, ) from babeldoc.format.pdf.document_il.utils.layout_helper import is_bullet_point from babeldoc.format.pdf.document_il.utils.layout_helper import ( is_curve_in_figure_table_layout, ) from babeldoc.format.pdf.document_il.utils.layout_helper import ( is_curve_overlapping_with_paragraphs, ) from babeldoc.format.pdf.document_il.utils.layout_helper import is_same_style from babeldoc.format.pdf.document_il.utils.spatial_analyzer import ( is_element_contained_in_formula, ) from babeldoc.format.pdf.translation_config import TranslationConfig class StylesAndFormulas: stage_name = "Parse Formulas and Styles" def __init__(self, translation_config: TranslationConfig): self.translation_config = translation_config self.font_mapper = FontMapper(translation_config) def update_formula_data(self, formula: PdfFormula): update_formula_data(formula) def process(self, document: Document): with self.translation_config.progress_monitor.stage_start( self.stage_name, len(document.page), ) as pbar: for page in document.page: self.translation_config.raise_if_cancelled() self.process_page(page) pbar.advance() def update_all_formula_data(self, page: 
def _calculate_element_formula_distance(
    self, element_box: Box, formula_box: Box
) -> float:
    """Return the Euclidean gap between an element box and a formula box.

    The gap is measured per axis: zero when the two intervals overlap on that
    axis, otherwise the distance between the facing edges. The two axis gaps
    are then combined as ``sqrt(dx**2 + dy**2)``.

    Args:
        element_box: Bounding box of the element (curve/form)
        formula_box: Bounding box of the formula

    Returns:
        Shortest distance between the two boxes, or ``inf`` when either box
        is None.
    """
    if element_box is None or formula_box is None:
        return float("inf")

    def axis_gap(lo, hi, other_lo, other_hi):
        # Element interval lies entirely before the other interval.
        if hi < other_lo:
            return other_lo - hi
        # Element interval lies entirely after the other interval.
        if lo > other_hi:
            return lo - other_hi
        # Intervals overlap on this axis.
        return 0.0

    gap_x = axis_gap(element_box.x, element_box.x2, formula_box.x, formula_box.x2)
    gap_y = axis_gap(element_box.y, element_box.y2, formula_box.y, formula_box.y2)

    return (gap_x * gap_x + gap_y * gap_y) ** 0.5
Tolerant IoU matching (2.0 tolerance, distance-sorted) - second priority Returns: Tuple of (all_formulas, curve_candidates, form_candidates) where: - all_formulas: list of (formula, paragraph_xobj_id) tuples - curve_candidates: dict mapping curve index to (curve, candidates) tuples - form_candidates: dict mapping form index to (form, candidates) tuples where candidates is a list of (formula_index, score, match_type) tuples """ curve_candidates = {} form_candidates = {} # Configuration parameters max_tolerant_distance = 100.0 # Maximum distance for tolerant matching scoring if not page.pdf_paragraph: return [], curve_candidates, form_candidates # Collect all formulas from all paragraphs with their index all_formulas = [] for paragraph in page.pdf_paragraph: for composition in paragraph.pdf_paragraph_composition: if composition.pdf_formula: all_formulas.append((composition.pdf_formula, paragraph.xobj_id)) # Check each curve against all formulas for curve_idx, curve in enumerate(page.pdf_curve): if not curve.box: continue candidates = [] for formula_idx, (formula, paragraph_xobj_id) in enumerate(all_formulas): if not formula.box: continue # Check xobj_id compatibility if paragraph_xobj_id is not None and curve.xobj_id != paragraph_xobj_id: continue # Level 1: Exact IoU matching (zero tolerance) - highest priority if self._is_element_contained_exact(curve.box, formula.box): iou = calculate_iou_for_boxes(curve.box, formula.box) candidates.append((formula_idx, iou, "iou_exact")) # Level 2: Tolerant IoU matching (with tolerance) - distance sorted elif is_element_contained_in_formula(curve.box, formula.box): distance = self._calculate_element_formula_distance( curve.box, formula.box ) # Convert distance to score (closer = higher score) # Score range: 0.5-0.9 to ensure lower than exact IoU distance_factor = max(0.0, 1.0 - distance / max_tolerant_distance) score = 0.5 + 0.4 * distance_factor candidates.append((formula_idx, score, "iou_tolerant")) if candidates: 
curve_candidates[curve_idx] = (curve, candidates) # Check each form against all formulas for form_idx, form in enumerate(page.pdf_form): if not form.box: continue candidates = [] for formula_idx, (formula, paragraph_xobj_id) in enumerate(all_formulas): if not formula.box: continue # Check xobj_id compatibility if paragraph_xobj_id is not None and form.xobj_id != paragraph_xobj_id: continue # Level 1: Exact IoU matching (zero tolerance) - highest priority if self._is_element_contained_exact(form.box, formula.box): iou = calculate_iou_for_boxes(form.box, formula.box) candidates.append((formula_idx, iou, "iou_exact")) # Level 2: Tolerant IoU matching (with tolerance) - distance sorted elif is_element_contained_in_formula(form.box, formula.box): distance = self._calculate_element_formula_distance( form.box, formula.box ) # Convert distance to score (closer = higher score) # Score range: 0.5-0.9 to ensure lower than exact IoU distance_factor = max(0.0, 1.0 - distance / max_tolerant_distance) score = 0.5 + 0.4 * distance_factor candidates.append((formula_idx, score, "iou_tolerant")) if candidates: form_candidates[form_idx] = (form, candidates) return all_formulas, curve_candidates, form_candidates def _resolve_assignment_conflicts( self, curve_candidates: dict, form_candidates: dict ) -> tuple[dict, list, list]: """Resolve assignment conflicts using prioritized matching strategy. 
Args: curve_candidates: dict mapping curve index to (curve, candidates) tuples form_candidates: dict mapping form index to (form, candidates) tuples where candidates is a list of (formula_index, score, match_type) tuples Returns: Tuple of (formula_assignments, curves_to_remove, forms_to_remove) where: - formula_assignments: dict mapping formula_index to (curves, forms) tuples - curves_to_remove: list of curves to remove from page level - forms_to_remove: list of forms to remove from page level """ formula_assignments = {} curves_to_remove = [] forms_to_remove = [] def _get_best_candidate(candidates): """Get the best candidate using priority: Exact IoU > Tolerant IoU, then by score.""" if not candidates: return None # Sort by match_type priority and then by score (descending) def sort_key(candidate): formula_idx, score, match_type = candidate # Exact IoU matches get priority 1, tolerant IoU matches get priority 2 priority = 1 if match_type == "iou_exact" else 2 # Return tuple for sorting: (priority, -score) for descending score within priority return (priority, -score) sorted_candidates = sorted(candidates, key=sort_key) return sorted_candidates[0] # Resolve curve assignments for _curve_idx, (curve, candidates) in curve_candidates.items(): if not candidates: continue best_candidate = _get_best_candidate(candidates) if best_candidate: best_formula_idx, best_score, match_type = best_candidate # Add to assignments if best_formula_idx not in formula_assignments: formula_assignments[best_formula_idx] = ([], []) formula_assignments[best_formula_idx][0].append(curve) curves_to_remove.append(curve) # Resolve form assignments for _form_idx, (form, candidates) in form_candidates.items(): if not candidates: continue best_candidate = _get_best_candidate(candidates) if best_candidate: best_formula_idx, best_score, match_type = best_candidate # Add to assignments if best_formula_idx not in formula_assignments: formula_assignments[best_formula_idx] = ([], []) 
formula_assignments[best_formula_idx][1].append(form) forms_to_remove.append(form) return formula_assignments, curves_to_remove, forms_to_remove def collect_contained_elements(self, page: Page): """Collect curves and forms that are contained within formulas. Uses two-phase assignment strategy to ensure each element is assigned to only one formula based on highest IoU value. """ if not page.pdf_paragraph: return # Phase 1: Collect all potential element-formula assignments all_formulas, curve_candidates, form_candidates = ( self._collect_element_formula_candidates(page) ) # Phase 2: Resolve conflicts using IoU maximization formula_assignments, curves_to_remove, forms_to_remove = ( self._resolve_assignment_conflicts(curve_candidates, form_candidates) ) # Apply the resolved assignments using formula indices for formula_idx, ( assigned_curves, assigned_forms, ) in formula_assignments.items(): formula = all_formulas[formula_idx][0] # Extract formula from tuple formula.pdf_curve.extend(assigned_curves) formula.pdf_form.extend(assigned_forms) # Remove assigned elements from page level for curve in curves_to_remove: if curve in page.pdf_curve: page.pdf_curve.remove(curve) for form in forms_to_remove: if form in page.pdf_form: page.pdf_form.remove(form) def process_page(self, page: Page): """处理页面,包括公式识别和偏移量计算""" self.process_page_formulas(page) # self.process_page_offsets(page) self.process_comma_formulas(page) self.merge_overlapping_formulas(page) if not self.translation_config.skip_formula_offset_calculation: self.process_page_offsets(page) self.process_translatable_formulas(page) self.update_all_formula_data(page) if not self.translation_config.ocr_workaround: self.collect_contained_elements(page) # Process remaining non-formula lines after formula assignment is complete if self.translation_config.remove_non_formula_lines: self.remove_non_formula_lines_from_paragraphs(page) if not self.translation_config.skip_formula_offset_calculation: self.process_page_offsets(page) 
def update_line_data(self, line: PdfLine):
    """Set ``line.box`` to the bounding box of its characters' visual boxes.

    The line must hold at least one character: ``min``/``max`` raise
    ``ValueError`` when ``line.pdf_character`` is empty.
    """
    visual_boxes = [char.visual_bbox.box for char in line.pdf_character]
    x_lo = min(box.x for box in visual_boxes)
    y_lo = min(box.y for box in visual_boxes)
    x_hi = max(box.x2 for box in visual_boxes)
    y_hi = max(box.y2 for box in visual_boxes)
    line.box = Box(x_lo, y_lo, x_hi, y_hi)
is_start_of_segment = i == 0 or ( len(is_formula_tags) > 0 and is_formula_tags[-1] != in_formula_state ) if not first_is_bullet and is_start_of_segment and is_bullet_point(char): first_is_bullet = True is_formula = ( ( # 区分公式开头的字符&公式中间的字符。主要是逗号不能在公式开头,但是可以在中间。 char.formula_layout_id or ( is_formulas_start_char( char.char_unicode, self.font_mapper, self.translation_config, ) and not in_formula_state ) or ( is_formulas_middle_char( char.char_unicode, self.font_mapper, self.translation_config, ) and in_formula_state ) ) # 公式字符 or char.pdf_style.font_id in formula_font_ids # 公式字体 or char.vertical # 垂直字体 or ( # 如果是程序添加的 dummy 空格 char.char_unicode is None and in_formula_state ) or ( # 如果字符的视觉框和实际框不一致,则认为是公式字符 char.box.x > char.visual_bbox.box.x2 or char.box.x2 < char.visual_bbox.box.x or char.box.y > char.visual_bbox.box.y2 or char.box.y2 < char.visual_bbox.box.y ) ) previous_char = line.pdf_character[i - 1] if i > 0 else None next_char = ( line.pdf_character[i + 1] if i < len(line.pdf_character) - 1 else None ) isspace = char.char_unicode.isspace() if char.char_unicode else False prev_is_space = ( previous_char.char_unicode.isspace() if previous_char and previous_char.char_unicode else False ) is_corner_mark = ( ( previous_char is not None and not isspace and not prev_is_space and not first_is_bullet # 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况 and char.pdf_style.font_size < previous_char.pdf_style.font_size * 0.79 and not in_corner_mark_state ) or ( previous_char is not None and not isspace and not prev_is_space and not first_is_bullet # 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况 and char.pdf_style.font_size < previous_char.pdf_style.font_size * 1.1 and in_corner_mark_state ) or ( # 检查段落开始的角标:当没有前一个字符时,通过下一个字符判断 previous_char is None and next_char is not None and not isspace and not prev_is_space and not first_is_bullet # 当前字符字体大小明显小于下一个字符,判定为角标 and char.pdf_style.font_size < next_char.pdf_style.font_size * 0.79 and not in_corner_mark_state ) ) 
is_formula = is_formula or is_corner_mark if char.char_unicode == " ": is_formula = in_formula_state # This simulates the state change for the next iteration if is_formula != in_formula_state: in_formula_state = is_formula in_corner_mark_state = is_corner_mark is_formula_tags.append(is_formula) corner_mark_info.append(is_corner_mark) for char, is_formula, is_corner_mark in zip( line.pdf_character, is_formula_tags, corner_mark_info, strict=False ): tagged_chars.append((char, is_formula, is_corner_mark)) return tagged_chars, first_is_bullet def _group_classified_characters( self, tagged_chars: list[tuple[PdfCharacter, bool, bool]], line_index: int, ) -> list[PdfParagraphComposition]: """ Phase 2: Group consecutive characters with the same tag into new compositions. """ if not tagged_chars: return [] new_compositions = [] current_chars = [] current_tag = tagged_chars[0][1] current_corner_mark_flags = [] for char, is_formula_tag, is_corner_mark in tagged_chars: if is_formula_tag == current_tag: current_chars.append(char) current_corner_mark_flags.append(is_corner_mark) else: # Check if any character in current group is a corner mark has_corner_mark = any(current_corner_mark_flags) new_compositions.append( self.create_composition( current_chars, current_tag, line_index, has_corner_mark ), ) current_chars = [char] current_tag = is_formula_tag current_corner_mark_flags = [is_corner_mark] if current_chars: # Check if any character in final group is a corner mark has_corner_mark = any(current_corner_mark_flags) new_compositions.append( self.create_composition( current_chars, current_tag, line_index, has_corner_mark ), ) return new_compositions def process_page_formulas(self, page: Page): if not page.pdf_paragraph: return page_level_formula_font_ids, xobj_specific_formula_font_ids = ( collect_page_formula_font_ids( page, self.translation_config.formular_font_pattern ) ) for paragraph in page.pdf_paragraph: if not paragraph.pdf_paragraph_composition: continue 
def process_page_styles(self, page: Page):
    """Regroup each paragraph's text into runs of same-style characters.

    For every paragraph that has compositions, the paragraph style is set to
    the result of ``_calculate_base_style`` and the composition list is
    rebuilt: characters of line compositions are packed into runs built by
    ``_create_same_style_composition``, formula compositions are kept as they
    are and close the run in progress, and compositions with neither a
    formula nor a line are passed through unchanged.
    """
    if not page.pdf_paragraph:
        return

    for paragraph in page.pdf_paragraph:
        if not paragraph.pdf_paragraph_composition:
            continue

        # Base style: intersection of the styles of all non-formula text
        paragraph.pdf_style = self._calculate_base_style(paragraph)

        regrouped = []
        run_chars = []
        run_style = None

        for comp in paragraph.pdf_paragraph_composition:
            if comp.pdf_formula is not None:
                # A formula ends the current run; the run style itself is
                # not reset and is compared against the text that follows.
                if run_chars:
                    regrouped.append(
                        self._create_same_style_composition(run_chars, run_style)
                    )
                    run_chars = []
                regrouped.append(comp)
            elif not comp.pdf_line:
                # Neither formula nor line: keep it, without closing the run
                regrouped.append(comp)
            else:
                for char in comp.pdf_line.pdf_character:
                    style = char.pdf_style
                    if run_style is None:
                        run_style = style
                        run_chars.append(char)
                        continue
                    if is_same_style(style, run_style):
                        run_chars.append(char)
                        continue
                    # Style changed: emit the finished run and start a new one
                    if run_chars:
                        regrouped.append(
                            self._create_same_style_composition(
                                run_chars, run_style
                            )
                        )
                    run_chars = [char]
                    run_style = style

        if run_chars:
            regrouped.append(
                self._create_same_style_composition(run_chars, run_style)
            )

        paragraph.pdf_paragraph_composition = regrouped
if style2 is None or style2.font_size is None: return style1 return PdfStyle( font_id=style1.font_id if style1.font_id == style2.font_id else None, font_size=( style1.font_size if math.fabs(style1.font_size - style2.font_size) < 0.02 else None ), graphic_state=self._merge_graphic_states( style1.graphic_state, style2.graphic_state, ), ) def _merge_graphic_states(self, state1, state2): """合并两个 GraphicState,返回它们的交集""" if state1 is None: return state2 if state2 is None: return state1 return GraphicState( passthrough_per_char_instruction=( state1.passthrough_per_char_instruction if state1.passthrough_per_char_instruction == state2.passthrough_per_char_instruction else None ), ) def _create_same_style_composition( self, chars: list[PdfCharacter], style, ) -> PdfParagraphComposition: """创建具有相同样式的文本组合""" if not chars: return None # 计算边界框 min_x = min(char.visual_bbox.box.x for char in chars) min_y = min(char.visual_bbox.box.y for char in chars) max_x = max(char.visual_bbox.box.x2 for char in chars) max_y = max(char.visual_bbox.box.y2 for char in chars) box = Box(min_x, min_y, max_x, max_y) return PdfParagraphComposition( pdf_same_style_characters=PdfSameStyleCharacters( box=box, pdf_style=style, pdf_character=chars, ), ) def process_page_offsets(self, page: Page): """计算公式的 x 和 y 偏移量""" if not page.pdf_paragraph: return for paragraph in page.pdf_paragraph: if paragraph.debug_id is None: continue if not paragraph.pdf_paragraph_composition: continue # 计算该段落的行间距,用其 80% 作为容差 # line_spacing = self.calculate_line_spacing(paragraph) # y_tolerance = line_spacing * 0.8 for i, composition in enumerate(paragraph.pdf_paragraph_composition): if not composition.pdf_formula: continue formula = composition.pdf_formula left_char = None right_char = None left_iou = 0 right_iou = 0 # 查找左边最近的同一行的文本 for j in range(i - 1, -1, -1): comp = paragraph.pdf_paragraph_composition[j] if comp.pdf_line: for char in reversed(comp.pdf_line.pdf_character): if not char.pdf_character_id: continue # 检查 y 
坐标是否接近,判断是否在同一行 left_iou = calculate_y_true_iou_for_boxes( formula.box, char.box ) if left_iou > 0.6: left_char = char break break # 查找右边最近的同一行的文本 for j in range(i + 1, len(paragraph.pdf_paragraph_composition)): comp = paragraph.pdf_paragraph_composition[j] if comp.pdf_line: for char in comp.pdf_line.pdf_character: if not char.pdf_character_id: continue # 检查 y 坐标是否接近,判断是否在同一行 right_iou = calculate_y_true_iou_for_boxes( formula.box, char.box ) if right_iou > 0.6: right_char = char break break # If both text segments exist, keep the one with higher IOU if left_char and right_char: if left_iou < right_iou: left_char = None elif right_iou < left_iou: right_char = None # If IOUs are equal, keep both # 计算 x 偏移量(相对于左边文本) if left_char: formula.x_offset = formula.box.x - left_char.box.x2 else: formula.x_offset = 0 # 如果左边没有文字,x_offset 应该为 0 if abs(formula.x_offset) < 0.1: formula.x_offset = 0 if formula.x_offset > 10: formula.x_offset = 0 # if formula.x_offset > 0: # formula.x_offset = 0 if formula.x_offset < -5: formula.x_offset = 0 # 计算 y 偏移量 if left_char: # 使用底部坐标计算偏移量 formula.y_offset = formula.box.y - left_char.box.y elif right_char: formula.y_offset = formula.box.y - right_char.box.y else: formula.y_offset = 0 if abs(formula.y_offset) < 0.1: formula.y_offset = 0 if max(abs(formula.y_offset), abs(formula.x_offset)) > 10: pass # logging.debug( # f"公式 {formula.box} 的偏移量过大:{formula.x_offset}, {formula.y_offset}" # ) def calculate_line_spacing(self, paragraph) -> float: """计算段落中的平均行间距""" if not paragraph.pdf_paragraph_composition: return 0.0 # 收集所有文本行的 y 坐标 line_y_positions = [] for comp in paragraph.pdf_paragraph_composition: if comp.pdf_line: line_y_positions.append(comp.pdf_line.box.y) if len(line_y_positions) < 2: return 10.0 # 如果只有一行或没有行,返回一个默认值 # 计算相邻行之间的 y 差值 line_spacings = [] for i in range(len(line_y_positions) - 1): spacing = abs(line_y_positions[i] - line_y_positions[i + 1]) if spacing > 0: # 忽略重叠的行 line_spacings.append(spacing) if not line_spacings: return 10.0 
def split_formula_by_comma(
    self,
    formula: PdfFormula,
) -> list[tuple[list[PdfCharacter], PdfCharacter]]:
    """Split a formula's characters at its top-level commas.

    Only a comma outside every bracket pair acts as a separator. Bracket
    characters are those in ``LEFT_BRACKET`` / ``RIGHT_BRACKET``; the
    original documentation lists "(cid:8)"/"(cid:9)", "("/")" and
    "(cid:16)"/"(cid:17)" as the supported pairs.

    Args:
        formula: the formula whose ``pdf_character`` list is split.

    Returns:
        A list of ``(characters, comma)`` pairs in reading order. ``comma`` is
        the separator character that ended the group, or None for the
        trailing group that has no comma after it. ``characters`` is empty
        when a separator has nothing before it (a leading comma or two
        consecutive commas); the comma is still reported so that no
        character of the formula is lost.
    """
    result = []
    current_chars = []
    bracket_level = 0  # nesting depth of currently open brackets

    for char in formula.pdf_character:
        if char.char_unicode in LEFT_BRACKET:
            bracket_level += 1
            current_chars.append(char)
        elif char.char_unicode in RIGHT_BRACKET:
            # Clamp at zero so an unmatched closing bracket cannot make
            # later top-level commas look nested.
            bracket_level = max(0, bracket_level - 1)
            current_chars.append(char)
        elif char.char_unicode == "," and bracket_level == 0:
            # Always record the separator, even when the group before it is
            # empty: skipping it would silently drop the comma character.
            result.append((current_chars, char))
            current_chars = []
        else:
            current_chars.append(char)

    if current_chars:
        result.append((current_chars, None))  # trailing group, no comma

    return result
x 轴相邻且 y 轴 IOU > 0.5 的相邻公式,或者 3. 所有字符的 layout id 都相同的相邻公式,或者 4. 任意两个公式的 IOU > 0.8 角标可能会被识别成单独的公式,需要合并 """ if not page.pdf_paragraph: return for paragraph in page.pdf_paragraph: if not paragraph.pdf_paragraph_composition: continue # 重复执行合并过程,直到没有更多可以合并的公式 merged = True while merged: merged = False for i in range(len(paragraph.pdf_paragraph_composition)): if merged: break comp1 = paragraph.pdf_paragraph_composition[i] if comp1.pdf_formula is None: continue for j in range(i + 1, len(paragraph.pdf_paragraph_composition)): comp2 = paragraph.pdf_paragraph_composition[j] if comp2.pdf_formula is None: continue formula1 = comp1.pdf_formula formula2 = comp2.pdf_formula # 检查合并条件: # 0. 必须在同一行(line_id 相同),以及 # 1. x 轴重叠且 y 轴有交集,或者 # 2. x 轴相邻且 y 轴 IOU > 0.5,或者 # 3. 所有字符的 layout id 都相同,或者 # 4. 任意两个公式的 IOU > 0.8 # 检查是否在同一行 same_line = formula1.line_id == formula2.line_id should_merge = same_line and ( ( j == i + 1 and ( ( self.is_x_axis_contained( formula1.box, formula2.box ) and self.has_y_intersection( formula1.box, formula2.box ) ) or ( self.is_x_axis_adjacent( formula1.box, formula2.box ) and self.calculate_y_iou( formula1.box, formula2.box ) > 0.5 ) ) ) or (self._have_same_layout_ids(formula1, formula2, page)) or ( calculate_iou_for_boxes(formula1.box, formula2.box) > 0.8 ) or ( calculate_iou_for_boxes(formula2.box, formula1.box) > 0.8 ) ) if should_merge: # 合并公式 merged_formula = self.merge_formulas(formula1, formula2) paragraph.pdf_paragraph_composition[i] = ( PdfParagraphComposition( pdf_formula=merged_formula, ) ) # 删除第二个公式 del paragraph.pdf_paragraph_composition[j] merged = True break def _have_same_layout_ids( self, formula1: PdfFormula, formula2: PdfFormula, page: Page ) -> bool: """检查两个公式的所有字符是否具有相同的 layout id""" # 获取 formula1 中所有字符的 layout id formula1_layout_ids = set() for char in formula1.pdf_character: if char.char_unicode == " ": continue layout = char.formula_layout_id if layout: formula1_layout_ids.add(layout) # 获取 formula2 中所有字符的 layout id formula2_layout_ids = 
set() for char in formula2.pdf_character: if char.char_unicode == " ": continue layout = char.formula_layout_id if layout: formula2_layout_ids.add(layout) # 如果任一公式没有有效的 layout id,则不合并 if not (len(formula1_layout_ids) == len(formula2_layout_ids) == 1): return False # 检查两个公式的 layout id 集合是否相同 return formula1_layout_ids == formula2_layout_ids def process_comma_formulas(self, page: Page): """处理包含逗号的复杂公式,将其按逗号拆分""" if not page.pdf_paragraph: return for paragraph in page.pdf_paragraph: if not paragraph.pdf_paragraph_composition: continue new_compositions = [] for composition in paragraph.pdf_paragraph_composition: if composition.pdf_formula is not None and self.should_split_formula( composition.pdf_formula, ): # 按逗号拆分公式 char_groups = self.split_formula_by_comma(composition.pdf_formula) for chars, comma in char_groups: if chars: # 忽略空组(连续的逗号) # 继承原公式的行 ID formula = PdfFormula( pdf_character=chars, line_id=composition.pdf_formula.line_id, ) self.update_formula_data(formula) new_compositions.append( PdfParagraphComposition(pdf_formula=formula), ) # 如果有逗号,添加为文本行 if comma: comma_line = PdfLine(pdf_character=[comma]) self.update_line_data(comma_line) new_compositions.append( PdfParagraphComposition(pdf_line=comma_line), ) else: new_compositions.append(composition) paragraph.pdf_paragraph_composition = new_compositions def remove_non_formula_lines_from_paragraphs(self, page: Page): """Remove non-formula lines from paragraphs. This method processes curves that remain in page.pdf_curve after collect_contained_elements() has assigned formula-related curves to formulas. All remaining curves are non-formula lines, but we need to be careful not to remove lines from figure/table areas. 
class TableParser:
    """Run the configured table model on pages that contain a table layout.

    Boxes reported by the model are converted to IL coordinates and appended
    to the page's ``page_layout`` list. In debug mode the detected boxes are
    also drawn onto an image file and onto the page itself.
    """

    stage_name = "Parse Table"

    def __init__(self, translation_config: TranslationConfig):
        self.translation_config = translation_config
        self.model = translation_config.table_model

    def _save_debug_image(self, image: np.ndarray, layouts, page_number: int):
        """Save debug image with drawn boxes if debug mode is enabled.

        Args:
            image: page image the boxes were detected on; it is copied
                before drawing.
            layouts: one detection result or a list of them; each exposes
                ``boxes`` and ``names``.
            page_number: used as the file name of the written JPEG.
        """
        if not self.translation_config.debug:
            return

        if not isinstance(layouts, list):
            layouts = [layouts]

        debug_dir = Path(
            self.translation_config.get_working_file_path("table-ocr-box-image")
        )
        debug_dir.mkdir(parents=True, exist_ok=True)

        # Draw boxes on a copy of the image
        debug_image = image.copy()
        for layout in layouts:
            for box in layout.boxes:
                x0, y0, x1, y1 = box.xyxy
                cv2.rectangle(
                    debug_image,
                    (int(x0), int(y0)),
                    (int(x1), int(y1)),
                    (0, 255, 0),
                    2,
                )
                # Add the class name as a text label just above the box
                cv2.putText(
                    debug_image,
                    layout.names[box.cls],
                    (int(x0), int(y0) - 5),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (0, 255, 0),
                    1,
                )

        # Save the image
        output_path = debug_dir / f"{page_number}.jpg"
        cv2.imwrite(str(output_path), debug_image)

    def _save_debug_box_to_page(self, page: il_version_1.Page):
        """Save debug boxes and text labels to the PDF page.

        Does nothing unless debug mode is enabled. For every layout on the
        page a rectangle and a small label paragraph carrying the layout's
        class name are appended to the page.
        """
        if not self.translation_config.debug:
            return

        color = GREEN

        for layout in page.page_layout:
            # Create a rectangle box
            rect = il_version_1.PdfRectangle(
                box=il_version_1.Box(
                    x=layout.box.x,
                    y=layout.box.y,
                    x2=layout.box.x2,
                    y2=layout.box.y2,
                ),
                graphic_state=color,
                debug_info=True,
            )
            page.pdf_rectangle.append(rect)

            # Create text label at top-left corner
            # Note: PDF coordinates are from bottom-left,
            # so we use y2 for top position
            style = il_version_1.PdfStyle(
                font_id="base",
                font_size=4,
                graphic_state=color,
            )
            page.pdf_paragraph.append(
                il_version_1.PdfParagraph(
                    first_line_indent=False,
                    box=il_version_1.Box(
                        x=layout.box.x,
                        y=layout.box.y2,
                        x2=layout.box.x2,
                        y2=layout.box.y2 + 5,
                    ),
                    vertical=False,
                    pdf_style=style,
                    unicode=layout.class_name,
                    pdf_paragraph_composition=[
                        il_version_1.PdfParagraphComposition(
                            pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
                                unicode=layout.class_name,
                                pdf_style=style,
                                debug_info=True,
                            ),
                        ),
                    ],
                    xobj_id=-1,
                ),
            )

    def process(self, docs: il_version_1.Document, mupdf_doc: Document):
        """Detect table boxes on every page that has a "table" layout.

        Args:
            docs: the IL document; pages are updated in place.
            mupdf_doc: the matching PyMuPDF document, indexed by page number.

        Returns:
            The same ``docs`` object that was passed in.
        """
        # Only pages that already carry a "table" layout are sent to the model
        have_table_pages = {}
        for page in docs.page:
            if any(layout.class_name == "table" for layout in page.page_layout):
                have_table_pages[page.page_number] = page

        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            len(have_table_pages),
        ) as progress:
            # Process predictions for each page
            for page, layouts in self.model.handle_document(
                have_table_pages.values(),
                mupdf_doc,
                self.translation_config,
                self._save_debug_image,
            ):
                page_layouts = []
                # The page size is the same for every box of a page, so the
                # page is rendered at most once, and only when there is at
                # least one box, instead of once per detected box.
                h = w = None
                for layout in layouts.boxes:
                    if h is None:
                        pix = get_no_rotation_img(mupdf_doc[page.page_number])
                        h, w = pix.height, pix.width

                    # Convert from image coordinates to IL coordinates: the
                    # y axis is flipped, the box is grown by one unit on
                    # each side and clamped to the image bounds.
                    x0, y0, x1, y1 = layout.xyxy
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    page_layout = il_version_1.PageLayout(
                        id=len(page_layouts) + 1,
                        box=il_version_1.Box(
                            x0.item(),
                            y0.item(),
                            x1.item(),
                            y1.item(),
                        ),
                        conf=layout.conf.item(),
                        class_name=layouts.names[layout.cls],
                    )
                    page_layouts.append(page_layout)

                page.page_layout.extend(page_layouts)
                self._save_debug_box_to_page(page)
                progress.advance(1)

        return docs
# Matches a string that consists only of characters from the set below
# (Latin letters, digits and the listed script blocks, plus a few
# punctuation marks).
#
# NOTE(review): the adjacent literals r"'" r"-" r"·" near the end of the set
# concatenate to `'-·`, which character-class syntax reads as the RANGE
# U+0027..U+00B7, not as three literal characters. That range also admits
# ASCII punctuation such as ( ) * + , . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.
# If only a literal hyphen was intended (as the "Hyphen" comment suggests),
# it would need escaping or moving to the end of the set. Confirm the
# intent before changing, since this alters which strings match.
#
# NOTE(review): the Oriya block (\u0B00-\u0B7F) is listed twice; the
# duplicate is harmless.
LINE_BREAK_REGEX = regex.compile(
    r"^["
    r"a-z"
    r"A-Z"
    r"0-9"
    r"\u00C0-\u00FF"  # Latin-1 Supplement
    r"\u0100-\u017F"  # Latin Extended A
    r"\u0180-\u024F"  # Latin Extended B
    r"\u1E00-\u1EFF"  # Latin Extended Additional
    r"\u2C60-\u2C7F"  # Latin Extended C
    r"\uA720-\uA7FF"  # Latin Extended D
    r"\uAB30-\uAB6F"  # Latin Extended E
    r"\u0250-\u02A0"  # IPA Extensions
    r"\u0400-\u04FF"  # Cyrillic
    r"\u0300-\u036F"  # Combining Diacritical Marks
    r"\u0500-\u052F"  # Cyrillic Supplement
    r"\u0370-\u03FF"  # Greek and Coptic
    r"\u2DE0-\u2DFF"  # Cyrillic Extended-A
    r"\uA650-\uA69F"  # Cyrillic Extended-B
    r"\u1200-\u137F"  # Ethiopic
    r"\u1380-\u139F"  # Ethiopic Supplement
    r"\u2D80-\u2DDF"  # Ethiopic Extended
    r"\uAB00-\uAB2F"  # Ethiopic Extended-A
    r"\U0001E7E0-\U0001E7FF"  # Ethiopic Extended-B
    r"\u0E80-\u0EFF"  # Lao
    r"\u0D00-\u0D7F"  # Malayalam
    r"\u0A80-\u0AFF"  # Gujarati
    r"\u0E00-\u0E7F"  # Thai
    r"\u1000-\u109F"  # Myanmar
    r"\uAA60-\uAA7F"  # Myanmar Extended-A
    r"\uA9E0-\uA9FF"  # Myanmar Extended-B
    r"\U000116D0-\U000116FF"  # Myanmar Extended-C
    r"\u0B80-\u0BFF"  # Tamil
    r"\u0C00-\u0C7F"  # Telugu
    r"\u0B00-\u0B7F"  # Oriya
    r"\u0530-\u058F"  # Armenian
    r"\u10A0-\u10FF"  # Georgian
    r"\u1C90-\u1CBF"  # Georgian Extended
    r"\u2D00-\u2D2F"  # Georgian Supplement
    r"\u1780-\u17FF"  # Khmer
    r"\u19E0-\u19FF"  # Khmer Symbols
    r"\U00010B00-\U00010B3F"  # Avestan
    r"\u1D00-\u1D7F"  # Phonetic Extensions
    r"\u1400-\u167F"  # Unified Canadian Aboriginal Syllabics
    r"\u0B00-\u0B7F"  # Oriya
    r"\u0780-\u07BF"  # Thaana
    r"\U0001E900-\U0001E95F"  # Adlam
    r"\u1C80-\u1C8F"  # Cyrillic Extended-C
    r"\U0001E030-\U0001E08F"  # Cyrillic Extended-D
    r"\uA000-\uA48F"  # Yi Syllables
    r"\uA490-\uA4CF"  # Yi Radicals
    r"'"
    r"-"  # Hyphen
    r"·"  # Middle Dot (U+00B7) For Català
    r"ʻ"  # Spacing Modifier Letters U+02BB
    r"]+$"
)
def calc_is_cjk_char(self):
    """Return True when this unit is a single CJK or full-width character.

    Formula units, units without text, and units whose text is not exactly
    one code point (for example an unmapped glyph placeholder such as
    "(cid:12)") are never CJK.

    A single character counts as CJK when any of the following holds:
    it is one of the listed brackets / punctuation marks, it falls inside
    one of the listed CJK Unicode blocks, or its Unicode name contains
    "CJK UNIFIED IDEOGRAPH" or "FULLWIDTH".

    Returns:
        bool: whether the unit should be treated as a CJK character.
    """
    if self.formular:
        return False
    unicode = self.try_get_unicode()
    # A single length check replaces the former "(cid" test, the `len > 1`
    # test and the assert that followed them: all three only ever rejected
    # text that is not exactly one character long.
    if not unicode or len(unicode) != 1:
        return False
    if unicode in (
        "(",
        ")",
        "【",
        "】",
        "《",
        "》",
        "〔",
        "〕",
        "〈",
        "〉",
        "〖",
        "〗",
        "「",
        "」",
        "『",
        "』",
        "、",
        "。",
        ":",
        "?",
        "!",
        ",",
    ):
        return True
    if re.match(
        r"^["
        r"\u3000-\u303f"  # CJK Symbols and Punctuation
        r"\u3040-\u309f"  # Hiragana
        r"\u30a0-\u30ff"  # Katakana
        r"\u3100-\u312f"  # Bopomofo
        r"\uac00-\ud7af"  # Hangul Syllables
        r"\u1100-\u11ff"  # Hangul Jamo
        r"\u3130-\u318f"  # Hangul Compatibility Jamo
        r"\ua960-\ua97f"  # Hangul Jamo Extended-A
        r"\ud7b0-\ud7ff"  # Hangul Jamo Extended-B
        r"\u3190-\u319f"  # Kanbun
        r"\u3200-\u32ff"  # Enclosed CJK Letters and Months
        r"\u3300-\u33ff"  # CJK Compatibility
        r"\ufe30-\ufe4f"  # CJK Compatibility Forms
        r"\u4e00-\u9fff"  # CJK Unified Ideographs
        r"\u2e80-\u2eff"  # CJK Radicals Supplement
        r"\u31c0-\u31ef"  # CJK Strokes
        r"\u2f00-\u2fdf"  # Kangxi Radicals
        r"\ufe10-\ufe1f"  # Vertical Forms
        r"]+$",
        unicode,
    ):
        return True
    # Fall back to the Unicode character name; this catches ideographs in
    # the extension blocks and the full-width forms.
    try:
        unicodedata_name = unicodedata.name(unicode)
    except ValueError:
        # Unnamed / unassigned code point.
        return False
    return (
        "CJK UNIFIED IDEOGRAPH" in unicodedata_name
        or "FULLWIDTH" in unicodedata_name
    )
def calc_is_cannot_appear_in_line_end_punctuation(self):
    """Return True when this unit is opening punctuation.

    Opening quotes and brackets must not be left dangling at the end of a
    line; the layout loop uses this flag to wrap them onto the next line
    together with the text they introduce.

    Formula units and units without text always return False.

    Returns:
        bool: whether the unit is an opening quote or bracket.
    """
    if self.formular:
        return False
    unicode = self.try_get_unicode()
    if not unicode:
        return False
    return unicode in (
        # Opening quotation marks
        "“",  # left double quotation mark
        "‘",  # left single quotation mark
        "「",  # left corner bracket
        "『",  # left white corner bracket
        # Opening brackets
        "(",  # left parenthesis
        "[",  # left square bracket
        "{",  # left curly bracket
        # The original list repeated "(" here; the full-width form is listed
        # explicitly so both widths are covered.
        "\uff08",  # full-width left parenthesis
        "〔",  # left tortoise shell bracket
        "〈",  # left angle bracket
        "《",  # left double angle bracket
        # Lenticular and white brackets
        "【",  # left black lenticular bracket (was missing, although its
        # closing partner is handled as closing punctuation)
        "〖",  # left white lenticular bracket
        "〘",  # left white tortoise shell bracket
        "〚",  # left white square bracket
    )
def relocate(
    self,
    x: float,
    y: float,
    scale: float,
) -> TypesettingUnit:
    """Return a copy of this unit moved to (x, y) and scaled.

    The receiver is not modified; a new TypesettingUnit is built for each
    of the three unit kinds (character, formula, raw unicode) and the
    cached classification flags are copied over with `try_resue_cache`.

    Args:
        x: new x coordinate of the unit's lower-left corner
        y: new y coordinate of the unit's lower-left corner
        scale: scaling factor applied to sizes, offsets and advances

    Returns:
        The relocated typesetting unit. Falls through (returns None) when
        none of `char`, `formular` or `unicode` is truthy.
    """
    if self.char:
        # Build a new character whose box starts at (x, y) and whose
        # size is the current width/height multiplied by `scale`.
        new_char = PdfCharacter(
            pdf_character_id=self.char.pdf_character_id,
            char_unicode=self.char.char_unicode,
            box=Box(
                x=x,
                y=y,
                x2=x + self.width * scale,
                y2=y + self.height * scale,
            ),
            pdf_style=PdfStyle(
                font_id=self.char.pdf_style.font_id,
                font_size=self.char.pdf_style.font_size * scale,
                graphic_state=self.char.pdf_style.graphic_state,
            ),
            scale=scale,
            vertical=self.char.vertical,
            advance=self.char.advance * scale if self.char.advance else None,
            debug_info=self.debug_info,
            xobj_id=self.char.xobj_id,
        )
        # NOTE(review): no visual_bbox is passed to the new character and
        # debug_info is not forwarded to the new unit (it defaults to False).
        new_tu = TypesettingUnit(char=new_char)
        new_tu.try_resue_cache(self)
        return new_tu

    elif self.formular:
        # Build a new formula, keeping every inner character at the same
        # position relative to the formula's original lower-left corner.
        new_chars = []
        min_x = self.formular.box.x
        min_y = self.formular.box.y

        for char in self.formular.pdf_character:
            # Position relative to the formula origin, for both the
            # nominal box and the visual bounding box.
            # NOTE(review): assumes every formula character carries a
            # visual_bbox; a missing one raises AttributeError here.
            rel_x = char.box.x - min_x
            rel_y = char.box.y - min_y

            visual_rel_x = char.visual_bbox.box.x - min_x
            visual_rel_y = char.visual_bbox.box.y - min_y

            # Create the relocated character: relative position plus the
            # formula offset, scaled, then shifted to (x, y).
            new_char = PdfCharacter(
                pdf_character_id=char.pdf_character_id,
                char_unicode=char.char_unicode,
                box=Box(
                    x=x + (rel_x + self.formular.x_offset) * scale,
                    y=y + (rel_y + self.formular.y_offset) * scale,
                    x2=x
                    + (rel_x + (char.box.x2 - char.box.x) + self.formular.x_offset)
                    * scale,
                    y2=y
                    + (rel_y + (char.box.y2 - char.box.y) + self.formular.y_offset)
                    * scale,
                ),
                visual_bbox=il_version_1.VisualBbox(
                    box=Box(
                        x=x + (visual_rel_x + self.formular.x_offset) * scale,
                        y=y + (visual_rel_y + self.formular.y_offset) * scale,
                        x2=x
                        + (
                            visual_rel_x
                            + (char.visual_bbox.box.x2 - char.visual_bbox.box.x)
                            + self.formular.x_offset
                        )
                        * scale,
                        y2=y
                        + (
                            visual_rel_y
                            + (char.visual_bbox.box.y2 - char.visual_bbox.box.y)
                            + self.formular.y_offset
                        )
                        * scale,
                    ),
                ),
                pdf_style=PdfStyle(
                    font_id=char.pdf_style.font_id,
                    font_size=char.pdf_style.font_size * scale,
                    graphic_state=char.pdf_style.graphic_state,
                ),
                scale=scale,
                vertical=char.vertical,
                advance=char.advance * scale if char.advance else None,
                xobj_id=char.xobj_id,
            )
            new_chars.append(new_char)

        # Calculate bounding box from new_chars
        # (min()/max() raise ValueError if the formula has no characters.)
        min_x = min(char.visual_bbox.box.x for char in new_chars)
        min_y = min(char.visual_bbox.box.y for char in new_chars)
        max_x = max(char.visual_bbox.box.x2 for char in new_chars)
        max_y = max(char.visual_bbox.box.y2 for char in new_chars)

        new_formula = PdfFormula(
            box=Box(
                x=min_x,
                y=min_y,
                x2=max_x,
                y2=max_y,
            ),
            pdf_character=new_chars,
            x_offset=self.formular.x_offset * scale,
            y_offset=self.formular.y_offset * scale,
            x_advance=self.formular.x_advance * scale,
        )

        # Handle contained curves
        new_curves = []
        for curve in self.formular.pdf_curve:
            new_curve = self._transform_curve_for_relocation(
                curve,
                self.formular.box.x,
                self.formular.box.y,
                x,
                y,
                scale,
            )
            new_curves.append(new_curve)
        new_formula.pdf_curve = new_curves

        # Handle contained forms
        new_forms = []
        for form in self.formular.pdf_form:
            new_form = self._transform_form_for_relocation(
                form, self.formular.box.x, self.formular.box.y, x, y, scale
            )
            new_forms.append(new_form)
        new_formula.pdf_form = new_forms

        update_formula_data(new_formula)
        new_tu = TypesettingUnit(formular=new_formula)
        new_tu.try_resue_cache(self)
        return new_tu

    elif self.unicode:
        # A raw unicode unit has no PDF object yet: create a sibling unit
        # with the scaled font size and store the target position on it.
        # `render` later turns it into a PdfCharacter.
        new_unit = TypesettingUnit(
            unicode=self.unicode,
            font=self.font,
            original_font=self.original_font,
            font_size=self.font_size * scale,
            style=self.style,
            xobj_id=self.xobj_id,
            debug_info=self.debug_info,
        )
        new_unit.x = x
        new_unit.y = y
        new_unit.scale = scale
        new_unit.try_resue_cache(self)
        return new_unit
def _transform_form_for_relocation(
    self,
    form,
    original_formula_x: float,
    original_formula_y: float,
    new_x: float,
    new_y: float,
    scale: float,
):
    """Return a relocated deep copy of a form that belongs to this formula.

    The copy's box (when present) is moved exactly like the formula's
    characters: position relative to the original formula origin, plus the
    formula offset, scaled, then shifted to (new_x, new_y). The copy also
    receives a `relocation_transform` matrix describing the same
    translate-and-scale; the form's own matrices are left untouched.
    """
    import copy

    from babeldoc.format.pdf.document_il.utils.matrix_helper import (
        create_translation_and_scale_matrix,
    )

    offset_x = self.formular.x_offset
    offset_y = self.formular.y_offset

    relocated = copy.deepcopy(form)
    old_box = relocated.box
    if old_box:
        # Distance of the form from the formula's original lower-left corner.
        dx = old_box.x - original_formula_x
        dy = old_box.y - original_formula_y
        relocated.box = Box(
            x=new_x + (dx + offset_x) * scale,
            y=new_y + (dy + offset_y) * scale,
            x2=new_x + (dx + (old_box.x2 - old_box.x) + offset_x) * scale,
            y2=new_y + (dy + (old_box.y2 - old_box.y) + offset_y) * scale,
        )

    # Express the move as a separate transform instead of editing the
    # form's original matrices.
    shift_x = new_x + offset_x * scale - original_formula_x * scale
    shift_y = new_y + offset_y * scale - original_formula_y * scale
    matrix = create_translation_and_scale_matrix(shift_x, shift_y, scale)
    relocated.relocation_transform = list(matrix)
    return relocated
def __init__(self, translation_config: TranslationConfig):
    """Set up the typesetting stage for one translation run.

    Args:
        translation_config: the run configuration; `lang_out` decides
            whether CJK line spacing is used.
    """
    self.font_mapper = FontMapper(translation_config)
    self.translation_config = translation_config
    self.lang_code = self.translation_config.lang_out.upper()
    # Why zh-CN/zh-HK/zh-TW here but not zh-Hans and so on?
    # See https://funstory-ai.github.io/BabelDOC/supported_languages/
    #
    # Markers are matched as substrings of the upper-cased output language.
    # Both language and region spellings are accepted. "KO" is the Korean
    # language code; previously only the region code "KR" was checked, so
    # a plain "ko" target was not treated as CJK.
    cjk_markers = (
        "ZH",  # C (language)
        "JA",  # J (language)
        "JP",  # J (region)
        "KO",  # K (language)
        "KR",  # K (region)
        "CN",
        "HK",
        "TW",
    )
    self.is_cjk = any(marker in self.lang_code for marker in cjk_markers)
def _find_optimal_scale_and_layout(
    self,
    paragraph: il_version_1.PdfParagraph,
    page: il_version_1.Page,
    typesetting_units: list[TypesettingUnit],
    initial_scale: float = 1.0,
    use_english_line_break: bool = True,
    apply_layout: bool = False,
) -> tuple[float, list[TypesettingUnit] | None]:
    """Find the largest scale at which the units fit, optionally applying it.

    Starting from `initial_scale` the units are laid out repeatedly with a
    shrinking scale until they fit into the paragraph box. Once the scale
    drops below 0.7 the box is first grown downwards and then to the right
    (one attempt each). If nothing fits down to the minimum scale and
    English line-break rules were active, the search is repeated once
    without them.

    Args:
        paragraph: paragraph being typeset
        page: page that owns the paragraph
        typesetting_units: units to lay out
        initial_scale: scale to start the search from
        use_english_line_break: whether to keep words unbroken
        apply_layout: when True, write the result into `paragraph`
            (scale, composition, possibly enlarged box) and append
            rendered curves/forms to `page`

    Returns:
        tuple[float, list[TypesettingUnit] | None]: the chosen scale and
        the laid-out units, or None when no layout succeeded.
    """
    if not paragraph.box:
        return initial_scale, None

    log = logging.getLogger(__name__)

    box = paragraph.box
    scale = initial_scale
    line_skip = 1.50 if self.is_cjk else 1.3
    min_scale = 0.1
    # 0: nothing tried yet, 1: downward expansion tried, 2: both tried
    expand_space_flag = 0
    final_typeset_units = None

    while scale >= min_scale:
        try:
            # Try to lay out the units at the current scale.
            typeset_units, all_units_fit = self._layout_typesetting_units(
                typesetting_units,
                box,
                scale,
                line_skip,
                paragraph,
                use_english_line_break,
            )

            # Everything fits: optionally commit the result and stop.
            if all_units_fit:
                if apply_layout:
                    paragraph.scale = scale
                    paragraph.pdf_paragraph_composition = []
                    for unit in typeset_units:
                        chars, curves, forms = unit.render()
                        for char in chars:
                            paragraph.pdf_paragraph_composition.append(
                                PdfParagraphComposition(pdf_character=char),
                            )
                        for curve in curves:
                            page.pdf_curve.append(curve)
                        for form in forms:
                            page.pdf_form.append(form)
                final_typeset_units = typeset_units
                return scale, final_typeset_units
        except Exception:
            # Best effort: a failed attempt just moves on to the next
            # scale, but leave a trace instead of swallowing it silently.
            log.debug(
                "Layout attempt failed at scale %s; trying a smaller scale",
                scale,
                exc_info=True,
            )

        # Same check as the original retypeset logic: paragraphs without a
        # debug_id are not searched any further.
        if not getattr(paragraph, "debug_id", None):
            return scale, final_typeset_units

        # Shrink: fine steps above 0.6, coarse steps below.
        if scale > 0.6:
            scale -= 0.05
        else:
            scale -= 0.1

        if scale < 0.7:
            space_expanded = False  # did this iteration enlarge the box?

            if expand_space_flag == 0:
                # First try to grow the box downwards.
                try:
                    min_y = self.get_max_bottom_space(box, page) + 2
                    if min_y < box.y:
                        expanded_box = Box(x=box.x, y=min_y, x2=box.x2, y2=box.y2)
                        box = expanded_box
                        if apply_layout:
                            # Persist the enlarged box on the paragraph.
                            paragraph.box = expanded_box
                        space_expanded = True
                except Exception:
                    log.debug("Downward box expansion failed", exc_info=True)
                expand_space_flag = 1

                # Only retry immediately when the box actually grew.
                if space_expanded:
                    continue

            elif expand_space_flag == 1:
                # Then try to grow the box to the right.
                try:
                    max_x = self.get_max_right_space(box, page) - 5
                    if max_x > box.x2:
                        expanded_box = Box(x=box.x, y=box.y, x2=max_x, y2=box.y2)
                        box = expanded_box
                        if apply_layout:
                            # Persist the enlarged box on the paragraph.
                            paragraph.box = expanded_box
                        space_expanded = True
                except Exception:
                    log.debug("Rightward box expansion failed", exc_info=True)
                expand_space_flag = 2

                # Only retry immediately when the box actually grew.
                if space_expanded:
                    continue

            # While expansion attempts remain (flag < 2) and this one did
            # not enlarge the box, restart from full scale. Once both
            # attempts are used up the scale simply keeps shrinking.
            # NOTE(review): the restart happens after a *failed* expansion,
            # with the box unchanged; confirm this is intended.
            if expand_space_flag < 2:
                scale = 1.0

    # Still no fit: retry once without the English line-break constraint.
    if use_english_line_break:
        return self._find_optimal_scale_and_layout(
            paragraph,
            page,
            typesetting_units,
            initial_scale,
            use_english_line_break=False,
            apply_layout=apply_layout,
        )

    # Give up and report the minimum scale.
    return min_scale, final_typeset_units
def render_page(self, page: il_version_1.Page):
    """Typeset every paragraph of one page.

    Steps, in order:
      1. Build the font lookup used by `create_typesetting_units`: page
         fonts and mapper fonts keyed by font id, plus one nested dict per
         XObject keyed by XObject id.
      2. Add the watermark paragraph on the first page when the output
         mode asks for it.
      3. Raise the bottom edge of paragraphs that sit too close to a
         paragraph below them (best effort; failures are only logged).
      4. Render each paragraph.

    Args:
        page: the page to typeset; its paragraphs are modified in place.
    """
    page_fonts = {f.font_id: f for f in page.pdf_font if f.font_id}
    fonts: dict[
        str | int,
        il_version_1.PdfFont | dict[str, il_version_1.PdfFont],
    ] = dict(page_fonts)
    fonts.update(self.font_mapper.fontid2font)
    for xobj in page.pdf_xobject:
        # An XObject without an id cannot be looked up later, so there is
        # nothing to register for it. Previously its fonts were written to
        # fonts[None], which does not exist and raised KeyError.
        if xobj.xobj_id is None:
            continue
        xobj_fonts = page_fonts.copy()
        for font in xobj.pdf_font:
            if font.font_id:
                xobj_fonts[font.font_id] = font
        fonts[xobj.xobj_id] = xobj_fonts

    if (
        page.page_number == 0
        and self.translation_config.watermark_output_mode
        == WatermarkOutputMode.Watermarked
    ):
        self.add_watermark(page)

    try:
        para_index = index.Index()
        para_map = {}
        # Only paragraphs with a fully specified box take part.
        valid_paras = [
            p
            for p in page.pdf_paragraph
            if p.box
            and all(c is not None for c in [p.box.x, p.box.y, p.box.x2, p.box.y2])
        ]

        for i, para in enumerate(valid_paras):
            para_map[i] = para
            para_index.insert(i, box_to_tuple(para.box))

        for i, p_upper in para_map.items():
            upper = p_upper.box
            # Short paragraphs need a smaller gap to whatever is below.
            para_height = upper.y2 - upper.y
            required_gap = 0.5 if para_height < 36 else 3

            # Thin strip directly below the paragraph's bottom edge.
            check_area = il_version_1.Box(
                x=upper.x,
                y=upper.y - required_gap,
                x2=upper.x2,
                y2=upper.y,
            )

            candidate_ids = list(para_index.intersection(box_to_tuple(check_area)))

            conflicting_paras = []
            for para_id in candidate_ids:
                if para_id == i:
                    continue
                lower = para_map[para_id].box
                # Written out explicitly; the original mixed `and`/`or`
                # without parentheses.
                horizontally_disjoint = lower.x2 < upper.x or lower.x > upper.x2
                if not horizontally_disjoint:
                    conflicting_paras.append(para_map[para_id])

            if conflicting_paras:
                max_y2 = max(p.box.y2 for p in conflicting_paras)
                new_y = max_y2 + required_gap
                # Never push the bottom edge past the top edge.
                if new_y < upper.y2:
                    upper.y = new_y
    except Exception as e:
        logger.warning(
            f"Failed to adjust paragraph positions on page {page.page_number}: {e}"
        )

    # Start the actual rendering.
    for paragraph in page.pdf_paragraph:
        self.render_paragraph(paragraph, page, fonts)
def _layout_typesetting_units(
    self,
    typesetting_units: list[TypesettingUnit],
    box: Box,
    scale: float,
    line_skip: float,
    paragraph: il_version_1.PdfParagraph,
    use_english_line_break: bool = True,
) -> tuple[list[TypesettingUnit], bool]:
    """Place the units line by line inside `box` at the given scale.

    Units are placed left to right starting at the top of the box; a new
    line is started when the next unit (or, with English line breaking,
    the rest of the current word) would cross the right edge.

    Args:
        typesetting_units: units to lay out
        box: bounding box to lay out into
        scale: scaling factor applied to every unit
        line_skip: line-height multiplier used when moving to the next line
        paragraph: owning paragraph; only `first_line_indent` is read
        use_english_line_break: keep unbreakable runs on one line

    Returns:
        tuple[list[TypesettingUnit], bool]: (relocated units, whether all
        of them stayed above the bottom edge of the box). Returns
        ([], False) when a wrap is needed before anything was placed on
        the current line.
    """
    # Most common font size among the units (mode).
    # NOTE(review): statistics.mode raises StatisticsError on an empty
    # list, i.e. when no unit carries a font size.
    font_sizes = []
    for unit in typesetting_units:
        if unit.font_size:
            font_sizes.append(unit.font_size)
        if unit.char and unit.char.pdf_style and unit.char.pdf_style.font_size:
            font_sizes.append(unit.char.pdf_style.font_size)
    font_sizes.sort()
    font_size = statistics.mode(font_sizes)

    # Half the advance of a full-width glyph in the base font.
    space_width = (
        self.font_mapper.base_font.char_lengths("你", font_size * scale)[0] * 0.5
    )

    # Typical unit height (mode), scaled.
    unit_heights = (
        [unit.height for unit in typesetting_units] if typesetting_units else []
    )
    if not unit_heights:
        avg_height = 0
    elif len(unit_heights) == 1:
        avg_height = unit_heights[0] * scale
    else:
        try:
            avg_height = statistics.mode(unit_heights) * scale
        except statistics.StatisticsError:
            # No mode available: fall back to the arithmetic mean.
            avg_height = sum(unit_heights) / len(unit_heights) * scale

    # Start at the top-left corner, one typical line height below the top.
    current_x = box.x
    current_y = box.y2 - avg_height
    box = copy.deepcopy(box)
    # box.y -= avg_height * (line_spacing - 1.01)
    # line_spacing has been replaced by line_skip

    line_height = 0
    current_line_heights = []  # heights of everything placed on this line

    # Units placed so far.
    typeset_units = []
    all_units_fit = True
    last_unit: TypesettingUnit | None = None
    line_ys = [current_y]  # baseline y of every line started (not read below)
    if paragraph.first_line_indent:
        current_x += space_width * 4
    # Walk over all units.
    for i, unit in enumerate(typesetting_units):
        # Size of this unit at the current scale.
        unit_width = unit.width * scale
        unit_height = unit.height * scale

        # Skip spaces at the start of a line.
        if current_x == box.x and unit.is_space:
            continue

        if (
            last_unit  # there is a previous unit
            and last_unit.is_cjk_char ^ unit.is_cjk_char  # CJK / non-CJK boundary
            and (
                last_unit.box
                and last_unit.box.y
                and current_y - 0.1
                <= last_unit.box.y2
                <= current_y + line_height + 0.1
            )  # previous unit is on the current line
            and not last_unit.mixed_character_blacklist  # not excluded from mixed spacing
            and not unit.mixed_character_blacklist  # same for this unit
            and current_x > box.x  # not at the start of a line
            and unit.try_get_unicode() != " "  # not a space
            and last_unit.try_get_unicode() != " "  # not a space
            and last_unit.try_get_unicode()
            not in [
                "。",
                "!",
                "?",
                ";",
                ":",
                ",",
            ]
        ):
            # Insert a small gap between CJK and non-CJK text.
            current_x += space_width * 0.5

        # Width of the unbreakable run that follows this unit.
        if use_english_line_break:
            width_before_next_break_point = self._get_width_before_next_break_point(
                typesetting_units[i:], scale
            )
        else:
            width_before_next_break_point = 0

        # Wrap when the unit does not fit on the current line. Hanging
        # punctuation never triggers a wrap.
        if not unit.is_hung_punctuation and (
            (current_x + unit_width > box.x2)
            or (
                use_english_line_break
                and current_x + unit_width + width_before_next_break_point > box.x2
            )
            or (
                unit.is_cannot_appear_in_line_end_punctuation
                and current_x + unit_width * 2 > box.x2
            )
        ):
            # Start a new line.
            current_x = box.x
            if not current_line_heights:
                # Nothing was placed on this line: the box is too narrow.
                return [], False
            max_height = max(current_line_heights)
            mode_height = statistics.mode(current_line_heights)

            current_y -= max(mode_height * line_skip, max_height * 1.05)
            line_ys.append(current_y)
            line_height = 0.0
            current_line_heights = []  # reset for the new line

            # Check whether the new line is below the bottom edge.
            # if current_y - unit_height < box.y:
            if current_y < box.y:
                all_units_fit = False
                # Do not break here; keep laying out the remaining units.

            # NOTE(review): nesting reconstructed from a whitespace-collapsed
            # source; read as "a space that caused the wrap is not placed".
            if unit.is_space:
                line_height = max(line_height, unit_height)
                continue

        # Place the unit.
        relocated_unit = unit.relocate(current_x, current_y, scale)
        typeset_units.append(relocated_unit)

        # Record the height of non-space units for line-height decisions.
        if not unit.is_space:
            current_line_heights.append(unit_height)

        prev_x = current_x
        # Advance x to the right edge of the placed unit.
        current_x = relocated_unit.box.x2
        if prev_x > current_x:
            logger.warning(f"坐标回绕!!!TypesettingUnit: {unit.box}, ")

        last_unit = relocated_unit

    return typeset_units, all_units_fit
def create_passthrough_composition(
    self,
    typesetting_units: list[TypesettingUnit],
) -> list[PdfParagraphComposition]:
    """Turn units that need no re-typesetting back into compositions.

    A formula unit becomes a single composition holding the whole formula;
    any other unit contributes one composition per character returned by
    its `passthrough()`. Curves and forms returned by `passthrough()` are
    ignored here.

    Args:
        typesetting_units: units to convert, in paragraph order

    Returns:
        The paragraph compositions, in the same order as the units.
    """
    result = []
    for tu in typesetting_units:
        if tu.formular:
            # Keep the formula intact as one composition.
            result.append(PdfParagraphComposition(pdf_formula=tu.formular))
            continue
        passthrough_chars, _curves, _forms = tu.passthrough()
        for ch in passthrough_chars:
            result.append(PdfParagraphComposition(pdf_character=ch))
    return result
def get_max_bottom_space(self, current_box: Box, page: il_version_1.Page) -> float:
    """Find how far down a paragraph box may be extended.

    The search starts from the page crop box's lower edge multiplied by
    1.1 and is raised to the top edge of every paragraph, character and
    figure that lies below `current_box` and overlaps it horizontally.

    Args:
        current_box: bounding box of the paragraph being laid out
        page: page that contains the paragraph

    Returns:
        The smallest y coordinate the box may be extended down to.
    """

    def blocks_from_below(candidate) -> bool:
        # Entirely below the current box ...
        if not candidate.y2 < current_box.y:
            return False
        # ... and sharing some horizontal extent with it.
        return not (
            candidate.x >= current_box.x2 or candidate.x2 <= current_box.x
        )

    lowest_y = page.cropbox.box.y * 1.1

    # Other paragraphs; the paragraph itself and box-less ones are skipped.
    obstacles = [
        para.box
        for para in page.pdf_paragraph
        if not (para.box == current_box or para.box is None)
    ]
    # Loose characters, then figures.
    obstacles += [char.box for char in page.pdf_character]
    obstacles += [figure.box for figure in page.pdf_figure]

    for obstacle in obstacles:
        if blocks_from_below(obstacle):
            lowest_y = max(lowest_y, obstacle.y2)
    return lowest_y
import logging import shutil from collections import defaultdict from pathlib import Path import cv2 import numpy as np import pymupdf from rich.logging import RichHandler from sklearn.cluster import DBSCAN import babeldoc.format.pdf.high_level import babeldoc.format.pdf.translation_config from babeldoc.const import get_process_pool from babeldoc.format.pdf.document_il import il_version_1 logger = logging.getLogger(__name__) # --- Algorithm Tuning Parameters --- # --- Band Creation --- # Minimum vertical overlap ratio for a character to be added to an existing band. BAND_CREATION_OVERLAP_THRESHOLD = 0.5 # --- Line Clustering (within a band) --- # Epsilon for DBSCAN, as a multiplier of the average character width/height. LINE_CLUSTERING_EPS_MULTIPLIER = 3.5 # --- Line Splitting (for tall/wide lines) --- # A line is considered for splitting if its height/width is > X times the max char size. LINE_SPLIT_SIZE_RATIO_THRESHOLD = 1.5 # Epsilon for DBSCAN when splitting lines, as a multiplier of the max char size. LINE_SPLIT_DBSCAN_EPS_MULTIPLIER = 0.5 # --- Space Insertion (in a finalized line) --- # A space is inserted if the gap between chars is > X times the average char width. SPACE_INSERTION_GAP_MULTIPLIER = 0.45 # --- Line Merging (across the page) --- # --- Optimization --- # Maximum vertical gap to search for potential merges, as a multiplier of avg char height. MERGE_VERTICAL_GAP_MULTIPLIER = 1.5 # --- Containment Merge --- # Intersection-over-area threshold to consider one line as contained within another. MERGE_CONTAINMENT_IOU_THRESHOLD = 0.6 # --- Adjacency Merge --- # Minimum vertical/horizontal overlap for adjacent lines to be considered for merging. MERGE_ADJACENCY_OVERLAP_THRESHOLD = 0.7 # Maximum gap between adjacent lines to merge, as a multiplier of avg char size. 
MERGE_ADJACENCY_GAP_MULTIPLIER = 1.5 # --- End of Parameters --- def parse_pdf(pdf_path, page_ranges=None) -> il_version_1.Document: translation_config = babeldoc.format.pdf.translation_config.TranslationConfig( *[None for _ in range(4)], doc_layout_model=None ) if page_ranges: translation_config.page_ranges = [page_ranges] translation_config.progress_monitor = ( babeldoc.format.pdf.high_level.ProgressMonitor( babeldoc.format.pdf.high_level.TRANSLATE_STAGES ) ) try: shutil.copy(pdf_path, translation_config.get_working_file_path("input.pdf")) doc = pymupdf.open(pdf_path) il_creater = babeldoc.format.pdf.high_level.ILCreater(translation_config) il_creater.mupdf = doc with Path(translation_config.get_working_file_path("input.pdf")).open( "rb" ) as f: babeldoc.format.pdf.high_level.start_parse_il( f, doc_zh=doc, resfont="test_font", il_creater=il_creater, translation_config=translation_config, ) il = il_creater.create_il() doc.close() return il finally: translation_config.cleanup_temp_files() return None class Line: def __init__(self, chars: list[tuple[il_version_1.Box, str, bool]]): self.chars = chars self.text = "".join([c[1] for c in chars]) def _recalculate_line_text_with_spacing(line, orientation): if not line.chars: line.text = "" return if orientation == "horizontal": def get_main_start(c): return c[0].x def get_main_end(c): return c[0].x2 def get_main_size(c): return c[0].x2 - c[0].x else: # vertical def get_main_start(c): return c[0].y def get_main_end(c): return c[0].y2 def get_main_size(c): return c[0].y2 - c[0].y line_text = "" avg_width = np.mean( [get_main_size(c) for c in line.chars if get_main_size(c) > 0] or [0] ) if len(line.chars) > 1 and avg_width > 0: for i in range(len(line.chars) - 1): c1, c2 = line.chars[i], line.chars[i + 1] gap = get_main_start(c2) - get_main_end(c1) if gap > avg_width * SPACE_INSERTION_GAP_MULTIPLIER: line_text += c1[1] + " " else: line_text += c1[1] if line.chars: line_text += line.chars[-1][1] line.text = line_text # [box, 
char_unicode, vertical] # vertical: True if the char is vertical, False if the char is horizontal def extract_paragraph_line( pdf_path, ) -> dict[int, list[tuple[il_version_1.Box, str, bool]]]: il = parse_pdf(pdf_path) if il is None: return None line_boxes = {} for page in il.page: line_boxes[page.page_number] = convert_page_to_char_boxes(page) return line_boxes def convert_page_to_char_boxes( page: il_version_1.Page, ) -> list[tuple[il_version_1.Box, str, bool]]: return [ (char.visual_bbox.box, char.char_unicode, char.vertical) for char in page.pdf_character ] def _cluster_by_axis(chars: list[tuple[il_version_1.Box, str, bool]], orientation: str): """ A generalized function to cluster characters into lines based on main and secondary axes. """ if not chars: return [] # Define main and secondary axes based on orientation if orientation == "horizontal": def get_secondary_start(c): return c[0].y def get_secondary_end(c): return c[0].y2 def get_main_start(c): return c[0].x def get_main_end(c): return c[0].x2 def get_main_size(c): return c[0].x2 - c[0].x else: # vertical def get_secondary_start(c): return c[0].x def get_secondary_end(c): return c[0].x2 def get_main_start(c): return c[0].y def get_main_end(c): return c[0].y2 def get_main_size(c): return c[0].y2 - c[0].y # Step 1: Group chars into bands along the secondary axis based on overlap. # This is an optimized version of the band clustering algorithm. # It avoids the O(N^2) complexity of the naive approach by making # assumptions based on the sorted order of characters. 
chars.sort(key=get_secondary_start) # Each band is a tuple: (list_of_chars, min_secondary_coord, max_secondary_coord) bands_data: list[tuple[list, float, float]] = [] for char in chars: char_secondary_start = get_secondary_start(char) char_secondary_end = get_secondary_end(char) char_secondary_size = char_secondary_end - char_secondary_start best_band_index = -1 max_overlap_ratio = ( BAND_CREATION_OVERLAP_THRESHOLD # Minimum overlap ratio to be considered ) # Iterate backwards over bands, as recent bands are more likely to overlap. for i in range(len(bands_data) - 1, -1, -1): band_chars, band_secondary_start, band_secondary_end = bands_data[i] # Optimization: If the band is already far above the current char, # and since chars are sorted by start, no further bands will match. if band_secondary_end < char_secondary_start: break overlap = max( 0, min(char_secondary_end, band_secondary_end) - max(char_secondary_start, band_secondary_start), ) if char_secondary_size > 0: overlap_ratio = overlap / char_secondary_size if overlap_ratio > max_overlap_ratio: max_overlap_ratio = overlap_ratio best_band_index = i if best_band_index != -1: # Add char to the best matching band and update its boundaries band_chars, band_start, band_end = bands_data[best_band_index] band_chars.append(char) updated_band = ( band_chars, min(band_start, char_secondary_start), max(band_end, char_secondary_end), ) bands_data[best_band_index] = updated_band # Move the updated band to the end to maintain rough locality bands_data.append(bands_data.pop(best_band_index)) else: # No suitable band found, create a new one bands_data.append(([char], char_secondary_start, char_secondary_end)) # Extract final bands from the data structure bands = [b[0] for b in bands_data] # Step 2: For each band, cluster along the main axis using DBSCAN final_lines = [] for band in bands: if len(band) < 1: continue main_axis_sizes = [get_main_size(c) for c in band if get_main_size(c) > 0] avg_main_size = 
np.mean(main_axis_sizes) if main_axis_sizes else 10 # Epsilon for main-axis clustering is twice the average character size in that dimension eps = avg_main_size * LINE_CLUSTERING_EPS_MULTIPLIER centroids = np.array( [((c[0].x + c[0].x2) / 2, (c[0].y + c[0].y2) / 2) for c in band] ) if centroids.size > 0: db = DBSCAN(eps=eps, min_samples=1, metric="manhattan").fit(centroids) line_groups = defaultdict(list) for i, label in enumerate(db.labels_): if label != -1: line_groups[label].append(band[i]) for _, line in line_groups.items(): line.sort(key=get_main_start) final_lines.append(Line(line)) # Step 3: Split lines that are too tall/wide, which likely contain multiple distinct lines from different columns processed_lines = [] for line in final_lines: if not line.chars: continue line_secondary_start = min(get_secondary_start(c) for c in line.chars) line_secondary_end = max(get_secondary_end(c) for c in line.chars) line_secondary_size = line_secondary_end - line_secondary_start char_secondary_sizes = [ get_secondary_end(c) - get_secondary_start(c) for c in line.chars if get_secondary_end(c) - get_secondary_start(c) > 0 ] if not char_secondary_sizes: processed_lines.append(line) continue max_char_secondary_size = np.max(char_secondary_sizes) if ( line_secondary_size > max_char_secondary_size * LINE_SPLIT_SIZE_RATIO_THRESHOLD and len(line.chars) > 1 ): # logger.debug( # f"Splitting line '{line.text}' which seems to contain multiple lines." 
# ) # Use DBSCAN on the secondary axis centers to split the line centers = np.array( [ [(get_secondary_start(c) + get_secondary_end(c)) / 2] for c in line.chars ] ) db = DBSCAN( eps=max_char_secondary_size * LINE_SPLIT_DBSCAN_EPS_MULTIPLIER, min_samples=1, ).fit(centers) sub_lines = defaultdict(list) for i, label in enumerate(db.labels_): sub_lines[label].append(line.chars[i]) for _, sub_line_chars in sub_lines.items(): sub_line_chars.sort(key=get_main_start) processed_lines.append(Line(sub_line_chars)) else: processed_lines.append(line) final_lines = processed_lines for line in final_lines: _recalculate_line_text_with_spacing(line, orientation) return final_lines def _merge_lines_on_page(page_lines: list[Line]) -> list[Line]: """ Merge lines on a page that are either contained within or adjacent to each other. This function contains both containment and adjacency merge logic. """ if not page_lines: return [] merged_lines = [] lines_to_skip = set() for i in range(len(page_lines)): if i in lines_to_skip: continue line1 = page_lines[i] if not line1.chars: merged_lines.append(line1) continue bbox1 = ( min(c[0].x for c in line1.chars), min(c[0].y for c in line1.chars), max(c[0].x2 for c in line1.chars), max(c[0].y2 for c in line1.chars), ) # Optimization: Calculate a vertical gap threshold to prune the search space. # Based on the vertical adjacency merge condition. line1_avg_char_height = np.mean( [c[0].y2 - c[0].y for c in line1.chars if c[0].y2 > c[0].y] or [0] ) max_v_gap = line1_avg_char_height * MERGE_VERTICAL_GAP_MULTIPLIER merged = False for j in range(i + 1, len(page_lines)): if j in lines_to_skip: continue line2 = page_lines[j] if not line2.chars: continue bbox2 = ( min(c[0].x for c in line2.chars), min(c[0].y for c in line2.chars), max(c[0].x2 for c in line2.chars), max(c[0].y2 for c in line2.chars), ) # Optimization: if line2 is too far below line1, no more merges with line1 are possible. # The list is sorted top-to-bottom, so we can break early. 
v_gap = bbox1[1] - bbox2[3] # y_min_1 - y_max_2 if v_gap > max_v_gap: break # Check for "mostly contained" by checking intersection over area inter_x0 = max(bbox1[0], bbox2[0]) inter_y0 = max(bbox1[1], bbox2[1]) inter_x1 = min(bbox1[2], bbox2[2]) inter_y1 = min(bbox1[3], bbox2[3]) inter_area = max(0, inter_x1 - inter_x0) * max(0, inter_y1 - inter_y0) area1 = ( (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) if (bbox1[2] > bbox1[0] and bbox1[3] > bbox1[1]) else 0 ) area2 = ( (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) if (bbox2[2] > bbox2[0] and bbox2[3] > bbox2[1]) else 0 ) # Heuristic for merging: # 1. By containment: if one line is mostly inside another. # 2. By adjacency: if two lines are close and aligned. if ( area2 > 0 and area1 >= area2 and (inter_area / area2) > MERGE_CONTAINMENT_IOU_THRESHOLD ): # Case 1: Merge line2 (smaller) into line1 (larger) by containment # logger.debug( # f"Merging line '{line2.text}' into '{line1.text}' (mostly contained)" # ) line1.chars.extend(line2.chars) lines_to_skip.add(j) merged = True bbox1 = ( min(bbox1[0], bbox2[0]), min(bbox1[1], bbox2[1]), max(bbox1[2], bbox2[2]), max(bbox1[3], bbox2[3]), ) elif ( area1 > 0 and area2 > area1 and (inter_area / area1) > MERGE_CONTAINMENT_IOU_THRESHOLD ): # Case 2: Merge line1 (smaller) into line2 (larger) by containment # logger.debug( # f"Merging line '{line1.text}' into '{line2.text}' (mostly contained)" # ) line2.chars.extend(line1.chars) page_lines[i], page_lines[j] = page_lines[j], page_lines[i] line1 = page_lines[i] lines_to_skip.add(j) merged = True bbox1 = ( min(bbox1[0], bbox2[0]), min(bbox1[1], bbox2[1]), max(bbox1[2], bbox2[2]), max(bbox1[3], bbox2[3]), ) else: # Case 3: Merge by adjacency for lines that are close to each other orientation = "horizontal" if not line1.chars[0][2] else "vertical" if orientation == "horizontal": height1 = bbox1[3] - bbox1[1] height2 = bbox2[3] - bbox2[1] if height1 > 0 and height2 > 0: v_overlap = max( 0, min(bbox1[3], bbox2[3]) - max(bbox1[1], 
bbox2[1]), ) if ( v_overlap / height1 ) > MERGE_ADJACENCY_OVERLAP_THRESHOLD and ( v_overlap / height2 ) > MERGE_ADJACENCY_OVERLAP_THRESHOLD: h_gap = max(bbox1[0], bbox2[0]) - min(bbox1[2], bbox2[2]) if h_gap >= 0: avg_char_width = np.mean( [ c[0].x2 - c[0].x for c in (line1.chars + line2.chars) if c[0].x2 > c[0].x ] or [0] ) if ( avg_char_width > 0 and h_gap < avg_char_width * MERGE_ADJACENCY_GAP_MULTIPLIER ): # logger.debug( # f"Merging adjacent lines '{line1.text}' and '{line2.text}'" # ) line1.chars.extend(line2.chars) lines_to_skip.add(j) merged = True bbox1 = ( min(bbox1[0], bbox2[0]), min(bbox1[1], bbox2[1]), max(bbox1[2], bbox2[2]), max(bbox1[3], bbox2[3]), ) else: # Vertical width1 = bbox1[2] - bbox1[0] width2 = bbox2[2] - bbox2[0] if width1 > 0 and width2 > 0: h_overlap = max( 0, min(bbox1[2], bbox2[2]) - max(bbox1[0], bbox2[0]), ) if ( h_overlap / width1 ) > MERGE_ADJACENCY_OVERLAP_THRESHOLD and ( h_overlap / width2 ) > MERGE_ADJACENCY_OVERLAP_THRESHOLD: v_gap = max(bbox1[1], bbox2[1]) - min(bbox1[3], bbox2[3]) if v_gap >= 0: avg_char_height = np.mean( [ c[0].y2 - c[0].y for c in (line1.chars + line2.chars) if c[0].y2 > c[0].y ] or [0] ) if ( avg_char_height > 0 and v_gap < avg_char_height * MERGE_ADJACENCY_GAP_MULTIPLIER ): # logger.debug( # f"Merging adjacent vertical lines '{line1.text}' and '{line2.text}'" # ) line1.chars.extend(line2.chars) lines_to_skip.add(j) merged = True bbox1 = ( min(bbox1[0], bbox2[0]), min(bbox1[1], bbox2[1]), max(bbox1[2], bbox2[2]), max(bbox1[3], bbox2[3]), ) if merged: # Re-sort and recalculate text for the merged line orientation = ( "horizontal" if not line1.chars[0][2] else "vertical" ) # Guess orientation from first char if orientation == "horizontal": line1.chars.sort(key=lambda c: c[0].x) else: # vertical line1.chars.sort(key=lambda c: c[0].y) _recalculate_line_text_with_spacing(line1, orientation) merged_lines.append(line1) return merged_lines def process_page_chars_to_lines( chars: list[tuple[il_version_1.Box, str, 
def process_page_chars_to_lines(
    chars: list[tuple[il_version_1.Box, str, bool]],
) -> list[Line]:
    """Cluster one page's characters into lines, via the process pool if any.

    When ``get_process_pool()`` returns ``None`` the work is done in the
    current process; otherwise it is dispatched with ``pool.apply``.
    """
    worker_pool = get_process_pool()
    if worker_pool is not None:
        return worker_pool.apply(process_page_chars_to_lines_internal, (chars,))
    return process_page_chars_to_lines_internal(chars)


def process_page_chars_to_lines_internal(
    chars: list[tuple[il_version_1.Box, str, bool]],
) -> list[Line]:
    """
    Cluster the characters of a single page into merged lines.

    Args:
        chars: Character tuples (box, char_unicode, is_vertical)

    Returns:
        Line objects sorted top-to-bottom then left-to-right and merged by
        ``_merge_lines_on_page``; an empty list when ``chars`` is empty.
    """
    if not chars:
        return []

    horizontal_lines = _cluster_by_axis([c for c in chars if not c[2]], "horizontal")
    vertical_lines = _cluster_by_axis([c for c in chars if c[2]], "vertical")
    page_lines = horizontal_lines + vertical_lines

    def reading_order_key(line):
        members = line.chars
        if not members:
            return (0, 0)
        # PDF y grows upwards, so the mean y is negated to sort top-to-bottom.
        mean_y = np.mean([(c[0].y + c[0].y2) / 2 for c in members])
        mean_x = np.mean([(c[0].x + c[0].x2) / 2 for c in members])
        return (-mean_y, mean_x)

    page_lines.sort(key=reading_order_key)

    return _merge_lines_on_page(page_lines)
def cluster_chars_to_lines(
    char_boxes: dict[int, list[tuple[il_version_1.Box, str, bool]]],
) -> dict[int, list[Line]]:
    """Cluster the characters of every page into lines.

    Args:
        char_boxes: Mapping of page number to that page's character tuples.

    Returns:
        Mapping of page number to its merged lines; empty when ``char_boxes``
        is empty or ``None``.
    """
    if not char_boxes:
        return {}
    return {
        page_num: process_page_chars_to_lines(chars)
        for page_num, chars in char_boxes.items()
    }


def draw_clustered_lines_to_image(pdf_path, clustered_lines: dict[int, list[Line]]):
    """Render each page at 300 dpi and draw line and character boxes on it.

    Line boxes are drawn in red with their index and text, character boxes in
    green. One ``<page_number>_annotated.png`` per page with lines is written
    to ``ocr-box-image-clustered/<pdf stem>/``.

    Args:
        pdf_path: PDF the lines were extracted from.
        clustered_lines: Mapping of page number (used to index the document)
            to the lines found on that page.
    """
    debug_dir = Path("ocr-box-image-clustered") / Path(pdf_path).stem
    debug_dir.mkdir(parents=True, exist_ok=True)

    doc = pymupdf.open(pdf_path)
    try:
        for page_number, lines in clustered_lines.items():
            if not lines:
                continue

            page = doc[page_number]
            pixmap = page.get_pixmap(dpi=300)
            image_height = pixmap.height
            image_width = pixmap.width
            samples = bytearray(pixmap.samples)
            image_array = np.frombuffer(samples, dtype=np.uint8).reshape(
                image_height, image_width, pixmap.n
            )
            if pixmap.n in [3, 4]:
                image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)

            annotated_image = image_array.copy()
            page_rect = page.rect
            x_scale = image_width / page_rect.width
            y_scale = image_height / page_rect.height

            for i, line in enumerate(lines):
                # A Line object is always truthy, so test its chars; an empty
                # line would make min()/max() below raise ValueError.
                if line is None or not line.chars:
                    continue

                # Draw the encompassing line box first (red).
                char_boxes_in_line = [item[0] for item in line.chars]
                min_x = min(b.x for b in char_boxes_in_line)
                min_y = min(b.y for b in char_boxes_in_line)
                max_x2 = max(b.x2 for b in char_boxes_in_line)
                max_y2 = max(b.y2 for b in char_boxes_in_line)

                # PDF y grows upwards, image y grows downwards.
                img_x0_line = int(min_x * x_scale)
                img_y1_line = int(image_height - (max_y2 * y_scale))
                img_x1_line = int(max_x2 * x_scale)
                img_y0_line = int(image_height - (min_y * y_scale))

                cv2.rectangle(
                    annotated_image,
                    (img_x0_line, img_y1_line),
                    (img_x1_line, img_y0_line),
                    (0, 0, 255),  # Red for lines
                    2,
                )
                cv2.putText(
                    annotated_image,
                    f"line {i}: {line.text}",
                    (img_x0_line, img_y1_line - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.7,
                    (0, 0, 255),
                    2,
                )

                # Then draw the individual character boxes on top (green).
                for char_box, _, _ in line.chars:
                    img_x0_char = int(char_box.x * x_scale)
                    img_x1_char = int(char_box.x2 * x_scale)
                    img_y0_char = image_height - int(char_box.y * y_scale)
                    img_y1_char = image_height - int(char_box.y2 * y_scale)

                    cv2.rectangle(
                        annotated_image,
                        (img_x0_char, img_y1_char),
                        (img_x1_char, img_y0_char),
                        (0, 255, 0),  # Green for characters
                        1,  # Thinner line
                    )

            cv2.imwrite(
                str(debug_dir / f"{page_number}_annotated.png"), annotated_image
            )
    finally:
        doc.close()


def main():
    """Run extraction, clustering and debug drawing on a fixed list of PDFs."""
    logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])

    for pdf_path in (
        "2404.16109v1.pdf",
        "2022 - Bortoli_Valentin De, Mathieu_Emile - Riemannian Score-Based Generative Modelling.pdf",
        "2024 - Regev_Oded - On Lattices, Learning with Errors, Random Linear Codes, and Cryptography.pdf",
        "2024 - Yang_Tian-Le, Lee_Kuang-Yao - Functional Linear Non-Gaussian Acyclic Model for Causal Discovery.pdf",
    ):
        logger.info(f"Processing {pdf_path}")
        char_boxes = extract_paragraph_line(pdf_path)

        if not char_boxes:
            logger.warning(f"No character boxes extracted from {pdf_path}")
            continue

        logger.info(
            f"Extracted {sum(len(c) for c in char_boxes.values())} characters. Clustering them into lines..."
        )
        lines = cluster_chars_to_lines(char_boxes)
        total_lines = sum(len(page_lines) for page_lines in lines.values())
        logger.info(f"Clustered into {total_lines} lines. Drawing boxes...")

        draw_clustered_lines_to_image(pdf_path, lines)
        logger.info("Annotated images saved in 'ocr-box-image-clustered' directory.")
class PrimaryFontFamily(enum.IntEnum):
    """Font family the user asked to prefer for translated text."""

    SERIF = 1
    SANS_SERIF = 2
    SCRIPT = 3
    NONE = 4

    @classmethod
    def from_str(cls, value: str):
        """Map ``"serif"``/``"sans-serif"``/``"script"`` to a member, else ``NONE``."""
        if value == "serif":
            return cls.SERIF
        if value == "sans-serif":
            return cls.SANS_SERIF
        if value == "script":
            return cls.SCRIPT
        return cls.NONE


class FontMapper:
    """Choose replacement fonts for characters and register them in a PDF."""

    stage_name = "Add Fonts"

    def __init__(self, translation_config: TranslationConfig):
        """Load every font of the output language's font family.

        Each ``pymupdf.Font`` gets cached ``has_glyph``/``char_lengths`` and
        extra attributes (``font_id``, ``font_path``, ``ascent_fontmap``,
        ``descent_fontmap``, ``encoding_length``) taken from the asset
        metadata.
        """
        self.translation_config = translation_config
        assert translation_config.primary_font_family in [
            None,
            "serif",
            "sans-serif",
            "script",
        ]
        self.primary_font_family = PrimaryFontFamily.from_str(
            translation_config.primary_font_family,
        )

        font_family = assets.get_font_family(translation_config.lang_out)
        self.font_file_names = []
        for k in (
            "normal",
            "script",
            "fallback",
            "base",
        ):
            self.font_file_names.extend(font_family[k])

        self.fonts: dict[str, pymupdf.Font] = {}
        self.fontid2fontpath: dict[str, Path] = {}
        for font_file_name in self.font_file_names:
            if font_file_name in self.fontid2fontpath:
                continue
            font_path, font_metadata = assets.get_font_and_metadata(font_file_name)
            pymupdf_font = pymupdf.Font(fontfile=str(font_path))
            # Per-font caches; glyph lookups are repeated for every character.
            pymupdf_font.has_glyph = functools.lru_cache(maxsize=10240, typed=True)(
                pymupdf_font.has_glyph,
            )
            pymupdf_font.char_lengths = functools.lru_cache(maxsize=10240, typed=True)(
                pymupdf_font.char_lengths,
            )
            self.fonts[font_file_name] = pymupdf_font
            self.fontid2fontpath[font_file_name] = font_path
            self.fonts[font_file_name].font_id = font_file_name
            self.fonts[font_file_name].font_path = font_path
            self.fonts[font_file_name].ascent_fontmap = font_metadata["ascent"]
            self.fonts[font_file_name].descent_fontmap = font_metadata["descent"]
            self.fonts[font_file_name].encoding_length = font_metadata[
                "encoding_length"
            ]

        self.normal_font_ids: list[str] = font_family["normal"]
        self.script_font_ids: list[str] = font_family["script"]
        self.fallback_font_ids: list[str] = font_family["fallback"]
        self.base_font_ids: list[str] = font_family["base"]

        # "base" is an alias for the first base font.
        self.fontid2fontpath["base"] = self.fontid2fontpath[font_family["base"][0]]

        self.fontid2font: dict[str, pymupdf.Font] = {
            f.font_id: f for f in self.fonts.values()
        }
        self.fontid2font["base"] = self.fontid2font[self.base_font_ids[0]]

        self.normal_fonts: list[pymupdf.Font] = [
            self.fontid2font[font_id] for font_id in self.normal_font_ids
        ]
        self.script_fonts: list[pymupdf.Font] = [
            self.fontid2font[font_id] for font_id in self.script_font_ids
        ]
        self.fallback_fonts: list[pymupdf.Font] = [
            self.fontid2font[font_id] for font_id in self.fallback_font_ids
        ]
        self.base_font = self.fontid2font["base"]

        self.type2font: dict[str, list[pymupdf.Font]] = {
            "normal": self.normal_fonts,
            "script": self.script_fonts,
            "fallback": self.fallback_fonts,
            "base": [self.base_font],
        }

        # Per-instance caches wrapping the bound methods.
        self.has_char = functools.lru_cache(maxsize=10240, typed=True)(self.has_char)
        self.map_in_type = functools.lru_cache(maxsize=10240, typed=True)(
            self.map_in_type
        )

    def has_char(self, char_unicode: str):
        """Return True when any loaded font has a glyph for the single character.

        Strings whose length is not exactly 1 always yield False.
        """
        if len(char_unicode) != 1:
            return False
        current_char = ord(char_unicode)
        return any(font.has_glyph(current_char) for font in self.fonts.values())

    def map_in_type(
        self,
        bold: bool,
        italic: bool,
        monospaced: bool,
        serif: bool,
        char_unicode: str,
        font_type: str,
    ):
        """Return the first font of ``font_type`` matching glyph, weight and serif.

        Script fonts are only considered for italic text. ``monospaced`` is
        accepted but not used for matching. Returns ``None`` when nothing
        matches.
        """
        if font_type == "script" and not italic:
            return None
        current_char = ord(char_unicode)
        for font in self.type2font[font_type]:
            if not font.has_glyph(current_char):
                continue
            if bool(bold) != bool(font.is_bold):
                continue
            # Workaround: Source Han Sans reports serif=1 for an unknown
            # reason, so serif-ness is derived from the font id instead.
            if bool(serif) != ("serif" in font.font_id.lower()):
                continue
            return font
        return None

    def map(self, original_font: PdfFont, char_unicode: str):
        """Pick the output font for ``char_unicode`` given its original font.

        Lookup order: styled script fonts, any script font (italic only),
        styled normal fonts, styled fallback fonts, any fallback font with
        the glyph. The configured primary font family overrides the
        serif/italic traits of the original font.

        Returns:
            The chosen ``pymupdf.Font``, or ``None`` when the original font
            type is unknown or no font has the glyph.
        """
        current_char = ord(char_unicode)
        if isinstance(original_font, pymupdf.Font):
            bold = original_font.is_bold
            italic = original_font.is_italic
            monospaced = original_font.is_monospaced
            serif = original_font.is_serif
        elif isinstance(original_font, PdfFont):
            bold = original_font.bold
            italic = original_font.italic
            monospaced = original_font.monospace
            serif = original_font.serif
        else:
            logger.error(
                f"Unknown font type: {type(original_font)}. "
                f"Original font: {original_font}. "
                f"Char unicode: {char_unicode}. ",
            )
            return None

        if self.primary_font_family == PrimaryFontFamily.SERIF:
            serif = True
        elif self.primary_font_family == PrimaryFontFamily.SANS_SERIF:
            serif = False
        elif self.primary_font_family == PrimaryFontFamily.SCRIPT:
            serif = False
            italic = True

        script_font_map_result = self.map_in_type(
            bold, italic, monospaced, serif, char_unicode, "script"
        )
        if script_font_map_result:
            return script_font_map_result

        for script_font in self.script_fonts:
            if italic and script_font.has_glyph(current_char):
                return script_font

        for font_type in ("normal", "fallback"):
            styled_match = self.map_in_type(
                bold, italic, monospaced, serif, char_unicode, font_type
            )
            if styled_match is not None:
                return styled_match

        for font in self.fallback_fonts:
            if font.has_glyph(current_char):
                return font

        logger.warning(
            f"Can't find font for {char_unicode}({current_char}). "
            f"Original font: {original_font.name}[{original_font.font_id}]. "
            f"Char unicode: {char_unicode}. ",
        )
        return None

    def get_used_font_ids(self, il: il_version_1.Document) -> set[str]:
        """Collect the font ids referenced by page and paragraph characters."""
        result = set()
        for page in il.page:
            for char in page.pdf_character:
                if char.pdf_style and char.pdf_style.font_id:
                    result.add(char.pdf_style.font_id)
            for para in page.pdf_paragraph:
                for comp in para.pdf_paragraph_composition:
                    if char := comp.pdf_character:
                        if char.pdf_style and char.pdf_style.font_id:
                            result.add(char.pdf_style.font_id)
        return result

    def add_font(self, doc_zh: pymupdf.Document, il: il_version_1.Document):
        """Embed the used fonts into ``doc_zh`` and declare them in the IL.

        Fonts are inserted on the first page, then referenced from every
        object that has a font resource dictionary (directly or through an
        indirect reference). A ``PdfFont`` per font is finally appended to
        every page and XObject of ``il``.
        """
        used_font_ids = self.get_used_font_ids(il)
        font_list = [
            (k, v) for k, v in self.fontid2fontpath.items() if k in used_font_ids
        ]

        font_id = {}
        xreflen = doc_zh.xref_length()
        total = xreflen - 1 + len(font_list) + len(il.page) + len(font_list)
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            total,
        ) as pbar:
            if not il.page:
                pbar.advance(total)
                return
            for font in font_list:
                if font[0] in font_id:
                    continue
                font_id[font[0]] = doc_zh[0].insert_font(font[0], font[1])
                pbar.advance(1)
            for xref in range(1, xreflen):
                pbar.advance(1)
                # The font dict may hang off Resources or, for XObject-based
                # resources, directly off the object.
                for label in ["Resources/", ""]:
                    try:  # xref reads/writes may fail
                        font_res = doc_zh.xref_get_key(xref, f"{label}Font")
                        if font_res is None:
                            continue
                        # Keep the loop variable intact: rebinding ``xref``
                        # here made the next label query the wrong object.
                        target_xref = xref
                        target_key_prefix = f"{label}Font/"
                        if font_res[0] == "xref":
                            resource_xref_id = re.search(
                                "(\\d+) 0 R",
                                font_res[1],
                            ).group(1)
                            target_xref = int(resource_xref_id)
                            font_res = ("dict", doc_zh.xref_object(target_xref))
                            target_key_prefix = ""
                        if font_res[0] == "dict":
                            for font in font_list:
                                target_key = f"{target_key_prefix}{font[0]}"
                                font_exist = doc_zh.xref_get_key(
                                    target_xref, target_key
                                )
                                if font_exist[0] == "null":
                                    doc_zh.xref_set_key(
                                        target_xref,
                                        target_key,
                                        f"{font_id[font[0]]} 0 R",
                                    )
                    except Exception:
                        # Best effort: keep going, but leave a trace.
                        logger.debug(
                            "Failed to add font resources to xref %s (%sFont)",
                            xref,
                            label,
                            exc_info=True,
                        )

            # Create all PdfFont objects up front.
            pdf_fonts = []
            for font_name, _ in font_list:
                assert font_name in self.fontid2font, f"Font {font_name} not found"
                mupdf_font = self.fontid2font[font_name]

                pdf_fonts.append(
                    il_version_1.PdfFont(
                        name=font_name,
                        xref_id=font_id[font_name],
                        font_id=font_name,
                        encoding_length=mupdf_font.encoding_length,
                        bold=mupdf_font.is_bold,
                        italic=mupdf_font.is_italic,
                        monospace=mupdf_font.is_monospaced,
                        serif=mupdf_font.is_serif,
                        descent=mupdf_font.descent_fontmap,
                        ascent=mupdf_font.ascent_fontmap,
                    ),
                )
                pbar.advance(1)

            # Add the fonts to every page and XObject in one go.
            for page in il.page:
                page.pdf_font.extend(pdf_fonts)
                for xobj in page.pdf_xobject:
                    xobj.pdf_font.extend(pdf_fonts)
                pbar.advance(1)
def is_formulas_start_char(
    char: str,
    font_mapper: FontMapper,
    translation_config: TranslationConfig,
) -> bool:
    """Tell whether ``char`` may begin a formula.

    True for ``(cid:…)`` placeholders, characters no loaded font can render
    (unless every code point of a multi-character string can be rendered),
    matches of the user-supplied ``formular_char_pattern``, modifier / math /
    separator / private-use characters, Greek letters, digits, square
    brackets and the bullet. Empty input is never a formula character.
    """
    if not char:
        return False
    if "(cid:" in char:
        return True

    if not font_mapper.has_char(char):
        # A multi-character string counts as text when each piece is renderable.
        return not (len(char) > 1 and all(font_mapper.has_char(x) for x in char))

    custom_pattern = translation_config.formular_char_pattern
    if custom_pattern and re.match(custom_pattern, char):
        return True

    if char != " ":
        lead = char[0]
        # Modifier marks, math symbols, separators and private-use characters.
        if unicodedata.category(lead) in (
            "Mn",
            "Sk",
            "Sm",
            "Zl",
            "Zp",
            "Zs",
            "Co",
        ):
            return True
        # Greek block.
        if 0x370 <= ord(lead) < 0x400:
            return True

    return re.match("[0-9\\[\\]•]", char) is not None


def is_formulas_middle_char(
    char: str,
    font_mapper: FontMapper,
    translation_config: TranslationConfig,
) -> bool:
    """Tell whether ``char`` may continue a formula (start chars plus a comma)."""
    if is_formulas_start_char(char, font_mapper, translation_config):
        return True
    return re.match(",", char) is not None


def collect_page_formula_font_ids(
    page: Page, formular_font_pattern: str | None
) -> tuple[set[int], dict[str, set[int]]]:
    """
    Collect formula font IDs from page fonts and XObject fonts.

    Args:
        page: The Page object to process.
        formular_font_pattern: Regex used to recognise formula fonts by name.

    Returns:
        A tuple of:
        - the font_ids considered formula fonts at page level;
        - a mapping of xobj_id to the formula font_ids valid inside that
          XObject (page-level ids, adjusted by the XObject's own fonts).
    """
    page_formula_font_ids = {
        font.font_id
        for font in page.pdf_font or ()
        if is_formulas_font(font.name, formular_font_pattern)
    }

    xobj_formula_font_ids_map = {}
    for xobj in page.pdf_xobject or ():
        # Every XObject starts from the page-level set.
        xobj_font_ids = set(page_formula_font_ids)
        for font in xobj.pdf_font or ():
            if is_formulas_font(font.name, formular_font_pattern):
                xobj_font_ids.add(font.font_id)
            else:
                # A same-id font that is not a formula font overrides the page.
                xobj_font_ids.discard(font.font_id)
        xobj_formula_font_ids_map[xobj.xobj_id] = xobj_font_ids

    return page_formula_font_ids, xobj_formula_font_ids_map
@functools.cache
def is_formulas_font(font_name: str, formular_font_pattern: str | None) -> bool:
    """Decide from its name whether a font is a formula (math) font.

    Any subset prefix (``ABCDEF+``) is stripped first. The name is then
    checked against three patterns in order:

    1. a precise list of math fonts            -> formula font;
    2. a list of known text fonts              -> not a formula font;
    3. a broad pattern (``formular_font_pattern`` when given, otherwise the
       built-in one)                           -> formula font.

    Names starting with ``BASE64:`` carry base64-encoded raw bytes and are
    matched as bytes.

    Args:
        font_name: Font name, optionally ``BASE64:``-prefixed.
        formular_font_pattern: Optional replacement for the broad pattern.

    Returns:
        True when the font is considered a formula font.
    """
    pattern_text = (
        r"^("
        r"|BLKFort.*"
        r"|Cambria.*"
        r"|EUAlbertina.*"
        r"|NimbusRomNo9L.*"
        r"|GlosaMath.*"
        r"|URWPalladioL.*"
        r"|CMSS.+"
        r"|Arial.*"
        r"|TimesNewRoman.*"
        r"|SegoeUI.*"
        r"|CMTT9.*"
        r"|CMSL10.*"
        r"|CMTI10.*"
        r"|CMTT10.*"
        r"|CMTI12.*"
        r"|CMR12.*"
        r"|MeridienLTStd.*"
        r"|Calibri.*"
        r"|STIXMathJax_Main.*"
        r"|.*NewBaskerville.*"
        r"|.*FranklinGothic.*"
        r"|.*AGaramondPro.*"
        r"|.*PalatinoItalCOR.*"
        r"|.*ITCSymbolStd.*"
        r"|.*PlantinStd.*"
        r"|.*DJ5EscrowCond.*"
        r"|.*ExchangeBook.*"
        r"|.*DJ5Exchange.*"
        r"|.*Times.*"
        r"|.*PalatinoLTStd.*"
        r"|.*Times New Roman,Italic.*"
        r"|.*EhrhardtMT.*"
        r"|.*GillSansMTStd.*"
        r"|.*MedicineSymbols3.*"
        r"|.*HardingText.*"
        r"|.*GraphikNaturel.*"
        r"|.*HelveticaNeue.*"
        r"|.*GoudyOldStyleT.*"
        r"|.*Symbol.*"
        r"|.*ScalaSansLF.*"
        r"|.*ScalaLF.*"
        r"|.*ScalaSansPro.*"
        r"|.*PetersburgC.*"
        r"|.*ColiseumC.*"
        r"|.*Gantari.*"
        r"|.*OptimaLTStd.*"
        r"|.*CronosPro.*"
        r"|.*ACaslon.*"
        r"|.*Frutiger.*"
        r"|.*BrandonGrotesque.*"
        r"|.*FairfieldLH.*"
        r"|.*CaeciliaLTStd.*"
        r"|.*Whitney.*"
        r"|.*Mercury.*"
        r"|.*SabonLTStd.*"
        r"|.*AnonymousPro.*"
        r"|.*SabonLTPro.*"
        r"|.*ArnoPro.*"
        r"|.*CharisSIL.*"
        r"|.*MSReference.*"
        r"|.*CMUSerif-Roman.*"
        r"|.*CourierNewPS.*"
        r"|.*XCharter.*"
        r"|.*GillSans.*"
        r"|.*Perpetua.*"
        r"|.*GEInspira.*"
        r"|.*AGaramond.*"
        r"|.*BMath.*"
        r"|.*MSTT.*"
        r"|.*Bookinsanity.*"
        r"|.*ScalySans.*"
        r"|.*Code2000.*"
        r"|.*Minion.*"
        r"|.*JansonTextLT.*"
        r"|.*MathPack.*"
        r"|.*Macmillan.*"
        r"|.*NimbusSan.*"
        r"|.*Mincho.*"
        r"|.*Amerigo.*"
        r"|.*MSGloriolaIIStd.*"
        r"|.*CMU.+"
        r"|.*LinLibertine.*"
        r"|.*txsys.*"
        r")$"
    )
    precise_formula_font_pattern = (
        r"^("
        # r"|.*CambriaMath.*"
        # r"|.*Cambria Math.*"
        r"|.*Asana.*"
        r"|.*MiriamMonoCLM-BookOblique.*"
        r"|.*Miriam Mono CLM.*"
        r"|.*Logix.*"
        r"|.*AeBonum.*"
        r"|.*AeMRoman.*"
        r"|.*AePagella.*"
        r"|.*AeSchola.*"
        r"|.*Concrete.*"
        r"|.*LatinModernMathCompanion.*"
        r"|.*Latin Modern Math Companion.*"
        r"|.*RalphSmithsFormalScriptCompanion.*"
        r"|.*Ralph Smiths Formal Script Companion.*"
        r"|.*TeXGyreBonumMathCompanion.*"
        r"|.*TeX Gyre Bonum Companion.*"
        r"|.*TeXGyrePagellaMathCompanion.*"
        r"|.*TeX Gyre Pagella Math Companion.*"
        r"|.*TeXGyreTermesMathCompanion.*"
        r"|.*TeX Gyre Termes Math Companion.*"
        r"|.*XITSMathCompanion.*"
        r"|.*XITS Math Companion.*"
        r"|.*Erewhon.*"
        r"|.*Euler-Math.*"
        r"|.*Euler Math.*"
        r"|.*FiraMath-Regular.*"
        r"|.*Fira Math.*"
        r"|.*Garamond-Math.*"
        r"|.*GFSNeohellenicMath.*"
        r"|.*KpMath.*"
        r"|.*Lete Sans Math.*"
        r"|.*LeteSansMath.*"
        # r"|.*LinLibertineO.*"
        r"|.*Linux Libertine O.*"
        r"|.*LibertinusMath-Regular.*"
        r"|.*Libertinus Math.*"
        r"|.*LatinModernMath-Regular.*"
        r"|.*Latin Modern Math.*"
        r"|.*Luciole.*"
        r"|.*NewCM.*"
        r"|.*NewComputerModern.*"
        r"|.*OldStandard-Math.*"
        r"|.*STIXMath-Regular.*"
        r"|.*STIX Math.*"
        r"|.*STIXTwoMath-Regular.*"
        r"|.*STIX Two Math.*"
        r"|.*TeXGyreBonumMath.*"
        r"|.*TeX Gyre Bonum Math.*"
        r"|.*TeXGyreDejaVuMath.*"
        r"|.*TeX Gyre DejaVu Math.*"
        r"|.*TeXGyrePagellaMath.*"
        r"|.*TeX Gyre Pagella Math.*"
        r"|.*TeXGyreScholaMath.*"
        r"|.*TeX Gyre Schola Math.*"
        r"|.*TeXGyreTermesMath.*"
        r"|.*TeX Gyre Termes Math.*"
        r"|.*XCharter-Math.*"
        r"|.*XCharter Math.*"
        r"|.*XITSMath-Bold.*"
        r"|.*XITS Math.*"
        r"|.*XITSMath.*"
        r"|.*IBMPlexMath.*"
        r"|.*IBM Plex Math.*"
        r")$"
    )

    if formular_font_pattern:
        broad_formula_font_pattern = formular_font_pattern
    else:
        broad_formula_font_pattern = (
            r"(CM[^RB]"
            r"|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]"
            r"|LINE"
            r"|LCIRCLE"
            r"|TeX-"
            r"|rsfs"
            r"|txsy"
            r"|wasy"
            r"|stmary"
            r"|.*Mono"
            r"|.*Code"
            # r"|.*Ital"
            r"|.*Sym"
            r"|.*Math"
            r"|AdvP4C4E74"
            r"|AdvPSSym"
            r"|AdvP4C4E59"
            r")"
        )

    if font_name.startswith("BASE64:"):
        font_name_bytes = base64.b64decode(font_name[7:])
        font = font_name_bytes.split(b"+")[-1]
        # ``re`` refuses to mix str patterns with bytes input, so all three
        # patterns are encoded. The original left the precise pattern as str
        # and raised TypeError for every BASE64 name.
        precise_formula_font_pattern = precise_formula_font_pattern.encode()
        pattern_text = pattern_text.encode()
        broad_formula_font_pattern = broad_formula_font_pattern.encode()
    else:
        font = font_name.split("+")[-1]

    if not font:
        return False

    if re.match(precise_formula_font_pattern, font):
        return True
    if re.match(pattern_text, font):
        return False
    if re.match(broad_formula_font_pattern, font):
        return True

    return False


def update_formula_data(formula: PdfFormula):
    """Recompute a formula's bounding box and default its offsets to 0.

    The horizontal extent covers every character. The vertical extent skips
    characters flagged by ``formular_height_ignore_char`` unless all
    characters are flagged, in which case every character is used.

    Raises:
        ValueError: If ``formula.pdf_character`` is empty.
    """
    chars = formula.pdf_character
    min_x = min(char.visual_bbox.box.x for char in chars)
    max_x = max(char.visual_bbox.box.x2 for char in chars)

    # Evaluate the predicate once per character.
    height_chars = [
        char for char in chars if not formular_height_ignore_char(char)
    ] or chars
    min_y = min(char.visual_bbox.box.y for char in height_chars)
    max_y = max(char.visual_bbox.box.y2 for char in height_chars)

    formula.box = Box(min_x, min_y, max_x, max_y)
    if not formula.y_offset:
        formula.y_offset = 0
    if not formula.x_offset:
        formula.x_offset = 0
    if not formula.x_advance:
        formula.x_advance = 0
"(cid:105)", # # arXiv:2411.19509v2 第四页 公式的 | 竖线 # "(cid:13)", # "∑︁", # # arXiv:2412.05265 27 页 累加号 # "(cid:88)", # # arXiv:2412.05265 16 页 累乘号 # "(cid:89)", # # arXiv:2412.05265 27 页 积分 # "(cid:90)", # # arXiv:2412.05265 32 页 公式左右的中括号 # "(cid:2)", # "(cid:3)", # "·", # "√", # ) # 由于我们有一套 bbox 解析机制了,所以现在不需要这个东西了。 HEIGHT_NOT_USFUL_CHAR_IN_CHAR = (None,) LEFT_BRACKET = ("(cid:8)", "(", "(cid:16)", "{", "[", "(cid:104)", "(cid:2)") RIGHT_BRACKET = ("(cid:9)", ")", "(cid:17)", "}", "]", "(cid:105)", "(cid:3)") BULLET_POINT_PATTERN = re.compile( r"[■•⚫⬤◆◇○●◦‣⁃▪▫∗†‡¹²³⁴⁵⁶⁷⁸⁹⁰₁₂₃₄₅₆₇₈₉₀ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻ¶※⁑⁂⁕⁎⁜❧☙⁋‖‽·]" ) def is_bullet_point(char: PdfCharacter) -> bool: """Check if the character is a bullet point. Args: char: The character to check Returns: bool: True if the character is a bullet point """ is_bullet = bool(BULLET_POINT_PATTERN.match(char.char_unicode)) return is_bullet def calculate_box_iou(box1: Box, box2: Box) -> float: """Calculate the Intersection over Union (IOU) between two boxes. 
class Layout:
    """Minimal (id, name) record for the layout region assigned to a character."""

    def __init__(self, layout_id, name):
        self.id = layout_id
        self.name = name

    @staticmethod
    def is_newline(prev_char: PdfCharacter, curr_char: PdfCharacter) -> bool:
        """Report whether ``curr_char`` starts a new line relative to ``prev_char``.

        A new line is detected when the current character lies entirely below
        the previous one, or when it jumps back to the left by more than ten
        times the wider of the two characters. The result is forced to False
        when either character is flagged by ``formular_height_ignore_char``.

        Args:
            prev_char: The preceding character, or None when there is none.
            curr_char: The character being examined.

        Returns:
            True if a line break is detected between the two characters.
        """
        # Without a predecessor there is nothing to break from.
        if prev_char is None:
            return False

        prev_box = prev_char.box
        curr_box = curr_char.box
        char_width = max(curr_box.x2 - curr_box.x, prev_box.x2 - prev_box.x)

        should_new_line = (
            curr_box.y2 < prev_box.y
            or curr_box.x2 < prev_box.x - char_width * 10
        )

        # Only consult the ignore list when a break was detected.
        if should_new_line and (
            formular_height_ignore_char(curr_char)
            or formular_height_ignore_char(prev_char)
        ):
            return False

        return should_new_line
def get_paragraph_unicode(paragraph: PdfParagraph) -> str:
    """Return the paragraph's text, with spaces inferred from glyph spacing.

    Characters (or plain unicode strings) are gathered from every composition
    of the paragraph in order and handed to ``get_char_unicode_string``.
    Compositions of an unrecognised kind are logged and skipped.

    Args:
        paragraph: The paragraph whose text is wanted.

    Returns:
        The normalised unicode string for the paragraph.
    """
    collected = []
    for composition in paragraph.pdf_paragraph_composition:
        if composition.pdf_line:
            collected.extend(composition.pdf_line.pdf_character)
        elif composition.pdf_same_style_characters:
            collected.extend(composition.pdf_same_style_characters.pdf_character)
        elif composition.pdf_same_style_unicode_characters:
            collected.extend(composition.pdf_same_style_unicode_characters.unicode)
        elif composition.pdf_formula:
            collected.extend(composition.pdf_formula.pdf_character)
        elif composition.pdf_character:
            collected.append(composition.pdf_character)
        else:
            logger.error(
                f"Unknown composition type. "
                f"Composition: {composition}. "
                f"Paragraph: {paragraph}. ",
            )
    return get_char_unicode_string(collected)


SPACE_REGEX = regex.compile(r"\s+", regex.UNICODE)


def get_char_unicode_string(chars: list[PdfCharacter | str]) -> str:
    """Join characters into a unicode string, inserting spaces at wide gaps.

    Some PDFs do not encode spaces explicitly, so a space is inserted after a
    character when the horizontal gap to the next one reaches a threshold, or
    when ``Layout.is_newline`` reports a line break between them.

    Args:
        chars: Items to join; each is either a ``PdfCharacter`` or a plain
            string. Plain strings are copied through unchanged.

    Returns:
        The NFKC-normalised string with whitespace runs collapsed to single
        spaces and leading/trailing whitespace removed.
    """
    # Collect the distinct positive gaps (> 1) between adjacent glyphs.
    gaps = set()
    for left, right in zip(chars, chars[1:]):
        if isinstance(left, PdfCharacter) and isinstance(right, PdfCharacter):
            gap = right.box.x - left.box.x2
            if gap > 1:
                gaps.add(gap)

    # Threshold: the second-smallest distinct gap, the only gap if there is
    # just one, or 1 when no gap exceeded 1.
    ordered_gaps = sorted(gaps)
    if not ordered_gaps:
        threshold = 1
    elif len(ordered_gaps) == 1:
        threshold = ordered_gaps[0]
    else:
        threshold = ordered_gaps[1]

    pieces = []
    last_index = len(chars) - 1
    for index, item in enumerate(chars):
        if not isinstance(item, PdfCharacter):
            # Usually a plain string; pass it through as-is.
            pieces.append(item)
            continue

        pieces.append(
            SPACE_REGEX.sub(" ", unicodedata.normalize("NFKC", item.char_unicode))
        )

        # A literal space never needs another space after it.
        if item.char_unicode == " " or index >= last_index:
            continue

        follower = chars[index + 1]
        if not isinstance(follower, PdfCharacter):
            continue

        gap = follower.box.x - item.box.x2
        if gap >= threshold or Layout.is_newline(item, follower):
            pieces.append(" ")

    joined = unicodedata.normalize("NFKC", "".join(pieces))
    return SPACE_REGEX.sub(" ", joined).strip()
def is_same_style(style1, style2) -> bool:
    """Return True when two styles match in font, size and graphic state.

    Font sizes are considered equal when they differ by less than 0.02.
    If either style is None, the styles match only when both are None.
    """
    if style1 is None or style2 is None:
        return style1 is style2
    return (
        style1.font_id == style2.font_id
        and math.fabs(style1.font_size - style2.font_size) < 0.02
        and is_same_graphic_state(style1.graphic_state, style2.graphic_state)
    )


def is_same_style_except_size(style1, style2) -> bool:
    """Return True when two styles match apart from a moderate size change.

    Fonts and graphic states must match; the size ratio
    ``style1.font_size / style2.font_size`` must lie strictly between 0.7
    and 1.3. If either style is None, they match only when both are None.

    A zero ``style2.font_size`` no longer raises ``ZeroDivisionError``: the
    ratio is undefined, so the sizes are accepted only if both are zero.
    """
    if style1 is None or style2 is None:
        return style1 is style2
    if style1.font_id != style2.font_id:
        return False
    if style2.font_size == 0:
        sizes_compatible = style1.font_size == 0
    else:
        sizes_compatible = (
            0.7 < math.fabs(style1.font_size / style2.font_size) < 1.3
        )
    return sizes_compatible and is_same_graphic_state(
        style1.graphic_state, style2.graphic_state
    )


def is_same_style_except_font(style1, style2) -> bool:
    """Return True when two styles match in size and graphic state.

    The font id is deliberately not compared. Font sizes are considered
    equal when they differ by less than 0.02. If either style is None, the
    styles match only when both are None.
    """
    if style1 is None or style2 is None:
        return style1 is style2
    return math.fabs(
        style1.font_size - style2.font_size,
    ) < 0.02 and is_same_graphic_state(style1.graphic_state, style2.graphic_state)
def add_space_dummy_chars(paragraph: PdfParagraph) -> None:
    """Insert dummy space characters into a paragraph, in place.

    Two passes are made over the paragraph's compositions:

    1. Within each line, same-style run or formula, spaces are inserted by
       ``_add_space_dummy_chars_to_list``. Unicode-only runs are skipped.
    2. Between consecutive compositions, a space is appended to the earlier
       one when the horizontal gap to the next one exceeds 1.

    Args:
        paragraph: The paragraph to modify; its character lists are mutated.
    """
    compositions = paragraph.pdf_paragraph_composition

    # Pass 1: spaces inside each composition.
    for comp in compositions:
        if comp.pdf_line:
            holder = comp.pdf_line
        elif comp.pdf_same_style_characters:
            holder = comp.pdf_same_style_characters
        elif comp.pdf_same_style_unicode_characters:
            # Unicode-only runs carry no glyph boxes to measure.
            continue
        elif comp.pdf_formula:
            holder = comp.pdf_formula
        else:
            continue
        _add_space_dummy_chars_to_list(holder.pdf_character)

    # Pass 2: spaces between neighbouring compositions.
    for curr_comp, next_comp in zip(compositions, compositions[1:]):
        tail_char = _get_last_char_from_composition(curr_comp)
        if not tail_char:
            continue

        head_char = _get_first_char_from_composition(next_comp)
        if not head_char:
            continue

        gap = head_char.box.x - tail_char.box.x2
        if not gap > 1:  # only forward gaps wider than 1 get a space
            continue

        # The dummy space fills the gap and borrows the tail glyph's style.
        space_box = Box(
            x=tail_char.box.x2,
            y=tail_char.box.y,
            x2=tail_char.box.x2 + gap,
            y2=tail_char.box.y2,
        )
        space_char = PdfCharacter(
            pdf_style=tail_char.pdf_style,
            box=space_box,
            char_unicode=" ",
            scale=tail_char.scale,
            advance=space_box.x2 - space_box.x,
            visual_bbox=il_version_1.VisualBbox(box=space_box),
        )

        # Append to whichever character list the earlier composition owns.
        if curr_comp.pdf_line:
            curr_comp.pdf_line.pdf_character.append(space_char)
        elif curr_comp.pdf_same_style_characters:
            curr_comp.pdf_same_style_characters.pdf_character.append(space_char)
        elif curr_comp.pdf_formula:
            curr_comp.pdf_formula.pdf_character.append(space_char)
def _get_last_char_from_composition(
    comp: PdfParagraphComposition,
) -> PdfCharacter | None:
    """Return the last character held by a composition, or None.

    The line, same-style run and formula are tried in that order; the first
    one that exists and has a non-empty character list supplies its final
    element. Otherwise ``comp.pdf_character`` is returned as-is when truthy.
    """
    for attr_name in ("pdf_line", "pdf_same_style_characters", "pdf_formula"):
        holder = getattr(comp, attr_name)
        if holder and holder.pdf_character:
            return holder.pdf_character[-1]
    if comp.pdf_character:
        return comp.pdf_character
    return None
def calculate_iou_for_boxes(box1: Box, box2: Box) -> float:
    """Return the fraction of ``box1``'s area that is covered by ``box2``.

    Despite the name this is not a symmetric IoU: the intersection area is
    divided by the area of the first box only.

    Args:
        box1: The box whose coverage is measured.
        box2: The box it is intersected with.

    Returns:
        Intersection area divided by ``box1``'s area, or 0.0 when the boxes
        do not overlap or ``box1`` has no positive area.
    """
    left, right = max(box1.x, box2.x), min(box1.x2, box2.x2)
    low, high = max(box1.y, box2.y), min(box1.y2, box2.y2)

    # Touching or disjoint boxes share no area.
    if right <= left or high <= low:
        return 0.0

    overlap_area = (right - left) * (high - low)
    reference_area = (box1.x2 - box1.x) * (box1.y2 - box1.y)

    # Guard the division against a degenerate first box.
    if reference_area <= 0:
        return 0.0
    return overlap_area / reference_area
def calculate_y_true_iou_for_boxes(box1: Box, box2: Box) -> float:
    """Return the vertical overlap relative to the shorter of the two boxes.

    Only the y extents are considered. The overlap height is divided by the
    smaller of the two box heights, so a box vertically contained in the
    other yields 1.0 regardless of argument order.

    Args:
        box1: First box.
        box2: Second box.

    Returns:
        Overlap height divided by the smaller box height, or 0.0 when the
        boxes do not overlap vertically.
    """
    y_bottom = max(box1.y, box2.y)
    y_top = min(box1.y2, box2.y2)

    if y_top <= y_bottom:
        return 0.0

    min_height = min(box1.y2 - box1.y, box2.y2 - box2.y)

    # Guard the value that is actually used as the divisor.
    if min_height <= 0:
        return 0.0

    return (y_top - y_bottom) / min_height
"table_cell", "figure_text", "list_item", "title", "caption", "footnote_hybrid", "footnote", "formula", "formula_hybrid", "page_header", "page_footer", # --- hybrid labels --- "reference_hybrid", "document_hybrid", "academic_paper_hybrid", "form_or_table_hybrid", "presentation_slide_hybrid", "webpage_screenshot_hybrid", "manga_or_comic_hybrid", "advertisement_hybrid", "magazine_or_newspaper_hybrid", "other_hybrid", "table_cell_hybrid", "figure_text_hybrid", "title_hybrid", "caption_hybrid", "code_algo_hybrid", "line_number_hybrid", "page_header_hybrid", "page_footer_hybrid", "page_number_hybrid", "unknown_hybrid", "fallback_line", "table", "figure", "image", ] char_box = char.visual_bbox.box # char_box2 = char.box # if bbox_mode == "auto": # # Calculate IOU to decide which box to use # intersection_area = max( # 0, min(char_box.x2, char_box2.x2) - max(char_box.x, char_box2.x) # ) * max(0, min(char_box.y2, char_box2.y2) - max(char_box.y, char_box2.y)) # char_box_area = (char_box.x2 - char_box.x) * (char_box.y2 - char_box.y) # # if char_box_area > 0: # iou = intersection_area / char_box_area # if iou < 0.2: # char_box = char_box2 # elif bbox_mode == "box": # char_box = char_box2 # Collect all intersecting layouts and their IoU values matching_layouts = [] candidate_ids = list(layout_index.intersection(box_to_tuple(char_box))) candidate_layouts = [layout_map[i] for i in candidate_ids] for layout in candidate_layouts: # Calculate IoU intersection_area = max( 0, min(char_box.x2, layout.box.x2) - max(char_box.x, layout.box.x) ) * max(0, min(char_box.y2, layout.box.y2) - max(char_box.y, layout.box.y)) char_area = (char_box.x2 - char_box.x) * (char_box.y2 - char_box.y) if char_area > 0: iou = intersection_area / char_area if iou > 0: matching_layouts.append( { "layout": Layout(layout.id, layout.class_name), "priority": ( layout_priority.index(layout.class_name) if layout.class_name in layout_priority else len(layout_priority) ), "iou": iou, } ) if not matching_layouts: 
# Layout class names treated as text. Built once at import time rather than on
# every call; "reference" is intentionally absent.
_TEXT_LAYOUT_NAMES = frozenset(
    {
        "plain text",
        "tiny text",
        "title",
        "abandon",
        "figure_caption",
        "table_caption",
        "table_text",
        "table_footnote",
        "paragraph_title",
        "abstract",
        "content",
        "figure_title",
        "table_title",
        "doc_title",
        "footnote",
        "header",
        "footer",
        "seal",
        "text",
        "chart_title",
        "paragraph",
        "table_cell",
        "figure_text",
        "list_item",
        "caption",
        "page_header",
        "page_footer",
        "wired_table_cell",
        "wireless_table_cell",
        "paragraph_hybrid",
        "table_cell_hybrid",
        "caption_hybrid",
        "unknown_hybrid",
        "figure_text_hybrid",
        "list_item_hybrid",
        "title_hybrid",
        "fallback_line",
        "author_info_hybrid",
        "page_header_hybrid",
        "page_footer_hybrid",
        "footnote_hybrid",
    }
)


def is_text_layout(layout: Layout):
    """Check if a layout is a text layout.

    Args:
        layout: The layout to test, or None.

    Returns:
        True when ``layout`` is not None and its ``name`` is one of the
        recognised text layout class names, False otherwise.
    """
    return layout is not None and layout.name in _TEXT_LAYOUT_NAMES
def is_curve_in_figure_table_layout(
    curve, layout_index, layout_map, protection_threshold: float = 0.3
) -> bool:
    """Tell whether a curve sits inside a figure- or table-related layout.

    Args:
        curve: The curve to test; a curve without a box is never protected.
        layout_index: Spatial index queried via ``intersection`` with the
            curve's box tuple.
        layout_map: Mapping from the ids returned by the index to layouts.
        protection_threshold: Minimum share of the curve's box that must be
            covered by the layout (strictly greater than this value).

    Returns:
        True if any intersecting figure/table layout covers enough of the
        curve, False otherwise.
    """
    if not curve.box:
        return False

    protected_names = {
        "figure",
        "table",
        "figure_text",
        "table_text",
        "figure_caption",
        "table_caption",
        "figure_title",
        "table_title",
        "chart_title",
        "table_cell",
        "table_cell_hybrid",
        "wired_table_cell",
        "wireless_table_cell",
        "table_footnote",
    }

    # Resolve every hit up front, then look for the first sufficient overlap.
    hit_ids = list(layout_index.intersection(box_to_tuple(curve.box)))
    hit_layouts = [layout_map[hit_id] for hit_id in hit_ids]
    return any(
        layout.class_name in protected_names
        and calculate_iou_for_boxes(curve.box, layout.box) > protection_threshold
        for layout in hit_layouts
    )
def _composition_box(composition):
    """Return the box representing one composition, or None if it has none."""
    if composition.pdf_line and composition.pdf_line.box:
        return composition.pdf_line.box
    if composition.pdf_formula and composition.pdf_formula.box:
        return composition.pdf_formula.box
    if (
        composition.pdf_same_style_characters
        and composition.pdf_same_style_characters.box
    ):
        return composition.pdf_same_style_characters.box
    if composition.pdf_character and len(composition.pdf_character) > 0:
        # No ready-made box: span the visual boxes of the listed characters.
        glyph_boxes = [
            char.visual_bbox.box
            for char in composition.pdf_character
            if char.visual_bbox and char.visual_bbox.box
        ]
        if glyph_boxes:
            return Box(
                min(box.x for box in glyph_boxes),
                min(box.y for box in glyph_boxes),
                max(box.x2 for box in glyph_boxes),
                max(box.y2 for box in glyph_boxes),
            )
    return None


def get_paragraph_bounding_box(paragraph) -> Box | None:
    """Compute the box enclosing all compositions of a paragraph.

    Args:
        paragraph: The paragraph object.

    Returns:
        The smallest ``Box`` containing every composition box, or None when
        the paragraph has no compositions or none of them yields a box.
    """
    if not paragraph.pdf_paragraph_composition:
        return None

    min_x = min_y = float("inf")
    max_x = max_y = float("-inf")
    found_any = False

    for composition in paragraph.pdf_paragraph_composition:
        comp_box = _composition_box(composition)
        if not comp_box:
            continue
        min_x = min(min_x, comp_box.x)
        min_y = min(min_y, comp_box.y)
        max_x = max(max_x, comp_box.x2)
        max_y = max(max_y, comp_box.y2)
        found_any = True

    if not found_any:
        return None
    return Box(min_x, min_y, max_x, max_y)
================================================ FILE: babeldoc/format/pdf/document_il/utils/matrix_helper.py ================================================ """Matrix helper utilities for CTM decomposition and composition. This module provides functions to: - Decompose a PDF CTM into translation, rotation, scale, and shear - Compose a CTM back from translation, rotation, scale, and shear All comments and docstrings are in English per project guidelines. """ from __future__ import annotations import math from babeldoc.format.pdf.document_il.il_version_1 import PdfAffineTransform from babeldoc.format.pdf.document_il.il_version_1 import PdfMatrix # Local type aliases to avoid importing from pdfminer Point = tuple[float, float] Matrix = tuple[float, float, float, float, float, float] def decompose_ctm(m: Matrix | PdfMatrix) -> PdfAffineTransform: """Decompose a PDF CTM into a PdfAffineTransform. The PDF current transformation matrix (CTM) is represented as ``(a, b, c, d, e, f)`` corresponding to the affine matrix: ``[[a, c, e], [b, d, f], [0, 0, 1]]``. This function decomposes it into: - translation: (tx, ty) - rotation: angle in radians (counter-clockwise) - scale: (sx, sy) - shear: x-shear factor (dimensionless, equals tan(shear_angle)) The decomposition is based on a QR-like approach commonly used for 2D affine matrices. If the linear part is degenerate, sensible fallbacks are applied. Args: m: CTM as ``(a, b, c, d, e, f)``. Returns: A ``PdfAffineTransform`` instance with fields populated. """ if isinstance(m, PdfMatrix): a = m.a b = m.b c = m.c d = m.d e = m.e f = m.f assert a is not None assert b is not None assert c is not None assert d is not None assert e is not None assert f is not None else: (a, b, c, d, e, f) = m tx, ty = e, f # Linear part m00, m01 = a, c m10, m11 = b, d # Scale X is the length of the first column sx = math.hypot(m00, m10) eps = 1e-12 if sx < eps: # Degenerate first column. Choose rotation = 0, shear = 0, sx = 0. 
def compose_ctm(transform: PdfAffineTransform) -> Matrix:
    """Build a PDF CTM ``(a, b, c, d, e, f)`` from a ``PdfAffineTransform``.

    With ``r0 = (cos(theta), sin(theta))`` and its orthogonal unit vector
    ``r1 = (-sin(theta), cos(theta))``, the linear part is:

    - first column ``(a, b) = sx * r0``
    - second column ``(c, d) = shear * r0 + sy * r1``

    and the translation is ``(e, f) = (tx, ty)``.

    Args:
        transform: Source of translation, rotation (radians), scale and
            shear. Fields that are None fall back to 0.0, except the two
            scale factors, which fall back to 1.0.

    Returns:
        The CTM matrix ``(a, b, c, d, e, f)``.
    """

    def _value_or(value, fallback):
        # Only None triggers the fallback; 0 is a legitimate value.
        return float(fallback if value is None else value)

    tx = _value_or(transform.translation_x, 0.0)
    ty = _value_or(transform.translation_y, 0.0)
    theta = _value_or(transform.rotation, 0.0)
    sx = _value_or(transform.scale_x, 1.0)
    sy = _value_or(transform.scale_y, 1.0)
    shear = _value_or(transform.shear, 0.0)

    # Rotated unit basis.
    r0 = (math.cos(theta), math.sin(theta))
    r1 = (-r0[1], r0[0])

    first_column = (sx * r0[0], sx * r0[1])
    second_column = (shear * r0[0] + sy * r1[0], shear * r0[1] + sy * r1[1])

    return first_column[0], first_column[1], second_column[0], second_column[1], tx, ty
def create_translation_and_scale_matrix(
    translation_x: float, translation_y: float, scale_factor: float
) -> Matrix:
    """Return the CTM for a uniform scale combined with a translation.

    The result is ``(s, 0, 0, s, tx, ty)``, i.e. the matrix::

        [s 0 tx]
        [0 s ty]
        [0 0 1 ]

    Args:
        translation_x: Value placed in the ``e`` slot.
        translation_y: Value placed in the ``f`` slot.
        scale_factor: Value placed in both the ``a`` and ``d`` slots.

    Returns:
        The CTM tuple ``(a, b, c, d, e, f)``.
    """
    off_diagonal = 0.0
    ctm = (
        scale_factor,
        off_diagonal,
        off_diagonal,
        scale_factor,
        translation_x,
        translation_y,
    )
    return ctm


def multiply_matrices(m1: Matrix | PdfMatrix, m2: Matrix | PdfMatrix) -> Matrix:
    """Compute the affine product ``m1 * m2``.

    Each operand is read as the 3x3 matrix::

        [a c e]
        [b d f]
        [0 0 1]

    so the translation of ``m2`` is pushed through the linear part of ``m1``
    before ``m1``'s own translation is added (``m2`` acts first, then ``m1``).

    Args:
        m1: Left operand, a 6-tuple or a ``PdfMatrix``.
        m2: Right operand, a 6-tuple or a ``PdfMatrix``.

    Returns:
        The product as a plain tuple ``(a, b, c, d, e, f)``; the result is a
        tuple even when the inputs are ``PdfMatrix`` objects.
    """

    def _components(matrix):
        # PdfMatrix fields may be unset; every one of them must be present.
        if isinstance(matrix, PdfMatrix):
            values = (matrix.a, matrix.b, matrix.c, matrix.d, matrix.e, matrix.f)
            assert all(x is not None for x in values)
            return values
        a, b, c, d, e, f = matrix
        return a, b, c, d, e, f

    a1, b1, c1, d1, e1, f1 = _components(m1)
    a2, b2, c2, d2, e2, f2 = _components(m2)

    # Linear part: ordinary 2x2 product of the two linear parts.
    linear = (
        a1 * a2 + c1 * b2,
        b1 * a2 + d1 * b2,
        a1 * c2 + c1 * d2,
        b1 * c2 + d1 * d2,
    )
    # Translation part: m1 applied to m2's translation.
    offset = (
        a1 * e2 + c1 * f2 + e1,
        b1 * e2 + d1 * f2 + f1,
    )
    return (*linear, *offset)
def get_no_rotation_img(page: pymupdf.Page, dpi: int = 72) -> pymupdf.Pixmap:
    """Render ``page`` with its rotation forced to 0 and return the pixmap.

    The page's rotation is set to 0 for the duration of the render and is
    put back to the value it had on entry, whether or not rendering succeeds.

    Args:
        page: Page to render; its ``rotation`` is changed temporarily.
        dpi: Resolution forwarded to ``page.get_pixmap``.

    Returns:
        The result of ``page.get_pixmap(dpi=dpi)``.
    """
    original_rotation = page.rotation
    page.set_rotation(0)
    try:
        return page.get_pixmap(dpi=dpi)
    finally:
        # Restore in ``finally`` so a failed render cannot leave the caller's
        # page stuck at rotation 0.
        page.set_rotation(original_rotation)
def is_cid_paragraph(paragraph: il_version_1.PdfParagraph):
    """Tell whether a paragraph consists mostly of unmapped ``(cid:N)`` glyphs.

    Characters are gathered from each composition's line, same-style
    character run, formula, or single character. Compositions that only hold
    a same-style unicode run contribute nothing, and compositions matching
    none of these are logged as errors and ignored.

    Args:
        paragraph: Paragraph whose ``pdf_paragraph_composition`` is scanned.

    Returns:
        ``True`` when strictly more than 80% of the gathered characters have
        a ``char_unicode`` of the exact form ``(cid:<digits>)``.
    """
    gathered: list[il_version_1.PdfCharacter] = []
    for composition in paragraph.pdf_paragraph_composition:
        if composition.pdf_line:
            gathered += composition.pdf_line.pdf_character
            continue
        if composition.pdf_same_style_characters:
            gathered += composition.pdf_same_style_characters.pdf_character
            continue
        if composition.pdf_same_style_unicode_characters:
            # Unicode runs hold text rather than character objects.
            continue
        if composition.pdf_formula:
            gathered += composition.pdf_formula.pdf_character
            continue
        if composition.pdf_character:
            gathered.append(composition.pdf_character)
            continue
        logger.error(
            f"Unknown composition type. "
            f"Composition: {composition}. "
            f"Paragraph: {paragraph}. ",
        )

    cid_total = sum(
        1 for char in gathered if re.match(r"^\(cid:\d+\)$", char.char_unicode)
    )
    return cid_total > len(gathered) * 0.8


NUMERIC_PATTERN = re.compile(r"^-?\d+(\.\d+)?$")


def is_pure_numeric_paragraph(paragraph) -> bool:
    """Check whether the paragraph text is nothing but a single number.

    Integers, decimals and a leading minus sign are accepted. Surrounding
    whitespace is ignored.

    Args:
        paragraph: Object with a ``unicode`` text attribute (may be ``None``
            or lack the attribute).

    Returns:
        ``True`` only if the stripped text matches ``NUMERIC_PATTERN``.
    """
    if not paragraph:
        return False
    raw = getattr(paragraph, "unicode", None)
    if not raw:
        return False
    stripped = raw.strip()
    return bool(stripped) and NUMERIC_PATTERN.match(stripped) is not None


def is_placeholder_only_paragraph(paragraph: il_version_1.PdfParagraph) -> bool:
    """Check if a paragraph holds nothing but placeholders and whitespace.

    Formula compositions count as placeholders. Every other composition
    (single character, line, same-style characters, same-style unicode run)
    must consist solely of whitespace.

    Args:
        paragraph: PDF paragraph to check.

    Returns:
        ``True`` if every composition is a formula or whitespace-only;
        ``False`` for a missing paragraph, empty ``unicode``, any
        non-whitespace text, or an unrecognised composition.
    """
    if not paragraph or not paragraph.unicode:
        return False

    for composition in paragraph.pdf_paragraph_composition:
        if composition.pdf_formula:
            continue

        # Collect the text fragments this composition contributes.
        if composition.pdf_character:
            fragments = (composition.pdf_character.char_unicode,)
        elif composition.pdf_line:
            fragments = (
                char.char_unicode for char in composition.pdf_line.pdf_character
            )
        elif composition.pdf_same_style_characters:
            fragments = (
                char.char_unicode
                for char in composition.pdf_same_style_characters.pdf_character
            )
        elif composition.pdf_same_style_unicode_characters:
            fragments = (composition.pdf_same_style_unicode_characters.unicode,)
        else:
            # Unknown composition type: be conservative.
            return False

        if not all(fragment.isspace() for fragment in fragments):
            return False

    return True
def is_element_contained_in_formula(
    element_box: Box,
    formula_box: Box,
    containment_threshold: float = 0.95,
    tolerance: float = 2.0,
) -> bool:
    """Decide whether an element's box lies inside a slightly enlarged formula box.

    The formula box is grown by ``tolerance`` on all four sides, and the
    value of ``calculate_iou_for_boxes(element_box, grown_box)`` is compared
    against ``containment_threshold``.

    Args:
        element_box: Bounding box of the element being tested.
        formula_box: Bounding box of the formula.
        containment_threshold: Minimum ratio required to count as contained
            (default: 0.95).
        tolerance: Amount, in box units, by which each formula edge is moved
            outwards (default: 2.0).

    Returns:
        ``True`` when the ratio reaches the threshold; ``False`` when it does
        not or when either box is ``None``.
    """
    if element_box is None:
        return False
    if formula_box is None:
        return False

    # Grow the formula box so elements that poke out by a hair still qualify.
    grown_box = Box(
        x=formula_box.x - tolerance,
        y=formula_box.y - tolerance,
        x2=formula_box.x2 + tolerance,
        y2=formula_box.y2 + tolerance,
    )
    return calculate_iou_for_boxes(element_box, grown_box) >= containment_threshold
def find_contained_forms(
    formula: PdfFormula, page: Page, paragraph_xobj_id: int | None = None
) -> list[PdfForm]:
    """Collect the page's forms whose boxes sit inside the formula's box.

    Args:
        formula: Formula whose ``box`` is the containing region.
        page: Page whose ``pdf_form`` entries are examined.
        paragraph_xobj_id: When not ``None``, a form is kept only if its
            ``xobj_id`` equals this value.

    Returns:
        The matching forms in page order; an empty list when the formula has
        no box or the page has no forms.
    """
    if not formula.box or not page.pdf_form:
        return []

    return [
        form
        for form in page.pdf_form
        if form.box
        and is_element_contained_in_formula(form.box, formula.box)
        # The optional xobj filter is applied after the containment test.
        and (paragraph_xobj_id is None or form.xobj_id == paragraph_xobj_id)
    ]
def calculate_translation_and_scale(
    old_box: Box, new_box: Box
) -> tuple[float, float, float]:
    """Derive the offset and uniform scale that map ``old_box`` onto ``new_box``.

    The offset is the difference between the two boxes' ``(x, y)`` corners.
    The scale is the width ratio; if the old width is not positive the
    height ratio is used, and if that is not positive either the scale is
    ``1.0``.

    Args:
        old_box: The original bounding box.
        new_box: The new bounding box.

    Returns:
        ``(translation_x, translation_y, scale_factor)``; the identity
        ``(0.0, 0.0, 1.0)`` when either box is ``None``.
    """
    if old_box is None or new_box is None:
        return 0.0, 0.0, 1.0

    dx = new_box.x - old_box.x
    dy = new_box.y - old_box.y

    source_width = old_box.x2 - old_box.x
    target_width = new_box.x2 - new_box.x
    if source_width > 0:
        return dx, dy, target_width / source_width

    # Degenerate width: fall back to the height ratio.
    source_height = old_box.y2 - old_box.y
    target_height = new_box.y2 - new_box.y
    if source_height > 0:
        return dx, dy, target_height / source_height

    return dx, dy, 1.0
Args: r: Red component in range 0-255 g: Green component in range 0-255 b: Blue component in range 0-255 font_id: Font identifier font_size: Font size Returns: PdfStyle object with the specified color """ r, g, b = [x / 255.0 for x in (r, g, b)] return il_version_1.PdfStyle( font_id=font_id, font_size=font_size, graphic_state=il_version_1.GraphicState( passthrough_per_char_instruction=f"{r:.10f} {g:.10f} {b:.10f} rg", ), ) BLACK = il_version_1.GraphicState(passthrough_per_char_instruction="0 g 0 G") WHITE = il_version_1.GraphicState(passthrough_per_char_instruction="1 g 1 G") GRAY80 = il_version_1.GraphicState(passthrough_per_char_instruction="0.80 g 0.80 G") GRAY67 = il_version_1.GraphicState(passthrough_per_char_instruction="0.67 g 0.67 G") GRAY33 = il_version_1.GraphicState(passthrough_per_char_instruction="0.33 g 0.33 G") # Generate all color styles RED = il_version_1.GraphicState( passthrough_per_char_instruction="1.0000000000 0.2313725490 0.1882352941 rg " "1.0000000000 0.2313725490 0.1882352941 RG", ) ORANGE = il_version_1.GraphicState( passthrough_per_char_instruction="1.0000000000 0.5843137255 0.0000000000 rg " "1.0000000000 0.5843137255 0.0000000000 RG", ) YELLOW = il_version_1.GraphicState( passthrough_per_char_instruction="1.0000000000 0.8000000000 0.0000000000 rg " "1.0000000000 0.8000000000 0.0000000000 RG", ) GREEN = il_version_1.GraphicState( passthrough_per_char_instruction="0.2039215686 0.7803921569 0.3490196078 rg " "0.2039215686 0.7803921569 0.3490196078 RG", ) MINT = il_version_1.GraphicState( passthrough_per_char_instruction="0.0000000000 0.7803921569 0.7450980392 rg " "0.0000000000 0.7803921569 0.7450980392 RG", ) TEAL = il_version_1.GraphicState( passthrough_per_char_instruction="0.1882352941 0.6901960784 0.7803921569 rg " "0.1882352941 0.6901960784 0.7803921569 RG", ) CYAN = il_version_1.GraphicState( passthrough_per_char_instruction="0.1960784314 0.6784313725 0.9019607843 rg " "0.1960784314 0.6784313725 0.9019607843 RG", ) BLUE = 
def zstd_compress(data) -> str:
    """Compress ``data`` with zstd and return the result as Base85 text.

    Args:
        data: ``str`` (encoded with the default ``str.encode`` codec first)
            or ``bytes``.

    Returns:
        The Base85-encoded compressed payload as a ``str``.

    Raises:
        TypeError: If ``data`` is neither ``str`` nor ``bytes``.
    """
    payload = data.encode() if isinstance(data, str) else data
    if not isinstance(payload, bytes):
        raise TypeError(f"data must be str or bytes, not {type(data)}")
    compressed = pyzstd.compress(payload)
    return base64.b85encode(compressed).decode()


def zstd_decompress(data) -> str:
    """Reverse ``zstd_compress``: Base85-decode, decompress and decode to text.

    Args:
        data: Base85 text as ``str`` or ``bytes``.

    Returns:
        The decompressed payload decoded with the default ``bytes.decode``
        codec.

    Raises:
        TypeError: If ``data`` is neither ``str`` nor ``bytes``.
    """
    encoded = data.encode() if isinstance(data, str) else data
    if not isinstance(encoded, bytes):
        raise TypeError(f"data must be str or bytes, not {type(data)}")
    compressed = base64.b85decode(encoded)
    return pyzstd.decompress(compressed).decode()
self.parser = XmlParser() config = SerializerConfig(indent=" ") context = XmlContext() self.serializer = XmlSerializer(context=context, config=config) def write_xml(self, document: il_version_1.Document, path: str): with Path(path).open("w", encoding="utf-8") as f: f.write(self.to_xml(document)) def read_xml(self, path: str) -> il_version_1.Document: with Path(path).open(encoding="utf-8") as f: return self.from_xml(f.read()) def to_xml(self, document: il_version_1.Document) -> str: return self.serializer.render(document) def from_xml(self, xml: str) -> il_version_1.Document: return self.parser.from_string( xml, il_version_1.Document, ) def deepcopy(self, document: il_version_1.Document) -> il_version_1.Document: return copy.deepcopy(document) # return self.from_xml(self.to_xml(document)) def to_json(self, document: il_version_1.Document) -> str: return orjson.dumps( document, option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS, ).decode() def write_json(self, document: il_version_1.Document, path: str): with Path(path).open("w", encoding="utf-8") as f: f.write(self.to_json(document)) ================================================ FILE: babeldoc/format/pdf/high_level.py ================================================ import asyncio import copy import hashlib import io import logging import pathlib import re import shutil import threading import time from asyncio import CancelledError from pathlib import Path from typing import Any from typing import BinaryIO import pymupdf from pymupdf import Document from pymupdf import Font from babeldoc import asynchronize from babeldoc.assets.assets import warmup from babeldoc.babeldoc_exception.BabelDOCException import ExtractTextError from babeldoc.babeldoc_exception.BabelDOCException import ( InputFileGeneratedByBabelDOCError, ) from babeldoc.const import CACHE_FOLDER from babeldoc.const import WATERMARK_VERSION from babeldoc.const import close_process_pool from babeldoc.format.pdf.converter 
import TranslateConverter from babeldoc.format.pdf.document_il import il_version_1 from babeldoc.format.pdf.document_il.backend.pdf_creater import SAVE_PDF_STAGE_NAME from babeldoc.format.pdf.document_il.backend.pdf_creater import SUBSET_FONT_STAGE_NAME from babeldoc.format.pdf.document_il.backend.pdf_creater import PDFCreater from babeldoc.format.pdf.document_il.backend.pdf_creater import reproduce_cmap from babeldoc.format.pdf.document_il.frontend.il_creater import ILCreater from babeldoc.format.pdf.document_il.midend.add_debug_information import ( AddDebugInformation, ) from babeldoc.format.pdf.document_il.midend.automatic_term_extractor import ( AutomaticTermExtractor, ) from babeldoc.format.pdf.document_il.midend.detect_scanned_file import DetectScannedFile from babeldoc.format.pdf.document_il.midend.il_translator import ILTranslator from babeldoc.format.pdf.document_il.midend.il_translator_llm_only import ( ILTranslatorLLMOnly, ) from babeldoc.format.pdf.document_il.midend.layout_parser import LayoutParser from babeldoc.format.pdf.document_il.midend.paragraph_finder import ParagraphFinder from babeldoc.format.pdf.document_il.midend.styles_and_formulas import StylesAndFormulas from babeldoc.format.pdf.document_il.midend.table_parser import TableParser from babeldoc.format.pdf.document_il.midend.typesetting import Typesetting from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper from babeldoc.format.pdf.document_il.xml_converter import XMLConverter from babeldoc.format.pdf.pdfinterp import PDFPageInterpreterEx from babeldoc.format.pdf.result_merger import ResultMerger from babeldoc.format.pdf.split_manager import SplitManager from babeldoc.format.pdf.translation_config import TranslateResult from babeldoc.format.pdf.translation_config import TranslationConfig from babeldoc.format.pdf.translation_config import WatermarkOutputMode from babeldoc.pdfminer.pdfdocument import PDFDocument from babeldoc.pdfminer.pdfinterp import PDFResourceManager from 
def safe_save(doc, *args, **kwargs):
    """Save ``doc``, retrying with ``doc.ez_save`` if the plain save fails.

    Args:
        doc: Document object exposing ``save`` and ``ez_save``.
        *args: Positional arguments forwarded unchanged to both calls.
        **kwargs: Keyword arguments forwarded unchanged to both calls.

    Raises:
        Exception: Whatever ``doc.ez_save`` raises if the retry fails too.
    """
    try:
        # First try: save exactly as requested.
        doc.save(*args, **kwargs)
    except Exception as exc:
        # Second try: ez_save (saving with 'garbage=3' for object missing).
        # Record why the first attempt failed instead of discarding the
        # error, so a degraded save is visible in the logs.
        logging.getLogger(__name__).warning(
            "doc.save failed (%s); retrying with doc.ez_save", exc
        )
        doc.ez_save(*args, **kwargs)
def add_metadata(
    translate_result: TranslateResult, translate_config: TranslationConfig
):
    """Stamp each distinct output PDF with BabelDOC producer metadata.

    For every output path on ``translate_result`` (each path is handled
    once even if several attributes share it) the file is opened, its
    metadata rewritten, saved to a working file and moved back over the
    original path.

    The previous ``producer`` is folded into ``creator``; ``producer``
    becomes a BabelDOC marker containing ``WATERMARK_VERSION``, the current
    timestamp and, when set, ``translate_config.metadata_extra_data``.
    Surrogate code points are stripped from every non-empty metadata value.

    Args:
        translate_result: Holder of the four output path attributes.
        translate_config: Supplies the working-file location and the
            optional ``metadata_extra_data`` suffix.
    """
    processed = []
    for attr in (
        "mono_pdf_path",
        "dual_pdf_path",
        "no_watermark_mono_pdf_path",
        "no_watermark_dual_pdf_path",
    ):
        path = getattr(translate_result, attr)
        if not path or path in processed:
            continue
        processed.append(path)

        temp_path = translate_config.get_working_file_path(f"{path.stem}.cmap.pdf")
        pdf = pymupdf.open(path)
        try:
            meta = pdf.metadata or {}
            creator = meta.get("creator", None)
            producer = meta.get("producer", None)
            if producer:
                if not creator:
                    creator = producer
                else:
                    creator += f", {producer}"
            translated_by = f"BabelDOC{WATERMARK_VERSION}_{time.time()}_Translation_generated_by_AI,please_carefully_discern"
            if translate_config.metadata_extra_data:
                translated_by += f"_{translate_config.metadata_extra_data}"
            meta["producer"] = translated_by
            meta["creator"] = creator
            for k, v in meta.items():
                if v:
                    # Strip characters in the surrogate range.
                    meta[k] = re.sub(r"[\uD800-\uDFFF]", "", v)
            pdf.set_metadata(meta)
            safe_save(pdf, temp_path)
        finally:
            # Close the document before replacing the file it was opened
            # from, and so it is not left open if anything above raises.
            pdf.close()
        shutil.move(temp_path, path)


def fix_cmap(translate_result: TranslateResult, translate_config: TranslationConfig):
    """Run ``reproduce_cmap`` on each distinct output PDF and save it in place.

    Each output path on ``translate_result`` is processed once: the file is
    opened, passed to ``reproduce_cmap``, saved to a working file and moved
    back over the original path.

    Args:
        translate_result: Holder of the four output path attributes.
        translate_config: Supplies the working-file location.
    """
    processed = []
    for attr in (
        "mono_pdf_path",
        "dual_pdf_path",
        "no_watermark_mono_pdf_path",
        "no_watermark_dual_pdf_path",
    ):
        path = getattr(translate_result, attr)
        if not path or path in processed:
            continue
        processed.append(path)

        temp_path = translate_config.get_working_file_path(f"{path.stem}.cmap.pdf")
        pdf = pymupdf.open(path)
        try:
            reproduce_cmap(pdf)
            safe_save(pdf, temp_path)
        finally:
            # Close the document before replacing the file it was opened from.
            pdf.close()
        shutil.move(temp_path, path)
translator or not hasattr(translator, "do_llm_translate"): return False try: translator.do_llm_translate(None) return True except NotImplementedError: return False except Exception as exc: # pragma: no cover - defensive logging logger.debug("translator %s failed llm detection: %s", translator, exc) return False def start_parse_il( inf: BinaryIO, pages: list[int] | None = None, vfont: str = "", vchar: str = "", thread: int = 0, doc_zh: Document = None, lang_in: str = "", lang_out: str = "", service: str = "", resfont: str = "", noto: Font = None, cancellation_event: asyncio.Event = None, il_creater: ILCreater = None, translation_config: TranslationConfig = None, **kwarg: Any, ) -> None: rsrcmgr = PDFResourceManager() layout = {} device = TranslateConverter( rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service, resfont, noto, kwarg.get("envs", {}), kwarg.get("prompt", []), il_creater=il_creater, ) # model = DocLayoutModel.load_available() assert device is not None assert il_creater is not None assert translation_config is not None obj_patch = {} interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch, il_creater) if pages: total_pages = len(pages) else: total_pages = doc_zh.page_count il_creater.on_total_pages(total_pages) parser = PDFParser(inf) doc = PDFDocument(parser) for pageno, page in enumerate(PDFPage.create_pages(doc)): if cancellation_event and cancellation_event.is_set(): raise CancelledError("task cancelled") if pages and (pageno not in pages): continue page.pageno = pageno if not translation_config.should_translate_page(pageno + 1): continue height, width = ( page.cropbox[3] - page.cropbox[1], page.cropbox[2] - page.cropbox[0], ) if height > 1200 or width > 2000: logger.warning(f"page {pageno + 1} is too large, maybe unable to translate") # continue translation_config.raise_if_cancelled() # The current program no longer relies on # the following layout recognition results, # but in order to facilitate the migration of pdf2zh, # the 
def get_translation_stage(
    translation_config: TranslationConfig,
) -> list[tuple[str, float]]:
    """Return the ``TRANSLATE_STAGES`` entries that apply to this configuration.

    With ``only_parse_generate_pdf`` set, every analysis, translation and
    typesetting stage is dropped. Otherwise individual stages are dropped
    according to ``table_model``, ``skip_scanned_detection``,
    ``auto_extract_glossary`` and ``skip_translation``.

    Args:
        translation_config: Configuration whose flags select the stages.

    Returns:
        A new list of ``(stage_name, weight)`` tuples in their original order.
    """
    if translation_config.only_parse_generate_pdf:
        # Parse-and-regenerate mode: nothing between IL creation and PDF
        # output is run.
        excluded = [
            DetectScannedFile.stage_name,
            LayoutParser.stage_name,
            TableParser.stage_name,
            ParagraphFinder.stage_name,
            StylesAndFormulas.stage_name,
            AutomaticTermExtractor.stage_name,
            ILTranslator.stage_name,
            Typesetting.stage_name,
        ]
    else:
        # Each pair is (drop this stage?, stage name).
        optional_stages = (
            (not translation_config.table_model, TableParser.stage_name),
            (translation_config.skip_scanned_detection, DetectScannedFile.stage_name),
            (
                not translation_config.auto_extract_glossary,
                AutomaticTermExtractor.stage_name,
            ),
            (translation_config.skip_translation, ILTranslator.stage_name),
        )
        excluded = [name for drop, name in optional_stages if drop]

    return [
        stage for stage in copy.deepcopy(TRANSLATE_STAGES) if stage[0] not in excluded
    ]
class MemoryMonitor:
    """Monitor memory usage of current process and all child processes.

    Used as a context manager: entering starts a daemon thread that samples
    memory periodically; leaving stops the thread and logs the highest
    sample seen.
    """

    def __init__(self, interval=0.1):
        """Initialize memory monitor.

        Args:
            interval: Monitoring interval in seconds, defaults to 0.1s (100ms)
        """
        self.interval = interval
        # Highest sample so far, stored as the raw reading / (1024 * 1024).
        self.peak_memory_usage = 0
        self.monitor_thread = None
        self.stop_event = None
        self.last_pss_check_time = None

    def __enter__(self):
        """Start memory monitoring."""
        self.stop_event = threading.Event()
        self.monitor_thread = threading.Thread(
            target=self._monitor_memory_usage, daemon=True
        )
        self.monitor_thread.start()
        logger.debug("Memory monitoring started")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop monitoring and log peak memory usage."""
        if not self.monitor_thread:
            return
        self.stop_event.set()
        self.monitor_thread.join(timeout=2.0)
        logger.info(f"Peak memory usage: {self.peak_memory_usage:.2f} MB")

    def _monitor_memory_usage(self):
        """Background thread that periodically checks memory usage."""
        while not self.stop_event.is_set():
            try:
                # Use throttled memory check with 2-second PSS throttle
                total_memory, self.last_pss_check_time = (
                    memory.get_memory_usage_with_throttle(
                        include_children=True,
                        prefer_pss=True,
                        last_pss_check_time=self.last_pss_check_time,
                        pss_throttle_seconds=2.0,
                    )
                )
                # Convert to MB for better readability
                total_memory_mb = total_memory / (1024 * 1024)
                if total_memory_mb > self.peak_memory_usage:
                    self.peak_memory_usage = total_memory_mb
            except Exception as e:
                logger.warning(f"Error monitoring memory: {e}")
            # Wait on the stop event rather than sleeping, so __exit__ wakes
            # the thread immediately instead of waiting out a full interval
            # (which could exceed the 2 s join timeout for large intervals).
            self.stop_event.wait(self.interval)

    def get_peek_memory_psutil(self):
        """Return a fresh reading from ``memory.get_memory_usage_bytes``.

        Kept for backwards compatibility. Despite the name, this does not
        return ``peak_memory_usage``; it queries the memory helper at call
        time (children included, PSS preferred).
        """
        return memory.get_memory_usage_bytes(include_children=True, prefer_pss=True)
def fix_filter(doc) -> None:
    """Rewrite page content streams of ``doc`` in place.

    Two passes are executed:

    1. Every content-stream xref of every page is inspected.  When the type
       reported by ``xref_get_key`` for its ``Filter`` key is ``"xref"``
       the stream is read with ``xref_stream`` and written back with
       ``update_stream``.
    2. Every page that has more than one content stream gets one new stream
       object holding all of its streams joined by a single space, and the
       page's ``Contents`` key is pointed at that new object.

    Everything after the bare ``return`` is unreachable: it is a parked
    implementation that would additionally bake a page's ``Rotate`` value
    into the content stream.

    Args:
        doc: Document object providing the xref API used below
            (presumably a PyMuPDF ``Document`` - confirm against callers).
    """
    # Collect the content xrefs of all pages up front, before any stream is
    # rewritten.
    page_contents = []
    for page in doc:
        page_contents.extend(page.get_contents())

    for page_piece in page_contents:
        f = doc.xref_get_key(page_piece, "Filter")
        # f is a (type, value) pair; only streams whose Filter entry has
        # type "xref" are rewritten.
        # NOTE(review): presumably this replaces an indirectly referenced
        # filter with whatever update_stream writes - confirm.
        if f[0] == "xref":
            data = doc.xref_stream(page_piece)
            doc.update_stream(page_piece, data)
    for page in doc:
        contents = page.get_contents()
        if len(contents) > 1:
            page_streams = [doc.xref_stream(i) for i in contents]
            r = doc.get_new_xref()
            # The new xref starts as an empty dictionary object and then
            # receives the concatenated stream data.
            doc.update_object(r, "<<>>")
            doc.update_stream(r, b" ".join(page_streams))
            doc.xref_set_key(page.xref, "Contents", f"{r} 0 R")
    return  # skip rotate for now
    # ---- unreachable from here on (parked rotate handling) ----
    for page in doc:
        contents = page.get_contents()
        t, v = doc.xref_get_key(page.xref, "Rotate")
        # Negated Rotate value when the key is an int, otherwise 0.
        rotate = -int(v) if t == "int" else 0
        if len(contents) > 1 or rotate:
            page_streams = [doc.xref_stream(i) for i in contents]
            r = doc.get_new_xref()
            page_prefix = b""
            page_suffix = b""
            if rotate:
                m0 = pymupdf.Matrix(rotate)
                b0 = page.mediabox * m0
                m1 = m0 * pymupdf.Matrix(1, 0, 0, 1, b0.x0, -b0.y0)
                # Wrap the original operators in "<matrix> cm q ... Q".
                page_prefix = (
                    f" {m1.a} {m1.b} {m1.c} {m1.d} {m1.e} {m1.f} cm q ".encode()
                )
                page_suffix = b" Q "
                # Transform the page boxes with the same matrix and clear
                # the Rotate key.
                update_page_bbox(doc, page, page.cropbox * m1, "CropBox")
                update_page_bbox(doc, page, page.artbox * m1, "ArtBox")
                update_page_bbox(doc, page, page.bleedbox * m1, "BleedBox")
                update_page_bbox(doc, page, page.mediabox * m1, "MediaBox")
                doc.xref_set_key(page.xref, "Rotate", "0")
            doc.update_object(r, "<<>>")
            doc.update_stream(r, page_prefix + b" ".join(page_streams) + page_suffix)
            doc.xref_set_key(page.xref, "Contents", f"{r} 0 R")
box, key): if doc.xref_get_key(page.xref, key)[0] == "array": doc.xref_set_key(page.xref, key, f"[{box.x0} {box.y0} {box.x1} {box.y1}]") def do_translate( pm: ProgressMonitor, translation_config: TranslationConfig ) -> TranslateResult: try: translation_config.progress_monitor = pm original_pdf_path = translation_config.input_file logger.info(f"start to translate: {original_pdf_path}") try: check_metadata(Document(original_pdf_path)) except InputFileGeneratedByBabelDOCError as e: logger.error( f"input file {original_pdf_path} is generated by BabelDOC, Cannot translate files that have already been translated." ) raise e except Exception as e: logger.warning(f"Error in check metadata, continue: {e}") start_time = time.time() peak_memory_usage = 0 with MemoryMonitor() as memory_monitor: # Check if split translation is enabled if not translation_config.split_strategy: result = _do_translate_single(pm, translation_config) else: # Initialize split manager and determine split points split_manager = SplitManager(translation_config) split_points = split_manager.determine_split_points(translation_config) if not split_points: logger.warning( "No split points determined, falling back to single translation" ) result = _do_translate_single(pm, translation_config) else: logger.info(f"Split points determined: {len(split_points)} parts") if len(split_points) == 1: logger.info("Only one part, use single translation") result = _do_translate_single(pm, translation_config) else: pm.total_parts = len(split_points) # Process parts serially results: dict[int, TranslateResult | None] = {} original_watermark_mode = ( translation_config.watermark_output_mode ) original_doc = Document(original_pdf_path) for i, split_point in enumerate(split_points): try: # Create a copy of config for this part part_config = copy.copy(translation_config) part_config.skip_clean = True should_translate_pages = [] for page in range( split_point.start_page, split_point.end_page + 1 ): if 
translation_config.should_translate_page( page + 1 ): should_translate_pages.append( page - split_point.start_page + 1 ) part_config.pages = None part_config.page_ranges = [ (x, x) for x in should_translate_pages ] if ( translation_config.only_include_translated_page and not should_translate_pages ): results[i] = None continue # Only first part should do scanned detection if enabled if i > 0: part_config.skip_scanned_detection = True part_config.working_dir = ( translation_config.get_part_working_dir(i) ) part_config.output_dir = ( translation_config.get_part_output_dir(i) ) assert id( part_config.shared_context_cross_split_part ) == id( translation_config.shared_context_cross_split_part ), "shared_context_cross_split_part must be the same" part_temp_input_path = ( part_config.get_working_file_path( f"input.part{i}.pdf" ) ) part_config.input_file = part_temp_input_path temp_doc = Document() for x in range( split_point.start_page, split_point.end_page + 1 ): xref = original_doc[x].xref if ( original_doc.xref_get_key(xref, "Annots")[0] != "null" ): original_doc.xref_set_key( xref, "Annots", "null" ) temp_doc.insert_pdf( original_doc, from_page=split_point.start_page, to_page=split_point.end_page, ) safe_save(temp_doc, part_temp_input_path) assert ( temp_doc.page_count == split_point.end_page - split_point.start_page + 1 ) # Only first part should have watermark if i > 0: part_config.watermark_output_mode = ( WatermarkOutputMode.NoWatermark ) # Create progress monitor for this part part_monitor = pm.create_part_monitor( i, len(split_points) ) # Process this part result = _do_translate_single( part_monitor, part_config, ) results[i] = result except Exception as e: logger.error(f"Error in part {i}: {e}") pm.translate_error(e) raise finally: # Clean up part working directory translation_config.cleanup_part_working_dir(i) # Restore original watermark mode translation_config.watermark_output_mode = ( original_watermark_mode ) # Merge results merger = 
def migrate_toc(
    translation_config: TranslationConfig, translate_result: TranslateResult
):
    """Copy the table of contents of the input PDF onto the dual output PDFs.

    The TOC is read from ``translation_config.input_file`` (after the
    best-effort ``fix_filter`` / ``fix_null_xref`` repairs) and written with
    ``set_toc`` to ``translate_result.dual_pdf_path`` and
    ``translate_result.no_watermark_dual_pdf_path``.  Each target is first
    copied to a working file, opened from there, and saved back over the
    original path.

    Nothing is done when ``use_alternating_pages_dual`` is enabled, when the
    input document is falsy, or when it has no TOC.

    Args:
        translation_config: Active translation configuration.
        translate_result: Result whose dual PDF paths are updated in place
            on disk.
    """
    if translation_config.use_alternating_pages_dual:
        logger.info('skipping TOC migration for "use_alternating_pages_dual" mode')
        return

    old_doc = Document(translation_config.input_file)
    if not old_doc:
        return

    try:
        fix_filter(old_doc)
        fix_null_xref(old_doc)
    except Exception:
        # Repairs are best-effort; still try to read the TOC afterwards.
        logger.exception("auto fix failed, please check the pdf file")

    toc_data = old_doc.get_toc()

    if not toc_data:
        logger.info("No TOC found in the original PDF, skipping migration.")
        return

    if translation_config.only_include_translated_page:
        page_count = len(old_doc)
        # Iterate page indices.  The previous code iterated ``len(old_doc)``
        # itself, which raises TypeError ("'int' object is not iterable")
        # and made TOC migration fail whenever this option was enabled.
        pages_to_translate = {
            i
            for i in range(page_count)
            if translation_config.should_translate_page(i + 1)
        }
        should_removed_page = sorted(set(range(page_count)) - pages_to_translate)
        # NOTE(review): the list of untranslated pages is computed but not
        # used to remap or filter ``toc_data``; TOC page numbers are written
        # exactly as read from the input.  Confirm whether entries need
        # adjusting when pages are left out of the output.
        logger.debug(
            "pages not translated (TOC entries are not remapped): %s",
            should_removed_page,
        )

    # Only the dual outputs receive the TOC; the mono outputs were already
    # excluded (commented out) in the original implementation.
    files = {
        translate_result.dual_pdf_path,
        # translate_result.mono_pdf_path,
        translate_result.no_watermark_dual_pdf_path,
        # translate_result.no_watermark_mono_pdf_path
    }

    for f in files:
        if not f:
            continue
        # Work on a copy so the document being saved to ``f`` is not the
        # file that is currently open.
        mig_toc_temp_input = translation_config.get_working_file_path(
            "mig_toc_temp.pdf"
        )
        shutil.copy(f, mig_toc_temp_input)
        new_doc = Document(mig_toc_temp_input.as_posix())
        if not new_doc:
            continue

        new_doc.set_toc(toc_data)
        PDFCreater.save_pdf_with_timeout(
            new_doc,
            f.as_posix(),
            translation_config=translation_config,
            clean=not translation_config.skip_clean,
            tag="mig_toc",
        )
def check_cid_char(il: il_version_1.Document):
    """Tell whether ``(cid:N)`` placeholders dominate the document's characters.

    Every entry of ``pdf_character`` on every page of ``il`` is counted; an
    entry is a placeholder when its ``char_unicode`` is exactly of the form
    ``(cid:<digits>)``.

    Returns:
        True when the placeholders number strictly more than 80% of all
        characters, otherwise False (including for a document without
        characters).
    """
    cid_pattern = r"^\(cid:\d+\)$"
    total = 0
    placeholders = 0
    for page in il.page:
        for glyph in page.pdf_character:
            total += 1
            if re.match(cid_pattern, glyph.char_unicode):
                placeholders += 1
    return placeholders > total * 0.8
translation_config.skip_scanned_detection and DetectScannedFile( # translation_config # ).fast_check(doc_pdf2zh): # if translation_config.auto_enable_ocr_workaround: # logger.warning( # "Fast scanned check hit, Turning on OCR workaround.", # ) # translation_config.shared_context_cross_split_part.auto_enabled_ocr_workaround = True # translation_config.ocr_workaround = True # translation_config.skip_scanned_detection = True # else: # logger.warning( # "Fast scanned check hit, Please check the input PDF file.", # ) # raise ScannedPDFError("Scanned PDF detected.") il_creater = ILCreater(translation_config) il_creater.mupdf = doc_pdf2zh xml_converter = XMLConverter() logger.debug(f"start parse il from {temp_pdf_path}") with Path(temp_pdf_path).open("rb") as f: start_parse_il( f, doc_zh=doc_pdf2zh, resfont=resfont, il_creater=il_creater, translation_config=translation_config, ) logger.debug(f"finish parse il from {temp_pdf_path}") docs = il_creater.create_il() logger.debug(f"finish create il from {temp_pdf_path}") del il_creater if translation_config.only_include_translated_page and not docs.page: return None if translation_config.debug: xml_converter.write_json( docs, translation_config.get_working_file_path("create_il.debug.json"), ) if check_cid_char(docs): raise ExtractTextError("The document contains too many CID chars.") # Skip all translation processing if only_parse_generate_pdf is enabled if translation_config.only_parse_generate_pdf: logger.debug("only_parse_generate_pdf enabled, skipping translation processing") # Skip directly to PDF generation pdf_creater = PDFCreater(temp_pdf_path, docs, translation_config, mediabox_data) result = pdf_creater.write(translation_config) result.original_pdf_path = translation_config.input_file return result # Rest of the original translation logic... 
# [Previous implementation of do_translate continues here] # 检测是否为扫描文件 if translation_config.skip_scanned_detection: logger.debug("skipping scanned file detection") else: logger.debug("start detect scanned file") DetectScannedFile(translation_config).process( docs, temp_pdf_path, mediabox_data ) logger.debug("finish detect scanned file") if translation_config.debug: xml_converter.write_json( docs, translation_config.get_working_file_path("detect_scanned_file.json"), ) # Generate layouts for all pages logger.debug("start generating layouts") docs = LayoutParser(translation_config).process(docs, doc_pdf2zh) logger.debug("finish generating layouts") close_process_pool() if translation_config.debug: xml_converter.write_json( docs, translation_config.get_working_file_path("layout_generator.json"), ) if translation_config.table_model: docs = TableParser(translation_config).process(docs, doc_pdf2zh) logger.debug("finish table parser") if translation_config.debug: xml_converter.write_json( docs, translation_config.get_working_file_path("table_parser.json"), ) ParagraphFinder(translation_config).process(docs) logger.debug(f"finish paragraph finder from {temp_pdf_path}") if translation_config.debug: xml_converter.write_json( docs, translation_config.get_working_file_path("paragraph_finder.json"), ) StylesAndFormulas(translation_config).process(docs) logger.debug(f"finish styles and formulas from {temp_pdf_path}") if translation_config.debug: xml_converter.write_json( docs, translation_config.get_working_file_path("styles_and_formulas.json"), ) translate_engine = translation_config.translator term_extraction_engine = translation_config.get_term_extraction_translator() support_llm_translate = translator_supports_llm(translate_engine) support_llm_term_extraction = translator_supports_llm(term_extraction_engine) if support_llm_term_extraction and translation_config.auto_extract_glossary: AutomaticTermExtractor(term_extraction_engine, translation_config).procress( docs ) if not 
translation_config.skip_translation: if support_llm_translate: il_translator = ILTranslatorLLMOnly(translate_engine, translation_config) else: il_translator = ILTranslator(translate_engine, translation_config) il_translator.translate(docs) del il_translator logger.debug(f"finish ILTranslator from {temp_pdf_path}") else: logger.info("skip ILTranslator") if translation_config.debug: xml_converter.write_json( docs, translation_config.get_working_file_path("il_translated.json"), ) if translation_config.debug: AddDebugInformation(translation_config).process(docs) xml_converter.write_json( docs, translation_config.get_working_file_path("add_debug_information.json"), ) mono_watermark_first_page_doc_bytes = None dual_watermark_first_page_doc_bytes = None try: if translation_config.watermark_output_mode == WatermarkOutputMode.Both: mono_watermark_first_page_doc_bytes, dual_watermark_first_page_doc_bytes = ( generate_first_page_with_watermark( doc_pdf2zh, translation_config, docs, mediabox_data ) ) except Exception: logger.warning( "Failed to generate watermark for first page, using no watermark" ) translation_config.watermark_output_mode = WatermarkOutputMode.NoWatermark mono_watermark_first_page_doc_bytes = None dual_watermark_first_page_doc_bytes = None Typesetting(translation_config).typesetting_document(docs) logger.debug(f"finish typsetting from {temp_pdf_path}") if translation_config.debug: xml_converter.write_json( docs, translation_config.get_working_file_path("typsetting.json"), ) pdf_creater = PDFCreater(temp_pdf_path, docs, translation_config, mediabox_data) result = pdf_creater.write(translation_config) try: if mono_watermark_first_page_doc_bytes: mono_watermark_pdf = merge_watermark_doc( result.mono_pdf_path, mono_watermark_first_page_doc_bytes, translation_config, ) result.mono_pdf_path = mono_watermark_pdf except Exception: result.mono_pdf_path = result.no_watermark_mono_pdf_path try: if dual_watermark_first_page_doc_bytes: dual_watermark_pdf = 
def _read_pdf_and_unlink(pdf_path) -> io.BytesIO:
    """Load the file at ``pdf_path`` into memory, delete it, and return a rewound buffer."""
    path = Path(pdf_path)
    buffer = io.BytesIO(path.read_bytes())
    path.unlink()
    return buffer


def generate_first_page_with_watermark(
    mupdf: Document,
    translation_config: TranslationConfig,
    doc_il: il_version_1.Document,
    mediabox_data: dict[int, Any] | None = None,
) -> tuple[io.BytesIO | None, io.BytesIO | None]:
    """Render only the first page with ``WatermarkOutputMode.Watermarked``.

    A one-page PDF and a one-page IL document are built from page 0 of
    ``mupdf`` / ``doc_il``, typeset and written with a shallow copy of
    ``translation_config`` whose watermark mode is forced to
    ``Watermarked``.  The generated files are read into memory and removed
    from disk.

    The config copy is shallow, so ``progress_monitor`` is the same object
    as on ``translation_config``; it is disabled for the duration of the
    call and set back to enabled (``disable = False``) afterwards, even on
    failure.

    Args:
        mupdf: Source document; only page 0 is used.
        translation_config: Configuration to copy for the watermarked run.
        doc_il: IL document; only ``page[0]`` is deep-copied and used.
        mediabox_data: Passed through to ``PDFCreater`` unchanged.

    Returns:
        ``(mono, dual)`` buffers positioned at offset 0; an element is
        ``None`` when the corresponding output path of the result is falsy.
    """
    first_page_doc = Document()
    first_page_doc.insert_pdf(mupdf, from_page=0, to_page=0)

    il_only_first_page_doc = il_version_1.Document()
    il_only_first_page_doc.total_pages = 1
    il_only_first_page_doc.page = [copy.deepcopy(doc_il.page[0])]

    watermarked_config = copy.copy(translation_config)
    watermarked_config.watermark_output_mode = WatermarkOutputMode.Watermarked
    try:
        watermarked_config.progress_monitor.disable = True
        watermarked_temp_pdf_path = watermarked_config.get_working_file_path(
            "watermarked_temp_input.pdf"
        )
        safe_save(first_page_doc, watermarked_temp_pdf_path)

        # NOTE(review): this call is spelled ``typsetting_document`` while
        # ``_do_translate_single`` in this file calls
        # ``typesetting_document``.  Confirm which name ``Typesetting``
        # actually defines; a wrong name here would only surface as the
        # caller's "Failed to generate watermark" warning.
        Typesetting(watermarked_config).typsetting_document(il_only_first_page_doc)
        pdf_creater = PDFCreater(
            watermarked_temp_pdf_path.as_posix(),
            il_only_first_page_doc,
            watermarked_config,
            mediabox_data,
        )
        result = pdf_creater.write(watermarked_config)

        mono_pdf_bytes = (
            _read_pdf_and_unlink(result.mono_pdf_path)
            if result.mono_pdf_path
            else None
        )
        dual_pdf_bytes = (
            _read_pdf_and_unlink(result.dual_pdf_path)
            if result.dual_pdf_path
            else None
        )
        return mono_pdf_bytes, dual_pdf_bytes
    finally:
        watermarked_config.progress_monitor.disable = False
def create_cache_folder():
    """Create ``CACHE_FOLDER`` (and missing parents) if it does not exist yet.

    Raises:
        SystemExit: With exit status 1 when the directory cannot be created
            (``OSError``); the failure is logged at critical level first.
    """
    try:
        logger.debug(f"create cache folder at {CACHE_FOLDER}")
        Path(CACHE_FOLDER).mkdir(parents=True, exist_ok=True)
    except OSError:
        logger.critical(
            f"Failed to create cache folder at {CACHE_FOLDER}",
            exc_info=True,
        )
        # Raise SystemExit directly: the ``exit`` builtin is injected by the
        # ``site`` module and is missing under ``python -S`` and in some
        # frozen/embedded interpreters.
        raise SystemExit(1)
Color from babeldoc.pdfminer.pdfinterp import PDFContentParser from babeldoc.pdfminer.pdfinterp import PDFInterpreterError from babeldoc.pdfminer.pdfinterp import PDFPageInterpreter from babeldoc.pdfminer.pdfinterp import PDFResourceManager from babeldoc.pdfminer.pdfinterp import PDFStackT from babeldoc.pdfminer.pdfpage import PDFPage from babeldoc.pdfminer.pdftypes import LITERALS_ASCII85_DECODE from babeldoc.pdfminer.pdftypes import PDFObjRef from babeldoc.pdfminer.pdftypes import PDFStream from babeldoc.pdfminer.pdftypes import dict_value from babeldoc.pdfminer.pdftypes import list_value from babeldoc.pdfminer.pdftypes import resolve1 from babeldoc.pdfminer.pdftypes import stream_value from babeldoc.pdfminer.psexceptions import PSEOF from babeldoc.pdfminer.psexceptions import PSTypeError from babeldoc.pdfminer.psparser import PSKeyword from babeldoc.pdfminer.psparser import PSLiteral from babeldoc.pdfminer.psparser import keyword_name from babeldoc.pdfminer.psparser import literal_name from babeldoc.pdfminer.utils import MATRIX_IDENTITY from babeldoc.pdfminer.utils import Matrix from babeldoc.pdfminer.utils import Rect from babeldoc.pdfminer.utils import apply_matrix_pt from babeldoc.pdfminer.utils import choplist from babeldoc.pdfminer.utils import mult_matrix log = logging.getLogger(__name__) def safe_float(o: Any) -> float | None: try: return float(o) except (TypeError, ValueError): return None class PDFContentParserEx(PDFContentParser): def __init__(self, streams: Sequence[object]) -> None: super().__init__(streams) def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_BI: # inline image within a content stream self.start_type(pos, "inline") elif token is self.KEYWORD_ID: try: (_, objs) = self.end_type("inline") if len(objs) % 2 != 0: error_msg = f"Invalid dictionary construct: {objs!r}" raise PSTypeError(error_msg) d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)} eos = b"EI" filter_ = d.get("F", None) if 
def dup(self) -> "PDFPageInterpreterEx":
    """Create another interpreter of this instance's class.

    The new interpreter is constructed with the same resource manager,
    device, ``obj_patch`` and ``il_creater`` objects as this one (the
    objects themselves are passed on, not copies).
    """
    shared = (self.rsrcmgr, self.device, self.obj_patch, self.il_creater)
    return self.__class__(*shared)
# Overridden to return the call arguments (SCN)
def do_SCN(self) -> list:
    """Set color for stroking operations and return the operands consumed.

    All operands currently on the argument stack are popped, reported to
    ``il_creater`` as an ``SCN`` passthrough instruction, stored as the
    stroking colour of the graphics state, and returned so that ``do_SC``
    can re-emit them under its own operator name.

    Raises:
        PDFInterpreterError: In strict mode when no stroking colour space
            is set.
    """
    if not self.scs and settings.STRICT:
        raise PDFInterpreterError("No colorspace specified!")
    # The operand count comes from the stack itself.  The earlier code also
    # derived a count from the colour space (``ncomponents`` or 1) but
    # overwrote it before use; those dead stores are removed.
    # NOTE(review): presumably the whole stack is taken because the number
    # of operands does not always equal the component count - confirm.
    args = self.pop(len(self.argstack))
    self.il_creater.on_passthrough_per_char("SCN", args)
    self.graphicstate.scolor = cast(Color, args)
    return args
# Ensure bbox has four numbers, otherwise determine it as an illegal image
# For example, some Form's bbox is '[ null -.00487 1.00412 .99393 ]'
def do_Do(self, xobjid_arg: PDFStackT) -> None:
    """Invoke named XObject.

    Overridden so that form XObjects additionally get an ``obj_patch``
    entry and so that ``il_creater`` is notified about forms and images.

    * Form XObjects (``Subtype`` is ``LITERAL_FORM`` and a ``BBox`` is
      present) are rendered by a duplicate interpreter using
      ``matrix x self.ctm`` as the transformation.  The inverse of that
      transformation is formatted as a ``cm`` operator, handed to
      ``il_creater.on_xobj_end`` and used in the ``obj_patch`` entry.
    * Image XObjects (``LITERAL_IMAGE`` with ``Width`` and ``Height``) are
      passed to the device inside a unit-square figure.
    * Any other XObject is ignored.

    Raises:
        PDFInterpreterError: In strict mode when ``xobjid_arg`` names an
            XObject that is not in the current resources.
    """
    xobjid = literal_name(xobjid_arg)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        if settings.STRICT:
            raise PDFInterpreterError(f"Undefined xobject id: {xobjid!r}") from None
        return
    subtype = xobj.get("Subtype")
    if subtype is LITERAL_FORM and "BBox" in xobj:
        interpreter = self.dup()
        # In extremely rare cases, a none might be mixed in the bbox, for example
        # /BBox [ 0 3.052 null 274.9 157.3 ]
        bbox = [v for v in cast(Rect, list_value(xobj["BBox"])) if v is not None]
        if len(bbox) < 4:
            return
        matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
        # According to PDF reference 1.7 section 4.9.1, XObjects in
        # earlier PDFs (prior to v1.2) use the page's Resources entry
        # instead of having their own Resources entry.
        xobjres = xobj.get("Resources")
        if xobjres:
            resources = dict_value(xobjres)
        else:
            resources = self.resources.copy()
        self.il_creater.on_xobj_form(
            self.ctm,
            self.il_creater.xobj_id,
            xobj.objid,
            "form",
            xobjid,
            bbox,
            matrix,
        )
        self.device.begin_figure(xobjid, bbox, matrix)
        ctm = mult_matrix(matrix, self.ctm)
        (x, y, x2, y2) = guarded_bbox(bbox)
        (x, y) = apply_matrix_pt(ctm, (x, y))
        (x2, y2) = apply_matrix_pt(ctm, (x2, y2))
        x_id = self.il_creater.on_xobj_begin((x, y, x2, y2), xobj.objid)
        try:
            ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
        except Exception:
            # The transformation cannot be inverted: close the xobj on the
            # IL side with an empty instruction and give up on this form.
            # NOTE(review): device.begin_figure() above has no matching
            # end_figure() on this path - confirm that is intended.
            self.il_creater.on_xobj_end(x_id, " ")
            return
        # Inverse translation = -(e, f) x inverse linear part.  Plain
        # ndarray math is used so no numpy-version switch between np.mat
        # and np.asmatrix is needed (the old switch compared version
        # strings, which misorders e.g. "10" and "2").
        pos_inv = (-np.array(ctm[4:])) @ ctm_inv
        a, b, c, d = ctm_inv.reshape(4).tolist()
        e, f = pos_inv.tolist()
        ops_base = interpreter.render_contents(
            resources,
            [xobj],
            ctm=ctm,
        )
        # Carry the colour spaces selected inside the form back to this
        # interpreter.
        self.ncs = interpreter.ncs
        self.scs = interpreter.scs
        self.il_creater.on_xobj_end(
            x_id,
            f"{a:.6f} {b:.6f} {c:.6f} {d:.6f} {e:.6f} {f:.6f} cm ",
        )
        try:
            # Sometimes the form's fonts cannot be added and this step
            # breaks; it is best-effort.
            self.device.fontid = interpreter.fontid
            self.device.fontmap = interpreter.fontmap
            ops_new = self.device.end_figure(xobjid)
            # a..f are reused: ctm has not changed since they were computed.
            self.obj_patch[self.xobjmap[xobjid].objid] = (
                f"q {ops_base}Q {a:.6f} {b:.6f} {c:.6f} {d:.6f} {e:.6f} {f:.6f} cm {ops_new}"
            )
        except Exception:
            # Keep the best-effort behaviour but leave a trace instead of
            # swallowing the failure silently.
            log.debug(
                "failed to finish form xobject %r; obj_patch not updated",
                xobjid,
                exc_info=True,
            )
    elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
        self.il_creater.on_xobj_form(
            self.ctm,
            self.il_creater.xobj_id,
            xobj.objid,
            "image",
            xobjid,
            (0, 0, 1, 1),
            MATRIX_IDENTITY,
        )
        self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
        self.device.render_image(xobjid, xobj)
        self.device.end_figure(xobjid)
    else:
        # unsupported xobject type.
        pass
""" # log.debug( # "render_contents: resources=%r, streams=%r, ctm=%r", # resources, # streams, # ctm, # ) self.init_resources(resources) self.init_state(ctm) return self.execute(list_value(streams)) def do_q(self) -> None: """Save graphics state""" self.gstack.append(self.get_current_state()) self.il_creater.push_passthrough_per_char_instruction() return def do_Q(self) -> None: """Restore graphics state""" if self.gstack: self.set_current_state(self.gstack.pop()) self.il_creater.pop_passthrough_per_char_instruction() return def do_TJ(self, seq: PDFStackT) -> None: """Show text, allowing individual glyph positioning""" if self.textstate.font is None: if settings.STRICT: raise PDFInterpreterError("No font specified!") return if isinstance(seq, PSLiteral): return assert self.ncs is not None gs = self.graphicstate.copy() gs.passthrough_instruction = ( self.il_creater.passthrough_per_char_instruction.copy() ) if isinstance(seq, int) or isinstance(seq, float): seq = [seq] self.device.render_string(self.textstate, cast(PDFTextSeq, seq), self.ncs, gs) return def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None: """Set line dash pattern""" self.graphicstate.dash = (dash, phase) self.il_creater.on_line_dash(dash, phase) def do_BI(self) -> None: """Begin inline image object""" self.il_creater.on_inline_image_begin() def do_ID(self) -> None: """Begin inline image data""" pass # Handled by PDFContentParserEx def do_EI(self, obj: PDFStackT) -> None: """End inline image object""" if isinstance(obj, PDFStream): self.il_creater.on_inline_image_end(obj, self.ctm) # Run PostScript commands # The Do_xxx method is the method for executing corresponding postscript instructions def execute(self, streams: Sequence[object]) -> None: ops = "" for stream in streams: self.il_creater.on_new_stream() # 重载返回指令流 try: parser = PDFContentParserEx([stream]) except PSEOF: # empty page return while True: try: (_, obj) = parser.nextobject() except PSEOF: break if isinstance(obj, PSKeyword): name 
class ResultMerger:
    """Merge the per-part outputs of a split translation into single files."""

    def __init__(self, translation_config: TranslationConfig):
        self.config = translation_config

    def merge_results(
        self, results: dict[int, TranslateResult | None]
    ) -> TranslateResult:
        """Merge multiple translation results into one.

        Args:
            results: Mapping of part index to that part's result. ``None``
                values are ignored.

        Returns:
            A ``TranslateResult`` whose paths point at the merged PDFs (or
            are ``None`` when a variant was disabled, absent, or failed to
            merge) and whose ``total_seconds`` is the sum over all parts.

        Raises:
            ValueError: If ``results`` is empty or contains only ``None``.
        """
        if not results:
            raise ValueError("No results to merge")

        # Drop missing parts and order the rest by part index so pages are
        # concatenated in document order.
        ordered = [
            result
            for _, result in sorted(results.items(), key=lambda item: item[0])
            if result is not None
        ]
        if not ordered:
            # Previously this surfaced as a bare StopIteration.
            raise ValueError("No results to merge")

        basename = Path(self.config.input_file).stem
        lang_out = self.config.lang_out
        debug_suffix = ".debug" if self.config.debug else ""
        mono_file_name = f"{basename}{debug_suffix}.{lang_out}.mono.pdf"
        dual_file_name = f"{basename}{debug_suffix}.{lang_out}.dual.pdf"
        debug_suffix += ".no_watermark"
        mono_file_name_no_watermark = f"{basename}{debug_suffix}.{lang_out}.mono.pdf"
        dual_file_name_no_watermark = f"{basename}{debug_suffix}.{lang_out}.dual.pdf"

        want_mono = not self.config.no_mono
        want_dual = not self.config.no_dual

        merged_mono_path = self._merge_variant(
            ordered,
            "mono_pdf_path",
            want_mono,
            mono_file_name,
            "merged_mono",
            "monolingual",
        )
        merged_dual_path = self._merge_variant(
            ordered,
            "dual_pdf_path",
            want_dual,
            dual_file_name,
            "merged_dual",
            "dual-language",
        )

        merged_no_watermark_mono_path = None
        merged_no_watermark_dual_path = None
        # A part only has distinct no-watermark files when they differ from
        # the regular ones; otherwise there is nothing extra to merge.
        has_distinct_no_watermark = any(
            r.dual_pdf_path != r.no_watermark_dual_pdf_path
            or r.mono_pdf_path != r.no_watermark_mono_pdf_path
            for r in ordered
        )
        if has_distinct_no_watermark:
            merged_no_watermark_mono_path = self._merge_variant(
                ordered,
                "no_watermark_mono_pdf_path",
                want_mono,
                mono_file_name_no_watermark,
                "merged_no_watermark_mono",
                "no-watermark",
            )
            # Fixed: this used the hard-coded name
            # "merged_no_watermark_dual.pdf" instead of the computed one.
            merged_no_watermark_dual_path = self._merge_variant(
                ordered,
                "no_watermark_dual_pdf_path",
                want_dual,
                dual_file_name_no_watermark,
                "merged_no_watermark_dual",
                "no-watermark",
            )

        auto_extracted_glossary_path = None
        if (
            self.config.save_auto_extracted_glossary
            and self.config.shared_context_cross_split_part.auto_extracted_glossary
        ):
            # NOTE(review): debug_suffix already carries ".no_watermark" at
            # this point, so the glossary file name contains it too. Kept
            # as-is to avoid renaming an existing output - confirm intent.
            auto_extracted_glossary_path = self.config.get_output_file_path(
                f"{basename}{debug_suffix}.{self.config.lang_out}.glossary.csv"
            )
            with auto_extracted_glossary_path.open("w", encoding="utf-8") as f:
                logger.info(
                    f"save auto extracted glossary to {auto_extracted_glossary_path}"
                )
                f.write(
                    self.config.shared_context_cross_split_part.auto_extracted_glossary.to_csv()
                )

        merged_result = TranslateResult(
            mono_pdf_path=merged_mono_path,
            dual_pdf_path=merged_dual_path,
            auto_extracted_glossary_path=auto_extracted_glossary_path,
        )
        merged_result.no_watermark_mono_pdf_path = merged_no_watermark_mono_path
        merged_result.no_watermark_dual_pdf_path = merged_no_watermark_dual_path

        # Mirror whichever variant exists into the missing slot so that both
        # attributes are populated when at least one file was produced.
        if merged_result.no_watermark_mono_pdf_path is None:
            merged_result.no_watermark_mono_pdf_path = merged_mono_path
        elif merged_result.mono_pdf_path is None:
            merged_result.mono_pdf_path = merged_no_watermark_mono_path

        if merged_result.no_watermark_dual_pdf_path is None:
            merged_result.no_watermark_dual_pdf_path = merged_dual_path
        elif merged_result.dual_pdf_path is None:
            merged_result.dual_pdf_path = merged_no_watermark_dual_path

        # total_seconds is only set on results that finished timing.
        merged_result.total_seconds = sum(
            r.total_seconds for r in ordered if hasattr(r, "total_seconds")
        )

        return merged_result

    def _merge_variant(
        self,
        ordered_results: list,
        path_attr: str,
        enabled: bool,
        output_name: str,
        tag: str,
        label: str,
    ) -> Path | None:
        """Merge one output variant, logging and returning None on failure.

        Args:
            ordered_results: Part results in page order.
            path_attr: Name of the result attribute holding this variant's
                PDF path.
            enabled: Whether the configuration asks for this variant.
            output_name: File name of the merged PDF.
            tag: Tag forwarded to ``_merge_pdfs``.
            label: Human-readable variant name used in the error message.
        """
        try:
            paths = [
                path for r in ordered_results if (path := getattr(r, path_attr))
            ]
            if paths and enabled:
                return self._merge_pdfs(paths, output_name, tag=tag)
        except Exception as e:
            # Best effort: one failed variant must not abort the others.
            logger.error(f"Error merging {label} PDFs: {e}")
        return None

    def _merge_pdfs(
        self, pdf_paths: list[str | Path], output_name: str, tag: str
    ) -> Path | None:
        """Concatenate ``pdf_paths`` in order and save them as ``output_name``.

        Returns:
            The path of the merged file in the configured output directory,
            or ``None`` when ``pdf_paths`` is empty.
        """
        if not pdf_paths:
            return None

        output_path = self.config.get_output_file_path(output_name)
        merged_doc = Document()
        for pdf_path in pdf_paths:
            doc = Document(str(pdf_path))
            try:
                merged_doc.insert_pdf(doc)
            finally:
                # Release the part file handle as soon as it has been copied.
                doc.close()

        merged_doc = PDFCreater.subset_fonts_in_subprocess(
            merged_doc, self.config, tag=tag
        )
        PDFCreater.save_pdf_with_timeout(
            merged_doc, str(output_path), translation_config=self.config
        )

        return output_path
class SharedContextCrossSplitPart:
    """State shared between the split parts of one translation job.

    Holds the user glossaries, the raw and finalized auto-extracted terms,
    and running totals of valid characters and tokens. Glossary and counter
    updates are serialized through an internal lock.
    """

    def __init__(self):
        self.first_paragraph = None
        self.recent_title_paragraph = None
        self._lock = threading.Lock()
        self.user_glossaries: list[Glossary] = []
        self.auto_extracted_glossary: Glossary | None = None
        self.raw_extracted_terms: list[tuple[str, str]] = []
        self.auto_enabled_ocr_workaround = False
        # Defined up front so contains_term() and
        # finalize_auto_extracted_glossary() work even if
        # initialize_glossaries() has not been called yet.
        self.unique_name: str = self._generate_unique_auto_glossary_name()
        self.norm_terms: set[str] = set()

        # Statistics for valid characters/text across the whole file
        self.valid_char_count_total: int = 0
        self.total_valid_text_token_count: int = 0

    def initialize_glossaries(self, initial_glossaries: list[Glossary] | None):
        """Reset all glossary state and statistics for a new run.

        Args:
            initial_glossaries: User-supplied glossaries; ``None`` or an
                empty list means none.
        """
        with self._lock:
            self.user_glossaries = (
                list(initial_glossaries) if initial_glossaries else []
            )
            self.auto_extracted_glossary = None
            self.raw_extracted_terms = []
            self.unique_name = self._generate_unique_auto_glossary_name()
            # Every key of each user glossary's normalized lookup table.
            self.norm_terms = {
                term
                for glossary in self.user_glossaries
                for term in glossary.normalized_lookup
            }
            # reset statistics buffer when initializing
            self.valid_char_count_total = 0
            self.total_valid_text_token_count = 0

    def add_raw_extracted_term_pair(self, src: str, tgt: str):
        """Record one extracted (source, target) pair for later finalization."""
        with self._lock:
            self.raw_extracted_terms.append((src, tgt))

    def _generate_unique_auto_glossary_name(self) -> str:
        """Return a glossary name that no user glossary already uses.

        Starts from ``auto_extracted_glossary`` and appends ``#1``, ``#2``,
        ... until the name is free. Does not take the lock itself.
        """
        base_name = "auto_extracted_glossary"
        existing_names = {g.name for g in self.user_glossaries}

        current_name = base_name
        suffix = 0
        while current_name in existing_names:
            suffix += 1
            current_name = f"{base_name}#{suffix}"
        return current_name

    def contains_term(self, term: str) -> bool:
        """Return True if ``term`` is a key collected from the user glossaries.

        The lookup is an exact set-membership test; the caller is expected
        to pass the term in the same normalized form as the glossary keys.
        """
        with self._lock:
            try:
                return term in self.norm_terms
            except TypeError:
                # Unhashable input cannot be a known term.
                return False

    def finalize_auto_extracted_glossary(self):
        """Build the auto-extracted glossary from the recorded term pairs.

        For each source term the most frequently recorded target is kept.
        Leaves ``auto_extracted_glossary`` as ``None`` when no pairs were
        recorded.
        """
        with self._lock:
            self.auto_extracted_glossary = None

            if not self.raw_extracted_terms:
                self.raw_extracted_terms = []
                return

            term_translations: dict[str, list[str]] = {}
            for src, tgt in self.raw_extracted_terms:
                term_translations.setdefault(src, []).append(tgt)

            # Each list holds at least one target by construction.
            final_entries = [
                GlossaryEntry(src, Counter(tgts).most_common(1)[0][0])
                for src, tgts in term_translations.items()
            ]

            if final_entries:
                self.auto_extracted_glossary = Glossary(
                    name=self.unique_name, entries=final_entries
                )

    def _all_glossaries_unlocked(self) -> list[Glossary]:
        """Return user glossaries plus the auto glossary; caller holds the lock."""
        all_glossaries = list(self.user_glossaries)
        if self.auto_extracted_glossary:
            all_glossaries.append(self.auto_extracted_glossary)
        return all_glossaries

    def get_glossaries(self) -> list[Glossary]:
        """Return a new list of all glossaries, user-supplied ones first."""
        with self._lock:
            return self._all_glossaries_unlocked()

    def get_glossaries_for_translation(
        self, auto_extract_enabled: bool
    ) -> list[Glossary]:
        """Return the glossaries to apply while translating.

        When ``auto_extract_enabled`` is true and an auto-extracted glossary
        exists, only that glossary is returned; otherwise the result is the
        same as ``get_glossaries``.
        """
        with self._lock:
            if auto_extract_enabled and self.auto_extracted_glossary:
                return [self.auto_extracted_glossary]
            return self._all_glossaries_unlocked()

    def add_valid_counts(self, char_count: int, token_count: int):
        """Accumulate valid character and token counts in a threadsafe way.

        Non-positive values are ignored individually.
        """
        if char_count <= 0 and token_count <= 0:
            return
        with self._lock:
            if char_count > 0:
                self.valid_char_count_total += char_count
            if token_count > 0:
                self.total_valid_text_token_count += token_count
str | None = None, add_formula_placehold_hint: bool = False, glossaries: list[Glossary] | None = None, pool_max_workers: int | None = None, auto_extract_glossary: bool = True, auto_enable_ocr_workaround: bool = False, primary_font_family: str | None = None, only_include_translated_page: bool | None = False, save_auto_extracted_glossary: bool = True, enable_graphic_element_process: bool = True, merge_alternating_line_numbers: bool = True, skip_translation: bool = False, skip_form_render: bool = False, skip_curve_render: bool = False, only_parse_generate_pdf: bool = False, remove_non_formula_lines: bool = False, non_formula_line_iou_threshold: float = 0.9, figure_table_protection_threshold: float = 0.9, skip_formula_offset_calculation: bool = False, term_extraction_translator: BaseTranslator | None = None, metadata_extra_data: str | None = None, term_pool_max_workers: int | None = None, disable_same_text_fallback: bool = False, ): self.translator = translator self.term_extraction_translator = term_extraction_translator or translator initial_user_glossaries = list(glossaries) if glossaries else [] self.input_file = input_file self.lang_in = lang_in self.lang_out = lang_out # just ignore font self.font = None self.pages = pages self.page_ranges = self.parse_pages(pages) if pages else None self.debug = debug self.watermark_output_mode = watermark_output_mode self.output_dir = output_dir self.working_dir = working_dir self.no_dual = no_dual self.no_mono = no_mono self.formular_font_pattern = formular_font_pattern self.formular_char_pattern = formular_char_pattern self.qps = qps # Set pool_max_workers with default value from qps self.pool_max_workers = ( pool_max_workers if pool_max_workers is not None else qps ) # Set term_pool_max_workers for automatic term extraction. # If not provided, default to pool_max_workers. 
self.term_pool_max_workers = ( term_pool_max_workers if term_pool_max_workers is not None else self.pool_max_workers ) self.split_short_lines = split_short_lines self.short_line_split_factor = short_line_split_factor self.use_rich_pbar = use_rich_pbar self.progress_monitor = progress_monitor self.doc_layout_model = doc_layout_model self.skip_clean = skip_clean or enhance_compatibility self.skip_scanned_detection = skip_scanned_detection self.dual_translate_first = dual_translate_first or enhance_compatibility self.disable_rich_text_translate = ( disable_rich_text_translate or enhance_compatibility ) self.report_interval = report_interval self.min_text_length = min_text_length self.use_alternating_pages_dual = use_alternating_pages_dual self.ocr_workaround = ocr_workaround self.merge_alternating_line_numbers = merge_alternating_line_numbers if self.ocr_workaround: self.skip_scanned_detection = True self.disable_rich_text_translate = True # for backward compatibility if use_side_by_side_dual is False and use_alternating_pages_dual is False: self.use_alternating_pages_dual = True if progress_monitor and progress_monitor.cancel_event is None: progress_monitor.cancel_event = threading.Event() if working_dir is None: if debug: working_dir = Path(CACHE_FOLDER) / "working" / Path(input_file).stem self._is_temp_dir = False else: working_dir = tempfile.mkdtemp() self._is_temp_dir = True else: working_dir = Path(working_dir) / Path(input_file).stem self._is_temp_dir = False self.working_dir = working_dir Path(working_dir).mkdir(parents=True, exist_ok=True) if output_dir is None: output_dir = Path.cwd() self.output_dir = output_dir Path(output_dir).mkdir(parents=True, exist_ok=True) if not doc_layout_model: from babeldoc.docvision.doclayout import DocLayoutModel doc_layout_model = DocLayoutModel.load_available() self.doc_layout_model = doc_layout_model self.shared_context_cross_split_part = SharedContextCrossSplitPart() 
self.shared_context_cross_split_part.initialize_glossaries( initial_user_glossaries ) # Initialize split-related attributes self.split_strategy = split_strategy # Create a unique working directory for each part self._part_working_dirs: dict[int, Path] = {} self._part_output_dirs: dict[int, Path] = {} self.table_model = table_model self.show_char_box = show_char_box self.custom_system_prompt = custom_system_prompt self.add_formula_placehold_hint = add_formula_placehold_hint self.auto_extract_glossary = auto_extract_glossary self.auto_enable_ocr_workaround = auto_enable_ocr_workaround self.skip_translation = skip_translation self.only_parse_generate_pdf = only_parse_generate_pdf if self.skip_translation or self.only_parse_generate_pdf: self.auto_extract_glossary = False if auto_enable_ocr_workaround: self.ocr_workaround = False self.skip_scanned_detection = False assert primary_font_family in [ None, "serif", "sans-serif", "script", ] self.primary_font_family = primary_font_family if only_include_translated_page is None: only_include_translated_page = False self.only_include_translated_page = only_include_translated_page self.save_auto_extracted_glossary = save_auto_extracted_glossary # force disable table translate until the new model is ready self.table_model = None self.enable_graphic_element_process = enable_graphic_element_process self.skip_form_render = skip_form_render self.skip_curve_render = skip_curve_render self.remove_non_formula_lines = remove_non_formula_lines self.non_formula_line_iou_threshold = non_formula_line_iou_threshold self.figure_table_protection_threshold = figure_table_protection_threshold self.skip_formula_offset_calculation = skip_formula_offset_calculation self.metadata_extra_data = metadata_extra_data self.term_extraction_token_usage: dict[str, int] = { "total_tokens": 0, "prompt_tokens": 0, "completion_tokens": 0, "cache_hit_prompt_tokens": 0, } self.disable_same_text_fallback = disable_same_text_fallback if self.ocr_workaround: 
self.remove_non_formula_lines = False def parse_pages(self, pages_str: str | None) -> list[tuple[int, int]] | None: """解析页码字符串,返回页码范围列表 Args: pages_str: 形如 "1-,2,-3,4" 的页码字符串 Returns: 包含 (start, end) 元组的列表,其中 -1 表示无限制 """ if not pages_str: return None ranges: list[tuple[int, int]] = [] for part in pages_str.split(","): part = part.strip() if "-" in part: start, end = part.split("-") start_as_int = int(start) if start else 1 end_as_int = int(end) if end else -1 ranges.append((start_as_int, end_as_int)) else: page = int(part) ranges.append((page, page)) return ranges def should_translate_page(self, page_number: int) -> bool: """判断指定页码是否需要翻译 Args: page_number: 页码 Returns: 是否需要翻译该页 """ if isinstance(self.page_ranges, list) and len(self.page_ranges) == 0: return False if not self.page_ranges: return True for start, end in self.page_ranges: if start <= page_number and (end == -1 or page_number <= end): return True return False def get_output_file_path(self, filename: str) -> Path: return Path(self.output_dir) / filename def get_working_file_path(self, filename: str) -> Path: return Path(self.working_dir) / filename def get_part_working_dir(self, part_index: int) -> Path: """Get working directory for a specific part""" if part_index not in self._part_working_dirs: if self.working_dir: part_dir = Path(self.working_dir) / f"part_{part_index}" else: part_dir = Path(tempfile.mkdtemp()) / f"part_{part_index}" part_dir.mkdir(parents=True, exist_ok=True) self._part_working_dirs[part_index] = part_dir return self._part_working_dirs[part_index] def get_part_output_dir(self, part_index: int) -> Path: """Get output directory for a specific part""" if part_index not in self._part_output_dirs: part_dir = Path(self.working_dir) / f"part_{part_index}_output" part_dir.mkdir(parents=True, exist_ok=True) self._part_output_dirs[part_index] = part_dir return self._part_output_dirs[part_index] def cleanup_part_output_dir(self, part_index: int): """Clean up output directory for a specific 
part""" if part_index in self._part_output_dirs: part_dir = self._part_output_dirs[part_index] if part_dir.exists(): shutil.rmtree(part_dir) del self._part_output_dirs[part_index] def cleanup_part_working_dir(self, part_index: int): """Clean up working directory for a specific part""" if part_index in self._part_working_dirs: part_dir = self._part_working_dirs[part_index] if part_dir.exists(): shutil.rmtree(part_dir, ignore_errors=True) del self._part_working_dirs[part_index] def cleanup_temp_files(self): """Clean up all temporary files including part working directories""" try: for part_index in list(self._part_working_dirs.keys()): self.cleanup_part_working_dir(part_index) if self._is_temp_dir: logger.info(f"cleanup temp files: {self.working_dir}") shutil.rmtree(self.working_dir, ignore_errors=True) except Exception: logger.exception("Error cleaning up temporary files") def raise_if_cancelled(self): if self.progress_monitor is not None: self.progress_monitor.raise_if_cancelled() def cancel_translation(self): if self.progress_monitor is not None: self.progress_monitor.cancel() def get_term_extraction_translator(self) -> BaseTranslator: """Return the translator to use for automatic term extraction.""" return self.term_extraction_translator def record_term_extraction_usage( self, total_tokens: int, prompt_tokens: int, completion_tokens: int, cache_hit_prompt_tokens: int, ) -> None: """Accumulate token usage for automatic term extraction.""" if total_tokens > 0: self.term_extraction_token_usage["total_tokens"] += total_tokens if prompt_tokens > 0: self.term_extraction_token_usage["prompt_tokens"] += prompt_tokens if completion_tokens > 0: self.term_extraction_token_usage["completion_tokens"] += completion_tokens if cache_hit_prompt_tokens > 0: self.term_extraction_token_usage["cache_hit_prompt_tokens"] += ( cache_hit_prompt_tokens ) class TranslateResult: original_pdf_path: str total_seconds: float mono_pdf_path: Path | None dual_pdf_path: Path | None 
no_watermark_mono_pdf_path: Path | None no_watermark_dual_pdf_path: Path | None peak_memory_usage: int | None auto_extracted_glossary_path: Path | None total_valid_character_count: int | None total_valid_text_token_count: int | None def __init__( self, mono_pdf_path: Path | None, dual_pdf_path: Path | None, auto_extracted_glossary_path: Path | None = None, ): self.mono_pdf_path = mono_pdf_path self.dual_pdf_path = dual_pdf_path # For compatibility considerations, if only a non-watermarked PDF is generated, # the values of mono_pdf_path and no_watermark_mono_pdf_path are the same. self.no_watermark_mono_pdf_path = mono_pdf_path self.no_watermark_dual_pdf_path = dual_pdf_path self.auto_extracted_glossary_path = auto_extracted_glossary_path self.total_valid_character_count = None self.total_valid_text_token_count = None def __str__(self): """Return a human-readable string representation of the translation result.""" result = [] if hasattr(self, "original_pdf_path") and self.original_pdf_path: result.append(f"\tOriginal PDF: {self.original_pdf_path}") if hasattr(self, "total_seconds") and self.total_seconds: result.append(f"\tTotal time: {self.total_seconds:.2f} seconds") if self.mono_pdf_path: result.append(f"\tMonolingual PDF: {self.mono_pdf_path}") if self.dual_pdf_path: result.append(f"\tDual-language PDF: {self.dual_pdf_path}") if ( hasattr(self, "no_watermark_mono_pdf_path") and self.no_watermark_mono_pdf_path and self.no_watermark_mono_pdf_path != self.mono_pdf_path ): result.append( f"\tNo-watermark Monolingual PDF: {self.no_watermark_mono_pdf_path}" ) if ( hasattr(self, "no_watermark_dual_pdf_path") and self.no_watermark_dual_pdf_path and self.no_watermark_dual_pdf_path != self.dual_pdf_path ): result.append( f"\tNo-watermark Dual-language PDF: {self.no_watermark_dual_pdf_path}" ) if ( hasattr(self, "auto_extracted_glossary_path") and self.auto_extracted_glossary_path ): result.append( f"\tAuto-extracted glossary: {self.auto_extracted_glossary_path}" ) if 
hasattr(self, "peak_memory_usage") and self.peak_memory_usage: result.append(f"\tPeak memory usage: {self.peak_memory_usage} MB") if hasattr(self, "total_valid_character_count") and isinstance( self.total_valid_character_count, int ): result.append( f"\tTotal valid character count: {self.total_valid_character_count}" ) if hasattr(self, "total_valid_text_token_count") and isinstance( self.total_valid_text_token_count, int ): result.append( f"\tTotal valid text token count (gpt-4o): {self.total_valid_text_token_count}" ) if result: result.insert(0, "Translation results:") return "\n".join(result) if result else "No translation results available" ================================================ FILE: babeldoc/glossary.py ================================================ import csv import io import itertools import logging import re import time from pathlib import Path import chardet import hyperscan import regex logger = logging.getLogger(__name__) class GlossaryEntry: def __init__(self, source: str, target: str, target_language: str | None = None): self.source = source self.target = target self.target_language = target_language def __repr__(self): return f"GlossaryEntry(source='{self.source}', target='{self.target}', target_language='{self.target_language}')" def batched(iterable, n, *, strict=False): # batched('ABCDEFG', 3) → ABC DEF G if n < 1: raise ValueError("n must be at least one") iterator = iter(iterable) while batch := tuple(itertools.islice(iterator, n)): if strict and len(batch) != n: raise ValueError("batched(): incomplete batch") yield batch TERM_NORM_PATTERN = re.compile(r"\s+", regex.UNICODE) class Glossary: def __init__(self, name: str, entries: list[GlossaryEntry]): self.name = name # Deduplicate entries based on normalized source unique_entries = [] seen_normalized_sources = set() for entry in entries: normalized_source = self.normalize_source(entry.source) if normalized_source not in seen_normalized_sources: unique_entries.append(entry) 
@staticmethod
def normalize_source(source_term: str) -> str:
    """Return ``source_term`` lowercased with whitespace standardized.

    Every run of whitespace becomes a single space, and leading and
    trailing whitespace is removed.
    """
    collapsed = TERM_NORM_PATTERN.sub(" ", source_term.lower())
    return collapsed.strip()
@classmethod
def from_csv(cls, file_path: Path, target_lang_out: str) -> "Glossary":
    """Load glossary entries from a CSV file.

    The CSV must have ``source`` and ``target`` columns; ``tgt_lng`` is
    optional. Rows whose ``tgt_lng`` is set and does not match
    ``target_lang_out`` (compared case-insensitively, with ``-`` and ``_``
    treated alike) are skipped. The glossary name is the file name without
    its extension.

    Args:
        file_path: Path of the CSV file.
        target_lang_out: Target language to keep entries for.

    Returns:
        A ``Glossary`` built from the matching rows.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file cannot be decoded or parsed, or lacks the
            required columns.
    """
    glossary_name = file_path.stem
    loaded_entries: list[GlossaryEntry] = []

    # Normalize target_lang_out once for comparison
    normalized_target_lang_out = target_lang_out.lower().replace("-", "_")

    try:
        with file_path.open("rb") as f:
            content = f.read()
        # chardet reports None when it cannot decide (e.g. an empty file);
        # fall back to UTF-8 instead of failing inside decode().
        encoding = chardet.detect(content)["encoding"] or "utf-8"
        buffer = io.StringIO(content.decode(encoding))
        reader = csv.DictReader(buffer, doublequote=True)

        # fieldnames is None when the file has no header row at all.
        fieldnames = reader.fieldnames or []
        if not all(col in fieldnames for col in ["source", "target"]):
            raise ValueError(
                f"CSV file {file_path} must contain 'source' and 'target' columns."
            )

        for row in reader:
            source = row["source"]
            target = row["target"]
            if not source or target is None:
                # Short rows leave columns as None; such entries cannot be
                # normalized or matched, so skip them.
                logger.warning(
                    "Skipping incomplete glossary row %d in %s",
                    reader.line_num,
                    file_path,
                )
                continue

            tgt_lng = row.get("tgt_lng", None)  # Handle optional tgt_lng
            if tgt_lng and tgt_lng.strip():
                normalized_entry_tgt_lng = tgt_lng.strip().lower().replace("-", "_")
                if normalized_entry_tgt_lng != normalized_target_lang_out:
                    continue  # Skip if language doesn't match

            loaded_entries.append(GlossaryEntry(source, target, tgt_lng))
    except FileNotFoundError:
        # Let callers distinguish a missing file from a malformed one.
        raise
    except Exception as e:
        raise ValueError(
            f"Error reading or parsing CSV file {file_path}: {e}"
        ) from e

    return cls(name=glossary_name, entries=loaded_entries)
def get_active_entries_for_text(self, text: str) -> list[tuple[str, str]]:
    """Return the glossary entries whose source term is found in *text*.

    Every match of ``TERM_NORM_PATTERN`` in the text is replaced by a single
    space, then the text is scanned once with each compiled hyperscan
    database in ``self.hs_dbs``.

    Args:
        text: The text to search.

    Returns:
        A list of ``(original_source, target_text)`` tuples, in the order the
        matches were reported. Empty when there is no database or no text.
    """
    if not self.hs_dbs or not text:
        return []

    normalized = TERM_NORM_PATTERN.sub(" ", text)  # Normalize whitespace in the text
    if not normalized:
        return []

    # Encode once: the same buffer is scanned by every database chunk.
    payload = normalized.encode("utf-8")
    active_entries: list[tuple[str, str]] = []

    def on_match(
        idx: int, _from: int, _to: int, _flags: int, _context=None
    ) -> bool | None:
        # The pattern id is the entry's index in id_lookup.
        active_entries.append(self.id_lookup[idx])
        # NOTE(review): a falsy return presumably tells hyperscan to keep
        # scanning -- confirm against the hyperscan bindings.
        return False

    for hs_db in self.hs_dbs:
        # Scan the text with the hyperscan database
        scratch = hyperscan.Scratch(hs_db)
        hs_db.scan(payload, on_match, scratch=scratch)
    return active_entries
config_file_parser_class=configargparse.TomlConfigParser(["babeldoc"]), ) parser.add_argument( "-c", "--config", is_config_file=True, help="config file path", ) parser.add_argument( "--version", action="version", version=f"%(prog)s {__version__}", ) parser.add_argument( "--files", action="append", help="One or more paths to PDF files.", ) parser.add_argument( "--debug", action="store_true", help="Use debug logging level.", ) parser.add_argument( "--warmup", action="store_true", help="Only download and verify required assets then exit.", ) parser.add_argument( "--rpc-doclayout", help="RPC service host address for document layout analysis", ) parser.add_argument( "--rpc-doclayout2", help="RPC service host address for document layout analysis", ) parser.add_argument( "--rpc-doclayout3", help="RPC service host address for document layout analysis", ) parser.add_argument( "--rpc-doclayout4", help="RPC service host address for document layout analysis", ) parser.add_argument( "--rpc-doclayout5", help="RPC service host address for document layout analysis", ) parser.add_argument( "--rpc-doclayout6", help="RPC service host address for document layout analysis", ) parser.add_argument( "--rpc-doclayout7", help="RPC service host address for document layout analysis", ) parser.add_argument( "--generate-offline-assets", default=None, help="Generate offline assets package in the specified directory", ) parser.add_argument( "--restore-offline-assets", default=None, help="Restore offline assets package from the specified file", ) parser.add_argument( "--working-dir", default=None, help="Working directory for translation. 
If not set, use temp directory.", ) parser.add_argument( "--metadata-extra-data", default=None, help="Extra data for metadata", ) parser.add_argument( "--enable-process-pool", action="store_true", help="DEBUG ONLY", ) # translation option argument group translation_group = parser.add_argument_group( "Translation", description="Used during translation", ) translation_group.add_argument( "--pages", "-p", help="Pages to translate. If not set, translate all pages. like: 1,2,1-,-3,3-5", ) translation_group.add_argument( "--min-text-length", type=int, default=5, help="Minimum text length to translate (default: 5)", ) translation_group.add_argument( "--lang-in", "-li", default="en", help="The code of source language.", ) translation_group.add_argument( "--lang-out", "-lo", default="zh", help="The code of target language.", ) translation_group.add_argument( "--output", "-o", help="Output directory for files. if not set, use same as input.", ) translation_group.add_argument( "--qps", "-q", type=int, default=4, help="QPS limit of translation service", ) translation_group.add_argument( "--ignore-cache", action="store_true", help="Ignore translation cache.", ) translation_group.add_argument( "--no-dual", action="store_true", help="Do not output bilingual PDF files", ) translation_group.add_argument( "--no-mono", action="store_true", help="Do not output monolingual PDF files", ) translation_group.add_argument( "--formular-font-pattern", help="Font pattern to identify formula text", ) translation_group.add_argument( "--formular-char-pattern", help="Character pattern to identify formula text", ) translation_group.add_argument( "--split-short-lines", action="store_true", help="Force split short lines into different paragraphs (may cause poor typesetting & bugs)", ) translation_group.add_argument( "--short-line-split-factor", type=float, default=0.8, help="Split threshold factor. 
The actual threshold is the median length of all lines on the current page * this factor", ) translation_group.add_argument( "--skip-clean", action="store_true", help="Skip PDF cleaning step", ) translation_group.add_argument( "--dual-translate-first", action="store_true", help="Put translated pages first in dual PDF mode", ) translation_group.add_argument( "--disable-rich-text-translate", action="store_true", help="Disable rich text translation (may help improve compatibility with some PDFs)", ) translation_group.add_argument( "--enhance-compatibility", action="store_true", help="Enable all compatibility enhancement options (equivalent to --skip-clean --dual-translate-first --disable-rich-text-translate)", ) translation_group.add_argument( "--use-alternating-pages-dual", action="store_true", help="Use alternating pages mode for dual PDF. When enabled, original and translated pages are arranged in alternate order.", ) translation_group.add_argument( "--watermark-output-mode", type=str, choices=["watermarked", "no_watermark", "both"], default="watermarked", help="Control watermark output mode: 'watermarked' (default) adds watermark to translated PDF, 'no_watermark' doesn't add watermark, 'both' outputs both versions.", ) translation_group.add_argument( "--max-pages-per-part", type=int, help="Maximum number of pages per part for split translation. If not set, no splitting will be performed.", ) translation_group.add_argument( "--no-watermark", action="store_true", help="[DEPRECATED] Use --watermark-output-mode=no_watermark instead. 
Do not add watermark to the translated PDF.", ) translation_group.add_argument( "--report-interval", type=float, default=0.1, help="Progress report interval in seconds (default: 0.1)", ) translation_group.add_argument( "--translate-table-text", action="store_true", default=False, help="Translate table text (experimental)", ) translation_group.add_argument( "--show-char-box", action="store_true", default=False, help="Show character box (debug only)", ) translation_group.add_argument( "--skip-scanned-detection", action="store_true", default=False, help="Skip scanned document detection (speeds up processing for non-scanned documents)", ) translation_group.add_argument( "--ocr-workaround", action="store_true", default=False, help="Add text fill background (experimental)", ) translation_group.add_argument( "--custom-system-prompt", help="Custom system prompt for translation.", default=None, ) translation_group.add_argument( "--add-formula-placehold-hint", action="store_true", default=False, help="Add formula placeholder hint for translation. (Currently not recommended, it may affect translation quality, default: False)", ) translation_group.add_argument( "--disable-same-text-fallback", action="store_true", default=False, help="Disable fallback translation when LLM output matches input text. (default: False)", ) translation_group.add_argument( "--glossary-files", type=str, default=None, help="Comma-separated paths to glossary CSV files.", ) translation_group.add_argument( "--pool-max-workers", type=int, help="Maximum number of worker threads for internal task processing pools. If not specified, defaults to QPS value. This parameter directly sets the worker count, replacing previous QPS-based dynamic calculations.", ) translation_group.add_argument( "--term-pool-max-workers", type=int, help="Maximum number of worker threads dedicated to automatic term extraction. 
If not specified, defaults to --pool-max-workers (or QPS value when unset).", ) translation_group.add_argument( "--no-auto-extract-glossary", action="store_false", dest="auto_extract_glossary", default=True, help="Disable automatic term extraction. (Config file: set auto_extract_glossary = false)", ) translation_group.add_argument( "--auto-enable-ocr-workaround", action="store_true", default=False, help="Enable automatic OCR workaround. If a document is detected as heavily scanned, this will attempt to enable OCR processing and skip further scan detection. Note: This option interacts with `--ocr-workaround` and `--skip-scanned-detection`. See documentation for details. (default: False)", ) translation_group.add_argument( "--primary-font-family", type=str, choices=["serif", "sans-serif", "script"], default=None, help="Override primary font family for translated text. Choices: 'serif' for serif fonts, 'sans-serif' for sans-serif fonts, 'script' for script/italic fonts. If not specified, uses automatic font selection based on original text properties.", ) translation_group.add_argument( "--only-include-translated-page", action="store_true", default=False, help="Only include translated pages in the output PDF. Effective only when --pages is used.", ) translation_group.add_argument( "--save-auto-extracted-glossary", action="store_true", default=False, help="Save automatically extracted glossary terms to a CSV file in the output directory.", ) translation_group.add_argument( "--disable-graphic-element-process", action="store_true", default=False, help="Disable graphic element process. 
(default: False)", ) translation_group.add_argument( "--no-merge-alternating-line-numbers", action="store_false", dest="merge_alternating_line_numbers", default=True, help="Disable post-processing that merges alternating line-number layouts (by default this feature is enabled).", ) translation_group.add_argument( "--skip-translation", action="store_true", default=False, help="Skip translation step. (default: False)", ) translation_group.add_argument( "--skip-form-render", action="store_true", default=False, help="Skip form rendering. (default: False)", ) translation_group.add_argument( "--skip-curve-render", action="store_true", default=False, help="Skip curve rendering. (default: False)", ) translation_group.add_argument( "--only-parse-generate-pdf", action="store_true", default=False, help="Only parse PDF and generate output PDF without translation (default: False). This skips all translation-related processing including layout analysis, paragraph finding, style processing, and translation itself.", ) translation_group.add_argument( "--remove-non-formula-lines", action="store_true", default=False, help="Remove non-formula lines from paragraph areas. This removes decorative lines that are not part of formulas, while protecting lines in figure/table areas. (default: False)", ) translation_group.add_argument( "--non-formula-line-iou-threshold", type=float, default=0.9, help="IoU threshold for detecting paragraph overlap when removing non-formula lines. Higher values are more conservative. (default: 0.9)", ) translation_group.add_argument( "--figure-table-protection-threshold", type=float, default=0.9, help="IoU threshold for protecting lines in figure/table areas when removing non-formula lines. Higher values provide more protection. 
(default: 0.9)", ) translation_group.add_argument( "--skip-formula-offset-calculation", action="store_true", default=False, help="Skip formula offset calculation (default: False)", ) # service option argument group service_group = translation_group.add_mutually_exclusive_group() service_group.add_argument( "--openai", action="store_true", help="Use OpenAI translator.", ) service_group = parser.add_argument_group( "Translation - OpenAI Options", description="OpenAI specific options", ) service_group.add_argument( "--openai-model", default="gpt-4o-mini", help="The OpenAI model to use for translation.", ) service_group.add_argument( "--openai-base-url", help="The base URL for the OpenAI API.", ) service_group.add_argument( "--openai-api-key", "-k", help="The API key for the OpenAI API.", ) service_group.add_argument( "--openai-term-extraction-model", default=None, help="OpenAI model to use for automatic term extraction. Defaults to --openai-model when unset.", ) service_group.add_argument( "--openai-term-extraction-base-url", default=None, help="Base URL for the OpenAI API used during automatic term extraction. Falls back to --openai-base-url when unset.", ) service_group.add_argument( "--openai-term-extraction-api-key", default=None, help="API key for the OpenAI API used during automatic term extraction. 
async def main():
    """Run the command-line translation workflow.

    Steps, in order:

    1. Parse the command line / config file.
    2. Handle the one-shot modes (offline asset generation or restoration,
       warmup) and return early when one of them is requested.
    3. Validate the translator options and the input file list.
    4. Build the translator(s), the layout model, the optional table model
       and the glossaries.
    5. Translate every requested PDF in turn, feeding progress events to the
       progress handler.
    6. Log the accumulated token usage.

    Invalid argument combinations terminate through ``parser.error``. A
    missing or non-PDF input file, or a directory that cannot be created,
    terminates through ``sys.exit(1)``.
    """
    parser = create_parser()
    args: Any = parser.parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    if args.generate_offline_assets:
        babeldoc.assets.assets.generate_offline_assets_package(
            Path(args.generate_offline_assets)
        )
        logger.info("Offline assets package generated, exiting...")
        return

    if args.restore_offline_assets:
        babeldoc.assets.assets.restore_offline_assets_package(
            Path(args.restore_offline_assets)
        )
        logger.info("Offline assets package restored, exiting...")
        return

    if args.warmup:
        babeldoc.assets.assets.warmup()
        logger.info("Warmup completed, exiting...")
        return

    # Validate the translation service selection.
    if not args.openai:
        parser.error("必须选择一个翻译服务:--openai")

    # Validate the OpenAI parameters.
    if args.openai and not args.openai_api_key:
        parser.error("使用 OpenAI 服务时必须提供 API key")

    # --files uses action="append", so it is None (not an empty list) when
    # the option was never given. Reject that here, before any model is
    # loaded, instead of failing later while iterating over None.
    if not args.files:
        parser.error("No input files specified; use --files")

    if args.enable_process_pool:
        enable_process_pool()

    # Instantiate the translator(s).
    if args.openai:
        translator_kwargs: dict[str, Any] = {}
        if args.openai_reasoning is not None:
            translator_kwargs["reasoning"] = args.openai_reasoning
        translator = OpenAITranslator(
            lang_in=args.lang_in,
            lang_out=args.lang_out,
            model=args.openai_model,
            base_url=args.openai_base_url,
            api_key=args.openai_api_key,
            ignore_cache=args.ignore_cache,
            enable_json_mode_if_requested=args.enable_json_mode_if_requested,
            send_dashscope_header=args.send_dashscope_header,
            send_temperature=not args.no_send_temperature,
            **translator_kwargs,
        )
        # Term extraction shares the main translator unless a dedicated
        # model, base URL or API key was supplied.
        # NOTE(review): --openai-term-extraction-reasoning alone does not
        # trigger a dedicated translator; confirm that this is intended.
        term_extraction_translator = translator
        if (
            args.openai_term_extraction_model
            or args.openai_term_extraction_base_url
            or args.openai_term_extraction_api_key
        ):
            term_translator_kwargs: dict[str, Any] = {}
            if args.openai_term_extraction_reasoning is not None:
                term_translator_kwargs["reasoning"] = (
                    args.openai_term_extraction_reasoning
                )
            term_extraction_translator = OpenAITranslator(
                lang_in=args.lang_in,
                lang_out=args.lang_out,
                model=args.openai_term_extraction_model or args.openai_model,
                base_url=(args.openai_term_extraction_base_url or args.openai_base_url),
                api_key=args.openai_term_extraction_api_key or args.openai_api_key,
                ignore_cache=args.ignore_cache,
                enable_json_mode_if_requested=args.enable_json_mode_if_requested,
                send_dashscope_header=args.send_dashscope_header,
                send_temperature=not args.no_send_temperature,
                **term_translator_kwargs,
            )
    else:
        raise ValueError("Invalid translator type")

    # Set the translation rate limit.
    set_translate_rate_limiter(args.qps)

    # Initialise the document layout model. The first RPC option that is set
    # wins; without any of them the local ONNX model is loaded.
    if args.rpc_doclayout:
        from babeldoc.docvision.rpc_doclayout import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout)
    elif args.rpc_doclayout2:
        from babeldoc.docvision.rpc_doclayout2 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout2)
    elif args.rpc_doclayout3:
        from babeldoc.docvision.rpc_doclayout3 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout3)
    elif args.rpc_doclayout4:
        from babeldoc.docvision.rpc_doclayout4 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout4)
    elif args.rpc_doclayout5:
        from babeldoc.docvision.rpc_doclayout5 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout5)
    elif args.rpc_doclayout6:
        from babeldoc.docvision.rpc_doclayout6 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout6)
    elif args.rpc_doclayout7:
        from babeldoc.docvision.rpc_doclayout7 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout7)
    else:
        from babeldoc.docvision.doclayout import DocLayoutModel

        doc_layout_model = DocLayoutModel.load_onnx()

    if args.translate_table_text:
        from babeldoc.docvision.table_detection.rapidocr import RapidOCRModel

        table_model = RapidOCRModel()
    else:
        table_model = None

    # Load glossaries. A file that is missing or fails to load is logged and
    # skipped; it does not abort the run.
    loaded_glossaries: list[Glossary] = []
    if args.glossary_files:
        paths_str = args.glossary_files.split(",")
        for p_str in paths_str:
            file_path = Path(p_str.strip())
            if not file_path.exists():
                logger.error(f"Glossary file not found: {file_path}")
                continue
            if not file_path.is_file():
                logger.error(f"Glossary path is not a file: {file_path}")
                continue
            try:
                glossary_obj = Glossary.from_csv(file_path, args.lang_out)
                if glossary_obj.entries:
                    loaded_glossaries.append(glossary_obj)
                    logger.info(
                        f"Loaded glossary '{glossary_obj.name}' with {len(glossary_obj.entries)} entries."
                    )
                else:
                    logger.info(
                        f"Glossary '{file_path.stem}' loaded with no applicable entries for lang_out '{args.lang_out}'."
                    )
            except Exception as e:
                logger.error(f"Failed to load glossary from {file_path}: {e}")

    pending_files = []
    for file in args.files:
        # Clean up the file path: drop a literal "--files=" prefix, leading
        # dashes and surrounding quotes.
        if file.startswith("--files="):
            file = file[len("--files=") :]
        file = file.lstrip("-").strip("\"'")
        if not Path(file).exists():
            logger.error(f"文件不存在:{file}")
            sys.exit(1)
        if not file.lower().endswith(".pdf"):
            logger.error(f"文件不是 PDF 文件:{file}")
            sys.exit(1)
        pending_files.append(file)

    if args.output:
        if not Path(args.output).exists():
            logger.info(f"输出目录不存在,创建:{args.output}")
            try:
                Path(args.output).mkdir(parents=True, exist_ok=True)
            except OSError:
                logger.critical(
                    f"Failed to create output folder at {args.output}",
                    exc_info=True,
                )
                sys.exit(1)
    else:
        # Normalise an empty value to None.
        args.output = None

    if args.working_dir:
        working_dir = Path(args.working_dir)
        if not working_dir.exists():
            logger.info(f"工作目录不存在,创建:{working_dir}")
            try:
                working_dir.mkdir(parents=True, exist_ok=True)
            except OSError:
                logger.critical(
                    f"Failed to create working directory at {working_dir}",
                    exc_info=True,
                )
                sys.exit(1)
    else:
        working_dir = None

    # The deprecated --no-watermark flag takes precedence over
    # --watermark-output-mode.
    watermark_output_mode = WatermarkOutputMode.Watermarked
    if args.no_watermark:
        watermark_output_mode = WatermarkOutputMode.NoWatermark
    elif args.watermark_output_mode == "both":
        watermark_output_mode = WatermarkOutputMode.Both
    elif args.watermark_output_mode == "watermarked":
        watermark_output_mode = WatermarkOutputMode.Watermarked
    elif args.watermark_output_mode == "no_watermark":
        watermark_output_mode = WatermarkOutputMode.NoWatermark

    split_strategy = None
    if args.max_pages_per_part:
        split_strategy = TranslationConfig.create_max_pages_per_part_split_strategy(
            args.max_pages_per_part
        )

    # Term-extraction token usage, accumulated over all files.
    total_term_extraction_total_tokens = 0
    total_term_extraction_prompt_tokens = 0
    total_term_extraction_completion_tokens = 0
    total_term_extraction_cache_hit_prompt_tokens = 0

    for file in pending_files:
        # Clean up the file path: strip surrounding quotes.
        file = file.strip("\"'")
        # Create the configuration object.
        config = TranslationConfig(
            input_file=file,
            font=None,
            pages=args.pages,
            output_dir=args.output,
            translator=translator,
            term_extraction_translator=term_extraction_translator,
            debug=args.debug,
            lang_in=args.lang_in,
            lang_out=args.lang_out,
            no_dual=args.no_dual,
            no_mono=args.no_mono,
            qps=args.qps,
            formular_font_pattern=args.formular_font_pattern,
            formular_char_pattern=args.formular_char_pattern,
            split_short_lines=args.split_short_lines,
            short_line_split_factor=args.short_line_split_factor,
            doc_layout_model=doc_layout_model,
            skip_clean=args.skip_clean,
            dual_translate_first=args.dual_translate_first,
            disable_rich_text_translate=args.disable_rich_text_translate,
            enhance_compatibility=args.enhance_compatibility,
            use_alternating_pages_dual=args.use_alternating_pages_dual,
            report_interval=args.report_interval,
            min_text_length=args.min_text_length,
            watermark_output_mode=watermark_output_mode,
            split_strategy=split_strategy,
            table_model=table_model,
            show_char_box=args.show_char_box,
            skip_scanned_detection=args.skip_scanned_detection,
            ocr_workaround=args.ocr_workaround,
            custom_system_prompt=args.custom_system_prompt,
            working_dir=working_dir,
            add_formula_placehold_hint=args.add_formula_placehold_hint,
            disable_same_text_fallback=args.disable_same_text_fallback,
            glossaries=loaded_glossaries,
            pool_max_workers=args.pool_max_workers,
            auto_extract_glossary=args.auto_extract_glossary,
            auto_enable_ocr_workaround=args.auto_enable_ocr_workaround,
            primary_font_family=args.primary_font_family,
            only_include_translated_page=args.only_include_translated_page,
            save_auto_extracted_glossary=args.save_auto_extracted_glossary,
            enable_graphic_element_process=not args.disable_graphic_element_process,
            merge_alternating_line_numbers=args.merge_alternating_line_numbers,
            skip_translation=args.skip_translation,
            skip_form_render=args.skip_form_render,
            skip_curve_render=args.skip_curve_render,
            only_parse_generate_pdf=args.only_parse_generate_pdf,
            remove_non_formula_lines=args.remove_non_formula_lines,
            non_formula_line_iou_threshold=args.non_formula_line_iou_threshold,
            figure_table_protection_threshold=args.figure_table_protection_threshold,
            skip_formula_offset_calculation=args.skip_formula_offset_calculation,
            metadata_extra_data=args.metadata_extra_data,
            term_pool_max_workers=args.term_pool_max_workers,
        )

        def nop(_x):
            pass

        # Not every layout model defines init_font_mapper; call it only when
        # it is present.
        getattr(doc_layout_model, "init_font_mapper", nop)(config)

        # Create progress handler
        progress_context, progress_handler = create_progress_handler(
            config, show_log=False
        )

        # Start translating; stop consuming events at the first error or
        # finish event.
        with progress_context:
            async for event in babeldoc.format.pdf.high_level.async_translate(config):
                progress_handler(event)
                if config.debug:
                    logger.debug(event)
                if event["type"] == "error":
                    logger.error(f"Error: {event['error']}")
                    break
                if event["type"] == "finish":
                    result = event["translate_result"]
                    logger.info(str(result))
                    break

        usage = config.term_extraction_token_usage
        total_term_extraction_total_tokens += usage["total_tokens"]
        total_term_extraction_prompt_tokens += usage["prompt_tokens"]
        total_term_extraction_completion_tokens += usage["completion_tokens"]
        total_term_extraction_cache_hit_prompt_tokens += usage[
            "cache_hit_prompt_tokens"
        ]

    logger.info(f"Total tokens: {translator.token_count.value}")
    logger.info(f"Prompt tokens: {translator.prompt_token_count.value}")
    logger.info(f"Completion tokens: {translator.completion_token_count.value}")
    logger.info(
        f"Cache hit prompt tokens: {translator.cache_hit_prompt_token_count.value}"
    )
    logger.info(
        "Term extraction tokens: total=%s prompt=%s completion=%s cache_hit_prompt=%s",
        total_term_extraction_total_tokens,
        total_term_extraction_prompt_tokens,
        total_term_extraction_completion_tokens,
        total_term_extraction_cache_hit_prompt_tokens,
    )
    if term_extraction_translator is not translator:
        logger.info(
            "Term extraction translator raw tokens: total=%s prompt=%s completion=%s cache_hit_prompt=%s",
            term_extraction_translator.token_count.value,
            term_extraction_translator.prompt_token_count.value,
            term_extraction_translator.completion_token_count.value,
            term_extraction_translator.cache_hit_prompt_token_count.value,
        )
class EvictQueue(queue.Queue):
    """A bounded FIFO queue whose ``put`` never blocks.

    When the queue is full, the oldest queued item is removed to make room
    for the new one. ``discarded`` counts how many items were dropped that
    way.
    """

    def __init__(self, maxsize):
        # Set before the base initialiser runs so the counter always exists.
        self.discarded = 0
        super().__init__(maxsize)

    def put(self, item, block=False, timeout=None):
        """Enqueue *item*, evicting the oldest entries while the queue is full.

        ``block`` and ``timeout`` are accepted for signature compatibility
        with ``queue.Queue.put`` but are ignored: the call never blocks.
        """
        while True:
            try:
                super().put(item, block=False)
                break
            except queue.Full:
                try:
                    self.get_nowait()
                except queue.Empty:
                    # A consumer drained the queue in the meantime; retry.
                    continue
                # The evicted item will never reach a consumer, so settle its
                # task accounting here. Otherwise unfinished_tasks would
                # over-count forever and join() could never return.
                self.task_done()
                self.discarded += 1
if __name__ == "__main__":
    # macOS and Windows start worker processes with "spawn"; every other
    # platform uses "forkserver". The start method is fixed before the CLI
    # runs.
    start_method = "spawn" if sys.platform in ("darwin", "win32") else "forkserver"
    mp.set_start_method(start_method)
    cli()
================================================ FILE: babeldoc/pdfminer/__init__.py ================================================ from importlib.metadata import PackageNotFoundError from importlib.metadata import version try: __version__ = version("pdfminer.six") except PackageNotFoundError: # package is not installed, return default __version__ = "0.0" if __name__ == "__main__": print(__version__) ================================================ FILE: babeldoc/pdfminer/_saslprep.py ================================================ # Copyright 2016-present MongoDB, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Some changes copyright 2021-present Matthias Valvekens, # licensed under the license of the pyHanko project (see LICENSE file). """An implementation of RFC4013 SASLprep.""" __all__ = ["saslprep"] import stringprep import unicodedata from collections.abc import Callable from babeldoc.pdfminer.pdfexceptions import PDFValueError # RFC4013 section 2.3 prohibited output. _PROHIBITED: tuple[Callable[[str], bool], ...] = ( # A strict reading of RFC 4013 requires table c12 here, but # characters from it are mapped to SPACE in the Map step. Can # normalization reintroduce them somehow? 
# RFC4013 section 2.3 prohibited output.
_PROHIBITED: tuple[Callable[[str], bool], ...] = (
    # A strict reading of RFC 4013 requires table c12 here, but
    # characters from it are mapped to SPACE in the Map step. Can
    # normalization reintroduce them somehow?
    stringprep.in_table_c12,
    stringprep.in_table_c21_c22,
    stringprep.in_table_c3,
    stringprep.in_table_c4,
    stringprep.in_table_c5,
    stringprep.in_table_c6,
    stringprep.in_table_c7,
    stringprep.in_table_c8,
    stringprep.in_table_c9,
)


def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
    """An implementation of RFC4013 SASLprep.

    :param data:
        The string to SASLprep.
    :param prohibit_unassigned_code_points:
        RFC 3454 and RFCs for various SASL mechanisms distinguish between
        `queries` (unassigned code points allowed) and
        `stored strings` (unassigned code points prohibited). Defaults
        to ``True`` (unassigned code points are prohibited).
    :return: The SASLprep'ed version of `data`. An input that is empty, or
        that becomes empty after the mapping step, yields ``""``.
    :raises PDFValueError: if the bidirectional check fails or the string
        contains a prohibited character.
    """
    if prohibit_unassigned_code_points:
        prohibited = _PROHIBITED + (stringprep.in_table_a1,)
    else:
        prohibited = _PROHIBITED

    # RFC3454 section 2, step 1 - Map
    # RFC4013 section 2.1 mappings
    # Map Non-ASCII space characters to SPACE (U+0020). Map
    # commonly mapped to nothing characters to, well, nothing.
    in_table_c12 = stringprep.in_table_c12
    in_table_b1 = stringprep.in_table_b1
    data = "".join(
        "\u0020" if in_table_c12(elt) else elt
        for elt in data
        if not in_table_b1(elt)
    )

    # RFC3454 section 2, step 2 - Normalize
    # RFC4013 section 2.2 normalization
    data = unicodedata.ucd_3_2_0.normalize("NFKC", data)

    # Nothing left to check: the bidi test below indexes the first and last
    # characters and would otherwise fail with IndexError.
    if not data:
        return data

    in_table_d1 = stringprep.in_table_d1
    if in_table_d1(data[0]):
        if not in_table_d1(data[-1]):
            # RFC3454, Section 6, #3. If a string contains any
            # RandALCat character, the first and last characters
            # MUST be RandALCat characters.
            raise PDFValueError("SASLprep: failed bidirectional check")
        # RFC3454, Section 6, #2. If a string contains any RandALCat
        # character, it MUST NOT contain any LCat character.
        prohibited = prohibited + (stringprep.in_table_d2,)
    else:
        # RFC3454, Section 6, #3. Following the logic of #3, if
        # the first character is not a RandALCat, no other character
        # can be either.
        prohibited = prohibited + (in_table_d1,)

    # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
    for char in data:
        if any(in_table(char) for in_table in prohibited):
            raise PDFValueError("SASLprep: failed prohibited character check")

    return data
class Arcfour:
    """Stateful Arcfour (RC4) stream cipher.

    The key schedule runs once in the constructor. Every call to
    :meth:`process` continues the keystream where the previous call stopped,
    so a message may be fed in several chunks.
    """

    def __init__(self, key: Sequence[int]) -> None:
        """Initialise the permutation table from *key* (a sequence of byte values)."""
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            s[i], s[j] = s[j], s[i]
        self.s = s
        self.i, self.j = 0, 0

    def process(self, data: bytes) -> bytes:
        """XOR *data* with the next ``len(data)`` keystream bytes and return the result.

        Encryption and decryption are the same operation.
        """
        i, j = self.i, self.j
        s = self.s
        # Collect into a bytearray: appending to an immutable ``bytes`` object
        # for every input byte copies the whole buffer each time.
        out = bytearray()
        for c in data:
            i = (i + 1) % 256
            j = (j + s[i]) % 256
            s[i], s[j] = s[j], s[i]
            k = s[(s[i] + s[j]) % 256]
            out.append(c ^ k)
        self.i, self.j = i, j
        return bytes(out)

    encrypt = decrypt = process
# Optional leading delimiter: ``<~`` or a bare ``~``, with surrounding whitespace.
start_re = re.compile(rb"^\s*<?\s*~\s*")
# Optional trailing delimiter: ``~>`` or a bare ``~``, with surrounding whitespace.
end_re = re.compile(rb"\s*~\s*>?\s*$")


def ascii85decode(data: bytes) -> bytes:
    """In ASCII85 encoding, every four bytes are encoded with five ASCII
    letters, using 85 different types of characters (as 256**4 < 85**5).
    When the length of the original bytes is not a multiple of 4, a special
    rule is used for round up.

    Adobe's ASCII85 implementation expects the input to be terminated
    by `b"~>"`, and (though this is absent from the PDF spec) it can
    also begin with `b"<~"`.  We can't reliably expect this to be the
    case, and there can be off-by-one errors in stream lengths which
    mean we only see `~` at the end.  Worse yet, `<` and `>` are
    ASCII85 digits, so we can't strip them.  We settle on a compromise
    where we strip leading `<~` or `~` and trailing `~` or `~>`.
    """
    # Strip the trailer first so that a stream consisting only of the
    # terminator (``~>``) is not mistaken for a leading ``~`` followed by
    # the digit ``>``.
    data = end_re.sub(b"", data)
    data = start_re.sub(b"", data)
    return a85decode(data)


bws_re = re.compile(rb"\s")


def asciihexdecode(data: bytes) -> bytes:
    """ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored. A right angle bracket character (>) indicates
    EOD. Any other characters will cause an error. If the filter encounters
    the EOD marker after reading an odd number of hexadecimal digits, it
    will behave as if a 0 followed the last digit.

    A stream whose EOD marker is missing is padded the same way, so a
    truncated stream with an odd number of digits still decodes.
    """
    data = bws_re.sub(b"", data)
    idx = data.find(b">")
    if idx != -1:
        data = data[:idx]
    if len(data) % 2 == 1:
        data += b"0"
    return unhexlify(data)
def safe_int(o: Any) -> int | None:
    """Return ``int(o)``, or ``None`` when *o* cannot be converted.

    Infinite floats (``OverflowError``) are treated like any other
    unconvertible value instead of escaping to the caller.
    """
    try:
        return int(o)
    except (TypeError, ValueError, OverflowError):
        return None


def safe_float(o: Any) -> float | None:
    """Return ``float(o)``, or ``None`` when *o* cannot be converted.

    Integers too large for a float (``OverflowError``) yield ``None`` as well.
    """
    try:
        return float(o)
    except (TypeError, ValueError, OverflowError):
        return None


def safe_matrix(a: Any, b: Any, c: Any, d: Any, e: Any, f: Any) -> Matrix | None:
    """Convert six values to a float 6-tuple, or ``None`` if any of them fails."""
    a_f = safe_float(a)
    b_f = safe_float(b)
    c_f = safe_float(c)
    d_f = safe_float(d)
    e_f = safe_float(e)
    f_f = safe_float(f)

    if (
        a_f is None
        or b_f is None
        or c_f is None
        or d_f is None
        or e_f is None
        or f_f is None
    ):
        return None

    return a_f, b_f, c_f, d_f, e_f, f_f


def safe_rgb(r: Any, g: Any, b: Any) -> tuple[float, float, float] | None:
    """Convert three colour components to floats, or ``None`` if any fails."""
    return _safe_float_triple(r, g, b)


def safe_cmyk(
    c: Any, m: Any, y: Any, k: Any
) -> tuple[float, float, float, float] | None:
    """Convert four colour components to floats, or ``None`` if any fails."""
    return _safe_float_quadruple(c, m, y, k)


def safe_rect_list(value: Any) -> Rect | None:
    """Build a rectangle from the first four items of an iterable.

    Returns ``None`` when *value* is not iterable, yields fewer than four
    items, or any of the four items is not convertible to float. Only the
    first four items are consumed.
    """
    try:
        values = list(itertools.islice(value, 4))
    except TypeError:
        return None

    if len(values) != 4:
        return None

    return safe_rect(*values)


def safe_rect(a: Any, b: Any, c: Any, d: Any) -> Rect | None:
    """Convert four values to a float 4-tuple, or ``None`` if any fails."""
    return _safe_float_quadruple(a, b, c, d)


def _safe_float_triple(a: Any, b: Any, c: Any) -> _FloatTriple | None:
    """Shared implementation for three-component conversions."""
    a_f = safe_float(a)
    b_f = safe_float(b)
    c_f = safe_float(c)

    if a_f is None or b_f is None or c_f is None:
        return None

    return a_f, b_f, c_f


def _safe_float_quadruple(a: Any, b: Any, c: Any, d: Any) -> _FloatQuadruple | None:
    """Shared implementation for four-component conversions."""
    a_f = safe_float(a)
    b_f = safe_float(b)
    c_f = safe_float(c)
    d_f = safe_float(d)

    if a_f is None or b_f is None or c_f is None or d_f is None:
        return None

    return a_f, b_f, c_f, d_f
# ITU-T Recommendation T.4 # "Standardization of Group 3 facsimile terminals # for document transmission" # ITU-T Recommendation T.6 # "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS # FOR GROUP 4 FACSIMILE APPARATUS" import array from collections.abc import Callable from collections.abc import Iterator from collections.abc import MutableSequence from collections.abc import Sequence from typing import Any from typing import cast from babeldoc.pdfminer.pdfexceptions import PDFException from babeldoc.pdfminer.pdfexceptions import PDFValueError def get_bytes(data: bytes) -> Iterator[int]: yield from data # Workaround https://github.com/python/mypy/issues/731 BitParserState = MutableSequence[Any] # A better definition (not supported by mypy) would be: # BitParserState = MutableSequence[Union["BitParserState", int, str, None]] class BitParser: _state: BitParserState # _accept is declared Optional solely as a workaround for # https://github.com/python/mypy/issues/708 _accept: Callable[[Any], BitParserState] | None def __init__(self) -> None: self._pos = 0 @classmethod def add(cls, root: BitParserState, v: int | str, bits: str) -> None: p: BitParserState = root b = None for i in range(len(bits)): if i > 0: assert b is not None if p[b] is None: p[b] = [None, None] p = p[b] if bits[i] == "1": b = 1 else: b = 0 assert b is not None p[b] = v def feedbytes(self, data: bytes) -> None: for byte in get_bytes(data): for m in (128, 64, 32, 16, 8, 4, 2, 1): self._parse_bit(byte & m) def _parse_bit(self, x: object) -> None: if x: v = self._state[1] else: v = self._state[0] self._pos += 1 if isinstance(v, list): self._state = v else: assert self._accept is not None self._state = self._accept(v) class CCITTG4Parser(BitParser): MODE = [None, None] BitParser.add(MODE, 0, "1") BitParser.add(MODE, +1, "011") BitParser.add(MODE, -1, "010") BitParser.add(MODE, "h", "001") BitParser.add(MODE, "p", "0001") BitParser.add(MODE, +2, "000011") BitParser.add(MODE, -2, "000010") 
class CCITTG4Parser(BitParser):
    """Bit-level parser for CCITT Group 4 (ITU-T T.6) encoded image data.

    The parser switches between four code trees (``MODE``, ``WHITE``,
    ``BLACK`` and ``UNCOMPRESSED``) and rebuilds the image one scan line at
    a time. Each finished line is handed to :meth:`output_line`. Pixels are
    stored as 0/1 values; the ``WHITE`` tree is used while ``_color`` is 1
    and the ``BLACK`` tree while it is 0.
    """

    # Mode codes. Integer leaves are offsets passed to _do_vertical, "h"
    # starts a pair of horizontal runs, "p" triggers _do_pass, "u" switches
    # to the UNCOMPRESSED tree and "e" raises EOFB. The "x1".."x7" leaves
    # have no handler in _parse_mode and end up as InvalidData.
    MODE = [None, None]
    BitParser.add(MODE, 0, "1")
    BitParser.add(MODE, +1, "011")
    BitParser.add(MODE, -1, "010")
    BitParser.add(MODE, "h", "001")
    BitParser.add(MODE, "p", "0001")
    BitParser.add(MODE, +2, "000011")
    BitParser.add(MODE, -2, "000010")
    BitParser.add(MODE, +3, "0000011")
    BitParser.add(MODE, -3, "0000010")
    BitParser.add(MODE, "u", "0000001111")
    BitParser.add(MODE, "x1", "0000001000")
    BitParser.add(MODE, "x2", "0000001001")
    BitParser.add(MODE, "x3", "0000001010")
    BitParser.add(MODE, "x4", "0000001011")
    BitParser.add(MODE, "x5", "0000001100")
    BitParser.add(MODE, "x6", "0000001101")
    BitParser.add(MODE, "x7", "0000001110")
    BitParser.add(MODE, "e", "000000000001000000000001")

    # Run-length codes used while _color is 1. Values below 64 end a run;
    # values of 64 and above are followed by further codes of the same
    # colour (see _parse_horiz1 / _parse_horiz2).
    WHITE = [None, None]
    BitParser.add(WHITE, 0, "00110101")
    BitParser.add(WHITE, 1, "000111")
    BitParser.add(WHITE, 2, "0111")
    BitParser.add(WHITE, 3, "1000")
    BitParser.add(WHITE, 4, "1011")
    BitParser.add(WHITE, 5, "1100")
    BitParser.add(WHITE, 6, "1110")
    BitParser.add(WHITE, 7, "1111")
    BitParser.add(WHITE, 8, "10011")
    BitParser.add(WHITE, 9, "10100")
    BitParser.add(WHITE, 10, "00111")
    BitParser.add(WHITE, 11, "01000")
    BitParser.add(WHITE, 12, "001000")
    BitParser.add(WHITE, 13, "000011")
    BitParser.add(WHITE, 14, "110100")
    BitParser.add(WHITE, 15, "110101")
    BitParser.add(WHITE, 16, "101010")
    BitParser.add(WHITE, 17, "101011")
    BitParser.add(WHITE, 18, "0100111")
    BitParser.add(WHITE, 19, "0001100")
    BitParser.add(WHITE, 20, "0001000")
    BitParser.add(WHITE, 21, "0010111")
    BitParser.add(WHITE, 22, "0000011")
    BitParser.add(WHITE, 23, "0000100")
    BitParser.add(WHITE, 24, "0101000")
    BitParser.add(WHITE, 25, "0101011")
    BitParser.add(WHITE, 26, "0010011")
    BitParser.add(WHITE, 27, "0100100")
    BitParser.add(WHITE, 28, "0011000")
    BitParser.add(WHITE, 29, "00000010")
    BitParser.add(WHITE, 30, "00000011")
    BitParser.add(WHITE, 31, "00011010")
    BitParser.add(WHITE, 32, "00011011")
    BitParser.add(WHITE, 33, "00010010")
    BitParser.add(WHITE, 34, "00010011")
    BitParser.add(WHITE, 35, "00010100")
    BitParser.add(WHITE, 36, "00010101")
    BitParser.add(WHITE, 37, "00010110")
    BitParser.add(WHITE, 38, "00010111")
    BitParser.add(WHITE, 39, "00101000")
    BitParser.add(WHITE, 40, "00101001")
    BitParser.add(WHITE, 41, "00101010")
    BitParser.add(WHITE, 42, "00101011")
    BitParser.add(WHITE, 43, "00101100")
    BitParser.add(WHITE, 44, "00101101")
    BitParser.add(WHITE, 45, "00000100")
    BitParser.add(WHITE, 46, "00000101")
    BitParser.add(WHITE, 47, "00001010")
    BitParser.add(WHITE, 48, "00001011")
    BitParser.add(WHITE, 49, "01010010")
    BitParser.add(WHITE, 50, "01010011")
    BitParser.add(WHITE, 51, "01010100")
    BitParser.add(WHITE, 52, "01010101")
    BitParser.add(WHITE, 53, "00100100")
    BitParser.add(WHITE, 54, "00100101")
    BitParser.add(WHITE, 55, "01011000")
    BitParser.add(WHITE, 56, "01011001")
    BitParser.add(WHITE, 57, "01011010")
    BitParser.add(WHITE, 58, "01011011")
    BitParser.add(WHITE, 59, "01001010")
    BitParser.add(WHITE, 60, "01001011")
    BitParser.add(WHITE, 61, "00110010")
    BitParser.add(WHITE, 62, "00110011")
    BitParser.add(WHITE, 63, "00110100")
    BitParser.add(WHITE, 64, "11011")
    BitParser.add(WHITE, 128, "10010")
    BitParser.add(WHITE, 192, "010111")
    BitParser.add(WHITE, 256, "0110111")
    BitParser.add(WHITE, 320, "00110110")
    BitParser.add(WHITE, 384, "00110111")
    BitParser.add(WHITE, 448, "01100100")
    BitParser.add(WHITE, 512, "01100101")
    BitParser.add(WHITE, 576, "01101000")
    BitParser.add(WHITE, 640, "01100111")
    BitParser.add(WHITE, 704, "011001100")
    BitParser.add(WHITE, 768, "011001101")
    BitParser.add(WHITE, 832, "011010010")
    BitParser.add(WHITE, 896, "011010011")
    BitParser.add(WHITE, 960, "011010100")
    BitParser.add(WHITE, 1024, "011010101")
    BitParser.add(WHITE, 1088, "011010110")
    BitParser.add(WHITE, 1152, "011010111")
    BitParser.add(WHITE, 1216, "011011000")
    BitParser.add(WHITE, 1280, "011011001")
    BitParser.add(WHITE, 1344, "011011010")
    BitParser.add(WHITE, 1408, "011011011")
    BitParser.add(WHITE, 1472, "010011000")
    BitParser.add(WHITE, 1536, "010011001")
    BitParser.add(WHITE, 1600, "010011010")
    BitParser.add(WHITE, 1664, "011000")
    BitParser.add(WHITE, 1728, "010011011")
    BitParser.add(WHITE, 1792, "00000001000")
    BitParser.add(WHITE, 1856, "00000001100")
    BitParser.add(WHITE, 1920, "00000001101")
    BitParser.add(WHITE, 1984, "000000010010")
    BitParser.add(WHITE, 2048, "000000010011")
    BitParser.add(WHITE, 2112, "000000010100")
    BitParser.add(WHITE, 2176, "000000010101")
    BitParser.add(WHITE, 2240, "000000010110")
    BitParser.add(WHITE, 2304, "000000010111")
    BitParser.add(WHITE, 2368, "000000011100")
    BitParser.add(WHITE, 2432, "000000011101")
    BitParser.add(WHITE, 2496, "000000011110")
    BitParser.add(WHITE, 2560, "000000011111")

    # Run-length codes used while _color is 0; same layout as WHITE.
    BLACK = [None, None]
    BitParser.add(BLACK, 0, "0000110111")
    BitParser.add(BLACK, 1, "010")
    BitParser.add(BLACK, 2, "11")
    BitParser.add(BLACK, 3, "10")
    BitParser.add(BLACK, 4, "011")
    BitParser.add(BLACK, 5, "0011")
    BitParser.add(BLACK, 6, "0010")
    BitParser.add(BLACK, 7, "00011")
    BitParser.add(BLACK, 8, "000101")
    BitParser.add(BLACK, 9, "000100")
    BitParser.add(BLACK, 10, "0000100")
    BitParser.add(BLACK, 11, "0000101")
    BitParser.add(BLACK, 12, "0000111")
    BitParser.add(BLACK, 13, "00000100")
    BitParser.add(BLACK, 14, "00000111")
    BitParser.add(BLACK, 15, "000011000")
    BitParser.add(BLACK, 16, "0000010111")
    BitParser.add(BLACK, 17, "0000011000")
    BitParser.add(BLACK, 18, "0000001000")
    BitParser.add(BLACK, 19, "00001100111")
    BitParser.add(BLACK, 20, "00001101000")
    BitParser.add(BLACK, 21, "00001101100")
    BitParser.add(BLACK, 22, "00000110111")
    BitParser.add(BLACK, 23, "00000101000")
    BitParser.add(BLACK, 24, "00000010111")
    BitParser.add(BLACK, 25, "00000011000")
    BitParser.add(BLACK, 26, "000011001010")
    BitParser.add(BLACK, 27, "000011001011")
    BitParser.add(BLACK, 28, "000011001100")
    BitParser.add(BLACK, 29, "000011001101")
    BitParser.add(BLACK, 30, "000001101000")
    BitParser.add(BLACK, 31, "000001101001")
    BitParser.add(BLACK, 32, "000001101010")
    BitParser.add(BLACK, 33, "000001101011")
    BitParser.add(BLACK, 34, "000011010010")
    BitParser.add(BLACK, 35, "000011010011")
    BitParser.add(BLACK, 36, "000011010100")
    BitParser.add(BLACK, 37, "000011010101")
    BitParser.add(BLACK, 38, "000011010110")
    BitParser.add(BLACK, 39, "000011010111")
    BitParser.add(BLACK, 40, "000001101100")
    BitParser.add(BLACK, 41, "000001101101")
    BitParser.add(BLACK, 42, "000011011010")
    BitParser.add(BLACK, 43, "000011011011")
    BitParser.add(BLACK, 44, "000001010100")
    BitParser.add(BLACK, 45, "000001010101")
    BitParser.add(BLACK, 46, "000001010110")
    BitParser.add(BLACK, 47, "000001010111")
    BitParser.add(BLACK, 48, "000001100100")
    BitParser.add(BLACK, 49, "000001100101")
    BitParser.add(BLACK, 50, "000001010010")
    BitParser.add(BLACK, 51, "000001010011")
    BitParser.add(BLACK, 52, "000000100100")
    BitParser.add(BLACK, 53, "000000110111")
    BitParser.add(BLACK, 54, "000000111000")
    BitParser.add(BLACK, 55, "000000100111")
    BitParser.add(BLACK, 56, "000000101000")
    BitParser.add(BLACK, 57, "000001011000")
    BitParser.add(BLACK, 58, "000001011001")
    BitParser.add(BLACK, 59, "000000101011")
    BitParser.add(BLACK, 60, "000000101100")
    BitParser.add(BLACK, 61, "000001011010")
    BitParser.add(BLACK, 62, "000001100110")
    BitParser.add(BLACK, 63, "000001100111")
    BitParser.add(BLACK, 64, "0000001111")
    BitParser.add(BLACK, 128, "000011001000")
    BitParser.add(BLACK, 192, "000011001001")
    BitParser.add(BLACK, 256, "000001011011")
    BitParser.add(BLACK, 320, "000000110011")
    BitParser.add(BLACK, 384, "000000110100")
    BitParser.add(BLACK, 448, "000000110101")
    BitParser.add(BLACK, 512, "0000001101100")
    BitParser.add(BLACK, 576, "0000001101101")
    BitParser.add(BLACK, 640, "0000001001010")
    BitParser.add(BLACK, 704, "0000001001011")
    BitParser.add(BLACK, 768, "0000001001100")
    BitParser.add(BLACK, 832, "0000001001101")
    BitParser.add(BLACK, 896, "0000001110010")
    BitParser.add(BLACK, 960, "0000001110011")
    BitParser.add(BLACK, 1024, "0000001110100")
    BitParser.add(BLACK, 1088, "0000001110101")
    BitParser.add(BLACK, 1152, "0000001110110")
    BitParser.add(BLACK, 1216, "0000001110111")
    BitParser.add(BLACK, 1280, "0000001010010")
    BitParser.add(BLACK, 1344, "0000001010011")
    BitParser.add(BLACK, 1408, "0000001010100")
    BitParser.add(BLACK, 1472, "0000001010101")
    BitParser.add(BLACK, 1536, "0000001011010")
    BitParser.add(BLACK, 1600, "0000001011011")
    BitParser.add(BLACK, 1664, "0000001100100")
    BitParser.add(BLACK, 1728, "0000001100101")
    BitParser.add(BLACK, 1792, "00000001000")
    BitParser.add(BLACK, 1856, "00000001100")
    BitParser.add(BLACK, 1920, "00000001101")
    BitParser.add(BLACK, 1984, "000000010010")
    BitParser.add(BLACK, 2048, "000000010011")
    BitParser.add(BLACK, 2112, "000000010100")
    BitParser.add(BLACK, 2176, "000000010101")
    BitParser.add(BLACK, 2240, "000000010110")
    BitParser.add(BLACK, 2304, "000000010111")
    BitParser.add(BLACK, 2368, "000000011100")
    BitParser.add(BLACK, 2432, "000000011101")
    BitParser.add(BLACK, 2496, "000000011110")
    BitParser.add(BLACK, 2560, "000000011111")

    # Codes for uncompressed mode. Plain leaves are literal pixel strings.
    # Leaves starting with "T" leave uncompressed mode: the character after
    # "T" is the colour to continue with and the rest are literal pixels.
    UNCOMPRESSED = [None, None]
    BitParser.add(UNCOMPRESSED, "1", "1")
    BitParser.add(UNCOMPRESSED, "01", "01")
    BitParser.add(UNCOMPRESSED, "001", "001")
    BitParser.add(UNCOMPRESSED, "0001", "0001")
    BitParser.add(UNCOMPRESSED, "00001", "00001")
    BitParser.add(UNCOMPRESSED, "00000", "000001")
    BitParser.add(UNCOMPRESSED, "T00", "00000011")
    BitParser.add(UNCOMPRESSED, "T10", "00000010")
    BitParser.add(UNCOMPRESSED, "T000", "000000011")
    BitParser.add(UNCOMPRESSED, "T100", "000000010")
    BitParser.add(UNCOMPRESSED, "T0000", "0000000011")
    BitParser.add(UNCOMPRESSED, "T1000", "0000000010")
    BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
    BitParser.add(UNCOMPRESSED, "T10000", "00000000010")

    class CCITTException(PDFException):
        # Base class for the control-flow and error signals raised below.
        pass

    class EOFB(CCITTException):
        # Raised for the "e" mode code; feedbytes stops reading on it.
        pass

    class InvalidData(CCITTException):
        # Raised for a code that has no handler in the current state.
        pass

    class ByteSkip(CCITTException):
        # Raised by _flush_line when bytealign is set, so that feedbytes
        # drops the remaining bits of the current byte.
        pass

    # Colour of the run being decoded: 1 selects WHITE, 0 selects BLACK.
    _color: int

    def __init__(self, width: int, bytealign: bool = False) -> None:
        """Create a parser for lines of *width* pixels.

        :param width: number of pixels per scan line.
        :param bytealign: when true, the bits left in the current input
            byte are skipped after every completed line.
        """
        BitParser.__init__(self)
        self.width = width
        self.bytealign = bytealign
        self.reset()

    def feedbytes(self, data: bytes) -> None:
        """Decode *data*, stopping early when the EOFB code is seen."""
        for byte in get_bytes(data):
            try:
                for m in (128, 64, 32, 16, 8, 4, 2, 1):
                    self._parse_bit(byte & m)
            except self.ByteSkip:
                # Line finished with bytealign set: discard the rest of this
                # byte and start the next one in mode state.
                self._accept = self._parse_mode
                self._state = self.MODE
            except self.EOFB:
                break

    def _parse_mode(self, mode: object) -> BitParserState:
        """Handle a decoded MODE leaf and return the tree to parse next.

        :raises EOFB: for the "e" code.
        :raises InvalidData: for any leaf without a handler, including
            ``None`` from an unassigned code.
        """
        if mode == "p":
            self._do_pass()
            self._flush_line()
            return self.MODE
        elif mode == "h":
            # Two run lengths follow, the first in the current colour.
            self._n1 = 0
            self._accept = self._parse_horiz1
            if self._color:
                return self.WHITE
            else:
                return self.BLACK
        elif mode == "u":
            self._accept = self._parse_uncompressed
            return self.UNCOMPRESSED
        elif mode == "e":
            raise self.EOFB
        elif isinstance(mode, int):
            self._do_vertical(mode)
            self._flush_line()
            return self.MODE
        else:
            raise self.InvalidData(mode)

    def _parse_horiz1(self, n: Any) -> BitParserState:
        """Accumulate the first horizontal run length into ``_n1``."""
        if n is None:
            raise self.InvalidData
        self._n1 += n
        if n < 64:
            # Run complete: the second run uses the opposite colour's table.
            self._n2 = 0
            self._color = 1 - self._color
            self._accept = self._parse_horiz2
        if self._color:
            return self.WHITE
        else:
            return self.BLACK

    def _parse_horiz2(self, n: Any) -> BitParserState:
        """Accumulate the second horizontal run length, then emit both runs."""
        if n is None:
            raise self.InvalidData
        self._n2 += n
        if n < 64:
            # Flip back to the first run's colour before writing the pixels.
            self._color = 1 - self._color
            self._accept = self._parse_mode
            self._do_horizontal(self._n1, self._n2)
            self._flush_line()
            return self.MODE
        elif self._color:
            return self.WHITE
        else:
            return self.BLACK

    def _parse_uncompressed(self, bits: str | None) -> BitParserState:
        """Handle an UNCOMPRESSED leaf; "T..." leaves return to mode state."""
        if not bits:
            raise self.InvalidData
        if bits.startswith("T"):
            self._accept = self._parse_mode
            self._color = int(bits[1])
            self._do_uncompressed(bits[2:])
            return self.MODE
        else:
            self._do_uncompressed(bits)
            return self.UNCOMPRESSED

    def _get_bits(self) -> str:
        """Return the pixels of the current line before ``_curpos`` as a string."""
        return "".join(str(b) for b in self._curline[: self._curpos])

    def _get_refline(self, i: int) -> str:
        """Return the reference line as a string with position *i* bracketed.

        An *i* before the start or beyond the end is shown as ``[]`` at the
        corresponding end.
        """
        if i < 0:
            return "[]" + "".join(str(b) for b in self._refline)
        elif len(self._refline) <= i:
            return "".join(str(b) for b in self._refline) + "[]"
        else:
            return (
                "".join(str(b) for b in self._refline[:i])
                + "["
                + str(self._refline[i])
                + "]"
                + "".join(str(b) for b in self._refline[i + 1 :])
            )

    def reset(self) -> None:
        """Restart decoding at line 0 with an all-ones reference line."""
        self._y = 0
        self._curline = array.array("b", [1] * self.width)
        self._reset_line()
        self._accept = self._parse_mode
        self._state = self.MODE

    def output_line(self, y: int, bits: Sequence[int]) -> None:
        """Receive finished line *y*; this base implementation prints it."""
        print(y, "".join(str(b) for b in bits))

    def _reset_line(self) -> None:
        """Make the current line the reference line and start a fresh one."""
        self._refline = self._curline
        self._curline = array.array("b", [1] * self.width)
        # -1 marks "nothing written on this line yet".
        self._curpos = -1
        self._color = 1

    def _flush_line(self) -> None:
        """Emit the current line once ``_curpos`` has reached the line width.

        :raises ByteSkip: after emitting a line when ``bytealign`` is set.
        """
        if self.width <= self._curpos:
            self.output_line(self._y, self._curline)
            self._y += 1
            self._reset_line()
            if self.bytealign:
                raise self.ByteSkip

    def _do_vertical(self, dx: int) -> None:
        """Fill with the current colour up to a reference-line transition plus *dx*.

        Scans the reference line from ``_curpos + 1`` for the first position
        where it changes from the current colour to the other one (or for
        its end), offsets that position by *dx*, fills the pixels in between
        and toggles the colour.
        """
        x1 = self._curpos + 1
        while 1:
            if x1 == 0:
                if self._color == 1 and self._refline[x1] != self._color:
                    break
            elif x1 == len(self._refline) or (
                self._refline[x1 - 1] == self._color
                and self._refline[x1] != self._color
            ):
                break
            x1 += 1
        x1 += dx
        # Clamp both ends to the line; _curpos is -1 on an untouched line.
        x0 = max(0, self._curpos)
        x1 = max(0, min(self.width, x1))
        if x1 < x0:
            for x in range(x1, x0):
                self._curline[x] = self._color
        elif x0 < x1:
            for x in range(x0, x1):
                self._curline[x] = self._color
        self._curpos = x1
        self._color = 1 - self._color

    def _do_pass(self) -> None:
        """Fill with the current colour past the next two reference transitions.

        The first scan finds where the reference line leaves the current
        colour, the second where it returns to it. The colour is not toggled.
        """
        x1 = self._curpos + 1
        while 1:
            if x1 == 0:
                if self._color == 1 and self._refline[x1] != self._color:
                    break
            elif x1 == len(self._refline) or (
                self._refline[x1 - 1] == self._color
                and self._refline[x1] != self._color
            ):
                break
            x1 += 1
        while 1:
            if x1 == 0:
                if self._color == 0 and self._refline[x1] == self._color:
                    break
            elif x1 == len(self._refline) or (
                self._refline[x1 - 1] != self._color
                and self._refline[x1] == self._color
            ):
                break
            x1 += 1
        for x in range(self._curpos, x1):
            self._curline[x] = self._color
        self._curpos = x1

    def _do_horizontal(self, n1: int, n2: int) -> None:
        """Write *n1* pixels of the current colour, then *n2* of the other.

        Writing stops silently at the end of the line.
        """
        if self._curpos < 0:
            self._curpos = 0
        x = self._curpos
        for _ in range(n1):
            if len(self._curline) <= x:
                break
            self._curline[x] = self._color
            x += 1
        for _ in range(n2):
            if len(self._curline) <= x:
                break
            self._curline[x] = 1 - self._color
            x += 1
        self._curpos = x

    def _do_uncompressed(self, bits: str) -> None:
        """Write the literal pixel string *bits* at the current position."""
        for c in bits:
            self._curline[self._curpos] = int(c)
            self._curpos += 1
            self._flush_line()
class CCITTFaxDecoder(CCITTG4Parser):
    """Group 4 parser that packs decoded lines into bytes, 8 pixels per byte."""

    def __init__(
        self,
        width: int,
        bytealign: bool = False,
        reversed: bool = False,
    ) -> None:
        """Create a decoder.

        :param width: number of pixels per scan line.
        :param bytealign: forwarded to :class:`CCITTG4Parser`.
        :param reversed: when true, every pixel is inverted before packing.
        """
        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
        self.reversed = reversed
        # A bytearray grows in place; appending every row to an immutable
        # ``bytes`` object would copy the whole image once per line.
        self._buf = bytearray()

    def close(self) -> bytes:
        """Return all rows decoded so far as one bytes object."""
        return bytes(self._buf)

    def output_line(self, y: int, bits: Sequence[int]) -> None:
        """Pack one decoded line, most significant bit first, and append it."""
        arr = array.array("B", [0] * ((len(bits) + 7) // 8))
        if self.reversed:
            bits = [1 - b for b in bits]
        for i, b in enumerate(bits):
            if b:
                arr[i // 8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
        self._buf += arr.tobytes()


def ccittfaxdecode(data: bytes, params: dict[str, object]) -> bytes:
    """Decode a CCITTFaxDecode stream; only Group 4 (``K == -1``) is supported.

    :param data: the encoded stream.
    :param params: the filter's decode parameters. ``Columns`` falls back to
        1728, the default given by the PDF specification, when absent;
        ``EncodedByteAlign`` and ``BlackIs1`` fall back to false.
    :raises PDFValueError: if ``K`` is anything other than -1.
    """
    K = params.get("K")
    if K != -1:
        raise PDFValueError(K)
    cols = params.get("Columns")
    if cols is None:
        cols = 1728
    bytealign = cast(bool, params.get("EncodedByteAlign", False))
    reversed = cast(bool, params.get("BlackIs1", False))
    parser = CCITTFaxDecoder(cast(int, cols), bytealign=bytealign, reversed=reversed)
    parser.feedbytes(data)
    return parser.close()


# test
def main(argv: list[str]) -> None:
    """Manual test driver.

    Without arguments the unittest runner is started. Otherwise every path
    is decoded and rendered to ``out.bmp`` with pygame. The line width is
    taken from the fourth of the six dot-separated parts of the file name.
    """
    if not argv[1:]:
        import unittest

        unittest.main()
        return

    class Parser(CCITTG4Parser):
        def __init__(self, width: int, bytealign: bool = False) -> None:
            import pygame  # type: ignore[import]

            CCITTG4Parser.__init__(self, width, bytealign=bytealign)
            self.img = pygame.Surface((self.width, 1000))

        def output_line(self, y: int, bits: Sequence[int]) -> None:
            for x, b in enumerate(bits):
                if b:
                    self.img.set_at((x, y), (255, 255, 255))
                else:
                    self.img.set_at((x, y), (0, 0, 0))

        def close(self) -> None:
            import pygame

            pygame.image.save(self.img, "out.bmp")

    for path in argv[1:]:
        (_, _, _k, w, _h, _) = path.split(".")
        # The context manager closes the file even if decoding fails.
        with open(path, "rb") as fp:
            parser = Parser(int(w))
            parser.feedbytes(fp.read())
            parser.close()
CMap resources are now available freely from the Adobe web site: http://opensource.adobe.com/wiki/display/cmap/CMap+Resources The following files were extracted from the downloadable tarballs: cid2code_Adobe_CNS1.txt: http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z cid2code_Adobe_GB1.txt: http://download.macromedia.com/pub/opensource/cmap/cmapresources_gb1-5.tar.z cid2code_Adobe_Japan1.txt: http://download.macromedia.com/pub/opensource/cmap/cmapresources_japan1-6.tar.z cid2code_Adobe_Korea1.txt: http://download.macromedia.com/pub/opensource/cmap/cmapresources_korean1-2.tar.z These *.pickle.gz files can be generated by running the following commands in the top directory: $ make cmap python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt reading 'cmaprsrc/cid2code_Adobe_CNS1.txt'... writing 'CNS1_H.py'... ... On Windows machines which don't have the `make` command, paste the following commands on a command line prompt: mkdir pdfminer\cmap python tools\conv_cmap.py -c B5=cp950 -c UniCNS-UTF8=utf-8 pdfminer\cmap Adobe-CNS1 cmaprsrc\cid2code_Adobe_CNS1.txt python tools\conv_cmap.py -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 pdfminer\cmap Adobe-GB1 cmaprsrc\cid2code_Adobe_GB1.txt python tools\conv_cmap.py -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 pdfminer\cmap Adobe-Japan1 cmaprsrc\cid2code_Adobe_Japan1.txt python tools\conv_cmap.py -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 pdfminer\cmap Adobe-Korea1 cmaprsrc\cid2code_Adobe_Korea1.txt Here is the license information in the original files: %%Copyright: ----------------------------------------------------------- %%Copyright: Copyright 1990-20xx Adobe Systems Incorporated. %%Copyright: All rights reserved.
%%Copyright: %%Copyright: Redistribution and use in source and binary forms, with or %%Copyright: without modification, are permitted provided that the %%Copyright: following conditions are met: %%Copyright: %%Copyright: Redistributions of source code must retain the above %%Copyright: copyright notice, this list of conditions and the following %%Copyright: disclaimer. %%Copyright: %%Copyright: Redistributions in binary form must reproduce the above %%Copyright: copyright notice, this list of conditions and the following %%Copyright: disclaimer in the documentation and/or other materials %%Copyright: provided with the distribution. %%Copyright: %%Copyright: Neither the name of Adobe Systems Incorporated nor the names %%Copyright: of its contributors may be used to endorse or promote %%Copyright: products derived from this software without specific prior %%Copyright: written permission. %%Copyright: %%Copyright: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND %%Copyright: CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, %%Copyright: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF %%Copyright: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE %%Copyright: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR %%Copyright: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, %%Copyright: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT %%Copyright: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; %%Copyright: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) %%Copyright: HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN %%Copyright: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR %%Copyright: OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS %%Copyright: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class CMapBase:
    """Base class of all character maps.

    Holds the CMap attributes (for example ``WMode``) in ``attrs`` and
    defines hooks that do nothing here and are overridden by subclasses.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        # Store a private copy of the keyword arguments as the attribute table.
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return True when the ``WMode`` attribute is present and non-zero."""
        wmode = self.attrs.get("WMode", 0)
        return wmode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Set attribute *k* to *v*."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code-to-CID mapping; ignored here."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Hook for registering a CID-to-Unicode mapping; ignored here."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for inheriting mappings from another CMap; ignored here."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Decode *code* into CIDs; must be implemented by subclasses."""
        raise NotImplementedError
class CMap(CBase := CMapBase):
    """Character map that decodes byte strings into CIDs through a nested
    dictionary keyed by successive byte values.
    """

    def __init__(self, **kwargs: str | int) -> None:
        CMapBase.__init__(self, **kwargs)
        # byte value -> CID (int) or nested dict for multi-byte codes
        self.code2cid: dict[int, object] = {}

    def __repr__(self) -> str:
        return f"<CMap: {self.attrs.get('CMapName')}>"

    def use_cmap(self, cmap: CMapBase) -> None:
        """Copy all mappings of *cmap* into this map.

        Nested dictionaries are copied, not shared. A nested dictionary
        already stored here under the same key is replaced.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        def copy(dst: dict[int, object], src: dict[int, object]) -> None:
            for k, v in src.items():
                if isinstance(v, dict):
                    d: dict[int, object] = {}
                    dst[k] = d
                    copy(d, v)
                else:
                    dst[k] = v

        copy(self.code2cid, cmap.code2cid)

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CID of every code found in *code*.

        A byte with no entry at the current level is dropped and decoding
        restarts from the top level.
        """
        log.debug("decode: %r, %r", self, code)
        d = self.code2cid
        for i in code:
            if i in d:
                x = d[i]
                if isinstance(x, int):
                    yield x
                    d = self.code2cid
                else:
                    d = cast(dict[int, object], x)
            else:
                d = self.code2cid

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: dict[int, object] | None = None,
        code: tuple[int, ...] = (),
    ) -> None:
        """Write every mapping to *out* as ``code (...) = cid N`` lines.

        *code2cid* and *code* are used by the recursion; callers normally
        leave them at their defaults.
        """
        if code2cid is None:
            code2cid = self.code2cid
            code = ()
        for k, v in sorted(code2cid.items()):
            c = code + (k,)
            if isinstance(v, int):
                out.write("code %r = cid %d\n" % (c, v))
            else:
                self.dump(out=out, code2cid=cast(dict[int, object], v), code=c)


class IdentityCMap(CMapBase):
    """CMap whose CIDs are the big-endian 16-bit values of the input."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Return one CID per complete byte pair; a trailing odd byte is ignored."""
        n = len(code) // 2
        if n:
            return struct.unpack_from(f">{n}H", code)
        else:
            return ()


class IdentityCMapByte(IdentityCMap):
    """CMap whose CIDs are the individual byte values of the input."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Return one CID per input byte."""
        n = len(code)
        if n:
            return struct.unpack(f">{n}B", code)
        else:
            return ()


class UnicodeMap(CMapBase):
    """Map from CIDs to Unicode strings."""

    def __init__(self, **kwargs: str | int) -> None:
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr: dict[int, str] = {}

    def __repr__(self) -> str:
        return f"<UnicodeMap: {self.attrs.get('CMapName')}>"

    def get_unichr(self, cid: int) -> str:
        """Return the string mapped to *cid*.

        :raises KeyError: if *cid* has no mapping.
        """
        log.debug("get_unichr: %r, %r", self, cid)
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write every mapping to *out* as ``cid N = unicode '...'`` lines."""
        for k, v in sorted(self.cid2unichr.items()):
            out.write("cid %d = unicode %r\n" % (k, v))
class CMapDB:
    """Loader and class-level cache for the predefined CMap data files."""

    # Both caches live on the class, so they are shared by every caller.
    _cmap_cache: dict[str, PyCMap] = {}
    _umap_cache: dict[str, list[PyUnicodeMap]] = {}

    class CMapNotFound(CMapError):
        pass

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load ``<name>.pickle.gz`` and return its content as a class object.

        The file is searched first in ``$CMAP_PATH`` (default
        ``/usr/share/pdfminer/``) and then in the ``cmap`` directory next to
        this module.  The unpickled dictionary becomes the attribute
        namespace of a new type named after *name*.

        :raises CMapDB.CMapNotFound: if no data file exists for *name*, or
            if *name* contains a path component.
        """
        name = name.replace("\0", "")
        filename = "%s.pickle.gz" % name
        log.debug("loading: %r", name)
        # CMap names can come straight from the document being parsed (see
        # the ``usecmap`` handling in CMapParser).  A name with directory
        # parts would let such a document pick an arbitrary file to
        # unpickle, so only plain file names are accepted.
        if os.path.basename(filename) != filename:
            raise CMapDB.CMapNotFound(name)
        cmap_paths = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in cmap_paths:
            path = os.path.join(directory, filename)
            if os.path.exists(path):
                with gzip.open(path) as gzfile:
                    # NOTE(security): pickle executes code embedded in the
                    # data, so the CMap directories must hold trusted files.
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called *name*.

        The four identity names are answered with a fresh identity map;
        every other name is loaded from disk once and then served from the
        cache.

        :raises CMapDB.CMapNotFound: if no data file exists for *name*.
        """
        if name == "Identity-H":
            return IdentityCMap(WMode=0)
        elif name == "Identity-V":
            return IdentityCMap(WMode=1)
        elif name == "OneByteIdentityH":
            return IdentityCMapByte(WMode=0)
        elif name == "OneByteIdentityV":
            return IdentityCMapByte(WMode=1)
        try:
            return cls._cmap_cache[name]
        except KeyError:
            pass
        data = cls._load_data(name)
        cls._cmap_cache[name] = cmap = PyCMap(name, data)
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the Unicode map for *name* in the requested writing mode.

        Both variants are built from ``to-unicode-<name>`` on first use and
        cached as ``[horizontal, vertical]``; *vertical* is used as the
        index into that pair.

        :raises CMapDB.CMapNotFound: if no data file exists for *name*.
        """
        try:
            return cls._umap_cache[name][vertical]
        except KeyError:
            pass
        data = cls._load_data("to-unicode-%s" % name)
        cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
        return cls._umap_cache[name][vertical]
def do_keyword(self, pos: int, token: PSKeyword) -> None:
    """Handle one keyword of a (ToUnicode) CMap stream.

    See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

    Operands are taken from the parser stack.  ``end*`` keywords consume
    everything pushed since the matching ``begin*`` and feed the entries
    to ``self.cmap``.  Malformed entries are reported once per message via
    ``_warn_once`` and skipped.  Keywords that are not recognised are
    pushed back onto the stack as plain operands.
    """
    if token is self.KEYWORD_BEGINCMAP:
        self._in_cmap = True
        self.popall()
        return
    elif token is self.KEYWORD_ENDCMAP:
        self._in_cmap = False
        return

    if not self._in_cmap:
        return

    if token is self.KEYWORD_DEF:
        try:
            ((_, k), (_, v)) = self.pop(2)
            self.cmap.set_attr(literal_name(k), v)
        except PSSyntaxError:
            # Best effort: a broken "def" only loses one attribute.
            pass
        return

    if token is self.KEYWORD_USECMAP:
        try:
            ((_, cmapname),) = self.pop(1)
            self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
        except PSSyntaxError:
            pass
        except CMapDB.CMapNotFound:
            # The referenced base map is unavailable; keep what we have.
            pass
        return

    if token is self.KEYWORD_ENDCIDRANGE:
        objs = [obj for (__, obj) in self.popall()]
        for start_byte, end_byte, cid in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object of begincidrange is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object of begincidrange is not a byte.")
                continue
            if not isinstance(cid, int):
                self._warn_once("The cid object of begincidrange is not an int.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once(
                    "The start and end byte of begincidrange have "
                    "different lengths.",
                )
                continue
            start_prefix = start_byte[:-4]
            end_prefix = end_byte[:-4]
            if start_prefix != end_prefix:
                self._warn_once(
                    "The prefix of the start and end byte of "
                    "begincidrange are not the same.",
                )
                continue
            # Only the last (up to) four bytes vary across the range.
            svar = start_byte[-4:]
            evar = end_byte[-4:]
            start = nunpack(svar)
            end = nunpack(evar)
            vlen = len(svar)
            for i in range(end - start + 1):
                x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                self.cmap.add_cid2unichr(cid + i, x)
        return

    if token is self.KEYWORD_ENDCIDCHAR:
        objs = [obj for (__, obj) in self.popall()]
        for cid, code in choplist(2, objs):
            if isinstance(code, bytes) and isinstance(cid, int):
                self.cmap.add_cid2unichr(cid, code)
        return

    if token is self.KEYWORD_ENDBFRANGE:
        objs = [obj for (__, obj) in self.popall()]
        for start_byte, end_byte, code in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object is not a byte.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once("The start and end byte have different lengths.")
                continue
            start = nunpack(start_byte)
            end = nunpack(end_byte)
            if isinstance(code, list):
                if len(code) != end - start + 1:
                    self._warn_once(
                        "The difference between the start and end "
                        "offsets does not match the code length.",
                    )
                # zip() stops at the shorter side, so a mismatch still maps
                # the entries that do line up.
                for cid, unicode_value in zip(
                    range(start, end + 1), code, strict=False
                ):
                    self.cmap.add_cid2unichr(cid, unicode_value)
            elif isinstance(code, bytes):
                var = code[-4:]
                base = nunpack(var)
                prefix = code[:-4]
                vlen = len(var)
                for i in range(end - start + 1):
                    x = prefix + struct.pack(">L", base + i)[-vlen:]
                    self.cmap.add_cid2unichr(start + i, x)
            else:
                # Malformed document data: warn and skip like every other
                # bad entry instead of aborting the whole parse.
                self._warn_once(
                    "The code object of beginbfrange is neither a list nor a byte.",
                )
                continue
        return

    if token is self.KEYWORD_ENDBFCHAR:
        objs = [obj for (__, obj) in self.popall()]
        for cid, code in choplist(2, objs):
            if isinstance(cid, bytes) and isinstance(code, bytes):
                self.cmap.add_cid2unichr(nunpack(cid), code)
        return

    # The remaining keywords carry nothing this parser stores; they only
    # reset the operand stack.
    discard_only = (
        self.KEYWORD_BEGINCODESPACERANGE,
        self.KEYWORD_ENDCODESPACERANGE,
        self.KEYWORD_BEGINCIDRANGE,
        self.KEYWORD_BEGINCIDCHAR,
        self.KEYWORD_BEGINBFRANGE,
        self.KEYWORD_BEGINBFCHAR,
        self.KEYWORD_BEGINNOTDEFRANGE,
        self.KEYWORD_ENDNOTDEFRANGE,
    )
    if any(token is keyword for keyword in discard_only):
        self.popall()
        return

    self.push((pos, token))
def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
    """Start a page by making a fresh ``LTPage`` the current container.

    Both corners of the page's media box are mapped through *ctm*.  The
    resulting page box starts at the origin and uses the absolute width
    and height spanned by the two transformed corners.
    """
    ax, ay, bx, by = page.mediabox
    ax, ay = apply_matrix_pt(ctm, (ax, ay))
    bx, by = apply_matrix_pt(ctm, (bx, by))
    page_box = (0, 0, abs(ax - bx), abs(ay - by))
    self.cur_item = LTPage(self.pageno, page_box)
def paint_path(
    self,
    gstate: PDFGraphicState,
    stroke: bool,
    fill: bool,
    evenodd: bool,
    path: Sequence[PathSegment],
) -> None:
    """Paint paths described in section 4.4 of the PDF reference manual.

    The path is classified by its operator string and added to the current
    layout container as an ``LTLine`` (single segment), an ``LTRect``
    (closed four-segment loop with axis-aligned sides) or an ``LTCurve``
    (everything else).  Each item also records the pass-through
    instructions, XObject id, render order, CTM, raw path and clip paths
    taken from ``self.il_creater`` and ``self.ctm``.

    Paths that do not start with ``m`` are ignored.
    """
    shape = "".join(x[0] for x in path)
    current_clip_paths = self.il_creater.current_clip_paths.copy()

    if shape[:1] != "m":
        # Per PDF Reference Section 4.4.1, "path construction operators may
        # be invoked in any sequence, but the first one invoked must be m
        # or re to begin a new subpath." Since pdfminer.six already
        # converts all `re` (rectangle) operators to their equivalent
        # `mlllh` representation, paths ingested by `.paint_path(...)` that
        # do not begin with the `m` operator are invalid.
        return

    # NOTE: a path containing several `m` operators is kept as one item;
    # it is not split into one item per subpath.

    # Although the 'h' command does not literally provide a
    # point-position, its position is (by definition) equal to the
    # subpath's starting point.
    #
    # And, per Section 4.4's Table 4.9, all other path commands place
    # their point-position in their final two arguments. (Any preceding
    # arguments represent control points on Bézier curves.)
    raw_pts = [
        cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
    ]
    pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]

    # Same path with every operand pair mapped through the CTM.
    operators = [str(operation[0]) for operation in path]
    transformed_points = [
        [
            apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
            for operand1, operand2 in zip(
                operation[1::2], operation[2::2], strict=False
            )
        ]
        for operation in path
    ]
    transformed_path = [
        cast(PathSegment, (o, *p))
        for o, p in zip(operators, transformed_points, strict=False)
    ]

    # Drop a redundant "l" on a path closed with "h"
    if len(shape) > 3 and shape[-2:] == "lh" and pts[-2] == pts[0]:
        shape = shape[:-2] + "h"
        pts.pop()

    passthrough_instruction = (
        self.il_creater.passthrough_per_char_instruction.copy()
    )
    xobj_id = self.il_creater.xobj_id

    is_rect = False
    if shape in {"mlllh", "mllll"}:
        (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
        is_closed_loop = pts[0] == pts[4]
        has_square_coordinates = (
            x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
        ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
        is_rect = is_closed_loop and has_square_coordinates

    if shape in {"mlh", "ml"}:
        # single line segment
        #
        # Note: 'ml', in conditional above, is a frequent anomaly
        # that we want to support.
        item = LTLine(
            gstate.linewidth,
            pts[0],
            pts[1],
            stroke,
            fill,
            evenodd,
            gstate.scolor,
            gstate.ncolor,
            original_path=transformed_path,
            dashing_style=gstate.dash,
        )
    elif is_rect:
        item = LTRect(
            gstate.linewidth,
            (*pts[0], *pts[2]),
            stroke,
            fill,
            evenodd,
            gstate.scolor,
            gstate.ncolor,
            transformed_path,
            gstate.dash,
        )
    else:
        item = LTCurve(
            gstate.linewidth,
            pts,
            stroke,
            fill,
            evenodd,
            gstate.scolor,
            gstate.ncolor,
            transformed_path,
            gstate.dash,
        )

    # Bookkeeping common to every kind of path item.
    item.passthrough_instruction = passthrough_instruction
    item.xobj_id = xobj_id
    item.render_order = self.il_creater.get_render_order_and_increase()
    item.ctm = self.ctm
    # list() rather than .copy(): *path* is only promised to be a Sequence.
    item.raw_path = list(path)
    item.clip_paths = current_clip_paths
    self.cur_item.add(item)
class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Layout analyzer that also owns an output stream.

    Stores the stream, the codec name, and a flag telling whether the
    stream is treated as binary.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Tell whether *outfp* should be written to with bytes.

        A ``mode`` attribute decides when present (binary if it contains
        ``"b"``).  Without one, ``io.BytesIO`` counts as binary, text I/O
        objects count as text, and anything else is assumed binary.
        """
        mode = getattr(outfp, "mode", "")
        if "b" in mode:
            return True
        if hasattr(outfp, "mode"):
            # output stream has a mode, but it does not contain 'b'
            return False
        if isinstance(outfp, io.BytesIO):
            return True
        if isinstance(outfp, (io.StringIO, io.TextIOBase)):
            return False
        return True
This stops all the image and drawing output from being # recorded and taking up RAM. def render_image(self, name: str, stream: PDFStream) -> None: if self.imagewriter is not None: PDFConverter.render_image(self, name, stream) def paint_path( self, gstate: PDFGraphicState, stroke: bool, fill: bool, evenodd: bool, path: Sequence[PathSegment], ) -> None: pass class HTMLConverter(PDFConverter[AnyIO]): RECT_COLORS = { "figure": "yellow", "textline": "magenta", "textbox": "cyan", "textgroup": "red", "curve": "black", "page": "gray", } TEXT_COLORS = { "textbox": "blue", "char": "black", } def __init__( self, rsrcmgr: PDFResourceManager, outfp: AnyIO, codec: str = "utf-8", pageno: int = 1, laparams: LAParams | None = None, scale: float = 1, fontscale: float = 1.0, layoutmode: str = "normal", showpageno: bool = True, pagemargin: int = 50, imagewriter: ImageWriter | None = None, debug: int = 0, rect_colors: dict[str, str] | None = None, text_colors: dict[str, str] | None = None, ) -> None: PDFConverter.__init__( self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams, ) # write() assumes a codec for binary I/O, or no codec for text I/O. 
if self.outfp_binary and not self.codec: raise PDFValueError("Codec is required for a binary I/O output") if not self.outfp_binary and self.codec: raise PDFValueError("Codec must not be specified for a text I/O output") if text_colors is None: text_colors = {"char": "black"} if rect_colors is None: rect_colors = {"curve": "black", "page": "gray"} self.scale = scale self.fontscale = fontscale self.layoutmode = layoutmode self.showpageno = showpageno self.pagemargin = pagemargin self.imagewriter = imagewriter self.rect_colors = rect_colors self.text_colors = text_colors if debug: self.rect_colors.update(self.RECT_COLORS) self.text_colors.update(self.TEXT_COLORS) self._yoffset: float = self.pagemargin self._font: tuple[str, float] | None = None self._fontstack: list[tuple[str, float] | None] = [] self.write_header() def write(self, text: str) -> None: if self.codec: cast(BinaryIO, self.outfp).write(text.encode(self.codec)) else: cast(TextIO, self.outfp).write(text) def write_header(self) -> None: self.write("\n") if self.codec: s = ( '\n' % self.codec ) else: s = '\n' self.write(s) self.write("\n") def write_footer(self) -> None: page_links = [f'{i}' for i in range(1, self.pageno)] s = '

Page: %s
\n' % ", ".join( page_links, ) self.write(s) self.write("\n") def write_text(self, text: str) -> None: self.write(enc(text)) def place_rect( self, color: str, borderwidth: int, x: float, y: float, w: float, h: float, ) -> None: color2 = self.rect_colors.get(color) if color2 is not None: s = ( '\n' % ( color2, borderwidth, x * self.scale, (self._yoffset - y) * self.scale, w * self.scale, h * self.scale, ) ) self.write(s) def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None: self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height) def place_image( self, item: LTImage, borderwidth: int, x: float, y: float, w: float, h: float, ) -> None: if self.imagewriter is not None: name = self.imagewriter.export_image(item) s = ( '\n' % ( enc(name), borderwidth, x * self.scale, (self._yoffset - y) * self.scale, w * self.scale, h * self.scale, ) ) self.write(s) def place_text( self, color: str, text: str, x: float, y: float, size: float, ) -> None: color2 = self.text_colors.get(color) if color2 is not None: s = ( '' % ( color2, x * self.scale, (self._yoffset - y) * self.scale, size * self.scale * self.fontscale, ) ) self.write(s) self.write_text(text) self.write("\n") def begin_div( self, color: str, borderwidth: int, x: float, y: float, w: float, h: float, writing_mode: str = "False", ) -> None: self._fontstack.append(self._font) self._font = None s = ( '
' % ( color, borderwidth, writing_mode, x * self.scale, (self._yoffset - y) * self.scale, w * self.scale, h * self.scale, ) ) self.write(s) def end_div(self, color: str) -> None: if self._font is not None: self.write("") self._font = self._fontstack.pop() self.write("
") def put_text(self, text: str, fontname: str, fontsize: float) -> None: font = (fontname, fontsize) if font != self._font: if self._font is not None: self.write("") # Remove subset tag from fontname, see PDF Reference 5.5.3 fontname_without_subset_tag = fontname.split("+")[-1] self.write( '' % (fontname_without_subset_tag, fontsize * self.scale * self.fontscale), ) self._font = font self.write_text(text) def put_newline(self) -> None: self.write("
") def receive_layout(self, ltpage: LTPage) -> None: def show_group(item: LTTextGroup | TextGroupElement) -> None: if isinstance(item, LTTextGroup): self.place_border("textgroup", 1, item) for child in item: show_group(child) def render(item: LTItem) -> None: child: LTItem if isinstance(item, LTPage): self._yoffset += item.y1 self.place_border("page", 1, item) if self.showpageno: self.write( '
' % ((self._yoffset - item.y1) * self.scale), ) self.write( f'Page {item.pageid}
\n', ) for child in item: render(child) if item.groups is not None: for group in item.groups: show_group(group) elif isinstance(item, LTCurve): self.place_border("curve", 1, item) elif isinstance(item, LTFigure): self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height) for child in item: render(child) self.end_div("figure") elif isinstance(item, LTImage): self.place_image(item, 1, item.x0, item.y1, item.width, item.height) elif self.layoutmode == "exact": if isinstance(item, LTTextLine): self.place_border("textline", 1, item) for child in item: render(child) elif isinstance(item, LTTextBox): self.place_border("textbox", 1, item) self.place_text( "textbox", str(item.index + 1), item.x0, item.y1, 20, ) for child in item: render(child) elif isinstance(item, LTChar): self.place_border("char", 1, item) self.place_text( "char", item.get_text(), item.x0, item.y1, item.size, ) elif isinstance(item, LTTextLine): for child in item: render(child) if self.layoutmode != "loose": self.put_newline() elif isinstance(item, LTTextBox): self.begin_div( "textbox", 1, item.x0, item.y1, item.width, item.height, item.get_writing_mode(), ) for child in item: render(child) self.end_div("textbox") elif isinstance(item, LTChar): fontname = make_compat_str(item.fontname) self.put_text(item.get_text(), fontname, item.size) elif isinstance(item, LTText): self.write_text(item.get_text()) render(ltpage) self._yoffset += self.pagemargin def close(self) -> None: self.write_footer() class XMLConverter(PDFConverter[AnyIO]): CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]") def __init__( self, rsrcmgr: PDFResourceManager, outfp: AnyIO, codec: str = "utf-8", pageno: int = 1, laparams: LAParams | None = None, imagewriter: ImageWriter | None = None, stripcontrol: bool = False, ) -> None: PDFConverter.__init__( self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams, ) # write() assumes a codec for binary I/O, or no codec for text I/O. 
if self.outfp_binary == (not self.codec): raise PDFValueError("Codec is required for a binary I/O output") self.imagewriter = imagewriter self.stripcontrol = stripcontrol self.write_header() def write(self, text: str) -> None: if self.codec: cast(BinaryIO, self.outfp).write(text.encode(self.codec)) else: cast(TextIO, self.outfp).write(text) def write_header(self) -> None: if self.codec: self.write('\n' % self.codec) else: self.write('\n') self.write("\n") def write_footer(self) -> None: self.write("\n") def write_text(self, text: str) -> None: if self.stripcontrol: text = self.CONTROL.sub("", text) self.write(enc(text)) def receive_layout(self, ltpage: LTPage) -> None: def show_group(item: LTItem) -> None: if isinstance(item, LTTextBox): self.write( '\n' % (item.index, bbox2str(item.bbox)), ) elif isinstance(item, LTTextGroup): self.write('\n' % bbox2str(item.bbox)) for child in item: show_group(child) self.write("\n") def render(item: LTItem) -> None: child: LTItem if isinstance(item, LTPage): s = '\n' % ( item.pageid, bbox2str(item.bbox), item.rotate, ) self.write(s) for child in item: render(child) if item.groups is not None: self.write("\n") for group in item.groups: show_group(group) self.write("\n") self.write("\n") elif isinstance(item, LTLine): s = '\n' % ( item.linewidth, bbox2str(item.bbox), ) self.write(s) elif isinstance(item, LTRect): s = '\n' % ( item.linewidth, bbox2str(item.bbox), ) self.write(s) elif isinstance(item, LTCurve): s = '\n' % ( item.linewidth, bbox2str(item.bbox), item.get_pts(), ) self.write(s) elif isinstance(item, LTFigure): s = f'
\n' self.write(s) for child in item: render(child) self.write("
\n") elif isinstance(item, LTTextLine): self.write('\n' % bbox2str(item.bbox)) for child in item: render(child) self.write("\n") elif isinstance(item, LTTextBox): wmode = "" if isinstance(item, LTTextBoxVertical): wmode = ' wmode="vertical"' s = '\n' % ( item.index, bbox2str(item.bbox), wmode, ) self.write(s) for child in item: render(child) self.write("\n") elif isinstance(item, LTChar): s = ( '' % ( enc(item.fontname), bbox2str(item.bbox), item.ncs.name, item.graphicstate.ncolor, item.size, ) ) self.write(s) self.write_text(item.get_text()) self.write("\n") elif isinstance(item, LTText): self.write("%s\n" % item.get_text()) elif isinstance(item, LTImage): if self.imagewriter is not None: name = self.imagewriter.export_image(item) self.write( '\n' % (enc(name), item.width, item.height), ) else: self.write( '\n' % (item.width, item.height), ) else: assert False, str(("Unhandled", item)) render(ltpage) def close(self) -> None: self.write_footer() class HOCRConverter(PDFConverter[AnyIO]): """Extract an hOCR representation from explicit text information within a PDF.""" # Where text is being extracted from a variety of types of PDF within a # business process, those PDFs where the text is only present in image # form will need to be analysed using an OCR tool which will typically # output hOCR. This converter extracts the explicit text information from # those PDFs that do have it and uses it to genxerate a basic hOCR # representation that is designed to be used in conjunction with the image # of the PDF in the same way as genuine OCR output would be, but without the # inevitable OCR errors. # The converter does not handle images, diagrams or text colors. # In the examples processed by the contributor it was necessary to set # LAParams.all_texts to True. 
CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]") def __init__( self, rsrcmgr: PDFResourceManager, outfp: AnyIO, codec: str = "utf8", pageno: int = 1, laparams: LAParams | None = None, stripcontrol: bool = False, ): PDFConverter.__init__( self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams, ) self.stripcontrol = stripcontrol self.within_chars = False self.write_header() def bbox_repr(self, bbox: Rect) -> str: (in_x0, in_y0, in_x1, in_y1) = bbox # PDF y-coordinates are the other way round from hOCR coordinates out_x0 = int(in_x0) out_y0 = int(self.page_bbox[3] - in_y1) out_x1 = int(in_x1) out_y1 = int(self.page_bbox[3] - in_y0) return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}" def write(self, text: str) -> None: if self.codec: encoded_text = text.encode(self.codec) cast(BinaryIO, self.outfp).write(encoded_text) else: cast(TextIO, self.outfp).write(text) def write_header(self) -> None: if self.codec: self.write( "\n" % self.codec, ) else: self.write( "\n", ) self.write("\n") self.write("\n") self.write( "\n", ) self.write( "\n", ) self.write( " \n", ) self.write("\n") self.write("\n") def write_footer(self) -> None: self.write("\n") self.write( "\n", ) def write_text(self, text: str) -> None: if self.stripcontrol: text = self.CONTROL.sub("", text) self.write(text) def write_word(self) -> None: if len(self.working_text) > 0: bold_and_italic_styles = "" if "Italic" in self.working_font: bold_and_italic_styles = "font-style: italic; " if "Bold" in self.working_font: bold_and_italic_styles += "font-weight: bold; " self.write( "%s" % ( ( self.working_font, self.working_size, bold_and_italic_styles, self.bbox_repr(self.working_bbox), self.working_font, self.working_size, self.working_text.strip(), ) ), ) self.within_chars = False def receive_layout(self, ltpage: LTPage) -> None: def render(item: LTItem) -> None: if self.within_chars and isinstance(item, LTAnno): self.write_word() if isinstance(item, LTPage): self.page_bbox = item.bbox self.write( "
\n" % (item.pageid, self.bbox_repr(item.bbox)), ) for child in item: render(child) self.write("
\n") elif isinstance(item, LTTextLine): self.write( "" % (self.bbox_repr(item.bbox)), ) for child_line in item: render(child_line) self.write("\n") elif isinstance(item, LTTextBox): self.write( "
\n" % (item.index, self.bbox_repr(item.bbox)), ) for child in item: render(child) self.write("
\n") elif isinstance(item, LTChar): if not self.within_chars: self.within_chars = True self.working_text = item.get_text() self.working_bbox = item.bbox self.working_font = item.fontname self.working_size = item.size elif len(item.get_text().strip()) == 0: self.write_word() self.write(item.get_text()) else: if ( self.working_bbox[1] != item.bbox[1] or self.working_font != item.fontname or self.working_size != item.size ): self.write_word() self.working_bbox = item.bbox self.working_font = item.fontname self.working_size = item.size self.working_text += item.get_text() self.working_bbox = ( self.working_bbox[0], self.working_bbox[1], item.bbox[2], self.working_bbox[3], ) render(ltpage) def close(self) -> None: self.write_footer() ================================================ FILE: babeldoc/pdfminer/data_structures.py ================================================ from collections.abc import Iterable from typing import Any from babeldoc.pdfminer.pdfparser import PDFSyntaxError from babeldoc.pdfminer.pdftypes import dict_value from babeldoc.pdfminer.pdftypes import int_value from babeldoc.pdfminer.pdftypes import list_value from babeldoc.pdfminer.utils import choplist from babeldoc.pdfminer import settings class NumberTree: """A PDF number tree. See Section 3.8.6 of the PDF Reference. 
""" def __init__(self, obj: Any): self._obj = dict_value(obj) self.nums: Iterable[Any] | None = None self.kids: Iterable[Any] | None = None self.limits: Iterable[Any] | None = None if "Nums" in self._obj: self.nums = list_value(self._obj["Nums"]) if "Kids" in self._obj: self.kids = list_value(self._obj["Kids"]) if "Limits" in self._obj: self.limits = list_value(self._obj["Limits"]) def _parse(self) -> list[tuple[int, Any]]: items = [] if self.nums: # Leaf node for k, v in choplist(2, self.nums): items.append((int_value(k), v)) if self.kids: # Root or intermediate node for child_ref in self.kids: items += NumberTree(child_ref)._parse() return items values: list[tuple[int, Any]] # workaround decorators unsupported by mypy @property # type: ignore[no-redef,misc] def values(self) -> list[tuple[int, Any]]: values = self._parse() if settings.STRICT: if not all(a[0] <= b[0] for a, b in zip(values, values[1:], strict=False)): raise PDFSyntaxError("Number tree elements are out of order") else: values.sort(key=lambda t: t[0]) return values ================================================ FILE: babeldoc/pdfminer/encodingdb.py ================================================ import logging import re from collections.abc import Iterable from typing import cast from babeldoc.pdfminer.glyphlist import glyphname2unicode from babeldoc.pdfminer.latin_enc import ENCODING from babeldoc.pdfminer.pdfexceptions import PDFKeyError from babeldoc.pdfminer.psparser import PSLiteral HEXADECIMAL = re.compile(r"[0-9a-fA-F]+") log = logging.getLogger(__name__) def name2unicode(name: str) -> str: """Converts Adobe glyph names to Unicode numbers. In contrast to the specification, this raises a KeyError instead of return an empty string when the key is unknown. This way the caller must explicitly define what to do when there is not a match. 
Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping :returns unicode character if name resembles something, otherwise a KeyError """ if not isinstance(name, str): raise PDFKeyError( 'Could not convert unicode name "%s" to character because ' "it should be of type str but is of type %s" % (name, type(name)), ) name = name.split(".")[0] components = name.split("_") if len(components) > 1: return "".join(map(name2unicode, components)) elif name in glyphname2unicode: return glyphname2unicode[name] elif name.startswith("uni"): name_without_uni = name.strip("uni") if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: unicode_digits = [ int(name_without_uni[i : i + 4], base=16) for i in range(0, len(name_without_uni), 4) ] for digit in unicode_digits: raise_key_error_for_invalid_unicode(digit) characters = map(chr, unicode_digits) return "".join(characters) elif name.startswith("u"): name_without_u = name.strip("u") if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: unicode_digit = int(name_without_u, base=16) raise_key_error_for_invalid_unicode(unicode_digit) return chr(unicode_digit) raise PDFKeyError( 'Could not convert unicode name "%s" to character because ' "it does not match specification" % name, ) def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None: """Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16 :raises KeyError if unicode digit is invalid """ if 55295 < unicode_digit < 57344: raise PDFKeyError( "Unicode digit %d is invalid because " "it is in the range D800 through DFFF" % unicode_digit, ) class EncodingDB: std2unicode: dict[int, str] = {} mac2unicode: dict[int, str] = {} win2unicode: dict[int, str] = {} pdf2unicode: dict[int, str] = {} for name, std, mac, win, pdf in ENCODING: c = name2unicode(name) if std: std2unicode[std] = c if mac: mac2unicode[mac] = c if win: win2unicode[win] = c if pdf: pdf2unicode[pdf] = 
def convert_font_metrics(path: str) -> None:
    """Convert an AFM file to a mapping of font metrics and print it.

    The file is read line by line; the first whitespace-separated field of
    each line selects how the rest is interpreted. A ``FontName`` line starts
    a new font entry, and later lines fill that entry's property dict and its
    character-width dict. The collected fonts are printed to standard output
    as Python source assigning a ``FONT_METRICS`` dict. See below for the
    output.

    :param path: path of the AFM file to read
    """
    # AFM keyword -> key used in the emitted property dict.
    renamed_keys = {
        "Ascender": "Ascent",
        "Descender": "Descent",
        "FamilyName": "FontFamily",
        "Weight": "FontWeight",
    }
    fonts = {}
    with open(path) as fileinput:
        for line in fileinput:
            # Split on any run of whitespace: blank lines then yield no
            # fields (and are skipped), and repeated spaces or tabs cannot
            # shift the positions of the fields read below.
            f = line.split()
            if not f:
                continue
            k = f[0]
            if k == "FontName":
                fontname = f[1]
                props = {"FontName": fontname, "Flags": 0}
                chars: dict[int, int] = {}
                fonts[fontname] = (props, chars)
            elif k == "C":
                cid = int(f[1])
                # Only single-byte character codes are recorded.
                if 0 <= cid <= 255:
                    # The fifth field is taken as the width,
                    # e.g. "C 65 ; WX 600 ; ...".
                    chars[cid] = int(f[4])
            elif k in ("CapHeight", "XHeight", "ItalicAngle", "Ascender", "Descender"):
                props[renamed_keys.get(k, k)] = float(f[1])
            elif k in ("FamilyName", "Weight"):
                props[renamed_keys.get(k, k)] = f[1]
            elif k == "IsFixedPitch":
                if f[1].lower() == "true":
                    # Fixed-pitch fonts are emitted with Flags set to 64.
                    props["Flags"] = 64
            elif k == "FontBBox":
                props[k] = tuple(map(float, f[1:5]))

    print("# -*- python -*-")
    print("FONT_METRICS = {")
    for fontname, (props, chars) in fonts.items():
        print(f" {fontname!r}: {(props, chars)!r},")
    print("}")
"o": 600, "p": 600, "q": 600, "r": 600, "s": 600, "t": 600, "u": 600, "v": 600, "w": 600, "x": 600, "y": 600, "z": 600, "{": 600, "|": 600, "}": 600, "~": 600, "\xa1": 600, "\xa2": 600, "\xa3": 600, "\xa4": 600, "\xa5": 600, "\xa6": 600, "\xa7": 600, "\xa8": 600, "\xa9": 600, "\xaa": 600, "\xab": 600, "\xac": 600, "\xae": 600, "\xaf": 600, "\xb0": 600, "\xb1": 600, "\xb2": 600, "\xb3": 600, "\xb4": 600, "\xb5": 600, "\xb6": 600, "\xb7": 600, "\xb8": 600, "\xb9": 600, "\xba": 600, "\xbb": 600, "\xbc": 600, "\xbd": 600, "\xbe": 600, "\xbf": 600, "\xc0": 600, "\xc1": 600, "\xc2": 600, "\xc3": 600, "\xc4": 600, "\xc5": 600, "\xc6": 600, "\xc7": 600, "\xc8": 600, "\xc9": 600, "\xca": 600, "\xcb": 600, "\xcc": 600, "\xcd": 600, "\xce": 600, "\xcf": 600, "\xd0": 600, "\xd1": 600, "\xd2": 600, "\xd3": 600, "\xd4": 600, "\xd5": 600, "\xd6": 600, "\xd7": 600, "\xd8": 600, "\xd9": 600, "\xda": 600, "\xdb": 600, "\xdc": 600, "\xdd": 600, "\xde": 600, "\xdf": 600, "\xe0": 600, "\xe1": 600, "\xe2": 600, "\xe3": 600, "\xe4": 600, "\xe5": 600, "\xe6": 600, "\xe7": 600, "\xe8": 600, "\xe9": 600, "\xea": 600, "\xeb": 600, "\xec": 600, "\xed": 600, "\xee": 600, "\xef": 600, "\xf0": 600, "\xf1": 600, "\xf2": 600, "\xf3": 600, "\xf4": 600, "\xf5": 600, "\xf6": 600, "\xf7": 600, "\xf8": 600, "\xf9": 600, "\xfa": 600, "\xfb": 600, "\xfc": 600, "\xfd": 600, "\xfe": 600, "\xff": 600, "\u0100": 600, "\u0101": 600, "\u0102": 600, "\u0103": 600, "\u0104": 600, "\u0105": 600, "\u0106": 600, "\u0107": 600, "\u010c": 600, "\u010d": 600, "\u010e": 600, "\u010f": 600, "\u0110": 600, "\u0111": 600, "\u0112": 600, "\u0113": 600, "\u0116": 600, "\u0117": 600, "\u0118": 600, "\u0119": 600, "\u011a": 600, "\u011b": 600, "\u011e": 600, "\u011f": 600, "\u0122": 600, "\u0123": 600, "\u012a": 600, "\u012b": 600, "\u012e": 600, "\u012f": 600, "\u0130": 600, "\u0131": 600, "\u0136": 600, "\u0137": 600, "\u0139": 600, "\u013a": 600, "\u013b": 600, "\u013c": 600, "\u013d": 600, "\u013e": 600, "\u0141": 600, 
"\u0142": 600, "\u0143": 600, "\u0144": 600, "\u0145": 600, "\u0146": 600, "\u0147": 600, "\u0148": 600, "\u014c": 600, "\u014d": 600, "\u0150": 600, "\u0151": 600, "\u0152": 600, "\u0153": 600, "\u0154": 600, "\u0155": 600, "\u0156": 600, "\u0157": 600, "\u0158": 600, "\u0159": 600, "\u015a": 600, "\u015b": 600, "\u015e": 600, "\u015f": 600, "\u0160": 600, "\u0161": 600, "\u0162": 600, "\u0163": 600, "\u0164": 600, "\u0165": 600, "\u016a": 600, "\u016b": 600, "\u016e": 600, "\u016f": 600, "\u0170": 600, "\u0171": 600, "\u0172": 600, "\u0173": 600, "\u0178": 600, "\u0179": 600, "\u017a": 600, "\u017b": 600, "\u017c": 600, "\u017d": 600, "\u017e": 600, "\u0192": 600, "\u0218": 600, "\u0219": 600, "\u02c6": 600, "\u02c7": 600, "\u02d8": 600, "\u02d9": 600, "\u02da": 600, "\u02db": 600, "\u02dc": 600, "\u02dd": 600, "\u2013": 600, "\u2014": 600, "\u2018": 600, "\u2019": 600, "\u201a": 600, "\u201c": 600, "\u201d": 600, "\u201e": 600, "\u2020": 600, "\u2021": 600, "\u2022": 600, "\u2026": 600, "\u2030": 600, "\u2039": 600, "\u203a": 600, "\u2044": 600, "\u2122": 600, "\u2202": 600, "\u2206": 600, "\u2211": 600, "\u2212": 600, "\u221a": 600, "\u2260": 600, "\u2264": 600, "\u2265": 600, "\u25ca": 600, "\uf6c3": 600, "\ufb01": 600, "\ufb02": 600, }, ), "Courier-Bold": ( { "FontName": "Courier-Bold", "Descent": -194.0, "FontBBox": (-88.0, -249.0, 697.0, 811.0), "FontWeight": "Bold", "CapHeight": 572.0, "FontFamily": "Courier", "Flags": 64, "XHeight": 434.0, "ItalicAngle": 0.0, "Ascent": 627.0, }, { " ": 600, "!": 600, '"': 600, "#": 600, "$": 600, "%": 600, "&": 600, "'": 600, "(": 600, ")": 600, "*": 600, "+": 600, ",": 600, "-": 600, ".": 600, "/": 600, "0": 600, "1": 600, "2": 600, "3": 600, "4": 600, "5": 600, "6": 600, "7": 600, "8": 600, "9": 600, ":": 600, ";": 600, "<": 600, "=": 600, ">": 600, "?": 600, "@": 600, "A": 600, "B": 600, "C": 600, "D": 600, "E": 600, "F": 600, "G": 600, "H": 600, "I": 600, "J": 600, "K": 600, "L": 600, "M": 600, "N": 600, "O": 600, 
"P": 600, "Q": 600, "R": 600, "S": 600, "T": 600, "U": 600, "V": 600, "W": 600, "X": 600, "Y": 600, "Z": 600, "[": 600, "\\": 600, "]": 600, "^": 600, "_": 600, "`": 600, "a": 600, "b": 600, "c": 600, "d": 600, "e": 600, "f": 600, "g": 600, "h": 600, "i": 600, "j": 600, "k": 600, "l": 600, "m": 600, "n": 600, "o": 600, "p": 600, "q": 600, "r": 600, "s": 600, "t": 600, "u": 600, "v": 600, "w": 600, "x": 600, "y": 600, "z": 600, "{": 600, "|": 600, "}": 600, "~": 600, "\xa1": 600, "\xa2": 600, "\xa3": 600, "\xa4": 600, "\xa5": 600, "\xa6": 600, "\xa7": 600, "\xa8": 600, "\xa9": 600, "\xaa": 600, "\xab": 600, "\xac": 600, "\xae": 600, "\xaf": 600, "\xb0": 600, "\xb1": 600, "\xb2": 600, "\xb3": 600, "\xb4": 600, "\xb5": 600, "\xb6": 600, "\xb7": 600, "\xb8": 600, "\xb9": 600, "\xba": 600, "\xbb": 600, "\xbc": 600, "\xbd": 600, "\xbe": 600, "\xbf": 600, "\xc0": 600, "\xc1": 600, "\xc2": 600, "\xc3": 600, "\xc4": 600, "\xc5": 600, "\xc6": 600, "\xc7": 600, "\xc8": 600, "\xc9": 600, "\xca": 600, "\xcb": 600, "\xcc": 600, "\xcd": 600, "\xce": 600, "\xcf": 600, "\xd0": 600, "\xd1": 600, "\xd2": 600, "\xd3": 600, "\xd4": 600, "\xd5": 600, "\xd6": 600, "\xd7": 600, "\xd8": 600, "\xd9": 600, "\xda": 600, "\xdb": 600, "\xdc": 600, "\xdd": 600, "\xde": 600, "\xdf": 600, "\xe0": 600, "\xe1": 600, "\xe2": 600, "\xe3": 600, "\xe4": 600, "\xe5": 600, "\xe6": 600, "\xe7": 600, "\xe8": 600, "\xe9": 600, "\xea": 600, "\xeb": 600, "\xec": 600, "\xed": 600, "\xee": 600, "\xef": 600, "\xf0": 600, "\xf1": 600, "\xf2": 600, "\xf3": 600, "\xf4": 600, "\xf5": 600, "\xf6": 600, "\xf7": 600, "\xf8": 600, "\xf9": 600, "\xfa": 600, "\xfb": 600, "\xfc": 600, "\xfd": 600, "\xfe": 600, "\xff": 600, "\u0100": 600, "\u0101": 600, "\u0102": 600, "\u0103": 600, "\u0104": 600, "\u0105": 600, "\u0106": 600, "\u0107": 600, "\u010c": 600, "\u010d": 600, "\u010e": 600, "\u010f": 600, "\u0110": 600, "\u0111": 600, "\u0112": 600, "\u0113": 600, "\u0116": 600, "\u0117": 600, "\u0118": 600, "\u0119": 600, 
"\u011a": 600, "\u011b": 600, "\u011e": 600, "\u011f": 600, "\u0122": 600, "\u0123": 600, "\u012a": 600, "\u012b": 600, "\u012e": 600, "\u012f": 600, "\u0130": 600, "\u0131": 600, "\u0136": 600, "\u0137": 600, "\u0139": 600, "\u013a": 600, "\u013b": 600, "\u013c": 600, "\u013d": 600, "\u013e": 600, "\u0141": 600, "\u0142": 600, "\u0143": 600, "\u0144": 600, "\u0145": 600, "\u0146": 600, "\u0147": 600, "\u0148": 600, "\u014c": 600, "\u014d": 600, "\u0150": 600, "\u0151": 600, "\u0152": 600, "\u0153": 600, "\u0154": 600, "\u0155": 600, "\u0156": 600, "\u0157": 600, "\u0158": 600, "\u0159": 600, "\u015a": 600, "\u015b": 600, "\u015e": 600, "\u015f": 600, "\u0160": 600, "\u0161": 600, "\u0162": 600, "\u0163": 600, "\u0164": 600, "\u0165": 600, "\u016a": 600, "\u016b": 600, "\u016e": 600, "\u016f": 600, "\u0170": 600, "\u0171": 600, "\u0172": 600, "\u0173": 600, "\u0178": 600, "\u0179": 600, "\u017a": 600, "\u017b": 600, "\u017c": 600, "\u017d": 600, "\u017e": 600, "\u0192": 600, "\u0218": 600, "\u0219": 600, "\u02c6": 600, "\u02c7": 600, "\u02d8": 600, "\u02d9": 600, "\u02da": 600, "\u02db": 600, "\u02dc": 600, "\u02dd": 600, "\u2013": 600, "\u2014": 600, "\u2018": 600, "\u2019": 600, "\u201a": 600, "\u201c": 600, "\u201d": 600, "\u201e": 600, "\u2020": 600, "\u2021": 600, "\u2022": 600, "\u2026": 600, "\u2030": 600, "\u2039": 600, "\u203a": 600, "\u2044": 600, "\u2122": 600, "\u2202": 600, "\u2206": 600, "\u2211": 600, "\u2212": 600, "\u221a": 600, "\u2260": 600, "\u2264": 600, "\u2265": 600, "\u25ca": 600, "\uf6c3": 600, "\ufb01": 600, "\ufb02": 600, }, ), "Courier-BoldOblique": ( { "FontName": "Courier-BoldOblique", "Descent": -194.0, "FontBBox": (-49.0, -249.0, 758.0, 811.0), "FontWeight": "Bold", "CapHeight": 572.0, "FontFamily": "Courier", "Flags": 64, "XHeight": 434.0, "ItalicAngle": -11.0, "Ascent": 627.0, }, { " ": 600, "!": 600, '"': 600, "#": 600, "$": 600, "%": 600, "&": 600, "'": 600, "(": 600, ")": 600, "*": 600, "+": 600, ",": 600, "-": 600, ".": 600, 
"/": 600, "0": 600, "1": 600, "2": 600, "3": 600, "4": 600, "5": 600, "6": 600, "7": 600, "8": 600, "9": 600, ":": 600, ";": 600, "<": 600, "=": 600, ">": 600, "?": 600, "@": 600, "A": 600, "B": 600, "C": 600, "D": 600, "E": 600, "F": 600, "G": 600, "H": 600, "I": 600, "J": 600, "K": 600, "L": 600, "M": 600, "N": 600, "O": 600, "P": 600, "Q": 600, "R": 600, "S": 600, "T": 600, "U": 600, "V": 600, "W": 600, "X": 600, "Y": 600, "Z": 600, "[": 600, "\\": 600, "]": 600, "^": 600, "_": 600, "`": 600, "a": 600, "b": 600, "c": 600, "d": 600, "e": 600, "f": 600, "g": 600, "h": 600, "i": 600, "j": 600, "k": 600, "l": 600, "m": 600, "n": 600, "o": 600, "p": 600, "q": 600, "r": 600, "s": 600, "t": 600, "u": 600, "v": 600, "w": 600, "x": 600, "y": 600, "z": 600, "{": 600, "|": 600, "}": 600, "~": 600, "\xa1": 600, "\xa2": 600, "\xa3": 600, "\xa4": 600, "\xa5": 600, "\xa6": 600, "\xa7": 600, "\xa8": 600, "\xa9": 600, "\xaa": 600, "\xab": 600, "\xac": 600, "\xae": 600, "\xaf": 600, "\xb0": 600, "\xb1": 600, "\xb2": 600, "\xb3": 600, "\xb4": 600, "\xb5": 600, "\xb6": 600, "\xb7": 600, "\xb8": 600, "\xb9": 600, "\xba": 600, "\xbb": 600, "\xbc": 600, "\xbd": 600, "\xbe": 600, "\xbf": 600, "\xc0": 600, "\xc1": 600, "\xc2": 600, "\xc3": 600, "\xc4": 600, "\xc5": 600, "\xc6": 600, "\xc7": 600, "\xc8": 600, "\xc9": 600, "\xca": 600, "\xcb": 600, "\xcc": 600, "\xcd": 600, "\xce": 600, "\xcf": 600, "\xd0": 600, "\xd1": 600, "\xd2": 600, "\xd3": 600, "\xd4": 600, "\xd5": 600, "\xd6": 600, "\xd7": 600, "\xd8": 600, "\xd9": 600, "\xda": 600, "\xdb": 600, "\xdc": 600, "\xdd": 600, "\xde": 600, "\xdf": 600, "\xe0": 600, "\xe1": 600, "\xe2": 600, "\xe3": 600, "\xe4": 600, "\xe5": 600, "\xe6": 600, "\xe7": 600, "\xe8": 600, "\xe9": 600, "\xea": 600, "\xeb": 600, "\xec": 600, "\xed": 600, "\xee": 600, "\xef": 600, "\xf0": 600, "\xf1": 600, "\xf2": 600, "\xf3": 600, "\xf4": 600, "\xf5": 600, "\xf6": 600, "\xf7": 600, "\xf8": 600, "\xf9": 600, "\xfa": 600, "\xfb": 600, "\xfc": 600, "\xfd": 600, 
"\xfe": 600, "\xff": 600, "\u0100": 600, "\u0101": 600, "\u0102": 600, "\u0103": 600, "\u0104": 600, "\u0105": 600, "\u0106": 600, "\u0107": 600, "\u010c": 600, "\u010d": 600, "\u010e": 600, "\u010f": 600, "\u0110": 600, "\u0111": 600, "\u0112": 600, "\u0113": 600, "\u0116": 600, "\u0117": 600, "\u0118": 600, "\u0119": 600, "\u011a": 600, "\u011b": 600, "\u011e": 600, "\u011f": 600, "\u0122": 600, "\u0123": 600, "\u012a": 600, "\u012b": 600, "\u012e": 600, "\u012f": 600, "\u0130": 600, "\u0131": 600, "\u0136": 600, "\u0137": 600, "\u0139": 600, "\u013a": 600, "\u013b": 600, "\u013c": 600, "\u013d": 600, "\u013e": 600, "\u0141": 600, "\u0142": 600, "\u0143": 600, "\u0144": 600, "\u0145": 600, "\u0146": 600, "\u0147": 600, "\u0148": 600, "\u014c": 600, "\u014d": 600, "\u0150": 600, "\u0151": 600, "\u0152": 600, "\u0153": 600, "\u0154": 600, "\u0155": 600, "\u0156": 600, "\u0157": 600, "\u0158": 600, "\u0159": 600, "\u015a": 600, "\u015b": 600, "\u015e": 600, "\u015f": 600, "\u0160": 600, "\u0161": 600, "\u0162": 600, "\u0163": 600, "\u0164": 600, "\u0165": 600, "\u016a": 600, "\u016b": 600, "\u016e": 600, "\u016f": 600, "\u0170": 600, "\u0171": 600, "\u0172": 600, "\u0173": 600, "\u0178": 600, "\u0179": 600, "\u017a": 600, "\u017b": 600, "\u017c": 600, "\u017d": 600, "\u017e": 600, "\u0192": 600, "\u0218": 600, "\u0219": 600, "\u02c6": 600, "\u02c7": 600, "\u02d8": 600, "\u02d9": 600, "\u02da": 600, "\u02db": 600, "\u02dc": 600, "\u02dd": 600, "\u2013": 600, "\u2014": 600, "\u2018": 600, "\u2019": 600, "\u201a": 600, "\u201c": 600, "\u201d": 600, "\u201e": 600, "\u2020": 600, "\u2021": 600, "\u2022": 600, "\u2026": 600, "\u2030": 600, "\u2039": 600, "\u203a": 600, "\u2044": 600, "\u2122": 600, "\u2202": 600, "\u2206": 600, "\u2211": 600, "\u2212": 600, "\u221a": 600, "\u2260": 600, "\u2264": 600, "\u2265": 600, "\u25ca": 600, "\uf6c3": 600, "\ufb01": 600, "\ufb02": 600, }, ), "Courier-Oblique": ( { "FontName": "Courier-Oblique", "Descent": -194.0, "FontBBox": (-49.0, 
-249.0, 749.0, 803.0), "FontWeight": "Medium", "CapHeight": 572.0, "FontFamily": "Courier", "Flags": 64, "XHeight": 434.0, "ItalicAngle": -11.0, "Ascent": 627.0, }, { " ": 600, "!": 600, '"': 600, "#": 600, "$": 600, "%": 600, "&": 600, "'": 600, "(": 600, ")": 600, "*": 600, "+": 600, ",": 600, "-": 600, ".": 600, "/": 600, "0": 600, "1": 600, "2": 600, "3": 600, "4": 600, "5": 600, "6": 600, "7": 600, "8": 600, "9": 600, ":": 600, ";": 600, "<": 600, "=": 600, ">": 600, "?": 600, "@": 600, "A": 600, "B": 600, "C": 600, "D": 600, "E": 600, "F": 600, "G": 600, "H": 600, "I": 600, "J": 600, "K": 600, "L": 600, "M": 600, "N": 600, "O": 600, "P": 600, "Q": 600, "R": 600, "S": 600, "T": 600, "U": 600, "V": 600, "W": 600, "X": 600, "Y": 600, "Z": 600, "[": 600, "\\": 600, "]": 600, "^": 600, "_": 600, "`": 600, "a": 600, "b": 600, "c": 600, "d": 600, "e": 600, "f": 600, "g": 600, "h": 600, "i": 600, "j": 600, "k": 600, "l": 600, "m": 600, "n": 600, "o": 600, "p": 600, "q": 600, "r": 600, "s": 600, "t": 600, "u": 600, "v": 600, "w": 600, "x": 600, "y": 600, "z": 600, "{": 600, "|": 600, "}": 600, "~": 600, "\xa1": 600, "\xa2": 600, "\xa3": 600, "\xa4": 600, "\xa5": 600, "\xa6": 600, "\xa7": 600, "\xa8": 600, "\xa9": 600, "\xaa": 600, "\xab": 600, "\xac": 600, "\xae": 600, "\xaf": 600, "\xb0": 600, "\xb1": 600, "\xb2": 600, "\xb3": 600, "\xb4": 600, "\xb5": 600, "\xb6": 600, "\xb7": 600, "\xb8": 600, "\xb9": 600, "\xba": 600, "\xbb": 600, "\xbc": 600, "\xbd": 600, "\xbe": 600, "\xbf": 600, "\xc0": 600, "\xc1": 600, "\xc2": 600, "\xc3": 600, "\xc4": 600, "\xc5": 600, "\xc6": 600, "\xc7": 600, "\xc8": 600, "\xc9": 600, "\xca": 600, "\xcb": 600, "\xcc": 600, "\xcd": 600, "\xce": 600, "\xcf": 600, "\xd0": 600, "\xd1": 600, "\xd2": 600, "\xd3": 600, "\xd4": 600, "\xd5": 600, "\xd6": 600, "\xd7": 600, "\xd8": 600, "\xd9": 600, "\xda": 600, "\xdb": 600, "\xdc": 600, "\xdd": 600, "\xde": 600, "\xdf": 600, "\xe0": 600, "\xe1": 600, "\xe2": 600, "\xe3": 600, "\xe4": 600, "\xe5": 
600, "\xe6": 600, "\xe7": 600, "\xe8": 600, "\xe9": 600, "\xea": 600, "\xeb": 600, "\xec": 600, "\xed": 600, "\xee": 600, "\xef": 600, "\xf0": 600, "\xf1": 600, "\xf2": 600, "\xf3": 600, "\xf4": 600, "\xf5": 600, "\xf6": 600, "\xf7": 600, "\xf8": 600, "\xf9": 600, "\xfa": 600, "\xfb": 600, "\xfc": 600, "\xfd": 600, "\xfe": 600, "\xff": 600, "\u0100": 600, "\u0101": 600, "\u0102": 600, "\u0103": 600, "\u0104": 600, "\u0105": 600, "\u0106": 600, "\u0107": 600, "\u010c": 600, "\u010d": 600, "\u010e": 600, "\u010f": 600, "\u0110": 600, "\u0111": 600, "\u0112": 600, "\u0113": 600, "\u0116": 600, "\u0117": 600, "\u0118": 600, "\u0119": 600, "\u011a": 600, "\u011b": 600, "\u011e": 600, "\u011f": 600, "\u0122": 600, "\u0123": 600, "\u012a": 600, "\u012b": 600, "\u012e": 600, "\u012f": 600, "\u0130": 600, "\u0131": 600, "\u0136": 600, "\u0137": 600, "\u0139": 600, "\u013a": 600, "\u013b": 600, "\u013c": 600, "\u013d": 600, "\u013e": 600, "\u0141": 600, "\u0142": 600, "\u0143": 600, "\u0144": 600, "\u0145": 600, "\u0146": 600, "\u0147": 600, "\u0148": 600, "\u014c": 600, "\u014d": 600, "\u0150": 600, "\u0151": 600, "\u0152": 600, "\u0153": 600, "\u0154": 600, "\u0155": 600, "\u0156": 600, "\u0157": 600, "\u0158": 600, "\u0159": 600, "\u015a": 600, "\u015b": 600, "\u015e": 600, "\u015f": 600, "\u0160": 600, "\u0161": 600, "\u0162": 600, "\u0163": 600, "\u0164": 600, "\u0165": 600, "\u016a": 600, "\u016b": 600, "\u016e": 600, "\u016f": 600, "\u0170": 600, "\u0171": 600, "\u0172": 600, "\u0173": 600, "\u0178": 600, "\u0179": 600, "\u017a": 600, "\u017b": 600, "\u017c": 600, "\u017d": 600, "\u017e": 600, "\u0192": 600, "\u0218": 600, "\u0219": 600, "\u02c6": 600, "\u02c7": 600, "\u02d8": 600, "\u02d9": 600, "\u02da": 600, "\u02db": 600, "\u02dc": 600, "\u02dd": 600, "\u2013": 600, "\u2014": 600, "\u2018": 600, "\u2019": 600, "\u201a": 600, "\u201c": 600, "\u201d": 600, "\u201e": 600, "\u2020": 600, "\u2021": 600, "\u2022": 600, "\u2026": 600, "\u2030": 600, "\u2039": 600, 
"\u203a": 600, "\u2044": 600, "\u2122": 600, "\u2202": 600, "\u2206": 600, "\u2211": 600, "\u2212": 600, "\u221a": 600, "\u2260": 600, "\u2264": 600, "\u2265": 600, "\u25ca": 600, "\uf6c3": 600, "\ufb01": 600, "\ufb02": 600, }, ), "Helvetica": ( { "FontName": "Helvetica", "Descent": -207.0, "FontBBox": (-166.0, -225.0, 1000.0, 931.0), "FontWeight": "Medium", "CapHeight": 718.0, "FontFamily": "Helvetica", "Flags": 0, "XHeight": 523.0, "ItalicAngle": 0.0, "Ascent": 718.0, }, { " ": 278, "!": 278, '"': 355, "#": 556, "$": 556, "%": 889, "&": 667, "'": 191, "(": 333, ")": 333, "*": 389, "+": 584, ",": 278, "-": 333, ".": 278, "/": 278, "0": 556, "1": 556, "2": 556, "3": 556, "4": 556, "5": 556, "6": 556, "7": 556, "8": 556, "9": 556, ":": 278, ";": 278, "<": 584, "=": 584, ">": 584, "?": 556, "@": 1015, "A": 667, "B": 667, "C": 722, "D": 722, "E": 667, "F": 611, "G": 778, "H": 722, "I": 278, "J": 500, "K": 667, "L": 556, "M": 833, "N": 722, "O": 778, "P": 667, "Q": 778, "R": 722, "S": 667, "T": 611, "U": 722, "V": 667, "W": 944, "X": 667, "Y": 667, "Z": 611, "[": 278, "\\": 278, "]": 278, "^": 469, "_": 556, "`": 333, "a": 556, "b": 556, "c": 500, "d": 556, "e": 556, "f": 278, "g": 556, "h": 556, "i": 222, "j": 222, "k": 500, "l": 222, "m": 833, "n": 556, "o": 556, "p": 556, "q": 556, "r": 333, "s": 500, "t": 278, "u": 556, "v": 500, "w": 722, "x": 500, "y": 500, "z": 500, "{": 334, "|": 260, "}": 334, "~": 584, "\xa1": 333, "\xa2": 556, "\xa3": 556, "\xa4": 556, "\xa5": 556, "\xa6": 260, "\xa7": 556, "\xa8": 333, "\xa9": 737, "\xaa": 370, "\xab": 556, "\xac": 584, "\xae": 737, "\xaf": 333, "\xb0": 400, "\xb1": 584, "\xb2": 333, "\xb3": 333, "\xb4": 333, "\xb5": 556, "\xb6": 537, "\xb7": 278, "\xb8": 333, "\xb9": 333, "\xba": 365, "\xbb": 556, "\xbc": 834, "\xbd": 834, "\xbe": 834, "\xbf": 611, "\xc0": 667, "\xc1": 667, "\xc2": 667, "\xc3": 667, "\xc4": 667, "\xc5": 667, "\xc6": 1000, "\xc7": 722, "\xc8": 667, "\xc9": 667, "\xca": 667, "\xcb": 667, "\xcc": 278, "\xcd": 
278, "\xce": 278, "\xcf": 278, "\xd0": 722, "\xd1": 722, "\xd2": 778, "\xd3": 778, "\xd4": 778, "\xd5": 778, "\xd6": 778, "\xd7": 584, "\xd8": 778, "\xd9": 722, "\xda": 722, "\xdb": 722, "\xdc": 722, "\xdd": 667, "\xde": 667, "\xdf": 611, "\xe0": 556, "\xe1": 556, "\xe2": 556, "\xe3": 556, "\xe4": 556, "\xe5": 556, "\xe6": 889, "\xe7": 500, "\xe8": 556, "\xe9": 556, "\xea": 556, "\xeb": 556, "\xec": 278, "\xed": 278, "\xee": 278, "\xef": 278, "\xf0": 556, "\xf1": 556, "\xf2": 556, "\xf3": 556, "\xf4": 556, "\xf5": 556, "\xf6": 556, "\xf7": 584, "\xf8": 611, "\xf9": 556, "\xfa": 556, "\xfb": 556, "\xfc": 556, "\xfd": 500, "\xfe": 556, "\xff": 500, "\u0100": 667, "\u0101": 556, "\u0102": 667, "\u0103": 556, "\u0104": 667, "\u0105": 556, "\u0106": 722, "\u0107": 500, "\u010c": 722, "\u010d": 500, "\u010e": 722, "\u010f": 643, "\u0110": 722, "\u0111": 556, "\u0112": 667, "\u0113": 556, "\u0116": 667, "\u0117": 556, "\u0118": 667, "\u0119": 556, "\u011a": 667, "\u011b": 556, "\u011e": 778, "\u011f": 556, "\u0122": 778, "\u0123": 556, "\u012a": 278, "\u012b": 278, "\u012e": 278, "\u012f": 222, "\u0130": 278, "\u0131": 278, "\u0136": 667, "\u0137": 500, "\u0139": 556, "\u013a": 222, "\u013b": 556, "\u013c": 222, "\u013d": 556, "\u013e": 299, "\u0141": 556, "\u0142": 222, "\u0143": 722, "\u0144": 556, "\u0145": 722, "\u0146": 556, "\u0147": 722, "\u0148": 556, "\u014c": 778, "\u014d": 556, "\u0150": 778, "\u0151": 556, "\u0152": 1000, "\u0153": 944, "\u0154": 722, "\u0155": 333, "\u0156": 722, "\u0157": 333, "\u0158": 722, "\u0159": 333, "\u015a": 667, "\u015b": 500, "\u015e": 667, "\u015f": 500, "\u0160": 667, "\u0161": 500, "\u0162": 611, "\u0163": 278, "\u0164": 611, "\u0165": 317, "\u016a": 722, "\u016b": 556, "\u016e": 722, "\u016f": 556, "\u0170": 722, "\u0171": 556, "\u0172": 722, "\u0173": 556, "\u0178": 667, "\u0179": 611, "\u017a": 500, "\u017b": 611, "\u017c": 500, "\u017d": 611, "\u017e": 500, "\u0192": 556, "\u0218": 667, "\u0219": 500, "\u02c6": 333, 
"\u02c7": 333, "\u02d8": 333, "\u02d9": 333, "\u02da": 333, "\u02db": 333, "\u02dc": 333, "\u02dd": 333, "\u2013": 556, "\u2014": 1000, "\u2018": 222, "\u2019": 222, "\u201a": 222, "\u201c": 333, "\u201d": 333, "\u201e": 333, "\u2020": 556, "\u2021": 556, "\u2022": 350, "\u2026": 1000, "\u2030": 1000, "\u2039": 333, "\u203a": 333, "\u2044": 167, "\u2122": 1000, "\u2202": 476, "\u2206": 612, "\u2211": 600, "\u2212": 584, "\u221a": 453, "\u2260": 549, "\u2264": 549, "\u2265": 549, "\u25ca": 471, "\uf6c3": 250, "\ufb01": 500, "\ufb02": 500, }, ), "Helvetica-Bold": ( { "FontName": "Helvetica-Bold", "Descent": -207.0, "FontBBox": (-170.0, -228.0, 1003.0, 962.0), "FontWeight": "Bold", "CapHeight": 718.0, "FontFamily": "Helvetica", "Flags": 0, "XHeight": 532.0, "ItalicAngle": 0.0, "Ascent": 718.0, }, { " ": 278, "!": 333, '"': 474, "#": 556, "$": 556, "%": 889, "&": 722, "'": 238, "(": 333, ")": 333, "*": 389, "+": 584, ",": 278, "-": 333, ".": 278, "/": 278, "0": 556, "1": 556, "2": 556, "3": 556, "4": 556, "5": 556, "6": 556, "7": 556, "8": 556, "9": 556, ":": 333, ";": 333, "<": 584, "=": 584, ">": 584, "?": 611, "@": 975, "A": 722, "B": 722, "C": 722, "D": 722, "E": 667, "F": 611, "G": 778, "H": 722, "I": 278, "J": 556, "K": 722, "L": 611, "M": 833, "N": 722, "O": 778, "P": 667, "Q": 778, "R": 722, "S": 667, "T": 611, "U": 722, "V": 667, "W": 944, "X": 667, "Y": 667, "Z": 611, "[": 333, "\\": 278, "]": 333, "^": 584, "_": 556, "`": 333, "a": 556, "b": 611, "c": 556, "d": 611, "e": 556, "f": 333, "g": 611, "h": 611, "i": 278, "j": 278, "k": 556, "l": 278, "m": 889, "n": 611, "o": 611, "p": 611, "q": 611, "r": 389, "s": 556, "t": 333, "u": 611, "v": 556, "w": 778, "x": 556, "y": 556, "z": 500, "{": 389, "|": 280, "}": 389, "~": 584, "\xa1": 333, "\xa2": 556, "\xa3": 556, "\xa4": 556, "\xa5": 556, "\xa6": 280, "\xa7": 556, "\xa8": 333, "\xa9": 737, "\xaa": 370, "\xab": 556, "\xac": 584, "\xae": 737, "\xaf": 333, "\xb0": 400, "\xb1": 584, "\xb2": 333, "\xb3": 333, "\xb4": 
333, "\xb5": 611, "\xb6": 556, "\xb7": 278, "\xb8": 333, "\xb9": 333, "\xba": 365, "\xbb": 556, "\xbc": 834, "\xbd": 834, "\xbe": 834, "\xbf": 611, "\xc0": 722, "\xc1": 722, "\xc2": 722, "\xc3": 722, "\xc4": 722, "\xc5": 722, "\xc6": 1000, "\xc7": 722, "\xc8": 667, "\xc9": 667, "\xca": 667, "\xcb": 667, "\xcc": 278, "\xcd": 278, "\xce": 278, "\xcf": 278, "\xd0": 722, "\xd1": 722, "\xd2": 778, "\xd3": 778, "\xd4": 778, "\xd5": 778, "\xd6": 778, "\xd7": 584, "\xd8": 778, "\xd9": 722, "\xda": 722, "\xdb": 722, "\xdc": 722, "\xdd": 667, "\xde": 667, "\xdf": 611, "\xe0": 556, "\xe1": 556, "\xe2": 556, "\xe3": 556, "\xe4": 556, "\xe5": 556, "\xe6": 889, "\xe7": 556, "\xe8": 556, "\xe9": 556, "\xea": 556, "\xeb": 556, "\xec": 278, "\xed": 278, "\xee": 278, "\xef": 278, "\xf0": 611, "\xf1": 611, "\xf2": 611, "\xf3": 611, "\xf4": 611, "\xf5": 611, "\xf6": 611, "\xf7": 584, "\xf8": 611, "\xf9": 611, "\xfa": 611, "\xfb": 611, "\xfc": 611, "\xfd": 556, "\xfe": 611, "\xff": 556, "\u0100": 722, "\u0101": 556, "\u0102": 722, "\u0103": 556, "\u0104": 722, "\u0105": 556, "\u0106": 722, "\u0107": 556, "\u010c": 722, "\u010d": 556, "\u010e": 722, "\u010f": 743, "\u0110": 722, "\u0111": 611, "\u0112": 667, "\u0113": 556, "\u0116": 667, "\u0117": 556, "\u0118": 667, "\u0119": 556, "\u011a": 667, "\u011b": 556, "\u011e": 778, "\u011f": 611, "\u0122": 778, "\u0123": 611, "\u012a": 278, "\u012b": 278, "\u012e": 278, "\u012f": 278, "\u0130": 278, "\u0131": 278, "\u0136": 722, "\u0137": 556, "\u0139": 611, "\u013a": 278, "\u013b": 611, "\u013c": 278, "\u013d": 611, "\u013e": 400, "\u0141": 611, "\u0142": 278, "\u0143": 722, "\u0144": 611, "\u0145": 722, "\u0146": 611, "\u0147": 722, "\u0148": 611, "\u014c": 778, "\u014d": 611, "\u0150": 778, "\u0151": 611, "\u0152": 1000, "\u0153": 944, "\u0154": 722, "\u0155": 389, "\u0156": 722, "\u0157": 389, "\u0158": 722, "\u0159": 389, "\u015a": 667, "\u015b": 556, "\u015e": 667, "\u015f": 556, "\u0160": 667, "\u0161": 556, "\u0162": 611, "\u0163": 
333, "\u0164": 611, "\u0165": 389, "\u016a": 722, "\u016b": 611, "\u016e": 722, "\u016f": 611, "\u0170": 722, "\u0171": 611, "\u0172": 722, "\u0173": 611, "\u0178": 667, "\u0179": 611, "\u017a": 500, "\u017b": 611, "\u017c": 500, "\u017d": 611, "\u017e": 500, "\u0192": 556, "\u0218": 667, "\u0219": 556, "\u02c6": 333, "\u02c7": 333, "\u02d8": 333, "\u02d9": 333, "\u02da": 333, "\u02db": 333, "\u02dc": 333, "\u02dd": 333, "\u2013": 556, "\u2014": 1000, "\u2018": 278, "\u2019": 278, "\u201a": 278, "\u201c": 500, "\u201d": 500, "\u201e": 500, "\u2020": 556, "\u2021": 556, "\u2022": 350, "\u2026": 1000, "\u2030": 1000, "\u2039": 333, "\u203a": 333, "\u2044": 167, "\u2122": 1000, "\u2202": 494, "\u2206": 612, "\u2211": 600, "\u2212": 584, "\u221a": 549, "\u2260": 549, "\u2264": 549, "\u2265": 549, "\u25ca": 494, "\uf6c3": 250, "\ufb01": 611, "\ufb02": 611, }, ), "Helvetica-BoldOblique": ( { "FontName": "Helvetica-BoldOblique", "Descent": -207.0, "FontBBox": (-175.0, -228.0, 1114.0, 962.0), "FontWeight": "Bold", "CapHeight": 718.0, "FontFamily": "Helvetica", "Flags": 0, "XHeight": 532.0, "ItalicAngle": -12.0, "Ascent": 718.0, }, { " ": 278, "!": 333, '"': 474, "#": 556, "$": 556, "%": 889, "&": 722, "'": 238, "(": 333, ")": 333, "*": 389, "+": 584, ",": 278, "-": 333, ".": 278, "/": 278, "0": 556, "1": 556, "2": 556, "3": 556, "4": 556, "5": 556, "6": 556, "7": 556, "8": 556, "9": 556, ":": 333, ";": 333, "<": 584, "=": 584, ">": 584, "?": 611, "@": 975, "A": 722, "B": 722, "C": 722, "D": 722, "E": 667, "F": 611, "G": 778, "H": 722, "I": 278, "J": 556, "K": 722, "L": 611, "M": 833, "N": 722, "O": 778, "P": 667, "Q": 778, "R": 722, "S": 667, "T": 611, "U": 722, "V": 667, "W": 944, "X": 667, "Y": 667, "Z": 611, "[": 333, "\\": 278, "]": 333, "^": 584, "_": 556, "`": 333, "a": 556, "b": 611, "c": 556, "d": 611, "e": 556, "f": 333, "g": 611, "h": 611, "i": 278, "j": 278, "k": 556, "l": 278, "m": 889, "n": 611, "o": 611, "p": 611, "q": 611, "r": 389, "s": 556, "t": 333, "u": 
611, "v": 556, "w": 778, "x": 556, "y": 556, "z": 500, "{": 389, "|": 280, "}": 389, "~": 584, "\xa1": 333, "\xa2": 556, "\xa3": 556, "\xa4": 556, "\xa5": 556, "\xa6": 280, "\xa7": 556, "\xa8": 333, "\xa9": 737, "\xaa": 370, "\xab": 556, "\xac": 584, "\xae": 737, "\xaf": 333, "\xb0": 400, "\xb1": 584, "\xb2": 333, "\xb3": 333, "\xb4": 333, "\xb5": 611, "\xb6": 556, "\xb7": 278, "\xb8": 333, "\xb9": 333, "\xba": 365, "\xbb": 556, "\xbc": 834, "\xbd": 834, "\xbe": 834, "\xbf": 611, "\xc0": 722, "\xc1": 722, "\xc2": 722, "\xc3": 722, "\xc4": 722, "\xc5": 722, "\xc6": 1000, "\xc7": 722, "\xc8": 667, "\xc9": 667, "\xca": 667, "\xcb": 667, "\xcc": 278, "\xcd": 278, "\xce": 278, "\xcf": 278, "\xd0": 722, "\xd1": 722, "\xd2": 778, "\xd3": 778, "\xd4": 778, "\xd5": 778, "\xd6": 778, "\xd7": 584, "\xd8": 778, "\xd9": 722, "\xda": 722, "\xdb": 722, "\xdc": 722, "\xdd": 667, "\xde": 667, "\xdf": 611, "\xe0": 556, "\xe1": 556, "\xe2": 556, "\xe3": 556, "\xe4": 556, "\xe5": 556, "\xe6": 889, "\xe7": 556, "\xe8": 556, "\xe9": 556, "\xea": 556, "\xeb": 556, "\xec": 278, "\xed": 278, "\xee": 278, "\xef": 278, "\xf0": 611, "\xf1": 611, "\xf2": 611, "\xf3": 611, "\xf4": 611, "\xf5": 611, "\xf6": 611, "\xf7": 584, "\xf8": 611, "\xf9": 611, "\xfa": 611, "\xfb": 611, "\xfc": 611, "\xfd": 556, "\xfe": 611, "\xff": 556, "\u0100": 722, "\u0101": 556, "\u0102": 722, "\u0103": 556, "\u0104": 722, "\u0105": 556, "\u0106": 722, "\u0107": 556, "\u010c": 722, "\u010d": 556, "\u010e": 722, "\u010f": 743, "\u0110": 722, "\u0111": 611, "\u0112": 667, "\u0113": 556, "\u0116": 667, "\u0117": 556, "\u0118": 667, "\u0119": 556, "\u011a": 667, "\u011b": 556, "\u011e": 778, "\u011f": 611, "\u0122": 778, "\u0123": 611, "\u012a": 278, "\u012b": 278, "\u012e": 278, "\u012f": 278, "\u0130": 278, "\u0131": 278, "\u0136": 722, "\u0137": 556, "\u0139": 611, "\u013a": 278, "\u013b": 611, "\u013c": 278, "\u013d": 611, "\u013e": 400, "\u0141": 611, "\u0142": 278, "\u0143": 722, "\u0144": 611, "\u0145": 722, 
"\u0146": 611, "\u0147": 722, "\u0148": 611, "\u014c": 778, "\u014d": 611, "\u0150": 778, "\u0151": 611, "\u0152": 1000, "\u0153": 944, "\u0154": 722, "\u0155": 389, "\u0156": 722, "\u0157": 389, "\u0158": 722, "\u0159": 389, "\u015a": 667, "\u015b": 556, "\u015e": 667, "\u015f": 556, "\u0160": 667, "\u0161": 556, "\u0162": 611, "\u0163": 333, "\u0164": 611, "\u0165": 389, "\u016a": 722, "\u016b": 611, "\u016e": 722, "\u016f": 611, "\u0170": 722, "\u0171": 611, "\u0172": 722, "\u0173": 611, "\u0178": 667, "\u0179": 611, "\u017a": 500, "\u017b": 611, "\u017c": 500, "\u017d": 611, "\u017e": 500, "\u0192": 556, "\u0218": 667, "\u0219": 556, "\u02c6": 333, "\u02c7": 333, "\u02d8": 333, "\u02d9": 333, "\u02da": 333, "\u02db": 333, "\u02dc": 333, "\u02dd": 333, "\u2013": 556, "\u2014": 1000, "\u2018": 278, "\u2019": 278, "\u201a": 278, "\u201c": 500, "\u201d": 500, "\u201e": 500, "\u2020": 556, "\u2021": 556, "\u2022": 350, "\u2026": 1000, "\u2030": 1000, "\u2039": 333, "\u203a": 333, "\u2044": 167, "\u2122": 1000, "\u2202": 494, "\u2206": 612, "\u2211": 600, "\u2212": 584, "\u221a": 549, "\u2260": 549, "\u2264": 549, "\u2265": 549, "\u25ca": 494, "\uf6c3": 250, "\ufb01": 611, "\ufb02": 611, }, ), "Helvetica-Oblique": ( { "FontName": "Helvetica-Oblique", "Descent": -207.0, "FontBBox": (-171.0, -225.0, 1116.0, 931.0), "FontWeight": "Medium", "CapHeight": 718.0, "FontFamily": "Helvetica", "Flags": 0, "XHeight": 523.0, "ItalicAngle": -12.0, "Ascent": 718.0, }, { " ": 278, "!": 278, '"': 355, "#": 556, "$": 556, "%": 889, "&": 667, "'": 191, "(": 333, ")": 333, "*": 389, "+": 584, ",": 278, "-": 333, ".": 278, "/": 278, "0": 556, "1": 556, "2": 556, "3": 556, "4": 556, "5": 556, "6": 556, "7": 556, "8": 556, "9": 556, ":": 278, ";": 278, "<": 584, "=": 584, ">": 584, "?": 556, "@": 1015, "A": 667, "B": 667, "C": 722, "D": 722, "E": 667, "F": 611, "G": 778, "H": 722, "I": 278, "J": 500, "K": 667, "L": 556, "M": 833, "N": 722, "O": 778, "P": 667, "Q": 778, "R": 722, "S": 667, 
"T": 611, "U": 722, "V": 667, "W": 944, "X": 667, "Y": 667, "Z": 611, "[": 278, "\\": 278, "]": 278, "^": 469, "_": 556, "`": 333, "a": 556, "b": 556, "c": 500, "d": 556, "e": 556, "f": 278, "g": 556, "h": 556, "i": 222, "j": 222, "k": 500, "l": 222, "m": 833, "n": 556, "o": 556, "p": 556, "q": 556, "r": 333, "s": 500, "t": 278, "u": 556, "v": 500, "w": 722, "x": 500, "y": 500, "z": 500, "{": 334, "|": 260, "}": 334, "~": 584, "\xa1": 333, "\xa2": 556, "\xa3": 556, "\xa4": 556, "\xa5": 556, "\xa6": 260, "\xa7": 556, "\xa8": 333, "\xa9": 737, "\xaa": 370, "\xab": 556, "\xac": 584, "\xae": 737, "\xaf": 333, "\xb0": 400, "\xb1": 584, "\xb2": 333, "\xb3": 333, "\xb4": 333, "\xb5": 556, "\xb6": 537, "\xb7": 278, "\xb8": 333, "\xb9": 333, "\xba": 365, "\xbb": 556, "\xbc": 834, "\xbd": 834, "\xbe": 834, "\xbf": 611, "\xc0": 667, "\xc1": 667, "\xc2": 667, "\xc3": 667, "\xc4": 667, "\xc5": 667, "\xc6": 1000, "\xc7": 722, "\xc8": 667, "\xc9": 667, "\xca": 667, "\xcb": 667, "\xcc": 278, "\xcd": 278, "\xce": 278, "\xcf": 278, "\xd0": 722, "\xd1": 722, "\xd2": 778, "\xd3": 778, "\xd4": 778, "\xd5": 778, "\xd6": 778, "\xd7": 584, "\xd8": 778, "\xd9": 722, "\xda": 722, "\xdb": 722, "\xdc": 722, "\xdd": 667, "\xde": 667, "\xdf": 611, "\xe0": 556, "\xe1": 556, "\xe2": 556, "\xe3": 556, "\xe4": 556, "\xe5": 556, "\xe6": 889, "\xe7": 500, "\xe8": 556, "\xe9": 556, "\xea": 556, "\xeb": 556, "\xec": 278, "\xed": 278, "\xee": 278, "\xef": 278, "\xf0": 556, "\xf1": 556, "\xf2": 556, "\xf3": 556, "\xf4": 556, "\xf5": 556, "\xf6": 556, "\xf7": 584, "\xf8": 611, "\xf9": 556, "\xfa": 556, "\xfb": 556, "\xfc": 556, "\xfd": 500, "\xfe": 556, "\xff": 500, "\u0100": 667, "\u0101": 556, "\u0102": 667, "\u0103": 556, "\u0104": 667, "\u0105": 556, "\u0106": 722, "\u0107": 500, "\u010c": 722, "\u010d": 500, "\u010e": 722, "\u010f": 643, "\u0110": 722, "\u0111": 556, "\u0112": 667, "\u0113": 556, "\u0116": 667, "\u0117": 556, "\u0118": 667, "\u0119": 556, "\u011a": 667, "\u011b": 556, "\u011e": 778, 
"\u011f": 556, "\u0122": 778, "\u0123": 556, "\u012a": 278, "\u012b": 278, "\u012e": 278, "\u012f": 222, "\u0130": 278, "\u0131": 278, "\u0136": 667, "\u0137": 500, "\u0139": 556, "\u013a": 222, "\u013b": 556, "\u013c": 222, "\u013d": 556, "\u013e": 299, "\u0141": 556, "\u0142": 222, "\u0143": 722, "\u0144": 556, "\u0145": 722, "\u0146": 556, "\u0147": 722, "\u0148": 556, "\u014c": 778, "\u014d": 556, "\u0150": 778, "\u0151": 556, "\u0152": 1000, "\u0153": 944, "\u0154": 722, "\u0155": 333, "\u0156": 722, "\u0157": 333, "\u0158": 722, "\u0159": 333, "\u015a": 667, "\u015b": 500, "\u015e": 667, "\u015f": 500, "\u0160": 667, "\u0161": 500, "\u0162": 611, "\u0163": 278, "\u0164": 611, "\u0165": 317, "\u016a": 722, "\u016b": 556, "\u016e": 722, "\u016f": 556, "\u0170": 722, "\u0171": 556, "\u0172": 722, "\u0173": 556, "\u0178": 667, "\u0179": 611, "\u017a": 500, "\u017b": 611, "\u017c": 500, "\u017d": 611, "\u017e": 500, "\u0192": 556, "\u0218": 667, "\u0219": 500, "\u02c6": 333, "\u02c7": 333, "\u02d8": 333, "\u02d9": 333, "\u02da": 333, "\u02db": 333, "\u02dc": 333, "\u02dd": 333, "\u2013": 556, "\u2014": 1000, "\u2018": 222, "\u2019": 222, "\u201a": 222, "\u201c": 333, "\u201d": 333, "\u201e": 333, "\u2020": 556, "\u2021": 556, "\u2022": 350, "\u2026": 1000, "\u2030": 1000, "\u2039": 333, "\u203a": 333, "\u2044": 167, "\u2122": 1000, "\u2202": 476, "\u2206": 612, "\u2211": 600, "\u2212": 584, "\u221a": 453, "\u2260": 549, "\u2264": 549, "\u2265": 549, "\u25ca": 471, "\uf6c3": 250, "\ufb01": 500, "\ufb02": 500, }, ), "Symbol": ( { "FontName": "Symbol", "FontBBox": (-180.0, -293.0, 1090.0, 1010.0), "FontWeight": "Medium", "FontFamily": "Symbol", "Flags": 0, "ItalicAngle": 0.0, }, { " ": 250, "!": 333, "#": 500, "%": 833, "&": 778, "(": 333, ")": 333, "+": 549, ",": 250, ".": 250, "/": 278, "0": 500, "1": 500, "2": 500, "3": 500, "4": 500, "5": 500, "6": 500, "7": 500, "8": 500, "9": 500, ":": 278, ";": 278, "<": 549, "=": 549, ">": 549, "?": 444, "[": 333, "]": 333, 
"_": 500, "{": 480, "|": 200, "}": 480, "\xac": 713, "\xb0": 400, "\xb1": 549, "\xb5": 576, "\xd7": 549, "\xf7": 549, "\u0192": 500, "\u0391": 722, "\u0392": 667, "\u0393": 603, "\u0395": 611, "\u0396": 611, "\u0397": 722, "\u0398": 741, "\u0399": 333, "\u039a": 722, "\u039b": 686, "\u039c": 889, "\u039d": 722, "\u039e": 645, "\u039f": 722, "\u03a0": 768, "\u03a1": 556, "\u03a3": 592, "\u03a4": 611, "\u03a5": 690, "\u03a6": 763, "\u03a7": 722, "\u03a8": 795, "\u03b1": 631, "\u03b2": 549, "\u03b3": 411, "\u03b4": 494, "\u03b5": 439, "\u03b6": 494, "\u03b7": 603, "\u03b8": 521, "\u03b9": 329, "\u03ba": 549, "\u03bb": 549, "\u03bd": 521, "\u03be": 493, "\u03bf": 549, "\u03c0": 549, "\u03c1": 549, "\u03c2": 439, "\u03c3": 603, "\u03c4": 439, "\u03c5": 576, "\u03c6": 521, "\u03c7": 549, "\u03c8": 686, "\u03c9": 686, "\u03d1": 631, "\u03d2": 620, "\u03d5": 603, "\u03d6": 713, "\u2022": 460, "\u2026": 1000, "\u2032": 247, "\u2033": 411, "\u2044": 167, "\u20ac": 750, "\u2111": 686, "\u2118": 987, "\u211c": 795, "\u2126": 768, "\u2135": 823, "\u2190": 987, "\u2191": 603, "\u2192": 987, "\u2193": 603, "\u2194": 1042, "\u21b5": 658, "\u21d0": 987, "\u21d1": 603, "\u21d2": 987, "\u21d3": 603, "\u21d4": 1042, "\u2200": 713, "\u2202": 494, "\u2203": 549, "\u2205": 823, "\u2206": 612, "\u2207": 713, "\u2208": 713, "\u2209": 713, "\u220b": 439, "\u220f": 823, "\u2211": 713, "\u2212": 549, "\u2217": 500, "\u221a": 549, "\u221d": 713, "\u221e": 713, "\u2220": 768, "\u2227": 603, "\u2228": 603, "\u2229": 768, "\u222a": 768, "\u222b": 274, "\u2234": 863, "\u223c": 549, "\u2245": 549, "\u2248": 549, "\u2260": 549, "\u2261": 549, "\u2264": 549, "\u2265": 549, "\u2282": 713, "\u2283": 713, "\u2284": 713, "\u2286": 713, "\u2287": 713, "\u2295": 768, "\u2297": 768, "\u22a5": 658, "\u22c5": 250, "\u2320": 686, "\u2321": 686, "\u2329": 329, "\u232a": 329, "\u25ca": 494, "\u2660": 753, "\u2663": 753, "\u2665": 753, "\u2666": 753, "\uf6d9": 790, "\uf6da": 790, "\uf6db": 890, "\uf8e5": 500, 
"\uf8e6": 603, "\uf8e7": 1000, "\uf8e8": 790, "\uf8e9": 790, "\uf8ea": 786, "\uf8eb": 384, "\uf8ec": 384, "\uf8ed": 384, "\uf8ee": 384, "\uf8ef": 384, "\uf8f0": 384, "\uf8f1": 494, "\uf8f2": 494, "\uf8f3": 494, "\uf8f4": 494, "\uf8f5": 686, "\uf8f6": 384, "\uf8f7": 384, "\uf8f8": 384, "\uf8f9": 384, "\uf8fa": 384, "\uf8fb": 384, "\uf8fc": 494, "\uf8fd": 494, "\uf8fe": 494, "\uf8ff": 790, }, ), "Times-Bold": ( { "FontName": "Times-Bold", "Descent": -217.0, "FontBBox": (-168.0, -218.0, 1000.0, 935.0), "FontWeight": "Bold", "CapHeight": 676.0, "FontFamily": "Times", "Flags": 0, "XHeight": 461.0, "ItalicAngle": 0.0, "Ascent": 683.0, }, { " ": 250, "!": 333, '"': 555, "#": 500, "$": 500, "%": 1000, "&": 833, "'": 278, "(": 333, ")": 333, "*": 500, "+": 570, ",": 250, "-": 333, ".": 250, "/": 278, "0": 500, "1": 500, "2": 500, "3": 500, "4": 500, "5": 500, "6": 500, "7": 500, "8": 500, "9": 500, ":": 333, ";": 333, "<": 570, "=": 570, ">": 570, "?": 500, "@": 930, "A": 722, "B": 667, "C": 722, "D": 722, "E": 667, "F": 611, "G": 778, "H": 778, "I": 389, "J": 500, "K": 778, "L": 667, "M": 944, "N": 722, "O": 778, "P": 611, "Q": 778, "R": 722, "S": 556, "T": 667, "U": 722, "V": 722, "W": 1000, "X": 722, "Y": 722, "Z": 667, "[": 333, "\\": 278, "]": 333, "^": 581, "_": 500, "`": 333, "a": 500, "b": 556, "c": 444, "d": 556, "e": 444, "f": 333, "g": 500, "h": 556, "i": 278, "j": 333, "k": 556, "l": 278, "m": 833, "n": 556, "o": 500, "p": 556, "q": 556, "r": 444, "s": 389, "t": 333, "u": 556, "v": 500, "w": 722, "x": 500, "y": 500, "z": 444, "{": 394, "|": 220, "}": 394, "~": 520, "\xa1": 333, "\xa2": 500, "\xa3": 500, "\xa4": 500, "\xa5": 500, "\xa6": 220, "\xa7": 500, "\xa8": 333, "\xa9": 747, "\xaa": 300, "\xab": 500, "\xac": 570, "\xae": 747, "\xaf": 333, "\xb0": 400, "\xb1": 570, "\xb2": 300, "\xb3": 300, "\xb4": 333, "\xb5": 556, "\xb6": 540, "\xb7": 250, "\xb8": 333, "\xb9": 300, "\xba": 330, "\xbb": 500, "\xbc": 750, "\xbd": 750, "\xbe": 750, "\xbf": 500, "\xc0": 722, 
"\xc1": 722, "\xc2": 722, "\xc3": 722, "\xc4": 722, "\xc5": 722, "\xc6": 1000, "\xc7": 722, "\xc8": 667, "\xc9": 667, "\xca": 667, "\xcb": 667, "\xcc": 389, "\xcd": 389, "\xce": 389, "\xcf": 389, "\xd0": 722, "\xd1": 722, "\xd2": 778, "\xd3": 778, "\xd4": 778, "\xd5": 778, "\xd6": 778, "\xd7": 570, "\xd8": 778, "\xd9": 722, "\xda": 722, "\xdb": 722, "\xdc": 722, "\xdd": 722, "\xde": 611, "\xdf": 556, "\xe0": 500, "\xe1": 500, "\xe2": 500, "\xe3": 500, "\xe4": 500, "\xe5": 500, "\xe6": 722, "\xe7": 444, "\xe8": 444, "\xe9": 444, "\xea": 444, "\xeb": 444, "\xec": 278, "\xed": 278, "\xee": 278, "\xef": 278, "\xf0": 500, "\xf1": 556, "\xf2": 500, "\xf3": 500, "\xf4": 500, "\xf5": 500, "\xf6": 500, "\xf7": 570, "\xf8": 500, "\xf9": 556, "\xfa": 556, "\xfb": 556, "\xfc": 556, "\xfd": 500, "\xfe": 556, "\xff": 500, "\u0100": 722, "\u0101": 500, "\u0102": 722, "\u0103": 500, "\u0104": 722, "\u0105": 500, "\u0106": 722, "\u0107": 444, "\u010c": 722, "\u010d": 444, "\u010e": 722, "\u010f": 672, "\u0110": 722, "\u0111": 556, "\u0112": 667, "\u0113": 444, "\u0116": 667, "\u0117": 444, "\u0118": 667, "\u0119": 444, "\u011a": 667, "\u011b": 444, "\u011e": 778, "\u011f": 500, "\u0122": 778, "\u0123": 500, "\u012a": 389, "\u012b": 278, "\u012e": 389, "\u012f": 278, "\u0130": 389, "\u0131": 278, "\u0136": 778, "\u0137": 556, "\u0139": 667, "\u013a": 278, "\u013b": 667, "\u013c": 278, "\u013d": 667, "\u013e": 394, "\u0141": 667, "\u0142": 278, "\u0143": 722, "\u0144": 556, "\u0145": 722, "\u0146": 556, "\u0147": 722, "\u0148": 556, "\u014c": 778, "\u014d": 500, "\u0150": 778, "\u0151": 500, "\u0152": 1000, "\u0153": 722, "\u0154": 722, "\u0155": 444, "\u0156": 722, "\u0157": 444, "\u0158": 722, "\u0159": 444, "\u015a": 556, "\u015b": 389, "\u015e": 556, "\u015f": 389, "\u0160": 556, "\u0161": 389, "\u0162": 667, "\u0163": 333, "\u0164": 667, "\u0165": 416, "\u016a": 722, "\u016b": 556, "\u016e": 722, "\u016f": 556, "\u0170": 722, "\u0171": 556, "\u0172": 722, "\u0173": 556, 
"\u0178": 722, "\u0179": 667, "\u017a": 444, "\u017b": 667, "\u017c": 444, "\u017d": 667, "\u017e": 444, "\u0192": 500, "\u0218": 556, "\u0219": 389, "\u02c6": 333, "\u02c7": 333, "\u02d8": 333, "\u02d9": 333, "\u02da": 333, "\u02db": 333, "\u02dc": 333, "\u02dd": 333, "\u2013": 500, "\u2014": 1000, "\u2018": 333, "\u2019": 333, "\u201a": 333, "\u201c": 500, "\u201d": 500, "\u201e": 500, "\u2020": 500, "\u2021": 500, "\u2022": 350, "\u2026": 1000, "\u2030": 1000, "\u2039": 333, "\u203a": 333, "\u2044": 167, "\u2122": 1000, "\u2202": 494, "\u2206": 612, "\u2211": 600, "\u2212": 570, "\u221a": 549, "\u2260": 549, "\u2264": 549, "\u2265": 549, "\u25ca": 494, "\uf6c3": 250, "\ufb01": 556, "\ufb02": 556, }, ), "Times-BoldItalic": ( { "FontName": "Times-BoldItalic", "Descent": -217.0, "FontBBox": (-200.0, -218.0, 996.0, 921.0), "FontWeight": "Bold", "CapHeight": 669.0, "FontFamily": "Times", "Flags": 0, "XHeight": 462.0, "ItalicAngle": -15.0, "Ascent": 683.0, }, { " ": 250, "!": 389, '"': 555, "#": 500, "$": 500, "%": 833, "&": 778, "'": 278, "(": 333, ")": 333, "*": 500, "+": 570, ",": 250, "-": 333, ".": 250, "/": 278, "0": 500, "1": 500, "2": 500, "3": 500, "4": 500, "5": 500, "6": 500, "7": 500, "8": 500, "9": 500, ":": 333, ";": 333, "<": 570, "=": 570, ">": 570, "?": 500, "@": 832, "A": 667, "B": 667, "C": 667, "D": 722, "E": 667, "F": 667, "G": 722, "H": 778, "I": 389, "J": 500, "K": 667, "L": 611, "M": 889, "N": 722, "O": 722, "P": 611, "Q": 722, "R": 667, "S": 556, "T": 611, "U": 722, "V": 667, "W": 889, "X": 667, "Y": 611, "Z": 611, "[": 333, "\\": 278, "]": 333, "^": 570, "_": 500, "`": 333, "a": 500, "b": 500, "c": 444, "d": 500, "e": 444, "f": 333, "g": 500, "h": 556, "i": 278, "j": 278, "k": 500, "l": 278, "m": 778, "n": 556, "o": 500, "p": 500, "q": 500, "r": 389, "s": 389, "t": 278, "u": 556, "v": 444, "w": 667, "x": 500, "y": 444, "z": 389, "{": 348, "|": 220, "}": 348, "~": 570, "\xa1": 389, "\xa2": 500, "\xa3": 500, "\xa4": 500, "\xa5": 500, "\xa6": 
220, "\xa7": 500, "\xa8": 333, "\xa9": 747, "\xaa": 266, "\xab": 500, "\xac": 606, "\xae": 747, "\xaf": 333, "\xb0": 400, "\xb1": 570, "\xb2": 300, "\xb3": 300, "\xb4": 333, "\xb5": 576, "\xb6": 500, "\xb7": 250, "\xb8": 333, "\xb9": 300, "\xba": 300, "\xbb": 500, "\xbc": 750, "\xbd": 750, "\xbe": 750, "\xbf": 500, "\xc0": 667, "\xc1": 667, "\xc2": 667, "\xc3": 667, "\xc4": 667, "\xc5": 667, "\xc6": 944, "\xc7": 667, "\xc8": 667, "\xc9": 667, "\xca": 667, "\xcb": 667, "\xcc": 389, "\xcd": 389, "\xce": 389, "\xcf": 389, "\xd0": 722, "\xd1": 722, "\xd2": 722, "\xd3": 722, "\xd4": 722, "\xd5": 722, "\xd6": 722, "\xd7": 570, "\xd8": 722, "\xd9": 722, "\xda": 722, "\xdb": 722, "\xdc": 722, "\xdd": 611, "\xde": 611, "\xdf": 500, "\xe0": 500, "\xe1": 500, "\xe2": 500, "\xe3": 500, "\xe4": 500, "\xe5": 500, "\xe6": 722, "\xe7": 444, "\xe8": 444, "\xe9": 444, "\xea": 444, "\xeb": 444, "\xec": 278, "\xed": 278, "\xee": 278, "\xef": 278, "\xf0": 500, "\xf1": 556, "\xf2": 500, "\xf3": 500, "\xf4": 500, "\xf5": 500, "\xf6": 500, "\xf7": 570, "\xf8": 500, "\xf9": 556, "\xfa": 556, "\xfb": 556, "\xfc": 556, "\xfd": 444, "\xfe": 500, "\xff": 444, "\u0100": 667, "\u0101": 500, "\u0102": 667, "\u0103": 500, "\u0104": 667, "\u0105": 500, "\u0106": 667, "\u0107": 444, "\u010c": 667, "\u010d": 444, "\u010e": 722, "\u010f": 608, "\u0110": 722, "\u0111": 500, "\u0112": 667, "\u0113": 444, "\u0116": 667, "\u0117": 444, "\u0118": 667, "\u0119": 444, "\u011a": 667, "\u011b": 444, "\u011e": 722, "\u011f": 500, "\u0122": 722, "\u0123": 500, "\u012a": 389, "\u012b": 278, "\u012e": 389, "\u012f": 278, "\u0130": 389, "\u0131": 278, "\u0136": 667, "\u0137": 500, "\u0139": 611, "\u013a": 278, "\u013b": 611, "\u013c": 278, "\u013d": 611, "\u013e": 382, "\u0141": 611, "\u0142": 278, "\u0143": 722, "\u0144": 556, "\u0145": 722, "\u0146": 556, "\u0147": 722, "\u0148": 556, "\u014c": 722, "\u014d": 500, "\u0150": 722, "\u0151": 500, "\u0152": 944, "\u0153": 722, "\u0154": 667, "\u0155": 389, "\u0156": 
667, "\u0157": 389, "\u0158": 667, "\u0159": 389, "\u015a": 556, "\u015b": 389, "\u015e": 556, "\u015f": 389, "\u0160": 556, "\u0161": 389, "\u0162": 611, "\u0163": 278, "\u0164": 611, "\u0165": 366, "\u016a": 722, "\u016b": 556, "\u016e": 722, "\u016f": 556, "\u0170": 722, "\u0171": 556, "\u0172": 722, "\u0173": 556, "\u0178": 611, "\u0179": 611, "\u017a": 389, "\u017b": 611, "\u017c": 389, "\u017d": 611, "\u017e": 389, "\u0192": 500, "\u0218": 556, "\u0219": 389, "\u02c6": 333, "\u02c7": 333, "\u02d8": 333, "\u02d9": 333, "\u02da": 333, "\u02db": 333, "\u02dc": 333, "\u02dd": 333, "\u2013": 500, "\u2014": 1000, "\u2018": 333, "\u2019": 333, "\u201a": 333, "\u201c": 500, "\u201d": 500, "\u201e": 500, "\u2020": 500, "\u2021": 500, "\u2022": 350, "\u2026": 1000, "\u2030": 1000, "\u2039": 333, "\u203a": 333, "\u2044": 167, "\u2122": 1000, "\u2202": 494, "\u2206": 612, "\u2211": 600, "\u2212": 606, "\u221a": 549, "\u2260": 549, "\u2264": 549, "\u2265": 549, "\u25ca": 494, "\uf6c3": 250, "\ufb01": 556, "\ufb02": 556, }, ), "Times-Italic": ( { "FontName": "Times-Italic", "Descent": -217.0, "FontBBox": (-169.0, -217.0, 1010.0, 883.0), "FontWeight": "Medium", "CapHeight": 653.0, "FontFamily": "Times", "Flags": 0, "XHeight": 441.0, "ItalicAngle": -15.5, "Ascent": 683.0, }, { " ": 250, "!": 333, '"': 420, "#": 500, "$": 500, "%": 833, "&": 778, "'": 214, "(": 333, ")": 333, "*": 500, "+": 675, ",": 250, "-": 333, ".": 250, "/": 278, "0": 500, "1": 500, "2": 500, "3": 500, "4": 500, "5": 500, "6": 500, "7": 500, "8": 500, "9": 500, ":": 333, ";": 333, "<": 675, "=": 675, ">": 675, "?": 500, "@": 920, "A": 611, "B": 611, "C": 667, "D": 722, "E": 611, "F": 611, "G": 722, "H": 722, "I": 333, "J": 444, "K": 667, "L": 556, "M": 833, "N": 667, "O": 722, "P": 611, "Q": 722, "R": 611, "S": 500, "T": 556, "U": 722, "V": 611, "W": 833, "X": 611, "Y": 556, "Z": 556, "[": 389, "\\": 278, "]": 389, "^": 422, "_": 500, "`": 333, "a": 500, "b": 500, "c": 444, "d": 500, "e": 444, "f": 278, 
"g": 500, "h": 500, "i": 278, "j": 278, "k": 444, "l": 278, "m": 722, "n": 500, "o": 500, "p": 500, "q": 500, "r": 389, "s": 389, "t": 278, "u": 500, "v": 444, "w": 667, "x": 444, "y": 444, "z": 389, "{": 400, "|": 275, "}": 400, "~": 541, "\xa1": 389, "\xa2": 500, "\xa3": 500, "\xa4": 500, "\xa5": 500, "\xa6": 275, "\xa7": 500, "\xa8": 333, "\xa9": 760, "\xaa": 276, "\xab": 500, "\xac": 675, "\xae": 760, "\xaf": 333, "\xb0": 400, "\xb1": 675, "\xb2": 300, "\xb3": 300, "\xb4": 333, "\xb5": 500, "\xb6": 523, "\xb7": 250, "\xb8": 333, "\xb9": 300, "\xba": 310, "\xbb": 500, "\xbc": 750, "\xbd": 750, "\xbe": 750, "\xbf": 500, "\xc0": 611, "\xc1": 611, "\xc2": 611, "\xc3": 611, "\xc4": 611, "\xc5": 611, "\xc6": 889, "\xc7": 667, "\xc8": 611, "\xc9": 611, "\xca": 611, "\xcb": 611, "\xcc": 333, "\xcd": 333, "\xce": 333, "\xcf": 333, "\xd0": 722, "\xd1": 667, "\xd2": 722, "\xd3": 722, "\xd4": 722, "\xd5": 722, "\xd6": 722, "\xd7": 675, "\xd8": 722, "\xd9": 722, "\xda": 722, "\xdb": 722, "\xdc": 722, "\xdd": 556, "\xde": 611, "\xdf": 500, "\xe0": 500, "\xe1": 500, "\xe2": 500, "\xe3": 500, "\xe4": 500, "\xe5": 500, "\xe6": 667, "\xe7": 444, "\xe8": 444, "\xe9": 444, "\xea": 444, "\xeb": 444, "\xec": 278, "\xed": 278, "\xee": 278, "\xef": 278, "\xf0": 500, "\xf1": 500, "\xf2": 500, "\xf3": 500, "\xf4": 500, "\xf5": 500, "\xf6": 500, "\xf7": 675, "\xf8": 500, "\xf9": 500, "\xfa": 500, "\xfb": 500, "\xfc": 500, "\xfd": 444, "\xfe": 500, "\xff": 444, "\u0100": 611, "\u0101": 500, "\u0102": 611, "\u0103": 500, "\u0104": 611, "\u0105": 500, "\u0106": 667, "\u0107": 444, "\u010c": 667, "\u010d": 444, "\u010e": 722, "\u010f": 544, "\u0110": 722, "\u0111": 500, "\u0112": 611, "\u0113": 444, "\u0116": 611, "\u0117": 444, "\u0118": 611, "\u0119": 444, "\u011a": 611, "\u011b": 444, "\u011e": 722, "\u011f": 500, "\u0122": 722, "\u0123": 500, "\u012a": 333, "\u012b": 278, "\u012e": 333, "\u012f": 278, "\u0130": 333, "\u0131": 278, "\u0136": 667, "\u0137": 444, "\u0139": 556, "\u013a": 
278, "\u013b": 556, "\u013c": 278, "\u013d": 611, "\u013e": 300, "\u0141": 556, "\u0142": 278, "\u0143": 667, "\u0144": 500, "\u0145": 667, "\u0146": 500, "\u0147": 667, "\u0148": 500, "\u014c": 722, "\u014d": 500, "\u0150": 722, "\u0151": 500, "\u0152": 944, "\u0153": 667, "\u0154": 611, "\u0155": 389, "\u0156": 611, "\u0157": 389, "\u0158": 611, "\u0159": 389, "\u015a": 500, "\u015b": 389, "\u015e": 500, "\u015f": 389, "\u0160": 500, "\u0161": 389, "\u0162": 556, "\u0163": 278, "\u0164": 556, "\u0165": 300, "\u016a": 722, "\u016b": 500, "\u016e": 722, "\u016f": 500, "\u0170": 722, "\u0171": 500, "\u0172": 722, "\u0173": 500, "\u0178": 556, "\u0179": 556, "\u017a": 389, "\u017b": 556, "\u017c": 389, "\u017d": 556, "\u017e": 389, "\u0192": 500, "\u0218": 500, "\u0219": 389, "\u02c6": 333, "\u02c7": 333, "\u02d8": 333, "\u02d9": 333, "\u02da": 333, "\u02db": 333, "\u02dc": 333, "\u02dd": 333, "\u2013": 500, "\u2014": 889, "\u2018": 333, "\u2019": 333, "\u201a": 333, "\u201c": 556, "\u201d": 556, "\u201e": 556, "\u2020": 500, "\u2021": 500, "\u2022": 350, "\u2026": 889, "\u2030": 1000, "\u2039": 333, "\u203a": 333, "\u2044": 167, "\u2122": 980, "\u2202": 476, "\u2206": 612, "\u2211": 600, "\u2212": 675, "\u221a": 453, "\u2260": 549, "\u2264": 549, "\u2265": 549, "\u25ca": 471, "\uf6c3": 250, "\ufb01": 500, "\ufb02": 500, }, ), "Times-Roman": ( { "FontName": "Times-Roman", "Descent": -217.0, "FontBBox": (-168.0, -218.0, 1000.0, 898.0), "FontWeight": "Roman", "CapHeight": 662.0, "FontFamily": "Times", "Flags": 0, "XHeight": 450.0, "ItalicAngle": 0.0, "Ascent": 683.0, }, { " ": 250, "!": 333, '"': 408, "#": 500, "$": 500, "%": 833, "&": 778, "'": 180, "(": 333, ")": 333, "*": 500, "+": 564, ",": 250, "-": 333, ".": 250, "/": 278, "0": 500, "1": 500, "2": 500, "3": 500, "4": 500, "5": 500, "6": 500, "7": 500, "8": 500, "9": 500, ":": 278, ";": 278, "<": 564, "=": 564, ">": 564, "?": 444, "@": 921, "A": 722, "B": 667, "C": 667, "D": 722, "E": 611, "F": 556, "G": 722, "H": 
722, "I": 333, "J": 389, "K": 722, "L": 611, "M": 889, "N": 722, "O": 722, "P": 556, "Q": 722, "R": 667, "S": 556, "T": 611, "U": 722, "V": 722, "W": 944, "X": 722, "Y": 722, "Z": 611, "[": 333, "\\": 278, "]": 333, "^": 469, "_": 500, "`": 333, "a": 444, "b": 500, "c": 444, "d": 500, "e": 444, "f": 333, "g": 500, "h": 500, "i": 278, "j": 278, "k": 500, "l": 278, "m": 778, "n": 500, "o": 500, "p": 500, "q": 500, "r": 333, "s": 389, "t": 278, "u": 500, "v": 500, "w": 722, "x": 500, "y": 500, "z": 444, "{": 480, "|": 200, "}": 480, "~": 541, "\xa1": 333, "\xa2": 500, "\xa3": 500, "\xa4": 500, "\xa5": 500, "\xa6": 200, "\xa7": 500, "\xa8": 333, "\xa9": 760, "\xaa": 276, "\xab": 500, "\xac": 564, "\xae": 760, "\xaf": 333, "\xb0": 400, "\xb1": 564, "\xb2": 300, "\xb3": 300, "\xb4": 333, "\xb5": 500, "\xb6": 453, "\xb7": 250, "\xb8": 333, "\xb9": 300, "\xba": 310, "\xbb": 500, "\xbc": 750, "\xbd": 750, "\xbe": 750, "\xbf": 444, "\xc0": 722, "\xc1": 722, "\xc2": 722, "\xc3": 722, "\xc4": 722, "\xc5": 722, "\xc6": 889, "\xc7": 667, "\xc8": 611, "\xc9": 611, "\xca": 611, "\xcb": 611, "\xcc": 333, "\xcd": 333, "\xce": 333, "\xcf": 333, "\xd0": 722, "\xd1": 722, "\xd2": 722, "\xd3": 722, "\xd4": 722, "\xd5": 722, "\xd6": 722, "\xd7": 564, "\xd8": 722, "\xd9": 722, "\xda": 722, "\xdb": 722, "\xdc": 722, "\xdd": 722, "\xde": 556, "\xdf": 500, "\xe0": 444, "\xe1": 444, "\xe2": 444, "\xe3": 444, "\xe4": 444, "\xe5": 444, "\xe6": 667, "\xe7": 444, "\xe8": 444, "\xe9": 444, "\xea": 444, "\xeb": 444, "\xec": 278, "\xed": 278, "\xee": 278, "\xef": 278, "\xf0": 500, "\xf1": 500, "\xf2": 500, "\xf3": 500, "\xf4": 500, "\xf5": 500, "\xf6": 500, "\xf7": 564, "\xf8": 500, "\xf9": 500, "\xfa": 500, "\xfb": 500, "\xfc": 500, "\xfd": 500, "\xfe": 500, "\xff": 500, "\u0100": 722, "\u0101": 444, "\u0102": 722, "\u0103": 444, "\u0104": 722, "\u0105": 444, "\u0106": 667, "\u0107": 444, "\u010c": 667, "\u010d": 444, "\u010e": 722, "\u010f": 588, "\u0110": 722, "\u0111": 500, "\u0112": 611, 
"\u0113": 444, "\u0116": 611, "\u0117": 444, "\u0118": 611, "\u0119": 444, "\u011a": 611, "\u011b": 444, "\u011e": 722, "\u011f": 500, "\u0122": 722, "\u0123": 500, "\u012a": 333, "\u012b": 278, "\u012e": 333, "\u012f": 278, "\u0130": 333, "\u0131": 278, "\u0136": 722, "\u0137": 500, "\u0139": 611, "\u013a": 278, "\u013b": 611, "\u013c": 278, "\u013d": 611, "\u013e": 344, "\u0141": 611, "\u0142": 278, "\u0143": 722, "\u0144": 500, "\u0145": 722, "\u0146": 500, "\u0147": 722, "\u0148": 500, "\u014c": 722, "\u014d": 500, "\u0150": 722, "\u0151": 500, "\u0152": 889, "\u0153": 722, "\u0154": 667, "\u0155": 333, "\u0156": 667, "\u0157": 333, "\u0158": 667, "\u0159": 333, "\u015a": 556, "\u015b": 389, "\u015e": 556, "\u015f": 389, "\u0160": 556, "\u0161": 389, "\u0162": 611, "\u0163": 278, "\u0164": 611, "\u0165": 326, "\u016a": 722, "\u016b": 500, "\u016e": 722, "\u016f": 500, "\u0170": 722, "\u0171": 500, "\u0172": 722, "\u0173": 500, "\u0178": 722, "\u0179": 611, "\u017a": 444, "\u017b": 611, "\u017c": 444, "\u017d": 611, "\u017e": 444, "\u0192": 500, "\u0218": 556, "\u0219": 389, "\u02c6": 333, "\u02c7": 333, "\u02d8": 333, "\u02d9": 333, "\u02da": 333, "\u02db": 333, "\u02dc": 333, "\u02dd": 333, "\u2013": 500, "\u2014": 1000, "\u2018": 333, "\u2019": 333, "\u201a": 333, "\u201c": 444, "\u201d": 444, "\u201e": 444, "\u2020": 500, "\u2021": 500, "\u2022": 350, "\u2026": 1000, "\u2030": 1000, "\u2039": 333, "\u203a": 333, "\u2044": 167, "\u2122": 980, "\u2202": 476, "\u2206": 612, "\u2211": 600, "\u2212": 564, "\u221a": 453, "\u2260": 549, "\u2264": 549, "\u2265": 549, "\u25ca": 471, "\uf6c3": 250, "\ufb01": 556, "\ufb02": 556, }, ), "ZapfDingbats": ( { "FontName": "ZapfDingbats", "FontBBox": (-1.0, -143.0, 981.0, 820.0), "FontWeight": "Medium", "FontFamily": "ITC", "Flags": 0, "ItalicAngle": 0.0, }, { "\x01": 974, "\x02": 961, "\x03": 980, "\x04": 719, "\x05": 789, "\x06": 494, "\x07": 552, "\x08": 537, "\t": 577, "\n": 692, "\x0b": 960, "\x0c": 939, "\r": 549, 
"\x0e": 855, "\x0f": 911, "\x10": 933, "\x11": 945, "\x12": 974, "\x13": 755, "\x14": 846, "\x15": 762, "\x16": 761, "\x17": 571, "\x18": 677, "\x19": 763, "\x1a": 760, "\x1b": 759, "\x1c": 754, "\x1d": 786, "\x1e": 788, "\x1f": 788, " ": 790, "!": 793, '"': 794, "#": 816, "$": 823, "%": 789, "&": 841, "'": 823, "(": 833, ")": 816, "*": 831, "+": 923, ",": 744, "-": 723, ".": 749, "/": 790, "0": 792, "1": 695, "2": 776, "3": 768, "4": 792, "5": 759, "6": 707, "7": 708, "8": 682, "9": 701, ":": 826, ";": 815, "<": 789, "=": 789, ">": 707, "?": 687, "@": 696, "A": 689, "B": 786, "C": 787, "D": 713, "E": 791, "F": 785, "G": 791, "H": 873, "I": 761, "J": 762, "K": 759, "L": 892, "M": 892, "N": 788, "O": 784, "Q": 438, "R": 138, "S": 277, "T": 415, "U": 509, "V": 410, "W": 234, "X": 234, "Y": 390, "Z": 390, "[": 276, "\\": 276, "]": 317, "^": 317, "_": 334, "`": 334, "a": 392, "b": 392, "c": 668, "d": 668, "e": 732, "f": 544, "g": 544, "h": 910, "i": 911, "j": 667, "k": 760, "l": 760, "m": 626, "n": 694, "o": 595, "p": 776, "u": 690, "v": 791, "w": 790, "x": 788, "y": 788, "z": 788, "{": 788, "|": 788, "}": 788, "~": 788, "\x7f": 788, "\x80": 788, "\x81": 788, "\x82": 788, "\x83": 788, "\x84": 788, "\x85": 788, "\x86": 788, "\x87": 788, "\x88": 788, "\x89": 788, "\x8a": 788, "\x8b": 788, "\x8c": 788, "\x8d": 788, "\x8e": 788, "\x8f": 788, "\x90": 788, "\x91": 788, "\x92": 788, "\x93": 788, "\x94": 788, "\x95": 788, "\x96": 788, "\x97": 788, "\x98": 788, "\x99": 788, "\x9a": 788, "\x9b": 788, "\x9c": 788, "\x9d": 788, "\x9e": 788, "\x9f": 788, "\xa0": 894, "\xa1": 838, "\xa2": 924, "\xa3": 1016, "\xa4": 458, "\xa5": 924, "\xa6": 918, "\xa7": 927, "\xa8": 928, "\xa9": 928, "\xaa": 834, "\xab": 873, "\xac": 828, "\xad": 924, "\xae": 917, "\xaf": 930, "\xb0": 931, "\xb1": 463, "\xb2": 883, "\xb3": 836, "\xb4": 867, "\xb5": 696, "\xb6": 874, "\xb7": 760, "\xb8": 946, "\xb9": 865, "\xba": 967, "\xbb": 831, "\xbc": 873, "\xbd": 927, "\xbe": 970, "\xbf": 918, "\xc0": 748, 
"\xc1": 836, "\xc2": 771, "\xc3": 888, "\xc4": 748, "\xc5": 771, "\xc6": 888, "\xc7": 867, "\xc8": 696, "\xc9": 874, "\xca": 974, "\xcb": 762, "\xcc": 759, "\xcd": 509, "\xce": 410, }, ), } # Aliases defined in implementation note 62 in Appecix H. related to section 5.5.1 # (Type 1 Fonts) in the PDF Reference. FONT_METRICS["Arial"] = FONT_METRICS["Helvetica"] FONT_METRICS["Arial,Italic"] = FONT_METRICS["Helvetica-Oblique"] FONT_METRICS["Arial,Bold"] = FONT_METRICS["Helvetica-Bold"] FONT_METRICS["Arial,BoldItalic"] = FONT_METRICS["Helvetica-BoldOblique"] FONT_METRICS["CourierNew"] = FONT_METRICS["Courier"] FONT_METRICS["CourierNew,Italic"] = FONT_METRICS["Courier-Oblique"] FONT_METRICS["CourierNew,Bold"] = FONT_METRICS["Courier-Bold"] FONT_METRICS["CourierNew,BoldItalic"] = FONT_METRICS["Courier-BoldOblique"] FONT_METRICS["TimesNewRoman"] = FONT_METRICS["Times-Roman"] FONT_METRICS["TimesNewRoman,Italic"] = FONT_METRICS["Times-Italic"] FONT_METRICS["TimesNewRoman,Bold"] = FONT_METRICS["Times-Bold"] FONT_METRICS["TimesNewRoman,BoldItalic"] = FONT_METRICS["Times-BoldItalic"] ================================================ FILE: babeldoc/pdfminer/glyphlist.py ================================================ """Mappings from Adobe glyph names to Unicode characters. In some CMap tables, Adobe glyph names are used for specifying Unicode characters instead of using decimal/hex character code. 
The following data was downloaded with

  $ wget https://partners.adobe.com/public/developer/en/opentype/glyphlist.txt

and converted into the table below with

```python
from babeldoc.pdfminer.glyphlist import convert_glyphlist

convert_glyphlist("glyphlist.txt")
```
"""

# ###################################################################################
# Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this documentation file to use, copy, publish, distribute,
# sublicense, and/or sell copies of the documentation, and to permit
# others to do the same, provided that:
# - No modification, editing or other alteration of this document is
# allowed; and
# - The above copyright notice and this permission notice shall be
# included in all copies of the documentation.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this documentation file, to create their own derivative works
# from the content of this document to use, copy, publish, distribute,
# sublicense, and/or sell the derivative works, and to permit others to do
# the same, provided that the derived work is not represented as being a
# copy or version of this document.
#
# Adobe shall not be liable to any party for any loss of revenue or profit
# or for indirect, incidental, special, consequential, or other similar
# damages, whether based on tort (including without limitation negligence
# or strict liability), contract or other legal or equitable grounds even
# if Adobe has been advised or had reason to know of the possibility of
# such damages. The Adobe materials are provided on an "AS IS" basis.
# Adobe specifically disclaims all express, statutory, or implied
# warranties relating to the Adobe materials, including but not limited to
# those concerning merchantability or fitness for a particular purpose or
# non-infringement of any third party rights regarding the Adobe
# materials.
def convert_glyphlist(path: str) -> None:
    """Convert a glyph list file into Python source printed on stdout.

    Blank lines and lines starting with ``#`` are echoed unchanged. The
    first data line opens a ``glyphname2unicode = {`` literal, and every
    data line of the form ``name;CODE`` (or ``name;CODE CODE ...``) is
    printed as one dictionary entry whose value is the sequence of unicode
    escapes for the listed hexadecimal codes.

    The literal is closed at the first blank/comment line that follows the
    data, or at end of file when the data runs up to the last line, so the
    printed text always contains a terminated dictionary.

    Args:
        path: Path of the glyph list text file (e.g. ``glyphlist.txt``).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a data line does not contain exactly one ``;``.
    """
    # Progress through the input: nothing emitted yet -> inside the dict
    # literal -> literal already closed.
    before_table, in_table, after_table = 0, 1, 2
    state = before_table

    # Explicit encoding keeps the result independent of the platform locale.
    with open(path, encoding="utf-8") as glyph_file:
        # Iterate lazily; there is no need to hold the whole file in memory.
        for raw_line in glyph_file:
            line = raw_line.strip()

            if not line or line.startswith("#"):
                # The first non-data line after the entries ends the literal.
                if state == in_table:
                    state = after_table
                    print("}\n")
                print(line)
                continue

            if state == before_table:
                print("\nglyphname2unicode = {")
                state = in_table

            name, code_field = line.split(";")
            # A glyph may map to several code points, separated by spaces.
            escaped = "".join("\\u" + code for code in code_field.split(" "))
            print(f" {name!r}: u'{escaped}',")

    # Input ended while still inside the literal: close it so the generated
    # source is valid Python.
    if state == in_table:
        print("}\n")
"Atilde": "\u00c3", "Atildesmall": "\uf7e3", "Aybarmenian": "\u0531", "B": "\u0042", "Bcircle": "\u24b7", "Bdotaccent": "\u1e02", "Bdotbelow": "\u1e04", "Becyrillic": "\u0411", "Benarmenian": "\u0532", "Beta": "\u0392", "Bhook": "\u0181", "Blinebelow": "\u1e06", "Bmonospace": "\uff22", "Brevesmall": "\uf6f4", "Bsmall": "\uf762", "Btopbar": "\u0182", "C": "\u0043", "Caarmenian": "\u053e", "Cacute": "\u0106", "Caron": "\uf6ca", "Caronsmall": "\uf6f5", "Ccaron": "\u010c", "Ccedilla": "\u00c7", "Ccedillaacute": "\u1e08", "Ccedillasmall": "\uf7e7", "Ccircle": "\u24b8", "Ccircumflex": "\u0108", "Cdot": "\u010a", "Cdotaccent": "\u010a", "Cedillasmall": "\uf7b8", "Chaarmenian": "\u0549", "Cheabkhasiancyrillic": "\u04bc", "Checyrillic": "\u0427", "Chedescenderabkhasiancyrillic": "\u04be", "Chedescendercyrillic": "\u04b6", "Chedieresiscyrillic": "\u04f4", "Cheharmenian": "\u0543", "Chekhakassiancyrillic": "\u04cb", "Cheverticalstrokecyrillic": "\u04b8", "Chi": "\u03a7", "Chook": "\u0187", "Circumflexsmall": "\uf6f6", "Cmonospace": "\uff23", "Coarmenian": "\u0551", "Csmall": "\uf763", "D": "\u0044", "DZ": "\u01f1", "DZcaron": "\u01c4", "Daarmenian": "\u0534", "Dafrican": "\u0189", "Dcaron": "\u010e", "Dcedilla": "\u1e10", "Dcircle": "\u24b9", "Dcircumflexbelow": "\u1e12", "Dcroat": "\u0110", "Ddotaccent": "\u1e0a", "Ddotbelow": "\u1e0c", "Decyrillic": "\u0414", "Deicoptic": "\u03ee", "Delta": "\u2206", "Deltagreek": "\u0394", "Dhook": "\u018a", "Dieresis": "\uf6cb", "DieresisAcute": "\uf6cc", "DieresisGrave": "\uf6cd", "Dieresissmall": "\uf7a8", "Digammagreek": "\u03dc", "Djecyrillic": "\u0402", "Dlinebelow": "\u1e0e", "Dmonospace": "\uff24", "Dotaccentsmall": "\uf6f7", "Dslash": "\u0110", "Dsmall": "\uf764", "Dtopbar": "\u018b", "Dz": "\u01f2", "Dzcaron": "\u01c5", "Dzeabkhasiancyrillic": "\u04e0", "Dzecyrillic": "\u0405", "Dzhecyrillic": "\u040f", "E": "\u0045", "Eacute": "\u00c9", "Eacutesmall": "\uf7e9", "Ebreve": "\u0114", "Ecaron": "\u011a", "Ecedillabreve": "\u1e1c", 
"Echarmenian": "\u0535", "Ecircle": "\u24ba", "Ecircumflex": "\u00ca", "Ecircumflexacute": "\u1ebe", "Ecircumflexbelow": "\u1e18", "Ecircumflexdotbelow": "\u1ec6", "Ecircumflexgrave": "\u1ec0", "Ecircumflexhookabove": "\u1ec2", "Ecircumflexsmall": "\uf7ea", "Ecircumflextilde": "\u1ec4", "Ecyrillic": "\u0404", "Edblgrave": "\u0204", "Edieresis": "\u00cb", "Edieresissmall": "\uf7eb", "Edot": "\u0116", "Edotaccent": "\u0116", "Edotbelow": "\u1eb8", "Efcyrillic": "\u0424", "Egrave": "\u00c8", "Egravesmall": "\uf7e8", "Eharmenian": "\u0537", "Ehookabove": "\u1eba", "Eightroman": "\u2167", "Einvertedbreve": "\u0206", "Eiotifiedcyrillic": "\u0464", "Elcyrillic": "\u041b", "Elevenroman": "\u216a", "Emacron": "\u0112", "Emacronacute": "\u1e16", "Emacrongrave": "\u1e14", "Emcyrillic": "\u041c", "Emonospace": "\uff25", "Encyrillic": "\u041d", "Endescendercyrillic": "\u04a2", "Eng": "\u014a", "Enghecyrillic": "\u04a4", "Enhookcyrillic": "\u04c7", "Eogonek": "\u0118", "Eopen": "\u0190", "Epsilon": "\u0395", "Epsilontonos": "\u0388", "Ercyrillic": "\u0420", "Ereversed": "\u018e", "Ereversedcyrillic": "\u042d", "Escyrillic": "\u0421", "Esdescendercyrillic": "\u04aa", "Esh": "\u01a9", "Esmall": "\uf765", "Eta": "\u0397", "Etarmenian": "\u0538", "Etatonos": "\u0389", "Eth": "\u00d0", "Ethsmall": "\uf7f0", "Etilde": "\u1ebc", "Etildebelow": "\u1e1a", "Euro": "\u20ac", "Ezh": "\u01b7", "Ezhcaron": "\u01ee", "Ezhreversed": "\u01b8", "F": "\u0046", "Fcircle": "\u24bb", "Fdotaccent": "\u1e1e", "Feharmenian": "\u0556", "Feicoptic": "\u03e4", "Fhook": "\u0191", "Fitacyrillic": "\u0472", "Fiveroman": "\u2164", "Fmonospace": "\uff26", "Fourroman": "\u2163", "Fsmall": "\uf766", "G": "\u0047", "GBsquare": "\u3387", "Gacute": "\u01f4", "Gamma": "\u0393", "Gammaafrican": "\u0194", "Gangiacoptic": "\u03ea", "Gbreve": "\u011e", "Gcaron": "\u01e6", "Gcedilla": "\u0122", "Gcircle": "\u24bc", "Gcircumflex": "\u011c", "Gcommaaccent": "\u0122", "Gdot": "\u0120", "Gdotaccent": "\u0120", "Gecyrillic": 
"\u0413", "Ghadarmenian": "\u0542", "Ghemiddlehookcyrillic": "\u0494", "Ghestrokecyrillic": "\u0492", "Gheupturncyrillic": "\u0490", "Ghook": "\u0193", "Gimarmenian": "\u0533", "Gjecyrillic": "\u0403", "Gmacron": "\u1e20", "Gmonospace": "\uff27", "Grave": "\uf6ce", "Gravesmall": "\uf760", "Gsmall": "\uf767", "Gsmallhook": "\u029b", "Gstroke": "\u01e4", "H": "\u0048", "H18533": "\u25cf", "H18543": "\u25aa", "H18551": "\u25ab", "H22073": "\u25a1", "HPsquare": "\u33cb", "Haabkhasiancyrillic": "\u04a8", "Hadescendercyrillic": "\u04b2", "Hardsigncyrillic": "\u042a", "Hbar": "\u0126", "Hbrevebelow": "\u1e2a", "Hcedilla": "\u1e28", "Hcircle": "\u24bd", "Hcircumflex": "\u0124", "Hdieresis": "\u1e26", "Hdotaccent": "\u1e22", "Hdotbelow": "\u1e24", "Hmonospace": "\uff28", "Hoarmenian": "\u0540", "Horicoptic": "\u03e8", "Hsmall": "\uf768", "Hungarumlaut": "\uf6cf", "Hungarumlautsmall": "\uf6f8", "Hzsquare": "\u3390", "I": "\u0049", "IAcyrillic": "\u042f", "IJ": "\u0132", "IUcyrillic": "\u042e", "Iacute": "\u00cd", "Iacutesmall": "\uf7ed", "Ibreve": "\u012c", "Icaron": "\u01cf", "Icircle": "\u24be", "Icircumflex": "\u00ce", "Icircumflexsmall": "\uf7ee", "Icyrillic": "\u0406", "Idblgrave": "\u0208", "Idieresis": "\u00cf", "Idieresisacute": "\u1e2e", "Idieresiscyrillic": "\u04e4", "Idieresissmall": "\uf7ef", "Idot": "\u0130", "Idotaccent": "\u0130", "Idotbelow": "\u1eca", "Iebrevecyrillic": "\u04d6", "Iecyrillic": "\u0415", "Ifraktur": "\u2111", "Igrave": "\u00cc", "Igravesmall": "\uf7ec", "Ihookabove": "\u1ec8", "Iicyrillic": "\u0418", "Iinvertedbreve": "\u020a", "Iishortcyrillic": "\u0419", "Imacron": "\u012a", "Imacroncyrillic": "\u04e2", "Imonospace": "\uff29", "Iniarmenian": "\u053b", "Iocyrillic": "\u0401", "Iogonek": "\u012e", "Iota": "\u0399", "Iotaafrican": "\u0196", "Iotadieresis": "\u03aa", "Iotatonos": "\u038a", "Ismall": "\uf769", "Istroke": "\u0197", "Itilde": "\u0128", "Itildebelow": "\u1e2c", "Izhitsacyrillic": "\u0474", "Izhitsadblgravecyrillic": "\u0476", "J": 
"\u004a", "Jaarmenian": "\u0541", "Jcircle": "\u24bf", "Jcircumflex": "\u0134", "Jecyrillic": "\u0408", "Jheharmenian": "\u054b", "Jmonospace": "\uff2a", "Jsmall": "\uf76a", "K": "\u004b", "KBsquare": "\u3385", "KKsquare": "\u33cd", "Kabashkircyrillic": "\u04a0", "Kacute": "\u1e30", "Kacyrillic": "\u041a", "Kadescendercyrillic": "\u049a", "Kahookcyrillic": "\u04c3", "Kappa": "\u039a", "Kastrokecyrillic": "\u049e", "Kaverticalstrokecyrillic": "\u049c", "Kcaron": "\u01e8", "Kcedilla": "\u0136", "Kcircle": "\u24c0", "Kcommaaccent": "\u0136", "Kdotbelow": "\u1e32", "Keharmenian": "\u0554", "Kenarmenian": "\u053f", "Khacyrillic": "\u0425", "Kheicoptic": "\u03e6", "Khook": "\u0198", "Kjecyrillic": "\u040c", "Klinebelow": "\u1e34", "Kmonospace": "\uff2b", "Koppacyrillic": "\u0480", "Koppagreek": "\u03de", "Ksicyrillic": "\u046e", "Ksmall": "\uf76b", "L": "\u004c", "LJ": "\u01c7", "LL": "\uf6bf", "Lacute": "\u0139", "Lambda": "\u039b", "Lcaron": "\u013d", "Lcedilla": "\u013b", "Lcircle": "\u24c1", "Lcircumflexbelow": "\u1e3c", "Lcommaaccent": "\u013b", "Ldot": "\u013f", "Ldotaccent": "\u013f", "Ldotbelow": "\u1e36", "Ldotbelowmacron": "\u1e38", "Liwnarmenian": "\u053c", "Lj": "\u01c8", "Ljecyrillic": "\u0409", "Llinebelow": "\u1e3a", "Lmonospace": "\uff2c", "Lslash": "\u0141", "Lslashsmall": "\uf6f9", "Lsmall": "\uf76c", "M": "\u004d", "MBsquare": "\u3386", "Macron": "\uf6d0", "Macronsmall": "\uf7af", "Macute": "\u1e3e", "Mcircle": "\u24c2", "Mdotaccent": "\u1e40", "Mdotbelow": "\u1e42", "Menarmenian": "\u0544", "Mmonospace": "\uff2d", "Msmall": "\uf76d", "Mturned": "\u019c", "Mu": "\u039c", "N": "\u004e", "NJ": "\u01ca", "Nacute": "\u0143", "Ncaron": "\u0147", "Ncedilla": "\u0145", "Ncircle": "\u24c3", "Ncircumflexbelow": "\u1e4a", "Ncommaaccent": "\u0145", "Ndotaccent": "\u1e44", "Ndotbelow": "\u1e46", "Nhookleft": "\u019d", "Nineroman": "\u2168", "Nj": "\u01cb", "Njecyrillic": "\u040a", "Nlinebelow": "\u1e48", "Nmonospace": "\uff2e", "Nowarmenian": "\u0546", "Nsmall": 
"\uf76e", "Ntilde": "\u00d1", "Ntildesmall": "\uf7f1", "Nu": "\u039d", "O": "\u004f", "OE": "\u0152", "OEsmall": "\uf6fa", "Oacute": "\u00d3", "Oacutesmall": "\uf7f3", "Obarredcyrillic": "\u04e8", "Obarreddieresiscyrillic": "\u04ea", "Obreve": "\u014e", "Ocaron": "\u01d1", "Ocenteredtilde": "\u019f", "Ocircle": "\u24c4", "Ocircumflex": "\u00d4", "Ocircumflexacute": "\u1ed0", "Ocircumflexdotbelow": "\u1ed8", "Ocircumflexgrave": "\u1ed2", "Ocircumflexhookabove": "\u1ed4", "Ocircumflexsmall": "\uf7f4", "Ocircumflextilde": "\u1ed6", "Ocyrillic": "\u041e", "Odblacute": "\u0150", "Odblgrave": "\u020c", "Odieresis": "\u00d6", "Odieresiscyrillic": "\u04e6", "Odieresissmall": "\uf7f6", "Odotbelow": "\u1ecc", "Ogoneksmall": "\uf6fb", "Ograve": "\u00d2", "Ogravesmall": "\uf7f2", "Oharmenian": "\u0555", "Ohm": "\u2126", "Ohookabove": "\u1ece", "Ohorn": "\u01a0", "Ohornacute": "\u1eda", "Ohorndotbelow": "\u1ee2", "Ohorngrave": "\u1edc", "Ohornhookabove": "\u1ede", "Ohorntilde": "\u1ee0", "Ohungarumlaut": "\u0150", "Oi": "\u01a2", "Oinvertedbreve": "\u020e", "Omacron": "\u014c", "Omacronacute": "\u1e52", "Omacrongrave": "\u1e50", "Omega": "\u2126", "Omegacyrillic": "\u0460", "Omegagreek": "\u03a9", "Omegaroundcyrillic": "\u047a", "Omegatitlocyrillic": "\u047c", "Omegatonos": "\u038f", "Omicron": "\u039f", "Omicrontonos": "\u038c", "Omonospace": "\uff2f", "Oneroman": "\u2160", "Oogonek": "\u01ea", "Oogonekmacron": "\u01ec", "Oopen": "\u0186", "Oslash": "\u00d8", "Oslashacute": "\u01fe", "Oslashsmall": "\uf7f8", "Osmall": "\uf76f", "Ostrokeacute": "\u01fe", "Otcyrillic": "\u047e", "Otilde": "\u00d5", "Otildeacute": "\u1e4c", "Otildedieresis": "\u1e4e", "Otildesmall": "\uf7f5", "P": "\u0050", "Pacute": "\u1e54", "Pcircle": "\u24c5", "Pdotaccent": "\u1e56", "Pecyrillic": "\u041f", "Peharmenian": "\u054a", "Pemiddlehookcyrillic": "\u04a6", "Phi": "\u03a6", "Phook": "\u01a4", "Pi": "\u03a0", "Piwrarmenian": "\u0553", "Pmonospace": "\uff30", "Psi": "\u03a8", "Psicyrillic": "\u0470", 
"Psmall": "\uf770", "Q": "\u0051", "Qcircle": "\u24c6", "Qmonospace": "\uff31", "Qsmall": "\uf771", "R": "\u0052", "Raarmenian": "\u054c", "Racute": "\u0154", "Rcaron": "\u0158", "Rcedilla": "\u0156", "Rcircle": "\u24c7", "Rcommaaccent": "\u0156", "Rdblgrave": "\u0210", "Rdotaccent": "\u1e58", "Rdotbelow": "\u1e5a", "Rdotbelowmacron": "\u1e5c", "Reharmenian": "\u0550", "Rfraktur": "\u211c", "Rho": "\u03a1", "Ringsmall": "\uf6fc", "Rinvertedbreve": "\u0212", "Rlinebelow": "\u1e5e", "Rmonospace": "\uff32", "Rsmall": "\uf772", "Rsmallinverted": "\u0281", "Rsmallinvertedsuperior": "\u02b6", "S": "\u0053", "SF010000": "\u250c", "SF020000": "\u2514", "SF030000": "\u2510", "SF040000": "\u2518", "SF050000": "\u253c", "SF060000": "\u252c", "SF070000": "\u2534", "SF080000": "\u251c", "SF090000": "\u2524", "SF100000": "\u2500", "SF110000": "\u2502", "SF190000": "\u2561", "SF200000": "\u2562", "SF210000": "\u2556", "SF220000": "\u2555", "SF230000": "\u2563", "SF240000": "\u2551", "SF250000": "\u2557", "SF260000": "\u255d", "SF270000": "\u255c", "SF280000": "\u255b", "SF360000": "\u255e", "SF370000": "\u255f", "SF380000": "\u255a", "SF390000": "\u2554", "SF400000": "\u2569", "SF410000": "\u2566", "SF420000": "\u2560", "SF430000": "\u2550", "SF440000": "\u256c", "SF450000": "\u2567", "SF460000": "\u2568", "SF470000": "\u2564", "SF480000": "\u2565", "SF490000": "\u2559", "SF500000": "\u2558", "SF510000": "\u2552", "SF520000": "\u2553", "SF530000": "\u256b", "SF540000": "\u256a", "Sacute": "\u015a", "Sacutedotaccent": "\u1e64", "Sampigreek": "\u03e0", "Scaron": "\u0160", "Scarondotaccent": "\u1e66", "Scaronsmall": "\uf6fd", "Scedilla": "\u015e", "Schwa": "\u018f", "Schwacyrillic": "\u04d8", "Schwadieresiscyrillic": "\u04da", "Scircle": "\u24c8", "Scircumflex": "\u015c", "Scommaaccent": "\u0218", "Sdotaccent": "\u1e60", "Sdotbelow": "\u1e62", "Sdotbelowdotaccent": "\u1e68", "Seharmenian": "\u054d", "Sevenroman": "\u2166", "Shaarmenian": "\u0547", "Shacyrillic": "\u0428", 
"Shchacyrillic": "\u0429", "Sheicoptic": "\u03e2", "Shhacyrillic": "\u04ba", "Shimacoptic": "\u03ec", "Sigma": "\u03a3", "Sixroman": "\u2165", "Smonospace": "\uff33", "Softsigncyrillic": "\u042c", "Ssmall": "\uf773", "Stigmagreek": "\u03da", "T": "\u0054", "Tau": "\u03a4", "Tbar": "\u0166", "Tcaron": "\u0164", "Tcedilla": "\u0162", "Tcircle": "\u24c9", "Tcircumflexbelow": "\u1e70", "Tcommaaccent": "\u0162", "Tdotaccent": "\u1e6a", "Tdotbelow": "\u1e6c", "Tecyrillic": "\u0422", "Tedescendercyrillic": "\u04ac", "Tenroman": "\u2169", "Tetsecyrillic": "\u04b4", "Theta": "\u0398", "Thook": "\u01ac", "Thorn": "\u00de", "Thornsmall": "\uf7fe", "Threeroman": "\u2162", "Tildesmall": "\uf6fe", "Tiwnarmenian": "\u054f", "Tlinebelow": "\u1e6e", "Tmonospace": "\uff34", "Toarmenian": "\u0539", "Tonefive": "\u01bc", "Tonesix": "\u0184", "Tonetwo": "\u01a7", "Tretroflexhook": "\u01ae", "Tsecyrillic": "\u0426", "Tshecyrillic": "\u040b", "Tsmall": "\uf774", "Twelveroman": "\u216b", "Tworoman": "\u2161", "U": "\u0055", "Uacute": "\u00da", "Uacutesmall": "\uf7fa", "Ubreve": "\u016c", "Ucaron": "\u01d3", "Ucircle": "\u24ca", "Ucircumflex": "\u00db", "Ucircumflexbelow": "\u1e76", "Ucircumflexsmall": "\uf7fb", "Ucyrillic": "\u0423", "Udblacute": "\u0170", "Udblgrave": "\u0214", "Udieresis": "\u00dc", "Udieresisacute": "\u01d7", "Udieresisbelow": "\u1e72", "Udieresiscaron": "\u01d9", "Udieresiscyrillic": "\u04f0", "Udieresisgrave": "\u01db", "Udieresismacron": "\u01d5", "Udieresissmall": "\uf7fc", "Udotbelow": "\u1ee4", "Ugrave": "\u00d9", "Ugravesmall": "\uf7f9", "Uhookabove": "\u1ee6", "Uhorn": "\u01af", "Uhornacute": "\u1ee8", "Uhorndotbelow": "\u1ef0", "Uhorngrave": "\u1eea", "Uhornhookabove": "\u1eec", "Uhorntilde": "\u1eee", "Uhungarumlaut": "\u0170", "Uhungarumlautcyrillic": "\u04f2", "Uinvertedbreve": "\u0216", "Ukcyrillic": "\u0478", "Umacron": "\u016a", "Umacroncyrillic": "\u04ee", "Umacrondieresis": "\u1e7a", "Umonospace": "\uff35", "Uogonek": "\u0172", "Upsilon": "\u03a5", 
"Upsilon1": "\u03d2", "Upsilonacutehooksymbolgreek": "\u03d3", "Upsilonafrican": "\u01b1", "Upsilondieresis": "\u03ab", "Upsilondieresishooksymbolgreek": "\u03d4", "Upsilonhooksymbol": "\u03d2", "Upsilontonos": "\u038e", "Uring": "\u016e", "Ushortcyrillic": "\u040e", "Usmall": "\uf775", "Ustraightcyrillic": "\u04ae", "Ustraightstrokecyrillic": "\u04b0", "Utilde": "\u0168", "Utildeacute": "\u1e78", "Utildebelow": "\u1e74", "V": "\u0056", "Vcircle": "\u24cb", "Vdotbelow": "\u1e7e", "Vecyrillic": "\u0412", "Vewarmenian": "\u054e", "Vhook": "\u01b2", "Vmonospace": "\uff36", "Voarmenian": "\u0548", "Vsmall": "\uf776", "Vtilde": "\u1e7c", "W": "\u0057", "Wacute": "\u1e82", "Wcircle": "\u24cc", "Wcircumflex": "\u0174", "Wdieresis": "\u1e84", "Wdotaccent": "\u1e86", "Wdotbelow": "\u1e88", "Wgrave": "\u1e80", "Wmonospace": "\uff37", "Wsmall": "\uf777", "X": "\u0058", "Xcircle": "\u24cd", "Xdieresis": "\u1e8c", "Xdotaccent": "\u1e8a", "Xeharmenian": "\u053d", "Xi": "\u039e", "Xmonospace": "\uff38", "Xsmall": "\uf778", "Y": "\u0059", "Yacute": "\u00dd", "Yacutesmall": "\uf7fd", "Yatcyrillic": "\u0462", "Ycircle": "\u24ce", "Ycircumflex": "\u0176", "Ydieresis": "\u0178", "Ydieresissmall": "\uf7ff", "Ydotaccent": "\u1e8e", "Ydotbelow": "\u1ef4", "Yericyrillic": "\u042b", "Yerudieresiscyrillic": "\u04f8", "Ygrave": "\u1ef2", "Yhook": "\u01b3", "Yhookabove": "\u1ef6", "Yiarmenian": "\u0545", "Yicyrillic": "\u0407", "Yiwnarmenian": "\u0552", "Ymonospace": "\uff39", "Ysmall": "\uf779", "Ytilde": "\u1ef8", "Yusbigcyrillic": "\u046a", "Yusbigiotifiedcyrillic": "\u046c", "Yuslittlecyrillic": "\u0466", "Yuslittleiotifiedcyrillic": "\u0468", "Z": "\u005a", "Zaarmenian": "\u0536", "Zacute": "\u0179", "Zcaron": "\u017d", "Zcaronsmall": "\uf6ff", "Zcircle": "\u24cf", "Zcircumflex": "\u1e90", "Zdot": "\u017b", "Zdotaccent": "\u017b", "Zdotbelow": "\u1e92", "Zecyrillic": "\u0417", "Zedescendercyrillic": "\u0498", "Zedieresiscyrillic": "\u04de", "Zeta": "\u0396", "Zhearmenian": "\u053a", 
"Zhebrevecyrillic": "\u04c1", "Zhecyrillic": "\u0416", "Zhedescendercyrillic": "\u0496", "Zhedieresiscyrillic": "\u04dc", "Zlinebelow": "\u1e94", "Zmonospace": "\uff3a", "Zsmall": "\uf77a", "Zstroke": "\u01b5", "a": "\u0061", "aabengali": "\u0986", "aacute": "\u00e1", "aadeva": "\u0906", "aagujarati": "\u0a86", "aagurmukhi": "\u0a06", "aamatragurmukhi": "\u0a3e", "aarusquare": "\u3303", "aavowelsignbengali": "\u09be", "aavowelsigndeva": "\u093e", "aavowelsigngujarati": "\u0abe", "abbreviationmarkarmenian": "\u055f", "abbreviationsigndeva": "\u0970", "abengali": "\u0985", "abopomofo": "\u311a", "abreve": "\u0103", "abreveacute": "\u1eaf", "abrevecyrillic": "\u04d1", "abrevedotbelow": "\u1eb7", "abrevegrave": "\u1eb1", "abrevehookabove": "\u1eb3", "abrevetilde": "\u1eb5", "acaron": "\u01ce", "acircle": "\u24d0", "acircumflex": "\u00e2", "acircumflexacute": "\u1ea5", "acircumflexdotbelow": "\u1ead", "acircumflexgrave": "\u1ea7", "acircumflexhookabove": "\u1ea9", "acircumflextilde": "\u1eab", "acute": "\u00b4", "acutebelowcmb": "\u0317", "acutecmb": "\u0301", "acutecomb": "\u0301", "acutedeva": "\u0954", "acutelowmod": "\u02cf", "acutetonecmb": "\u0341", "acyrillic": "\u0430", "adblgrave": "\u0201", "addakgurmukhi": "\u0a71", "adeva": "\u0905", "adieresis": "\u00e4", "adieresiscyrillic": "\u04d3", "adieresismacron": "\u01df", "adotbelow": "\u1ea1", "adotmacron": "\u01e1", "ae": "\u00e6", "aeacute": "\u01fd", "aekorean": "\u3150", "aemacron": "\u01e3", "afii00208": "\u2015", "afii08941": "\u20a4", "afii10017": "\u0410", "afii10018": "\u0411", "afii10019": "\u0412", "afii10020": "\u0413", "afii10021": "\u0414", "afii10022": "\u0415", "afii10023": "\u0401", "afii10024": "\u0416", "afii10025": "\u0417", "afii10026": "\u0418", "afii10027": "\u0419", "afii10028": "\u041a", "afii10029": "\u041b", "afii10030": "\u041c", "afii10031": "\u041d", "afii10032": "\u041e", "afii10033": "\u041f", "afii10034": "\u0420", "afii10035": "\u0421", "afii10036": "\u0422", "afii10037": 
"\u0423", "afii10038": "\u0424", "afii10039": "\u0425", "afii10040": "\u0426", "afii10041": "\u0427", "afii10042": "\u0428", "afii10043": "\u0429", "afii10044": "\u042a", "afii10045": "\u042b", "afii10046": "\u042c", "afii10047": "\u042d", "afii10048": "\u042e", "afii10049": "\u042f", "afii10050": "\u0490", "afii10051": "\u0402", "afii10052": "\u0403", "afii10053": "\u0404", "afii10054": "\u0405", "afii10055": "\u0406", "afii10056": "\u0407", "afii10057": "\u0408", "afii10058": "\u0409", "afii10059": "\u040a", "afii10060": "\u040b", "afii10061": "\u040c", "afii10062": "\u040e", "afii10063": "\uf6c4", "afii10064": "\uf6c5", "afii10065": "\u0430", "afii10066": "\u0431", "afii10067": "\u0432", "afii10068": "\u0433", "afii10069": "\u0434", "afii10070": "\u0435", "afii10071": "\u0451", "afii10072": "\u0436", "afii10073": "\u0437", "afii10074": "\u0438", "afii10075": "\u0439", "afii10076": "\u043a", "afii10077": "\u043b", "afii10078": "\u043c", "afii10079": "\u043d", "afii10080": "\u043e", "afii10081": "\u043f", "afii10082": "\u0440", "afii10083": "\u0441", "afii10084": "\u0442", "afii10085": "\u0443", "afii10086": "\u0444", "afii10087": "\u0445", "afii10088": "\u0446", "afii10089": "\u0447", "afii10090": "\u0448", "afii10091": "\u0449", "afii10092": "\u044a", "afii10093": "\u044b", "afii10094": "\u044c", "afii10095": "\u044d", "afii10096": "\u044e", "afii10097": "\u044f", "afii10098": "\u0491", "afii10099": "\u0452", "afii10100": "\u0453", "afii10101": "\u0454", "afii10102": "\u0455", "afii10103": "\u0456", "afii10104": "\u0457", "afii10105": "\u0458", "afii10106": "\u0459", "afii10107": "\u045a", "afii10108": "\u045b", "afii10109": "\u045c", "afii10110": "\u045e", "afii10145": "\u040f", "afii10146": "\u0462", "afii10147": "\u0472", "afii10148": "\u0474", "afii10192": "\uf6c6", "afii10193": "\u045f", "afii10194": "\u0463", "afii10195": "\u0473", "afii10196": "\u0475", "afii10831": "\uf6c7", "afii10832": "\uf6c8", "afii10846": "\u04d9", "afii299": "\u200e", "afii300": 
"\u200f", "afii301": "\u200d", "afii57381": "\u066a", "afii57388": "\u060c", "afii57392": "\u0660", "afii57393": "\u0661", "afii57394": "\u0662", "afii57395": "\u0663", "afii57396": "\u0664", "afii57397": "\u0665", "afii57398": "\u0666", "afii57399": "\u0667", "afii57400": "\u0668", "afii57401": "\u0669", "afii57403": "\u061b", "afii57407": "\u061f", "afii57409": "\u0621", "afii57410": "\u0622", "afii57411": "\u0623", "afii57412": "\u0624", "afii57413": "\u0625", "afii57414": "\u0626", "afii57415": "\u0627", "afii57416": "\u0628", "afii57417": "\u0629", "afii57418": "\u062a", "afii57419": "\u062b", "afii57420": "\u062c", "afii57421": "\u062d", "afii57422": "\u062e", "afii57423": "\u062f", "afii57424": "\u0630", "afii57425": "\u0631", "afii57426": "\u0632", "afii57427": "\u0633", "afii57428": "\u0634", "afii57429": "\u0635", "afii57430": "\u0636", "afii57431": "\u0637", "afii57432": "\u0638", "afii57433": "\u0639", "afii57434": "\u063a", "afii57440": "\u0640", "afii57441": "\u0641", "afii57442": "\u0642", "afii57443": "\u0643", "afii57444": "\u0644", "afii57445": "\u0645", "afii57446": "\u0646", "afii57448": "\u0648", "afii57449": "\u0649", "afii57450": "\u064a", "afii57451": "\u064b", "afii57452": "\u064c", "afii57453": "\u064d", "afii57454": "\u064e", "afii57455": "\u064f", "afii57456": "\u0650", "afii57457": "\u0651", "afii57458": "\u0652", "afii57470": "\u0647", "afii57505": "\u06a4", "afii57506": "\u067e", "afii57507": "\u0686", "afii57508": "\u0698", "afii57509": "\u06af", "afii57511": "\u0679", "afii57512": "\u0688", "afii57513": "\u0691", "afii57514": "\u06ba", "afii57519": "\u06d2", "afii57534": "\u06d5", "afii57636": "\u20aa", "afii57645": "\u05be", "afii57658": "\u05c3", "afii57664": "\u05d0", "afii57665": "\u05d1", "afii57666": "\u05d2", "afii57667": "\u05d3", "afii57668": "\u05d4", "afii57669": "\u05d5", "afii57670": "\u05d6", "afii57671": "\u05d7", "afii57672": "\u05d8", "afii57673": "\u05d9", "afii57674": "\u05da", "afii57675": "\u05db", "afii57676": 
"\u05dc", "afii57677": "\u05dd", "afii57678": "\u05de", "afii57679": "\u05df", "afii57680": "\u05e0", "afii57681": "\u05e1", "afii57682": "\u05e2", "afii57683": "\u05e3", "afii57684": "\u05e4", "afii57685": "\u05e5", "afii57686": "\u05e6", "afii57687": "\u05e7", "afii57688": "\u05e8", "afii57689": "\u05e9", "afii57690": "\u05ea", "afii57694": "\ufb2a", "afii57695": "\ufb2b", "afii57700": "\ufb4b", "afii57705": "\ufb1f", "afii57716": "\u05f0", "afii57717": "\u05f1", "afii57718": "\u05f2", "afii57723": "\ufb35", "afii57793": "\u05b4", "afii57794": "\u05b5", "afii57795": "\u05b6", "afii57796": "\u05bb", "afii57797": "\u05b8", "afii57798": "\u05b7", "afii57799": "\u05b0", "afii57800": "\u05b2", "afii57801": "\u05b1", "afii57802": "\u05b3", "afii57803": "\u05c2", "afii57804": "\u05c1", "afii57806": "\u05b9", "afii57807": "\u05bc", "afii57839": "\u05bd", "afii57841": "\u05bf", "afii57842": "\u05c0", "afii57929": "\u02bc", "afii61248": "\u2105", "afii61289": "\u2113", "afii61352": "\u2116", "afii61573": "\u202c", "afii61574": "\u202d", "afii61575": "\u202e", "afii61664": "\u200c", "afii63167": "\u066d", "afii64937": "\u02bd", "agrave": "\u00e0", "agujarati": "\u0a85", "agurmukhi": "\u0a05", "ahiragana": "\u3042", "ahookabove": "\u1ea3", "aibengali": "\u0990", "aibopomofo": "\u311e", "aideva": "\u0910", "aiecyrillic": "\u04d5", "aigujarati": "\u0a90", "aigurmukhi": "\u0a10", "aimatragurmukhi": "\u0a48", "ainarabic": "\u0639", "ainfinalarabic": "\ufeca", "aininitialarabic": "\ufecb", "ainmedialarabic": "\ufecc", "ainvertedbreve": "\u0203", "aivowelsignbengali": "\u09c8", "aivowelsigndeva": "\u0948", "aivowelsigngujarati": "\u0ac8", "akatakana": "\u30a2", "akatakanahalfwidth": "\uff71", "akorean": "\u314f", "alef": "\u05d0", "alefarabic": "\u0627", "alefdageshhebrew": "\ufb30", "aleffinalarabic": "\ufe8e", "alefhamzaabovearabic": "\u0623", "alefhamzaabovefinalarabic": "\ufe84", "alefhamzabelowarabic": "\u0625", "alefhamzabelowfinalarabic": "\ufe88", "alefhebrew": "\u05d0", 
"aleflamedhebrew": "\ufb4f", "alefmaddaabovearabic": "\u0622", "alefmaddaabovefinalarabic": "\ufe82", "alefmaksuraarabic": "\u0649", "alefmaksurafinalarabic": "\ufef0", "alefmaksurainitialarabic": "\ufef3", "alefmaksuramedialarabic": "\ufef4", "alefpatahhebrew": "\ufb2e", "alefqamatshebrew": "\ufb2f", "aleph": "\u2135", "allequal": "\u224c", "alpha": "\u03b1", "alphatonos": "\u03ac", "amacron": "\u0101", "amonospace": "\uff41", "ampersand": "\u0026", "ampersandmonospace": "\uff06", "ampersandsmall": "\uf726", "amsquare": "\u33c2", "anbopomofo": "\u3122", "angbopomofo": "\u3124", "angkhankhuthai": "\u0e5a", "angle": "\u2220", "anglebracketleft": "\u3008", "anglebracketleftvertical": "\ufe3f", "anglebracketright": "\u3009", "anglebracketrightvertical": "\ufe40", "angleleft": "\u2329", "angleright": "\u232a", "angstrom": "\u212b", "anoteleia": "\u0387", "anudattadeva": "\u0952", "anusvarabengali": "\u0982", "anusvaradeva": "\u0902", "anusvaragujarati": "\u0a82", "aogonek": "\u0105", "apaatosquare": "\u3300", "aparen": "\u249c", "apostrophearmenian": "\u055a", "apostrophemod": "\u02bc", "apple": "\uf8ff", "approaches": "\u2250", "approxequal": "\u2248", "approxequalorimage": "\u2252", "approximatelyequal": "\u2245", "araeaekorean": "\u318e", "araeakorean": "\u318d", "arc": "\u2312", "arighthalfring": "\u1e9a", "aring": "\u00e5", "aringacute": "\u01fb", "aringbelow": "\u1e01", "arrowboth": "\u2194", "arrowdashdown": "\u21e3", "arrowdashleft": "\u21e0", "arrowdashright": "\u21e2", "arrowdashup": "\u21e1", "arrowdblboth": "\u21d4", "arrowdbldown": "\u21d3", "arrowdblleft": "\u21d0", "arrowdblright": "\u21d2", "arrowdblup": "\u21d1", "arrowdown": "\u2193", "arrowdownleft": "\u2199", "arrowdownright": "\u2198", "arrowdownwhite": "\u21e9", "arrowheaddownmod": "\u02c5", "arrowheadleftmod": "\u02c2", "arrowheadrightmod": "\u02c3", "arrowheadupmod": "\u02c4", "arrowhorizex": "\uf8e7", "arrowleft": "\u2190", "arrowleftdbl": "\u21d0", "arrowleftdblstroke": "\u21cd", 
"arrowleftoverright": "\u21c6", "arrowleftwhite": "\u21e6", "arrowright": "\u2192", "arrowrightdblstroke": "\u21cf", "arrowrightheavy": "\u279e", "arrowrightoverleft": "\u21c4", "arrowrightwhite": "\u21e8", "arrowtableft": "\u21e4", "arrowtabright": "\u21e5", "arrowup": "\u2191", "arrowupdn": "\u2195", "arrowupdnbse": "\u21a8", "arrowupdownbase": "\u21a8", "arrowupleft": "\u2196", "arrowupleftofdown": "\u21c5", "arrowupright": "\u2197", "arrowupwhite": "\u21e7", "arrowvertex": "\uf8e6", "asciicircum": "\u005e", "asciicircummonospace": "\uff3e", "asciitilde": "\u007e", "asciitildemonospace": "\uff5e", "ascript": "\u0251", "ascriptturned": "\u0252", "asmallhiragana": "\u3041", "asmallkatakana": "\u30a1", "asmallkatakanahalfwidth": "\uff67", "asterisk": "\u002a", "asteriskaltonearabic": "\u066d", "asteriskarabic": "\u066d", "asteriskmath": "\u2217", "asteriskmonospace": "\uff0a", "asterisksmall": "\ufe61", "asterism": "\u2042", "asuperior": "\uf6e9", "asymptoticallyequal": "\u2243", "at": "\u0040", "atilde": "\u00e3", "atmonospace": "\uff20", "atsmall": "\ufe6b", "aturned": "\u0250", "aubengali": "\u0994", "aubopomofo": "\u3120", "audeva": "\u0914", "augujarati": "\u0a94", "augurmukhi": "\u0a14", "aulengthmarkbengali": "\u09d7", "aumatragurmukhi": "\u0a4c", "auvowelsignbengali": "\u09cc", "auvowelsigndeva": "\u094c", "auvowelsigngujarati": "\u0acc", "avagrahadeva": "\u093d", "aybarmenian": "\u0561", "ayin": "\u05e2", "ayinaltonehebrew": "\ufb20", "ayinhebrew": "\u05e2", "b": "\u0062", "babengali": "\u09ac", "backslash": "\u005c", "backslashmonospace": "\uff3c", "badeva": "\u092c", "bagujarati": "\u0aac", "bagurmukhi": "\u0a2c", "bahiragana": "\u3070", "bahtthai": "\u0e3f", "bakatakana": "\u30d0", "bar": "\u007c", "barmonospace": "\uff5c", "bbopomofo": "\u3105", "bcircle": "\u24d1", "bdotaccent": "\u1e03", "bdotbelow": "\u1e05", "beamedsixteenthnotes": "\u266c", "because": "\u2235", "becyrillic": "\u0431", "beharabic": "\u0628", "behfinalarabic": "\ufe90", 
"behinitialarabic": "\ufe91", "behiragana": "\u3079", "behmedialarabic": "\ufe92", "behmeeminitialarabic": "\ufc9f", "behmeemisolatedarabic": "\ufc08", "behnoonfinalarabic": "\ufc6d", "bekatakana": "\u30d9", "benarmenian": "\u0562", "bet": "\u05d1", "beta": "\u03b2", "betasymbolgreek": "\u03d0", "betdagesh": "\ufb31", "betdageshhebrew": "\ufb31", "bethebrew": "\u05d1", "betrafehebrew": "\ufb4c", "bhabengali": "\u09ad", "bhadeva": "\u092d", "bhagujarati": "\u0aad", "bhagurmukhi": "\u0a2d", "bhook": "\u0253", "bihiragana": "\u3073", "bikatakana": "\u30d3", "bilabialclick": "\u0298", "bindigurmukhi": "\u0a02", "birusquare": "\u3331", "blackcircle": "\u25cf", "blackdiamond": "\u25c6", "blackdownpointingtriangle": "\u25bc", "blackleftpointingpointer": "\u25c4", "blackleftpointingtriangle": "\u25c0", "blacklenticularbracketleft": "\u3010", "blacklenticularbracketleftvertical": "\ufe3b", "blacklenticularbracketright": "\u3011", "blacklenticularbracketrightvertical": "\ufe3c", "blacklowerlefttriangle": "\u25e3", "blacklowerrighttriangle": "\u25e2", "blackrectangle": "\u25ac", "blackrightpointingpointer": "\u25ba", "blackrightpointingtriangle": "\u25b6", "blacksmallsquare": "\u25aa", "blacksmilingface": "\u263b", "blacksquare": "\u25a0", "blackstar": "\u2605", "blackupperlefttriangle": "\u25e4", "blackupperrighttriangle": "\u25e5", "blackuppointingsmalltriangle": "\u25b4", "blackuppointingtriangle": "\u25b2", "blank": "\u2423", "blinebelow": "\u1e07", "block": "\u2588", "bmonospace": "\uff42", "bobaimaithai": "\u0e1a", "bohiragana": "\u307c", "bokatakana": "\u30dc", "bparen": "\u249d", "bqsquare": "\u33c3", "braceex": "\uf8f4", "braceleft": "\u007b", "braceleftbt": "\uf8f3", "braceleftmid": "\uf8f2", "braceleftmonospace": "\uff5b", "braceleftsmall": "\ufe5b", "bracelefttp": "\uf8f1", "braceleftvertical": "\ufe37", "braceright": "\u007d", "bracerightbt": "\uf8fe", "bracerightmid": "\uf8fd", "bracerightmonospace": "\uff5d", "bracerightsmall": "\ufe5c", "bracerighttp": 
"\uf8fc", "bracerightvertical": "\ufe38", "bracketleft": "\u005b", "bracketleftbt": "\uf8f0", "bracketleftex": "\uf8ef", "bracketleftmonospace": "\uff3b", "bracketlefttp": "\uf8ee", "bracketright": "\u005d", "bracketrightbt": "\uf8fb", "bracketrightex": "\uf8fa", "bracketrightmonospace": "\uff3d", "bracketrighttp": "\uf8f9", "breve": "\u02d8", "brevebelowcmb": "\u032e", "brevecmb": "\u0306", "breveinvertedbelowcmb": "\u032f", "breveinvertedcmb": "\u0311", "breveinverteddoublecmb": "\u0361", "bridgebelowcmb": "\u032a", "bridgeinvertedbelowcmb": "\u033a", "brokenbar": "\u00a6", "bstroke": "\u0180", "bsuperior": "\uf6ea", "btopbar": "\u0183", "buhiragana": "\u3076", "bukatakana": "\u30d6", "bullet": "\u2022", "bulletinverse": "\u25d8", "bulletoperator": "\u2219", "bullseye": "\u25ce", "c": "\u0063", "caarmenian": "\u056e", "cabengali": "\u099a", "cacute": "\u0107", "cadeva": "\u091a", "cagujarati": "\u0a9a", "cagurmukhi": "\u0a1a", "calsquare": "\u3388", "candrabindubengali": "\u0981", "candrabinducmb": "\u0310", "candrabindudeva": "\u0901", "candrabindugujarati": "\u0a81", "capslock": "\u21ea", "careof": "\u2105", "caron": "\u02c7", "caronbelowcmb": "\u032c", "caroncmb": "\u030c", "carriagereturn": "\u21b5", "cbopomofo": "\u3118", "ccaron": "\u010d", "ccedilla": "\u00e7", "ccedillaacute": "\u1e09", "ccircle": "\u24d2", "ccircumflex": "\u0109", "ccurl": "\u0255", "cdot": "\u010b", "cdotaccent": "\u010b", "cdsquare": "\u33c5", "cedilla": "\u00b8", "cedillacmb": "\u0327", "cent": "\u00a2", "centigrade": "\u2103", "centinferior": "\uf6df", "centmonospace": "\uffe0", "centoldstyle": "\uf7a2", "centsuperior": "\uf6e0", "chaarmenian": "\u0579", "chabengali": "\u099b", "chadeva": "\u091b", "chagujarati": "\u0a9b", "chagurmukhi": "\u0a1b", "chbopomofo": "\u3114", "cheabkhasiancyrillic": "\u04bd", "checkmark": "\u2713", "checyrillic": "\u0447", "chedescenderabkhasiancyrillic": "\u04bf", "chedescendercyrillic": "\u04b7", "chedieresiscyrillic": "\u04f5", "cheharmenian": 
"\u0573", "chekhakassiancyrillic": "\u04cc", "cheverticalstrokecyrillic": "\u04b9", "chi": "\u03c7", "chieuchacirclekorean": "\u3277", "chieuchaparenkorean": "\u3217", "chieuchcirclekorean": "\u3269", "chieuchkorean": "\u314a", "chieuchparenkorean": "\u3209", "chochangthai": "\u0e0a", "chochanthai": "\u0e08", "chochingthai": "\u0e09", "chochoethai": "\u0e0c", "chook": "\u0188", "cieucacirclekorean": "\u3276", "cieucaparenkorean": "\u3216", "cieuccirclekorean": "\u3268", "cieuckorean": "\u3148", "cieucparenkorean": "\u3208", "cieucuparenkorean": "\u321c", "circle": "\u25cb", "circlemultiply": "\u2297", "circleot": "\u2299", "circleplus": "\u2295", "circlepostalmark": "\u3036", "circlewithlefthalfblack": "\u25d0", "circlewithrighthalfblack": "\u25d1", "circumflex": "\u02c6", "circumflexbelowcmb": "\u032d", "circumflexcmb": "\u0302", "clear": "\u2327", "clickalveolar": "\u01c2", "clickdental": "\u01c0", "clicklateral": "\u01c1", "clickretroflex": "\u01c3", "club": "\u2663", "clubsuitblack": "\u2663", "clubsuitwhite": "\u2667", "cmcubedsquare": "\u33a4", "cmonospace": "\uff43", "cmsquaredsquare": "\u33a0", "coarmenian": "\u0581", "colon": "\u003a", "colonmonetary": "\u20a1", "colonmonospace": "\uff1a", "colonsign": "\u20a1", "colonsmall": "\ufe55", "colontriangularhalfmod": "\u02d1", "colontriangularmod": "\u02d0", "comma": "\u002c", "commaabovecmb": "\u0313", "commaaboverightcmb": "\u0315", "commaaccent": "\uf6c3", "commaarabic": "\u060c", "commaarmenian": "\u055d", "commainferior": "\uf6e1", "commamonospace": "\uff0c", "commareversedabovecmb": "\u0314", "commareversedmod": "\u02bd", "commasmall": "\ufe50", "commasuperior": "\uf6e2", "commaturnedabovecmb": "\u0312", "commaturnedmod": "\u02bb", "compass": "\u263c", "congruent": "\u2245", "contourintegral": "\u222e", "control": "\u2303", "controlACK": "\u0006", "controlBEL": "\u0007", "controlBS": "\u0008", "controlCAN": "\u0018", "controlCR": "\u000d", "controlDC1": "\u0011", "controlDC2": "\u0012", "controlDC3": 
"\u0013", "controlDC4": "\u0014", "controlDEL": "\u007f", "controlDLE": "\u0010", "controlEM": "\u0019", "controlENQ": "\u0005", "controlEOT": "\u0004", "controlESC": "\u001b", "controlETB": "\u0017", "controlETX": "\u0003", "controlFF": "\u000c", "controlFS": "\u001c", "controlGS": "\u001d", "controlHT": "\u0009", "controlLF": "\u000a", "controlNAK": "\u0015", "controlRS": "\u001e", "controlSI": "\u000f", "controlSO": "\u000e", "controlSOT": "\u0002", "controlSTX": "\u0001", "controlSUB": "\u001a", "controlSYN": "\u0016", "controlUS": "\u001f", "controlVT": "\u000b", "copyright": "\u00a9", "copyrightsans": "\uf8e9", "copyrightserif": "\uf6d9", "cornerbracketleft": "\u300c", "cornerbracketlefthalfwidth": "\uff62", "cornerbracketleftvertical": "\ufe41", "cornerbracketright": "\u300d", "cornerbracketrighthalfwidth": "\uff63", "cornerbracketrightvertical": "\ufe42", "corporationsquare": "\u337f", "cosquare": "\u33c7", "coverkgsquare": "\u33c6", "cparen": "\u249e", "cruzeiro": "\u20a2", "cstretched": "\u0297", "curlyand": "\u22cf", "curlyor": "\u22ce", "currency": "\u00a4", "cyrBreve": "\uf6d1", "cyrFlex": "\uf6d2", "cyrbreve": "\uf6d4", "cyrflex": "\uf6d5", "d": "\u0064", "daarmenian": "\u0564", "dabengali": "\u09a6", "dadarabic": "\u0636", "dadeva": "\u0926", "dadfinalarabic": "\ufebe", "dadinitialarabic": "\ufebf", "dadmedialarabic": "\ufec0", "dagesh": "\u05bc", "dageshhebrew": "\u05bc", "dagger": "\u2020", "daggerdbl": "\u2021", "dagujarati": "\u0aa6", "dagurmukhi": "\u0a26", "dahiragana": "\u3060", "dakatakana": "\u30c0", "dalarabic": "\u062f", "dalet": "\u05d3", "daletdagesh": "\ufb33", "daletdageshhebrew": "\ufb33", "dalethatafpatah": "\u05d3\u05b2", "dalethatafpatahhebrew": "\u05d3\u05b2", "dalethatafsegol": "\u05d3\u05b1", "dalethatafsegolhebrew": "\u05d3\u05b1", "dalethebrew": "\u05d3", "dalethiriq": "\u05d3\u05b4", "dalethiriqhebrew": "\u05d3\u05b4", "daletholam": "\u05d3\u05b9", "daletholamhebrew": "\u05d3\u05b9", "daletpatah": "\u05d3\u05b7", 
"daletpatahhebrew": "\u05d3\u05b7", "daletqamats": "\u05d3\u05b8", "daletqamatshebrew": "\u05d3\u05b8", "daletqubuts": "\u05d3\u05bb", "daletqubutshebrew": "\u05d3\u05bb", "daletsegol": "\u05d3\u05b6", "daletsegolhebrew": "\u05d3\u05b6", "daletsheva": "\u05d3\u05b0", "daletshevahebrew": "\u05d3\u05b0", "dalettsere": "\u05d3\u05b5", "dalettserehebrew": "\u05d3\u05b5", "dalfinalarabic": "\ufeaa", "dammaarabic": "\u064f", "dammalowarabic": "\u064f", "dammatanaltonearabic": "\u064c", "dammatanarabic": "\u064c", "danda": "\u0964", "dargahebrew": "\u05a7", "dargalefthebrew": "\u05a7", "dasiapneumatacyrilliccmb": "\u0485", "dblGrave": "\uf6d3", "dblanglebracketleft": "\u300a", "dblanglebracketleftvertical": "\ufe3d", "dblanglebracketright": "\u300b", "dblanglebracketrightvertical": "\ufe3e", "dblarchinvertedbelowcmb": "\u032b", "dblarrowleft": "\u21d4", "dblarrowright": "\u21d2", "dbldanda": "\u0965", "dblgrave": "\uf6d6", "dblgravecmb": "\u030f", "dblintegral": "\u222c", "dbllowline": "\u2017", "dbllowlinecmb": "\u0333", "dbloverlinecmb": "\u033f", "dblprimemod": "\u02ba", "dblverticalbar": "\u2016", "dblverticallineabovecmb": "\u030e", "dbopomofo": "\u3109", "dbsquare": "\u33c8", "dcaron": "\u010f", "dcedilla": "\u1e11", "dcircle": "\u24d3", "dcircumflexbelow": "\u1e13", "dcroat": "\u0111", "ddabengali": "\u09a1", "ddadeva": "\u0921", "ddagujarati": "\u0aa1", "ddagurmukhi": "\u0a21", "ddalarabic": "\u0688", "ddalfinalarabic": "\ufb89", "dddhadeva": "\u095c", "ddhabengali": "\u09a2", "ddhadeva": "\u0922", "ddhagujarati": "\u0aa2", "ddhagurmukhi": "\u0a22", "ddotaccent": "\u1e0b", "ddotbelow": "\u1e0d", "decimalseparatorarabic": "\u066b", "decimalseparatorpersian": "\u066b", "decyrillic": "\u0434", "degree": "\u00b0", "dehihebrew": "\u05ad", "dehiragana": "\u3067", "deicoptic": "\u03ef", "dekatakana": "\u30c7", "deleteleft": "\u232b", "deleteright": "\u2326", "delta": "\u03b4", "deltaturned": "\u018d", "denominatorminusonenumeratorbengali": "\u09f8", "dezh": "\u02a4", 
"dhabengali": "\u09a7", "dhadeva": "\u0927", "dhagujarati": "\u0aa7", "dhagurmukhi": "\u0a27", "dhook": "\u0257", "dialytikatonos": "\u0385", "dialytikatonoscmb": "\u0344", "diamond": "\u2666", "diamondsuitwhite": "\u2662", "dieresis": "\u00a8", "dieresisacute": "\uf6d7", "dieresisbelowcmb": "\u0324", "dieresiscmb": "\u0308", "dieresisgrave": "\uf6d8", "dieresistonos": "\u0385", "dihiragana": "\u3062", "dikatakana": "\u30c2", "dittomark": "\u3003", "divide": "\u00f7", "divides": "\u2223", "divisionslash": "\u2215", "djecyrillic": "\u0452", "dkshade": "\u2593", "dlinebelow": "\u1e0f", "dlsquare": "\u3397", "dmacron": "\u0111", "dmonospace": "\uff44", "dnblock": "\u2584", "dochadathai": "\u0e0e", "dodekthai": "\u0e14", "dohiragana": "\u3069", "dokatakana": "\u30c9", "dollar": "\u0024", "dollarinferior": "\uf6e3", "dollarmonospace": "\uff04", "dollaroldstyle": "\uf724", "dollarsmall": "\ufe69", "dollarsuperior": "\uf6e4", "dong": "\u20ab", "dorusquare": "\u3326", "dotaccent": "\u02d9", "dotaccentcmb": "\u0307", "dotbelowcmb": "\u0323", "dotbelowcomb": "\u0323", "dotkatakana": "\u30fb", "dotlessi": "\u0131", "dotlessj": "\uf6be", "dotlessjstrokehook": "\u0284", "dotmath": "\u22c5", "dottedcircle": "\u25cc", "doubleyodpatah": "\ufb1f", "doubleyodpatahhebrew": "\ufb1f", "downtackbelowcmb": "\u031e", "downtackmod": "\u02d5", "dparen": "\u249f", "dsuperior": "\uf6eb", "dtail": "\u0256", "dtopbar": "\u018c", "duhiragana": "\u3065", "dukatakana": "\u30c5", "dz": "\u01f3", "dzaltone": "\u02a3", "dzcaron": "\u01c6", "dzcurl": "\u02a5", "dzeabkhasiancyrillic": "\u04e1", "dzecyrillic": "\u0455", "dzhecyrillic": "\u045f", "e": "\u0065", "eacute": "\u00e9", "earth": "\u2641", "ebengali": "\u098f", "ebopomofo": "\u311c", "ebreve": "\u0115", "ecandradeva": "\u090d", "ecandragujarati": "\u0a8d", "ecandravowelsigndeva": "\u0945", "ecandravowelsigngujarati": "\u0ac5", "ecaron": "\u011b", "ecedillabreve": "\u1e1d", "echarmenian": "\u0565", "echyiwnarmenian": "\u0587", "ecircle": 
"\u24d4", "ecircumflex": "\u00ea", "ecircumflexacute": "\u1ebf", "ecircumflexbelow": "\u1e19", "ecircumflexdotbelow": "\u1ec7", "ecircumflexgrave": "\u1ec1", "ecircumflexhookabove": "\u1ec3", "ecircumflextilde": "\u1ec5", "ecyrillic": "\u0454", "edblgrave": "\u0205", "edeva": "\u090f", "edieresis": "\u00eb", "edot": "\u0117", "edotaccent": "\u0117", "edotbelow": "\u1eb9", "eegurmukhi": "\u0a0f", "eematragurmukhi": "\u0a47", "efcyrillic": "\u0444", "egrave": "\u00e8", "egujarati": "\u0a8f", "eharmenian": "\u0567", "ehbopomofo": "\u311d", "ehiragana": "\u3048", "ehookabove": "\u1ebb", "eibopomofo": "\u311f", "eight": "\u0038", "eightarabic": "\u0668", "eightbengali": "\u09ee", "eightcircle": "\u2467", "eightcircleinversesansserif": "\u2791", "eightdeva": "\u096e", "eighteencircle": "\u2471", "eighteenparen": "\u2485", "eighteenperiod": "\u2499", "eightgujarati": "\u0aee", "eightgurmukhi": "\u0a6e", "eighthackarabic": "\u0668", "eighthangzhou": "\u3028", "eighthnotebeamed": "\u266b", "eightideographicparen": "\u3227", "eightinferior": "\u2088", "eightmonospace": "\uff18", "eightoldstyle": "\uf738", "eightparen": "\u247b", "eightperiod": "\u248f", "eightpersian": "\u06f8", "eightroman": "\u2177", "eightsuperior": "\u2078", "eightthai": "\u0e58", "einvertedbreve": "\u0207", "eiotifiedcyrillic": "\u0465", "ekatakana": "\u30a8", "ekatakanahalfwidth": "\uff74", "ekonkargurmukhi": "\u0a74", "ekorean": "\u3154", "elcyrillic": "\u043b", "element": "\u2208", "elevencircle": "\u246a", "elevenparen": "\u247e", "elevenperiod": "\u2492", "elevenroman": "\u217a", "ellipsis": "\u2026", "ellipsisvertical": "\u22ee", "emacron": "\u0113", "emacronacute": "\u1e17", "emacrongrave": "\u1e15", "emcyrillic": "\u043c", "emdash": "\u2014", "emdashvertical": "\ufe31", "emonospace": "\uff45", "emphasismarkarmenian": "\u055b", "emptyset": "\u2205", "enbopomofo": "\u3123", "encyrillic": "\u043d", "endash": "\u2013", "endashvertical": "\ufe32", "endescendercyrillic": "\u04a3", "eng": "\u014b", 
"engbopomofo": "\u3125", "enghecyrillic": "\u04a5", "enhookcyrillic": "\u04c8", "enspace": "\u2002", "eogonek": "\u0119", "eokorean": "\u3153", "eopen": "\u025b", "eopenclosed": "\u029a", "eopenreversed": "\u025c", "eopenreversedclosed": "\u025e", "eopenreversedhook": "\u025d", "eparen": "\u24a0", "epsilon": "\u03b5", "epsilontonos": "\u03ad", "equal": "\u003d", "equalmonospace": "\uff1d", "equalsmall": "\ufe66", "equalsuperior": "\u207c", "equivalence": "\u2261", "erbopomofo": "\u3126", "ercyrillic": "\u0440", "ereversed": "\u0258", "ereversedcyrillic": "\u044d", "escyrillic": "\u0441", "esdescendercyrillic": "\u04ab", "esh": "\u0283", "eshcurl": "\u0286", "eshortdeva": "\u090e", "eshortvowelsigndeva": "\u0946", "eshreversedloop": "\u01aa", "eshsquatreversed": "\u0285", "esmallhiragana": "\u3047", "esmallkatakana": "\u30a7", "esmallkatakanahalfwidth": "\uff6a", "estimated": "\u212e", "esuperior": "\uf6ec", "eta": "\u03b7", "etarmenian": "\u0568", "etatonos": "\u03ae", "eth": "\u00f0", "etilde": "\u1ebd", "etildebelow": "\u1e1b", "etnahtafoukhhebrew": "\u0591", "etnahtafoukhlefthebrew": "\u0591", "etnahtahebrew": "\u0591", "etnahtalefthebrew": "\u0591", "eturned": "\u01dd", "eukorean": "\u3161", "euro": "\u20ac", "evowelsignbengali": "\u09c7", "evowelsigndeva": "\u0947", "evowelsigngujarati": "\u0ac7", "exclam": "\u0021", "exclamarmenian": "\u055c", "exclamdbl": "\u203c", "exclamdown": "\u00a1", "exclamdownsmall": "\uf7a1", "exclammonospace": "\uff01", "exclamsmall": "\uf721", "existential": "\u2203", "ezh": "\u0292", "ezhcaron": "\u01ef", "ezhcurl": "\u0293", "ezhreversed": "\u01b9", "ezhtail": "\u01ba", "f": "\u0066", "fadeva": "\u095e", "fagurmukhi": "\u0a5e", "fahrenheit": "\u2109", "fathaarabic": "\u064e", "fathalowarabic": "\u064e", "fathatanarabic": "\u064b", "fbopomofo": "\u3108", "fcircle": "\u24d5", "fdotaccent": "\u1e1f", "feharabic": "\u0641", "feharmenian": "\u0586", "fehfinalarabic": "\ufed2", "fehinitialarabic": "\ufed3", "fehmedialarabic": "\ufed4", 
"feicoptic": "\u03e5", "female": "\u2640", "ff": "\ufb00", "ffi": "\ufb03", "ffl": "\ufb04", "fi": "\ufb01", "fifteencircle": "\u246e", "fifteenparen": "\u2482", "fifteenperiod": "\u2496", "figuredash": "\u2012", "filledbox": "\u25a0", "filledrect": "\u25ac", "finalkaf": "\u05da", "finalkafdagesh": "\ufb3a", "finalkafdageshhebrew": "\ufb3a", "finalkafhebrew": "\u05da", "finalkafqamats": "\u05da\u05b8", "finalkafqamatshebrew": "\u05da\u05b8", "finalkafsheva": "\u05da\u05b0", "finalkafshevahebrew": "\u05da\u05b0", "finalmem": "\u05dd", "finalmemhebrew": "\u05dd", "finalnun": "\u05df", "finalnunhebrew": "\u05df", "finalpe": "\u05e3", "finalpehebrew": "\u05e3", "finaltsadi": "\u05e5", "finaltsadihebrew": "\u05e5", "firsttonechinese": "\u02c9", "fisheye": "\u25c9", "fitacyrillic": "\u0473", "five": "\u0035", "fivearabic": "\u0665", "fivebengali": "\u09eb", "fivecircle": "\u2464", "fivecircleinversesansserif": "\u278e", "fivedeva": "\u096b", "fiveeighths": "\u215d", "fivegujarati": "\u0aeb", "fivegurmukhi": "\u0a6b", "fivehackarabic": "\u0665", "fivehangzhou": "\u3025", "fiveideographicparen": "\u3224", "fiveinferior": "\u2085", "fivemonospace": "\uff15", "fiveoldstyle": "\uf735", "fiveparen": "\u2478", "fiveperiod": "\u248c", "fivepersian": "\u06f5", "fiveroman": "\u2174", "fivesuperior": "\u2075", "fivethai": "\u0e55", "fl": "\ufb02", "florin": "\u0192", "fmonospace": "\uff46", "fmsquare": "\u3399", "fofanthai": "\u0e1f", "fofathai": "\u0e1d", "fongmanthai": "\u0e4f", "forall": "\u2200", "four": "\u0034", "fourarabic": "\u0664", "fourbengali": "\u09ea", "fourcircle": "\u2463", "fourcircleinversesansserif": "\u278d", "fourdeva": "\u096a", "fourgujarati": "\u0aea", "fourgurmukhi": "\u0a6a", "fourhackarabic": "\u0664", "fourhangzhou": "\u3024", "fourideographicparen": "\u3223", "fourinferior": "\u2084", "fourmonospace": "\uff14", "fournumeratorbengali": "\u09f7", "fouroldstyle": "\uf734", "fourparen": "\u2477", "fourperiod": "\u248b", "fourpersian": "\u06f4", "fourroman": 
"\u2173", "foursuperior": "\u2074", "fourteencircle": "\u246d", "fourteenparen": "\u2481", "fourteenperiod": "\u2495", "fourthai": "\u0e54", "fourthtonechinese": "\u02cb", "fparen": "\u24a1", "fraction": "\u2044", "franc": "\u20a3", "g": "\u0067", "gabengali": "\u0997", "gacute": "\u01f5", "gadeva": "\u0917", "gafarabic": "\u06af", "gaffinalarabic": "\ufb93", "gafinitialarabic": "\ufb94", "gafmedialarabic": "\ufb95", "gagujarati": "\u0a97", "gagurmukhi": "\u0a17", "gahiragana": "\u304c", "gakatakana": "\u30ac", "gamma": "\u03b3", "gammalatinsmall": "\u0263", "gammasuperior": "\u02e0", "gangiacoptic": "\u03eb", "gbopomofo": "\u310d", "gbreve": "\u011f", "gcaron": "\u01e7", "gcedilla": "\u0123", "gcircle": "\u24d6", "gcircumflex": "\u011d", "gcommaaccent": "\u0123", "gdot": "\u0121", "gdotaccent": "\u0121", "gecyrillic": "\u0433", "gehiragana": "\u3052", "gekatakana": "\u30b2", "geometricallyequal": "\u2251", "gereshaccenthebrew": "\u059c", "gereshhebrew": "\u05f3", "gereshmuqdamhebrew": "\u059d", "germandbls": "\u00df", "gershayimaccenthebrew": "\u059e", "gershayimhebrew": "\u05f4", "getamark": "\u3013", "ghabengali": "\u0998", "ghadarmenian": "\u0572", "ghadeva": "\u0918", "ghagujarati": "\u0a98", "ghagurmukhi": "\u0a18", "ghainarabic": "\u063a", "ghainfinalarabic": "\ufece", "ghaininitialarabic": "\ufecf", "ghainmedialarabic": "\ufed0", "ghemiddlehookcyrillic": "\u0495", "ghestrokecyrillic": "\u0493", "gheupturncyrillic": "\u0491", "ghhadeva": "\u095a", "ghhagurmukhi": "\u0a5a", "ghook": "\u0260", "ghzsquare": "\u3393", "gihiragana": "\u304e", "gikatakana": "\u30ae", "gimarmenian": "\u0563", "gimel": "\u05d2", "gimeldagesh": "\ufb32", "gimeldageshhebrew": "\ufb32", "gimelhebrew": "\u05d2", "gjecyrillic": "\u0453", "glottalinvertedstroke": "\u01be", "glottalstop": "\u0294", "glottalstopinverted": "\u0296", "glottalstopmod": "\u02c0", "glottalstopreversed": "\u0295", "glottalstopreversedmod": "\u02c1", "glottalstopreversedsuperior": "\u02e4", "glottalstopstroke": 
"\u02a1", "glottalstopstrokereversed": "\u02a2", "gmacron": "\u1e21", "gmonospace": "\uff47", "gohiragana": "\u3054", "gokatakana": "\u30b4", "gparen": "\u24a2", "gpasquare": "\u33ac", "gradient": "\u2207", "grave": "\u0060", "gravebelowcmb": "\u0316", "gravecmb": "\u0300", "gravecomb": "\u0300", "gravedeva": "\u0953", "gravelowmod": "\u02ce", "gravemonospace": "\uff40", "gravetonecmb": "\u0340", "greater": "\u003e", "greaterequal": "\u2265", "greaterequalorless": "\u22db", "greatermonospace": "\uff1e", "greaterorequivalent": "\u2273", "greaterorless": "\u2277", "greateroverequal": "\u2267", "greatersmall": "\ufe65", "gscript": "\u0261", "gstroke": "\u01e5", "guhiragana": "\u3050", "guillemotleft": "\u00ab", "guillemotright": "\u00bb", "guilsinglleft": "\u2039", "guilsinglright": "\u203a", "gukatakana": "\u30b0", "guramusquare": "\u3318", "gysquare": "\u33c9", "h": "\u0068", "haabkhasiancyrillic": "\u04a9", "haaltonearabic": "\u06c1", "habengali": "\u09b9", "hadescendercyrillic": "\u04b3", "hadeva": "\u0939", "hagujarati": "\u0ab9", "hagurmukhi": "\u0a39", "haharabic": "\u062d", "hahfinalarabic": "\ufea2", "hahinitialarabic": "\ufea3", "hahiragana": "\u306f", "hahmedialarabic": "\ufea4", "haitusquare": "\u332a", "hakatakana": "\u30cf", "hakatakanahalfwidth": "\uff8a", "halantgurmukhi": "\u0a4d", "hamzaarabic": "\u0621", "hamzadammaarabic": "\u0621\u064f", "hamzadammatanarabic": "\u0621\u064c", "hamzafathaarabic": "\u0621\u064e", "hamzafathatanarabic": "\u0621\u064b", "hamzalowarabic": "\u0621", "hamzalowkasraarabic": "\u0621\u0650", "hamzalowkasratanarabic": "\u0621\u064d", "hamzasukunarabic": "\u0621\u0652", "hangulfiller": "\u3164", "hardsigncyrillic": "\u044a", "harpoonleftbarbup": "\u21bc", "harpoonrightbarbup": "\u21c0", "hasquare": "\u33ca", "hatafpatah": "\u05b2", "hatafpatah16": "\u05b2", "hatafpatah23": "\u05b2", "hatafpatah2f": "\u05b2", "hatafpatahhebrew": "\u05b2", "hatafpatahnarrowhebrew": "\u05b2", "hatafpatahquarterhebrew": "\u05b2", 
"hatafpatahwidehebrew": "\u05b2", "hatafqamats": "\u05b3", "hatafqamats1b": "\u05b3", "hatafqamats28": "\u05b3", "hatafqamats34": "\u05b3", "hatafqamatshebrew": "\u05b3", "hatafqamatsnarrowhebrew": "\u05b3", "hatafqamatsquarterhebrew": "\u05b3", "hatafqamatswidehebrew": "\u05b3", "hatafsegol": "\u05b1", "hatafsegol17": "\u05b1", "hatafsegol24": "\u05b1", "hatafsegol30": "\u05b1", "hatafsegolhebrew": "\u05b1", "hatafsegolnarrowhebrew": "\u05b1", "hatafsegolquarterhebrew": "\u05b1", "hatafsegolwidehebrew": "\u05b1", "hbar": "\u0127", "hbopomofo": "\u310f", "hbrevebelow": "\u1e2b", "hcedilla": "\u1e29", "hcircle": "\u24d7", "hcircumflex": "\u0125", "hdieresis": "\u1e27", "hdotaccent": "\u1e23", "hdotbelow": "\u1e25", "he": "\u05d4", "heart": "\u2665", "heartsuitblack": "\u2665", "heartsuitwhite": "\u2661", "hedagesh": "\ufb34", "hedageshhebrew": "\ufb34", "hehaltonearabic": "\u06c1", "heharabic": "\u0647", "hehebrew": "\u05d4", "hehfinalaltonearabic": "\ufba7", "hehfinalalttwoarabic": "\ufeea", "hehfinalarabic": "\ufeea", "hehhamzaabovefinalarabic": "\ufba5", "hehhamzaaboveisolatedarabic": "\ufba4", "hehinitialaltonearabic": "\ufba8", "hehinitialarabic": "\ufeeb", "hehiragana": "\u3078", "hehmedialaltonearabic": "\ufba9", "hehmedialarabic": "\ufeec", "heiseierasquare": "\u337b", "hekatakana": "\u30d8", "hekatakanahalfwidth": "\uff8d", "hekutaarusquare": "\u3336", "henghook": "\u0267", "herutusquare": "\u3339", "het": "\u05d7", "hethebrew": "\u05d7", "hhook": "\u0266", "hhooksuperior": "\u02b1", "hieuhacirclekorean": "\u327b", "hieuhaparenkorean": "\u321b", "hieuhcirclekorean": "\u326d", "hieuhkorean": "\u314e", "hieuhparenkorean": "\u320d", "hihiragana": "\u3072", "hikatakana": "\u30d2", "hikatakanahalfwidth": "\uff8b", "hiriq": "\u05b4", "hiriq14": "\u05b4", "hiriq21": "\u05b4", "hiriq2d": "\u05b4", "hiriqhebrew": "\u05b4", "hiriqnarrowhebrew": "\u05b4", "hiriqquarterhebrew": "\u05b4", "hiriqwidehebrew": "\u05b4", "hlinebelow": "\u1e96", "hmonospace": "\uff48", 
"hoarmenian": "\u0570", "hohipthai": "\u0e2b", "hohiragana": "\u307b", "hokatakana": "\u30db", "hokatakanahalfwidth": "\uff8e", "holam": "\u05b9", "holam19": "\u05b9", "holam26": "\u05b9", "holam32": "\u05b9", "holamhebrew": "\u05b9", "holamnarrowhebrew": "\u05b9", "holamquarterhebrew": "\u05b9", "holamwidehebrew": "\u05b9", "honokhukthai": "\u0e2e", "hookabovecomb": "\u0309", "hookcmb": "\u0309", "hookpalatalizedbelowcmb": "\u0321", "hookretroflexbelowcmb": "\u0322", "hoonsquare": "\u3342", "horicoptic": "\u03e9", "horizontalbar": "\u2015", "horncmb": "\u031b", "hotsprings": "\u2668", "house": "\u2302", "hparen": "\u24a3", "hsuperior": "\u02b0", "hturned": "\u0265", "huhiragana": "\u3075", "huiitosquare": "\u3333", "hukatakana": "\u30d5", "hukatakanahalfwidth": "\uff8c", "hungarumlaut": "\u02dd", "hungarumlautcmb": "\u030b", "hv": "\u0195", "hyphen": "\u002d", "hypheninferior": "\uf6e5", "hyphenmonospace": "\uff0d", "hyphensmall": "\ufe63", "hyphensuperior": "\uf6e6", "hyphentwo": "\u2010", "i": "\u0069", "iacute": "\u00ed", "iacyrillic": "\u044f", "ibengali": "\u0987", "ibopomofo": "\u3127", "ibreve": "\u012d", "icaron": "\u01d0", "icircle": "\u24d8", "icircumflex": "\u00ee", "icyrillic": "\u0456", "idblgrave": "\u0209", "ideographearthcircle": "\u328f", "ideographfirecircle": "\u328b", "ideographicallianceparen": "\u323f", "ideographiccallparen": "\u323a", "ideographiccentrecircle": "\u32a5", "ideographicclose": "\u3006", "ideographiccomma": "\u3001", "ideographiccommaleft": "\uff64", "ideographiccongratulationparen": "\u3237", "ideographiccorrectcircle": "\u32a3", "ideographicearthparen": "\u322f", "ideographicenterpriseparen": "\u323d", "ideographicexcellentcircle": "\u329d", "ideographicfestivalparen": "\u3240", "ideographicfinancialcircle": "\u3296", "ideographicfinancialparen": "\u3236", "ideographicfireparen": "\u322b", "ideographichaveparen": "\u3232", "ideographichighcircle": "\u32a4", "ideographiciterationmark": "\u3005", "ideographiclaborcircle": 
"\u3298", "ideographiclaborparen": "\u3238", "ideographicleftcircle": "\u32a7", "ideographiclowcircle": "\u32a6", "ideographicmedicinecircle": "\u32a9", "ideographicmetalparen": "\u322e", "ideographicmoonparen": "\u322a", "ideographicnameparen": "\u3234", "ideographicperiod": "\u3002", "ideographicprintcircle": "\u329e", "ideographicreachparen": "\u3243", "ideographicrepresentparen": "\u3239", "ideographicresourceparen": "\u323e", "ideographicrightcircle": "\u32a8", "ideographicsecretcircle": "\u3299", "ideographicselfparen": "\u3242", "ideographicsocietyparen": "\u3233", "ideographicspace": "\u3000", "ideographicspecialparen": "\u3235", "ideographicstockparen": "\u3231", "ideographicstudyparen": "\u323b", "ideographicsunparen": "\u3230", "ideographicsuperviseparen": "\u323c", "ideographicwaterparen": "\u322c", "ideographicwoodparen": "\u322d", "ideographiczero": "\u3007", "ideographmetalcircle": "\u328e", "ideographmooncircle": "\u328a", "ideographnamecircle": "\u3294", "ideographsuncircle": "\u3290", "ideographwatercircle": "\u328c", "ideographwoodcircle": "\u328d", "ideva": "\u0907", "idieresis": "\u00ef", "idieresisacute": "\u1e2f", "idieresiscyrillic": "\u04e5", "idotbelow": "\u1ecb", "iebrevecyrillic": "\u04d7", "iecyrillic": "\u0435", "ieungacirclekorean": "\u3275", "ieungaparenkorean": "\u3215", "ieungcirclekorean": "\u3267", "ieungkorean": "\u3147", "ieungparenkorean": "\u3207", "igrave": "\u00ec", "igujarati": "\u0a87", "igurmukhi": "\u0a07", "ihiragana": "\u3044", "ihookabove": "\u1ec9", "iibengali": "\u0988", "iicyrillic": "\u0438", "iideva": "\u0908", "iigujarati": "\u0a88", "iigurmukhi": "\u0a08", "iimatragurmukhi": "\u0a40", "iinvertedbreve": "\u020b", "iishortcyrillic": "\u0439", "iivowelsignbengali": "\u09c0", "iivowelsigndeva": "\u0940", "iivowelsigngujarati": "\u0ac0", "ij": "\u0133", "ikatakana": "\u30a4", "ikatakanahalfwidth": "\uff72", "ikorean": "\u3163", "ilde": "\u02dc", "iluyhebrew": "\u05ac", "imacron": "\u012b", "imacroncyrillic": 
"\u04e3", "imageorapproximatelyequal": "\u2253", "imatragurmukhi": "\u0a3f", "imonospace": "\uff49", "increment": "\u2206", "infinity": "\u221e", "iniarmenian": "\u056b", "integral": "\u222b", "integralbottom": "\u2321", "integralbt": "\u2321", "integralex": "\uf8f5", "integraltop": "\u2320", "integraltp": "\u2320", "intersection": "\u2229", "intisquare": "\u3305", "invbullet": "\u25d8", "invcircle": "\u25d9", "invsmileface": "\u263b", "iocyrillic": "\u0451", "iogonek": "\u012f", "iota": "\u03b9", "iotadieresis": "\u03ca", "iotadieresistonos": "\u0390", "iotalatin": "\u0269", "iotatonos": "\u03af", "iparen": "\u24a4", "irigurmukhi": "\u0a72", "ismallhiragana": "\u3043", "ismallkatakana": "\u30a3", "ismallkatakanahalfwidth": "\uff68", "issharbengali": "\u09fa", "istroke": "\u0268", "isuperior": "\uf6ed", "iterationhiragana": "\u309d", "iterationkatakana": "\u30fd", "itilde": "\u0129", "itildebelow": "\u1e2d", "iubopomofo": "\u3129", "iucyrillic": "\u044e", "ivowelsignbengali": "\u09bf", "ivowelsigndeva": "\u093f", "ivowelsigngujarati": "\u0abf", "izhitsacyrillic": "\u0475", "izhitsadblgravecyrillic": "\u0477", "j": "\u006a", "jaarmenian": "\u0571", "jabengali": "\u099c", "jadeva": "\u091c", "jagujarati": "\u0a9c", "jagurmukhi": "\u0a1c", "jbopomofo": "\u3110", "jcaron": "\u01f0", "jcircle": "\u24d9", "jcircumflex": "\u0135", "jcrossedtail": "\u029d", "jdotlessstroke": "\u025f", "jecyrillic": "\u0458", "jeemarabic": "\u062c", "jeemfinalarabic": "\ufe9e", "jeeminitialarabic": "\ufe9f", "jeemmedialarabic": "\ufea0", "jeharabic": "\u0698", "jehfinalarabic": "\ufb8b", "jhabengali": "\u099d", "jhadeva": "\u091d", "jhagujarati": "\u0a9d", "jhagurmukhi": "\u0a1d", "jheharmenian": "\u057b", "jis": "\u3004", "jmonospace": "\uff4a", "jparen": "\u24a5", "jsuperior": "\u02b2", "k": "\u006b", "kabashkircyrillic": "\u04a1", "kabengali": "\u0995", "kacute": "\u1e31", "kacyrillic": "\u043a", "kadescendercyrillic": "\u049b", "kadeva": "\u0915", "kaf": "\u05db", "kafarabic": "\u0643", 
"kafdagesh": "\ufb3b", "kafdageshhebrew": "\ufb3b", "kaffinalarabic": "\ufeda", "kafhebrew": "\u05db", "kafinitialarabic": "\ufedb", "kafmedialarabic": "\ufedc", "kafrafehebrew": "\ufb4d", "kagujarati": "\u0a95", "kagurmukhi": "\u0a15", "kahiragana": "\u304b", "kahookcyrillic": "\u04c4", "kakatakana": "\u30ab", "kakatakanahalfwidth": "\uff76", "kappa": "\u03ba", "kappasymbolgreek": "\u03f0", "kapyeounmieumkorean": "\u3171", "kapyeounphieuphkorean": "\u3184", "kapyeounpieupkorean": "\u3178", "kapyeounssangpieupkorean": "\u3179", "karoriisquare": "\u330d", "kashidaautoarabic": "\u0640", "kashidaautonosidebearingarabic": "\u0640", "kasmallkatakana": "\u30f5", "kasquare": "\u3384", "kasraarabic": "\u0650", "kasratanarabic": "\u064d", "kastrokecyrillic": "\u049f", "katahiraprolongmarkhalfwidth": "\uff70", "kaverticalstrokecyrillic": "\u049d", "kbopomofo": "\u310e", "kcalsquare": "\u3389", "kcaron": "\u01e9", "kcedilla": "\u0137", "kcircle": "\u24da", "kcommaaccent": "\u0137", "kdotbelow": "\u1e33", "keharmenian": "\u0584", "kehiragana": "\u3051", "kekatakana": "\u30b1", "kekatakanahalfwidth": "\uff79", "kenarmenian": "\u056f", "kesmallkatakana": "\u30f6", "kgreenlandic": "\u0138", "khabengali": "\u0996", "khacyrillic": "\u0445", "khadeva": "\u0916", "khagujarati": "\u0a96", "khagurmukhi": "\u0a16", "khaharabic": "\u062e", "khahfinalarabic": "\ufea6", "khahinitialarabic": "\ufea7", "khahmedialarabic": "\ufea8", "kheicoptic": "\u03e7", "khhadeva": "\u0959", "khhagurmukhi": "\u0a59", "khieukhacirclekorean": "\u3278", "khieukhaparenkorean": "\u3218", "khieukhcirclekorean": "\u326a", "khieukhkorean": "\u314b", "khieukhparenkorean": "\u320a", "khokhaithai": "\u0e02", "khokhonthai": "\u0e05", "khokhuatthai": "\u0e03", "khokhwaithai": "\u0e04", "khomutthai": "\u0e5b", "khook": "\u0199", "khorakhangthai": "\u0e06", "khzsquare": "\u3391", "kihiragana": "\u304d", "kikatakana": "\u30ad", "kikatakanahalfwidth": "\uff77", "kiroguramusquare": "\u3315", "kiromeetorusquare": "\u3316", 
"kirosquare": "\u3314", "kiyeokacirclekorean": "\u326e", "kiyeokaparenkorean": "\u320e", "kiyeokcirclekorean": "\u3260", "kiyeokkorean": "\u3131", "kiyeokparenkorean": "\u3200", "kiyeoksioskorean": "\u3133", "kjecyrillic": "\u045c", "klinebelow": "\u1e35", "klsquare": "\u3398", "kmcubedsquare": "\u33a6", "kmonospace": "\uff4b", "kmsquaredsquare": "\u33a2", "kohiragana": "\u3053", "kohmsquare": "\u33c0", "kokaithai": "\u0e01", "kokatakana": "\u30b3", "kokatakanahalfwidth": "\uff7a", "kooposquare": "\u331e", "koppacyrillic": "\u0481", "koreanstandardsymbol": "\u327f", "koroniscmb": "\u0343", "kparen": "\u24a6", "kpasquare": "\u33aa", "ksicyrillic": "\u046f", "ktsquare": "\u33cf", "kturned": "\u029e", "kuhiragana": "\u304f", "kukatakana": "\u30af", "kukatakanahalfwidth": "\uff78", "kvsquare": "\u33b8", "kwsquare": "\u33be", "l": "\u006c", "labengali": "\u09b2", "lacute": "\u013a", "ladeva": "\u0932", "lagujarati": "\u0ab2", "lagurmukhi": "\u0a32", "lakkhangyaothai": "\u0e45", "lamaleffinalarabic": "\ufefc", "lamalefhamzaabovefinalarabic": "\ufef8", "lamalefhamzaaboveisolatedarabic": "\ufef7", "lamalefhamzabelowfinalarabic": "\ufefa", "lamalefhamzabelowisolatedarabic": "\ufef9", "lamalefisolatedarabic": "\ufefb", "lamalefmaddaabovefinalarabic": "\ufef6", "lamalefmaddaaboveisolatedarabic": "\ufef5", "lamarabic": "\u0644", "lambda": "\u03bb", "lambdastroke": "\u019b", "lamed": "\u05dc", "lameddagesh": "\ufb3c", "lameddageshhebrew": "\ufb3c", "lamedhebrew": "\u05dc", "lamedholam": "\u05dc\u05b9", "lamedholamdagesh": "\u05dc\u05b9\u05bc", "lamedholamdageshhebrew": "\u05dc\u05b9\u05bc", "lamedholamhebrew": "\u05dc\u05b9", "lamfinalarabic": "\ufede", "lamhahinitialarabic": "\ufcca", "laminitialarabic": "\ufedf", "lamjeeminitialarabic": "\ufcc9", "lamkhahinitialarabic": "\ufccb", "lamlamhehisolatedarabic": "\ufdf2", "lammedialarabic": "\ufee0", "lammeemhahinitialarabic": "\ufd88", "lammeeminitialarabic": "\ufccc", "lammeemjeeminitialarabic": "\ufedf\ufee4\ufea0", 
"lammeemkhahinitialarabic": "\ufedf\ufee4\ufea8", "largecircle": "\u25ef", "lbar": "\u019a", "lbelt": "\u026c", "lbopomofo": "\u310c", "lcaron": "\u013e", "lcedilla": "\u013c", "lcircle": "\u24db", "lcircumflexbelow": "\u1e3d", "lcommaaccent": "\u013c", "ldot": "\u0140", "ldotaccent": "\u0140", "ldotbelow": "\u1e37", "ldotbelowmacron": "\u1e39", "leftangleabovecmb": "\u031a", "lefttackbelowcmb": "\u0318", "less": "\u003c", "lessequal": "\u2264", "lessequalorgreater": "\u22da", "lessmonospace": "\uff1c", "lessorequivalent": "\u2272", "lessorgreater": "\u2276", "lessoverequal": "\u2266", "lesssmall": "\ufe64", "lezh": "\u026e", "lfblock": "\u258c", "lhookretroflex": "\u026d", "lira": "\u20a4", "liwnarmenian": "\u056c", "lj": "\u01c9", "ljecyrillic": "\u0459", "ll": "\uf6c0", "lladeva": "\u0933", "llagujarati": "\u0ab3", "llinebelow": "\u1e3b", "llladeva": "\u0934", "llvocalicbengali": "\u09e1", "llvocalicdeva": "\u0961", "llvocalicvowelsignbengali": "\u09e3", "llvocalicvowelsigndeva": "\u0963", "lmiddletilde": "\u026b", "lmonospace": "\uff4c", "lmsquare": "\u33d0", "lochulathai": "\u0e2c", "logicaland": "\u2227", "logicalnot": "\u00ac", "logicalnotreversed": "\u2310", "logicalor": "\u2228", "lolingthai": "\u0e25", "longs": "\u017f", "lowlinecenterline": "\ufe4e", "lowlinecmb": "\u0332", "lowlinedashed": "\ufe4d", "lozenge": "\u25ca", "lparen": "\u24a7", "lslash": "\u0142", "lsquare": "\u2113", "lsuperior": "\uf6ee", "ltshade": "\u2591", "luthai": "\u0e26", "lvocalicbengali": "\u098c", "lvocalicdeva": "\u090c", "lvocalicvowelsignbengali": "\u09e2", "lvocalicvowelsigndeva": "\u0962", "lxsquare": "\u33d3", "m": "\u006d", "mabengali": "\u09ae", "macron": "\u00af", "macronbelowcmb": "\u0331", "macroncmb": "\u0304", "macronlowmod": "\u02cd", "macronmonospace": "\uffe3", "macute": "\u1e3f", "madeva": "\u092e", "magujarati": "\u0aae", "magurmukhi": "\u0a2e", "mahapakhhebrew": "\u05a4", "mahapakhlefthebrew": "\u05a4", "mahiragana": "\u307e", "maichattawalowleftthai": 
"\uf895", "maichattawalowrightthai": "\uf894", "maichattawathai": "\u0e4b", "maichattawaupperleftthai": "\uf893", "maieklowleftthai": "\uf88c", "maieklowrightthai": "\uf88b", "maiekthai": "\u0e48", "maiekupperleftthai": "\uf88a", "maihanakatleftthai": "\uf884", "maihanakatthai": "\u0e31", "maitaikhuleftthai": "\uf889", "maitaikhuthai": "\u0e47", "maitholowleftthai": "\uf88f", "maitholowrightthai": "\uf88e", "maithothai": "\u0e49", "maithoupperleftthai": "\uf88d", "maitrilowleftthai": "\uf892", "maitrilowrightthai": "\uf891", "maitrithai": "\u0e4a", "maitriupperleftthai": "\uf890", "maiyamokthai": "\u0e46", "makatakana": "\u30de", "makatakanahalfwidth": "\uff8f", "male": "\u2642", "mansyonsquare": "\u3347", "maqafhebrew": "\u05be", "mars": "\u2642", "masoracirclehebrew": "\u05af", "masquare": "\u3383", "mbopomofo": "\u3107", "mbsquare": "\u33d4", "mcircle": "\u24dc", "mcubedsquare": "\u33a5", "mdotaccent": "\u1e41", "mdotbelow": "\u1e43", "meemarabic": "\u0645", "meemfinalarabic": "\ufee2", "meeminitialarabic": "\ufee3", "meemmedialarabic": "\ufee4", "meemmeeminitialarabic": "\ufcd1", "meemmeemisolatedarabic": "\ufc48", "meetorusquare": "\u334d", "mehiragana": "\u3081", "meizierasquare": "\u337e", "mekatakana": "\u30e1", "mekatakanahalfwidth": "\uff92", "mem": "\u05de", "memdagesh": "\ufb3e", "memdageshhebrew": "\ufb3e", "memhebrew": "\u05de", "menarmenian": "\u0574", "merkhahebrew": "\u05a5", "merkhakefulahebrew": "\u05a6", "merkhakefulalefthebrew": "\u05a6", "merkhalefthebrew": "\u05a5", "mhook": "\u0271", "mhzsquare": "\u3392", "middledotkatakanahalfwidth": "\uff65", "middot": "\u00b7", "mieumacirclekorean": "\u3272", "mieumaparenkorean": "\u3212", "mieumcirclekorean": "\u3264", "mieumkorean": "\u3141", "mieumpansioskorean": "\u3170", "mieumparenkorean": "\u3204", "mieumpieupkorean": "\u316e", "mieumsioskorean": "\u316f", "mihiragana": "\u307f", "mikatakana": "\u30df", "mikatakanahalfwidth": "\uff90", "minus": "\u2212", "minusbelowcmb": "\u0320", "minuscircle": 
"\u2296", "minusmod": "\u02d7", "minusplus": "\u2213", "minute": "\u2032", "miribaarusquare": "\u334a", "mirisquare": "\u3349", "mlonglegturned": "\u0270", "mlsquare": "\u3396", "mmcubedsquare": "\u33a3", "mmonospace": "\uff4d", "mmsquaredsquare": "\u339f", "mohiragana": "\u3082", "mohmsquare": "\u33c1", "mokatakana": "\u30e2", "mokatakanahalfwidth": "\uff93", "molsquare": "\u33d6", "momathai": "\u0e21", "moverssquare": "\u33a7", "moverssquaredsquare": "\u33a8", "mparen": "\u24a8", "mpasquare": "\u33ab", "mssquare": "\u33b3", "msuperior": "\uf6ef", "mturned": "\u026f", "mu": "\u00b5", "mu1": "\u00b5", "muasquare": "\u3382", "muchgreater": "\u226b", "muchless": "\u226a", "mufsquare": "\u338c", "mugreek": "\u03bc", "mugsquare": "\u338d", "muhiragana": "\u3080", "mukatakana": "\u30e0", "mukatakanahalfwidth": "\uff91", "mulsquare": "\u3395", "multiply": "\u00d7", "mumsquare": "\u339b", "munahhebrew": "\u05a3", "munahlefthebrew": "\u05a3", "musicalnote": "\u266a", "musicalnotedbl": "\u266b", "musicflatsign": "\u266d", "musicsharpsign": "\u266f", "mussquare": "\u33b2", "muvsquare": "\u33b6", "muwsquare": "\u33bc", "mvmegasquare": "\u33b9", "mvsquare": "\u33b7", "mwmegasquare": "\u33bf", "mwsquare": "\u33bd", "n": "\u006e", "nabengali": "\u09a8", "nabla": "\u2207", "nacute": "\u0144", "nadeva": "\u0928", "nagujarati": "\u0aa8", "nagurmukhi": "\u0a28", "nahiragana": "\u306a", "nakatakana": "\u30ca", "nakatakanahalfwidth": "\uff85", "napostrophe": "\u0149", "nasquare": "\u3381", "nbopomofo": "\u310b", "nbspace": "\u00a0", "ncaron": "\u0148", "ncedilla": "\u0146", "ncircle": "\u24dd", "ncircumflexbelow": "\u1e4b", "ncommaaccent": "\u0146", "ndotaccent": "\u1e45", "ndotbelow": "\u1e47", "nehiragana": "\u306d", "nekatakana": "\u30cd", "nekatakanahalfwidth": "\uff88", "newsheqelsign": "\u20aa", "nfsquare": "\u338b", "ngabengali": "\u0999", "ngadeva": "\u0919", "ngagujarati": "\u0a99", "ngagurmukhi": "\u0a19", "ngonguthai": "\u0e07", "nhiragana": "\u3093", "nhookleft": "\u0272", 
"nhookretroflex": "\u0273", "nieunacirclekorean": "\u326f", "nieunaparenkorean": "\u320f", "nieuncieuckorean": "\u3135", "nieuncirclekorean": "\u3261", "nieunhieuhkorean": "\u3136", "nieunkorean": "\u3134", "nieunpansioskorean": "\u3168", "nieunparenkorean": "\u3201", "nieunsioskorean": "\u3167", "nieuntikeutkorean": "\u3166", "nihiragana": "\u306b", "nikatakana": "\u30cb", "nikatakanahalfwidth": "\uff86", "nikhahitleftthai": "\uf899", "nikhahitthai": "\u0e4d", "nine": "\u0039", "ninearabic": "\u0669", "ninebengali": "\u09ef", "ninecircle": "\u2468", "ninecircleinversesansserif": "\u2792", "ninedeva": "\u096f", "ninegujarati": "\u0aef", "ninegurmukhi": "\u0a6f", "ninehackarabic": "\u0669", "ninehangzhou": "\u3029", "nineideographicparen": "\u3228", "nineinferior": "\u2089", "ninemonospace": "\uff19", "nineoldstyle": "\uf739", "nineparen": "\u247c", "nineperiod": "\u2490", "ninepersian": "\u06f9", "nineroman": "\u2178", "ninesuperior": "\u2079", "nineteencircle": "\u2472", "nineteenparen": "\u2486", "nineteenperiod": "\u249a", "ninethai": "\u0e59", "nj": "\u01cc", "njecyrillic": "\u045a", "nkatakana": "\u30f3", "nkatakanahalfwidth": "\uff9d", "nlegrightlong": "\u019e", "nlinebelow": "\u1e49", "nmonospace": "\uff4e", "nmsquare": "\u339a", "nnabengali": "\u09a3", "nnadeva": "\u0923", "nnagujarati": "\u0aa3", "nnagurmukhi": "\u0a23", "nnnadeva": "\u0929", "nohiragana": "\u306e", "nokatakana": "\u30ce", "nokatakanahalfwidth": "\uff89", "nonbreakingspace": "\u00a0", "nonenthai": "\u0e13", "nonuthai": "\u0e19", "noonarabic": "\u0646", "noonfinalarabic": "\ufee6", "noonghunnaarabic": "\u06ba", "noonghunnafinalarabic": "\ufb9f", "noonhehinitialarabic": "\ufee7\ufeec", "nooninitialarabic": "\ufee7", "noonjeeminitialarabic": "\ufcd2", "noonjeemisolatedarabic": "\ufc4b", "noonmedialarabic": "\ufee8", "noonmeeminitialarabic": "\ufcd5", "noonmeemisolatedarabic": "\ufc4e", "noonnoonfinalarabic": "\ufc8d", "notcontains": "\u220c", "notelement": "\u2209", "notelementof": "\u2209", 
"notequal": "\u2260", "notgreater": "\u226f", "notgreaternorequal": "\u2271", "notgreaternorless": "\u2279", "notidentical": "\u2262", "notless": "\u226e", "notlessnorequal": "\u2270", "notparallel": "\u2226", "notprecedes": "\u2280", "notsubset": "\u2284", "notsucceeds": "\u2281", "notsuperset": "\u2285", "nowarmenian": "\u0576", "nparen": "\u24a9", "nssquare": "\u33b1", "nsuperior": "\u207f", "ntilde": "\u00f1", "nu": "\u03bd", "nuhiragana": "\u306c", "nukatakana": "\u30cc", "nukatakanahalfwidth": "\uff87", "nuktabengali": "\u09bc", "nuktadeva": "\u093c", "nuktagujarati": "\u0abc", "nuktagurmukhi": "\u0a3c", "numbersign": "\u0023", "numbersignmonospace": "\uff03", "numbersignsmall": "\ufe5f", "numeralsigngreek": "\u0374", "numeralsignlowergreek": "\u0375", "numero": "\u2116", "nun": "\u05e0", "nundagesh": "\ufb40", "nundageshhebrew": "\ufb40", "nunhebrew": "\u05e0", "nvsquare": "\u33b5", "nwsquare": "\u33bb", "nyabengali": "\u099e", "nyadeva": "\u091e", "nyagujarati": "\u0a9e", "nyagurmukhi": "\u0a1e", "o": "\u006f", "oacute": "\u00f3", "oangthai": "\u0e2d", "obarred": "\u0275", "obarredcyrillic": "\u04e9", "obarreddieresiscyrillic": "\u04eb", "obengali": "\u0993", "obopomofo": "\u311b", "obreve": "\u014f", "ocandradeva": "\u0911", "ocandragujarati": "\u0a91", "ocandravowelsigndeva": "\u0949", "ocandravowelsigngujarati": "\u0ac9", "ocaron": "\u01d2", "ocircle": "\u24de", "ocircumflex": "\u00f4", "ocircumflexacute": "\u1ed1", "ocircumflexdotbelow": "\u1ed9", "ocircumflexgrave": "\u1ed3", "ocircumflexhookabove": "\u1ed5", "ocircumflextilde": "\u1ed7", "ocyrillic": "\u043e", "odblacute": "\u0151", "odblgrave": "\u020d", "odeva": "\u0913", "odieresis": "\u00f6", "odieresiscyrillic": "\u04e7", "odotbelow": "\u1ecd", "oe": "\u0153", "oekorean": "\u315a", "ogonek": "\u02db", "ogonekcmb": "\u0328", "ograve": "\u00f2", "ogujarati": "\u0a93", "oharmenian": "\u0585", "ohiragana": "\u304a", "ohookabove": "\u1ecf", "ohorn": "\u01a1", "ohornacute": "\u1edb", "ohorndotbelow": 
"\u1ee3", "ohorngrave": "\u1edd", "ohornhookabove": "\u1edf", "ohorntilde": "\u1ee1", "ohungarumlaut": "\u0151", "oi": "\u01a3", "oinvertedbreve": "\u020f", "okatakana": "\u30aa", "okatakanahalfwidth": "\uff75", "okorean": "\u3157", "olehebrew": "\u05ab", "omacron": "\u014d", "omacronacute": "\u1e53", "omacrongrave": "\u1e51", "omdeva": "\u0950", "omega": "\u03c9", "omega1": "\u03d6", "omegacyrillic": "\u0461", "omegalatinclosed": "\u0277", "omegaroundcyrillic": "\u047b", "omegatitlocyrillic": "\u047d", "omegatonos": "\u03ce", "omgujarati": "\u0ad0", "omicron": "\u03bf", "omicrontonos": "\u03cc", "omonospace": "\uff4f", "one": "\u0031", "onearabic": "\u0661", "onebengali": "\u09e7", "onecircle": "\u2460", "onecircleinversesansserif": "\u278a", "onedeva": "\u0967", "onedotenleader": "\u2024", "oneeighth": "\u215b", "onefitted": "\uf6dc", "onegujarati": "\u0ae7", "onegurmukhi": "\u0a67", "onehackarabic": "\u0661", "onehalf": "\u00bd", "onehangzhou": "\u3021", "oneideographicparen": "\u3220", "oneinferior": "\u2081", "onemonospace": "\uff11", "onenumeratorbengali": "\u09f4", "oneoldstyle": "\uf731", "oneparen": "\u2474", "oneperiod": "\u2488", "onepersian": "\u06f1", "onequarter": "\u00bc", "oneroman": "\u2170", "onesuperior": "\u00b9", "onethai": "\u0e51", "onethird": "\u2153", "oogonek": "\u01eb", "oogonekmacron": "\u01ed", "oogurmukhi": "\u0a13", "oomatragurmukhi": "\u0a4b", "oopen": "\u0254", "oparen": "\u24aa", "openbullet": "\u25e6", "option": "\u2325", "ordfeminine": "\u00aa", "ordmasculine": "\u00ba", "orthogonal": "\u221f", "oshortdeva": "\u0912", "oshortvowelsigndeva": "\u094a", "oslash": "\u00f8", "oslashacute": "\u01ff", "osmallhiragana": "\u3049", "osmallkatakana": "\u30a9", "osmallkatakanahalfwidth": "\uff6b", "ostrokeacute": "\u01ff", "osuperior": "\uf6f0", "otcyrillic": "\u047f", "otilde": "\u00f5", "otildeacute": "\u1e4d", "otildedieresis": "\u1e4f", "oubopomofo": "\u3121", "overline": "\u203e", "overlinecenterline": "\ufe4a", "overlinecmb": "\u0305", 
"overlinedashed": "\ufe49", "overlinedblwavy": "\ufe4c", "overlinewavy": "\ufe4b", "overscore": "\u00af", "ovowelsignbengali": "\u09cb", "ovowelsigndeva": "\u094b", "ovowelsigngujarati": "\u0acb", "p": "\u0070", "paampssquare": "\u3380", "paasentosquare": "\u332b", "pabengali": "\u09aa", "pacute": "\u1e55", "padeva": "\u092a", "pagedown": "\u21df", "pageup": "\u21de", "pagujarati": "\u0aaa", "pagurmukhi": "\u0a2a", "pahiragana": "\u3071", "paiyannoithai": "\u0e2f", "pakatakana": "\u30d1", "palatalizationcyrilliccmb": "\u0484", "palochkacyrillic": "\u04c0", "pansioskorean": "\u317f", "paragraph": "\u00b6", "parallel": "\u2225", "parenleft": "\u0028", "parenleftaltonearabic": "\ufd3e", "parenleftbt": "\uf8ed", "parenleftex": "\uf8ec", "parenleftinferior": "\u208d", "parenleftmonospace": "\uff08", "parenleftsmall": "\ufe59", "parenleftsuperior": "\u207d", "parenlefttp": "\uf8eb", "parenleftvertical": "\ufe35", "parenright": "\u0029", "parenrightaltonearabic": "\ufd3f", "parenrightbt": "\uf8f8", "parenrightex": "\uf8f7", "parenrightinferior": "\u208e", "parenrightmonospace": "\uff09", "parenrightsmall": "\ufe5a", "parenrightsuperior": "\u207e", "parenrighttp": "\uf8f6", "parenrightvertical": "\ufe36", "partialdiff": "\u2202", "paseqhebrew": "\u05c0", "pashtahebrew": "\u0599", "pasquare": "\u33a9", "patah": "\u05b7", "patah11": "\u05b7", "patah1d": "\u05b7", "patah2a": "\u05b7", "patahhebrew": "\u05b7", "patahnarrowhebrew": "\u05b7", "patahquarterhebrew": "\u05b7", "patahwidehebrew": "\u05b7", "pazerhebrew": "\u05a1", "pbopomofo": "\u3106", "pcircle": "\u24df", "pdotaccent": "\u1e57", "pe": "\u05e4", "pecyrillic": "\u043f", "pedagesh": "\ufb44", "pedageshhebrew": "\ufb44", "peezisquare": "\u333b", "pefinaldageshhebrew": "\ufb43", "peharabic": "\u067e", "peharmenian": "\u057a", "pehebrew": "\u05e4", "pehfinalarabic": "\ufb57", "pehinitialarabic": "\ufb58", "pehiragana": "\u307a", "pehmedialarabic": "\ufb59", "pekatakana": "\u30da", "pemiddlehookcyrillic": "\u04a7", 
"perafehebrew": "\ufb4e", "percent": "\u0025", "percentarabic": "\u066a", "percentmonospace": "\uff05", "percentsmall": "\ufe6a", "period": "\u002e", "periodarmenian": "\u0589", "periodcentered": "\u00b7", "periodhalfwidth": "\uff61", "periodinferior": "\uf6e7", "periodmonospace": "\uff0e", "periodsmall": "\ufe52", "periodsuperior": "\uf6e8", "perispomenigreekcmb": "\u0342", "perpendicular": "\u22a5", "perthousand": "\u2030", "peseta": "\u20a7", "pfsquare": "\u338a", "phabengali": "\u09ab", "phadeva": "\u092b", "phagujarati": "\u0aab", "phagurmukhi": "\u0a2b", "phi": "\u03c6", "phi1": "\u03d5", "phieuphacirclekorean": "\u327a", "phieuphaparenkorean": "\u321a", "phieuphcirclekorean": "\u326c", "phieuphkorean": "\u314d", "phieuphparenkorean": "\u320c", "philatin": "\u0278", "phinthuthai": "\u0e3a", "phisymbolgreek": "\u03d5", "phook": "\u01a5", "phophanthai": "\u0e1e", "phophungthai": "\u0e1c", "phosamphaothai": "\u0e20", "pi": "\u03c0", "pieupacirclekorean": "\u3273", "pieupaparenkorean": "\u3213", "pieupcieuckorean": "\u3176", "pieupcirclekorean": "\u3265", "pieupkiyeokkorean": "\u3172", "pieupkorean": "\u3142", "pieupparenkorean": "\u3205", "pieupsioskiyeokkorean": "\u3174", "pieupsioskorean": "\u3144", "pieupsiostikeutkorean": "\u3175", "pieupthieuthkorean": "\u3177", "pieuptikeutkorean": "\u3173", "pihiragana": "\u3074", "pikatakana": "\u30d4", "pisymbolgreek": "\u03d6", "piwrarmenian": "\u0583", "plus": "\u002b", "plusbelowcmb": "\u031f", "pluscircle": "\u2295", "plusminus": "\u00b1", "plusmod": "\u02d6", "plusmonospace": "\uff0b", "plussmall": "\ufe62", "plussuperior": "\u207a", "pmonospace": "\uff50", "pmsquare": "\u33d8", "pohiragana": "\u307d", "pointingindexdownwhite": "\u261f", "pointingindexleftwhite": "\u261c", "pointingindexrightwhite": "\u261e", "pointingindexupwhite": "\u261d", "pokatakana": "\u30dd", "poplathai": "\u0e1b", "postalmark": "\u3012", "postalmarkface": "\u3020", "pparen": "\u24ab", "precedes": "\u227a", "prescription": "\u211e", 
"primemod": "\u02b9", "primereversed": "\u2035", "product": "\u220f", "projective": "\u2305", "prolongedkana": "\u30fc", "propellor": "\u2318", "propersubset": "\u2282", "propersuperset": "\u2283", "proportion": "\u2237", "proportional": "\u221d", "psi": "\u03c8", "psicyrillic": "\u0471", "psilipneumatacyrilliccmb": "\u0486", "pssquare": "\u33b0", "puhiragana": "\u3077", "pukatakana": "\u30d7", "pvsquare": "\u33b4", "pwsquare": "\u33ba", "q": "\u0071", "qadeva": "\u0958", "qadmahebrew": "\u05a8", "qafarabic": "\u0642", "qaffinalarabic": "\ufed6", "qafinitialarabic": "\ufed7", "qafmedialarabic": "\ufed8", "qamats": "\u05b8", "qamats10": "\u05b8", "qamats1a": "\u05b8", "qamats1c": "\u05b8", "qamats27": "\u05b8", "qamats29": "\u05b8", "qamats33": "\u05b8", "qamatsde": "\u05b8", "qamatshebrew": "\u05b8", "qamatsnarrowhebrew": "\u05b8", "qamatsqatanhebrew": "\u05b8", "qamatsqatannarrowhebrew": "\u05b8", "qamatsqatanquarterhebrew": "\u05b8", "qamatsqatanwidehebrew": "\u05b8", "qamatsquarterhebrew": "\u05b8", "qamatswidehebrew": "\u05b8", "qarneyparahebrew": "\u059f", "qbopomofo": "\u3111", "qcircle": "\u24e0", "qhook": "\u02a0", "qmonospace": "\uff51", "qof": "\u05e7", "qofdagesh": "\ufb47", "qofdageshhebrew": "\ufb47", "qofhatafpatah": "\u05e7\u05b2", "qofhatafpatahhebrew": "\u05e7\u05b2", "qofhatafsegol": "\u05e7\u05b1", "qofhatafsegolhebrew": "\u05e7\u05b1", "qofhebrew": "\u05e7", "qofhiriq": "\u05e7\u05b4", "qofhiriqhebrew": "\u05e7\u05b4", "qofholam": "\u05e7\u05b9", "qofholamhebrew": "\u05e7\u05b9", "qofpatah": "\u05e7\u05b7", "qofpatahhebrew": "\u05e7\u05b7", "qofqamats": "\u05e7\u05b8", "qofqamatshebrew": "\u05e7\u05b8", "qofqubuts": "\u05e7\u05bb", "qofqubutshebrew": "\u05e7\u05bb", "qofsegol": "\u05e7\u05b6", "qofsegolhebrew": "\u05e7\u05b6", "qofsheva": "\u05e7\u05b0", "qofshevahebrew": "\u05e7\u05b0", "qoftsere": "\u05e7\u05b5", "qoftserehebrew": "\u05e7\u05b5", "qparen": "\u24ac", "quarternote": "\u2669", "qubuts": "\u05bb", "qubuts18": "\u05bb", "qubuts25": 
"\u05bb", "qubuts31": "\u05bb", "qubutshebrew": "\u05bb", "qubutsnarrowhebrew": "\u05bb", "qubutsquarterhebrew": "\u05bb", "qubutswidehebrew": "\u05bb", "question": "\u003f", "questionarabic": "\u061f", "questionarmenian": "\u055e", "questiondown": "\u00bf", "questiondownsmall": "\uf7bf", "questiongreek": "\u037e", "questionmonospace": "\uff1f", "questionsmall": "\uf73f", "quotedbl": "\u0022", "quotedblbase": "\u201e", "quotedblleft": "\u201c", "quotedblmonospace": "\uff02", "quotedblprime": "\u301e", "quotedblprimereversed": "\u301d", "quotedblright": "\u201d", "quoteleft": "\u2018", "quoteleftreversed": "\u201b", "quotereversed": "\u201b", "quoteright": "\u2019", "quoterightn": "\u0149", "quotesinglbase": "\u201a", "quotesingle": "\u0027", "quotesinglemonospace": "\uff07", "r": "\u0072", "raarmenian": "\u057c", "rabengali": "\u09b0", "racute": "\u0155", "radeva": "\u0930", "radical": "\u221a", "radicalex": "\uf8e5", "radoverssquare": "\u33ae", "radoverssquaredsquare": "\u33af", "radsquare": "\u33ad", "rafe": "\u05bf", "rafehebrew": "\u05bf", "ragujarati": "\u0ab0", "ragurmukhi": "\u0a30", "rahiragana": "\u3089", "rakatakana": "\u30e9", "rakatakanahalfwidth": "\uff97", "ralowerdiagonalbengali": "\u09f1", "ramiddlediagonalbengali": "\u09f0", "ramshorn": "\u0264", "ratio": "\u2236", "rbopomofo": "\u3116", "rcaron": "\u0159", "rcedilla": "\u0157", "rcircle": "\u24e1", "rcommaaccent": "\u0157", "rdblgrave": "\u0211", "rdotaccent": "\u1e59", "rdotbelow": "\u1e5b", "rdotbelowmacron": "\u1e5d", "referencemark": "\u203b", "reflexsubset": "\u2286", "reflexsuperset": "\u2287", "registered": "\u00ae", "registersans": "\uf8e8", "registerserif": "\uf6da", "reharabic": "\u0631", "reharmenian": "\u0580", "rehfinalarabic": "\ufeae", "rehiragana": "\u308c", "rehyehaleflamarabic": "\u0631\ufef3\ufe8e\u0644", "rekatakana": "\u30ec", "rekatakanahalfwidth": "\uff9a", "resh": "\u05e8", "reshdageshhebrew": "\ufb48", "reshhatafpatah": "\u05e8\u05b2", "reshhatafpatahhebrew": 
"\u05e8\u05b2", "reshhatafsegol": "\u05e8\u05b1", "reshhatafsegolhebrew": "\u05e8\u05b1", "reshhebrew": "\u05e8", "reshhiriq": "\u05e8\u05b4", "reshhiriqhebrew": "\u05e8\u05b4", "reshholam": "\u05e8\u05b9", "reshholamhebrew": "\u05e8\u05b9", "reshpatah": "\u05e8\u05b7", "reshpatahhebrew": "\u05e8\u05b7", "reshqamats": "\u05e8\u05b8", "reshqamatshebrew": "\u05e8\u05b8", "reshqubuts": "\u05e8\u05bb", "reshqubutshebrew": "\u05e8\u05bb", "reshsegol": "\u05e8\u05b6", "reshsegolhebrew": "\u05e8\u05b6", "reshsheva": "\u05e8\u05b0", "reshshevahebrew": "\u05e8\u05b0", "reshtsere": "\u05e8\u05b5", "reshtserehebrew": "\u05e8\u05b5", "reversedtilde": "\u223d", "reviahebrew": "\u0597", "reviamugrashhebrew": "\u0597", "revlogicalnot": "\u2310", "rfishhook": "\u027e", "rfishhookreversed": "\u027f", "rhabengali": "\u09dd", "rhadeva": "\u095d", "rho": "\u03c1", "rhook": "\u027d", "rhookturned": "\u027b", "rhookturnedsuperior": "\u02b5", "rhosymbolgreek": "\u03f1", "rhotichookmod": "\u02de", "rieulacirclekorean": "\u3271", "rieulaparenkorean": "\u3211", "rieulcirclekorean": "\u3263", "rieulhieuhkorean": "\u3140", "rieulkiyeokkorean": "\u313a", "rieulkiyeoksioskorean": "\u3169", "rieulkorean": "\u3139", "rieulmieumkorean": "\u313b", "rieulpansioskorean": "\u316c", "rieulparenkorean": "\u3203", "rieulphieuphkorean": "\u313f", "rieulpieupkorean": "\u313c", "rieulpieupsioskorean": "\u316b", "rieulsioskorean": "\u313d", "rieulthieuthkorean": "\u313e", "rieultikeutkorean": "\u316a", "rieulyeorinhieuhkorean": "\u316d", "rightangle": "\u221f", "righttackbelowcmb": "\u0319", "righttriangle": "\u22bf", "rihiragana": "\u308a", "rikatakana": "\u30ea", "rikatakanahalfwidth": "\uff98", "ring": "\u02da", "ringbelowcmb": "\u0325", "ringcmb": "\u030a", "ringhalfleft": "\u02bf", "ringhalfleftarmenian": "\u0559", "ringhalfleftbelowcmb": "\u031c", "ringhalfleftcentered": "\u02d3", "ringhalfright": "\u02be", "ringhalfrightbelowcmb": "\u0339", "ringhalfrightcentered": "\u02d2", "rinvertedbreve": 
"\u0213", "rittorusquare": "\u3351", "rlinebelow": "\u1e5f", "rlongleg": "\u027c", "rlonglegturned": "\u027a", "rmonospace": "\uff52", "rohiragana": "\u308d", "rokatakana": "\u30ed", "rokatakanahalfwidth": "\uff9b", "roruathai": "\u0e23", "rparen": "\u24ad", "rrabengali": "\u09dc", "rradeva": "\u0931", "rragurmukhi": "\u0a5c", "rreharabic": "\u0691", "rrehfinalarabic": "\ufb8d", "rrvocalicbengali": "\u09e0", "rrvocalicdeva": "\u0960", "rrvocalicgujarati": "\u0ae0", "rrvocalicvowelsignbengali": "\u09c4", "rrvocalicvowelsigndeva": "\u0944", "rrvocalicvowelsigngujarati": "\u0ac4", "rsuperior": "\uf6f1", "rtblock": "\u2590", "rturned": "\u0279", "rturnedsuperior": "\u02b4", "ruhiragana": "\u308b", "rukatakana": "\u30eb", "rukatakanahalfwidth": "\uff99", "rupeemarkbengali": "\u09f2", "rupeesignbengali": "\u09f3", "rupiah": "\uf6dd", "ruthai": "\u0e24", "rvocalicbengali": "\u098b", "rvocalicdeva": "\u090b", "rvocalicgujarati": "\u0a8b", "rvocalicvowelsignbengali": "\u09c3", "rvocalicvowelsigndeva": "\u0943", "rvocalicvowelsigngujarati": "\u0ac3", "s": "\u0073", "sabengali": "\u09b8", "sacute": "\u015b", "sacutedotaccent": "\u1e65", "sadarabic": "\u0635", "sadeva": "\u0938", "sadfinalarabic": "\ufeba", "sadinitialarabic": "\ufebb", "sadmedialarabic": "\ufebc", "sagujarati": "\u0ab8", "sagurmukhi": "\u0a38", "sahiragana": "\u3055", "sakatakana": "\u30b5", "sakatakanahalfwidth": "\uff7b", "sallallahoualayhewasallamarabic": "\ufdfa", "samekh": "\u05e1", "samekhdagesh": "\ufb41", "samekhdageshhebrew": "\ufb41", "samekhhebrew": "\u05e1", "saraaathai": "\u0e32", "saraaethai": "\u0e41", "saraaimaimalaithai": "\u0e44", "saraaimaimuanthai": "\u0e43", "saraamthai": "\u0e33", "saraathai": "\u0e30", "saraethai": "\u0e40", "saraiileftthai": "\uf886", "saraiithai": "\u0e35", "saraileftthai": "\uf885", "saraithai": "\u0e34", "saraothai": "\u0e42", "saraueeleftthai": "\uf888", "saraueethai": "\u0e37", "saraueleftthai": "\uf887", "sarauethai": "\u0e36", "sarauthai": "\u0e38", 
"sarauuthai": "\u0e39", "sbopomofo": "\u3119", "scaron": "\u0161", "scarondotaccent": "\u1e67", "scedilla": "\u015f", "schwa": "\u0259", "schwacyrillic": "\u04d9", "schwadieresiscyrillic": "\u04db", "schwahook": "\u025a", "scircle": "\u24e2", "scircumflex": "\u015d", "scommaaccent": "\u0219", "sdotaccent": "\u1e61", "sdotbelow": "\u1e63", "sdotbelowdotaccent": "\u1e69", "seagullbelowcmb": "\u033c", "second": "\u2033", "secondtonechinese": "\u02ca", "section": "\u00a7", "seenarabic": "\u0633", "seenfinalarabic": "\ufeb2", "seeninitialarabic": "\ufeb3", "seenmedialarabic": "\ufeb4", "segol": "\u05b6", "segol13": "\u05b6", "segol1f": "\u05b6", "segol2c": "\u05b6", "segolhebrew": "\u05b6", "segolnarrowhebrew": "\u05b6", "segolquarterhebrew": "\u05b6", "segoltahebrew": "\u0592", "segolwidehebrew": "\u05b6", "seharmenian": "\u057d", "sehiragana": "\u305b", "sekatakana": "\u30bb", "sekatakanahalfwidth": "\uff7e", "semicolon": "\u003b", "semicolonarabic": "\u061b", "semicolonmonospace": "\uff1b", "semicolonsmall": "\ufe54", "semivoicedmarkkana": "\u309c", "semivoicedmarkkanahalfwidth": "\uff9f", "sentisquare": "\u3322", "sentosquare": "\u3323", "seven": "\u0037", "sevenarabic": "\u0667", "sevenbengali": "\u09ed", "sevencircle": "\u2466", "sevencircleinversesansserif": "\u2790", "sevendeva": "\u096d", "seveneighths": "\u215e", "sevengujarati": "\u0aed", "sevengurmukhi": "\u0a6d", "sevenhackarabic": "\u0667", "sevenhangzhou": "\u3027", "sevenideographicparen": "\u3226", "seveninferior": "\u2087", "sevenmonospace": "\uff17", "sevenoldstyle": "\uf737", "sevenparen": "\u247a", "sevenperiod": "\u248e", "sevenpersian": "\u06f7", "sevenroman": "\u2176", "sevensuperior": "\u2077", "seventeencircle": "\u2470", "seventeenparen": "\u2484", "seventeenperiod": "\u2498", "seventhai": "\u0e57", "sfthyphen": "\u00ad", "shaarmenian": "\u0577", "shabengali": "\u09b6", "shacyrillic": "\u0448", "shaddaarabic": "\u0651", "shaddadammaarabic": "\ufc61", "shaddadammatanarabic": "\ufc5e", 
"shaddafathaarabic": "\ufc60", "shaddafathatanarabic": "\u0651\u064b", "shaddakasraarabic": "\ufc62", "shaddakasratanarabic": "\ufc5f", "shade": "\u2592", "shadedark": "\u2593", "shadelight": "\u2591", "shademedium": "\u2592", "shadeva": "\u0936", "shagujarati": "\u0ab6", "shagurmukhi": "\u0a36", "shalshelethebrew": "\u0593", "shbopomofo": "\u3115", "shchacyrillic": "\u0449", "sheenarabic": "\u0634", "sheenfinalarabic": "\ufeb6", "sheeninitialarabic": "\ufeb7", "sheenmedialarabic": "\ufeb8", "sheicoptic": "\u03e3", "sheqel": "\u20aa", "sheqelhebrew": "\u20aa", "sheva": "\u05b0", "sheva115": "\u05b0", "sheva15": "\u05b0", "sheva22": "\u05b0", "sheva2e": "\u05b0", "shevahebrew": "\u05b0", "shevanarrowhebrew": "\u05b0", "shevaquarterhebrew": "\u05b0", "shevawidehebrew": "\u05b0", "shhacyrillic": "\u04bb", "shimacoptic": "\u03ed", "shin": "\u05e9", "shindagesh": "\ufb49", "shindageshhebrew": "\ufb49", "shindageshshindot": "\ufb2c", "shindageshshindothebrew": "\ufb2c", "shindageshsindot": "\ufb2d", "shindageshsindothebrew": "\ufb2d", "shindothebrew": "\u05c1", "shinhebrew": "\u05e9", "shinshindot": "\ufb2a", "shinshindothebrew": "\ufb2a", "shinsindot": "\ufb2b", "shinsindothebrew": "\ufb2b", "shook": "\u0282", "sigma": "\u03c3", "sigma1": "\u03c2", "sigmafinal": "\u03c2", "sigmalunatesymbolgreek": "\u03f2", "sihiragana": "\u3057", "sikatakana": "\u30b7", "sikatakanahalfwidth": "\uff7c", "siluqhebrew": "\u05bd", "siluqlefthebrew": "\u05bd", "similar": "\u223c", "sindothebrew": "\u05c2", "siosacirclekorean": "\u3274", "siosaparenkorean": "\u3214", "sioscieuckorean": "\u317e", "sioscirclekorean": "\u3266", "sioskiyeokkorean": "\u317a", "sioskorean": "\u3145", "siosnieunkorean": "\u317b", "siosparenkorean": "\u3206", "siospieupkorean": "\u317d", "siostikeutkorean": "\u317c", "six": "\u0036", "sixarabic": "\u0666", "sixbengali": "\u09ec", "sixcircle": "\u2465", "sixcircleinversesansserif": "\u278f", "sixdeva": "\u096c", "sixgujarati": "\u0aec", "sixgurmukhi": "\u0a6c", 
"sixhackarabic": "\u0666", "sixhangzhou": "\u3026", "sixideographicparen": "\u3225", "sixinferior": "\u2086", "sixmonospace": "\uff16", "sixoldstyle": "\uf736", "sixparen": "\u2479", "sixperiod": "\u248d", "sixpersian": "\u06f6", "sixroman": "\u2175", "sixsuperior": "\u2076", "sixteencircle": "\u246f", "sixteencurrencydenominatorbengali": "\u09f9", "sixteenparen": "\u2483", "sixteenperiod": "\u2497", "sixthai": "\u0e56", "slash": "\u002f", "slashmonospace": "\uff0f", "slong": "\u017f", "slongdotaccent": "\u1e9b", "smileface": "\u263a", "smonospace": "\uff53", "sofpasuqhebrew": "\u05c3", "softhyphen": "\u00ad", "softsigncyrillic": "\u044c", "sohiragana": "\u305d", "sokatakana": "\u30bd", "sokatakanahalfwidth": "\uff7f", "soliduslongoverlaycmb": "\u0338", "solidusshortoverlaycmb": "\u0337", "sorusithai": "\u0e29", "sosalathai": "\u0e28", "sosothai": "\u0e0b", "sosuathai": "\u0e2a", "space": "\u0020", "spacehackarabic": "\u0020", "spade": "\u2660", "spadesuitblack": "\u2660", "spadesuitwhite": "\u2664", "sparen": "\u24ae", "squarebelowcmb": "\u033b", "squarecc": "\u33c4", "squarecm": "\u339d", "squarediagonalcrosshatchfill": "\u25a9", "squarehorizontalfill": "\u25a4", "squarekg": "\u338f", "squarekm": "\u339e", "squarekmcapital": "\u33ce", "squareln": "\u33d1", "squarelog": "\u33d2", "squaremg": "\u338e", "squaremil": "\u33d5", "squaremm": "\u339c", "squaremsquared": "\u33a1", "squareorthogonalcrosshatchfill": "\u25a6", "squareupperlefttolowerrightfill": "\u25a7", "squareupperrighttolowerleftfill": "\u25a8", "squareverticalfill": "\u25a5", "squarewhitewithsmallblack": "\u25a3", "srsquare": "\u33db", "ssabengali": "\u09b7", "ssadeva": "\u0937", "ssagujarati": "\u0ab7", "ssangcieuckorean": "\u3149", "ssanghieuhkorean": "\u3185", "ssangieungkorean": "\u3180", "ssangkiyeokkorean": "\u3132", "ssangnieunkorean": "\u3165", "ssangpieupkorean": "\u3143", "ssangsioskorean": "\u3146", "ssangtikeutkorean": "\u3138", "ssuperior": "\uf6f2", "sterling": "\u00a3", 
"sterlingmonospace": "\uffe1", "strokelongoverlaycmb": "\u0336", "strokeshortoverlaycmb": "\u0335", "subset": "\u2282", "subsetnotequal": "\u228a", "subsetorequal": "\u2286", "succeeds": "\u227b", "suchthat": "\u220b", "suhiragana": "\u3059", "sukatakana": "\u30b9", "sukatakanahalfwidth": "\uff7d", "sukunarabic": "\u0652", "summation": "\u2211", "sun": "\u263c", "superset": "\u2283", "supersetnotequal": "\u228b", "supersetorequal": "\u2287", "svsquare": "\u33dc", "syouwaerasquare": "\u337c", "t": "\u0074", "tabengali": "\u09a4", "tackdown": "\u22a4", "tackleft": "\u22a3", "tadeva": "\u0924", "tagujarati": "\u0aa4", "tagurmukhi": "\u0a24", "taharabic": "\u0637", "tahfinalarabic": "\ufec2", "tahinitialarabic": "\ufec3", "tahiragana": "\u305f", "tahmedialarabic": "\ufec4", "taisyouerasquare": "\u337d", "takatakana": "\u30bf", "takatakanahalfwidth": "\uff80", "tatweelarabic": "\u0640", "tau": "\u03c4", "tav": "\u05ea", "tavdages": "\ufb4a", "tavdagesh": "\ufb4a", "tavdageshhebrew": "\ufb4a", "tavhebrew": "\u05ea", "tbar": "\u0167", "tbopomofo": "\u310a", "tcaron": "\u0165", "tccurl": "\u02a8", "tcedilla": "\u0163", "tcheharabic": "\u0686", "tchehfinalarabic": "\ufb7b", "tchehinitialarabic": "\ufb7c", "tchehmedialarabic": "\ufb7d", "tchehmeeminitialarabic": "\ufb7c\ufee4", "tcircle": "\u24e3", "tcircumflexbelow": "\u1e71", "tcommaaccent": "\u0163", "tdieresis": "\u1e97", "tdotaccent": "\u1e6b", "tdotbelow": "\u1e6d", "tecyrillic": "\u0442", "tedescendercyrillic": "\u04ad", "teharabic": "\u062a", "tehfinalarabic": "\ufe96", "tehhahinitialarabic": "\ufca2", "tehhahisolatedarabic": "\ufc0c", "tehinitialarabic": "\ufe97", "tehiragana": "\u3066", "tehjeeminitialarabic": "\ufca1", "tehjeemisolatedarabic": "\ufc0b", "tehmarbutaarabic": "\u0629", "tehmarbutafinalarabic": "\ufe94", "tehmedialarabic": "\ufe98", "tehmeeminitialarabic": "\ufca4", "tehmeemisolatedarabic": "\ufc0e", "tehnoonfinalarabic": "\ufc73", "tekatakana": "\u30c6", "tekatakanahalfwidth": "\uff83", "telephone": 
"\u2121", "telephoneblack": "\u260e", "telishagedolahebrew": "\u05a0", "telishaqetanahebrew": "\u05a9", "tencircle": "\u2469", "tenideographicparen": "\u3229", "tenparen": "\u247d", "tenperiod": "\u2491", "tenroman": "\u2179", "tesh": "\u02a7", "tet": "\u05d8", "tetdagesh": "\ufb38", "tetdageshhebrew": "\ufb38", "tethebrew": "\u05d8", "tetsecyrillic": "\u04b5", "tevirhebrew": "\u059b", "tevirlefthebrew": "\u059b", "thabengali": "\u09a5", "thadeva": "\u0925", "thagujarati": "\u0aa5", "thagurmukhi": "\u0a25", "thalarabic": "\u0630", "thalfinalarabic": "\ufeac", "thanthakhatlowleftthai": "\uf898", "thanthakhatlowrightthai": "\uf897", "thanthakhatthai": "\u0e4c", "thanthakhatupperleftthai": "\uf896", "theharabic": "\u062b", "thehfinalarabic": "\ufe9a", "thehinitialarabic": "\ufe9b", "thehmedialarabic": "\ufe9c", "thereexists": "\u2203", "therefore": "\u2234", "theta": "\u03b8", "theta1": "\u03d1", "thetasymbolgreek": "\u03d1", "thieuthacirclekorean": "\u3279", "thieuthaparenkorean": "\u3219", "thieuthcirclekorean": "\u326b", "thieuthkorean": "\u314c", "thieuthparenkorean": "\u320b", "thirteencircle": "\u246c", "thirteenparen": "\u2480", "thirteenperiod": "\u2494", "thonangmonthothai": "\u0e11", "thook": "\u01ad", "thophuthaothai": "\u0e12", "thorn": "\u00fe", "thothahanthai": "\u0e17", "thothanthai": "\u0e10", "thothongthai": "\u0e18", "thothungthai": "\u0e16", "thousandcyrillic": "\u0482", "thousandsseparatorarabic": "\u066c", "thousandsseparatorpersian": "\u066c", "three": "\u0033", "threearabic": "\u0663", "threebengali": "\u09e9", "threecircle": "\u2462", "threecircleinversesansserif": "\u278c", "threedeva": "\u0969", "threeeighths": "\u215c", "threegujarati": "\u0ae9", "threegurmukhi": "\u0a69", "threehackarabic": "\u0663", "threehangzhou": "\u3023", "threeideographicparen": "\u3222", "threeinferior": "\u2083", "threemonospace": "\uff13", "threenumeratorbengali": "\u09f6", "threeoldstyle": "\uf733", "threeparen": "\u2476", "threeperiod": "\u248a", "threepersian": 
"\u06f3", "threequarters": "\u00be", "threequartersemdash": "\uf6de", "threeroman": "\u2172", "threesuperior": "\u00b3", "threethai": "\u0e53", "thzsquare": "\u3394", "tihiragana": "\u3061", "tikatakana": "\u30c1", "tikatakanahalfwidth": "\uff81", "tikeutacirclekorean": "\u3270", "tikeutaparenkorean": "\u3210", "tikeutcirclekorean": "\u3262", "tikeutkorean": "\u3137", "tikeutparenkorean": "\u3202", "tilde": "\u02dc", "tildebelowcmb": "\u0330", "tildecmb": "\u0303", "tildecomb": "\u0303", "tildedoublecmb": "\u0360", "tildeoperator": "\u223c", "tildeoverlaycmb": "\u0334", "tildeverticalcmb": "\u033e", "timescircle": "\u2297", "tipehahebrew": "\u0596", "tipehalefthebrew": "\u0596", "tippigurmukhi": "\u0a70", "titlocyrilliccmb": "\u0483", "tiwnarmenian": "\u057f", "tlinebelow": "\u1e6f", "tmonospace": "\uff54", "toarmenian": "\u0569", "tohiragana": "\u3068", "tokatakana": "\u30c8", "tokatakanahalfwidth": "\uff84", "tonebarextrahighmod": "\u02e5", "tonebarextralowmod": "\u02e9", "tonebarhighmod": "\u02e6", "tonebarlowmod": "\u02e8", "tonebarmidmod": "\u02e7", "tonefive": "\u01bd", "tonesix": "\u0185", "tonetwo": "\u01a8", "tonos": "\u0384", "tonsquare": "\u3327", "topatakthai": "\u0e0f", "tortoiseshellbracketleft": "\u3014", "tortoiseshellbracketleftsmall": "\ufe5d", "tortoiseshellbracketleftvertical": "\ufe39", "tortoiseshellbracketright": "\u3015", "tortoiseshellbracketrightsmall": "\ufe5e", "tortoiseshellbracketrightvertical": "\ufe3a", "totaothai": "\u0e15", "tpalatalhook": "\u01ab", "tparen": "\u24af", "trademark": "\u2122", "trademarksans": "\uf8ea", "trademarkserif": "\uf6db", "tretroflexhook": "\u0288", "triagdn": "\u25bc", "triaglf": "\u25c4", "triagrt": "\u25ba", "triagup": "\u25b2", "ts": "\u02a6", "tsadi": "\u05e6", "tsadidagesh": "\ufb46", "tsadidageshhebrew": "\ufb46", "tsadihebrew": "\u05e6", "tsecyrillic": "\u0446", "tsere": "\u05b5", "tsere12": "\u05b5", "tsere1e": "\u05b5", "tsere2b": "\u05b5", "tserehebrew": "\u05b5", "tserenarrowhebrew": "\u05b5", 
"tserequarterhebrew": "\u05b5", "tserewidehebrew": "\u05b5", "tshecyrillic": "\u045b", "tsuperior": "\uf6f3", "ttabengali": "\u099f", "ttadeva": "\u091f", "ttagujarati": "\u0a9f", "ttagurmukhi": "\u0a1f", "tteharabic": "\u0679", "ttehfinalarabic": "\ufb67", "ttehinitialarabic": "\ufb68", "ttehmedialarabic": "\ufb69", "tthabengali": "\u09a0", "tthadeva": "\u0920", "tthagujarati": "\u0aa0", "tthagurmukhi": "\u0a20", "tturned": "\u0287", "tuhiragana": "\u3064", "tukatakana": "\u30c4", "tukatakanahalfwidth": "\uff82", "tusmallhiragana": "\u3063", "tusmallkatakana": "\u30c3", "tusmallkatakanahalfwidth": "\uff6f", "twelvecircle": "\u246b", "twelveparen": "\u247f", "twelveperiod": "\u2493", "twelveroman": "\u217b", "twentycircle": "\u2473", "twentyhangzhou": "\u5344", "twentyparen": "\u2487", "twentyperiod": "\u249b", "two": "\u0032", "twoarabic": "\u0662", "twobengali": "\u09e8", "twocircle": "\u2461", "twocircleinversesansserif": "\u278b", "twodeva": "\u0968", "twodotenleader": "\u2025", "twodotleader": "\u2025", "twodotleadervertical": "\ufe30", "twogujarati": "\u0ae8", "twogurmukhi": "\u0a68", "twohackarabic": "\u0662", "twohangzhou": "\u3022", "twoideographicparen": "\u3221", "twoinferior": "\u2082", "twomonospace": "\uff12", "twonumeratorbengali": "\u09f5", "twooldstyle": "\uf732", "twoparen": "\u2475", "twoperiod": "\u2489", "twopersian": "\u06f2", "tworoman": "\u2171", "twostroke": "\u01bb", "twosuperior": "\u00b2", "twothai": "\u0e52", "twothirds": "\u2154", "u": "\u0075", "uacute": "\u00fa", "ubar": "\u0289", "ubengali": "\u0989", "ubopomofo": "\u3128", "ubreve": "\u016d", "ucaron": "\u01d4", "ucircle": "\u24e4", "ucircumflex": "\u00fb", "ucircumflexbelow": "\u1e77", "ucyrillic": "\u0443", "udattadeva": "\u0951", "udblacute": "\u0171", "udblgrave": "\u0215", "udeva": "\u0909", "udieresis": "\u00fc", "udieresisacute": "\u01d8", "udieresisbelow": "\u1e73", "udieresiscaron": "\u01da", "udieresiscyrillic": "\u04f1", "udieresisgrave": "\u01dc", "udieresismacron": 
"\u01d6", "udotbelow": "\u1ee5", "ugrave": "\u00f9", "ugujarati": "\u0a89", "ugurmukhi": "\u0a09", "uhiragana": "\u3046", "uhookabove": "\u1ee7", "uhorn": "\u01b0", "uhornacute": "\u1ee9", "uhorndotbelow": "\u1ef1", "uhorngrave": "\u1eeb", "uhornhookabove": "\u1eed", "uhorntilde": "\u1eef", "uhungarumlaut": "\u0171", "uhungarumlautcyrillic": "\u04f3", "uinvertedbreve": "\u0217", "ukatakana": "\u30a6", "ukatakanahalfwidth": "\uff73", "ukcyrillic": "\u0479", "ukorean": "\u315c", "umacron": "\u016b", "umacroncyrillic": "\u04ef", "umacrondieresis": "\u1e7b", "umatragurmukhi": "\u0a41", "umonospace": "\uff55", "underscore": "\u005f", "underscoredbl": "\u2017", "underscoremonospace": "\uff3f", "underscorevertical": "\ufe33", "underscorewavy": "\ufe4f", "union": "\u222a", "universal": "\u2200", "uogonek": "\u0173", "uparen": "\u24b0", "upblock": "\u2580", "upperdothebrew": "\u05c4", "upsilon": "\u03c5", "upsilondieresis": "\u03cb", "upsilondieresistonos": "\u03b0", "upsilonlatin": "\u028a", "upsilontonos": "\u03cd", "uptackbelowcmb": "\u031d", "uptackmod": "\u02d4", "uragurmukhi": "\u0a73", "uring": "\u016f", "ushortcyrillic": "\u045e", "usmallhiragana": "\u3045", "usmallkatakana": "\u30a5", "usmallkatakanahalfwidth": "\uff69", "ustraightcyrillic": "\u04af", "ustraightstrokecyrillic": "\u04b1", "utilde": "\u0169", "utildeacute": "\u1e79", "utildebelow": "\u1e75", "uubengali": "\u098a", "uudeva": "\u090a", "uugujarati": "\u0a8a", "uugurmukhi": "\u0a0a", "uumatragurmukhi": "\u0a42", "uuvowelsignbengali": "\u09c2", "uuvowelsigndeva": "\u0942", "uuvowelsigngujarati": "\u0ac2", "uvowelsignbengali": "\u09c1", "uvowelsigndeva": "\u0941", "uvowelsigngujarati": "\u0ac1", "v": "\u0076", "vadeva": "\u0935", "vagujarati": "\u0ab5", "vagurmukhi": "\u0a35", "vakatakana": "\u30f7", "vav": "\u05d5", "vavdagesh": "\ufb35", "vavdagesh65": "\ufb35", "vavdageshhebrew": "\ufb35", "vavhebrew": "\u05d5", "vavholam": "\ufb4b", "vavholamhebrew": "\ufb4b", "vavvavhebrew": "\u05f0", "vavyodhebrew": 
"\u05f1", "vcircle": "\u24e5", "vdotbelow": "\u1e7f", "vecyrillic": "\u0432", "veharabic": "\u06a4", "vehfinalarabic": "\ufb6b", "vehinitialarabic": "\ufb6c", "vehmedialarabic": "\ufb6d", "vekatakana": "\u30f9", "venus": "\u2640", "verticalbar": "\u007c", "verticallineabovecmb": "\u030d", "verticallinebelowcmb": "\u0329", "verticallinelowmod": "\u02cc", "verticallinemod": "\u02c8", "vewarmenian": "\u057e", "vhook": "\u028b", "vikatakana": "\u30f8", "viramabengali": "\u09cd", "viramadeva": "\u094d", "viramagujarati": "\u0acd", "visargabengali": "\u0983", "visargadeva": "\u0903", "visargagujarati": "\u0a83", "vmonospace": "\uff56", "voarmenian": "\u0578", "voicediterationhiragana": "\u309e", "voicediterationkatakana": "\u30fe", "voicedmarkkana": "\u309b", "voicedmarkkanahalfwidth": "\uff9e", "vokatakana": "\u30fa", "vparen": "\u24b1", "vtilde": "\u1e7d", "vturned": "\u028c", "vuhiragana": "\u3094", "vukatakana": "\u30f4", "w": "\u0077", "wacute": "\u1e83", "waekorean": "\u3159", "wahiragana": "\u308f", "wakatakana": "\u30ef", "wakatakanahalfwidth": "\uff9c", "wakorean": "\u3158", "wasmallhiragana": "\u308e", "wasmallkatakana": "\u30ee", "wattosquare": "\u3357", "wavedash": "\u301c", "wavyunderscorevertical": "\ufe34", "wawarabic": "\u0648", "wawfinalarabic": "\ufeee", "wawhamzaabovearabic": "\u0624", "wawhamzaabovefinalarabic": "\ufe86", "wbsquare": "\u33dd", "wcircle": "\u24e6", "wcircumflex": "\u0175", "wdieresis": "\u1e85", "wdotaccent": "\u1e87", "wdotbelow": "\u1e89", "wehiragana": "\u3091", "weierstrass": "\u2118", "wekatakana": "\u30f1", "wekorean": "\u315e", "weokorean": "\u315d", "wgrave": "\u1e81", "whitebullet": "\u25e6", "whitecircle": "\u25cb", "whitecircleinverse": "\u25d9", "whitecornerbracketleft": "\u300e", "whitecornerbracketleftvertical": "\ufe43", "whitecornerbracketright": "\u300f", "whitecornerbracketrightvertical": "\ufe44", "whitediamond": "\u25c7", "whitediamondcontainingblacksmalldiamond": "\u25c8", "whitedownpointingsmalltriangle": 
"\u25bf", "whitedownpointingtriangle": "\u25bd", "whiteleftpointingsmalltriangle": "\u25c3", "whiteleftpointingtriangle": "\u25c1", "whitelenticularbracketleft": "\u3016", "whitelenticularbracketright": "\u3017", "whiterightpointingsmalltriangle": "\u25b9", "whiterightpointingtriangle": "\u25b7", "whitesmallsquare": "\u25ab", "whitesmilingface": "\u263a", "whitesquare": "\u25a1", "whitestar": "\u2606", "whitetelephone": "\u260f", "whitetortoiseshellbracketleft": "\u3018", "whitetortoiseshellbracketright": "\u3019", "whiteuppointingsmalltriangle": "\u25b5", "whiteuppointingtriangle": "\u25b3", "wihiragana": "\u3090", "wikatakana": "\u30f0", "wikorean": "\u315f", "wmonospace": "\uff57", "wohiragana": "\u3092", "wokatakana": "\u30f2", "wokatakanahalfwidth": "\uff66", "won": "\u20a9", "wonmonospace": "\uffe6", "wowaenthai": "\u0e27", "wparen": "\u24b2", "wring": "\u1e98", "wsuperior": "\u02b7", "wturned": "\u028d", "wynn": "\u01bf", "x": "\u0078", "xabovecmb": "\u033d", "xbopomofo": "\u3112", "xcircle": "\u24e7", "xdieresis": "\u1e8d", "xdotaccent": "\u1e8b", "xeharmenian": "\u056d", "xi": "\u03be", "xmonospace": "\uff58", "xparen": "\u24b3", "xsuperior": "\u02e3", "y": "\u0079", "yaadosquare": "\u334e", "yabengali": "\u09af", "yacute": "\u00fd", "yadeva": "\u092f", "yaekorean": "\u3152", "yagujarati": "\u0aaf", "yagurmukhi": "\u0a2f", "yahiragana": "\u3084", "yakatakana": "\u30e4", "yakatakanahalfwidth": "\uff94", "yakorean": "\u3151", "yamakkanthai": "\u0e4e", "yasmallhiragana": "\u3083", "yasmallkatakana": "\u30e3", "yasmallkatakanahalfwidth": "\uff6c", "yatcyrillic": "\u0463", "ycircle": "\u24e8", "ycircumflex": "\u0177", "ydieresis": "\u00ff", "ydotaccent": "\u1e8f", "ydotbelow": "\u1ef5", "yeharabic": "\u064a", "yehbarreearabic": "\u06d2", "yehbarreefinalarabic": "\ufbaf", "yehfinalarabic": "\ufef2", "yehhamzaabovearabic": "\u0626", "yehhamzaabovefinalarabic": "\ufe8a", "yehhamzaaboveinitialarabic": "\ufe8b", "yehhamzaabovemedialarabic": "\ufe8c", 
"yehinitialarabic": "\ufef3", "yehmedialarabic": "\ufef4", "yehmeeminitialarabic": "\ufcdd", "yehmeemisolatedarabic": "\ufc58", "yehnoonfinalarabic": "\ufc94", "yehthreedotsbelowarabic": "\u06d1", "yekorean": "\u3156", "yen": "\u00a5", "yenmonospace": "\uffe5", "yeokorean": "\u3155", "yeorinhieuhkorean": "\u3186", "yerahbenyomohebrew": "\u05aa", "yerahbenyomolefthebrew": "\u05aa", "yericyrillic": "\u044b", "yerudieresiscyrillic": "\u04f9", "yesieungkorean": "\u3181", "yesieungpansioskorean": "\u3183", "yesieungsioskorean": "\u3182", "yetivhebrew": "\u059a", "ygrave": "\u1ef3", "yhook": "\u01b4", "yhookabove": "\u1ef7", "yiarmenian": "\u0575", "yicyrillic": "\u0457", "yikorean": "\u3162", "yinyang": "\u262f", "yiwnarmenian": "\u0582", "ymonospace": "\uff59", "yod": "\u05d9", "yoddagesh": "\ufb39", "yoddageshhebrew": "\ufb39", "yodhebrew": "\u05d9", "yodyodhebrew": "\u05f2", "yodyodpatahhebrew": "\ufb1f", "yohiragana": "\u3088", "yoikorean": "\u3189", "yokatakana": "\u30e8", "yokatakanahalfwidth": "\uff96", "yokorean": "\u315b", "yosmallhiragana": "\u3087", "yosmallkatakana": "\u30e7", "yosmallkatakanahalfwidth": "\uff6e", "yotgreek": "\u03f3", "yoyaekorean": "\u3188", "yoyakorean": "\u3187", "yoyakthai": "\u0e22", "yoyingthai": "\u0e0d", "yparen": "\u24b4", "ypogegrammeni": "\u037a", "ypogegrammenigreekcmb": "\u0345", "yr": "\u01a6", "yring": "\u1e99", "ysuperior": "\u02b8", "ytilde": "\u1ef9", "yturned": "\u028e", "yuhiragana": "\u3086", "yuikorean": "\u318c", "yukatakana": "\u30e6", "yukatakanahalfwidth": "\uff95", "yukorean": "\u3160", "yusbigcyrillic": "\u046b", "yusbigiotifiedcyrillic": "\u046d", "yuslittlecyrillic": "\u0467", "yuslittleiotifiedcyrillic": "\u0469", "yusmallhiragana": "\u3085", "yusmallkatakana": "\u30e5", "yusmallkatakanahalfwidth": "\uff6d", "yuyekorean": "\u318b", "yuyeokorean": "\u318a", "yyabengali": "\u09df", "yyadeva": "\u095f", "z": "\u007a", "zaarmenian": "\u0566", "zacute": "\u017a", "zadeva": "\u095b", "zagurmukhi": "\u0a5b", 
"zaharabic": "\u0638", "zahfinalarabic": "\ufec6", "zahinitialarabic": "\ufec7", "zahiragana": "\u3056", "zahmedialarabic": "\ufec8", "zainarabic": "\u0632", "zainfinalarabic": "\ufeb0", "zakatakana": "\u30b6", "zaqefgadolhebrew": "\u0595", "zaqefqatanhebrew": "\u0594", "zarqahebrew": "\u0598", "zayin": "\u05d6", "zayindagesh": "\ufb36", "zayindageshhebrew": "\ufb36", "zayinhebrew": "\u05d6", "zbopomofo": "\u3117", "zcaron": "\u017e", "zcircle": "\u24e9", "zcircumflex": "\u1e91", "zcurl": "\u0291", "zdot": "\u017c", "zdotaccent": "\u017c", "zdotbelow": "\u1e93", "zecyrillic": "\u0437", "zedescendercyrillic": "\u0499", "zedieresiscyrillic": "\u04df", "zehiragana": "\u305c", "zekatakana": "\u30bc", "zero": "\u0030", "zeroarabic": "\u0660", "zerobengali": "\u09e6", "zerodeva": "\u0966", "zerogujarati": "\u0ae6", "zerogurmukhi": "\u0a66", "zerohackarabic": "\u0660", "zeroinferior": "\u2080", "zeromonospace": "\uff10", "zerooldstyle": "\uf730", "zeropersian": "\u06f0", "zerosuperior": "\u2070", "zerothai": "\u0e50", "zerowidthjoiner": "\ufeff", "zerowidthnonjoiner": "\u200c", "zerowidthspace": "\u200b", "zeta": "\u03b6", "zhbopomofo": "\u3113", "zhearmenian": "\u056a", "zhebrevecyrillic": "\u04c2", "zhecyrillic": "\u0436", "zhedescendercyrillic": "\u0497", "zhedieresiscyrillic": "\u04dd", "zihiragana": "\u3058", "zikatakana": "\u30b8", "zinorhebrew": "\u05ae", "zlinebelow": "\u1e95", "zmonospace": "\uff5a", "zohiragana": "\u305e", "zokatakana": "\u30be", "zparen": "\u24b5", "zretroflexhook": "\u0290", "zstroke": "\u01b6", "zuhiragana": "\u305a", "zukatakana": "\u30ba", } # --end ================================================ FILE: babeldoc/pdfminer/high_level.py ================================================ """Functions that can be used for the most common use-cases for pdfminer.six""" import logging import sys from collections.abc import Container from collections.abc import Iterator from io import StringIO from typing import Any from typing import BinaryIO from 
def extract_text_to_fp(
    inf: BinaryIO,
    outfp: AnyIO,
    output_type: str = "text",
    codec: str = "utf-8",
    laparams: LAParams | None = None,
    maxpages: int = 0,
    page_numbers: Container[int] | None = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: str | None = None,
    strip_control: bool = False,
    debug: bool = False,
    disable_caching: bool = False,
    **kwargs: Any,
) -> None:
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from babeldoc.pdfminer.layout. Default
        is None but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor (only passed to the HTML converter)
    :param rotation: Rotation in degrees added to every page's own rotation
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Forwarded as ``stripcontrol`` to the XML and hOCR
        converters
    :param debug: Output more logging data
    :param disable_caching: Disable caching in the resource manager and
        while reading pages
    :param kwargs: Accepted for call compatibility; not used by this function.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    :raises PDFValueError: if ``output_type`` is not one of the supported
        values.
    """
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)
    device: PDFDevice

    # Every converter except the plain-text one is handed the binary buffer
    # when the target is the process's stdout.
    if output_type != "text" and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    if output_type == "text":
        device = TextConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
        )
    elif output_type == "xml":
        device = XMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            stripcontrol=strip_control,
        )
    elif output_type == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            scale=scale,
            layoutmode=layoutmode,
            laparams=laparams,
            imagewriter=imagewriter,
        )
    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            stripcontrol=strip_control,
        )
    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
    else:
        # List every accepted value, including 'hocr', so the message matches
        # the branches above.
        msg = (
            "Output type can be text, html, xml, hocr or tag "
            f"but is {output_type}"
        )
        raise PDFValueError(msg)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(
        inf,
        page_numbers,
        maxpages=maxpages,
        password=password,
        caching=not disable_caching,
    ):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()
def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Container[int] | None = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: LAParams | None = None,
) -> Iterator[LTPage]:
    """Lazily yield the analysed layout of each page of a PDF.

    The file stays open while the generator is being consumed; one result is
    produced per processed page.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from babeldoc.pdfminer.layout. If None,
        a default-constructed LAParams is used.
    :return: LTPage objects
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as stream:
        binary_stream = cast(BinaryIO, stream)  # opened with mode "rb"
        rsrcmgr = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(rsrcmgr, laparams=layout_params)
        interpreter = PDFPageInterpreter(rsrcmgr, aggregator)
        pages = PDFPage.get_pages(
            binary_stream,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for page in pages:
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            yield aggregator.get_result()
class ImageWriter:
    """Write image to a file

    Supports various image types: JPEG, JBIG2 and bitmaps
    """

    def __init__(self, outdir: str) -> None:
        """Remember ``outdir`` and create it when it does not exist yet."""
        self.outdir = outdir
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)

    def export_image(self, image: LTImage) -> str:
        """Save an LTImage to disk and return the file name that was used.

        The writer is picked from the stream's last filter, then from the
        image's bit depth and colour space; anything unrecognised is dumped
        unchanged by ``_save_raw``.
        """
        (width, height) = image.srcsize

        filters = image.stream.get_filters()
        # A stream may have no filters at all; only look at the last filter
        # when there is one instead of failing with IndexError.
        has_filters = bool(filters)

        if has_filters and filters[-1][0] in LITERALS_DCT_DECODE:
            return self._save_jpeg(image)

        if has_filters and filters[-1][0] in LITERALS_JPX_DECODE:
            return self._save_jpeg2000(image)

        if self._is_jbig2_iamge(image):
            return self._save_jbig2(image)

        if image.bits == 1:
            return self._save_bmp(
                image, width, height, (width + 7) // 8, image.bits
            )

        if image.bits == 8 and (
            LITERAL_DEVICE_RGB in image.colorspace
            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
        ):
            return self._save_bmp(image, width, height, width * 3, image.bits * 3)

        if image.bits == 8 and (
            LITERAL_DEVICE_GRAY in image.colorspace
            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
        ):
            return self._save_bmp(image, width, height, width, image.bits)

        if len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
            return self._save_bytes(image)

        return self._save_raw(image)

    def _save_jpeg(self, image: LTImage) -> str:
        """Save a JPEG encoded image.

        CMYK images are inverted and converted to RGB with Pillow; all other
        images are written byte-for-byte.

        :raises ImportError: if a CMYK image is given and Pillow is missing.
        """
        data = image.stream.get_data()

        name, path = self._create_unique_image_name(image, ".jpg")
        is_cmyk = LITERAL_DEVICE_CMYK in image.colorspace
        if is_cmyk:
            # Import before opening the target so a missing Pillow does not
            # leave an empty file behind.
            try:
                from PIL import Image  # type: ignore[import]
                from PIL import ImageChops  # type: ignore[import]
            except ImportError as err:
                raise ImportError(PIL_ERROR_MESSAGE) from err

        with open(path, "wb") as fp:
            if is_cmyk:
                ifp = BytesIO(data)
                i = Image.open(ifp)
                i = ImageChops.invert(i)
                i = i.convert("RGB")
                i.save(fp, "JPEG")
            else:
                fp.write(data)

        return name

    def _save_jpeg2000(self, image: LTImage) -> str:
        """Save a JPEG 2000 encoded image.

        :raises ImportError: if Pillow is missing.
        """
        data = image.stream.get_data()

        name, path = self._create_unique_image_name(image, ".jp2")
        # Import before opening the target so a missing Pillow does not leave
        # an empty file behind.
        try:
            from PIL import Image  # type: ignore[import]
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        with open(path, "wb") as fp:
            # if we just write the raw data, most image programs
            # that I have tried cannot open the file. However,
            # open and saving with PIL produces a file that
            # seems to be easily opened by other programs
            ifp = BytesIO(data)
            i = Image.open(ifp)
            i.save(fp, "JPEG2000")
        return name

    def _save_jbig2(self, image: LTImage) -> str:
        """Save a JBIG2 encoded image as a standalone JBIG2 file.

        The optional global segments are placed in front of the image's own
        segments before they are re-serialised.

        :raises PDFValueError: if more than one JBIG2Globals stream is found.
        """
        name, path = self._create_unique_image_name(image, ".jb2")
        with open(path, "wb") as fp:
            input_stream = BytesIO()

            global_streams = []
            filters = image.stream.get_filters()
            for filter_name, params in filters:
                # Skip JBIG2 filters that carry no globals instead of failing
                # on the lookup.
                # NOTE(review): assumes params is a mapping or None - confirm
                # against the stream's get_filters().
                if (
                    filter_name in LITERALS_JBIG2_DECODE
                    and params
                    and "JBIG2Globals" in params
                ):
                    global_streams.append(params["JBIG2Globals"].resolve())

            if len(global_streams) > 1:
                msg = (
                    "There should never be more than one JBIG2Globals "
                    "associated with a JBIG2 embedded image"
                )
                raise PDFValueError(msg)
            if len(global_streams) == 1:
                input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
            input_stream.write(image.stream.get_data())
            input_stream.seek(0)
            reader = JBIG2StreamReader(input_stream)
            segments = reader.get_segments()

            writer = JBIG2StreamWriter(fp)
            writer.write_file(segments)
        return name

    def _save_bmp(
        self,
        image: LTImage,
        width: int,
        height: int,
        bytes_per_line: int,
        bits: int,
    ) -> str:
        """Save a BMP encoded image, one ``bytes_per_line`` slice per row."""
        name, path = self._create_unique_image_name(image, ".bmp")
        with open(path, "wb") as fp:
            bmp = BMPWriter(fp, bits, width, height)
            data = image.stream.get_data()
            for y in range(height):
                start = y * bytes_per_line
                bmp.write_line(y, data[start : start + bytes_per_line])
        return name

    def _save_bytes(self, image: LTImage) -> str:
        """Save an image without encoding, just bytes.

        The Pillow mode is derived from the bit depth and the number of
        channels implied by the data length.

        :raises ImportError: if Pillow is missing.
        :raises PDFValueError: if the bit depth / channel combination has no
            matching Pillow mode.
        """
        name, path = self._create_unique_image_name(image, ".jpg")
        width, height = image.srcsize
        data = image.stream.get_data()
        channels = len(data) / width / height / (image.bits / 8)

        try:
            from PIL import Image  # type: ignore[import]
            from PIL import ImageOps
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        mode: Literal["1", "L", "RGB", "CMYK"] | None = None
        if image.bits == 1:
            mode = "1"
        elif image.bits == 8 and channels == 1:
            mode = "L"
        elif image.bits == 8 and channels == 3:
            mode = "RGB"
        elif image.bits == 8 and channels == 4:
            mode = "CMYK"

        if mode is None:
            # Fail with a clear message rather than an unbound local variable.
            msg = (
                f"Unsupported image format: {image.bits} bits per component "
                f"with {channels} channels"
            )
            raise PDFValueError(msg)

        with open(path, "wb") as fp:
            img = Image.frombytes(mode, image.srcsize, data, "raw")
            if mode == "L":
                img = ImageOps.invert(img)

            img.save(fp)

        return name

    def _save_raw(self, image: LTImage) -> str:
        """Save an image with unknown encoding.

        The stream data is written unchanged; bit depth and size are encoded
        in the file extension.
        """
        ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1])
        name, path = self._create_unique_image_name(image, ext)

        with open(path, "wb") as fp:
            fp.write(image.stream.get_data())
        return name

    @staticmethod
    def _is_jbig2_iamge(image: LTImage) -> bool:
        """Return True when any filter of the image stream is a JBIG2 filter."""
        filters = image.stream.get_filters()
        return any(
            filter_name in LITERALS_JBIG2_DECODE for filter_name, _params in filters
        )

    def _create_unique_image_name(self, image: LTImage, ext: str) -> tuple[str, str]:
        """Return a ``(name, path)`` pair in ``outdir`` that does not exist yet.

        ``<image.name><ext>`` is tried first, then ``<image.name>.<n><ext>``
        with ``n`` counting up from 0.
        """
        name = image.name + ext
        path = os.path.join(self.outdir, name)
        img_index = 0
        while os.path.exists(path):
            name = "%s.%d%s" % (image.name, img_index, ext)
            path = os.path.join(self.outdir, name)
            img_index += 1
        return name, path
def masked_value(mask: int, value: int) -> int:
    """Extract the bit field selected by ``mask`` from ``value``.

    The bits of ``value`` covered by ``mask`` are shifted right so that the
    lowest set bit of ``mask`` ends up at bit 0.

    :param mask: bit mask; its lowest set bit must lie within bits 0-31.
    :param value: integer the field is taken from.
    :return: the masked, right-aligned field.
    :raises PDFValueError: if ``mask`` has no set bit within bits 0-31.
    """
    # Scan all 32 bit positions so a mask that starts at bit 31 is accepted.
    for bit_pos in range(32):
        if (mask >> bit_pos) & 1:
            return (value & mask) >> bit_pos

    raise PDFValueError("Invalid mask or value")
def parse_retention_flags(
    self,
    segment: JBIG2Segment,
    flags: int,
    field: bytes,
) -> JBIG2RetentionFlags:
    """Parse the referred-to segment count, retain bits and references.

    ``flags`` is the already-unpacked first byte of the field and ``field``
    its raw bytes; any further bytes are read from ``self.stream``.

    :param segment: segment being built; its ``"number"`` entry (already
        parsed at this point) selects the width of each reference.
    :param flags: integer value of the first byte of the field.
    :param field: raw bytes of that first byte.
    :return: dict with ``ref_count``, ``retain_segments`` and
        ``ref_segments``.
    """
    ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
    retain_segments: list[bool] = []

    if ref_count < REF_COUNT_LONG:
        # Short form: the count sits in the top three bits, the five low
        # bits are retain flags.
        retain_segments.extend(bit_set(bit_pos, flags) for bit_pos in range(5))
    else:
        # Long form: the count occupies a 4-byte field, followed by whole
        # bytes of retain flags.
        field += self.stream.read(3)
        ref_count = masked_value(REF_COUNT_LONG_MASK, unpack_int(">L", field))
        ret_bytes_count = math.ceil((ref_count + 1) / 8)
        for _ in range(ret_bytes_count):
            ret_byte = unpack_int(">B", self.stream.read(1))
            # Every retain byte carries eight flags (JBIG2 segment header,
            # ITU-T T.88 section 7.2.4); reading only seven would misalign
            # all following flags.
            retain_segments.extend(
                bit_set(bit_pos, ret_byte) for bit_pos in range(8)
            )

    seg_num = segment["number"]
    assert isinstance(seg_num, int)
    # Reference width depends on the segment's own number: 1, 2 or 4 bytes
    # (ITU-T T.88 section 7.2.5). ">H" is the 2-byte format; ">I" would read
    # 4 bytes.
    if seg_num <= 256:
        ref_format = ">B"
    elif seg_num <= 65536:
        ref_format = ">H"
    else:
        ref_format = ">L"

    ref_size = calcsize(ref_format)
    ref_segments: list[int] = []
    for _ in range(ref_count):
        (ref,) = unpack(ref_format, self.stream.read(ref_size))
        ref_segments.append(ref)

    return {
        "ref_count": ref_count,
        "retain_segments": retain_segments,
        "ref_segments": ref_segments,
    }
class JBIG2StreamWriter:
    """Write JBIG2 segments to a file in JBIG2 format"""

    # Retention flags used for the synthetic end-of-page / end-of-file
    # segments, which refer to no other segment.
    EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = {
        "ref_count": 0,
        "ref_segments": cast(list[int], []),
        "retain_segments": cast(list[bool], []),
    }

    def __init__(self, stream: BinaryIO) -> None:
        """Remember the binary ``stream`` that all output is written to."""
        self.stream = stream

    def write_segments(
        self,
        segments: Iterable[JBIG2Segment],
        fix_last_page: bool = True,
    ) -> int:
        """Encode and write ``segments``; return the number of bytes written.

        With ``fix_last_page`` an end-of-page segment is appended when the
        last page seen was not closed by one.
        """
        data_len = 0
        current_page: int | None = None
        seg_num: int | None = None

        for segment in segments:
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

            seg_num = cast(int | None, segment["number"])

            if fix_last_page:
                seg_page = cast(int, segment.get("page_assoc"))

                if (
                    cast(JBIG2SegmentFlags, segment["flags"])["type"]
                    == SEG_TYPE_END_OF_PAGE
                ):
                    current_page = None
                elif seg_page:
                    current_page = seg_page

        if fix_last_page and current_page and (seg_num is not None):
            segment = self.get_eop_segment(seg_num + 1, current_page)
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

        return data_len

    def write_file(
        self,
        segments: Iterable[JBIG2Segment],
        fix_last_page: bool = True,
    ) -> int:
        """Write a complete JBIG2 file and return the number of bytes written.

        The output is the file header, the encoded ``segments`` and a closing
        end-of-file segment.
        """
        # The segments are needed twice (to write them and to find the last
        # segment number), so materialise one-shot iterables first.
        segments = list(segments)

        header = FILE_HEADER_ID
        header_flags = FILE_HEAD_FLAG_SEQUENTIAL
        header += pack(">B", header_flags)
        # The embedded JBIG2 files in a PDF always
        # only have one page
        number_of_pages = pack(">L", 1)
        header += number_of_pages
        self.stream.write(header)
        data_len = len(header)

        data_len += self.write_segments(segments, fix_last_page)

        seg_num = cast(int, segments[-1]["number"]) if segments else 0

        # Leave room for the end-of-page segment write_segments may have added.
        seg_num_offset = 2 if fix_last_page else 1
        eof_segment = self.get_eof_segment(seg_num + seg_num_offset)
        data = self.encode_segment(eof_segment)

        self.stream.write(data)
        data_len += len(data)

        return data_len

    def encode_segment(self, segment: JBIG2Segment) -> bytes:
        """Serialise one segment, field by field in ``SEG_STRUCT`` order.

        A field is encoded by the ``encode_<name>`` method when one exists and
        by a plain ``pack`` with the field's format otherwise.
        """
        parts = []
        for field_format, name in SEG_STRUCT:
            value = segment.get(name)
            encoder = getattr(self, "encode_%s" % name, None)
            if callable(encoder):
                parts.append(encoder(value, segment))
            else:
                parts.append(pack(field_format, value))
        return b"".join(parts)

    @staticmethod
    def _uses_long_page_assoc(
        flags: JBIG2SegmentFlags,
        segment: JBIG2Segment,
    ) -> bool:
        """Tell whether the page association needs the 4-byte form.

        An explicit ``page_assoc_long`` flag wins; otherwise the long form is
        chosen when the page number does not fit into one byte.
        """
        if "page_assoc_long" in flags:
            return bool(flags["page_assoc_long"])
        return cast(int, segment.get("page_assoc") or 0) > 255

    def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment) -> bytes:
        """Encode the segment header flags byte."""
        flags = 0
        if value.get("deferred"):
            flags |= HEADER_FLAG_DEFERRED

        # The page number lives under "page_assoc" (see SEG_STRUCT); looking
        # up "page" never found it.
        if self._uses_long_page_assoc(value, segment):
            flags |= HEADER_FLAG_PAGE_ASSOC_LONG

        flags |= mask_value(SEG_TYPE_MASK, value["type"])

        return pack(">B", flags)

    def encode_retention_flags(
        self,
        value: JBIG2RetentionFlags,
        segment: JBIG2Segment,
    ) -> bytes:
        """Encode the referred-to segment count, retain bits and references."""
        flags = []
        flags_format = ">B"
        ref_count = value["ref_count"]
        assert isinstance(ref_count, int)
        retain_segments = cast(list[bool], value.get("retain_segments", []))

        if ref_count <= 4:
            flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
            # Only the five low bits hold retain flags; more entries would
            # spill into the count bits.
            for ref_index, ref_retain in enumerate(retain_segments[:5]):
                if ref_retain:
                    flags_byte |= 1 << ref_index
            flags.append(flags_byte)
        else:
            bytes_count = math.ceil((ref_count + 1) / 8)
            flags_format = ">L" + ("B" * bytes_count)
            # Long form: the marker goes in the top three bits and the actual
            # count in the remaining bits; omitting the count makes a reader
            # see zero references.
            flags_dword = (
                mask_value(REF_COUNT_SHORT_MASK, REF_COUNT_LONG) << 24
            ) | mask_value(REF_COUNT_LONG_MASK, ref_count)
            flags.append(flags_dword)

            for byte_index in range(bytes_count):
                ret_byte = 0
                ret_part = retain_segments[byte_index * 8 : byte_index * 8 + 8]
                for bit_pos, ret_seg in enumerate(ret_part):
                    if ret_seg:
                        ret_byte |= 1 << bit_pos

                flags.append(ret_byte)

        ref_segments = cast(list[int], value.get("ref_segments", []))

        seg_num = cast(int, segment["number"])
        # Reference width is 1, 2 or 4 bytes depending on the segment number
        # (ITU-T T.88 section 7.2.5); "H" is the 2-byte format.
        if seg_num <= 256:
            ref_format = "B"
        elif seg_num <= 65536:
            ref_format = "H"
        else:
            ref_format = "L"

        for ref in ref_segments:
            flags_format += ref_format
            flags.append(ref)

        return pack(flags_format, *flags)

    def encode_page_assoc(self, value: int, segment: JBIG2Segment) -> bytes:
        """Encode the page association as 1 or 4 bytes.

        The width follows the same rule ``encode_flags`` uses for the long
        page-association flag, so header flag and field size always agree.
        """
        flags = cast(JBIG2SegmentFlags, segment.get("flags") or {})
        page = value or 0
        if self._uses_long_page_assoc(flags, segment):
            return pack(">L", page)
        return pack(">B", page)

    def encode_data_length(self, value: int, segment: JBIG2Segment) -> bytes:
        """Encode the data length followed by the segment's raw data."""
        data = pack(">L", value)
        # Segments parsed with a zero data length carry no "raw_data" entry.
        data += cast(bytes, segment.get("raw_data", b""))
        return data

    def get_eop_segment(self, seg_number: int, page_number: int) -> JBIG2Segment:
        """Build an empty end-of-page segment for ``page_number``."""
        return {
            "data_length": 0,
            "flags": {"deferred": False, "type": SEG_TYPE_END_OF_PAGE},
            "number": seg_number,
            "page_assoc": page_number,
            "raw_data": b"",
            "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
        }

    def get_eof_segment(self, seg_number: int) -> JBIG2Segment:
        """Build an empty end-of-file segment."""
        return {
            "data_length": 0,
            "flags": {"deferred": False, "type": SEG_TYPE_END_OF_FILE},
            "number": seg_number,
            "page_assoc": 0,
            "raw_data": b"",
            "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
        }
None, 236, 207, 207), ("Igrave", None, 237, 204, 204), ("J", 74, 74, 74, 74), ("K", 75, 75, 75, 75), ("L", 76, 76, 76, 76), ("Lslash", 232, None, None, 149), ("M", 77, 77, 77, 77), ("N", 78, 78, 78, 78), ("Ntilde", None, 132, 209, 209), ("O", 79, 79, 79, 79), ("OE", 234, 206, 140, 150), ("Oacute", None, 238, 211, 211), ("Ocircumflex", None, 239, 212, 212), ("Odieresis", None, 133, 214, 214), ("Ograve", None, 241, 210, 210), ("Oslash", 233, 175, 216, 216), ("Otilde", None, 205, 213, 213), ("P", 80, 80, 80, 80), ("Q", 81, 81, 81, 81), ("R", 82, 82, 82, 82), ("S", 83, 83, 83, 83), ("Scaron", None, None, 138, 151), ("T", 84, 84, 84, 84), ("Thorn", None, None, 222, 222), ("U", 85, 85, 85, 85), ("Uacute", None, 242, 218, 218), ("Ucircumflex", None, 243, 219, 219), ("Udieresis", None, 134, 220, 220), ("Ugrave", None, 244, 217, 217), ("V", 86, 86, 86, 86), ("W", 87, 87, 87, 87), ("X", 88, 88, 88, 88), ("Y", 89, 89, 89, 89), ("Yacute", None, None, 221, 221), ("Ydieresis", None, 217, 159, 152), ("Z", 90, 90, 90, 90), ("Zcaron", None, None, 142, 153), ("a", 97, 97, 97, 97), ("aacute", None, 135, 225, 225), ("acircumflex", None, 137, 226, 226), ("acute", 194, 171, 180, 180), ("adieresis", None, 138, 228, 228), ("ae", 241, 190, 230, 230), ("agrave", None, 136, 224, 224), ("ampersand", 38, 38, 38, 38), ("aring", None, 140, 229, 229), ("asciicircum", 94, 94, 94, 94), ("asciitilde", 126, 126, 126, 126), ("asterisk", 42, 42, 42, 42), ("at", 64, 64, 64, 64), ("atilde", None, 139, 227, 227), ("b", 98, 98, 98, 98), ("backslash", 92, 92, 92, 92), ("bar", 124, 124, 124, 124), ("braceleft", 123, 123, 123, 123), ("braceright", 125, 125, 125, 125), ("bracketleft", 91, 91, 91, 91), ("bracketright", 93, 93, 93, 93), ("breve", 198, 249, None, 24), ("brokenbar", None, None, 166, 166), ("bullet", 183, 165, 149, 128), ("c", 99, 99, 99, 99), ("caron", 207, 255, None, 25), ("ccedilla", None, 141, 231, 231), ("cedilla", 203, 252, 184, 184), ("cent", 162, 162, 162, 162), ("circumflex", 195, 246, 
136, 26), ("colon", 58, 58, 58, 58), ("comma", 44, 44, 44, 44), ("copyright", None, 169, 169, 169), ("currency", 168, 219, 164, 164), ("d", 100, 100, 100, 100), ("dagger", 178, 160, 134, 129), ("daggerdbl", 179, 224, 135, 130), ("degree", None, 161, 176, 176), ("dieresis", 200, 172, 168, 168), ("divide", None, 214, 247, 247), ("dollar", 36, 36, 36, 36), ("dotaccent", 199, 250, None, 27), ("dotlessi", 245, 245, None, 154), ("e", 101, 101, 101, 101), ("eacute", None, 142, 233, 233), ("ecircumflex", None, 144, 234, 234), ("edieresis", None, 145, 235, 235), ("egrave", None, 143, 232, 232), ("eight", 56, 56, 56, 56), ("ellipsis", 188, 201, 133, 131), ("emdash", 208, 209, 151, 132), ("endash", 177, 208, 150, 133), ("equal", 61, 61, 61, 61), ("eth", None, None, 240, 240), ("exclam", 33, 33, 33, 33), ("exclamdown", 161, 193, 161, 161), ("f", 102, 102, 102, 102), ("fi", 174, 222, None, 147), ("five", 53, 53, 53, 53), ("fl", 175, 223, None, 148), ("florin", 166, 196, 131, 134), ("four", 52, 52, 52, 52), ("fraction", 164, 218, None, 135), ("g", 103, 103, 103, 103), ("germandbls", 251, 167, 223, 223), ("grave", 193, 96, 96, 96), ("greater", 62, 62, 62, 62), ("guillemotleft", 171, 199, 171, 171), ("guillemotright", 187, 200, 187, 187), ("guilsinglleft", 172, 220, 139, 136), ("guilsinglright", 173, 221, 155, 137), ("h", 104, 104, 104, 104), ("hungarumlaut", 205, 253, None, 28), ("hyphen", 45, 45, 45, 45), ("i", 105, 105, 105, 105), ("iacute", None, 146, 237, 237), ("icircumflex", None, 148, 238, 238), ("idieresis", None, 149, 239, 239), ("igrave", None, 147, 236, 236), ("j", 106, 106, 106, 106), ("k", 107, 107, 107, 107), ("l", 108, 108, 108, 108), ("less", 60, 60, 60, 60), ("logicalnot", None, 194, 172, 172), ("lslash", 248, None, None, 155), ("m", 109, 109, 109, 109), ("macron", 197, 248, 175, 175), ("minus", None, None, None, 138), ("mu", None, 181, 181, 181), ("multiply", None, None, 215, 215), ("n", 110, 110, 110, 110), ("nbspace", None, 202, 160, None), ("nine", 57, 57, 
57, 57), ("ntilde", None, 150, 241, 241), ("numbersign", 35, 35, 35, 35), ("o", 111, 111, 111, 111), ("oacute", None, 151, 243, 243), ("ocircumflex", None, 153, 244, 244), ("odieresis", None, 154, 246, 246), ("oe", 250, 207, 156, 156), ("ogonek", 206, 254, None, 29), ("ograve", None, 152, 242, 242), ("one", 49, 49, 49, 49), ("onehalf", None, None, 189, 189), ("onequarter", None, None, 188, 188), ("onesuperior", None, None, 185, 185), ("ordfeminine", 227, 187, 170, 170), ("ordmasculine", 235, 188, 186, 186), ("oslash", 249, 191, 248, 248), ("otilde", None, 155, 245, 245), ("p", 112, 112, 112, 112), ("paragraph", 182, 166, 182, 182), ("parenleft", 40, 40, 40, 40), ("parenright", 41, 41, 41, 41), ("percent", 37, 37, 37, 37), ("period", 46, 46, 46, 46), ("periodcentered", 180, 225, 183, 183), ("perthousand", 189, 228, 137, 139), ("plus", 43, 43, 43, 43), ("plusminus", None, 177, 177, 177), ("q", 113, 113, 113, 113), ("question", 63, 63, 63, 63), ("questiondown", 191, 192, 191, 191), ("quotedbl", 34, 34, 34, 34), ("quotedblbase", 185, 227, 132, 140), ("quotedblleft", 170, 210, 147, 141), ("quotedblright", 186, 211, 148, 142), ("quoteleft", 96, 212, 145, 143), ("quoteright", 39, 213, 146, 144), ("quotesinglbase", 184, 226, 130, 145), ("quotesingle", 169, 39, 39, 39), ("r", 114, 114, 114, 114), ("registered", None, 168, 174, 174), ("ring", 202, 251, None, 30), ("s", 115, 115, 115, 115), ("scaron", None, None, 154, 157), ("section", 167, 164, 167, 167), ("semicolon", 59, 59, 59, 59), ("seven", 55, 55, 55, 55), ("six", 54, 54, 54, 54), ("slash", 47, 47, 47, 47), ("space", 32, 32, 32, 32), ("space", None, 202, 160, None), ("space", None, 202, 173, None), ("sterling", 163, 163, 163, 163), ("t", 116, 116, 116, 116), ("thorn", None, None, 254, 254), ("three", 51, 51, 51, 51), ("threequarters", None, None, 190, 190), ("threesuperior", None, None, 179, 179), ("tilde", 196, 247, 152, 31), ("trademark", None, 170, 153, 146), ("two", 50, 50, 50, 50), ("twosuperior", None, None, 178, 
178), ("u", 117, 117, 117, 117), ("uacute", None, 156, 250, 250), ("ucircumflex", None, 158, 251, 251), ("udieresis", None, 159, 252, 252), ("ugrave", None, 157, 249, 249), ("underscore", 95, 95, 95, 95), ("v", 118, 118, 118, 118), ("w", 119, 119, 119, 119), ("x", 120, 120, 120, 120), ("y", 121, 121, 121, 121), ("yacute", None, None, 253, 253), ("ydieresis", None, 216, 255, 255), ("yen", 165, 180, 165, 165), ("z", 122, 122, 122, 122), ("zcaron", None, None, 158, 158), ("zero", 48, 48, 48, 48), ] ================================================ FILE: babeldoc/pdfminer/layout.py ================================================ import heapq import logging from collections.abc import Iterable from collections.abc import Iterator from collections.abc import Sequence from typing import Generic from typing import TypeVar from typing import Union from typing import cast from babeldoc.format.pdf.babelpdf.utils import guarded_bbox from babeldoc.pdfminer.pdfcolor import PDFColorSpace from babeldoc.pdfminer.pdfexceptions import PDFTypeError from babeldoc.pdfminer.pdfexceptions import PDFValueError from babeldoc.pdfminer.pdffont import PDFFont from babeldoc.pdfminer.pdfinterp import Color from babeldoc.pdfminer.pdfinterp import PDFGraphicState from babeldoc.pdfminer.pdftypes import PDFStream from babeldoc.pdfminer.utils import INF from babeldoc.pdfminer.utils import LTComponentT from babeldoc.pdfminer.utils import Matrix from babeldoc.pdfminer.utils import PathSegment from babeldoc.pdfminer.utils import Plane from babeldoc.pdfminer.utils import Point from babeldoc.pdfminer.utils import Rect from babeldoc.pdfminer.utils import apply_matrix_pt from babeldoc.pdfminer.utils import bbox2str from babeldoc.pdfminer.utils import fsplit from babeldoc.pdfminer.utils import get_bound from babeldoc.pdfminer.utils import matrix2str from babeldoc.pdfminer.utils import uniq logger = logging.getLogger(__name__) class IndexAssigner: def __init__(self, index: int = 0) -> None: self.index = index 
def run(self, obj: "LTItem") -> None: if isinstance(obj, LTTextBox): obj.index = self.index self.index += 1 elif isinstance(obj, LTTextGroup): for x in obj: self.run(x) class LAParams: """Parameters for layout analysis :param line_overlap: If two characters have more overlap than this they are considered to be on the same line. The overlap is specified relative to the minimum height of both characters. :param char_margin: If two characters are closer together than this margin they are considered part of the same line. The margin is specified relative to the width of the character. :param word_margin: If two characters on the same line are further apart than this margin then they are considered to be two separate words, and an intermediate space will be added for readability. The margin is specified relative to the width of the character. :param line_margin: If two lines are are close together they are considered to be part of the same paragraph. The margin is specified relative to the height of a line. :param boxes_flow: Specifies how much a horizontal and vertical position of a text matters when determining the order of text boxes. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters). You can also pass `None` to disable advanced layout analysis, and instead return text based on the position of the bottom left corner of the text box. :param detect_vertical: If vertical text should be considered during layout analysis :param all_texts: If layout analysis should be performed on text in figures. 
""" def __init__( self, line_overlap: float = 0.5, char_margin: float = 2.0, line_margin: float = 0.5, word_margin: float = 0.1, boxes_flow: float | None = 0.5, detect_vertical: bool = False, all_texts: bool = False, ) -> None: self.line_overlap = line_overlap self.char_margin = char_margin self.line_margin = line_margin self.word_margin = word_margin self.boxes_flow = boxes_flow self.detect_vertical = detect_vertical self.all_texts = all_texts self._validate() def _validate(self) -> None: if self.boxes_flow is not None: boxes_flow_err_msg = ( "LAParam boxes_flow should be None, or a number between -1 and +1" ) if not ( isinstance(self.boxes_flow, int) or isinstance(self.boxes_flow, float) ): raise PDFTypeError(boxes_flow_err_msg) if not -1 <= self.boxes_flow <= 1: raise PDFValueError(boxes_flow_err_msg) def __repr__(self) -> str: return ( "" % (self.char_margin, self.line_margin, self.word_margin, self.all_texts) ) class LTItem: """Interface for things that can be analyzed""" def analyze(self, laparams: LAParams) -> None: """Perform the layout analysis.""" class LTText: """Interface for things that have text""" def __repr__(self) -> str: return f"<{self.__class__.__name__} {self.get_text()!r}>" def get_text(self) -> str: """Text contained in this object""" raise NotImplementedError class LTComponent(LTItem): """Object with a bounding box""" def __init__(self, bbox: Rect) -> None: LTItem.__init__(self) self.set_bbox(bbox) def __repr__(self) -> str: return f"<{self.__class__.__name__} {bbox2str(self.bbox)}>" # Disable comparison. 
class LTCurve(LTComponent):
    """A generic Bezier curve.

    The bounding box is derived from the points in `pts`.  `original_path`
    keeps the original path segments from the PDF (e.g. for reconstructing
    Bezier curves) and `dashing_style` keeps the dashing information, if any.
    """

    def __init__(
        self,
        linewidth: float,
        pts: list[Point],
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        bounding_box = get_bound(pts)
        LTComponent.__init__(self, bounding_box)

        # Geometry.
        self.pts = pts
        self.linewidth = linewidth
        self.original_path = original_path
        self.dashing_style = dashing_style

        # Painting attributes.
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color

    def get_pts(self) -> str:
        """Return all points as one comma-separated string, 3 decimals each."""
        formatted = ["%.3f,%.3f" % point for point in self.pts]
        return ",".join(formatted)
""" def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None: LTComponent.__init__(self, bbox) self.name = name self.stream = stream self.srcsize = (stream.get_any(("W", "Width")), stream.get_any(("H", "Height"))) self.imagemask = stream.get_any(("IM", "ImageMask")) self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1) self.colorspace = stream.get_any(("CS", "ColorSpace")) if not isinstance(self.colorspace, list): self.colorspace = [self.colorspace] def __repr__(self) -> str: return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} {self.srcsize!r}>" class LTAnno(LTItem, LTText): """Actual letter in the text as a Unicode string. Note that, while a LTChar object has actual boundaries, LTAnno objects does not, as these are "virtual" characters, inserted by a layout analyzer according to the relationship between two characters (e.g. a space). """ def __init__(self, text: str) -> None: self._text = text def get_text(self) -> str: return self._text class LTChar(LTComponent, LTText): """Actual letter in the text as a Unicode string.""" def __init__( self, matrix: Matrix, font: PDFFont, fontsize: float, scaling: float, rise: float, text: str, textwidth: float, textdisp: float | tuple[float | None, float], ncs: PDFColorSpace, graphicstate: PDFGraphicState, ) -> None: LTText.__init__(self) self._text = text self.matrix = matrix self.fontname = font.fontname self.ncs = ncs self.graphicstate = graphicstate self.adv = textwidth * fontsize * scaling # compute the boundary rectangle. 
class LTContainer(LTComponent, Generic[LTItemT]):
    """A component with a bounding box that holds an ordered list of items."""

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        # Child items, in insertion order.
        self._objs: list[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        """Iterate over the contained items in insertion order."""
        children = self._objs
        return iter(children)

    def __len__(self) -> int:
        """Number of contained items."""
        children = self._objs
        return len(children)

    def add(self, obj: LTItemT) -> None:
        """Append a single item."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Append every item of `objs`, one `add` call per item.

        Going through `add` (rather than extending the list directly) keeps
        subclass overrides of `add` in effect.
        """
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis on every contained item."""
        for child in self._objs:
            child.analyze(laparams)
class LTTextLine(LTTextContainer[TextLineElement]):
    """A single line of text made of LTChar (and LTAnno) objects.

    Whether the characters run horizontally or vertically depends on the
    writing mode; the concrete subclasses implement each case.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        self.word_margin = word_margin

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        box = bbox2str(self.bbox)
        text = self.get_text()
        return f"<{class_name} {box} {text!r}>"

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children, then terminate the line with a newline."""
        LTContainer.analyze(self, laparams)
        # Added through LTContainer.add so the virtual newline does not
        # take part in the bounding-box update.
        line_break = LTAnno("\n")
        LTContainer.add(self, line_break)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list["LTTextLine"]:
        """Return nearby lines in `plane`; implemented by subclasses."""
        raise NotImplementedError

    def is_empty(self) -> bool:
        """True if the box is degenerate or the text is only whitespace."""
        if super().is_empty():
            return True
        return self.get_text().isspace()
class LTTextLineVertical(LTTextLine):
    """A text line whose characters are laid out vertically."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Bottom edge of the most recently added object.
        self._y0: float = -INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, inserting a virtual space before it when the gap to
        the previously added object exceeds the word margin."""
        word_margin = self.word_margin
        if word_margin and isinstance(obj, LTChar):
            gap_limit = word_margin * max(obj.width, obj.height)
            if self._y0 > obj.y1 + gap_limit:
                LTContainer.add(self, LTAnno(" "))
        self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list[LTTextLine]:
        """Finds neighboring LTTextLineVerticals in the plane.

        The search area is this line's box widened horizontally by
        `ratio * self.width`; that same distance is the tolerance for the
        width and alignment checks.  A candidate qualifies when it is an
        LTTextLineVertical of the same width and is lower-, upper-, or
        centrally-aligned with this line.
        """
        reach = ratio * self.width
        search_area = (self.x0 - reach, self.y0, self.x1 + reach, self.y1)

        neighbors: list[LTTextLine] = []
        for candidate in plane.find(search_area):
            if not isinstance(candidate, LTTextLineVertical):
                continue
            if not self._is_same_width_as(candidate, tolerance=reach):
                continue
            if (
                self._is_lower_aligned_with(candidate, tolerance=reach)
                or self._is_upper_aligned_with(candidate, tolerance=reach)
                or self._is_centrally_aligned_with(candidate, tolerance=reach)
            ):
                neighbors.append(candidate)
        return neighbors

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        offset = other.y0 - self.y0
        return abs(offset) <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        offset = other.y1 - self.y1
        return abs(offset) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        other_center = (other.y0 + other.y1) / 2
        own_center = (self.y0 + self.y1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
        """Whether the width of `other` differs by at most `tolerance`."""
        difference = other.width - self.width
        return abs(difference) <= tolerance
class LTTextGroupLRTB(LTTextGroup):
    """Text group whose members are ordered from top-left to bottom-right."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them into reading order.

        `laparams.boxes_flow` weights the horizontal position (x0) against
        the vertical position (y0 + y1) in the sort key; it must not be None.
        """
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        flow = laparams.boxes_flow
        horizontal_weight = 1 - flow
        vertical_weight = 1 + flow

        def reading_order(member: LTComponent) -> float:
            vertical_position = member.y0 + member.y1
            return horizontal_weight * member.x0 - vertical_weight * vertical_position

        # reorder the objects from top-left to bottom-right.
        self._objs.sort(key=reading_order)
def group_objects(
    self,
    laparams: LAParams,
    objs: Iterable[LTComponent],
) -> Iterator[LTTextLine]:
    """Group consecutive text objects into text lines.

    Objects are examined pairwise in input order.  A pair that is
    horizontally (or, with `laparams.detect_vertical`, vertically) aligned
    starts or extends a line of the matching orientation; anything else
    closes the open line.  Lines are yielded as soon as they are closed.
    """
    previous = None
    open_line = None
    for current in objs:
        if previous is not None:
            # Horizontal alignment: the two boxes overlap vertically by
            # more than `line_overlap` of the smaller height, and their
            # horizontal gap is below `char_margin` of the larger width.
            halign = (
                previous.is_voverlap(current)
                and previous.voverlap(current)
                > min(previous.height, current.height) * laparams.line_overlap
                and previous.hdistance(current)
                < max(previous.width, current.width) * laparams.char_margin
            )

            # Vertical alignment: the mirrored test, only when vertical
            # detection is enabled.
            valign = (
                laparams.detect_vertical
                and previous.is_hoverlap(current)
                and previous.hoverlap(current)
                > min(previous.width, current.width) * laparams.line_overlap
                and previous.vdistance(current)
                < max(previous.height, current.height) * laparams.char_margin
            )

            continues_horizontal = halign and isinstance(
                open_line, LTTextLineHorizontal
            )
            continues_vertical = valign and isinstance(open_line, LTTextLineVertical)

            if continues_horizontal or continues_vertical:
                open_line.add(current)
            elif open_line is not None:
                # The open line ends here; `current` starts fresh next round.
                yield open_line
                open_line = None
            else:
                if valign and not halign:
                    open_line = LTTextLineVertical(laparams.word_margin)
                    open_line.add(previous)
                    open_line.add(current)
                elif halign and not valign:
                    open_line = LTTextLineHorizontal(laparams.word_margin)
                    open_line.add(previous)
                    open_line.add(current)
                else:
                    # Ambiguous or unaligned: `previous` forms a line alone.
                    lone_line = LTTextLineHorizontal(laparams.word_margin)
                    lone_line.add(previous)
                    yield lone_line
        previous = current

    if open_line is None:
        # The last object was not absorbed into any line.
        open_line = LTTextLineHorizontal(laparams.word_margin)
        assert previous is not None
        open_line.add(previous)
    yield open_line
def group_textboxes(
    self,
    laparams: LAParams,
    boxes: Sequence[LTTextBox],
) -> list[LTTextGroup]:
    """Group textboxes hierarchically.

    Get pair-wise distances, via dist func defined below, and then merge
    from the closest textbox pair. Once obj1 and obj2 are merged /
    grouped, the resulting group is considered as a new object, and its
    distances to other objects & groups are added to the process queue.

    For performance reason, pair-wise distances and object pair info are
    maintained in a heap of
    (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
    quick access to the smallest element. `skip_isany` is False for a fresh
    pair; a pair that has another object lying between its members is
    pushed back with `skip_isany` set to True, so it sorts after all fresh
    pairs and is merged without repeating the check when popped again.
    Note that since comparison operators, e.g., __lt__, are disabled for
    LTComponent, id(obj) has to appear before obj in element tuples.

    :param laparams: LAParams object (not read by this method itself).
    :param boxes: All textbox objects to be grouped.
    :return: everything left in the plane after merging. With two or more
        boxes that is a single top-level group; with one box it is that
        box itself, and with no boxes the list is empty.
    """
    ElementT = Union[LTTextBox, LTTextGroup]
    plane: Plane[ElementT] = Plane(self.bbox)

    def dist(obj1: LTComponent, obj2: LTComponent) -> float:
        """A distance function between two TextBoxes.

        Consider the bounding rectangle for obj1 and obj2.
        Return its area less the areas of obj1 and obj2,
        shown as 'www' below. This value may be negative.
                +------+..........+ (x1, y1)
                | obj1 |wwwwwwwwww:
                +------+www+------+
                :wwwwwwwwww| obj2 |
        (x0, y0) +..........+------+
        """
        x0 = min(obj1.x0, obj2.x0)
        y0 = min(obj1.y0, obj2.y0)
        x1 = max(obj1.x1, obj2.x1)
        y1 = max(obj1.y1, obj2.y1)
        return (
            (x1 - x0) * (y1 - y0)
            - obj1.width * obj1.height
            - obj2.width * obj2.height
        )

    def isany(obj1: ElementT, obj2: ElementT) -> set[ElementT]:
        """Check if there's any other object between obj1 and obj2.

        Returns the objects the plane finds in the joint bounding rectangle,
        excluding obj1 and obj2 themselves (empty set if none).
        """
        x0 = min(obj1.x0, obj2.x0)
        y0 = min(obj1.y0, obj2.y0)
        x1 = max(obj1.x1, obj2.x1)
        y1 = max(obj1.y1, obj2.y1)
        objs = set(plane.find((x0, y0, x1, y1)))
        return objs.difference((obj1, obj2))

    # Seed the heap with every unordered pair of input boxes.
    dists: list[tuple[bool, float, int, int, ElementT, ElementT]] = []
    for i in range(len(boxes)):
        box1 = boxes[i]
        for j in range(i + 1, len(boxes)):
            box2 = boxes[j]
            dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
    heapq.heapify(dists)

    plane.extend(boxes)
    # ids of objects that have already been merged into a group.
    done = set()
    while len(dists) > 0:
        (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
        # Skip objects that are already merged
        if (id1 not in done) and (id2 not in done):
            if not skip_isany and isany(obj1, obj2):
                # Something lies between the pair: defer it behind all
                # fresh pairs instead of merging now.
                heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                continue
            # A vertical member on either side makes the whole group
            # top-to-bottom / right-to-left.
            if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
                obj2,
                (LTTextBoxVertical, LTTextGroupTBRL),
            ):
                group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
            else:
                group = LTTextGroupLRTB([obj1, obj2])
            plane.remove(obj1)
            plane.remove(obj2)
            done.update([id1, id2])

            # Queue the new group's distance to everything still in the plane.
            for other in plane:
                heapq.heappush(
                    dists,
                    (False, dist(group, other), id(group), id(other), group, other),
                )
            plane.add(group)
    # By now only groups are in the plane
    return list(cast(LTTextGroup, g) for g in plane)
class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can appear
    recursively.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        self.name = name
        self.matrix = matrix

        # Transform the four corners of the (guarded) form box and take the
        # bounding box of the result as this figure's box.
        (left, bottom, width, height) = guarded_bbox(bbox)
        right = left + width
        top = bottom + height
        corners = ((left, bottom), (right, bottom), (left, top), (right, top))
        transformed = (apply_matrix_pt(matrix, (cx, cy)) for (cx, cy) in corners)
        LTLayoutContainer.__init__(self, get_bound(transformed))

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        figure_name = self.name
        box = bbox2str(self.bbox)
        matrix_text = matrix2str(self.matrix)
        return f"<{class_name}({figure_name}) {box} matrix={matrix_text}>"

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the figure's contents only when `all_texts` is enabled."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)
class LZWDecoder:
    """Incremental decoder for an LZW-compressed byte stream.

    Codes are read MSB-first from ``fp`` with a variable width that starts at
    9 bits and grows to 12 bits as the string table fills.  Code 256 resets
    the table and code 257 produces no output.
    """

    def __init__(self, fp: BinaryIO) -> None:
        self.fp = fp
        # Current byte being consumed and the number of bits already taken
        # from it; bpos == 8 means "buffer exhausted, read a new byte".
        self.buff = 0
        self.bpos = 8
        self.nbits = 9
        # NB: self.table stores None only in indices 256 and 257
        self.table: list[bytes | None] = []
        self.prevbuf: bytes | None = None

    def readbits(self, bits: int) -> int:
        """Read ``bits`` bits from the stream and return them as an int.

        Raises:
            PDFEOFError: if the underlying stream ends before enough bits
                have been read.
        """
        v = 0
        while True:
            # the number of remaining bits we can get from the current buffer.
            r = 8 - self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
                self.bpos += bits
                break
            else:
                # |-----8-bits-----|
                # |-bpos-|---bits----...
                # |      |----r----|
                v = (v << r) | (self.buff & ((1 << r) - 1))
                bits -= r
                x = self.fp.read(1)
                if not x:
                    raise PDFEOFError
                self.buff = ord(x)
                self.bpos = 0
        return v

    def feed(self, code: int) -> bytes:
        """Process one code and return the bytes it expands to.

        Raises:
            CorruptDataError: if ``code`` does not refer to a usable table
                entry (including the case where no table reset has been seen
                yet, so the table is still empty).
        """
        x = b""
        if code == 256:
            # Reset: rebuild the single-byte entries and the two reserved slots.
            self.table = [bytes((c,)) for c in range(256)]  # 0-255
            self.table.append(None)  # 256
            self.table.append(None)  # 257
            self.prevbuf = b""
            self.nbits = 9
        elif code == 257:
            pass
        elif not self.prevbuf:
            # First code after a reset: it must already be in the table.
            # An out-of-range code used to escape as IndexError, which run()
            # does not handle; report it as corrupt data instead.
            if code >= len(self.table):
                raise CorruptDataError
            x = self.prevbuf = cast(bytes, self.table[code])  # assume not None
        else:
            if code < len(self.table):
                x = cast(bytes, self.table[code])  # assume not None
                self.table.append(self.prevbuf + x[:1])
            elif code == len(self.table):
                # Code refers to the entry that is being defined right now.
                self.table.append(self.prevbuf + self.prevbuf[:1])
                x = cast(bytes, self.table[code])
            else:
                raise CorruptDataError
            # Widen the code size just before the table outgrows it.
            table_length = len(self.table)
            if table_length == 511:
                self.nbits = 10
            elif table_length == 1023:
                self.nbits = 11
            elif table_length == 2047:
                self.nbits = 12
            self.prevbuf = x
        return x

    def run(self) -> Iterator[bytes]:
        """Yield decoded chunks until the input ends or turns out corrupt."""
        while True:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            try:
                x = self.feed(code)
            except CorruptDataError:
                # just ignore corrupt data and stop yielding there
                break
            yield x

            logger.debug(
                "nbits=%d, code=%d, output=%r, table=%r",
                self.nbits,
                code,
                x,
                self.table[258:],
            )
class PDFDevice:
    """Translate the output of PDFPageInterpreter to the output that is needed.

    This base class stores the resource manager and the current
    transformation matrix; every rendering callback is a no-op that
    subclasses override.  Instances can be used as context managers, in
    which case ``close()`` is called on exit.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        self.rsrcmgr = rsrcmgr
        self.ctm: Matrix | None = None

    def __repr__(self) -> str:
        # Use the runtime class name so subclasses identify themselves too.
        return f"<{self.__class__.__name__}>"

    def __enter__(self) -> "PDFDevice":
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        self.close()

    def close(self) -> None:
        """Release device resources; nothing to do in the base class."""

    def set_ctm(self, ctm: Matrix) -> None:
        """Remember ``ctm`` as the current transformation matrix."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        pass

    def end_tag(self) -> None:
        pass

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        pass

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        pass

    def end_page(self, page: PDFPage) -> None:
        pass

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        pass

    def end_figure(self, name: str) -> None:
        pass

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        pass

    def render_image(self, name: str, stream: PDFStream) -> None:
        pass

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        pass
needcharspace = False for obj in seq: if isinstance(obj, (int, float)): x -= obj * dxscale needcharspace = True elif isinstance(obj, bytes): for cid in font.decode(obj): if needcharspace: x += charspace x += self.render_char( utils.translate_matrix(matrix, (x, y)), font, fontsize, scaling, rise, cid, ncs, graphicstate, ) if cid == 32 and wordspace: x += wordspace needcharspace = True else: logger.warning( f"Cannot render horizontal string because {obj!r} is not a valid int, float or bytes." ) return (x, y) def render_string_vertical( self, seq: PDFTextSeq, matrix: Matrix, pos: Point, font: PDFFont, fontsize: float, scaling: float, charspace: float, wordspace: float, rise: float, dxscale: float, ncs: PDFColorSpace, graphicstate: "PDFGraphicState", ) -> Point: (x, y) = pos needcharspace = False for obj in seq: if isinstance(obj, (int, float)): y -= obj * dxscale needcharspace = True elif isinstance(obj, bytes): for cid in font.decode(obj): if needcharspace: y += charspace y += self.render_char( utils.translate_matrix(matrix, (x, y)), font, fontsize, scaling, rise, cid, ncs, graphicstate, ) if cid == 32 and wordspace: y += wordspace needcharspace = True else: logger.warning( f"Cannot render vertical string because {obj!r} is not a valid int, float or bytes." 
class TagExtractor(PDFDevice):
    """Device that writes page text and marked-content tags as markup.

    Everything is written to ``outfp`` after encoding with ``codec``.  Pages
    are wrapped in ``<page ...>``/``</page>`` and each marked-content tag is
    written as ``<name attrs>``/``</name>``.
    """

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        PDFDevice.__init__(self, rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        # Tags opened by begin_tag and not yet closed by end_tag.
        self._stack: list[PSLiteral] = []

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Write the text of ``seq`` decoded through the current font.

        Non-string items of ``seq`` are skipped, and character ids the font
        cannot map to Unicode are dropped silently.
        """
        font = textstate.font
        assert font is not None
        pieces: list[str] = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    pass
        self._write(utils.enc("".join(pieces)))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Write the opening page element with id, bbox and rotation."""
        # The format string must consume all three values; an empty format
        # string here raises TypeError as soon as a page starts.
        output = '<page id="%s" bbox="%s" rotate="%d">' % (
            self.pageno,
            utils.bbox2str(page.mediabox),
            page.rotate,
        )
        self._write(output)

    def end_page(self, page: PDFPage) -> None:
        """Close the page element and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag (with sorted attributes) and push it."""
        s = ""
        if isinstance(props, dict):
            s = "".join(
                [
                    f' {utils.enc(k)}="{utils.make_compat_str(v)}"'
                    for (k, v) in sorted(props.items())
                ],
            )
        out_s = f"<{utils.enc(cast(str, tag.name))}{s}>"
        self._write(out_s)
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Pop the most recent tag and write its closing counterpart."""
        assert self._stack, str(self.pageno)
        tag = self._stack.pop(-1)
        # Mirror of the opening form written by begin_tag.
        out_s = "</%s>" % utils.enc(cast(str, tag.name))
        self._write(out_s)

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write a tag that has no content: open it and drop it from the stack."""
        self.begin_tag(tag, props)
        self._stack.pop(-1)

    def _write(self, s: str) -> None:
        self.outfp.write(s.encode(self.codec))
from collections.abc import Callable from collections.abc import Iterable from collections.abc import Iterator from collections.abc import KeysView from collections.abc import Sequence from hashlib import md5 from hashlib import sha256 from hashlib import sha384 from hashlib import sha512 from typing import Any from typing import cast from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives.ciphers import Cipher from cryptography.hazmat.primitives.ciphers import algorithms from cryptography.hazmat.primitives.ciphers import modes from babeldoc.pdfminer.arcfour import Arcfour from babeldoc.pdfminer.casting import safe_int from babeldoc.pdfminer.data_structures import NumberTree from babeldoc.pdfminer.pdfexceptions import PDFException from babeldoc.pdfminer.pdfexceptions import PDFKeyError from babeldoc.pdfminer.pdfexceptions import PDFObjectNotFound from babeldoc.pdfminer.pdfexceptions import PDFTypeError from babeldoc.pdfminer.pdfparser import PDFParser from babeldoc.pdfminer.pdfparser import PDFStreamParser from babeldoc.pdfminer.pdfparser import PDFSyntaxError from babeldoc.pdfminer.pdftypes import DecipherCallable from babeldoc.pdfminer.pdftypes import PDFStream from babeldoc.pdfminer.pdftypes import decipher_all from babeldoc.pdfminer.pdftypes import dict_value from babeldoc.pdfminer.pdftypes import int_value from babeldoc.pdfminer.pdftypes import list_value from babeldoc.pdfminer.pdftypes import str_value from babeldoc.pdfminer.pdftypes import stream_value from babeldoc.pdfminer.pdftypes import uint_value from babeldoc.pdfminer.psexceptions import PSEOF from babeldoc.pdfminer.psparser import KWD from babeldoc.pdfminer.psparser import LIT from babeldoc.pdfminer.psparser import literal_name from babeldoc.pdfminer.utils import choplist from babeldoc.pdfminer.utils import decode_text from babeldoc.pdfminer.utils import format_int_alpha from babeldoc.pdfminer.utils import format_int_roman from babeldoc.pdfminer.utils import 
class PDFXRef(PDFBaseXRef):
    """Cross-reference data read from a classic ``xref`` table.

    ``offsets`` maps an object id to ``(None, position, generation)`` and
    ``trailer`` holds the dictionary that follows the table.
    """

    def __init__(self) -> None:
        self.offsets: dict[int, tuple[int | None, int, int]] = {}
        self.trailer: dict[str, Any] = {}

    def __repr__(self) -> str:
        # An empty format string applied to the keys view raises TypeError;
        # embed the keys explicitly instead.
        return f"<PDFXRef: offsets={self.offsets.keys()!r}>"

    def load(self, parser: PDFParser) -> None:
        """Read table subsections from ``parser`` up to the trailer.

        Raises:
            PDFNoValidXRef: on premature end of input or on a line that does
                not have the expected number of space-separated fields.
        """
        while True:
            try:
                (pos, line) = parser.nextline()
                line = line.strip()
                if not line:
                    continue
            except PSEOF as err:
                raise PDFNoValidXRef("Unexpected EOF - file corrupted?") from err
            if line.startswith(b"trailer"):
                # Rewind so that load_trailer sees the keyword itself.
                parser.seek(pos)
                break
            f = line.split(b" ")
            if len(f) != 2:
                error_msg = f"Trailer not found: {parser!r}: line={line!r}"
                raise PDFNoValidXRef(error_msg)
            try:
                (start, nobjs) = map(int, f)
            except ValueError as err:
                error_msg = f"Invalid line: {parser!r}: line={line!r}"
                raise PDFNoValidXRef(error_msg) from err
            for objid in range(start, start + nobjs):
                try:
                    (_, line) = parser.nextline()
                    line = line.strip()
                except PSEOF as err:
                    raise PDFNoValidXRef("Unexpected EOF - file corrupted?") from err
                f = line.split(b" ")
                if len(f) != 3:
                    error_msg = f"Invalid XRef format: {parser!r}, line={line!r}"
                    raise PDFNoValidXRef(error_msg)
                (pos_b, genno_b, use_b) = f
                # Only entries flagged "n" are recorded.
                if use_b != b"n":
                    continue
                pos_i = safe_int(pos_b)
                genno_i = safe_int(genno_b)
                if pos_i is not None and genno_i is not None:
                    self.offsets[objid] = (None, pos_i, genno_i)
                else:
                    log.warning(
                        f"Not adding object {objid} to xref because position {pos_b!r} "
                        f"or generation number {genno_b!r} cannot be parsed as an int"
                    )
        log.debug("xref objects: %r", self.offsets)
        self.load_trailer(parser)

    def load_trailer(self, parser: PDFParser) -> None:
        """Read the ``trailer`` keyword and merge its dictionary into ``trailer``.

        Raises:
            PDFNoValidXRef: if input ends and the parser has nothing left to
                pop as the trailer dictionary.
        """
        try:
            (_, kwd) = parser.nexttoken()
            assert kwd is KWD(b"trailer"), str(kwd)
            (_, dic) = parser.nextobject()
        except PSEOF as err:
            # Input ended early: fall back to the last object the parser
            # still holds, if there is one.
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef("Unexpected EOF - file corrupted") from err
            (_, dic) = x[0]
        self.trailer.update(dict_value(dic))
        log.debug("trailer=%r", self.trailer)

    def get_trailer(self) -> dict[str, Any]:
        return self.trailer

    def get_objids(self) -> KeysView[int]:
        return self.offsets.keys()

    def get_pos(self, objid: int) -> tuple[int | None, int, int]:
        return self.offsets[objid]
class PDFXRefStream(PDFBaseXRef):
    """Cross-reference data read from a cross-reference stream object.

    ``ranges`` holds the ``(first_objid, count)`` pairs from the stream's
    Index entry, ``fl1``/``fl2``/``fl3`` are the three field widths from its
    W entry, and ``data`` is the decoded stream made of fixed-size entries
    of ``entlen`` bytes each.
    """

    def __init__(self) -> None:
        self.data: bytes | None = None
        self.entlen: int | None = None
        self.fl1: int | None = None
        self.fl2: int | None = None
        self.fl3: int | None = None
        self.ranges: list[tuple[int, int]] = []
        # Defined up front so get_trailer() works before load() is called.
        self.trailer: dict[str, Any] = {}

    def __repr__(self) -> str:
        return f"<PDFXRefStream: ranges={self.ranges!r}>"

    def load(self, parser: PDFParser) -> None:
        """Read the stream object at the parser position.

        Raises:
            PDFNoValidXRef: if the object is not a stream of type XRef.
            PDFSyntaxError: if the Index array has an odd number of items.
        """
        # Object id, generation number and the "obj" keyword are not needed.
        parser.nexttoken()
        parser.nexttoken()
        parser.nexttoken()
        (_, stream) = parser.nextobject()
        if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
            raise PDFNoValidXRef("Invalid PDF stream spec.")
        size = stream["Size"]
        # Without an Index entry there is one range covering 0..Size-1.
        index_array = stream.get("Index", (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError("Invalid index number")
        self.ranges.extend(cast(Iterator[tuple[int, int]], choplist(2, index_array)))
        (self.fl1, self.fl2, self.fl3) = stream["W"]
        assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
        self.data = stream.get_data()
        self.entlen = self.fl1 + self.fl2 + self.fl3
        self.trailer = stream.attrs
        log.debug(
            "xref stream: objid=%s, fields=%d,%d,%d",
            ", ".join(map(repr, self.ranges)),
            self.fl1,
            self.fl2,
            self.fl3,
        )

    def get_trailer(self) -> dict[str, Any]:
        return self.trailer

    def get_objids(self) -> Iterator[int]:
        """Yield the ids of all entries whose type field is 1 or 2."""
        # Entries of all ranges are stored back to back, so the entry index
        # keeps counting across ranges (same convention as get_pos).
        index = 0
        for start, nobjs in self.ranges:
            for i in range(nobjs):
                assert self.entlen is not None
                assert self.data is not None
                offset = self.entlen * index
                index += 1
                ent = self.data[offset : offset + self.entlen]
                f1 = nunpack(ent[: self.fl1], 1)
                if f1 == 1 or f1 == 2:
                    yield start + i

    def get_pos(self, objid: int) -> tuple[int | None, int, int]:
        """Return the location triple for ``objid``.

        Returns ``(None, f2, f3)`` for a type-1 entry and ``(f2, f3, 0)``
        for a type-2 entry.

        Raises:
            PDFKeyError: if ``objid`` is outside every range or its entry
                has any other type.
        """
        index = 0
        for start, nobjs in self.ranges:
            if start <= objid and objid < start + nobjs:
                index += objid - start
                break
            else:
                index += nobjs
        else:
            raise PDFKeyError(objid)
        assert self.entlen is not None
        assert self.data is not None
        assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
        offset = self.entlen * index
        ent = self.data[offset : offset + self.entlen]
        f1 = nunpack(ent[: self.fl1], 1)
        f2 = nunpack(ent[self.fl1 : self.fl1 + self.fl2])
        f3 = nunpack(ent[self.fl1 + self.fl2 :])
        if f1 == 1:
            return (None, f2, f3)
        elif f1 == 2:
            return (f2, f3, 0)
        else:
            # this is a free object
            raise PDFKeyError(objid)
def compute_u(self, key: bytes) -> bytes:
    """Derive the value that is compared against the stored ``u`` entry.

    For revision 2 the padding string is encrypted once with ``key``.
    For other revisions an MD5 digest of the padding plus the first
    document id is encrypted with ``key`` and then with nineteen further
    keys obtained by XOR-ing every key byte with the round number; the
    outcome is returned twice in a row.
    """
    if self.r == 2:
        # Algorithm 3.4
        return Arcfour(key).encrypt(self.PASSWORD_PADDING)

    # Algorithm 3.5
    digest = md5(self.PASSWORD_PADDING)
    digest.update(self.docid[0])
    encrypted = Arcfour(key).encrypt(digest.digest())
    for round_no in range(1, 20):
        round_key = bytes(byte ^ round_no for byte in key)
        encrypted = Arcfour(round_key).encrypt(encrypted)
    return encrypted + encrypted
def verify_encryption_key(self, key: bytes) -> bool:
    """Tell whether ``key`` reproduces the stored ``u`` value.

    Revision 2 compares the whole value; every other revision compares
    only the first 16 bytes.
    """
    # Algorithm 3.6
    candidate = self.compute_u(key)
    if self.r != 2:
        return candidate[:16] == self.u[:16]
    return candidate == self.u
filter: param=%r" % self.param raise PDFEncryptionError(error_msg) def get_cfm(self, name: str) -> Callable[[int, int, bytes], bytes] | None: if name == "V2": return self.decrypt_rc4 elif name == "AESV2": return self.decrypt_aes128 else: return None def decrypt( self, objid: int, genno: int, data: bytes, attrs: dict[str, Any] | None = None, name: str | None = None, ) -> bytes: if not self.encrypt_metadata and attrs is not None: t = attrs.get("Type") if t is not None and literal_name(t) == "Metadata": return data if name is None: name = self.strf return self.cfm[name](objid, genno, data) def decrypt_identity(self, objid: int, genno: int, data: bytes) -> bytes: return data def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes: assert self.key is not None key = ( self.key + struct.pack(" None: super().init_params() self.length = 256 self.oe = str_value(self.param["OE"]) self.ue = str_value(self.param["UE"]) self.o_hash = self.o[:32] self.o_validation_salt = self.o[32:40] self.o_key_salt = self.o[40:] self.u_hash = self.u[:32] self.u_validation_salt = self.u[32:40] self.u_key_salt = self.u[40:] def get_cfm(self, name: str) -> Callable[[int, int, bytes], bytes] | None: if name == "AESV3": return self.decrypt_aes256 else: return None def authenticate(self, password: str) -> bytes | None: password_b = self._normalize_password(password) hash = self._password_hash(password_b, self.o_validation_salt, self.u) if hash == self.o_hash: hash = self._password_hash(password_b, self.o_key_salt, self.u) cipher = Cipher( algorithms.AES(hash), modes.CBC(b"\0" * 16), backend=default_backend(), ) # type: ignore return cipher.decryptor().update(self.oe) # type: ignore hash = self._password_hash(password_b, self.u_validation_salt) if hash == self.u_hash: hash = self._password_hash(password_b, self.u_key_salt) cipher = Cipher( algorithms.AES(hash), modes.CBC(b"\0" * 16), backend=default_backend(), ) # type: ignore return cipher.decryptor().update(self.ue) # type: ignore 
return None def _normalize_password(self, password: str) -> bytes: if self.r == 6: # saslprep expects non-empty strings, apparently if not password: return b"" from babeldoc.pdfminer._saslprep import saslprep password = saslprep(password) return password.encode("utf-8")[:127] def _password_hash( self, password: bytes, salt: bytes, vector: bytes | None = None, ) -> bytes: """Compute password hash depending on revision number""" if self.r == 5: return self._r5_password(password, salt, vector) return self._r6_password(password, salt[0:8], vector) def _r5_password( self, password: bytes, salt: bytes, vector: bytes | None = None, ) -> bytes: """Compute the password for revision 5""" hash = sha256(password) hash.update(salt) if vector is not None: hash.update(vector) return hash.digest() def _r6_password( self, password: bytes, salt: bytes, vector: bytes | None = None, ) -> bytes: """Compute the password for revision 6""" initial_hash = sha256(password) initial_hash.update(salt) if vector is not None: initial_hash.update(vector) k = initial_hash.digest() hashes = (sha256, sha384, sha512) round_no = last_byte_val = 0 while round_no < 64 or last_byte_val > round_no - 32: k1 = (password + k + (vector or b"")) * 64 e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=k1) # compute the first 16 bytes of e, # interpreted as an unsigned integer mod 3 next_hash = hashes[self._bytes_mod_3(e[:16])] k = next_hash(e).digest() last_byte_val = e[len(e) - 1] round_no += 1 return k[:32] @staticmethod def _bytes_mod_3(input_bytes: bytes) -> int: # 256 is 1 mod 3, so we can just sum 'em return sum(b % 3 for b in input_bytes) % 3 def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes: cipher = Cipher(algorithms.AES(key), modes.CBC(iv)) encryptor = cipher.encryptor() # type: ignore return encryptor.update(data) + encryptor.finalize() # type: ignore def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes: initialization_vector = data[:16] ciphertext = 
def __init__(
    self,
    parser: PDFParser,
    password: str = "",
    caching: bool = True,
    fallback: bool = True,
) -> None:
    """Set the document to use a given PDFParser object.

    Loads the cross-reference data through ``parser``, then walks the
    collected trailers to set up decryption, gather Info dictionaries and
    locate the document catalog.

    :param parser: parser to read from; it is linked back to this
        document via ``set_document``.
    :param password: passed to ``_initialize_password`` when a trailer
        has an Encrypt entry.
    :param caching: stored as ``self.caching``.
    :param fallback: when no valid xref can be read, load a
        ``PDFXRefFallback`` from the parser instead.
    :raises PDFSyntaxError: if no trailer provides a Root entry, or, in
        strict mode, if the catalog's Type is not Catalog.
    """
    self.caching = caching
    self.xrefs: list[PDFBaseXRef] = []
    self.info: list[dict[str, Any]] = []
    self.catalog: dict[str, Any] = {}
    self.encryption: tuple[Any, Any] | None = None
    self.decipher: DecipherCallable | None = None
    self._parser = None
    self._cached_objs: dict[int, tuple[object, int]] = {}
    self._parsed_objs: dict[int, tuple[list[object], int]] = {}
    self._parser = parser
    self._parser.set_document(self)
    # Permission flags default to True and are only replaced by
    # _initialize_password.
    self.is_printable = self.is_modifiable = self.is_extractable = True
    # Retrieve the information of each header that was appended
    # (maybe multiple times) at the end of the document.
    try:
        pos = self.find_xref(parser)
        self.read_xref_from(parser, pos, self.xrefs)
    except PDFNoValidXRef:
        # With fallback disabled the error is swallowed here; the loop
        # below then finds no Root and raises PDFSyntaxError.
        if fallback:
            parser.fallback = True
            newxref = PDFXRefFallback()
            newxref.load(parser)
            self.xrefs.append(newxref)
    for xref in self.xrefs:
        trailer = xref.get_trailer()
        if not trailer:
            continue
        # If there's an encryption info, remember it.
        if "Encrypt" in trailer:
            if "ID" in trailer:
                id_value = list_value(trailer["ID"])
            else:
                # Some documents may not have a /ID, use two empty
                # byte strings instead. Solves
                # https://github.com/pdfminer/pdfminer.six/issues/594
                id_value = (b"", b"")
            self.encryption = (id_value, dict_value(trailer["Encrypt"]))
            self._initialize_password(password)
        if "Info" in trailer:
            self.info.append(dict_value(trailer["Info"]))
        if "Root" in trailer:
            # Every PDF file must have exactly one /Root dictionary.
            # The first trailer that has one wins; later ones are ignored.
            self.catalog = dict_value(trailer["Root"])
            break
    else:
        # Reached only when the loop finished without hitting `break`.
        raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
    if self.catalog.get("Type") is not LITERAL_CATALOG:
        if settings.STRICT:
            raise PDFSyntaxError("Catalog not found!")
def _getobj_parse(self, pos: int, objid: int) -> object:
    """Parse and return the object that starts at byte offset ``pos``.

    Reads three tokens (object id, generation number, keyword) followed
    by the object itself.

    :param pos: offset passed to the parser's ``seek``.
    :param objid: object id that is expected at that offset.
    :raises PDFSyntaxError: if the id found does not match ``objid`` or
        the keyword read is not ``obj``.
    """
    assert self._parser is not None
    self._parser.seek(pos)
    (_, objid1) = self._parser.nexttoken()  # objid
    (_, genno) = self._parser.nexttoken()  # genno
    (_, kwd) = self._parser.nexttoken()
    # hack around malformed pdf files
    # copied from https://github.com/jaepil/pdfminer3k/blob/master/
    # pdfminer/pdfparser.py#L399
    # to solve https://github.com/pdfminer/pdfminer.six/issues/56
    # assert objid1 == objid, str((objid1, objid))
    if objid1 != objid:
        # Skip forward to the next `obj` keyword, keeping every token
        # read on the way (the keyword itself ends up last in x).
        x = []
        while kwd is not self.KEYWORD_OBJ:
            (_, kwd) = self._parser.nexttoken()
            x.append(kwd)
        # NOTE(review): x[-1] is the `obj` keyword, so x[-2] is the token
        # right before it -- confirm that this is meant to be the object
        # id and not the generation number.
        if len(x) >= 2:
            objid1 = x[-2]
    # #### end hack around malformed pdf files
    if objid1 != objid:
        raise PDFSyntaxError(f"objid mismatch: {objid1!r}={objid!r}")

    if kwd != KWD(b"obj"):
        raise PDFSyntaxError("Invalid object spec: offset=%r" % pos)
    (_, obj) = self._parser.nextobject()
    return obj
def get_outlines(self) -> Iterator[OutlineType]:
    """Iterate over the document outline (bookmarks).

    Each item is a ``(level, title, dest, action, se)`` tuple.  An entry is
    reported only when it has a ``Title`` and at least one of ``A`` or
    ``Dest``.  Children (``First``/``Last``) are visited one level deeper,
    siblings (``Next``) at the same level.

    :raises PDFNoOutlines: immediately, if the catalog has no ``Outlines``.
    """
    catalog = self.catalog
    if "Outlines" not in catalog:
        raise PDFNoOutlines

    def walk(node: object, depth: int) -> Iterator[PDFDocument.OutlineType]:
        item = dict_value(node)
        has_target = "A" in item or "Dest" in item
        if "Title" in item and has_target:
            yield (
                depth,
                decode_text(str_value(item["Title"])),
                item.get("Dest"),
                item.get("A"),
                item.get("SE"),
            )
        if "First" in item and "Last" in item:
            yield from walk(item["First"], depth + 1)
        if "Next" in item:
            yield from walk(item["Next"], depth)

    return walk(catalog["Outlines"], 0)
def lookup_name(self, cat: str, key: str | bytes) -> Any:
    """Look up ``key`` in the name tree ``cat`` of the catalog's ``Names``.

    :param cat: name of the tree inside the ``Names`` dictionary.
    :param key: the name to resolve.
    :returns: the value stored for ``key``; ``None`` when the root node's
        ``Limits`` exclude ``key``.
    :raises PDFKeyError: if the catalog has no usable ``Names`` entry, or no
        node of the tree yields a value.
    :raises KeyError: if ``cat`` is missing from ``Names`` or a leaf node
        does not contain ``key``.
    """
    try:
        name_trees = dict_value(self.catalog["Names"])
    except (PDFTypeError, KeyError):
        raise PDFKeyError((cat, key))
    # may raise KeyError
    root = dict_value(name_trees[cat])

    def descend(node: dict[str, Any]) -> Any:
        if "Limits" in node:
            (low, high) = list_value(node["Limits"])
            out_of_range = key < low or high < key
            if out_of_range:
                return None
        if "Names" in node:
            # Leaf node: a flat [key1, value1, key2, value2, ...] array.
            pairs = choplist(2, list_value(node["Names"]))
            leaf = dict(cast(Iterator[tuple[str | bytes, Any]], pairs))
            return leaf[key]
        kids = list_value(node["Kids"]) if "Kids" in node else []
        for kid in kids:
            found = descend(dict_value(kid))
            if found:
                return found
        raise PDFKeyError((cat, key))

    return descend(root)
def read_xref_from(
    self,
    parser: PDFParser,
    start: int,
    xrefs: list[PDFBaseXRef],
) -> None:
    """Read the xref section at ``start`` and every section it links to.

    Each section is loaded as a ``PDFXRefStream`` when the first token at
    its offset is an integer, otherwise as a classic ``PDFXRef`` table, and
    appended to ``xrefs``.  The trailer's ``XRefStm`` link is followed
    before its ``Prev`` link, so ``xrefs`` ends up in the same order the
    former recursive implementation produced.

    The chain is walked with an explicit stack and every offset is read at
    most once.  A file whose ``Prev``/``XRefStm`` links form a cycle, or a
    file with a very long update history, therefore no longer exhausts the
    interpreter's recursion limit.

    :param parser: parser positioned on the document data.
    :param start: file offset of the first xref section.
    :param xrefs: list that receives the loaded xref objects (mutated).
    :raises PDFNoValidXRef: if the data ends before a token can be read.
    """
    pending = [start]
    visited: set[int] = set()
    while pending:
        offset = pending.pop()
        if offset in visited:
            log.warning(
                "read_xref_from: xref at offset %r was already read; "
                "ignoring circular reference",
                offset,
            )
            continue
        visited.add(offset)

        parser.seek(offset)
        parser.reset()
        try:
            (pos, token) = parser.nexttoken()
        except PSEOF:
            raise PDFNoValidXRef("Unexpected EOF")
        log.debug("read_xref_from: start=%d, token=%r", offset, token)
        if isinstance(token, int):
            # XRefStream: PDF-1.5
            parser.seek(pos)
            parser.reset()
            xref: PDFBaseXRef = PDFXRefStream()
            xref.load(parser)
        else:
            if token is parser.KEYWORD_XREF:
                parser.nextline()
            xref = PDFXRef()
            xref.load(parser)
        xrefs.append(xref)
        trailer = xref.get_trailer()
        log.debug("trailer: %r", trailer)

        links = []
        if "XRefStm" in trailer:
            links.append(int_value(trailer["XRefStm"]))
        if "Prev" in trailer:
            # find previous xref
            links.append(int_value(trailer["Prev"]))
        # The stack is LIFO: push in reverse so XRefStm is handled first.
        pending.extend(reversed(links))
@property
def labels(self) -> Iterator[str]:
    """Yield one label string per page, without end.

    ``self.values`` supplies ``(start_index, label_dict)`` ranges.  When the
    first range does not start at page index 0, strict mode raises and
    lenient mode inserts an empty range at index 0 into that list.  The last
    range is continued indefinitely.

    :raises PDFSyntaxError: in strict mode, if page index 0 is missing.
    """
    ranges = self.values

    # The tree must begin with page index 0
    starts_at_zero = len(ranges) != 0 and ranges[0][0] == 0
    if not starts_at_zero:
        if settings.STRICT:
            raise PDFSyntaxError("PageLabels is missing page index 0")
        # Try to cope, by assuming empty labels for the initial pages
        ranges.insert(0, (0, {}))

    for following, (start, raw_label_dict) in enumerate(ranges, 1):
        label_dict = dict_value(raw_label_dict)
        style = label_dict.get("S")
        prefix = decode_text(str_value(label_dict.get("P", b"")))
        first_value = int_value(label_dict.get("St", 1))
        if following == len(ranges):
            # Last specified range: it runs to the end of the document.
            numbers: Iterable[int] = itertools.count(first_value)
        else:
            (end, _) = ranges[following]
            numbers = range(first_value, first_value + (end - start))
        for number in numbers:
            suffix = self._format_page_label(number, style)
            yield prefix + suffix
label = format_int_alpha(value) else: log.warning("Unknown page label style: %r", style) label = "" return label ================================================ FILE: babeldoc/pdfminer/pdfexceptions.py ================================================ from babeldoc.pdfminer.psexceptions import PSException class PDFException(PSException): pass class PDFTypeError(PDFException, TypeError): pass class PDFValueError(PDFException, ValueError): pass class PDFObjectNotFound(PDFException): pass class PDFNotImplementedError(PDFException, NotImplementedError): pass class PDFKeyError(PDFException, KeyError): pass class PDFEOFError(PDFException, EOFError): pass class PDFIOError(PDFException, IOError): pass ================================================ FILE: babeldoc/pdfminer/pdffont.py ================================================ import logging import struct from collections.abc import Iterable from collections.abc import Iterator from collections.abc import Mapping from io import BytesIO from typing import TYPE_CHECKING from typing import Any from typing import BinaryIO from typing import cast import freetype from babeldoc.pdfminer.casting import safe_float from babeldoc.pdfminer.casting import safe_rect_list from babeldoc.pdfminer.cmapdb import CMap from babeldoc.pdfminer.cmapdb import CMapBase from babeldoc.pdfminer.cmapdb import CMapDB from babeldoc.pdfminer.cmapdb import CMapParser from babeldoc.pdfminer.cmapdb import FileUnicodeMap from babeldoc.pdfminer.cmapdb import IdentityUnicodeMap from babeldoc.pdfminer.cmapdb import UnicodeMap from babeldoc.pdfminer.encodingdb import EncodingDB from babeldoc.pdfminer.encodingdb import name2unicode from babeldoc.pdfminer.fontmetrics import FONT_METRICS from babeldoc.pdfminer.pdfexceptions import PDFException from babeldoc.pdfminer.pdfexceptions import PDFKeyError from babeldoc.pdfminer.pdfexceptions import PDFValueError from babeldoc.pdfminer.pdftypes import PDFStream from babeldoc.pdfminer.pdftypes import dict_value from 
def get_widths2(seq: Iterable[object]) -> dict[int, tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    ``seq`` is read as a flat sequence in one of two shapes:

    * ``c [w1 vx1 vy1 w2 vx2 vy2 ...]`` -- consecutive entries starting at
      character ``c``;
    * ``c_first c_last w vx vy`` -- one entry shared by an inclusive range.

    Every element is passed through ``resolve1`` first, and anything that is
    neither a number nor a list is reported and skipped, matching what
    ``get_widths`` does for horizontal widths.  A range whose endpoints are
    not integers is reported and skipped instead of failing in ``range()``.

    :param seq: the elements of a font's ``W2`` array.
    :returns: ``{cid: (width, (vx, vy))}``.
    """
    widths: dict[int, tuple[float, Point]] = {}
    r: list[float] = []
    for v in seq:
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    log.warning(
                        f"Skipping invalid font width specification for {char1} to {char2} because either of them is not an int"
                    )
                r = []
        else:
            log.warning(
                f"Skipping invalid font width specification for {v} because it is not a number or a list"
            )
    return widths
Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format :returns mapping of character identifiers (cid's) to unicode characters """ while 1: try: (cid, name) = self.nextobject() except PSEOF: break try: self._cid2unicode[cid] = name2unicode(cast(str, name)) except KeyError as e: log.debug(str(e)) return self._cid2unicode def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_PUT: ((_, key), (_, value)) = self.pop(2) if isinstance(key, int) and isinstance(value, PSLiteral): self.add_results((key, literal_name(value))) NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-") # Mapping of cmap names. Original cmap name is kept if not in the mapping. # (missing reference for why DLIdent is mapped to Identity) IDENTITY_ENCODER = { "DLIdent-H": "Identity-H", "DLIdent-V": "Identity-V", } def getdict(data: bytes) -> dict[int, list[float | int]]: d: dict[int, list[float | int]] = {} fp = BytesIO(data) stack: list[float | int] = [] while 1: c = fp.read(1) if not c: break b0 = ord(c) if b0 <= 21: d[b0] = stack stack = [] continue if b0 == 30: s = "" loop = True while loop: b = ord(fp.read(1)) for n in (b >> 4, b & 15): if n == 15: loop = False else: nibble = NIBBLES[n] assert nibble is not None s += nibble value = float(s) elif b0 >= 32 and b0 <= 246: value = b0 - 139 else: b1 = ord(fp.read(1)) if b0 >= 247 and b0 <= 250: value = ((b0 - 247) << 8) + b1 + 108 elif b0 >= 251 and b0 <= 254: value = -((b0 - 251) << 8) - b1 - 108 else: b2 = ord(fp.read(1)) if b1 >= 128: b1 -= 256 if b0 == 28: value = b1 << 8 | b2 else: value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0] stack.append(value) return d class CFFFont: STANDARD_STRINGS = ( ".notdef", "space", "exclam", "quotedbl", "numbersign", "dollar", "percent", "ampersand", "quoteright", "parenleft", "parenright", "asterisk", "plus", "comma", "hyphen", "period", "slash", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", 
"colon", "semicolon", "less", "equal", "greater", "question", "at", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "bracketleft", "backslash", "bracketright", "asciicircum", "underscore", "quoteleft", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "braceleft", "bar", "braceright", "asciitilde", "exclamdown", "cent", "sterling", "fraction", "yen", "florin", "section", "currency", "quotesingle", "quotedblleft", "guillemotleft", "guilsinglleft", "guilsinglright", "fi", "fl", "endash", "dagger", "daggerdbl", "periodcentered", "paragraph", "bullet", "quotesinglbase", "quotedblbase", "quotedblright", "guillemotright", "ellipsis", "perthousand", "questiondown", "grave", "acute", "circumflex", "tilde", "macron", "breve", "dotaccent", "dieresis", "ring", "cedilla", "hungarumlaut", "ogonek", "caron", "emdash", "AE", "ordfeminine", "Lslash", "Oslash", "OE", "ordmasculine", "ae", "dotlessi", "lslash", "oslash", "oe", "germandbls", "onesuperior", "logicalnot", "mu", "trademark", "Eth", "onehalf", "plusminus", "Thorn", "onequarter", "divide", "brokenbar", "degree", "thorn", "threequarters", "twosuperior", "registered", "minus", "eth", "multiply", "threesuperior", "copyright", "Aacute", "Acircumflex", "Adieresis", "Agrave", "Aring", "Atilde", "Ccedilla", "Eacute", "Ecircumflex", "Edieresis", "Egrave", "Iacute", "Icircumflex", "Idieresis", "Igrave", "Ntilde", "Oacute", "Ocircumflex", "Odieresis", "Ograve", "Otilde", "Scaron", "Uacute", "Ucircumflex", "Udieresis", "Ugrave", "Yacute", "Ydieresis", "Zcaron", "aacute", "acircumflex", "adieresis", "agrave", "aring", "atilde", "ccedilla", "eacute", "ecircumflex", "edieresis", "egrave", "iacute", "icircumflex", "idieresis", "igrave", "ntilde", "oacute", "ocircumflex", "odieresis", "ograve", "otilde", "scaron", "uacute", "ucircumflex", "udieresis", "ugrave", "yacute", 
"ydieresis", "zcaron", "exclamsmall", "Hungarumlautsmall", "dollaroldstyle", "dollarsuperior", "ampersandsmall", "Acutesmall", "parenleftsuperior", "parenrightsuperior", "twodotenleader", "onedotenleader", "zerooldstyle", "oneoldstyle", "twooldstyle", "threeoldstyle", "fouroldstyle", "fiveoldstyle", "sixoldstyle", "sevenoldstyle", "eightoldstyle", "nineoldstyle", "commasuperior", "threequartersemdash", "periodsuperior", "questionsmall", "asuperior", "bsuperior", "centsuperior", "dsuperior", "esuperior", "isuperior", "lsuperior", "msuperior", "nsuperior", "osuperior", "rsuperior", "ssuperior", "tsuperior", "ff", "ffi", "ffl", "parenleftinferior", "parenrightinferior", "Circumflexsmall", "hyphensuperior", "Gravesmall", "Asmall", "Bsmall", "Csmall", "Dsmall", "Esmall", "Fsmall", "Gsmall", "Hsmall", "Ismall", "Jsmall", "Ksmall", "Lsmall", "Msmall", "Nsmall", "Osmall", "Psmall", "Qsmall", "Rsmall", "Ssmall", "Tsmall", "Usmall", "Vsmall", "Wsmall", "Xsmall", "Ysmall", "Zsmall", "colonmonetary", "onefitted", "rupiah", "Tildesmall", "exclamdownsmall", "centoldstyle", "Lslashsmall", "Scaronsmall", "Zcaronsmall", "Dieresissmall", "Brevesmall", "Caronsmall", "Dotaccentsmall", "Macronsmall", "figuredash", "hypheninferior", "Ogoneksmall", "Ringsmall", "Cedillasmall", "questiondownsmall", "oneeighth", "threeeighths", "fiveeighths", "seveneighths", "onethird", "twothirds", "zerosuperior", "foursuperior", "fivesuperior", "sixsuperior", "sevensuperior", "eightsuperior", "ninesuperior", "zeroinferior", "oneinferior", "twoinferior", "threeinferior", "fourinferior", "fiveinferior", "sixinferior", "seveninferior", "eightinferior", "nineinferior", "centinferior", "dollarinferior", "periodinferior", "commainferior", "Agravesmall", "Aacutesmall", "Acircumflexsmall", "Atildesmall", "Adieresissmall", "Aringsmall", "AEsmall", "Ccedillasmall", "Egravesmall", "Eacutesmall", "Ecircumflexsmall", "Edieresissmall", "Igravesmall", "Iacutesmall", "Icircumflexsmall", "Idieresissmall", "Ethsmall", 
class INDEX:
    """Random-access view of one CFF INDEX structure read from ``fp``.

    Construction reads the element count, the offset size and the offset
    array, remembers where the element data begins (``base``; offsets are
    1-based, hence the ``- 1``) and leaves ``fp`` positioned just after the
    INDEX.  Items are read lazily from ``fp`` on access.

    An empty INDEX consists of the two-byte count field only (CFF
    specification, "INDEX Data"); in that case no offset size or offsets are
    read and ``fp`` is left right after the count.
    """

    def __init__(self, fp: BinaryIO) -> None:
        self.fp = fp
        self.offsets: list[int] = []
        (count,) = struct.unpack(">H", self.fp.read(2))
        if count == 0:
            # Nothing follows the count; keep a single sentinel offset so
            # that ``len(self)`` is 0 and ``base + offsets[-1]`` is the
            # current position.
            self.offsets.append(1)
            self.base = self.fp.tell() - 1
            return
        (offsize,) = struct.unpack("B", self.fp.read(1))
        for _ in range(count + 1):
            self.offsets.append(nunpack(self.fp.read(offsize)))
        self.base = self.fp.tell() - 1
        # Skip the element data so the next structure can be read.
        self.fp.seek(self.base + self.offsets[-1])

    def __repr__(self) -> str:
        return "<INDEX: size=%d>" % len(self)

    def __len__(self) -> int:
        return len(self.offsets) - 1

    def __getitem__(self, i: int) -> bytes:
        self.fp.seek(self.base + self.offsets[i])
        return self.fp.read(self.offsets[i + 1] - self.offsets[i])

    def __iter__(self) -> Iterator[bytes]:
        return (self[i] for i in range(len(self)))
self.code2gid[code] = gid self.gid2code[gid] = code elif format == b"\x01": # Format 1 (n,) = struct.unpack("B", self.fp.read(1)) code = 0 for i in range(n): (first, nleft) = struct.unpack("BB", self.fp.read(2)) for gid in range(first, first + nleft + 1): self.code2gid[code] = gid self.gid2code[gid] = code code += 1 else: raise PDFValueError("unsupported encoding format: %r" % format) # Charsets self.name2gid = {} self.gid2name = {} self.fp.seek(cast(int, charset_pos)) format = self.fp.read(1) if format == b"\x00": # Format 0 n = self.nglyphs - 1 for gid, sid in enumerate( cast( tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n)) ), ): gid += 1 sidname = self.getstr(sid) self.name2gid[sidname] = gid self.gid2name[gid] = sidname elif format == b"\x01": # Format 1 (n,) = struct.unpack("B", self.fp.read(1)) sid = 0 for i in range(n): (first, nleft) = struct.unpack("BB", self.fp.read(2)) for gid in range(first, first + nleft + 1): sidname = self.getstr(sid) self.name2gid[sidname] = gid self.gid2name[gid] = sidname sid += 1 elif format == b"\x02": # Format 2 assert False, str(("Unhandled", format)) else: raise PDFValueError("unsupported charset format: %r" % format) def getstr(self, sid: int) -> str | bytes: # This returns str for one of the STANDARD_STRINGS but bytes otherwise, # and appears to be a needless source of type complexity. 
class TrueTypeFont:
    """Minimal reader for an embedded TrueType font program.

    The constructor records the table directory (tag -> (offset, length));
    ``create_unicode_map`` asks FreeType for the character-to-glyph mapping.
    """

    class CMapNotFound(PDFException):
        """Raised when no usable character map can be obtained."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the font header and table directory from ``fp``.

        :param name: font name, stored as-is.
        :param fp: binary stream holding the font program; it is kept and
            read again by ``create_unicode_map``.
        """
        self.name = name
        self.fp = fp
        self.tables: dict[bytes, tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, tsum, offset, length) = cast(
                    tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id -> character map from the font's cmap.

        :raises TrueTypeFont.CMapNotFound: if the font has no ``cmap`` table
            or FreeType cannot load the font / enumerate its characters; the
            underlying error is attached as ``__cause__``.
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        fp = self.fp
        try:
            # __init__ left the stream positioned after the table directory;
            # rewind so FreeType is given the font program from its start.
            fp.seek(0)
            face = freetype.Face(fp)
            char2gid = list(face.get_chars())
        except Exception as e:
            raise TrueTypeFont.CMapNotFound from e
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid:
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map
class PDFFont:
    """Base class holding the metrics shared by all PDF font types.

    Values are read from the font descriptor; glyph-space numbers are
    converted to text space with ``hscale``/``vscale`` (0.001 by default).
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: float | None = None,
    ) -> None:
        """Initialise the font from its descriptor and width table.

        :param descriptor: the font descriptor dictionary.
        :param widths: widths keyed by character id or unicode string.
        :param default_width: width used for characters missing from
            ``widths``; when ``None`` the descriptor's ``MissingWidth``
            (default 0) is used.
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)
        self.fontname = resolve1(descriptor.get("FontName", "unknown"))
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        if default_width is None:
            self.default_width = num_value(descriptor.get("MissingWidth", 0))
        else:
            self.default_width = default_width
        self.default_width = resolve1(self.default_width)
        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Return whether the font is written vertically (never, here)."""
        return False

    def is_multibyte(self) -> bool:
        """Return whether character codes span several bytes (never, here)."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a string operand into character codes, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Width of the font bounding box, in text space units."""
        w = self.bbox[2] - self.bbox[0]
        if w == 0:
            # NOTE(review): the fallback is the *negated* default width;
            # kept as-is, confirm this sign is intended.
            w = -self.default_width
        return w * self.hscale

    def get_height(self) -> float:
        """Height of the font bounding box, in text space units.

        Falls back to ``ascent - descent`` when the box has no height.
        """
        h = self.bbox[3] - self.bbox[1]
        if h == 0:
            h = self.ascent - self.descent
        return h * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the width of character ``cid`` in text space units."""
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        cid_width = safe_float(self.widths.get(cid))
        if cid_width is not None:
            return cid_width * self.hscale

        try:
            str_cid = self.to_unichr(cid)
            cid_width = safe_float(self.widths.get(str_cid))
            if cid_width is not None:
                return cid_width * self.hscale
        except PDFUnicodeNotDefined:
            pass

        return self.default_width * self.hscale

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Return the summed width of every character decoded from ``s``."""
        return sum(self.char_width(cid) for cid in self.decode(s))

    def to_unichr(self, cid: int) -> str:
        """Map a character id to unicode; implemented by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor

        Returns an all-zero box (and logs a warning) when the entry cannot
        be read as four floats.
        """
        font_bbox = resolve_all(descriptor.get("FontBBox"))
        bbox = safe_rect_list(font_bbox)
        if bbox is None:
            log.warning(
                f"Could not get FontBBox from font descriptor because {font_bbox!r} cannot be parsed as 4 floats"
            )
            return 0.0, 0.0, 0.0, 0.0
        return bbox
class PDFType1Font(PDFSimpleFont):
    """A Type 1 font (also the base of ``PDFTrueTypeFont``)."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its specification dictionary.

        Metrics come from ``FontMetricsDB`` when ``BaseFont`` is known there;
        otherwise the descriptor and ``Widths`` array of ``spec`` are used.
        When ``spec`` has no ``Encoding`` but the descriptor embeds a
        ``FontFile``, the encoding is recovered from the font program.

        :raises PDFFontError: in strict mode, if ``BaseFont`` is missing.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
            widths = cast(dict[str | int, float], int_widths)  # implicit int->float
        except KeyError:
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {i + firstchar: resolve1(w) for (i, w) in enumerate(width_list)}
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        if "Encoding" not in spec and "FontFile" in descriptor:
            # try to recover the missing encoding info from the font file.
            self.fontfile = stream_value(descriptor.get("FontFile"))
            length1 = int_value(self.fontfile["Length1"])
            data = self.fontfile.get_data()[:length1]
            # awcm: quickfix for type 1 font which contains bad string literals
            # Start parsing at the "/Encoding" token when there is one.
            # ``find`` (unlike ``index``) does not raise when the token is
            # absent; in that case the whole header is parsed.
            offset = max(data.find(b"/Encoding"), 0)
            parser = Type1FontHeaderParser(BytesIO(data[offset:]))
            self.cid2unicode = parser.get_encoding()

    def __repr__(self) -> str:
        return "<PDFType1Font: basefont=%r>" % self.basefont
self.get_cmap_from_spec(spec, strict) try: descriptor = dict_value(spec["FontDescriptor"]) except KeyError: if strict: raise PDFFontError("FontDescriptor is missing") descriptor = {} ttf = None self.has_encoding = False self.cid_encoding = None try: if "Encoding" in spec: encoding_part = resolve1(spec["Encoding"]) if isinstance(encoding_part, PDFStream): self.has_encoding = True self.cid_encoding = CharacterMap( encoding_part.get_data().decode("U8") ) except Exception as e: log.error(f"Error get cid_encoding from spec: {e}") self.has_encoding = False self.cid_encoding = None if "FontFile2" in descriptor: self.fontfile = stream_value(descriptor.get("FontFile2")) ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data())) self.unicode_map: UnicodeMap | None = None if "ToUnicode" in spec: if isinstance(spec["ToUnicode"], PDFStream): strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() else: cmap_name = literal_name(spec["ToUnicode"]) encoding = literal_name(spec["Encoding"]) if ( "Identity" in cid_ordering or "Identity" in cmap_name or "Identity" in encoding ): self.unicode_map = IdentityUnicodeMap() elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"): if ttf: try: self.unicode_map = ttf.create_unicode_map() except TrueTypeFont.CMapNotFound: pass else: try: self.unicode_map = CMapDB.get_unicode_map( self.cidcoding, self.cmap.is_vertical(), ) except CMapDB.CMapNotFound: pass self.vertical = self.cmap.is_vertical() if self.vertical: # writing mode: vertical widths2 = get_widths2(list_value(spec.get("W2", []))) self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()} (vy, w) = resolve1(spec.get("DW2", [880, -1000])) self.default_disp = (None, vy) widths: dict[str | int, float] = { cid: w for (cid, (w, _)) in widths2.items() } default_width = w else: # writing mode: horizontal self.disps = {} self.default_disp = 0 widths = get_widths(list_value(spec.get("W", 
def decode(self, bytes: bytes) -> Iterable[int]:
    """Turn a string operand into character ids.

    When the font carries its own encoding (``self.has_encoding``), that
    encoding is tried first and its result is used if it is not ``None`` and
    every id in it is greater than zero.  In every other case, including any
    error raised by that attempt (which is logged), the ids come from
    ``self.cmap``.
    """
    try:
        own_cids = self.cid_encoding.decode(bytes) if self.has_encoding else None
        if own_cids is not None and all(cid > 0 for cid in own_cids):
            return own_cids
    except Exception as e:
        log.error(f"Error use cid_encoding to decode bytes: {e}")
    return self.cmap.decode(bytes)
def to_unichr(self, cid: int) -> str:
    """Map a character ID to its Unicode string.

    :param cid: the character ID to translate.
    :return: whatever ``self.unicode_map.get_unichr`` returns for ``cid``.
    :raises PDFUnicodeNotDefined: when the font has no (truthy) unicode map
        or the lookup raises ``KeyError``.
    """
    umap = self.unicode_map
    try:
        if umap:
            return umap.get_unichr(cid)
        # Raised inside the try so the handler below converts it as well
        # (presumably PDFKeyError is a KeyError subclass - confirm).
        raise PDFKeyError(cid)
    except KeyError:
        raise PDFUnicodeNotDefined(self.cidcoding, cid)
class PDFTextState:
    """Text-state parameters tracked while a content stream is interpreted.

    Holds the current font, font size, character/word spacing, horizontal
    scaling, leading, rendering mode and rise, together with the text matrix
    and the text line matrix.
    """

    matrix: "Matrix"
    linematrix: "Point"

    def __init__(self) -> None:
        self.font: PDFFont | None = None
        # Name the current font was selected by; assigned by the Tf handler.
        # Initialised here so every instance has the attribute.
        self.font_id = None
        self.fontsize: float = 0
        self.charspace: float = 0
        self.wordspace: float = 0
        self.scaling: float = 100
        self.leading: float = 0
        self.render: int = 0
        self.rise: float = 0
        self.reset()
        # self.matrix is set
        # self.linematrix is set

    def __repr__(self) -> str:
        # The format string must contain one placeholder per value; an empty
        # string here makes the % operator raise TypeError.
        return (
            "<PDFTextState: font=%r, fontsize=%r, charspace=%r, "
            "wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, "
            "matrix=%r, linematrix=%r>"
            % (
                self.font,
                self.fontsize,
                self.charspace,
                self.wordspace,
                self.scaling,
                self.leading,
                self.render,
                self.rise,
                self.matrix,
                self.linematrix,
            )
        )

    def copy(self) -> "PDFTextState":
        """Return a new state carrying the same parameter values."""
        obj = PDFTextState()
        obj.font = self.font
        obj.fontsize = self.fontsize
        obj.charspace = self.charspace
        obj.wordspace = self.wordspace
        obj.scaling = self.scaling
        obj.leading = self.leading
        obj.render = self.render
        obj.rise = self.rise
        obj.matrix = self.matrix
        obj.linematrix = self.linematrix
        # getattr keeps copy() usable for instances built without __init__.
        obj.font_id = getattr(self, "font_id", None)
        return obj

    def reset(self) -> None:
        """Set the text matrix to identity and the line matrix to (0, 0)."""
        self.matrix = MATRIX_IDENTITY
        self.linematrix = (0, 0)


Color = Union[
    float,  # Greyscale
    tuple[float, float, float],  # R, G, B
    tuple[float, float, float, float],  # C, M, Y, K
]


class PDFGraphicState:
    """Graphics-state parameters tracked while a content stream is interpreted.

    Holds the line width, cap, join, miter limit, dash pattern, rendering
    intent and flatness, plus the stroking and non-stroking colours.
    """

    def __init__(self) -> None:
        self.linewidth: float = 0
        self.linecap: object | None = None
        self.linejoin: object | None = None
        self.miterlimit: object | None = None
        self.dash: tuple[object, object] | None = None
        self.intent: object | None = None
        self.flatness: object | None = None

        # stroking color
        self.scolor: Color | None = None

        # non stroking color
        self.ncolor: Color | None = None

    def copy(self) -> "PDFGraphicState":
        """Return a new state carrying the same parameter values."""
        obj = PDFGraphicState()
        obj.linewidth = self.linewidth
        obj.linecap = self.linecap
        obj.linejoin = self.linejoin
        obj.miterlimit = self.miterlimit
        obj.dash = self.dash
        obj.intent = self.intent
        obj.flatness = self.flatness
        obj.scolor = self.scolor
        obj.ncolor = self.ncolor
        return obj

    def __repr__(self) -> str:
        # One placeholder per value; an empty format string raises TypeError.
        return (
            "<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, "
            "miterlimit=%r, dash=%r, intent=%r, flatness=%r, "
            "stroking color=%r, non stroking color=%r>"
            % (
                self.linewidth,
                self.linecap,
                self.linejoin,
                self.miterlimit,
                self.dash,
                self.intent,
                self.flatness,
                self.scolor,
                self.ncolor,
            )
        )
def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
    """Return the font for ``spec``, creating and caching it when needed.

    :param objid: cache key for the font.  A falsy value means the font is
        neither looked up in nor stored into the cache.
    :param spec: the font dictionary.
    :return: the cached or newly created font object.
    :raises PDFFontError: in strict mode when ``Type`` is not ``/Font``, when
        ``Subtype`` is missing or unknown; in any mode when a Type0 font has
        an empty ``DescendantFonts`` array.
    """
    if objid and objid in self._cached_fonts:
        font = self._cached_fonts[objid]
    else:
        log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
        if settings.STRICT:
            # .get(): a spec without /Type must fail with PDFFontError too,
            # not with a bare KeyError.
            if spec.get("Type") is not LITERAL_FONT:
                raise PDFFontError("Type is not /Font")
        # Create a Font object.
        if "Subtype" in spec:
            subtype = literal_name(spec["Subtype"])
        else:
            if settings.STRICT:
                raise PDFFontError("Font Subtype is not specified.")
            subtype = "Type1"
        if subtype in ("Type1", "MMType1"):
            # Type1 Font
            font = PDFType1Font(self, spec)
        elif subtype == "TrueType":
            # TrueType Font
            font = PDFTrueTypeFont(self, spec)
        elif subtype == "Type3":
            # Type3 Font
            font = PDFType3Font(self, spec)
        elif subtype in ("CIDFontType0", "CIDFontType2"):
            # CID Font
            font = PDFCIDFont(self, spec)
        elif subtype == "Type0":
            # Type0 Font: build the first descendant font, letting the
            # parent's Encoding/ToUnicode entries override the descendant's.
            dfonts = list_value(spec["DescendantFonts"])
            if not dfonts:
                # Explicit check instead of assert, which vanishes under -O.
                raise PDFFontError("DescendantFonts is empty")
            subspec = dict_value(dfonts[0]).copy()
            for k in ("Encoding", "ToUnicode"):
                if k in spec:
                    subspec[k] = resolve1(spec[k])
            font = self.get_font(None, subspec)
        else:
            if settings.STRICT:
                raise PDFFontError("Invalid Font spec: %r" % spec)
            font = PDFType1Font(self, spec)  # this is so wrong!
        if objid and self.caching:
            self._cached_fonts[objid] = font
    return font
def init_resources(self, resources: dict[object, object]) -> None:
    """Prepare the fonts and XObjects listed in the Resource attribute.

    Resets ``fontmap``, ``xobjmap`` and ``csmap`` (the latter starts as a
    copy of the predefined colour spaces) and then fills them from the
    ``Font``, ``ColorSpace`` and ``XObject`` entries of ``resources``.
    ``ProcSet`` entries are forwarded to the resource manager.
    """
    self.resources = resources
    self.fontmap: dict[object, PDFFont] = {}
    self.xobjmap = {}
    self.csmap: dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
    if not resources:
        return

    def get_colorspace(spec: object) -> PDFColorSpace | None:
        # A colour space is either a bare name or an array whose first
        # element is the family name.
        is_array = isinstance(spec, list)
        name = literal_name(spec[0] if is_array else spec)
        if is_array and len(spec) >= 2:
            if name == "ICCBased":
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            if name == "DeviceN":
                return PDFColorSpace(name, len(list_value(spec[1])))
        return PREDEFINED_COLORSPACE.get(name)

    for kind, entries in dict_value(resources).items():
        log.debug("Resource: %r: %r", kind, entries)
        if kind == "Font":
            for fontid, fontspec in dict_value(entries).items():
                # Only indirect references carry an object id to cache by.
                objid = fontspec.objid if isinstance(fontspec, PDFObjRef) else None
                self.fontmap[fontid] = self.rsrcmgr.get_font(
                    objid, dict_value(fontspec)
                )
        elif kind == "ColorSpace":
            for csid, csspec in dict_value(entries).items():
                colorspace = get_colorspace(resolve1(csspec))
                if colorspace is not None:
                    self.csmap[csid] = colorspace
        elif kind == "ProcSet":
            self.rsrcmgr.get_procset(list_value(entries))
        elif kind == "XObject":
            for xobjid, xobjstrm in dict_value(entries).items():
                self.xobjmap[xobjid] = xobjstrm
def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
    """Begin new subpath

    Appends an ``("m", x, y)`` segment to the current path.  If either
    operand cannot be converted by ``safe_float`` a warning is logged and
    the path is left untouched.
    """
    start_x, start_y = safe_float(x), safe_float(y)
    if start_x is not None and start_y is not None:
        self.curpath.append(("m", start_x, start_y))
        return
    raw = ("m", x, y)
    log.warning(
        f"Cannot start new subpath because not all values in {raw!r} can be parsed as floats"
    )
def do_c(
    self,
    x1: PDFStackT,
    y1: PDFStackT,
    x2: PDFStackT,
    y2: PDFStackT,
    x3: PDFStackT,
    y3: PDFStackT,
) -> None:
    """Append curved segment to path (three control points)

    Appends a ``("c", x1, y1, x2, y2, x3, y3)`` segment.  If any operand
    cannot be converted by ``safe_float`` a warning is logged and the path
    is left untouched.
    """
    operands = (x1, y1, x2, y2, x3, y3)
    coords = [safe_float(value) for value in operands]
    if any(coord is None for coord in coords):
        raw = ("c", *operands)
        log.warning(
            f"Cannot append curved segment to path because not all values in {raw!r} can be parsed as floats"
        )
        return
    self.curpath.append(("c", *coords))
def do_F(self) -> None:
    """Fill path using nonzero winding number rule (obsolete)

    ``F`` is the legacy spelling of ``f`` and must behave identically.
    Leaving it as a no-op meant the path was never painted and, because the
    current path was not cleared, its segments leaked into the next path.
    """
    self.do_f()
def do_G(self, gray: PDFStackT) -> None:
    """Set gray level for stroking operations

    Stores the converted value as the stroking colour and switches the
    stroking colour space to ``DeviceGray``.  An operand that ``safe_float``
    cannot convert only produces a warning.
    """
    level = safe_float(gray)
    if level is None:
        log.warning(
            f"Cannot set gray level because {gray!r} is an invalid float value"
        )
        return
    self.graphicstate.scolor = level
    self.scs = self.csmap["DeviceGray"]
def _pop_color_operands(
    self, colorspace: "PDFColorSpace | None", kind: str
) -> "Color | None":
    """Pop the operands of SCN/scn and convert them into a colour.

    :param colorspace: the active colour space; its ``ncomponents`` decides
        how many operands are consumed.  Without one, a single operand is
        assumed (non-strict mode only).
    :param kind: ``"stroke"`` or ``"non-stroke"``; used in messages only.
    :return: a float (1 component), an RGB tuple (3) or a CMYK tuple (4), or
        ``None`` when no colour could be derived (a warning is logged).
    :raises PDFInterpreterError: in strict mode when no colour space is set.
    """
    if colorspace:
        n = colorspace.ncomponents
    else:
        if settings.STRICT:
            raise PDFInterpreterError("No colorspace specified!")
        n = 1
    # Always consume the operands, including for unsupported component
    # counts, so they do not pile up on the argument stack.
    values = self.pop(n)
    if n not in (1, 3, 4):
        log.warning(
            f"Cannot set {kind} color because {n} components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported"
        )
        return None
    if len(values) < n:
        # Malformed stream: indexing or unpacking a short operand list
        # would raise instead of degrading gracefully.
        log.warning(
            f"Cannot set {kind} color because {n} operands are required but only {len(values)} are available"
        )
        return None
    if n == 1:
        gray = values[0]
        gray_f = safe_float(gray)
        if gray_f is None:
            log.warning(
                f"Cannot set gray {kind} color because {gray!r} is an invalid float value"
            )
        return gray_f
    if n == 3:
        rgb = safe_rgb(*values)
        if rgb is None:
            log.warning(
                f"Cannot set RGB {kind} color because not all values in {values!r} can be parsed as floats"
            )
        return rgb
    cmyk = safe_cmyk(*values)
    if cmyk is None:
        log.warning(
            f"Cannot set CMYK {kind} color because not all values in {values!r} can be parsed as floats"
        )
    return cmyk

def do_SCN(self) -> None:
    """Set color for stroking operations."""
    color = self._pop_color_operands(self.scs, "stroke")
    if color is not None:
        self.graphicstate.scolor = color

def do_scn(self) -> None:
    """Set color for nonstroking operations"""
    color = self._pop_color_operands(self.ncs, "non-stroke")
    if color is not None:
        self.graphicstate.ncolor = color
def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
    """Set the text font

    :param fontid: the name of a font resource in the Font subdictionary
        of the current resource dictionary
    :param fontsize: size is a number representing a scale factor.
    :raises PDFInterpreterError: in strict mode when ``fontid`` is not in
        the font map.
    """
    state = self.textstate
    try:
        font_name = literal_name(fontid)
        state.font = self.fontmap[font_name]
        state.font_id = font_name
    except KeyError:
        if settings.STRICT:
            raise PDFInterpreterError("Undefined Font id: %r" % fontid)
        # Unknown font: fall back to a font built from an empty spec.
        # NOTE(review): font_id is not updated on this path, so it keeps the
        # id of the previously selected font - confirm this is intended.
        state.font = self.rsrcmgr.get_font(None, {})
    size = safe_float(fontsize)
    if size is not None:
        state.fontsize = size
        return
    log.warning(
        f"Could not set text font because {fontsize!r} is an invalid float value"
    )
def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
    """Set word and character spacing, move to next line, and show text

    The " (double quote) operator.  It is defined as ``aw Tw``, ``ac Tc``
    followed by the ' (single quote) operator, so after setting the
    spacings it delegates to ``do__q`` which moves to the next line before
    showing the string.  Showing the text without the line move placed it
    on top of the previous line.

    :param aw: word spacing, passed to ``do_Tw``.
    :param ac: character spacing, passed to ``do_Tc``.
    :param s: the string to show.
    """
    self.do_Tw(aw)
    self.do_Tc(ac)
    self.do__q(s)
def execute(self, streams: Sequence[object]) -> None:
    """Parse the content streams and dispatch every operator.

    Non-keyword objects are pushed onto the argument stack.  A keyword is
    mapped to a ``do_<name>`` method (``*``, ``"`` and ``'`` become ``_a``,
    ``_w`` and ``_q``); the method's positional parameter count decides how
    many operands are popped for it.  An operator for which too few operands
    are available is skipped.

    :raises PDFInterpreterError: in strict mode for an unknown operator.
    """
    try:
        parser = PDFContentParser(streams)
    except PSEOF:
        # empty page
        return
    while True:
        try:
            (_, token) = parser.nextobject()
        except PSEOF:
            break
        if not isinstance(token, PSKeyword):
            # Operand: keep it until an operator consumes it.
            self.push(token)
            continue
        name = keyword_name(token)
        suffix = name.replace("*", "_a").replace('"', "_w").replace("'", "_q")
        method = "do_%s" % suffix
        if not hasattr(self, method):
            if settings.STRICT:
                error_msg = "Unknown operator: %r" % name
                raise PDFInterpreterError(error_msg)
            continue
        func = getattr(self, method)
        # co_argcount counts self as well, hence the - 1.
        nargs = func.__code__.co_argcount - 1
        if not nargs:
            log.debug("exec: %s", name)
            func()
            continue
        args = self.pop(nargs)
        log.debug("exec: %s %r", name, args)
        if len(args) == nargs:
            func(*args)
class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: a dictionary of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
      label: the page's label (typically, the logical page number).

    """

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: str | None,
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: dict[object, object] = resolve1(
            self.attrs.get("Resources", dict()),
        )
        # Replace an indirect MediaBox reference in ``self.attrs`` by the
        # object it points to.  This works on the resolved attribute
        # dictionary (the raw ``attrs`` argument may itself be a reference)
        # and is skipped when the page has no MediaBox at all, in which case
        # _parse_mediabox() below supplies the default.
        if "MediaBox" in self.attrs:
            try:
                self.attrs["MediaBox"] = resolve1(self.attrs["MediaBox"])
            except Exception:
                # Best effort: keep building the page with what we have.
                log.exception("try to fix mediabox failed: %s", self.attrs)
        self.mediabox = self._parse_mediabox(self.attrs.get("MediaBox"))
        try:
            self.cropbox = self._parse_cropbox(self.attrs.get("CropBox"), self.mediabox)
        except Exception:
            self.cropbox = self.mediabox

        self.contents = self._parse_contents(self.attrs.get("Contents"))

        # Normalize negative rotations (e.g. -90) into the 0..359 range.
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page of ``document``.

        The page tree referenced by the catalog's /Pages entry is walked
        depth first, copying the inheritable attributes down from parent
        nodes.  When that yields no page, every object listed in the xref
        tables whose /Type is /Page is used instead.
        """

        def depth_first_search(
            obj: Any,
            parent: dict[str, Any],
            visited: set[Any] | None = None,
        ) -> Iterator[tuple[int, dict[Any, dict[Any, Any]]]]:
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # This looks broken. obj.objid means obj could be either
                # PDFObjRef or PDFStream, but neither is valid for dict_value.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)

            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[str | None] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        # next() is always given a default below: a bare next() on an
        # exhausted label iterator would raise StopIteration inside this
        # generator, which Python reports as RuntimeError.
        pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                yield cls(document, objid, tree, next(page_labels, None))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                            yield cls(document, objid, obj, next(page_labels, None))
                    except PDFObjectNotFound:
                        pass

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Container[int] | None = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse the PDF in ``fp`` and yield its pages.

        pagenos: zero-based page numbers to keep; a falsy value keeps all.
        maxpages: stop once this many pages have been visited (0 = no limit).
        password, caching: passed on to PDFDocument.
        check_extractable: raise PDFTextExtractionNotAllowed instead of
            only logging a warning when the document forbids extraction.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = "Text extraction is not allowed: %r" % fp
                raise PDFTextExtractionNotAllowed(error_msg)
            else:
                warning_msg = (
                    "The PDF %r contains a metadata field "
                    "indicating that it should not allow "
                    "text extraction. Ignoring this field "
                    "and proceeding. Use the check_extractable "
                    "if you want to raise an error in this case" % fp
                )
                log.warning(warning_msg)
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break

    def _parse_mediabox(self, value: Any) -> Rect:
        """Return the MediaBox rectangle, or US Letter when missing or invalid."""
        us_letter = (0.0, 0.0, 612.0, 792.0)

        if value is None:
            log.warning(
                "MediaBox missing from /Page (and not inherited), "
                "defaulting to US Letter"
            )
            return us_letter

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        except (PDFValueError, TypeError):
            # TypeError covers a MediaBox that is not an array at all.
            log.warning("Invalid MediaBox in /Page, defaulting to US Letter")
            return us_letter

    def _parse_cropbox(self, value: Any, mediabox: Rect) -> Rect:
        """Return the CropBox rectangle, or ``mediabox`` when missing or invalid."""
        if value is None:
            # CropBox is optional, and MediaBox is used if not specified.
            return mediabox

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        except (PDFValueError, TypeError):
            # TypeError covers a CropBox that is not an array at all.
            log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
            return mediabox

    def _parse_contents(self, value: Any) -> list[Any]:
        """Return the resolved /Contents entry as a list (empty when absent)."""
        contents: list[Any] = []
        if value is not None:
            contents = resolve1(value)
            if not isinstance(contents, list):
                contents = [contents]
        return contents
Typical usage: parser = PDFParser(fp) parser.read_xref() parser.read_xref(fallback=True) # optional parser.set_document(doc) parser.seek(offset) parser.nextobject() """ def __init__(self, fp: BinaryIO) -> None: PSStackParser.__init__(self, fp) self.doc: PDFDocument | None = None self.fallback = False def set_document(self, doc: "PDFDocument") -> None: """Associates the parser with a PDFDocument object.""" self.doc = doc KEYWORD_R = KWD(b"R") KEYWORD_NULL = KWD(b"null") KEYWORD_ENDOBJ = KWD(b"endobj") KEYWORD_STREAM = KWD(b"stream") KEYWORD_XREF = KWD(b"xref") KEYWORD_STARTXREF = KWD(b"startxref") def do_keyword(self, pos: int, token: PSKeyword) -> None: """Handles PDF-related keywords.""" if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): self.add_results(*self.pop(1)) elif token is self.KEYWORD_ENDOBJ: self.add_results(*self.pop(4)) elif token is self.KEYWORD_NULL: # null object self.push((pos, None)) elif token is self.KEYWORD_R: # reference to indirect object if len(self.curstack) >= 2: (_, _object_id), _ = self.pop(2) object_id = safe_int(_object_id) if object_id is not None: obj = PDFObjRef(self.doc, object_id) self.push((pos, obj)) elif token is self.KEYWORD_STREAM: # stream object ((_, dic),) = self.pop(1) dic = dict_value(dic) objlen = 0 if not self.fallback: try: objlen = int_value(dic["Length"]) except KeyError: if settings.STRICT: raise PDFSyntaxError("/Length is undefined: %r" % dic) self.seek(pos) try: (_, line) = self.nextline() # 'stream' except PSEOF: if settings.STRICT: raise PDFSyntaxError("Unexpected EOF") return pos += len(line) self.fp.seek(pos) data = bytearray(self.fp.read(objlen)) self.seek(pos + objlen) while 1: try: (linepos, line) = self.nextline() except PSEOF: if settings.STRICT: raise PDFSyntaxError("Unexpected EOF") break if b"endstream" in line: i = line.index(b"endstream") objlen += i if self.fallback: data += line[:i] break objlen += len(line) if self.fallback: data += line self.seek(pos + objlen) # XXX limit objlen not to 
class PDFStreamParser(PDFParser):
    """PDFStreamParser is used to parse PDF content streams
    that are contained in each page and have instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    def __init__(self, data: bytes) -> None:
        PDFParser.__init__(self, BytesIO(data))

    def flush(self) -> None:
        """Move everything left on the stack to the results."""
        self.add_results(*self.popall())

    KEYWORD_OBJ = KWD(b"obj")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword met while parsing stream data.

        ``R`` turns the two preceding stack entries into a PDFObjRef,
        ``obj``/``endobj`` are dropped (PDFSyntaxError when
        ``settings.STRICT`` is set) and any other keyword is pushed as is.
        """
        if token is self.KEYWORD_R:
            # reference to indirect object
            # A dangling "R" without its two operands is ignored, the same
            # way PDFParser.do_keyword treats it, instead of failing while
            # unpacking a too-short pop() result.
            if len(self.curstack) >= 2:
                (_, _object_id), _ = self.pop(2)
                object_id = safe_int(_object_id)
                if object_id is not None:
                    obj = PDFObjRef(self.doc, object_id)
                    self.push((pos, obj))
            return

        elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                raise PDFSyntaxError("Keyword endobj found in stream")
            return

        # others
        self.push((pos, token))
class PDFObjRef(PDFObject):
    """Reference to an indirect object, looked up in the document on demand."""

    def __init__(
        self,
        doc: Optional["PDFDocument"],
        objid: int,
        _: Any = _DEFAULT,
    ) -> None:
        """Reference to a PDF object.

        :param doc: The PDF document.
        :param objid: The object number.
        :param _: Unused argument for backwards compatibility.
        :raises PDFValueError: if ``objid`` is 0 and ``settings.STRICT`` is set.
        """
        if _ is not _DEFAULT:
            warn(
                "The third argument of PDFObjRef is unused and will be removed after "
                "2024",
                DeprecationWarning,
                stacklevel=2,  # attribute the warning to the caller
            )

        if objid == 0:
            if settings.STRICT:
                raise PDFValueError("PDF object id cannot be 0.")

        self.doc = doc
        self.objid = objid

    def __repr__(self) -> str:
        # The previous implementation applied "%" to an empty format string,
        # which raises TypeError whenever the reference is printed or logged.
        return f"<PDFObjRef:{self.objid}>"

    def resolve(self, default: object = None) -> Any:
        """Return the referenced object, or ``default`` if the document lacks it."""
        assert self.doc is not None
        try:
            return self.doc.getobj(self.objid)
        except PDFObjectNotFound:
            return default
def uint_value(x: object, n_bits: int) -> int:
    """Resolve number and interpret it as a two's-complement unsigned number.

    Non-negative values are returned unchanged; a negative value is mapped
    to ``value + 2**n_bits``.
    """
    xi = int_value(x)
    # Zero must stay zero: the former ``xi > 0`` test sent it to 2**n_bits,
    # a value that does not even fit into n_bits.
    if xi >= 0:
        return xi
    return xi + cast(int, 2**n_bits)
class PDFStream(PDFObject):
    """A PDF stream: an attribute dictionary plus lazily decoded data.

    The raw bytes are kept in ``rawdata`` until decode() runs; afterwards
    ``data`` holds the decoded bytes and ``rawdata`` is None.
    """

    def __init__(
        self,
        attrs: dict[str, Any],
        rawdata: bytes,
        decipher: DecipherCallable | None = None,
    ) -> None:
        assert isinstance(attrs, dict), str(type(attrs))
        self.attrs = attrs
        self.rawdata: bytes | None = rawdata
        self.decipher = decipher
        self.data: bytes | None = None
        self.objid: int | None = None
        self.genno: int | None = None

    def set_objid(self, objid: int, genno: int) -> None:
        self.objid = objid
        self.genno = genno

    def __repr__(self) -> str:
        # The format strings used to be empty, so applying "%" to them raised
        # TypeError every time a stream was printed or logged with %r.
        if self.data is None:
            assert self.rawdata is not None
            return "<PDFStream(%r): raw=%d, %r>" % (
                self.objid,
                len(self.rawdata),
                self.attrs,
            )
        else:
            assert self.data is not None
            return "<PDFStream(%r): len=%d, %r>" % (
                self.objid,
                len(self.data),
                self.attrs,
            )

    def __contains__(self, name: object) -> bool:
        return name in self.attrs

    def __getitem__(self, name: str) -> Any:
        return self.attrs[name]

    def get(self, name: str, default: object = None) -> Any:
        return self.attrs.get(name, default)

    def get_any(self, names: Iterable[str], default: object = None) -> Any:
        """Return the value of the first of ``names`` present in the attributes."""
        for name in names:
            if name in self.attrs:
                return self.attrs[name]
        return default

    def get_filters(self) -> list[tuple[Any, Any]]:
        """Return the ``(filter, parameters)`` pairs in decoding order.

        :raises PDFException: in strict mode, when the parameter list and
            the filter list have different lengths.
        """
        filters = resolve1(self.get_any(("F", "Filter"), []))
        params = resolve1(self.get_any(("DP", "DecodeParms", "FDecodeParms"), {}))
        if not filters:
            return []
        if not isinstance(filters, list):
            filters = [filters]
        if not isinstance(params, list):
            # Make sure the parameters list is the same as filters.
            params = [params] * len(filters)
        if len(params) != len(filters):
            if settings.STRICT:
                raise PDFException("Parameters len filter mismatch")
            # zip() stops at the shorter sequence, so a short parameter list
            # used to drop the trailing filters and leave the data only
            # partly decoded.  Give the extra filters empty parameters.
            missing = len(filters) - len(params)
            params = list(params) + [{} for _ in range(missing)]
        resolved_filters = [resolve1(f) for f in filters]
        resolved_params = [resolve1(param) for param in params]
        return list(zip(resolved_filters, resolved_params, strict=False))

    def decode(self) -> None:
        """Decipher and decode ``rawdata`` into ``data``.

        :raises PDFNotImplementedError: for the /Crypt filter, an unknown
            filter or an unsupported predictor.
        :raises PDFException: in strict mode, for invalid zlib data.
        """
        assert self.data is None and self.rawdata is not None, str(
            (self.data, self.rawdata),
        )
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            assert self.objid is not None
            assert self.genno is not None
            data = self.decipher(self.objid, self.genno, data, self.attrs)
        filters = self.get_filters()
        if not filters:
            self.data = data
            self.rawdata = None
            return
        for f, params in filters:
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    data = zlib.decompress(data)

                except zlib.error as e:
                    if settings.STRICT:
                        error_msg = f"Invalid zlib bytes: {e!r}, {data!r}"
                        raise PDFException(error_msg) from e

                    # Salvage whatever can be decompressed.
                    try:
                        data = decompress_corrupted(data)
                    except zlib.error:
                        data = b""

            elif f in LITERALS_LZW_DECODE:
                data = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                data = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                data = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                data = ccittfaxdecode(data, params)
            elif f in LITERALS_DCT_DECODE:
                # This is probably a JPG stream
                # it does not need to be decoded twice.
                # Just return the stream to the user.
                pass
            elif f in LITERALS_JBIG2_DECODE or f in LITERALS_JPX_DECODE:
                pass
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError("/Crypt filter is unsupported")
            else:
                raise PDFNotImplementedError("Unsupported filter: %r" % f)
            # apply predictors
            if params and "Predictor" in params:
                pred = int_value(params["Predictor"])
                if pred == 1:
                    # no predictor
                    pass
                elif pred >= 10:
                    # PNG predictor
                    colors = int_value(params.get("Colors", 1))
                    columns = int_value(params.get("Columns", 1))
                    raw_bits_per_component = params.get("BitsPerComponent", 8)
                    bitspercomponent = int_value(raw_bits_per_component)
                    data = apply_png_predictor(
                        pred,
                        colors,
                        columns,
                        bitspercomponent,
                        data,
                    )
                else:
                    error_msg = "Unsupported predictor: %r" % pred
                    raise PDFNotImplementedError(error_msg)
        self.data = data
        self.rawdata = None

    def get_data(self) -> bytes:
        """Return the decoded data, decoding it on first use."""
        if self.data is None:
            self.decode()
            assert self.data is not None
        return self.data

    def get_rawdata(self) -> bytes | None:
        """Return the undecoded bytes (None once the stream has been decoded)."""
        return self.rawdata
class PSSymbolTable(Generic[_SymbolT]):
    """Registry handing out one shared PSLiteral/PSKeyword object per name.

    Since a given name always maps to the same object, interned symbols
    can be compared with the "is" operator.
    """

    def __init__(self, klass: type[_SymbolT]) -> None:
        self.dict: dict[PSLiteral.NameType, _SymbolT] = {}
        self.klass: type[_SymbolT] = klass

    def intern(self, name: PSLiteral.NameType) -> _SymbolT:
        """Return the symbol registered for ``name``, creating it if needed."""
        try:
            return self.dict[name]
        except KeyError:
            pass
        # Type confusion issue: PSKeyword always takes bytes as name
        #                       PSLiteral uses either str or bytes
        symbol = self.klass(name)  # type: ignore[arg-type]
        self.dict[name] = symbol
        return symbol
def revreadlines(self) -> Iterator[bytes]:
    """Yield the lines of the file starting from its end.

    The file is read backwards in blocks of ``BUFSIZ`` bytes.  Every
    yielded piece begins with an end-of-line byte (``\\r`` or ``\\n``) and
    extends up to the next one; whatever precedes the first end-of-line
    byte of the file is never yielded.

    This is used to locate the trailers at the end of a file.
    """
    self.fp.seek(0, io.SEEK_END)
    remaining = self.fp.tell()
    carry = b""
    while remaining > 0:
        block_end = remaining
        remaining = max(0, remaining - self.BUFSIZ)
        self.fp.seek(remaining)
        block = self.fp.read(block_end - remaining)
        if not block:
            break
        # Peel lines off the end of the block, last one first.
        while (cut := max(block.rfind(b"\r"), block.rfind(b"\n"))) != -1:
            yield block[cut:] + carry
            block = block[:cut]
            carry = b""
        # No end-of-line byte left: the rest continues in the previous block.
        carry = block + carry
def _parse_keyword(self, s: bytes, i: int) -> int:
    """Accumulate a keyword and emit it once its end is found in ``s``.

    ``true`` and ``false`` become Python booleans, anything else an
    interned PSKeyword.  Returns the position at which parsing resumes.
    """
    match = END_KEYWORD.search(s, i)
    if not match:
        # The keyword continues in the next buffer.
        self._curtoken += s[i:]
        return len(s)

    end = match.start(0)
    self._curtoken += s[i:end]
    word = self._curtoken
    token: bool | PSKeyword
    if word == b"true":
        token = True
    else:
        token = False if word == b"false" else KWD(word)
    self._add_token(token)
    self._parse1 = self._parse_main
    return end
def _parse_string_1(self, s: bytes, i: int) -> int:
    """Handle what follows a backslash inside a literal string.

    PDF Reference 3.2.3

    Up to three octal digits are collected into one character code, a
    known escape letter is replaced by the byte it stands for, and a
    ``\\r\\n`` pair is skipped as a whole.  Any other character is
    skipped without adding anything to the token.  Returns the position
    at which parsing resumes.
    """
    c = s[i : i + 1]
    if OCT_STRING.match(c) and len(self.oct) < 3:
        self.oct += c
        return i + 1

    elif self.oct:
        # The octal escape is complete; ``c`` is left for _parse_string.
        # A three-digit escape can reach 0o777 (511).  Per the PDF
        # specification high-order overflow is ignored, so keep the low
        # eight bits rather than aborting the parse with an assertion.
        chrcode = int(self.oct, 8) & 0xFF
        self._curtoken += bytes((chrcode,))
        self._parse1 = self._parse_string
        return i

    elif c in ESC_STRING:
        self._curtoken += bytes((ESC_STRING[c],))

    elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n":
        # If current and next character is \r\n skip both because enters
        # after a \ are ignored
        i += 1

    # default action
    self._parse1 = self._parse_string
    return i + 1
class PSStackParser(PSBaseParser, Generic[ExtraT]):
    """Parser that assembles tokens into objects using a stack.

    Arrays (``"a"``), dictionaries (``"d"``) and procedures (``"p"``) are
    built by saving the current stack in ``self.context`` when the opening
    keyword is seen and restoring it when the matching closing keyword
    arrives.  Finished objects are delivered through ``self.results``.
    """

    def __init__(self, fp: BinaryIO) -> None:
        PSBaseParser.__init__(self, fp)
        self.reset()

    def reset(self) -> None:
        """Discard all partially built objects and pending results."""
        # Saved (position, type, stack) of each enclosing open container.
        self.context: list[tuple[int, str | None, list[PSStackEntry[ExtraT]]]] = []
        # Type code of the innermost open container, or None at top level.
        self.curtype: str | None = None
        self.curstack: list[PSStackEntry[ExtraT]] = []
        self.results: list[PSStackEntry[ExtraT]] = []

    def seek(self, pos: int) -> None:
        """Seek the underlying parser and drop all stack state."""
        PSBaseParser.seek(self, pos)
        self.reset()

    def push(self, *objs: PSStackEntry[ExtraT]) -> None:
        """Append entries to the current stack."""
        self.curstack.extend(objs)

    def pop(self, n: int) -> list[PSStackEntry[ExtraT]]:
        """Remove and return the last ``n`` entries of the current stack.

        Fewer than ``n`` entries are returned when the stack is shorter.
        ``n <= 0`` returns an empty list and leaves the stack untouched.
        """
        if n <= 0:
            # ``curstack[-0:]`` is the *whole* list, so without this guard
            # pop(0) would return and wipe the entire stack.
            return []
        objs = self.curstack[-n:]
        self.curstack[-n:] = []
        return objs

    def popall(self) -> list[PSStackEntry[ExtraT]]:
        """Remove and return every entry of the current stack."""
        objs = self.curstack
        self.curstack = []
        return objs

    def add_results(self, *objs: PSStackEntry[ExtraT]) -> None:
        """Queue finished objects for ``nextobject``."""
        try:
            log.debug("add_results: %r", objs)
        except Exception:
            log.debug("add_results: (unprintable object)")
        self.results.extend(objs)

    def start_type(self, pos: int, type: str) -> None:
        """Open a container of the given type, saving the current stack."""
        self.context.append((pos, self.curtype, self.curstack))
        (self.curtype, self.curstack) = (type, [])
        log.debug("start_type: pos=%r, type=%r", pos, type)

    def end_type(self, type: str) -> tuple[int, list[PSStackType[ExtraT]]]:
        """Close the innermost container and restore the enclosing stack.

        :return: ``(pos, objs)`` where ``pos`` is the position recorded when
            the container was opened and ``objs`` are its collected values.
        :raises PSTypeError: if the innermost open container is not ``type``.
        """
        if self.curtype != type:
            raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}")
        objs = [obj for (_, obj) in self.curstack]
        (pos, self.curtype, self.curstack) = self.context.pop()
        log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs)
        return (pos, objs)

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Hook for subclasses; the base implementation ignores keywords."""
        pass

    def nextobject(self) -> PSStackEntry[ExtraT]:
        """Return the next complete ``(pos, object)`` entry.

        Arrays and dictionaries are represented as Python lists and
        dictionaries.  Unbalanced closing keywords are ignored unless
        ``settings.STRICT`` is set.

        :return: keywords, literals, strings, numbers, arrays and dictionaries.
        """
        while not self.results:
            (pos, token) = self.nexttoken()
            if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
                # normal token
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, "a")
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type("a"))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, "d")
            elif token == KEYWORD_DICT_END:
                # end dictionary
                try:
                    (pos, objs) = self.end_type("d")
                    if len(objs) % 2 != 0:
                        error_msg = "Invalid dictionary construct: %r" % objs
                        raise PSSyntaxError(error_msg)
                    # Pair up keys and values; entries whose value is None
                    # are dropped.
                    d = {
                        literal_name(k): v
                        for (k, v) in choplist(2, objs)
                        if v is not None
                    }
                    self.push((pos, d))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_PROC_BEGIN:
                # begin proc
                self.start_type(pos, "p")
            elif token == KEYWORD_PROC_END:
                # end proc
                try:
                    self.push(self.end_type("p"))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif isinstance(token, PSKeyword):
                log.debug(
                    "do_keyword: pos=%r, token=%r, stack=%r",
                    pos,
                    token,
                    self.curstack,
                )
                self.do_keyword(pos, token)
            else:
                log.error(
                    "unknown token: pos=%r, token=%r, stack=%r",
                    pos,
                    token,
                    self.curstack,
                )
                self.do_keyword(pos, token)
                raise PSException
            if self.context:
                # Still inside an open container: keep collecting.
                continue
            else:
                self.flush()
        obj = self.results.pop(0)
        try:
            log.debug("nextobject: %r", obj)
        except Exception:
            log.debug("nextobject: (unprintable object)")
        return obj
def rldecode(data: bytes) -> bytes:
    """RunLength decoder (Adobe version) implementation based on PDF Reference
    version 1.4 section 3.3.4:
        The RunLengthDecode filter decodes data that has been encoded in a
        simple byte-oriented format based on run length. The encoded data
        is a sequence of runs, where each run consists of a length byte
        followed by 1 to 128 bytes of data. If the length byte is in the
        range 0 to 127, the following length + 1 (1 to 128) bytes are
        copied literally during decompression. If length is in the range
        129 to 255, the following single byte is to be copied 257 - length
        (2 to 128) times during decompression. A length value of 128
        denotes EOD.

    Truncated input is decoded as far as possible: a literal run that is cut
    short yields the bytes that are present, and a repeat run whose data
    byte is missing ends decoding.
    """
    decoded = bytearray()
    size = len(data)
    pos = 0
    while pos < size:
        length = data[pos]
        pos += 1
        if length == 128:
            break
        if length < 128:
            # Literal run; slicing tolerates a run that is cut short.
            count = length + 1
            decoded += data[pos : pos + count]
            pos += count
        else:
            if pos >= size:
                # Repeat run without its data byte: nothing left to decode.
                break
            decoded += data[pos : pos + 1] * (257 - length)
            pos += 1
    return bytes(decoded)


class open_filename:
    """Context manager that allows opening a filename
    (str or pathlib.PurePath type is supported) and closes it on exit,
    (just like `open`), but does nothing for file-like objects.
    """

    def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any) -> None:
        """Open ``filename`` (extra arguments go to ``open``) or adopt a stream.

        :raises PDFTypeError: if ``filename`` is not a path, a ``str`` or an
            ``io.IOBase`` instance.
        """
        if isinstance(filename, pathlib.PurePath):
            filename = str(filename)
        if isinstance(filename, str):
            self.file_handler: AnyIO = open(filename, *args, **kwargs)
            # We opened it, so we are responsible for closing it.
            self.closing = True
        elif isinstance(filename, io.IOBase):
            self.file_handler = cast(AnyIO, filename)
            self.closing = False
        else:
            raise PDFTypeError("Unsupported input type: %s" % type(filename))

    def __enter__(self) -> AnyIO:
        return self.file_handler

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        if self.closing:
            self.file_handler.close()


def make_compat_bytes(in_str: str) -> bytes:
    """Converts to bytes, encoding to unicode."""
    assert isinstance(in_str, str), str(type(in_str))
    return in_str.encode()


def make_compat_str(o: object) -> str:
    """Converts everything to string, if bytes guessing the encoding.

    Falls back to ``str(o)`` when no encoding can be detected or the
    detected encoding cannot decode the data.
    """
    if not isinstance(o, bytes):
        return str(o)
    encoding = charset_normalizer.detect(o)["encoding"]
    if encoding is None:
        # Detection can fail (e.g. for empty input); bytes.decode(None)
        # would raise TypeError.
        return str(o)
    try:
        return o.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        return str(o)


def shorten_str(s: str, size: int) -> str:
    """Shorten ``s`` to roughly ``size`` characters.

    For ``size`` below 7 the string is simply cut.  Otherwise a string
    longer than ``size`` is reduced to its head and tail joined by
    ``" ... "``; shorter strings are returned unchanged.
    """
    if size < 7:
        return s[:size]
    if len(s) > size:
        length = (size - 5) // 2
        return f"{s[:length]} ... {s[-length:]}"
    return s


def compatible_encode_method(
    bytesorstring: bytes | str,
    encoding: str = "utf-8",
    erraction: str = "ignore",
) -> str:
    """When Py2 str.encode is called, it often means bytes.encode in Py3.

    This does either: ``str`` input is returned unchanged, ``bytes`` input
    is decoded with ``encoding`` using the ``erraction`` error handler.
    """
    if isinstance(bytesorstring, str):
        return bytesorstring
    assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
    return bytesorstring.decode(encoding, erraction)


def paeth_predictor(left: int, above: int, upper_left: int) -> int:
    """Return the PNG Paeth predictor for the three neighbouring bytes.

    From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
    """
    # Initial estimate
    p = left + above - upper_left
    # Distances to a,b,c
    pa = abs(p - left)
    pb = abs(p - above)
    pc = abs(p - upper_left)

    # Return nearest of a,b,c breaking ties in order a,b,c
    if pa <= pb and pa <= pc:
        return left
    elif pb <= pc:
        return above
    else:
        return upper_left
for up_x, prior_x in zip(line_encoded, line_above, strict=False): raw_x = (up_x + prior_x) & 255 raw.append(raw_x) elif filter_type == 3: # Filter type 3: Average # To reverse the effect of the Average() filter after # decompression, output the following value: # Raw(x) = Average(x) + floor((Raw(x-bpp)+Prior(x))/2) # where the result is computed mod 256, but the prediction is # calculated in the same way as for encoding. Raw() refers to the # bytes already decoded, and Prior() refers to the decoded bytes of # the prior scanline. for j, average_x in enumerate(line_encoded): if j - bpp < 0: raw_x_bpp = 0 else: raw_x_bpp = int(raw[j - bpp]) prior_x = int(line_above[j]) raw_x = (average_x + (raw_x_bpp + prior_x) // 2) & 255 raw.append(raw_x) elif filter_type == 4: # Filter type 4: Paeth # To reverse the effect of the Paeth() filter after decompression, # output the following value: # Raw(x) = Paeth(x) # + PaethPredictor(Raw(x-bpp), Prior(x), Prior(x-bpp)) # (computed mod 256), where Raw() and Prior() refer to bytes # already decoded. Exactly the same PaethPredictor() function is # used by both encoder and decoder. 
def parse_rect(o: Any) -> Rect:
    """Convert a four-element sequence into a tuple of four floats.

    :raises PDFValueError: if ``o`` does not unpack into exactly four items
        or an item cannot be converted (``ValueError``).
    """
    try:
        (x0, y0, x1, y1) = o
        return float(x0), float(y0), float(x1), float(y1)
    except ValueError as err:
        # Keep the original failure attached for easier debugging.
        raise PDFValueError("Could not parse rectangle") from err


def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
    """Returns the multiplication of two matrices."""
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (
        a0 * a1 + c0 * b1,
        b0 * a1 + d0 * b1,
        a0 * c1 + c0 * d1,
        b0 * c1 + d0 * d1,
        a0 * e1 + c0 * f1 + e0,
        b0 * e1 + d0 * f1 + f0,
    )


def translate_matrix(m: Matrix, v: Point) -> Matrix:
    """Translates a matrix by (x, y)."""
    (a, b, c, d, e, f) = m
    (x, y) = v
    return a, b, c, d, x * a + y * c + e, x * b + y * d + f


def apply_matrix_pt(m: Matrix, v: Point) -> Point:
    """Applies a matrix to a point."""
    (a, b, c, d, e, f) = m
    (x, y) = v
    return a * x + c * y + e, b * x + d * y + f


def apply_matrix_norm(m: Matrix, v: Point) -> Point:
    """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
    (a, b, c, d, e, f) = m
    (p, q) = v
    return a * p + c * q, b * p + d * q


# Utility functions


def isnumber(x: object) -> bool:
    """Return True if ``x`` is an ``int`` or ``float`` instance."""
    return isinstance(x, (int, float))
def uniq(objs: Iterable[_T]) -> Iterator[_T]:
    """Yield each distinct element once, in first-seen order."""
    seen = set()
    for item in objs:
        if item not in seen:
            seen.add(item)
            yield item


def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T]) -> tuple[list[_T], list[_T]]:
    """Partition ``objs`` into (matching, non-matching) lists by ``pred``."""
    matching: list = []
    rest: list = []
    for item in objs:
        target = matching if pred(item) else rest
        target.append(item)
    return matching, rest


def drange(v0: float, v1: float, d: int) -> range:
    """Return the range of grid indices of size ``d`` spanning ``v0``..``v1``."""
    first = int(v0) // d
    stop = int(v1 + d) // d
    return range(first, stop)


def get_bound(pts: Iterable[Point]) -> Rect:
    """Return the smallest rectangle (x0, y0, x1, y1) covering all points.

    With no points the result is ``(INF, INF, -INF, -INF)``.
    """
    x0 = y0 = INF
    x1 = y1 = -INF
    for px, py in pts:
        if px < x0:
            x0 = px
        if py < y0:
            y0 = py
        if px > x1:
            x1 = px
        if py > y1:
            y1 = py
    return x0, y0, x1, y1


def pick(
    seq: Iterable[_T],
    func: Callable[[_T], float],
    maxobj: _T | None = None,
) -> _T | None:
    """Return the element with the highest ``func`` score.

    The first element wins ties; ``maxobj`` is returned for an empty ``seq``.
    """
    best_score = None
    for candidate in seq:
        score = func(candidate)
        if best_score is not None and not best_score < score:
            continue
        best_score = score
        maxobj = candidate
    return maxobj


def choplist(n: int, seq: Iterable[_T]) -> Iterator[tuple[_T, ...]]:
    """Yield consecutive ``n``-tuples of ``seq``; an incomplete tail is dropped."""
    # n references to one shared iterator make zip pull n items per tuple.
    shared = [iter(seq)] * n
    yield from zip(*shared)


def nunpack(s: bytes, default: int = 0) -> int:
    """Decode a big-endian unsigned integer; empty input gives ``default``."""
    if len(s) == 0:
        return default
    return int.from_bytes(s, byteorder="big", signed=False)
def decode_text(s: bytes) -> str:
    """Decode a PDF text string to Unicode.

    Input starting with the bytes FE FF is decoded as UTF-16BE (ignoring
    errors); anything else is mapped byte by byte through PDFDocEncoding.
    """
    if not s.startswith(b"\xfe\xff"):
        return "".join(PDFDocEncoding[byte] for byte in s)
    return str(s[2:], "utf-16be", "ignore")


def enc(x: str) -> str:
    """Escape a string for SGML/XML/HTML; bytes input yields ``""``."""
    return "" if isinstance(x, bytes) else escape(x)


def bbox2str(bbox: Rect) -> str:
    """Format a rectangle as four comma-separated values with 3 decimals."""
    left, bottom, right, top = bbox
    return f"{left:.3f},{bottom:.3f},{right:.3f},{top:.3f}"


def matrix2str(m: Matrix) -> str:
    """Format a matrix as ``[a,b,c,d, (e,f)]`` with 2 decimals."""
    a, b, c, d, e, f = m
    return f"[{a:.2f},{b:.2f},{c:.2f},{d:.2f}, ({e:.2f},{f:.2f})]"


def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
    """A distance function between two TextBoxes.

    Consider the bounding rectangle for obj1 and obj2.
    Return the vector between the two boxes' boundaries if they don't
    overlap, otherwise the vector between the boxes' centers.

             +------+..........+ (x1, y1)
             | obj1 |          :
             +------+www+------+
             :          | obj2 |
    (x0, y0) +..........+------+
    """
    x0 = min(obj1.x0, obj2.x0)
    y0 = min(obj1.y0, obj2.y0)
    x1 = max(obj1.x1, obj2.x1)
    y1 = max(obj1.y1, obj2.y1)
    outer_w = x1 - x0
    outer_h = y1 - y0
    # Gap left in the enclosing rectangle once both boxes are removed.
    gap_w = outer_w - obj1.width - obj2.width
    gap_h = outer_h - obj1.height - obj2.height
    if not (gap_w < 0 and gap_h < 0):
        return max(0, gap_w), max(0, gap_h)
    # Overlap on both axes: fall back to the center-to-center vector.
    cx1 = (obj1.x0 + obj1.x1) / 2
    cy1 = (obj1.y0 + obj1.y1) / 2
    cx2 = (obj2.x0 + obj2.x1) / 2
    cy2 = (obj2.y0 + obj2.y1) / 2
    return cx1 - cx2, cy1 - cy2
LTComponentT = TypeVar("LTComponentT", bound="LTComponent")


class Plane(Generic[LTComponentT]):
    """A set-like data structure for objects placed on a plane.

    Can efficiently find objects in a certain rectangular area.
    Objects are bucketed into square grid cells of ``gridsize`` units;
    a lookup only inspects the cells touched by the query rectangle.
    Iteration yields the objects in insertion order.
    """

    def __init__(self, bbox: Rect, gridsize: int = 50) -> None:
        """Create an empty plane covering ``bbox`` = (x0, y0, x1, y1)."""
        self._seq: list[LTComponentT] = []  # preserve the object order.
        self._objs: set[LTComponentT] = set()
        self._grid: dict[Point, list[LTComponentT]] = {}
        self.gridsize = gridsize
        (self.x0, self.y0, self.x1, self.y1) = bbox

    def __repr__(self) -> str:
        # The format string was empty here, so repr() was always "".
        return f"<Plane objs={list(self)!r}>"

    def __iter__(self) -> Iterator[LTComponentT]:
        # _seq keeps removed objects too; _objs is the live membership.
        return (obj for obj in self._seq if obj in self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def __contains__(self, obj: object) -> bool:
        return obj in self._objs

    def _getrange(self, bbox: Rect) -> Iterator[Point]:
        """Yield the grid cells touched by ``bbox``, clipped to the plane."""
        (x0, y0, x1, y1) = bbox
        if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0:
            # Entirely outside the plane: no cells.
            return
        x0 = max(self.x0, x0)
        y0 = max(self.y0, y0)
        x1 = min(self.x1, x1)
        y1 = min(self.y1, y1)
        for grid_y in drange(y0, y1, self.gridsize):
            for grid_x in drange(x0, x1, self.gridsize):
                yield (grid_x, grid_y)

    def extend(self, objs: Iterable[LTComponentT]) -> None:
        """Place every object of ``objs``."""
        for obj in objs:
            self.add(obj)

    def add(self, obj: LTComponentT) -> None:
        """Place an object."""
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            self._grid.setdefault(k, []).append(obj)
        self._seq.append(obj)
        self._objs.add(obj)

    def remove(self, obj: LTComponentT) -> None:
        """Displace an object.

        :raises KeyError: if ``obj`` is not in the plane.
        """
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            try:
                self._grid[k].remove(obj)
            except (KeyError, ValueError):
                # Cell missing or object not registered there: nothing to do.
                pass
        self._objs.remove(obj)

    def find(self, bbox: Rect) -> Iterator[LTComponentT]:
        """Finds objects that are in a certain area."""
        (x0, y0, x1, y1) = bbox
        done = set()
        for k in self._getrange(bbox):
            if k not in self._grid:
                continue
            for obj in self._grid[k]:
                if obj in done:
                    # An object spanning several cells is reported once.
                    continue
                done.add(obj)
                if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
                    continue
                yield obj
ROMAN_ONES = ["i", "x", "c", "m"]
ROMAN_FIVES = ["v", "l", "d"]


def format_int_roman(value: int) -> str:
    """Format a number as lowercase Roman numerals."""
    assert 0 < value < 4000
    numerals = ""
    place = 0  # 0 = units, 1 = tens, 2 = hundreds, 3 = thousands
    while value != 0:
        value, digit = divmod(value, 10)
        one = ROMAN_ONES[place]
        if digit == 9:
            chunk = one + ROMAN_ONES[place + 1]
        elif digit == 4:
            chunk = one + ROMAN_FIVES[place]
        elif digit >= 5:
            chunk = ROMAN_FIVES[place] + one * (digit - 5)
        else:
            chunk = one * digit
        # Digits are produced least-significant first, so prepend.
        numerals = chunk + numerals
        place += 1
    return numerals


def format_int_alpha(value: int) -> str:
    """Format a number as lowercase letters a-z, aa-zz, etc."""
    assert value > 0
    alphabet = string.ascii_lowercase
    letters = ""
    while value != 0:
        # Bijective base-26: shift by one so that 1 -> "a" and 26 -> "z".
        value, offset = divmod(value - 1, len(alphabet))
        letters = alphabet[offset] + letters
    return letters
self.report_interval = report_interval logger.debug(f"report_interval: {self.report_interval}") self.last_report_time = 0 self.finish_stage_count = 0 self.finish_event = finish_event self.cancel_event = cancel_event self.loop = loop self.disable = False if finish_event and not loop: raise ValueError("finish_event requires a loop") if self.progress_change_callback: self.progress_change_callback( type="stage_summary", stages=[ { "name": name, "percent": self.stage[name].weight, } for name, _ in stages ], part_index=self.part_index, total_parts=self.total_parts, ) def create_part_monitor( self, part_index: int, total_parts: int ) -> "ProgressMonitor": """Create a new progress monitor for a document part""" return ProgressMonitor( stages=self.raw_stages, progress_change_callback=self._handle_part_progress, finish_callback=self._handle_part_finish, report_interval=self.report_interval, cancel_event=self.cancel_event, loop=self.loop, parent_monitor=self, part_index=part_index, total_parts=total_parts, ) def _handle_part_progress(self, **kwargs): """Handle progress updates from part monitors""" if self.progress_change_callback and not self.disable: # Add part information to progress update kwargs["part_index"] = kwargs.get("part_index") kwargs["total_parts"] = kwargs.get("total_parts") self.progress_change_callback(**kwargs) def _handle_part_finish(self, **kwargs): """Handle completion of a part translation""" if kwargs["type"] == "error": logger.info(f"progress_monitor handle_part_finish: {kwargs['error']}") self.finish_callback(type="error", error=kwargs["error"]) return if "translate_result" in kwargs: part_index = kwargs.get("part_index") if part_index is not None: self.part_results[part_index] = kwargs["translate_result"] # if self.finish_callback and not self.disable: # self.finish_callback(**kwargs) def stage_start(self, stage_name: str, total: int): if self.disable or self.parent_monitor and self.parent_monitor.disable: return DummyTranslationStage(stage_name, 
total, self, 0) stage = self.stage[stage_name] stage.run_time += 1 stage.name = stage_name stage.display_name = f"{stage_name}" if stage.run_time > 1 else stage_name stage.current = 0 stage.total = total if self.progress_change_callback: self.progress_change_callback( type="progress_start", stage=stage.display_name, stage_progress=0.0, stage_current=0, stage_total=total, overall_progress=self.calculate_current_progress(), part_index=self.part_index + 1, total_parts=self.total_parts, ) self.last_report_time = 0.0 return stage def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): logger.debug("ProgressMonitor __exit__") def on_finish(self): if self.disable or self.parent_monitor and self.parent_monitor.disable: return if self.cancel_event: self.cancel_event.set() if self.finish_event and self.loop: self.loop.call_soon_threadsafe(self.finish_event.set) if self.cancel_event and self.cancel_event.is_set(): self.finish_callback(type="error", error=CancelledError) def stage_done(self, stage): if self.disable or self.parent_monitor and self.parent_monitor.disable: return self.last_report_time = 0.0 self.finish_stage_count += 1 if ( stage.current != stage.total and self.cancel_event is not None and not self.cancel_event.is_set() ): logger.warning( f"Stage {stage.name} completed with {stage.current}/{stage.total} items", ) return if self.progress_change_callback: self.progress_change_callback( type="progress_end", stage=stage.display_name, stage_progress=100.0, stage_current=stage.total, stage_total=stage.total, overall_progress=self.calculate_current_progress(), part_index=self.part_index + 1, total_parts=self.total_parts, ) def calculate_current_progress(self, stage=None): if self.disable or self.parent_monitor and self.parent_monitor.disable: return 100 part_weight = 1 / self.total_parts if self.parent_monitor: part_offset = self.part_index * part_weight else: part_offset = len(self.part_results) * part_weight part_offset *= 100 progress = 
self._calculate_current_progress(stage) * part_weight + part_offset return progress def _calculate_current_progress(self, stage=None): """Calculate overall progress including part progress""" # Count completed stages completed_stages = sum( 1 for s in self.stage.values() if s.run_time > 0 and s.current == s.total ) # If all stages are complete, return exactly 100 if completed_stages == len(self.stage): return 100 # Calculate progress based on weights progress = sum( s.weight * 100 for s in self.stage.values() if s.run_time > 0 and s.current == s.total ) if stage is not None and 0 < stage.total != stage.current: progress += stage.weight * stage.current * 100 / stage.total # If this is a part monitor (has parent_monitor), return the progress as is if hasattr(self, "parent_monitor") and self.parent_monitor: return progress # Otherwise return the standard progress return progress def stage_update(self, stage, n: int): if self.disable or self.parent_monitor and self.parent_monitor.disable: return report_time_delta = time.time() - self.last_report_time if report_time_delta < self.report_interval and stage.total > 3: return if self.progress_change_callback: if stage.total != 0: stage_progress = stage.current * 100 / stage.total else: stage_progress = 100 self.progress_change_callback( type="progress_update", stage=stage.display_name, stage_progress=stage_progress, stage_current=stage.current, stage_total=stage.total, overall_progress=self.calculate_current_progress(stage), part_index=self.part_index + 1, total_parts=self.total_parts, ) self.last_report_time = time.time() def translate_done(self, translate_result): if self.disable or self.parent_monitor and self.parent_monitor.disable: return if self.finish_callback: self.finish_callback(type="finish", translate_result=translate_result) def translate_error(self, error): if self.disable or self.parent_monitor and self.parent_monitor.disable: return if self.finish_callback: logger.info(f"progress_monitor handle 
class TranslationStage:
    """One weighted stage of a ProgressMonitor, usable as a context manager."""

    def __init__(
        self,
        name: str,
        total: int,
        pm: ProgressMonitor,
        weight: float,
        lock: threading.Lock,
    ):
        self.name = name
        self.display_name = name
        self.current = 0  # items processed so far
        self.total = total
        self.pm = pm
        self.run_time = 0  # starts at 0; incremented elsewhere per run
        self.weight = weight
        self.lock = lock

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # NOTE(review): the nesting below was reconstructed from
        # whitespace-collapsed source; confirm which statements belong
        # inside `if diff > 0` and inside the `with` block.
        with self.lock:
            diff = self.total - self.current
            if diff > 0:
                logger.info(
                    f"Stage {self.name} completed with {self.current}/{self.total} items"
                )
            self.pm.stage_update(self, diff)
            # Force the stage to read as complete before reporting it done.
            self.current = self.total
        self.pm.stage_done(self)

    def advance(self, n: int = 1):
        """Add ``n`` processed items and notify the monitor."""
        with self.lock:
            self.current += n
            self.pm.stage_update(self, n)


class DummyTranslationStage:
    """Stand-in stage with the same interface whose methods do nothing."""

    def __init__(self, name: str, total: int, pm: ProgressMonitor, weight: float):
        self.name = name
        self.display_name = name
        self.current = 0
        self.total = total
        self.pm = pm

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def advance(self, n: int = 1):
        pass
def _calc_sha3_256(path: Path) -> str:
    """Return the hex SHA3-256 digest of the file at ``path``."""
    digest = hashlib.sha3_256()
    with path.open("rb") as stream:
        # Feed the hash 1 MiB at a time so large files are not loaded whole.
        while chunk := stream.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()


def main() -> None:
    """Write ``cmap_metadata.json`` for a BabelDOC-Assets checkout.

    Takes the repository path as the single command-line argument, records
    file name, SHA3-256 and size for every ``*.json`` file below ``cmap/``,
    prints the resulting JSON and writes it to the repository root.
    """
    logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])

    arg_parser = argparse.ArgumentParser(description="Generate cmap metadata.")
    arg_parser.add_argument(
        "assets_repo_path",
        type=str,
        help="Path to the BabelDOC-Assets repository.",
    )
    repo_path = Path(arg_parser.parse_args().assets_repo_path)

    assert repo_path.exists(), f"Assets repo path {repo_path} does not exist."
    assert (repo_path / "README.md").exists(), (
        f"Assets repo path {repo_path} does not contain a README.md file."
    )
    assert (repo_path / "cmap").exists(), (
        f"Assets repo path {repo_path} does not contain a cmap folder."
    )

    logger.info(f"Getting cmap metadata for {repo_path}")

    metadatas: dict[str, dict[str, object]] = {}
    for cmap_path in sorted((repo_path / "cmap").glob("**/*.json")):
        if cmap_path.is_file():
            logger.info(f"Getting cmap metadata for {cmap_path}")
            digest = _calc_sha3_256(cmap_path)
            metadatas[cmap_path.name] = {
                "file_name": cmap_path.name,
                "sha3_256": digest,
                "size": cmap_path.stat().st_size,
            }

    dump_options = (
        orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS
    )
    metadatas_json = orjson.dumps(metadatas, option=dump_options).decode()
    print(f"CMAP METADATA: {metadatas_json}")
    with (repo_path / "cmap_metadata.json").open("w") as out:
        out.write(metadatas_json)


if __name__ == "__main__":
    main()
def main():
    """Generate ``font_metadata.json`` inside a BabelDOC-Assets checkout.

    The repository path is read from the command line. For every ``*.ttf``
    file found recursively below ``<repo>/fonts`` the SHA3-256 digest and file
    size are combined with the properties returned by ``get_font_metadata``.
    The mapping (keyed by file name) is printed and written to
    ``<repo>/font_metadata.json``.

    Exits via ``parser.error`` (status 2) when the given path does not exist
    or lacks the expected ``README.md`` file or ``fonts`` folder.
    """
    logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])

    parser = argparse.ArgumentParser(description="Get font metadata.")
    parser.add_argument(
        "assets_repo_path",
        type=str,
        help="Path to the BabelDOC-Assets repository.",
    )
    args = parser.parse_args()

    repo_path = Path(args.assets_repo_path)
    fonts_dir = repo_path / "fonts"

    # Explicit checks instead of ``assert`` so validation survives ``python -O``.
    if not repo_path.exists():
        parser.error(f"Assets repo path {repo_path} does not exist.")
    if not (repo_path / "README.md").exists():
        parser.error(
            f"Assets repo path {repo_path} does not contain a README.md file."
        )
    if not fonts_dir.exists():
        parser.error(f"Assets repo path {repo_path} does not contain a fonts folder.")

    logger.info(f"Getting font metadata for {repo_path}")

    metadatas = {}
    for font_path in sorted(fonts_dir.glob("**/*.ttf")):
        logger.info(f"Getting font metadata for {font_path}")

        # Hash in 1 MiB chunks so large font files are never fully in memory.
        hash_ = hashlib.sha3_256()
        with font_path.open("rb") as f:
            while chunk := f.read(1024 * 1024):
                hash_.update(chunk)

        extracted_metadata = get_font_metadata(font_path)
        # A font counts as serif when its name matches one of the serif keywords.
        is_serif = re.search(serif_regex, extracted_metadata.name, re.IGNORECASE)

        metadatas[font_path.name] = {
            "file_name": font_path.name,
            "font_name": extracted_metadata.name,
            "encoding_length": extracted_metadata.encoding_length,
            "bold": extracted_metadata.bold,
            "italic": extracted_metadata.italic,
            "monospace": extracted_metadata.monospace,
            "serif": 1 if is_serif else 0,
            "ascent": extracted_metadata.ascent,
            "descent": extracted_metadata.descent,
            "sha3_256": hash_.hexdigest(),
            "size": font_path.stat().st_size,
        }

    metadatas_json = orjson.dumps(
        metadatas,
        option=orjson.OPT_APPEND_NEWLINE
        | orjson.OPT_INDENT_2
        | orjson.OPT_SORT_KEYS,
    ).decode()
    print(f"FONT METADATA: {metadatas_json}")

    # orjson emits UTF-8 and does not escape non-ASCII characters (font names
    # may contain them), so the file must not depend on the platform's
    # default text encoding.
    with (repo_path / "font_metadata.json").open("w", encoding="utf-8") as f:
        f.write(metadatas_json)
def _add_style_font(
    holder,
    page_font_map: dict[str, tuple[str, str]],
    fonts: set[tuple[str, str]],
) -> None:
    """Add the font referenced by ``holder["pdf_style"]["font_id"]`` to *fonts*, if known."""
    if not isinstance(holder, dict):
        return
    style = holder.get("pdf_style")
    # A missing, empty or ``None`` style carries no font information.
    if not style or "font_id" not in style:
        return
    font_id = style["font_id"]
    if font_id in page_font_map:
        fonts.add(page_font_map[font_id])


def extract_fonts_from_paragraph(
    paragraph: dict, page_font_map: dict[str, tuple[str, str]]
) -> set[tuple[str, str]]:
    """
    Extract all font_ids and names used in a paragraph.

    The paragraph's own ``pdf_style`` is inspected first, then every entry of
    ``pdf_paragraph_composition``. For each composition only the first
    populated kind is considered, in this order: ``pdf_character``,
    ``pdf_line``, ``pdf_formula``, ``pdf_same_style_characters``,
    ``pdf_same_style_unicode_characters``. Missing keys, ``None`` values and
    font ids absent from *page_font_map* are skipped.

    Args:
        paragraph: The paragraph dictionary
        page_font_map: Dictionary mapping font_id to (font_id, name) tuples

    Returns:
        Set of (font_id, name) tuples
    """
    fonts: set[tuple[str, str]] = set()

    # Style attached directly to the paragraph.
    _add_style_font(paragraph, page_font_map, fonts)

    for comp in paragraph.get("pdf_paragraph_composition") or []:
        if not comp:
            continue
        if comp.get("pdf_character"):
            # A single character placed directly in the composition.
            holders = [comp["pdf_character"]]
        elif comp.get("pdf_line"):
            holders = comp["pdf_line"].get("pdf_character") or []
        elif comp.get("pdf_formula"):
            holders = comp["pdf_formula"].get("pdf_character") or []
        elif comp.get("pdf_same_style_characters"):
            # The style sits on the group itself, not on individual characters.
            holders = [comp["pdf_same_style_characters"]]
        elif comp.get("pdf_same_style_unicode_characters"):
            holders = [comp["pdf_same_style_unicode_characters"]]
        else:
            holders = []

        for holder in holders:
            _add_style_font(holder, page_font_map, fonts)

    return fonts
def main():
    """Command-line entry point of the italic/formula font inspection tool.

    1. Resolve the ``il_translated.json`` file: ``--json-path`` if given,
       otherwise the latest file below the working folder (which
       ``--working-folder`` may override).
    2. Print the fonts used by paragraphs whose ``debug_id`` matches any of
       the given regular expressions.
    3. Print a table with the first occurrence of every font name used by a
       paragraph that has a ``debug_id``, together with the verdict of
       ``is_formulas_font``.

    Returns:
        0 on success, 1 when the working folder or the JSON file is missing.
    """
    # Declared up front: ``--working-folder`` rebinds the module-level folder
    # that ``find_latest_il_json`` reads.
    global WORKING_FOLDER

    parser = argparse.ArgumentParser(
        description="Extract fonts from paragraphs with matching debug_id"
    )
    parser.add_argument(
        "debug_id_regex", nargs="+", help="Regular expression to match debug_id values"
    )
    parser.add_argument(
        "--json-path",
        help="Path to il_translated.json (if not provided, will use the latest file)",
    )
    parser.add_argument(
        "--working-folder",
        help="Path to the working folder containing il_translated.json files",
    )
    args = parser.parse_args()

    if args.working_folder:
        WORKING_FOLDER = Path(args.working_folder)
        if not WORKING_FOLDER.exists():
            print(f"Error: Working folder does not exist: {WORKING_FOLDER}")
            return 1

    # Determine JSON file path
    if args.json_path:
        json_path = Path(args.json_path)
        if not json_path.exists():
            print(f"Error: File not found: {json_path}")
            return 1
    else:
        json_path = find_latest_il_json()
        if not json_path:
            print("Error: Could not find any il_translated.json file")
            return 1

    print(f"Using JSON file: {json_path}")

    # Fonts of paragraphs matching the debug_id pattern (patterns are OR-ed).
    matched_fonts = find_fonts_by_debug_id(json_path, "|".join(args.debug_id_regex))
    if matched_fonts:
        print(
            f"Found {len(matched_fonts)} fonts in paragraphs matching debug_id pattern: {args.debug_id_regex}"
        )
        print(json.dumps(matched_fonts, indent=2, ensure_ascii=False))
    else:
        print(
            f"No fonts found for paragraphs matching debug_id pattern: {args.debug_id_regex}"
        )

    # Read intermediate representation
    with json_path.open(encoding="utf-8") as f:
        pdf_data = json.load(f)

    # One row per distinct font name, recorded at its first occurrence.
    rows = []
    seen_font_names = set()
    for page_index, page in enumerate(pdf_data.get("page") or []):
        # The font map only depends on the page, so build it once per page.
        page_font_map = {
            font["font_id"]: (font["font_id"], font["name"])
            for font in page.get("pdf_font") or []
            if "font_id" in font and "name" in font
        }
        for paragraph_index, paragraph_content in enumerate(
            page.get("pdf_paragraph") or []
        ):
            font_debug_id = paragraph_content.get("debug_id")
            if not font_debug_id:
                continue
            paragraph_fonts = extract_fonts_from_paragraph(
                paragraph_content, page_font_map
            )
            for _font_id, font_name in paragraph_fonts:
                if font_name in seen_font_names:
                    continue
                seen_font_names.add(font_name)
                rows.append((page_index, font_name, paragraph_index, font_debug_id))

    # NOTE(review): this config object is never read afterwards; the call is
    # kept only because side effects of its constructor cannot be ruled out
    # from this file. Confirm and remove if it has none.
    TranslationConfig(*[None for _ in range(3)], lang_out="zh_cn", doc_layout_model=1)

    # Create table
    table = Table(title="Font Recognition Results")
    table.add_column("Page #", justify="center", style="cyan")
    table.add_column("Paragraph #", justify="center", style="cyan")
    table.add_column("DEBUG_ID", justify="center", style="cyan")
    table.add_column("Font Name", style="magenta")
    table.add_column("Recognition Result", justify="center")

    # Output results
    for page_index, font_name, paragraph_index, font_debug_id in rows:
        if is_formulas_font(font_name, None):
            verdict = "[bold red]Formula Font[/bold red]"
        else:
            verdict = "[bold blue]Non-Formula Font[/bold blue]"
        table.add_row(
            str(page_index),
            str(paragraph_index),
            str(font_debug_id),
            font_name,
            verdict,
        )

    # Print table
    Console().print(table)
    return 0
translation_config = TranslationConfig( *[None for _ in range(3)], lang_out="zh_cn", doc_layout_model=1 ) checker = StylesAndFormulas(translation_config) # Create table table = Table(title="Font Recognition Results") table.add_column("Page #", justify="center", style="cyan") table.add_column("Paragraph #", justify="center", style="cyan") table.add_column("DEBUG_ID", justify="center", style="cyan") table.add_column("Font Name", style="magenta") table.add_column("Recognition Result", justify="center") # Output results for each_font in fonts: page_index, font_name, paragraph_index, font_debug_id = each_font if checker.is_formulas_font(font_name): table.add_row( str(page_index), str(paragraph_index), str(font_debug_id), font_name, "[bold red]Formula Font[/bold red]", ) else: table.add_row( str(page_index), str(paragraph_index), str(font_debug_id), font_name, "[bold blue]Non-Formula Font[/bold blue]", ) # Print table console.print(table) ================================================ FILE: babeldoc/translator/__init__.py ================================================ ================================================ FILE: babeldoc/translator/cache.py ================================================ import json import logging import random import threading from pathlib import Path import peewee from peewee import SQL from peewee import AutoField from peewee import CharField from peewee import Model from peewee import SqliteDatabase from peewee import TextField from peewee import fn # For aggregation functions from babeldoc.const import CACHE_FOLDER logger = logging.getLogger(__name__) # we don't init the database here db = SqliteDatabase(None) # Cleanup configuration CLEAN_PROBABILITY = 0.001 # 0.1% chance to trigger cleanup MAX_CACHE_ROWS = 50_000 # Keep only the latest 50,000 rows # Thread-level mutex to ensure only one cleanup runs at a time within the process _cleanup_lock = threading.Lock() class _TranslationCache(Model): id = AutoField() translate_engine = 
CharField(max_length=20) translate_engine_params = TextField() original_text = TextField() translation = TextField() class Meta: database = db constraints = [ SQL( """ UNIQUE ( translate_engine, translate_engine_params, original_text ) ON CONFLICT REPLACE """, ), ] class TranslationCache: @staticmethod def _sort_dict_recursively(obj): if isinstance(obj, dict): return { k: TranslationCache._sort_dict_recursively(v) for k in sorted(obj.keys()) for v in [obj[k]] } elif isinstance(obj, list): return [TranslationCache._sort_dict_recursively(item) for item in obj] return obj def __init__(self, translate_engine: str, translate_engine_params: dict = None): self.translate_engine = translate_engine self.replace_params(translate_engine_params) # The program typically starts multi-threaded translation # only after cache parameters are fully configured, # so thread safety doesn't need to be considered here. def replace_params(self, params: dict = None): if params is None: params = {} self.params = params params = self._sort_dict_recursively(params) self.translate_engine_params = json.dumps(params) def update_params(self, params: dict = None): if params is None: params = {} self.params.update(params) self.replace_params(self.params) def add_params(self, k: str, v): self.params[k] = v self.replace_params(self.params) # Since peewee and the underlying sqlite are thread-safe, # get and set operations don't need locks. def get(self, original_text: str) -> str | None: try: result = _TranslationCache.get_or_none( translate_engine=self.translate_engine, translate_engine_params=self.translate_engine_params, original_text=original_text, ) # Trigger cache cleanup with a small probability. 
if result and random.random() < CLEAN_PROBABILITY: # noqa: S311 self._cleanup() return result.translation if result else None except peewee.OperationalError as e: if "database is locked" in str(e): logger.debug("Cache is locked") return None else: raise def set(self, original_text: str, translation: str): try: _TranslationCache.create( translate_engine=self.translate_engine, translate_engine_params=self.translate_engine_params, original_text=original_text, translation=translation, ) # Trigger cache cleanup with a small probability. if random.random() < CLEAN_PROBABILITY: # noqa: S311 self._cleanup() except peewee.OperationalError as e: if "database is locked" in str(e): logger.debug("Cache is locked") else: raise def _cleanup(self) -> None: """Remove old cache entries, keeping only the latest MAX_CACHE_ROWS records.""" # Quick exit if another thread is already performing cleanup. if not _cleanup_lock.acquire(blocking=False): return try: logger.info("Cleaning up translation cache...") max_id = _TranslationCache.select(fn.MAX(_TranslationCache.id)).scalar() # Nothing to do if table is empty or below threshold if not max_id or max_id <= MAX_CACHE_ROWS: return threshold = max_id - MAX_CACHE_ROWS # Delete rows with id *less than or equal* to threshold so that at most MAX_CACHE_ROWS remain. _TranslationCache.delete().where( _TranslationCache.id <= threshold ).execute() finally: _cleanup_lock.release() def init_db(remove_exists=False): CACHE_FOLDER.mkdir(parents=True, exist_ok=True) # The current version does not support database migration, so add the version number to the file name. 
class RateLimiter:
    """
    Spaces out requests so that consecutive ``wait()`` calls return at least
    ``1 / max_qps`` seconds apart.

    Callers are serialized by an internal lock, and time is measured with the
    monotonic clock so wall-clock adjustments do not affect the schedule.
    """

    @staticmethod
    def _interval_for(max_qps) -> float:
        """Validate *max_qps* and return the matching minimum interval."""
        if max_qps <= 0:
            raise ValueError("max_qps must be a positive number")
        return 1.0 / max_qps

    def __init__(self, max_qps: int):
        interval = self._interval_for(max_qps)
        self.max_qps = max_qps
        self.min_interval = interval
        self.lock = threading.Lock()
        # Use monotonic time to prevent issues with system time changes
        self.next_request_time = time.monotonic()

    def wait(self, _rate_limit_params: dict = None):
        """
        Block until the next request slot is reached, then reserve the
        following slot. The sleep happens while the lock is held.
        """
        with self.lock:
            remaining = self.next_request_time - time.monotonic()
            if remaining > 0:
                time.sleep(remaining)

            # After an idle period the schedule restarts from the current
            # time instead of the stale slot.
            earliest = time.monotonic()
            if self.next_request_time > earliest:
                earliest = self.next_request_time
            self.next_request_time = earliest + self.min_interval

    def set_max_qps(self, max_qps: int):
        """
        Change the maximum queries per second; the update is applied under
        the same lock that ``wait`` uses.
        """
        interval = self._interval_for(max_qps)
        with self.lock:
            self.max_qps = max_qps
            self.min_interval = interval
def translate(self, text, ignore_cache=False, rate_limit_params: dict = None):
    """
    Translate the text, and the other part should call this method.

    The cache is consulted first unless it is disabled for the instance or
    for this call. On a miss the global rate limiter is awaited,
    ``do_translate`` performs the translation and the result is stored in
    the cache. Cache failures on read or write are logged and ignored so
    they never discard a translation.

    :param text: text to translate
    :param ignore_cache: skip the cache lookup and store for this call
    :param rate_limit_params: forwarded unchanged to ``do_translate``
    :return: translated text
    """
    self.translate_call_count += 1
    use_cache = not (self.ignore_cache or ignore_cache)

    if use_cache:
        try:
            cache = self.cache.get(text)
            if cache is not None:
                self.translate_cache_call_count += 1
                return cache
        except Exception as e:
            logger.debug(f"try get cache failed, ignore it: {e}")

    _translate_rate_limiter.wait()
    translation = self.do_translate(text, rate_limit_params)

    if use_cache:
        # Same best-effort handling as in llm_translate: the translation is
        # already in hand, so a failed cache write must not lose it.
        try:
            self.cache.set(text, translation)
        except Exception as e:
            logger.debug(
                f"try set cache failed, ignore it: {e}, text: {text}, translation: {translation}"
            )
    return translation
@abstractmethod
def do_llm_translate(self, text, rate_limit_params: dict = None):
    """
    Hook invoked by ``llm_translate`` to perform the actual request.

    Subclasses must override it; this base version always raises.

    :param text: text to translate
    :param rate_limit_params: optional parameters passed through by the caller
    :return: translated text
    """
    raise NotImplementedError()
", ) raise NotImplementedError def __str__(self): return f"{self.name} {self.lang_in} {self.lang_out} {self.model}" def get_rich_text_left_placeholder(self, placeholder_id: int | str): return f"" def get_rich_text_right_placeholder(self, placeholder_id: int | str): return f"" def get_formular_placeholder(self, placeholder_id: int | str): return self.get_rich_text_left_placeholder(placeholder_id) class OpenAITranslator(BaseTranslator): # https://github.com/openai/openai-python name = "openai" def __init__( self, lang_in, lang_out, model, base_url=None, api_key=None, ignore_cache=False, enable_json_mode_if_requested=False, send_dashscope_header=False, send_temperature=True, reasoning=None, ): super().__init__(lang_in, lang_out, ignore_cache) self.options = {"temperature": 0} # 随机采样可能会打断公式标记 self.extra_body = {} # if 'gpt-5' in model and 'gpt-5-chat' not in model: # self.extra_body['reasoning'] = { # "effort": "minimal" # } # self.add_cache_impact_parameters("reasoning-effort", 'minimal') self.reasoning = reasoning self.client = openai.OpenAI( base_url=base_url, api_key=api_key, http_client=httpx.Client( limits=httpx.Limits( max_connections=None, max_keepalive_connections=None ), timeout=60, # Set a reasonable timeout ), ) if send_temperature: self.add_cache_impact_parameters("temperature", self.options["temperature"]) self.model = model self.enable_json_mode_if_requested = enable_json_mode_if_requested self.send_dashscope_header = send_dashscope_header self.send_temperature = send_temperature self.add_cache_impact_parameters("model", self.model) self.add_cache_impact_parameters("prompt", self.prompt("")) if self.reasoning: self.extra_body["reasoning"] = {"effort": self.reasoning} self.add_cache_impact_parameters("reasoning", self.reasoning) if self.enable_json_mode_if_requested: self.add_cache_impact_parameters( "enable_json_mode_if_requested", self.enable_json_mode_if_requested ) self.token_count = AtomicInteger() self.prompt_token_count = AtomicInteger() 
@retry(
    retry=retry_if_exception_type(openai.RateLimitError),
    stop=stop_after_attempt(100),
    wait=wait_exponential(multiplier=1, min=1, max=15),
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def do_translate(self, text, rate_limit_params: dict = None) -> str:
    """
    Send *text* through the chat-completions endpoint using ``self.prompt``.

    Token usage of the response is recorded via ``update_token_count``.
    ``openai.RateLimitError`` is retried by the decorator (up to 100
    attempts with exponential back-off); *rate_limit_params* is not used.

    :param text: text to translate
    :return: stripped content of the first returned choice
    """
    # Sampling options are forwarded only when temperature sending is enabled.
    sampling_kwargs = dict(self.options) if self.send_temperature else {}
    completion = self.client.chat.completions.create(
        model=self.model,
        **sampling_kwargs,
        messages=self.prompt(text),
        extra_body=self.extra_body,
    )
    self.update_token_count(completion)
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()
@retry(
    retry=retry_if_exception_type(openai.RateLimitError),
    stop=stop_after_attempt(100),
    wait=wait_exponential(multiplier=1, min=1, max=15),
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def do_llm_translate(self, text, rate_limit_params: dict = None):
    """
    Send *text* verbatim as a single user message and return the reply.

    :param text: complete prompt to send; ``None`` short-circuits to ``None``
    :param rate_limit_params: optional dict; a truthy ``"request_json_mode"``
        entry requests a JSON-object response when the instance was created
        with ``enable_json_mode_if_requested``
    :return: stripped content of the first returned choice, or ``None``
    :raises ContentFilterError: when the provider rejects the request with
        the content-safety message matched below
    """
    if text is None:
        return None

    options = {}
    if self.send_temperature:
        options.update(self.options)

    # ``rate_limit_params`` defaults to None, so normalise it before lookup.
    request_params = rate_limit_params or {}
    if self.enable_json_mode_if_requested and request_params.get(
        "request_json_mode", False
    ):
        options["response_format"] = {"type": "json_object"}

    extra_headers = {}
    if self.send_dashscope_header:
        extra_headers["X-DashScope-DataInspection"] = (
            '{"input": "disable", "output": "disable"}'
        )

    try:
        response = self.client.chat.completions.create(
            model=self.model,
            **options,
            max_tokens=2048,
            messages=[
                {
                    "role": "user",
                    "content": text,
                },
            ],
            extra_headers=extra_headers,
            extra_body=self.extra_body,
        )
        self.update_token_count(response)
        return response.choices[0].message.content.strip()
    except openai.BadRequestError as e:
        # This exact provider message is treated as a content-filter
        # rejection; every other bad request propagates unchanged.
        if (
            "系统检测到输入或生成内容可能包含不安全或敏感内容,请您避免输入易产生敏感内容的提示语,感谢您的配合。"
            in e.message
        ):
            raise ContentFilterError(e.message) from e
        else:
            raise
class AtomicInteger:
    """Integer counter whose reads and updates are guarded by a lock."""

    def __init__(self, value=0):
        initial = int(value)
        self._lock = threading.Lock()
        self._value = initial

    def inc(self, d=1):
        """Add ``int(d)`` to the counter and return the new value."""
        with self._lock:
            updated = self._value + int(d)
            self._value = updated
            return updated

    def dec(self, d=1):
        """Subtract *d*; implemented as an increment by ``-d``."""
        return self.inc(-d)

    @property
    def value(self):
        """Current counter value."""
        with self._lock:
            current = self._value
        return current

    @value.setter
    def value(self, v):
        with self._lock:
            self._value = int(v)
            return self._value
def _get_pss_linux(pid: int) -> int | None:
    """
    Read the PSS of *pid*, preferring the cheaper source.

    ``smaps_rollup`` is tried first. When it yields nothing, the result of
    summing the per-mapping ``smaps`` entries is returned instead, which is
    itself ``None`` when that file cannot be used either.

    Returns PSS in bytes, or None if not available.
    """
    rollup_pss = _parse_pss_from_smaps_rollup(pid)
    if rollup_pss is None:
        # Heavier fallback: walk every mapping in smaps.
        return _parse_pss_from_smaps(pid)
    return rollup_pss
Args: pid: Process ID prefer_pss: If True and on Linux, try PSS first; otherwise use RSS use_smaps_rollup_only: If True, only try smaps_rollup (faster), fallback to RSS if not available Returns: Memory usage in bytes, or None if all methods fail """ if sys.platform == "linux": if prefer_pss: if use_smaps_rollup_only: # Only try smaps_rollup, then fallback to RSS pss = _parse_pss_from_smaps_rollup(pid) if pss is not None: return pss else: # Try full PSS (smaps_rollup -> smaps) pss = _get_pss_linux(pid) if pss is not None: return pss # Fallback to RSS return _get_rss_psutil(pid) def get_memory_usage_bytes( pid: int | None = None, include_children: bool = True, prefer_pss: bool = True, ) -> int: """ Get memory usage of a process (and optionally its children). On Linux with prefer_pss=True: - Tries /proc//smaps_rollup first (lightweight) - Falls back to /proc//smaps if smaps_rollup unavailable (heavier) - Falls back to psutil RSS if smaps unavailable On non-Linux systems or prefer_pss=False: - Uses psutil RSS Args: pid: Process ID to monitor. If None, uses current process. include_children: If True, also includes memory of child processes. prefer_pss: If True on Linux, attempts to use PSS; otherwise uses RSS. Returns: Total memory usage in bytes (guaranteed non-negative). 
""" if pid is None: pid = os.getpid() total_memory = 0 # Determine if we're using smaps (heavier) vs smaps_rollup (lighter) use_smaps_rollup_only = False if sys.platform == "linux" and prefer_pss: # If we can read smaps_rollup, use rollup-only mode test_rollup = _parse_pss_from_smaps_rollup(pid) use_smaps_rollup_only = test_rollup is not None # Get current process memory memory = _get_single_process_memory( pid, prefer_pss=prefer_pss, use_smaps_rollup_only=use_smaps_rollup_only ) if memory is not None: total_memory += memory # Get children memory if requested if include_children: if psutil is None: # Cannot get children without psutil return total_memory try: parent_process = psutil.Process(pid) children = parent_process.children(recursive=True) except (psutil.NoSuchProcess, psutil.AccessDenied): # Parent process not found or no permission return total_memory for child in children: try: child_pid = child.pid child_memory = _get_single_process_memory( child_pid, prefer_pss=prefer_pss, use_smaps_rollup_only=use_smaps_rollup_only, ) if child_memory is not None: total_memory += child_memory except (psutil.NoSuchProcess, psutil.AccessDenied): # Child process died or no permission; skip it pass return max(0, total_memory) def get_memory_usage_with_throttle( pid: int | None = None, include_children: bool = True, prefer_pss: bool = True, last_pss_check_time: float | None = None, pss_throttle_seconds: float = 2.0, ) -> tuple[int, float | None]: """ Get memory usage with throttling for PSS checks on Linux. When PSS is not available via smaps_rollup and must read smaps (expensive), this throttles checks to at most once per pss_throttle_seconds. Args: pid: Process ID. If None, uses current process. include_children: If True, includes child process memory. prefer_pss: If True on Linux, attempts to use PSS. last_pss_check_time: Timestamp of last PSS check. For throttling logic. pss_throttle_seconds: Minimum interval (seconds) between smaps reads. 
Returns: Tuple of (memory_bytes, new_check_time). If throttled, returns cached estimate (0) and original check time. """ current_time = time.time() # Check if we should throttle if ( prefer_pss and sys.platform == "linux" and last_pss_check_time is not None and (current_time - last_pss_check_time) < pss_throttle_seconds ): # Throttled: use RSS only as a fast estimate memory = 0 pid_to_check = pid if pid is not None else os.getpid() rss = _get_rss_psutil(pid_to_check) if rss is not None: memory += rss if include_children and psutil is not None: try: parent_process = psutil.Process(pid_to_check) for child in parent_process.children(recursive=True): try: child_rss = _get_rss_psutil(child.pid) if child_rss is not None: memory += child_rss except (psutil.NoSuchProcess, psutil.AccessDenied): pass except (psutil.NoSuchProcess, psutil.AccessDenied): pass return memory, last_pss_check_time # Not throttled: do full check memory = get_memory_usage_bytes( pid=pid, include_children=include_children, prefer_pss=prefer_pss ) return memory, current_time ================================================ FILE: babeldoc/utils/priority_thread_pool_executor.py ================================================ # thanks to: # https://github.com/oleglpts/PriorityThreadPoolExecutor/blob/master/PriorityThreadPoolExecutor/__init__.py # https://github.com/oleglpts/PriorityThreadPoolExecutor/issues/4 import atexit import itertools import logging import queue import random import sys import threading import weakref from concurrent.futures import _base from concurrent.futures.thread import BrokenThreadPool from concurrent.futures.thread import ThreadPoolExecutor from concurrent.futures.thread import _python_exit from concurrent.futures.thread import _threads_queues from concurrent.futures.thread import _WorkItem from heapq import heappop from heapq import heappush logger = logging.getLogger(__name__) 
########################################################################################################################
#                                                   Global variables                                                   #
########################################################################################################################

NULL_ENTRY = (sys.maxsize, _WorkItem(None, None, (), {}))
_shutdown = False

########################################################################################################################
#                                             Before system exit procedure                                             #
########################################################################################################################


def python_exit():
    """Cleanup before system exit.

    Sets the module-level shutdown flag, pushes ``NULL_ENTRY`` into every
    queue registered in ``_threads_queues`` and joins the worker threads.
    """
    global _shutdown
    _shutdown = True
    items = list(_threads_queues.items())
    for _t, q in items:
        q.put(NULL_ENTRY)
    for t, _q in items:
        t.join()


# change default cleanup
atexit.unregister(_python_exit)
atexit.register(python_exit)


class PriorityQueue(queue.Queue):
    """Variant of Queue that retrieves open entries in priority order (lowest first).

    Items are put as ``(priority number, data)`` tuples and come back out of
    ``get`` as ``[priority, insertion_count, data]`` lists. Putting ``None``
    stores ``(DEFAULT_PRIORITY, None)``.
    """

    REMOVED = ""
    DEFAULT_PRIORITY = 100

    def _init(self, maxsize):
        self.queue = []  # heap of [priority, count, data] entries
        self.entry_finder = {}  # data -> its live heap entry
        self.counter = itertools.count()  # tie-breaker: FIFO within a priority

    def _qsize(self):
        # Counts entries marked REMOVED as well; they are only dropped by _get.
        return len(self.queue)

    def _put(self, item):
        try:
            # Re-putting the same data replaces its previous entry.
            if item[1] in self.entry_finder:
                self.remove(item[1])
            count = next(self.counter)
            entry = [item[0], count, item[1]]
            self.entry_finder[item[1]] = entry
            heappush(self.queue, entry)
        except TypeError:  # handle item==None
            self._put((self.DEFAULT_PRIORITY, None))

    def remove(self, task):
        """Mark the entry for ``task`` as removed.

        This simply replaces the data with the REMOVED value, which will get
        cleared out once _get reaches it.
        """
        entry = self.entry_finder.pop(task)
        entry[-1] = self.REMOVED

    def _get(self):
        """Pop the lowest-priority live entry; None if only removed ones remain."""
        while self.queue:
            entry = heappop(self.queue)
            if entry[2] is not self.REMOVED:
                del self.entry_finder[entry[2]]
                return entry
        return None


def _worker(executor_reference, work_queue, initializer, initargs):
    """Worker-thread main loop.

    Args:
        executor_reference: Weak reference to the owning executor.
        work_queue: PriorityQueue yielding ``[priority, count, work_item]``
            entries; an entry whose work item is None is a wake-up signal.
        initializer: Optional callable run once before the loop.
        initargs: Positional arguments for ``initializer``.
    """
    if initializer is not None:
        try:
            initializer(*initargs)
        except BaseException:
            _base.LOGGER.critical("Exception in initializer:", exc_info=True)
            executor = executor_reference()
            if executor is not None:
                executor._initializer_failed()
            return
    try:
        while True:
            work_item = work_queue.get(block=True)
            try:
                # PriorityQueue._get returns None when only removed entries
                # were left; treat that like a wake-up instead of crashing on
                # the subscript below.
                task = work_item[2] if work_item is not None else None
                # Delete references to object. See issue16284
                del work_item
                if task is not None:
                    task.run()
                    del task

                    # attempt to increment idle count
                    executor = executor_reference()
                    if executor is not None:
                        executor._idle_semaphore.release()
                    del executor
                    continue

                executor = executor_reference()
                # Exit if:
                #   - The interpreter is shutting down OR
                #   - The executor that owns the worker has been collected OR
                #   - The executor that owns the worker has been shutdown.
                if _shutdown or executor is None or executor._shutdown:
                    # Flag the executor as shutting down as early as possible
                    # if it is not gc-ed yet.
                    if executor is not None:
                        executor._shutdown = True
                    # Notice other workers
                    work_queue.put(None)
                    return
                del executor
            finally:
                work_queue.task_done()
    except BaseException:
        _base.LOGGER.critical("Exception in worker", exc_info=True)


class PriorityThreadPoolExecutor(ThreadPoolExecutor):
    """
    Thread pool executor with priority queue (priorities must be different, lowest first)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # change work queue type to queue.PriorityQueue
        self._work_queue: PriorityQueue = PriorityQueue()
        # Every submitted future is kept so that __del__ can log exceptions
        # nobody retrieved.
        self._all_future = []

    def submit(self, fn, *args, **kwargs):
        """
        Sending the function to the execution queue

        :param fn: function being executed
        :type fn: callable
        :param args: function's positional arguments
        :param kwargs: function's keywords arguments
        :return: future instance
        :rtype: _base.Future

        Added keyword:

        - priority: lower values run first. It is removed from ``kwargs``
          before ``fn`` is called. When omitted, a random integer in
          ``[0, sys.maxsize - 1]`` is used.
        """
        with self._shutdown_lock:
            if self._broken:
                raise BrokenThreadPool(self._broken)

            if self._shutdown:
                raise RuntimeError("cannot schedule new futures after shutdown")
            if _shutdown:
                raise RuntimeError(
                    "cannot schedule new futures after interpreter shutdown"
                )

            priority = kwargs.pop("priority", random.randint(0, sys.maxsize - 1))  # noqa: S311

            f = _base.Future()
            w = _WorkItem(f, fn, args, kwargs)

            self._work_queue.put((priority, w))
            self._adjust_thread_count()
            self._all_future.append(f)
            return f

    def _adjust_thread_count(self):
        # if idle threads are available, don't spin new threads
        if self._idle_semaphore.acquire(timeout=0):
            return

        # When the executor gets lost, the weakref callback will wake up
        # the worker threads.
        def weakref_cb(_, q=self._work_queue):
            q.put(None)

        num_threads = len(self._threads)
        if num_threads < self._max_workers:
            thread_name = f"{self._thread_name_prefix or self}_{num_threads:d}"
            t = threading.Thread(
                name=thread_name,
                target=_worker,
                args=(
                    weakref.ref(self, weakref_cb),
                    self._work_queue,
                    self._initializer,
                    self._initargs,
                ),
            )
            t.start()
            self._threads.add(t)
            _threads_queues[t] = self._work_queue

    def shutdown(self, wait=True, *, cancel_futures=False):
        """Shut the executor down.

        :param wait: if True, first block until every queued task has been
            processed, and finally join all worker threads.
        :param cancel_futures: if True, cancel the futures of work items that
            are still queued when the shutdown flag is set. (With ``wait=True``
            the queue has already been processed by then.)
        """
        logger.debug("Shutting down executor %s", self._thread_name_prefix or self)
        if wait:
            logger.debug(
                "Waiting for all tasks done %s", self._thread_name_prefix or self
            )
            self._work_queue.join()
            logger.debug("All tasks done %s", self._thread_name_prefix or self)

        with self._shutdown_lock:
            self._shutdown = True
            if cancel_futures:
                # Drain all work items from the queue, and then cancel their
                # associated futures.
                while True:
                    try:
                        entry = self._work_queue.get_nowait()
                    except queue.Empty:
                        break
                    # The queue hands back [priority, count, work_item]
                    # entries (or None), not bare work items.
                    pending = entry[2] if entry is not None else None
                    if pending is not None and pending.future is not None:
                        pending.future.cancel()

            # Send a wake-up to prevent threads calling
            # _work_queue.get(block=True) from permanently blocking.
            self._work_queue.put(None)
        if wait:
            logger.debug(
                "Waiting for all thread done %s", self._thread_name_prefix or self
            )
            for t in self._threads:
                self._work_queue.put(None)
                t.join()
        logger.debug("shutdown finish %s", self._thread_name_prefix or self)

    def __del__(self):
        # _all_future is missing if __init__ raised before assigning it.
        for f in getattr(self, "_all_future", ()):
            if f.done() and not f.cancelled():
                try:
                    f.result()
                except Exception as e:
                    logger.warning("Exception in future %s: %s", f, e, exc_info=True)


# ================================================
# FILE: docs/CODE_OF_CONDUCT.md
# ================================================
# # Contributor Covenant Code of Conduct
#
# ## Our Pledge
#
# We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
#
# We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community.
## Our Standards Examples of behavior that contributes to a positive environment for our community include: * Demonstrating empathy and kindness toward other people * Being respectful of differing opinions, viewpoints, and experiences * Giving and gracefully accepting constructive feedback * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience * Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: * The use of sexualized language or imagery, and sexual attention or advances of any kind * Trolling, insulting or derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or email address, without their explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 
## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at aw@funstory.ai . All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. 
Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. ================================================ FILE: docs/CONTRIBUTING.md ================================================ # Contributing to BabelDOC ## How to contribute to BabelDOC ### **About Language** - Issues can be in Chinese or English - PRs are limited to English - All documents are provided in English only ### **Did you find a bug?** - **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/funstory-ai/BabelDOC/issues). Please pay special attention to: 1. Known compatibility issues with pdf2zh - see [#20](https://github.com/funstory-ai/BabelDOC/issues/20) for details 2. Reported edge cases and limitations from downstream applications - see [#23](https://github.com/funstory-ai/BabelDOC/issues/23) for discussion - If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/funstory-ai/BabelDOC/issues/new?template=bug_report.md). Be sure to include a **title and clear description**, as much relevant information as possible. 
### **If you wish to request changes or new features** - Suggest your change in the [Issues](https://github.com/funstory-ai/BabelDOC/issues/new?template=feature_request.md) section. ### **If you wish to add more translators** - This project is not intended for direct end-user use, and the supported translators are mainly for debugging purposes. Unless it clearly helps with development and debugging, PRs for directly adding translators will not be accepted. - You can directly use [PDFMathTranslate](https://github.com/Byaidu/PDFMathTranslate) to get support for more translators. ### **If you want to add new accelerator support for the layout model** - This project only plans to support various accelerators through onnxruntime. Please submit your accelerator support directly to onnxruntime. - Additionally, [translation_config.py](https://github.com/funstory-ai/BabelDOC/blob/9e5be3a05c15ecae98024ba695e4a2db1412c062/babeldoc/translation_config.py#L41) shows that the layout model implementation actually used in this project is passed in from outside. You can implement a layout model class according to the relevant interface, and then pass it through this parameter at runtime. ### **If you wish to contribute to BabelDOC** > [!TIP] > > If you have any questions about the source code or related matters, please contact the maintainer at aw@funstory.ai . > > You can also raise questions in [Issues](https://github.com/funstory-ai/BabelDOC/issues). > > You can contact the maintainers in the pdf2zh discussion group. > > Due to the current high rate of code changes, this project only accepts small PRs. If you would like to suggest a change and you include a patch as a proof-of-concept, that would be great. However, please do not be offended if we rewrite your patch from scratch. > > In addition, we do not accept PRs involving the following changes: > 1. PRs that modify prompts. > 2. Adding GUI or other features directly targeting end users to this project. 
(Exceptions granted by maintainers in issues are excluded.) > 3. PRs that do not comply with this specification. > 4. Other PRs that maintainers deem inappropriate. > > **This project cannot accept all PRs. We recommend that you discuss with the maintainers via [Issue](https://github.com/funstory-ai/BabelDOC/issues) before submitting a PR.** [//]: # (> We welcome pull requests and will review your contributions.) 1. Fork this repository and clone it locally. 2. Use `doc/deploy.sh` to set up the development environment. 3. Create a new branch and make code changes on that branch. `git checkout -b feature/` 4. Perform development and ensure the code meets the requirements. 5. Commit your changes to your new branch. ``` git add . git commit -m "" ``` 6. Push to your repository: `git push origin feature/`. 7. Create a PR on GitHub and provide a detailed description. 8. Ensure all automated checks pass. #### Basic Requirements ##### Workflow 1. Please create a fork on the main branch and develop on the forked branch. - When submitting a Pull Request (PR), please provide detailed descriptions of the changes. - If the PR fails automated checks (showing checks failed and red cross marks), please review the corresponding details and modify the submission to ensure the new PR passes automated checks. 2. Development and Testing - Use the `uv run BabelDOC` command for development and testing. - When you need to print logs, please use `log.debug()`. **DO NOT USE `print()`** - Code formatting 3. Dependency Updates - If new dependencies are introduced, please update the dependency list in pyproject.toml accordingly. - It is recommended to use the `uv add` command for adding dependencies. 4. Documentation Updates - If new command-line options are added, please update the command-line options list in README.md accordingly. 5. Commit Messages - Use [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/), for example: feat(translator): add openai. 6.
Coding Style - Please ensure submitted code follows basic coding style guidelines. - Use pep8-naming. - Comments should be in English. - Follow these specific Python coding style guidelines: a. Naming Conventions: - Class names should use CapWords (PascalCase): `class TranslatorConfig` - Function and variable names should use snake_case: `def process_text()`, `word_count = 0` - Constants should be UPPER_CASE: `MAX_RETRY_COUNT = 3` - Private attributes should start with underscore: `_internal_state` b. Code Layout: - Use 4 spaces for indentation (no tabs) - Maximum line length is 88 characters (compatible with black formatter) - Add 2 blank lines before top-level classes and functions - Add 1 blank line before class methods - No trailing whitespace c. Imports: - Imports should be on separate lines: `import os\nimport sys` - Imports should be grouped in the following order: 1. Standard library imports 2. Related third party imports 3. Local application/library specific imports - Use absolute imports over relative imports d. String Formatting: - Prefer f-strings for string formatting: `f"Count: {count}"` - Use double quotes for docstrings e. Type Hints: - Use type hints for function arguments and return values - Example: `def translate_text(text: str) -> str:` f. Documentation: - All public functions and classes must have docstrings - Use Google style for docstrings - Example: ```python def function_name(arg1: str, arg2: int) -> bool: """Short description of function. Args: arg1: Description of arg1 arg2: Description of arg2 Returns: Description of return value Raises: ValueError: Description of when this error occurs """ ``` The existing codebase does not comply with the above specifications in some aspects. Contributions for modifications are welcome. #### How to modify the intermediate representation The intermediate representation is described by [il_version_1.rnc](https://github.com/funstory-ai/BabelDOC/blob/main/BabelDOC/format/pdf/document_il/il_version_1.rnc). 
Corresponding Python data classes are generated using [xsdata](https://xsdata.readthedocs.io/en/latest/). The files `il_version_1.rng`, `il_version_1.xsd`, and `il_version_1.py` are auto-generated and must not be manually modified. ##### Format RNC file ```bash trang babeldoc/format/pdf/document_il/il_version_1.rnc babeldoc/format/pdf/document_il/il_version_1.rnc ``` ##### Generate RNG, XSD and Python classes ```bash # Generate RNG from RNC trang babeldoc/format/pdf/document_il/il_version_1.rnc babeldoc/format/pdf/document_il/il_version_1.rng # Generate XSD from RNC trang babeldoc/format/pdf/document_il/il_version_1.rnc babeldoc/format/pdf/document_il/il_version_1.xsd # Generate Python classes from XSD xsdata generate babeldoc/format/pdf/document_il/il_version_1.xsd --package babeldoc.format.pdf.document_il ``` ##### Profile memory usage ```bash uv run memray run --native --aggregate babeldoc/main.py -c yadt.toml ``` ================================================ FILE: docs/CONTRIBUTOR_REWARD.md ================================================ # BabelDOC/PDFMathTranslate/OneAIFW 贡献者奖励规则 ## 月度活跃贡献者奖励规则 ### 一、资格标准 #### **贡献类型要求** - 需提交 **至少 1 个有效 PR**(Pull Request),或进行 **PR 审核、文档编写** 等贡献。 - 有效贡献定义: - 非简单的文档错别字修复 - 非简单的代码格式化调整(如仅调整缩进、空格等) - 需做出实质性贡献(如功能开发、Bug 修复、性能优化、架构调整、技术文档编写、PR 审核等) - 示例合格贡献:新增功能模块、修复逻辑错误、优化算法效率、编写技术文档等 #### **时间范围** - 每月 1 日至月末最后一天合并的 PR 计入当月统计 ### 二、申请流程 #### **申请条件** - PR 需被成功合并至以下几个仓库: 1. [funstory-ai/BabelDOC](https://github.com/funstory-ai/BabelDOC/pulls) 仓库 2. [PDFMathTranslate-next/PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next) 的主分支。 3. [guaguastandup/zotero-pdf2zh](https://github.com/guaguastandup/zotero-pdf2zh) 的主分支 4. 
[funstory-ai/aifw](https://github.com/funstory-ai/aifw) 的主分支 - 若目标为 [funstory-ai/BabelDOC](https://github.com/funstory-ai/BabelDOC/pulls) 的 PR 未被合并,但被维护者认定为有价值的概念验证,同样符合条件。 - 审核 PR、撰写 wiki 等贡献也必须针对以上所列的仓库。 - 同一贡献者每月仅可申请一次(无论提交 PR 数量) - 同一贡献者每月最多可以获得 1 个兑换码 - 对于 PR,只有发起者可以申请兑换码 - 仅可使用当月的贡献申请兑换码(特殊情况请联系 aw@funstory.ai 说明) #### **申请方式** - 发送邮件至 **aw@funstory.ai** - 邮件标题格式:`[贡献者会员兑换码申请] GitHub用户名-月份`(例:`[贡献者会员兑换码申请] awwaawwa-2024-07`) - 邮件正文需包含: - GitHub 用户名 - 合并 PR 的完整链接 - 附件要求: - PR 页面完整截图(需包含合并状态、仓库名称及点击头像后弹出来的侧边栏,如下图所示) > [!IMPORTANT] > > 不满足上述格式要求的邮件会被直接忽略! ![附件示例](https://s.immersivetranslate.com/assets/r2-uploads/images/babeldoc-contributor_reward_example.png) #### **奖励说明** - 奖励内容:[沉浸式翻译(Immersive Translate)](https://immersivetranslate.com/zh-Hans/pricing/)月度会员兑换码 - 兑换码使用:在[沉浸式翻译官网兑换页](https://immersivetranslate.com/zh-Hans/exchange)输入即可激活 - 会员权益:沉浸式翻译 Pro 会员一个月(详见[官网价格页](https://immersivetranslate.com/zh-Hans/pricing/)说明) - 兑换码为专属福利,不可转让 ### 三、审核与发放 #### **审核周期** - 我们会尽力在收到申请邮件后 1 个工作日内完成审核 - 审核时间可能因申请数量、审核复杂度等因素有所延长 - 审核通过后,兑换码将通过邮件方式发送 - 若审核未通过,我们会通过邮件说明原因 #### **兑换码规则** - 使用方式:[官网兑换页](https://immersivetranslate.com/zh-Hans/exchange)输入兑换码激活 - 权益内容:月度会员(具体权益见[官网价格页](https://immersivetranslate.com/zh-Hans/pricing/)说明) - 不可转让 ### 四、注意事项 #### **禁止行为** - 将完整功能拆分为多个无关 PR 提交 - 提交质量不合格或具有潜在危害的代码 - 提供虚假或误导性的申请材料 #### **特别说明** - funstory.ai 保留对贡献价值的评估权、规则的最终解释权等所有必要权利 - 规则如有实质性更新(格式调整等除外),将提前 1 天在 [BabelDOC GitHub PR](https://github.com/funstory-ai/BabelDOC/pulls) 公告 - 过期未使用的兑换码不予补发 - 自 2025 年 2 月 1 日起的贡献可以申请兑换码 - 为了确认您是 Pull Request (PR) 的发起者,防止他人冒领,我们可能会要求您使用发起者账号在 PR 下方留言指定的随机数字。 ## 常见问题解答(FAQ) **Q:如何判断文档翻译贡献是否有效?** A:系统性的人工翻译(如完整章节的翻译并经过人工校对)视为有效贡献。零散段落翻译或仅依赖机器翻译的内容不计入有效贡献。 **Q:兑换码过期了可以补发吗?** A:为确保公平性,过期的兑换码将不予补发,请在有效期内及时使用。 **Q:为什么这个文档是中文的?** A:因为目前应该是中文贡献者多吧,所以就先写中文的。后面再撰写英文版的。 --- **规则公示**:本规则文档存放于 BabelDOC 仓库 [CONTRIBUTOR_REWARD.md](https://github.com/funstory-ai/BabelDOC/blob/main/docs/CONTRIBUTOR_REWARD.md),并在 [Contributor Reward -
BabelDOC](https://funstory-ai.github.io/BabelDOC/CONTRIBUTOR_REWARD/) 展示。 ================================================ FILE: docs/ImplementationDetails/AsyncTranslate/AsyncTranslate.md ================================================ # Async Translation API > [!NOTE] > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: > > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) > - Community contribution (PRs welcome!) ## Overview The `yadt.high_level.async_translate` function provides an asynchronous interface for translating PDF files with real-time progress reporting. This function yields progress events that can be used to update progress bars or other UI elements. ## Usage ```python linenums="1" async def translate_with_progress(): config = TranslationConfig( input_file="example.pdf", translator=your_translator, # ... other configuration options ) try: async for event in async_translate(config): if event["type"] == "progress_update": print(f"Progress: {event['overall_progress']}%") elif event["type"] == "finish": result = event["translate_result"] print(f"Translation completed: {result.original_pdf_path}") elif event["type"] == "error": print(f"Error occurred: {event['error']}") break except asyncio.CancelledError: print("Translation was cancelled") except KeyboardInterrupt: print("Translation was interrupted") ``` ## Event Types The function yields different types of events during the translation process: ### 1. Progress Start Event Emitted when a translation stage begins: ```python { "type": "progress_start", "stage": str, # Name of the current stage "stage_progress": float, # Always 0.0 "stage_current": int, # Current progress count (0) "stage_total": int # Total items to process in this stage } ``` ### 2. 
Progress Update Event Emitted periodically during translation (controlled by report_interval, default 0.1s): ```python { "type": "progress_update", "stage": str, # Name of the current stage "stage_progress": float, # Progress percentage of current stage (0-100) "stage_current": int, # Current items processed in this stage "stage_total": int, # Total items to process in this stage "overall_progress": float # Overall translation progress (0-100) } ``` ### 3. Progress End Event Emitted when a stage completes: ```python { "type": "progress_end", "stage": str, # Name of the completed stage "stage_progress": float, # Always 100.0 "stage_current": int, # Equal to stage_total "stage_total": int, # Total items processed in this stage "overall_progress": float # Overall translation progress (0-100) } ``` ### 4. Finish Event Emitted when translation completes successfully: ```python { "type": "finish", "translate_result": TranslateResult # Contains paths to translated files and timing info } ``` ### 5. Error Event Emitted if an error occurs during translation: ```python { "type": "error", "error": str # Error message } ``` ## Translation Stages The translation process goes through the following stages in order: 1. ILCreater 2. LayoutParser 3. ParagraphFinder 4. StylesAndFormulas 5. ILTranslator 6. Typesetting 7. FontMapper 8. PDFCreater Each stage will emit its own set of progress events. ## Cancellation The translation process can be cancelled in several ways: 1. By raising a `CancelledError` (e.g., when using `asyncio.Task.cancel()`) 2. Through `KeyboardInterrupt` (e.g., when user presses Ctrl+C) 3. By calling `translation_config.cancel_translation()` method Example of programmatic cancellation: ```python linenums="1" async def translate_with_cancellation(): config = TranslationConfig( input_file="example.pdf", translator=your_translator, # ... 
other configuration options ) try: # Start translation in another task translation_task = asyncio.create_task(process_translation(config)) # Simulate some condition that requires cancellation await asyncio.sleep(5) config.cancel_translation() # This will trigger cancellation await translation_task # Wait for the task to finish except asyncio.CancelledError: print("Translation was cancelled") async def process_translation(config): async for event in async_translate(config): if event["type"] == "error": if isinstance(event["error"], asyncio.CancelledError): print("Translation was cancelled") break print(f"Error occurred: {event['error']}") break # ... handle other events ... ``` When cancelled: - The function will log the cancellation reason - All resources will be cleaned up properly - Any ongoing translation tasks will be stopped - A final error event with `CancelledError` will be emitted - The function will exit gracefully ## Error Handling Any errors during translation will be: 1. Logged with full traceback (if debug mode is enabled) 2. Reported through an error event 3. Cause the event stream to stop after the error event 4. Clean up resources properly before exiting It's recommended to handle these events appropriately in your application to provide feedback to users. The example in the Usage section shows a basic error handling pattern. ================================================ FILE: docs/ImplementationDetails/ILTranslator/ILTranslator.md ================================================ # Intermediate Layer Translator > [!NOTE] > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: > > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) > - Community contribution (PRs welcome!) ## Background After formula and style processing, we need to translate the document while preserving all formatting, formulas, and styles. 
The intermediate layer translator handles this complex task by using placeholders and style preservation techniques. ## Goal 1. Translate text while preserving document structure 2. Maintain formulas and special formatting 3. Handle rich text with different styles 4. Support concurrent translation for better performance ## Specific Implementation The translation process consists of several key steps: ### Step 1: Translation Preparation 1. Process paragraphs: - Skip vertical text - Handle single-component paragraphs directly - Process multi-component paragraphs with placeholders 2. Create placeholders: - Formula placeholders for mathematical expressions - Rich text placeholders for styled text - Ensure placeholder uniqueness within each paragraph ### Step 2: Translation Input Creation 1. Analyze paragraph components: - Regular text components - Formula components - Styled text components 2. Handle special cases: - Skip pure formula paragraphs - Preserve original text when style matches base style - Handle font mapping cases ### Step 3: Translation Execution 1. Concurrent translation: - Use thread pool for parallel processing - Control QPS (Queries Per Second) - Track translation progress 2. Translation tracking: - Record original text - Record translated text - Save tracking information for debugging ### Step 4: Translation Output Processing 1. Parse translated text: - Extract text between placeholders - Restore formulas at placeholder positions - Restore rich text with original styles 2. Create new paragraph components: - Maintain style information - Preserve formula positioning - Handle empty text segments ## Additional Features 1. Style preservation: - Maintains original text styles - Handles font size variations - Preserves formatting attributes 2. Formula handling: - Preserves formula integrity - Maintains formula positioning - Supports complex mathematical expressions 3. 
Debug support: - Translation tracking - JSON output for debugging - Detailed logging ## Limitations 1. Vertical text is not supported 2. Complex nested styles might not be perfectly preserved 3. Placeholder conflicts could occur in rare cases 4. Translation quality depends on external translation engine ## Configuration Options The translation process can be customized through `TranslationConfig`: 1. `qps`: Maximum queries per second for translation 2. `debug`: Enable/disable debug mode and tracking 3. Translation engine specific settings ================================================ FILE: docs/ImplementationDetails/PDFCreation/PDFCreation.md ================================================ # PDF Creation > [!NOTE] > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: > > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) > - Community contribution (PRs welcome!) ## Background After translation and typesetting, we need to create the final PDF document that preserves all the formatting, styles, and layout of the original document while containing the translated text. The PDF creation process handles this final step. ## Goal 1. Create a new PDF document with translated content 2. Preserve all original formatting and styles 3. Support both monolingual and dual-language output 4. Maintain font consistency and character encoding 5. Optimize the output file size and performance ## Specific Implementation The PDF creation process consists of several key steps: ### Step 1: Font Management 1. Font initialization: - Add required fonts to the document - Map font identifiers - Handle font encoding lengths 2. Font availability checking: - Check available fonts for each page - Handle XObject font requirements - Manage font resources 3. Font subsetting: - Optimize font usage - Reduce file size - Maintain character support ### Step 2: Content Rendering 1. 
Character processing: - Handle individual characters - Process character encodings - Manage character positioning 2. Graphics state handling: - Process color spaces - Handle transparency - Manage graphic state instructions 3. XObject management: - Process form XObjects - Handle drawing operations - Maintain XObject hierarchy ### Step 3: Document Assembly 1. Page construction: - Build page content - Process page resources - Handle page boundaries 2. Content stream creation: - Generate drawing operations - Handle text positioning - Manage content streams 3. Resource management: - Handle font resources - Manage XObject resources - Process graphic states ### Step 4: Output Generation 1. Monolingual output: - Create translated-only PDF - Optimize file size - Apply compression 2. Dual-language output: - Combine original and translated pages - Handle page ordering - Maintain document structure 3. File optimization: - Apply garbage collection - Enable compression - Optimize for linear reading ## Additional Features 1. Font handling: - Support for CID fonts - Font subsetting - Font resource management 2. Document optimization: - File size reduction - Performance optimization - Resource cleanup 3. Debug support: - Decompressed output - Debug information - Progress tracking ## Limitations 1. Font support: - Limited to available font formats - Font subsetting restrictions - Character encoding constraints 2. File size: - Dual-language output increases size - Font embedding impact - Resource duplication 3. Performance considerations: - Processing time for large documents - Memory usage during creation - Optimization overhead ## Configuration Options The PDF creation process can be customized through `TranslationConfig`: 1. Output options: - `no_mono`: Disable monolingual output - `no_dual`: Disable dual-language output - Output file naming patterns 2. Optimization settings: - Compression options - Garbage collection - Font subsetting 3. 
Debug options: - Debug mode - Decompressed output - Progress tracking ================================================ FILE: docs/ImplementationDetails/PDFParsing/PDFParsing.md ================================================ # PDF Parsing and Intermediate Layer Creation > [!NOTE] > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: > > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) > - Community contribution (PRs welcome!) ## Background The first step in the translation process is to parse the PDF document and create an intermediate layer (IL) representation. This step involves extracting text, styles, formulas, and layout information from the PDF while maintaining their relationships and properties. ## Goal 1. Extract text content while preserving character-level information 2. Maintain font and style information 3. Preserve document structure and layout 4. Handle special elements like XObjects and graphics 5. Create a structured intermediate representation for later processing ## Specific Implementation The parsing process consists of several key components working together: ### Step 1: PDF Interpreter (PDFPageInterpreterEx) 1. Page content processing: - Parse PDF operators and their parameters - Handle graphics state operations - Process text and font operations - Manage XObject rendering 2. Graphics filtering: - Filter non-formula lines - Handle color space operations - Process stroke and fill operations 3. XObject handling: - Process form XObjects - Handle image XObjects - Maintain XObject hierarchy ### Step 2: PDF Converter (PDFConverterEx) 1. Character processing: - Extract character information - Maintain character positions - Preserve style attributes 2. Layout management: - Handle page boundaries - Process figure elements - Manage coordinate systems 3. 
Font handling: - Map font identifiers - Process font metadata - Handle CID fonts ### Step 3: Intermediate Layer Creator (ILCreater) 1. Document structure creation: - Build page hierarchy - Create character objects - Maintain font registry 2. Resource management: - Process font resources - Handle color spaces - Manage graphic states 3. XObject tracking: - Track XObject hierarchy - Maintain XObject states - Process form content ### Step 4: High-level Coordination 1. Process management: - Initialize resources - Coordinate component interactions - Handle progress tracking 2. Resource initialization: - Set up font management - Initialize graphics resources - Prepare document structure 3. Error handling: - Handle malformed content - Manage resource errors - Provide debug information ## Additional Features 1. Font management: - Support for CID fonts - Font metadata extraction - Font mapping capabilities 2. Graphics state tracking: - Color space management - Line style preservation - Transparency handling 3. Coordinate system handling: - Support for transformations - Boundary box calculations - Position normalization 4. Debug support: - Detailed logging - Intermediate file generation - Progress tracking ## Limitations 1. Complex PDF features: - Limited support for some PDF extensions - Simplified graphics model - Basic transparency support 2. Font handling: - Limited support for some font formats - Simplified font metrics - Basic font feature support 3. Performance considerations: - Memory usage for large documents - Processing time for complex layouts - Resource management overhead ## Configuration Options The parsing process can be customized through `TranslationConfig`: 1. `debug`: Enable/disable debug mode and intermediate file generation 2. Font-related settings: - Font mapping configurations - CID font handling options 3. 
Layout processing options: - Page selection - Content filtering rules ================================================ FILE: docs/ImplementationDetails/ParagraphFinding/ParagraphFinding.md ================================================ # Paragraph Finding > [!NOTE] > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: > > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) > - Community contribution (PRs welcome!) ## Background After PDF analysis, we need to identify paragraphs from individual characters. This is a crucial step before translation and typesetting, as it helps maintain the logical structure of the document. ## Goal 1. Group characters into meaningful paragraphs while preserving the document's logical structure 2. Handle special cases like table of contents, short lines, and multi-line paragraphs 3. Maintain layout information for later typesetting ## Specific Implementation The paragraph finding process consists of four main steps: ### Step 1: Create Initial Paragraphs 1. Group characters into lines based on their spatial relationships 2. Create paragraphs based on layout information and XObject IDs 3. Characters that don't belong to text layouts are skipped ### Step 2: Process Paragraph Spacing 1. Remove completely empty lines 2. Handle trailing spaces within lines 3. Update paragraph boundary boxes and metadata ### Step 3: Calculate Line Width Statistics 1. Calculate the median width of all lines 2. This information is used for identifying potential paragraph breaks ### Step 4: Process Independent Paragraphs 1. Analyze paragraphs with multiple lines 2. Split paragraphs in two cases: - When encountering table of contents entries (identified by consecutive dots) - When finding lines significantly shorter than the median width (configurable via `short_line_split_factor`) ## Additional Features 1. 
Layout-aware processing: - Respects different layout types (plain text, title, figure caption, etc.) - Maintains layout priority order for overlapping regions 2. First line indent detection: - Automatically detects and marks paragraphs with first line indentation 3. Flexible character position detection: - Uses multiple position detection modes (middle, topleft, bottomright) - Special handling for characters with unreliable height information ## Limitations 1. The current implementation assumes left-to-right text direction 2. May not perfectly handle complex layouts with overlapping regions 3. Table of contents detection relies on consecutive dots pattern 4. Short line splitting might occasionally create incorrect paragraph breaks ## Configuration Options The paragraph finding behavior can be customized through `TranslationConfig`: 1. `split_short_lines`: Enable/disable splitting paragraphs at short lines 2. `short_line_split_factor`: Threshold factor for short line detection (relative to median width) ================================================ FILE: docs/ImplementationDetails/README.md ================================================ # Implementation Details > [!NOTE] > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: > > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) > - Community contribution (PRs welcome!) ## Core Processing Flow Main processing stages in order of actual execution and corresponding documentation: 1. [PDFParsing.md](PDFParsing/PDFParsing.md): **PDF Parsing and Intermediate Layer Creation** 2. [LayoutParser](https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/format/pdf/document_il/midend/layout_parser.py): **Layout OCR** 3. [ParagraphFinding.md](ParagraphFinding/ParagraphFinding.md): **Paragraph Recognition** 4. [StylesAndFormulas.md](StylesAndFormulas/StylesAndFormulas.md): **Style and Formula Processing** 5.
[ILTranslator.md](ILTranslator/ILTranslator.md): **Intermediate Layer Translation** 6. [Typesetting.md](Typesetting/Typesetting.md): **Typesetting Processing** 7. [FontMapper](https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/format/pdf/document_il/utils/fontmap.py): **Font Mapping** 8. [PDFCreation.md](PDFCreation/PDFCreation.md): **PDF Generation** ## API 1. [Async Translation API](AsyncTranslate/AsyncTranslate.md): **Async Translation API** > [!TIP] > > Click on document links to view detailed implementation principles and configuration options ================================================ FILE: docs/ImplementationDetails/StylesAndFormulas/StylesAndFormulas.md ================================================ # Styles and Formulas Processing > [!NOTE] > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: > > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) > - Community contribution (PRs welcome!) ## Background After paragraph finding, we need to identify formulas and text styles within each paragraph. This step is crucial for maintaining mathematical expressions and text formatting during translation. ## Goal 1. Identify and preserve mathematical formulas 2. Detect and maintain consistent text styles 3. Handle special cases like subscripts and superscripts 4. Calculate proper offsets for formula positioning ## Specific Implementation The processing consists of several main steps: ### Step 1: Formula Detection 1. Identify formula characters based on: - Formula-specific fonts - Special Unicode characters - Vertical text - Corner marks (subscripts/superscripts) 2. Group consecutive formula characters into formula units ### Step 2: Formula Processing 1. Process comma-containing formulas: - Split complex formulas at commas when appropriate - Preserve brackets and their contents - Convert simple number-only formulas to regular text 2.
Merge overlapping formulas: - Handle cases where subscripts/superscripts are detected as separate formulas - Maintain proper character ordering ### Step 3: Style Analysis 1. Calculate base style for each paragraph: - Find common style attributes across all text - Handle font variations - Process graphic states 2. Group characters with identical styles: - Font properties - Size properties - Graphic state properties ### Step 4: Position Calculation 1. Calculate formula offsets: - Compute x-offset relative to surrounding text - Compute y-offset for proper vertical alignment - Handle line spacing variations ## Additional Features 1. Font mapping: - Maps different fonts to standard ones - Special handling for formula fonts 2. Style inheritance: - Maintains style hierarchy - Handles partial style overrides 3. Formula classification: - Distinguishes between translatable and non-translatable formulas - Special handling for numeric formulas with commas ## Limitations 1. Formula detection relies on font and character patterns 2. May not handle all types of mathematical notations 3. Complex subscript/superscript combinations might be misidentified 4. Limited support for vertical formulas ## Configuration Options The formula and style processing can be customized through `TranslationConfig`: 1. `formular_font_pattern`: Regex pattern for identifying formula fonts 2. `formular_char_pattern`: Regex pattern for identifying formula characters ================================================ FILE: docs/ImplementationDetails/Typesetting/Typesetting.md ================================================ # Typography > [!NOTE] > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: > > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) > - Community contribution (PRs welcome!) ## Background After translation, text needs to be typeset before placing into PDF. 
Translated paragraphs can contain any combination of the following types: 1. PDF formulas 2. Single PDF original character 3. PDF original string with same style 4. Translated Unicode string with same style Let's discuss different cases: For the following 3 types, they can be directly transmitted transparently to new positions: 1. PDF formulas 2. Single PDF original character 3. PDF original string with same style Only "translated Unicode string with same style" needs typesetting operation, as this step loses original layout information. However, since paragraphs may contain other components that need transparent transmission, their positions may also change and need to participate in typesetting. ## Goal Try to fit all components within the original paragraph bounding box. If impossible, try to expand the bounding box in writing direction. ## Specific Implementation First perform reflow judgment to determine if the paragraph needs reflow. If all elements can be transmitted transparently, no reflow is needed. Then, if reflow is needed, execute Algorithm 1: 1. Convert all elements to typesetting unit type, which records length and width information. 2. Start from top-left of original paragraph bounding box, place elements sequentially. 3. If current line cannot fit next element, wrap to next line. 4. Repeat 2-3 until all elements are placed or exceed original bounding box. Algorithm 1 works normally when translated text is shorter than original. When translated text is longer, Algorithm 2 needs to be added: 1. Initialize element scaling factor as 1.0. 2. Initialize line spacing as 1.5. 3. Try typesetting using Algorithm 1. 4. If it cannot fit all elements: - First try to reduce line spacing by 0.1 step until reaching minimum line spacing (1.4) - If still cannot fit: - When scale > 0.6, reduce element scaling by 0.05 - When scale <= 0.6, reduce element scaling by 0.1 - Reset line spacing to 1.5 - When scale becomes less than 0.7, adjust minimum line spacing to 1.1 5. 
Report error if element scaling is less than 0.1. Algorithm 2 can fit translations of almost all languages in original position. However, for special cases like "图 1" translated to "Figure 1", even with the above algorithms some text may still overflow. So Algorithm 3 is added: 1. Before reducing scale, first try to expand the bounding box in writing direction. 2. Calculate paragraph's right whitespace by: - Using 90% of page crop box width as maximum limit - Checking for overlapping paragraphs on the right - Checking for overlapping figures on the right 3. Expand paragraph bounding box based on available whitespace. 4. If still cannot fit all elements, continue with scale reduction as in Algorithm 2. ## Additional Features 1. Mixed Chinese-English text handling: - Adds 0.5 character width spacing between Chinese and English text transitions - Excludes certain punctuation marks from this spacing rule 2. First line indent: - Adds 2 Chinese characters width indent for the first line when specified 3. Hanging punctuation: - Allows certain punctuation marks to extend beyond the right margin - Helps maintain better visual alignment ## Limitations 1. Currently, we use PDFPlumber for PDF analysis; typesetting is only implemented for paragraphs and only handles left-to-right writing. 2. Cannot handle table of contents alignment by dots. 3. Poor performance, needs optimization. 4. No global page information consideration, inconsistent text sizes. 5. No advanced typography features, poor reading experience. ## Related Resources [UTR #59: East Asian Spacing](https://www.unicode.org/reports/tr59/) specifies which characters need spacing between them.
================================================ FILE: docs/README.md ================================================ YADT Spec === ## YADT Document Intermediate Language [il_version_1.rnc](https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/format/pdf/document_il/il_version_1.rnc): The definition of the intermediate language used between PDF parsing and rendering stages. For other implementation details, please refer to [Implementation Details](ImplementationDetails/README.md). ================================================ FILE: docs/deploy.sh ================================================ #!/bin/bash set -e command_exists() { command -v "$1" >/dev/null 2>&1 } echo "check uv installed ……" if command_exists uv; then echo "uv installed !" exit 0 fi echo "uv not install, start installing ……" OS=$(uname -s) case "$OS" in Linux) if command_exists curl; then curl -LsSf https://astral.sh/uv/install.sh | sh elif command_exists wget; then wget -qO- https://astral.sh/uv/install.sh | sh else echo "curl or wget not found. uv installed failed." exit 1 fi ;; Darwin) if command_exists brew; then brew install uv else echo "Homebrew not installed, please installed uv munally. " exit 1 fi ;; *) echo "not support OS: $OS" exit 1 ;; esac if command_exists uv; then uv run babeldoc --version pre-commit install else exit 1 fi ================================================ FILE: docs/example/demo_glossary.csv ================================================ source,target,tgt_lng AutoML,自动ML,zh-CN "a,a",a,zh-CN """","""",zh-CN ================================================ FILE: docs/index.md ================================================ {!README.md!} ================================================ FILE: docs/intro-to-pdf-object.md ================================================ An Introduction to PDF Object Definitions in dpml === ## 1. Understanding PDF Structure A PDF file is fundamentally an indexed collection of objects, where each object represents a structured data unit.
The file structure consists of four main components: 1. A header 2. Object definitions 3. A cross-reference table 4. A trailer The cross-reference table serves as a lookup directory, mapping each numbered object to its byte offset location within the file. The trailer contains critical metadata, including the location of the root object (document catalog), which serves as the entry point for PDF interpretation. The file concludes with a byte offset pointing to the cross-reference table. Here's an illustrative example of a PDF file structure: ```pdf %PDF-2.0 1 0 obj << /Pages 2 0 R /Type /Catalog >> endobj 2 0 obj << /Count 1 /Kids [ 3 0 R ] /Type /Pages >> endobj 3 0 obj << /Contents 4 0 R /MediaBox [ 0 0 612 792 ] /Parent 2 0 R /Resources << /Font << /F1 5 0 R >> >> /Type /Page >> endobj 4 0 obj << /Length 44 >> stream BT /F1 24 Tf 72 720 Td (Potato) Tj ET endstream endobj 5 0 obj << /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Subtype /Type1 /Type /Font >> endobj xref 0 6 0000000000 65535 f 0000000009 00000 n 0000000062 00000 n 0000000133 00000 n 0000000277 00000 n 0000000372 00000 n trailer << /Root 1 0 R /Size 6 /ID [<42841c13bbf709d79a200fa1691836f8>] >> startxref 478 %%EOF ``` ### PDF File Interpretation When a PDF viewer processes a file, it follows these steps: 1. Starts at the file's end to locate the cross-reference table offset 2. Accesses the cross-reference table to find object locations 3. Reads the trailer dictionary to identify the document catalog 4. Uses the document catalog to access various document components: - Pages - Outlines - Thumbnails - Annotations - Other PDF elements The pages tree root is particularly crucial as it enables navigation to specific pages within the document. ### Example Interpretation Flow Let's trace through our example: 1. The cross-reference table begins at byte offset 478 (indicated after `startxref`) 2. The trailer identifies object 1 as the document catalog (`/Root 1 0 R`) 3. 
Object 1 is located at byte offset 9 4. The document catalog points to object 2 as the pages tree root 5. Object 2 is found at byte offset 62 6. The pages tree identifies object 3 as the first page 7. Object 3 is positioned at byte offset 133 8. Object 3 defines the page properties and links to object 4 for content 9. Object 4, at byte offset 277, contains the drawing instructions for rendering "Potato" This structure enables efficient random access to any part of the PDF document. ## 2. PDF Objects Earlier, we discussed PDF objects and introduced the concept of dictionaries. At the top level of a PDF file, objects are identified by two numbers followed by the keyword "obj". The first number serves as the object number, while the second—known as the generation number—is typically 0. Everything between these identifiers and the "endobj" keyword constitutes the object's body. The PDF specification provides a mechanism for modifying files by appending object updates and cross-reference table entries. When an object's contents are completely replaced (rather than modified), its generation number can be incremented. This allows object numbers to be reused while preventing old indirect references from resolving to new objects. However, such files are rare in practice, and generation numbers can generally be disregarded. Modern PDF specifications using object streams have even eliminated generation numbers entirely. PDF objects share similarities with data structures found in JSON, YAML, and modern programming languages, though PDF includes some unique object types. Here are the available PDF object types: - String: A text sequence enclosed in parentheses, e.g., (potato). Note that PDF strings typically don't support full Unicode encoding, though there are specific cases where this is possible. (A detailed discussion of character encoding is beyond our current scope.) - Number: Both integers and floating-point numbers (e.g., 12, 3.14159).
While the PDF specification distinguishes between integers and real numbers, they're often interchangeable in practice—integers can be used where real numbers are expected, and viewers typically handle real numbers appropriately when integers are required. - Boolean: Simple true/false values - Null: Represented by the keyword "null" - Name: A keyword or dictionary key identifier starting with a forward slash (/), e.g., /Type - Array: An ordered collection of objects enclosed in square brackets, with no separators between items. Arrays support nested structures, including other arrays and dictionaries. Example: `[1 (two) 3.14 false]` - Dictionary: A collection of key-value pairs where keys are Names and values can be any object type. Dictionaries are enclosed in << and >> with no separators between entries. Example: `<< /A 1 /B [2 3 << /C 4 >>] >>` - Indirect object reference: A reference to a numbered object in the file, consisting of two numbers (object and generation) followed by 'R', e.g., 1 0 R. While some objects must be direct per the PDF specification, most can be defined at the top level and referenced indirectly. - Stream: A container for binary data, structured as a dictionary (containing at least a /Length key and other format-specific entries) followed by the specified number of bytes between "stream" and "endstream" keywords. 🔍 The stream length can be specified as an indirect object, enabling single-pass PDF generation where the stream length isn't known in advance—a common practice in PDF creation. ## 3. PDF Object Definitions In dpml ### Coordinate system definition The positive x-axis extends horizontally to the right, while the positive y-axis extends vertically upward, following standard mathematical conventions. The unit length along both the x and y axes is defined as 1/72 inch (or 1 point). ## 4.
Useful Information - [PDF32000_2008](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf) page 111: Table 51 - Operator Categories ================================================ FILE: docs/requirements.txt ================================================ sphinx>=8.2.0 sphinx-click>=5.1.0 furo>=2024.1.29 myst-parser[linkify,html_meta,html_admonition]>=2.0.0 ================================================ FILE: docs/supported_languages.md ================================================ # Supported Languages For languages in the table below that do not rely on ligature support, BabelDOC provides good support. For languages that partially rely on ligatures, BabelDOC's translation results can generally meet self-reading needs. For languages that completely rely on ligatures (such as some Indian languages), BabelDOC does not currently support them. We are working hard to develop support for ligatures as soon as possible. | Language | Language Code | Ligature Dependency | |:--------------------------------|:--------------|:--------------------| | English | EN | None | | Simplified Chinese | zh-CN | None | | Traditional Chinese - Hong Kong | zh-HK | None | | Traditional Chinese - Taiwan | zh-TW | None | | Japanese | JA | None | | Korean | KO | None | | Polish | PL | Partial | | Russian | RU | None | | Spanish | es | None | | Portuguese | pt | None | | French | fr | Partial | | Malay | ms | None | | Indonesian | id | None | | Turkmen | tk | None | | Filipino (Tagalog) | tl | None | | Vietnamese | vi | None | | Kazakh (Latin) | kk | None | | German | de | None | | Dutch | nl | None | | Irish | ga | None | | Italian | it | None | | Greek | el | None | | Swedish | sv | None | | Danish | da | None | | Norwegian | no | None | | Icelandic | is | None | | Finnish | fi | None | | Ukrainian | uk | None | | Czech | cs | None | | Romanian | ro | None | | Hungarian | hu | None | | Slovak | sk | None | | Croatian | hr | None | | Estonian | et | None | | 
Latvian | lv | None | | Lithuanian | lt | None | | Belarusian | be | None | | Macedonian | mk | None | | Albanian | sq | None | | Serbian (Cyrillic) | sr | Partial | | Serbian (Latin) | sr | Partial | | Slovenian | sl | None | | Catalan | ca | None | | Bulgarian | bg | None | | Maltese | mt | None | | Swahili | sw | None | | Amharic | am | None | | Oromo | om | None | | Tigrinya | ti | None | | Haitian Creole | ht | None | | Latin | la | None | | Lao | lo | None | | Malayalam | ml | None | | Gujarati | gu | None | | Thai | th | None | | Burmese | my | Partial | | Tamil | ta | None | | Telugu | te | None | | Oriya | or | Partial | | Armenian | hy | None | | Mongolian (Cyrillic) | mn | None | | Georgian | ka | None | | Khmer | km | None | | Bosnian | bs | None | | Luxembourgish | lb | None | | Moldovan | ro | None | | Moldovan (Cyrillic) | ro | None | | Romansh | rm | None | | Turkish | tr | None | | Sinhala | si | None | | Uzbek | uz | None | | Kyrgyz | ky | None | | Tajik | tg | None | | Abkhazian | ab | None | | Afar | aa | None | | Afrikaans | af | None | | Akan | ak | None | | Aragonese | an | None | | Avaric | av | None | | Ewe | ee | None | | Aymara | ay | None | | Ojibwa | oj | None | | Occitan | oc | None | | Oriya | or | None | | Ossetian | os | None | | Pali | pi | None | | Bashkir | ba | None | | Basque | eu | None | | Breton | br | None | | Chamorro | ch | None | | Chechen | ce | None | | Chuvash | cv | None | | Tswana | tn | None | | Ndebele, South | nr | None | | Ndonga | ng | None | | Faroese | fo | None | | Fijian | fj | None | | Frisian, Western | fy | None | | Ganda | lg | None | | Kongo | kg | None | | Kalaallisut | kl | None | | Church Slavic | cu | None | | Guarani | gn | None | | Interlingua | ia | None | | Herero | hz | None | | Kikuyu | ki | None | | Rundi | rn | None | | Kinyarwanda | rw | None | | Kirghiz | ky | None | | Galician | gl | None | | Kanuri | kr | None | | Cornish | kw | None | | Komi | kv | None | | Xhosa | xh | None | | 
Corsican | co | None | | Cree | cr | None | | Croatian | hr | None | | Quechua | qu | None | | Kurdish (Latin) | ku | None | | Kuanyama | kj | None | | Limburgan | li | None | | Lingala | ln | None | | Manx | gv | None | | Malagasy | mg | None | | Marshallese | mh | None | | Maori | mi | None | | Navajo | nv | None | | Nauru | na | None | | Nyanja | ny | None | | Norwegian Nynorsk | nn | None | | Sardinian | sc | None | | Northern Sami | se | None | | Samoan | sm | None | | Sango | sg | None | | Shona | sn | None | | Esperanto | eo | None | | Scottish Gaelic | gd | None | | Somali | so | None | | Southern Sotho | st | None | | Tagalog | tl | None | | Tatar | tt | None | | Tahitian | ty | None | | Tongan | to | None | | Twi | tw | None | | Walloon | wa | None | | Welsh | cy | None | | Venda | ve | None | | Volapük | vo | None | | Interlingue | ie | None | | Hiri Motu | ho | None | | Igbo | ig | None | | Ido | io | None | | Inuktitut | iu | None | | Inupiaq | ik | None | | Sichuan Yi | ii | None | | Yoruba | yo | None | | Zhuang | za | None | | Tsonga | ts | None | | Zulu | zu | None | | Brazilian Portuguese | pt-BR | None | ================================================ FILE: mkdocs.yml ================================================ # Copyright (c) 2016-2025 Martin Donath # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to # deal in the Software without restriction, including without limitation the # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or # sell copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. # Project information site_name: BabelDOC site_url: https://squidfunk.github.io/mkdocs-material/ site_author: funstory.ai site_description: >- Write your documentation in Markdown and create a professional static site in minutes – searchable, customizable, in 60+ languages, for all devices # Repository repo_name: funstory-ai/BabelDOC repo_url: https://github.com/funstory-ai/BabelDOC edit_uri: edit/main/docs/ # Copyright copyright: Copyright © 2025 funstory.ai # Configuration theme: name: material # custom_dir: material/overrides features: - announce.dismiss - content.action.edit - content.action.view - content.code.annotate - content.code.copy - content.code.select # - content.footnote.tooltips # - content.tabs.link - content.tooltips # - header.autohide # - navigation.expand - navigation.footer - navigation.indexes # - navigation.instant # - navigation.instant.prefetch # - navigation.instant.progress # - navigation.prune - navigation.sections - navigation.tabs # - navigation.tabs.sticky - navigation.top - navigation.tracking - search.highlight - search.share - search.suggest - toc.follow # - toc.integrate palette: - media: "(prefers-color-scheme)" toggle: icon: material/brightness-auto name: Switch to light mode - media: "(prefers-color-scheme: light)" scheme: default primary: white accent: indigo toggle: icon: material/brightness-7 name: Switch to dark mode - media: "(prefers-color-scheme: dark)" scheme: slate primary: black accent: indigo toggle: icon: material/brightness-4 name: Switch to 
system preference font: text: Roboto code: Roboto Mono # favicon: assets/favicon.png favicon: images/babeldoc-small-logo-with-transparent-background.svg logo: images/babeldoc-small-logo-with-transparent-background.svg # Plugins plugins: - search: separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' - minify: minify_html: true - git-authors - git-revision-date-localized: enable_creation_date: true # Additional configuration extra: status: new: Recently added deprecated: Deprecated social: - icon: fontawesome/brands/github link: https://github.com/funstory-ai/BabelDOC - icon: fontawesome/brands/python link: https://pypi.org/project/BabelDOC/ # Extensions markdown_extensions: - github-callouts - markdown_include.include - pymdownx.highlight: anchor_linenums: true line_spans: __span pygments_lang_class: true - pymdownx.inlinehilite - pymdownx.snippets - pymdownx.superfences - def_list - pymdownx.tasklist: custom_checkbox: true not_in_nav: | /tutorials/**/*.md # Page tree nav: - Home: index.md - Supported Languages: supported_languages.md - API: - Async Translation API: ImplementationDetails/AsyncTranslate/AsyncTranslate.md - Implementation Details: - ImplementationDetails/README.md - PDF Parsing: ImplementationDetails/PDFParsing/PDFParsing.md - Layout Parser(.py): https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/format/pdf/document_il/midend/layout_parser.py - Paragraph Finding: ImplementationDetails/ParagraphFinding/ParagraphFinding.md - Styles and Formulas: ImplementationDetails/StylesAndFormulas/StylesAndFormulas.md - IL Translator: ImplementationDetails/ILTranslator/ILTranslator.md - Typesetting: ImplementationDetails/Typesetting/Typesetting.md - Font Mapper(.py): https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/format/pdf/document_il/utils/fontmap.py - PDF Creation: ImplementationDetails/PDFCreation/PDFCreation.md - Intro To PDF Object: intro-to-pdf-object.md - Community: - Code of Conduct: CODE_OF_CONDUCT.md - Contributing: - Contributing: 
CONTRIBUTING.md - Contributor Reward: CONTRIBUTOR_REWARD.md ================================================ FILE: pyproject.toml ================================================ [project] name = "BabelDOC" version = "0.5.23" description = "Yet Another Document Translator" license = "AGPL-3.0" readme = "README.md" requires-python = ">=3.10,<3.14" authors = [ { name = "awwaawwa", email = "aw@funstory.ai" } ] maintainers = [ { name = "awwaawwa", email = "aw@funstory.ai" } ] classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", ] keywords = ["PDF"] dependencies = [ "bitstring>=4.3.0", "configargparse>=1.7", "httpx[socks]>=0.27.0", "huggingface-hub>=0.27.0", "numpy>=2.0.2", "onnx>=1.18.0", "onnxruntime>=1.16.1", "openai>=1.59.3", "orjson>=3.10.14", "charset-normalizer >= 2.0.0", "cryptography >= 36.0.0", # "pdfminer-six==20250416", "peewee>=3.17.8", "psutil>=7.0.0", "pymupdf>=1.25.1", "rich>=13.9.4", "toml>=0.10.2", "tqdm>=4.67.1", "xsdata[cli,lxml,soap]>=24.12", "msgpack>=1.1.0", "pydantic>=2.10.6", "tenacity>=9.0.0", "scikit-image>=0.25.2", "freetype-py>=2.5.1", "tiktoken>=0.9.0", "Levenshtein>=0.27.1", "opencv-python-headless>=4.10.0.84", "rapidocr-onnxruntime>=1.4.4", "pyzstd>=0.17.0", "hyperscan>=0.7.13", "rtree>=1.4.0", "chardet>=5.2.0", "scipy>=1.15.3", "uharfbuzz>=0.50.2", "scikit-learn>=1.7.1", ] [project.optional-dependencies] directml = ["onnxruntime-directml>=1.16.1"] cuda = ["onnxruntime-gpu>=1.16.1"] memray = ["memray>=1.17.1"] [project.urls] Homepage = "https://github.com/funstory-ai/BabelDOC" Issues = "https://github.com/funstory-ai/BabelDOC/issues" [project.scripts] babeldoc = "babeldoc.main:cli" [build-system] requires = ["hatchling"] build-backend = "hatchling.build" [tool.flake8] ignore = ["E203", "E261", "E501", "W503", "E741", "E501"] max-line-length = 88 [tool.ruff] src = ["babeldoc"] target-version = "py310" show-fixes = true [tool.ruff.format] # Enable reformatting of code snippets in docstrings. 
docstring-code-format = true [tool.ruff.lint] ignore = [ "E203", # 冒号前的空格 "E261", # 注释前至少两个空格 "E501", # 行太长 "E741", # 变量名歧义 "F841", # 未使用的变量 "C901", # 太复杂的函数 "S101", # use assert "SIM", # flake8-simplify "ARG002", # unused argument "S110", # `try`-`except`-`pass` detected, consider logging the exception "B024", # abstract class without abstract methods "S112", # `try`-`except`-`continue` detected, consider logging the exception "COM812", # missing-trailing-comma ] select = [ "E", # pycodestyle 错误 "F", # Pyflakes "N", # PEP8 命名 "B", # flake8-bugbear "I", # isort "C", # mccabe "UP", # pyupgrade "S", # flake8-bandit "A", # flake8-builtins "COM", # flake8-commas "ARG", # flake8-unused-arguments "PTH", # 使用 pathlib ] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" [tool.ruff.lint.flake8-annotations] suppress-none-returning = true [tool.ruff.lint.isort] force-single-line = true [tool.ruff.lint.pydocstyle] convention = "google" # 设置一些规则的特定配置 [tool.ruff.lint.mccabe] max-complexity = 10 # 函数圈复杂度阈值 [tool.ruff.lint.per-file-ignores] "babeldoc/babeldoc_exception/BabelDOCException.py" = ["N999"] "babeldoc/format/pdf/pdfinterp.py" = ["N"] # 忽略命名规范 "tests/*" = ["S101"] # 在测试文件中允许 assert "**/__init__.py" = ["F401"] # 允许未使用的导入 # 忽略 S311 警告,因为这是有意的 "babeldoc/format/pdf/document_il/midend/paragraph_finder.py" = ["S311"] "docs/*" = ["A001"] "babeldoc/pdfminer/*" =["A","F", "I", "N", "S", "B", "C", "COM", "ARG", "PTH", "UP"] [dependency-groups] dev = [ "bumpver>=2024.1130", "markdown-callouts>=0.4.0", "markdown-include>=0.8.1", "mkdocs-git-authors-plugin>=0.9.2", "mkdocs-git-committers-plugin-2>=2.5.0", "mkdocs-git-revision-date-localized-plugin>=1.3.0", "mkdocs-material[recommended]>=9.6.4", "pre-commit>=4.1.0", "pygments>=2.19.1", "ruff>=0.9.2", "pytest>=8.3.4", "pylance>=0.29.0", "py-spy>=0.4.0", ] [tool.pytest.ini_options] pythonpath = [".", "src"] testpaths = ["tests"] [bumpver] current_version = "0.5.23" version_pattern = "MAJOR.MINOR.PATCH[.PYTAGNUM]" 
[bumpver.file_patterns] "pyproject.toml" = [ 'current_version = "{version}"', 'version = "{version}"' ] "babeldoc/__init__.py" = [ '__version__ = "{version}"' ] "babeldoc/main.py" = [ '__version__ = "{version}"' ] "babeldoc/const.py" = [ '__version__ = "{version}"' ] [tool.uv.sources] yadt = { path = ".", editable = true } [tool.pyright] pythonVersion = "3.10" # typeCheckingMode = "off" reportGeneralTypeIssues = false reportUnknownVariableType = false reportMissingParameterType = false reportUnknownParameterType = false ================================================ FILE: tests/test_translation_cache_cleanup.py ================================================ from concurrent.futures import ThreadPoolExecutor from babeldoc.translator.cache import TranslationCache from babeldoc.translator.cache import _TranslationCache from babeldoc.translator.cache import clean_test_db from babeldoc.translator.cache import init_test_db def _prepare_records(cache: TranslationCache, num_records: int) -> None: """Insert *num_records* unique records into the cache.""" for i in range(num_records): cache.set(f"text_{i}", f"translation_{i}") def test_cleanup_under_limit(monkeypatch): """When total rows < MAX_CACHE_ROWS, cleanup should do nothing.""" # Create an isolated test database test_db = init_test_db() try: cache = TranslationCache("dummy") # Make cleanup run every time for deterministic behaviour monkeypatch.setattr("babeldoc.translator.cache.CLEAN_PROBABILITY", 1.0) # Lower the MAX_CACHE_ROWS threshold for quick test execution monkeypatch.setattr("babeldoc.translator.cache.MAX_CACHE_ROWS", 1000) _prepare_records(cache, 900) cache.set("extra", "extra") # This triggers cleanup assert _TranslationCache.select().count() == 901 finally: clean_test_db(test_db) def test_cleanup_over_limit(monkeypatch): """When rows > MAX_CACHE_ROWS, cleanup should trim to the limit.""" test_db = init_test_db() try: cache = TranslationCache("dummy") 
monkeypatch.setattr("babeldoc.translator.cache.CLEAN_PROBABILITY", 1.0) monkeypatch.setattr("babeldoc.translator.cache.MAX_CACHE_ROWS", 500) total_records = 750 _prepare_records(cache, total_records) cache.set("extra", "extra") assert _TranslationCache.select().count() <= 500 # capped at limit finally: clean_test_db(test_db) def test_cleanup_thread_safety(monkeypatch): """Multiple threads attempting cleanup concurrently should not raise errors.""" test_db = init_test_db() try: cache = TranslationCache("dummy") monkeypatch.setattr("babeldoc.translator.cache.CLEAN_PROBABILITY", 1.0) monkeypatch.setattr("babeldoc.translator.cache.MAX_CACHE_ROWS", 500) def task(n): cache.set(f"text_{n}", f"translation_{n}") # Use a pool of threads to stress cleanup with ThreadPoolExecutor(max_workers=10) as executor: executor.map(task, range(600)) # After all threads complete, ensure table size is capped assert _TranslationCache.select().count() <= 500 finally: clean_test_db(test_db)