Repository: py-pdf/pypdf Branch: main Commit: 04b0a38f56ad Files: 207 Total size: 2.4 MB Directory structure: gitextract_mui37wu0/ ├── .git-blame-ignore-revs ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug-report.md │ │ └── feature-request.md │ ├── SECURITY.md │ ├── dependabot.yaml │ ├── scripts/ │ │ ├── check_gh_pages_updates.py │ │ ├── check_pr_title.py │ │ └── check_urls.py │ └── workflows/ │ ├── benchmark.yaml │ ├── create-github-release.yaml │ ├── gh-pages-check.yaml │ ├── github-ci.yaml │ ├── publish-to-pypi.yaml │ ├── release.yaml │ ├── title-check.yaml │ └── urls-check.yaml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── CONTRIBUTORS.md ├── LICENSE ├── Makefile ├── README.md ├── docs/ │ ├── Makefile │ ├── _static/ │ │ └── releasing.drawio │ ├── conf.py │ ├── dev/ │ │ ├── cmaps.md │ │ ├── deprecations.md │ │ ├── documentation.md │ │ ├── intro.md │ │ ├── pdf-format.md │ │ ├── pypdf-parsing.md │ │ ├── pypdf-writing.md │ │ ├── releasing.md │ │ └── testing.md │ ├── index.rst │ ├── make.bat │ ├── meta/ │ │ ├── changelog-v1.md │ │ ├── comparisons.md │ │ ├── faq.md │ │ ├── history.md │ │ ├── migration-1-to-2.md │ │ ├── project-governance.md │ │ ├── scope-of-pypdf.md │ │ └── taking-ownership.md │ ├── modules/ │ │ ├── Destination.rst │ │ ├── DocumentInformation.rst │ │ ├── Field.rst │ │ ├── Fit.rst │ │ ├── PageObject.rst │ │ ├── PageRange.rst │ │ ├── PaperSize.rst │ │ ├── PdfDocCommon.rst │ │ ├── PdfReader.rst │ │ ├── PdfWriter.rst │ │ ├── RectangleObject.rst │ │ ├── Transformation.rst │ │ ├── XmpInformation.rst │ │ ├── annotations.rst │ │ ├── constants.rst │ │ ├── errors.rst │ │ └── generic.rst │ └── user/ │ ├── add-javascript.md │ ├── add-watermark.md │ ├── adding-pdf-annotations.md │ ├── cropping-and-transforming.md │ ├── encryption-decryption.md │ ├── extract-images.md │ ├── extract-text.md │ ├── file-size.md │ ├── forms.md │ ├── handle-attachments.md │ ├── handling-outlines.md │ ├── 
installation.md │ ├── merging-pdfs.md │ ├── metadata.md │ ├── pdf-version-support.md │ ├── pdfa-compliance.md │ ├── post-processing-in-text-extraction.md │ ├── reading-pdf-annotations.md │ ├── robustness.md │ ├── security.md │ ├── streaming-data.md │ ├── suppress-warnings.md │ └── viewer-preferences.md ├── make_release.py ├── pypdf/ │ ├── __init__.py │ ├── _cmap.py │ ├── _codecs/ │ │ ├── __init__.py │ │ ├── _codecs.py │ │ ├── adobe_glyphs.py │ │ ├── core_font_metrics.py │ │ ├── pdfdoc.py │ │ ├── std.py │ │ ├── symbol.py │ │ └── zapfding.py │ ├── _crypt_providers/ │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── _cryptography.py │ │ ├── _fallback.py │ │ └── _pycryptodome.py │ ├── _doc_common.py │ ├── _encryption.py │ ├── _font.py │ ├── _page.py │ ├── _page_labels.py │ ├── _protocols.py │ ├── _reader.py │ ├── _text_extraction/ │ │ ├── __init__.py │ │ ├── _layout_mode/ │ │ │ ├── __init__.py │ │ │ ├── _fixed_width_page.py │ │ │ ├── _text_state_manager.py │ │ │ └── _text_state_params.py │ │ └── _text_extractor.py │ ├── _utils.py │ ├── _version.py │ ├── _writer.py │ ├── annotations/ │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── _markup_annotations.py │ │ └── _non_markup_annotations.py │ ├── constants.py │ ├── errors.py │ ├── filters.py │ ├── generic/ │ │ ├── __init__.py │ │ ├── _appearance_stream.py │ │ ├── _base.py │ │ ├── _data_structures.py │ │ ├── _files.py │ │ ├── _fit.py │ │ ├── _image_inline.py │ │ ├── _image_xobject.py │ │ ├── _link.py │ │ ├── _outline.py │ │ ├── _rectangle.py │ │ ├── _utils.py │ │ └── _viewerpref.py │ ├── pagerange.py │ ├── papersizes.py │ ├── py.typed │ ├── types.py │ └── xmp.py ├── pyproject.toml ├── requirements/ │ ├── ci-3.11.txt │ ├── ci.in │ ├── ci.txt │ ├── dev.in │ ├── dev.txt │ ├── docs.in │ └── docs.txt ├── resources/ │ ├── 010-pdflatex-forms.txt │ ├── AEO.1172.layout.rot180.txt │ ├── AEO.1172.layout.txt │ ├── Claim Maker Alerts Guide_pg2.layout.txt │ ├── Epic.Page.layout.txt │ ├── afm_to_dataclass.py │ ├── crazyones.txt │ ├── 
crazyones_layout_vertical_space.txt │ ├── crazyones_layout_vertical_space_font_height_weight.txt │ ├── jpeg.txt │ ├── multicolumn-lorem-ipsum.txt │ └── toy.layout.txt └── tests/ ├── __init__.py ├── bench.py ├── conftest.py ├── example_files.yaml ├── generic/ │ ├── __init__.py │ ├── test_base.py │ ├── test_data_structures.py │ ├── test_files.py │ ├── test_image_inline.py │ ├── test_image_xobject.py │ └── test_link.py ├── scripts/ │ ├── __init__.py │ ├── data/ │ │ └── commits__version_4_0_1.json │ ├── test_example_files.py │ └── test_make_release.py ├── test_annotations.py ├── test_appearance_stream.py ├── test_cmap.py ├── test_codecs.py ├── test_constants.py ├── test_doc_common.py ├── test_encryption.py ├── test_filters.py ├── test_font.py ├── test_forms.py ├── test_generic.py ├── test_images.py ├── test_javascript.py ├── test_merger.py ├── test_page.py ├── test_page_labels.py ├── test_pagerange.py ├── test_papersizes.py ├── test_pdfa.py ├── test_protocols.py ├── test_reader.py ├── test_text_extraction.py ├── test_utils.py ├── test_workflows.py ├── test_writer.py ├── test_xmp.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .git-blame-ignore-revs ================================================ # This file helps us to ignore style / formatting / doc changes # in git blame. That is useful when we're trying to find the root cause of an # error. 
# Docstring formatting a89ff74d8c0203278a039d9496a3d8df4d134f84 # STY: Apply pre-commit (black, isort) + use snake_case variables (#832) eef03d935dfeacaa75848b39082cf94d833d3174 # STY: Apply black and isort baeb7d23278de0f8d00ca9f2b656bf0674f08937 # STY: Documentation, Variable names (#839) 444fca22836df061d9d23e71ffb7d68edcdfa766 ================================================ FILE: .github/ISSUE_TEMPLATE/bug-report.md ================================================ --- name: Report a bug about: Something broke! title: '' labels: Bug assignees: '' --- Replace this: What happened? What were you trying to achieve? ## Environment Which environment were you using when you encountered the problem? ```bash $ python -m platform # TODO: Your output goes here $ python -c "import pypdf;print(pypdf._debug_versions)" # TODO: Your output goes here ``` ## Code + PDF This is a minimal, complete example that shows the issue: ```python # TODO: Your code goes here ``` Share here the PDF file(s) that cause the issue. The smaller they are, the better. Let us know if we may add them to our tests! ## Traceback This is the complete traceback I see: ``` # TODO: Your traceback goes here (if applicable) ``` ================================================ FILE: .github/ISSUE_TEMPLATE/feature-request.md ================================================ --- name: Request a Feature about: What do you think is missing in pypdf? title: '' labels: Feature Request assignees: '' --- ## Explanation Explain briefly what you want to achieve. ## Code Example How would your feature be used? (Remove this if it is not applicable.) ```python from pypdf import PdfReader, PdfWriter ... # your new feature in action! ``` ================================================ FILE: .github/SECURITY.md ================================================ # Security Policy ## Supported Versions Security fixes are applied to the latest version. 
## Reporting a Vulnerability If you find a potential security issue, please report it using the [private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) feature of GitHub to automatically inform all relevant team members. Otherwise, please get in touch with stefan6419846 through e-mail (current maintainer, address in GitHub profile). Please have a look at our [corresponding user documentation](https://pypdf.readthedocs.io/en/stable/user/security.html) as well, which includes some information about possibly invalid reports. We will try to find a fix in a timely manner and will then issue a security advisory together with the update via GitHub, as well as requesting a CVE ([example](https://github.com/py-pdf/pypdf/security/advisories/GHSA-xcjx-m2pj-8g79)). If you do not get a reaction within 30 days, please open a public issue on GitHub. ================================================ FILE: .github/dependabot.yaml ================================================ # Set update schedule for GitHub Actions version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "daily" commit-message: prefix: "DEV" ================================================ FILE: .github/scripts/check_gh_pages_updates.py ================================================ """Check that all GitHub pages JavaScript dependencies are up-to-date.""" # noqa: INP001 import base64 import hashlib import json import re import sys import urllib.request from pathlib import Path JSDELIVR_RE = re.compile( r"(https://cdn\.jsdelivr\.net/npm/" r"(?P<name>[^@/]+)@(?P<version>[^/]+)" r"/(?P<path>[^\"']+))" ) def fetch_json(url: str) -> dict: """Retrieve JSON data from the given URL.""" with urllib.request.urlopen(url, timeout=15) as resp: # noqa: S310 # Controlled input.
return json.load(resp) def fetch_bytes(url: str) -> bytes: """Retrieve bytes data from the given URL.""" with urllib.request.urlopen(url, timeout=30) as resp: # noqa: S310 # Controlled input. return resp.read() def get_latest_version(pkg: str) -> str: """Get the latest version for this package.""" data = fetch_json(f"https://registry.npmjs.org/{pkg}") return data["dist-tags"]["latest"] def sri_hash(content: bytes) -> str: """Calculate the SRI hash for the given content.""" digest = hashlib.sha384(content).digest() return "sha384-" + base64.b64encode(digest).decode("ascii") def scan_html(path: Path) -> list[re.Match[str]]: """Scan the given HTML file for external JavaScript includes.""" text = path.read_text(encoding="utf-8", errors="ignore") return list(JSDELIVR_RE.finditer(text)) def main() -> None: """Perform the checks.""" outdated_found = False for html_path in sorted(Path("gh-pages").rglob("*.html"), key=str): matches = scan_html(html_path) if not matches: continue sys.stdout.write(f"\n📄 {html_path} ...\n\n") for m in matches: pkg = m.group("name") current_version = m.group("version") full_url = m.group(1) try: latest_version = get_latest_version(pkg) except Exception as e: sys.stdout.write(f" ⚠️ {pkg}: npm lookup failed ({e})\n") continue if current_version == latest_version: sys.stdout.write(f" ✅ {pkg} {current_version}\n") continue outdated_found = True latest_url = full_url.replace( f"@{current_version}/", f"@{latest_version}/" ) try: latest_bytes = fetch_bytes(latest_url) latest_sri = sri_hash(latest_bytes) except Exception as e: sys.stdout.write(f" ⚠️ {pkg}: failed to fetch latest file ({e})\n") continue sys.stdout.write(f" ❌ {pkg}\n") sys.stdout.write(f" Current: {current_version}\n") sys.stdout.write(f" Latest: {latest_version}\n") sys.stdout.write(f" Latest SRI: {latest_sri}\n") sys.stdout.write("\n") if outdated_found: sys.stdout.write("\n❗ Outdated dependencies detected\n") sys.exit(1) sys.stdout.write("\n🎉 All CDN dependencies are up to date\n") if 
__name__ == "__main__": main() ================================================ FILE: .github/scripts/check_pr_title.py ================================================ """Check that all PR titles follow the desired scheme.""" # noqa: INP001 import os import sys KNOWN_PREFIXES = ( "SEC: ", "BUG: ", "ENH: ", "DEP: ", "PI: ", "ROB: ", "DOC: ", "TST: ", "DEV: ", "STY: ", "MAINT: ", "REL: ", # For internal use only. ) PR_TITLE = os.getenv("PR_TITLE", "") if not PR_TITLE.startswith(KNOWN_PREFIXES) or not PR_TITLE.split(": ", maxsplit=1)[1]: sys.stderr.write( f"The PR title '{PR_TITLE}' does not follow the projects naming scheme: " "https://pypdf.readthedocs.io/en/latest/dev/intro.html#commit-messages\n", ) sys.stderr.write( "If you do not know which one to choose or if multiple apply, make a best guess. " "Nobody will complain if it does not quite fit :-)\n", ) sys.exit(1) else: sys.stdout.write(f"PR title '{PR_TITLE}' appears to be valid.\n") ================================================ FILE: .github/scripts/check_urls.py ================================================ """Check that all test data URLs are still accessible.""" # noqa: INP001 import ast import sys from collections.abc import Iterator from operator import itemgetter from pathlib import Path from tests import _get_data_from_url, read_yaml_to_list_of_dicts URL_PREFIXES_TO_IGNORE = ( "http://ns.adobe.com/tiff/1.0/", "http://www.example.com", "https://example.com", "https://martin-thoma.com", "https://pypdf.readthedocs.io/", "https://www.example.com", ) PDF_URLS_WHICH_DO_NOT_LOOK_LIKE_PDFS = { "https://github.com/user-attachments/files/18381726/tika-957721.pdf", } def get_urls_from_test_files() -> Iterator[str]: """Retrieve all URLs defined the test files.""" tests_directory = Path(__file__).parent.parent.parent / "tests" for test_file in sorted(tests_directory.rglob("test_*.py")): tree = ast.parse(source=test_file.read_text(encoding="utf-8"), filename=str(test_file)) for node in ast.walk(tree): if not 
isinstance(node, ast.Constant): continue if not isinstance(node.value, str): continue if not node.value.startswith(("http://", "https://")): continue yield node.value def get_urls_from_example_files() -> Iterator[str]: """Retrieve all URLs defined in the `example_files.yaml`.""" pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent.parent.parent / "tests" / "example_files.yaml") yield from map(itemgetter("url"), pdfs) def check_url(url: str) -> bool: """Check if the given URL appears to still be valid.""" if url.startswith(URL_PREFIXES_TO_IGNORE): return True try: data = _get_data_from_url(url) except Exception as exception: sys.stderr.write(f"Error getting data from {url}: {exception}\n") return False if len(data) < 75: sys.stderr.write(f"Not enough data from {url}: {data}\n") return False if ( url.lower().endswith(".pdf") and url not in PDF_URLS_WHICH_DO_NOT_LOOK_LIKE_PDFS and not data.startswith(b"%PDF-") ): sys.stderr.write(f"The file at {url} does not look like a PDF: {data[:50]}\n") return False sys.stdout.write(f"URL {url} looks good.\n") return True def main() -> bool: """Check if there are invalid URLs.""" urls: set[str] = set() for url in get_urls_from_test_files(): urls.add(url) for url in get_urls_from_example_files(): urls.add(url) is_valid = True for url in sorted(urls): is_valid &= check_url(url) return not is_valid if __name__ == "__main__": sys.exit(main()) ================================================ FILE: .github/workflows/benchmark.yaml ================================================ name: Benchmarking pypdf on: push: branches: - main permissions: contents: write deployments: write jobs: benchmark: name: "Benchmark ${{ matrix.name }}" runs-on: ubuntu-latest strategy: matrix: python-version: ['3.x'] include: - python-version: '3.x' name: 'CPython' - python-version: 'pypy3.11' name: 'PyPy 3.11' steps: - name: Checkout Code uses: actions/checkout@v6 with: submodules: 'recursive' - name: Setup Python uses: actions/setup-python@v6 with: 
python-version: ${{ matrix.python-version }} - name: Install requirements run: | pip install -r requirements/ci-3.11.txt - name: Install pypdf run: | pip install . - name: Run benchmark run: | pytest tests/bench.py --benchmark-json output.json - name: Store benchmark result uses: benchmark-action/github-action-benchmark@v1 with: name: "${{ matrix.name }} Benchmark" tool: 'pytest' output-file-path: output.json # Use personal access token instead of GITHUB_TOKEN due to https://github.community/t/github-action-not-triggering-gh-pages-upon-push/16096 github-token: ${{ secrets.GITHUB_TOKEN }} auto-push: true # Show alert with commit comment on detecting possible performance regression alert-threshold: '200%' comment-on-alert: true fail-on-alert: true ================================================ FILE: .github/workflows/create-github-release.yaml ================================================ name: Create a GitHub release page on: push: tags: - '*.*.*' workflow_dispatch: permissions: contents: write jobs: build_and_publish: name: Create a GitHub release page runs-on: ubuntu-latest steps: - name: Checkout Repository uses: actions/checkout@v6 - name: Prepare variables id: prepare_variables run: | git fetch --tags --force latest_tag=$(git describe --tags --abbrev=0) echo "latest_tag=${latest_tag}" >> "$GITHUB_ENV" echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_ENV" EOF=$(dd if=/dev/urandom bs=15 count=1 status=none | base64) echo "tag_body<<$EOF" >> "$GITHUB_ENV" git --no-pager tag -l "${latest_tag}" --format='%(contents:body)' >> "$GITHUB_ENV" echo "$EOF" >> "$GITHUB_ENV" - name: Create GitHub Release 🚀 uses: softprops/action-gh-release@v2 with: tag_name: ${{ env.latest_tag }} name: Version ${{ env.latest_tag }}, ${{ env.date }} draft: false prerelease: false body: ${{ env.tag_body }} ================================================ FILE: .github/workflows/gh-pages-check.yaml ================================================ name: 'GitHub Pages Check' on: 
workflow_dispatch: schedule: - cron: 0 6 * * 1 jobs: url-check: name: GitHub Pages check runs-on: ubuntu-latest steps: - name: Checkout GitHub Pages uses: actions/checkout@v6 with: ref: 'gh-pages' path: 'gh-pages' - name: Checkout main (tools) uses: actions/checkout@v6 with: ref: main path: main - name: Setup Python uses: actions/setup-python@v6 with: python-version: '3.x' - name: Check GitHub Pages run: | export PYTHONPATH="$GITHUB_WORKSPACE" python main/.github/scripts/check_gh_pages_updates.py ================================================ FILE: .github/workflows/github-ci.yaml ================================================ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://docs.github.com/en/actions/tutorials/build-and-test-code/python name: CI on: push: branches: - main paths-ignore: - '**/*.md' - '**/*.rst' pull_request: branches: - main paths-ignore: - '**/*.md' - '**/*.rst' workflow_dispatch: jobs: test_windows: name: pytest on windows runs-on: windows-latest steps: - name: Checkout Code uses: actions/checkout@v6 with: submodules: 'recursive' - name: Cache Downloaded Files id: cache-downloaded-files-windows uses: actions/cache@v5 if: github.ref == 'refs/heads/main' with: path: '**/tests/pdf_cache/*' key: cache-downloaded-files-main-${{ github.run_id }} restore-keys: | cache-downloaded-files-main- cache-downloaded-files enableCrossOsArchive: true - name: Restore Downloaded Files uses: actions/cache/restore@v5 if: github.ref != 'refs/heads/main' with: path: '**/tests/pdf_cache/*' key: cache-downloaded-files-main- restore-keys: | cache-downloaded-files-main- cache-downloaded-files enableCrossOsArchive: true - name: Setup Python uses: actions/setup-python@v6 with: python-version: '3.x' allow-prereleases: true - name: Upgrade pip run: | python -m pip install --upgrade pip - name: Install requirements (Python 3.11+) run: | pip install -r requirements/ci-3.11.txt - name: 
Install cryptography run: | pip install cryptography - name: Install pypdf run: | pip install . - name: Prepare run: | python -c "from tests import download_test_pdfs; download_test_pdfs()" - name: Test with pytest run: | python -m pytest tests --cov=pypdf --cov-append -n auto -vv -p no:benchmark test_macos: name: pytest on macOS runs-on: macos-latest steps: - name: Checkout Code uses: actions/checkout@v6 with: submodules: 'recursive' - name: Cache Downloaded Files id: cache-downloaded-files-mac uses: actions/cache@v5 if: github.ref == 'refs/heads/main' with: path: '**/tests/pdf_cache/*' key: cache-downloaded-files-main-${{ github.run_id }} restore-keys: | cache-downloaded-files-main- cache-downloaded-files - name: Restore Downloaded Files uses: actions/cache/restore@v5 if: github.ref != 'refs/heads/main' with: path: '**/tests/pdf_cache/*' key: cache-downloaded-files-main- restore-keys: | cache-downloaded-files-main- cache-downloaded-files - name: Setup Python (3.11+) uses: actions/setup-python@v6 with: python-version: '3.x' allow-prereleases: true - name: Upgrade pip run: | python -m pip install --upgrade pip - name: Install requirements (Python 3.11+) run: | pip install -r requirements/ci-3.11.txt - name: Install cryptography run: | pip install cryptography - name: Install OS dependencies run: brew install ghostscript jbig2dec poppler - name: Install pypdf run: | pip install . 
- name: Prepare run: | python -c "from tests import download_test_pdfs; download_test_pdfs()" - name: Test with pytest run: | python -m pytest tests --cov=pypdf --cov-append -n auto -vv -p no:benchmark tests: name: "pytest on ${{ matrix.python-version }} (crypto-lib: ${{ matrix.use-crypto-lib }})" runs-on: ubuntu-24.04 strategy: matrix: python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14', 'pypy3.11'] use-crypto-lib: ['cryptography'] include: - python-version: '3.9' use-crypto-lib: 'pycryptodome' - python-version: '3.9' use-crypto-lib: 'none' steps: - name: Update APT packages run: sudo apt-get update - name: Install APT dependencies run: sudo apt-get install ghostscript jbig2dec poppler-utils - name: Checkout Code uses: actions/checkout@v6 with: submodules: 'recursive' - name: Cache Downloaded Files id: cache-downloaded-files uses: actions/cache@v5 if: github.ref == 'refs/heads/main' with: path: '**/tests/pdf_cache/*' key: cache-downloaded-files-main-${{ github.run_id }} restore-keys: | cache-downloaded-files-main- cache-downloaded-files - name: Restore Downloaded Files uses: actions/cache/restore@v5 if: github.ref != 'refs/heads/main' with: path: '**/tests/pdf_cache/*' key: cache-downloaded-files-main- restore-keys: | cache-downloaded-files-main- cache-downloaded-files - name: Setup Python uses: actions/setup-python@v6 if: matrix.python-version == '3.9' || matrix.python-version == '3.10' with: python-version: ${{ matrix.python-version }} cache: 'pip' cache-dependency-path: '**/requirements/ci.txt' - name: Setup Python (3.11+) uses: actions/setup-python@v6 if: matrix.python-version != '3.9' && matrix.python-version != '3.10' with: python-version: ${{ matrix.python-version }} allow-prereleases: true cache: 'pip' cache-dependency-path: '**/requirements/ci-3.11.txt' - name: Upgrade pip run: | python -m pip install --upgrade pip - name: Install requirements (Python 3) run: | pip install -r requirements/ci.txt if: matrix.python-version == '3.9' || 
matrix.python-version == '3.10' - name: Install requirements (Python 3.11+) run: | pip install -r requirements/ci-3.11.txt if: matrix.python-version != '3.9' && matrix.python-version != '3.10' - name: Remove pycryptodome and cryptography run: | pip uninstall pycryptodome cryptography -y - name: Install cryptography run: | pip install cryptography if: matrix.use-crypto-lib == 'cryptography' - name: Install pycryptodome run: | pip install pycryptodome if: matrix.use-crypto-lib == 'pycryptodome' - name: Install pypdf run: | pip install . - name: Download test files run: | python -c "from tests import download_test_pdfs; download_test_pdfs()" - name: Test with pytest run: | python -m pytest tests --cov=pypdf --cov-append -n auto -vv -p no:benchmark if: ${{ !startsWith(matrix.python-version, 'pypy') }} - name: Test with pytest (PyPy, no coverage) # Coverage on PyPy is skipped because running coverage with PyPy is slow and CPython test already provides # complete coverage data for the same code run: | python -m pytest tests -n auto -vv -p no:benchmark -o faulthandler_timeout=60 --dist=loadfile if: ${{ startsWith(matrix.python-version, 'pypy') }} - name: Rename coverage data file run: mv .coverage ".coverage.$RANDOM" if: ${{ !startsWith(matrix.python-version, 'pypy') }} - name: Upload coverage data uses: actions/upload-artifact@v7 if: ${{ !startsWith(matrix.python-version, 'pypy') }} with: name: coverage-data.${{ matrix.python-version }}-${{ matrix.use-crypto-lib }} path: .coverage.* if-no-files-found: ignore include-hidden-files: true codestyle: name: Check code style issues runs-on: ubuntu-24.04 steps: - name: Checkout Code uses: actions/checkout@v6 with: submodules: 'recursive' - name: Setup Python uses: actions/setup-python@v6 with: python-version: '3.x' cache: 'pip' cache-dependency-path: '**/requirements/ci-3.11.txt' - name: Upgrade pip run: | python -m pip install --upgrade pip - name: Install requirements run: | pip install -r requirements/ci-3.11.txt - name: 
Install pypdf run: | pip install . - name: Test with ruff run: | echo `ruff --version` ruff check . - name: Test with mypy run : | mypy pypdf - name: Install docs requirements run: | pip install -r requirements/docs.txt - name: Test docs build working-directory: ./docs run: | sphinx-build --nitpicky --fail-on-warning --keep-going --show-traceback -d _build/doctrees --builder html . _build/html - name: Test docs examples working-directory: ./docs run: | sphinx-build -d _build/doctrees --builder doctest . _build/doctest - name: Check with pre-commit run: | pip install -r requirements/dev.txt pre-commit run --all-files --show-diff-on-failure package: name: Build & verify package runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: python-version: '3.x' - run: python -m pip install flit check-wheel-contents - run: flit build - run: ls -l dist - name: Test CHANGELOG.md present in sdist run: tar -tzf dist/*.tar.gz | grep -q 'CHANGELOG.md' - name: Test of bdist run: check-wheel-contents dist/*.whl - name: Test installing package run: python -m pip install . - name: Test running installed package working-directory: /tmp run: python -c "import pypdf;print(pypdf.__version__)" coverage: name: Combine & check coverage. runs-on: ubuntu-latest needs: tests steps: - uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: python-version: '3.x' - run: python -m pip install --upgrade coverage[toml] - uses: actions/download-artifact@v8 with: pattern: coverage-data* merge-multiple: true - name: Check Number of Downloaded Files run: | downloaded_files_count=$(find \.coverage* -type f | wc -l) if [ $downloaded_files_count -eq 8 ]; then echo "The expected number of files (8) were downloaded." else echo "ERROR: Expected 8 files, but found $downloaded_files_count files." 
exit 1 fi - name: Combine coverage & create xml report run: | python -m coverage combine python -m coverage xml - name: Upload Coverage to Codecov uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} files: ./coverage.xml ================================================ FILE: .github/workflows/publish-to-pypi.yaml ================================================ name: Publish Python Package to PyPI on: push: tags: - '*.*.*' workflow_dispatch: jobs: build: name: Build distribution runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.x' - name: Install pypa/build run: >- python3 -m pip install build --user - name: Build a binary wheel and a source tarball run: python3 -m build - name: Store the distribution packages uses: actions/upload-artifact@v7 with: name: python-package-distributions path: dist/ publish-to-pypi: name: Publish Python distribution to PyPI needs: - build runs-on: ubuntu-latest environment: name: pypi url: https://pypi.org/p/pypdf permissions: id-token: write # IMPORTANT: mandatory for trusted publishing steps: - name: Download all the dists uses: actions/download-artifact@v8 with: name: python-package-distributions path: dist/ - name: Publish distribution to PyPI uses: pypa/gh-action-pypi-publish@release/v1 ================================================ FILE: .github/workflows/release.yaml ================================================ # This action assumes that there is a REL-commit which already has a # Markdown-formatted git tag. Hence, the CHANGELOG is already adjusted, # and it's decided what should be in the release. # This action only ensures the release is done with the proper contents # and that it's announced with a GitHub release. name: Create git tag # Disable for now and uses dummy `workflow_dispatch` trigger we usually do not use anyway. 
# To activate this again, we have to fix https://github.com/py-pdf/pypdf/issues/2753 on: workflow_dispatch: # push: # branches: # - main permissions: contents: write env: HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }} jobs: build_and_publish: name: Publish a new version runs-on: ubuntu-latest if: "${{ startsWith(github.event.head_commit.message, 'REL: ') }}" steps: - name: Checkout Repository uses: actions/checkout@v6 - name: Extract version from commit message id: extract_version run: | VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+') echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Extract tag message from commit message id: extract_message run: | VERSION="${{ steps.extract_version.outputs.version }}" delimiter="$(openssl rand -hex 8)" MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" ) echo "message<<${delimiter}" >> $GITHUB_OUTPUT echo "$MESSAGE" >> $GITHUB_OUTPUT echo "${delimiter}" >> $GITHUB_OUTPUT - name: Create Git Tag run: | VERSION="${{ steps.extract_version.outputs.version }}" MESSAGE="${{ steps.extract_message.outputs.message }}" git config user.name github-actions git config user.email github-actions@github.com git tag "$VERSION" -m "$MESSAGE" git push origin $VERSION ================================================ FILE: .github/workflows/title-check.yaml ================================================ name: 'PR Title Check' on: pull_request: # check when PR # * is created, # * title is edited, and # * new commits are added (to ensure failing title blocks merging) types: [opened, reopened, edited, synchronize] jobs: title-check: name: Title check runs-on: ubuntu-latest steps: - name: Checkout Code uses: actions/checkout@v6 - name: Check PR title env: PR_TITLE: ${{ github.event.pull_request.title }} run: python .github/scripts/check_pr_title.py ================================================ FILE: .github/workflows/urls-check.yaml ================================================ name: 'URL 
Check' on: workflow_dispatch: schedule: - cron: 0 6 * * 1 jobs: url-check: name: URL check runs-on: ubuntu-latest steps: - name: Checkout Code uses: actions/checkout@v6 - name: Setup Python uses: actions/setup-python@v6 with: python-version: '3.x' - name: Install requirements run: pip install pyyaml Pillow - name: Check URLs run: | export PYTHONPATH="$GITHUB_WORKSPACE" python .github/scripts/check_urls.py ================================================ FILE: .gitignore ================================================ *.pyc *.swp .DS_Store .tox build .idea/* *.egg-info/ dist/* __pycache__/ # in-project virtual environments venv/ .venv/ # Code coverage artifacts .coverage* coverage.xml # Editors / IDEs .vscode/ # Docs docs/_build/ .cspell/ # Files generated by some of the scripts dont_commit_*.pdf pypdf-output.pdf annotated-pdf-link.pdf Image9.png pypdf_pdfLocation.txt .python-version tests/pdf_cache/ docs/meta/CHANGELOG.md docs/meta/CONTRIBUTORS.md extracted-images/ RELEASE_COMMIT_MSG.md RELEASE_TAG_MSG.md .envrc ================================================ FILE: .gitmodules ================================================ [submodule "sample-files"] path = sample-files url = https://github.com/py-pdf/sample-files ================================================ FILE: .pre-commit-config.yaml ================================================ # pre-commit run --all-files repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: check-ast - id: check-case-conflict - id: check-docstring-first - id: check-yaml - id: debug-statements - id: end-of-file-fixer exclude: "resources/.*|docs/make.bat" - id: fix-byte-order-marker - id: trailing-whitespace - id: mixed-line-ending args: ['--fix=lf'] exclude: "docs/make.bat" - id: check-added-large-files args: ['--maxkb=1000'] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.15.0 hooks: - id: ruff-check args: ['--fix'] - repo: https://github.com/asottile/pyupgrade rev: v3.21.2 hooks: - id: 
pyupgrade args: [--py39-plus] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.17.1 hooks: - id: mypy files: ^pypdf/.* ================================================ FILE: .readthedocs.yaml ================================================ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 build: os: ubuntu-lts-latest tools: python: "latest" # Build documentation in the "docs/" directory with Sphinx sphinx: configuration: docs/conf.py # If using Sphinx, optionally build your docs in additional formats such as PDF formats: all # Optionally declare the Python requirements required to build your docs python: install: - requirements: requirements/docs.txt - method: pip path: . extra_requirements: - full ================================================ FILE: CHANGELOG.md ================================================ # CHANGELOG ## Version 6.9.1, 2026-03-17 ### Security (SEC) - Improve performance and limit length of array-based content streams (#3686) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.9.0...6.9.1) ## Version 6.9.0, 2026-03-15 ### New Features (ENH) - Expose /Perms verification result on Encryption object (#3672) ### Performance Improvements (PI) - Fix O(n²) performance in NameObject read/write (#3679) - Batch-parse all objects in ObjStm on first access (#3677) ### Bug Fixes (BUG) - Avoid sharing array-based content streams between pages (#3681) - Avoid accessing invalid page when inserting blank page under some conditions (#3529) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.8.0...6.9.0) ## Version 6.8.0, 2026-03-09 ### Security (SEC) - Limit allowed `/Length` value of stream (#3675) ### New Features (ENH) - Add /IRT (in-reply-to) support for markup annotations (#3631) ### Documentation (DOC) - Avoid using `PageObject.replace_contents` on PdfReader (#3669) - Document how to disable jbig2dec calls [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.7.5...6.8.0) ## Version 
6.7.5, 2026-03-02 ### Security (SEC) - Improve the performance of the ASCIIHexDecode filter (#3666) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.7.4...6.7.5) ## Version 6.7.4, 2026-02-27 ### Security (SEC) - Allow limiting output length for RunLengthDecode filter (#3664) ### Robustness (ROB) - Deal with invalid annotations in extract_links (#3659) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.7.3...6.7.4) ## Version 6.7.3, 2026-02-24 ### Security (SEC) - Use zlib decompression limit when retrieving XFA data (#3658) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.7.2...6.7.3) ## Version 6.7.2, 2026-02-22 ### Security (SEC) - Prevent infinite loop from circular xref /Prev references (#3655) ### Bug Fixes (BUG) - Fix wrong LUT size error (#3651) - Fix handling of page boxes defined on `/Pages` (#3650) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.7.1...6.7.2) ## Version 6.7.1, 2026-02-17 ### Security (SEC) - Detect cyclic references when accessing TreeObject.children (#3645) - Limit size of `/ToUnicode` entries (#3646) - Limit FlateDecode recovery attempts (#3644) ### Bug Fixes (BUG) - Avoid own object replacement logic in `PageObject.replace_contents` (#3638) - Fix UnboundLocalError when update_page_form_field_values with /Sig (#3634) ### Robustness (ROB) - Avoid division by zero when decoding FlateDecode PNG prediction (#3641) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.7.0...6.7.1) ## Version 6.7.0, 2026-02-08 ### Deprecations (DEP) - Deprecate support for abbreviations in decode_stream_data (#3617) ### New Features (ENH) - Add ability to add font resources for 14 Adobe Core fonts in text widget annotations (#3624) ### Bug Fixes (BUG) - Avoid invalid load for ICCBased FlateDecode images in mode 1 (#3619) ### Robustness (ROB) - Fix AESV2 decryption when /Length missing in encrypt dict (#3629) - Fix merging when annotations point to NullObject (#3613) - Check for `self._info` being None in
`compress_identical_objects` (#3612) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.6.2...6.7.0) ## Version 6.6.2, 2026-01-26 ### Security (SEC) - Detect cyclic references when retrieving outlines (#3610) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.6.1...6.6.2) ## Version 6.6.1, 2026-01-25 ### Robustness (ROB) - `/AcroForm` might be NullObject (#3601) - Handle missing font bounding boxes gracefully (#3600) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.6.0...6.6.1) ## Version 6.6.0, 2026-01-09 ### Security (SEC) - Improve handling of partially broken PDF files (#3594) ### Deprecations (DEP) - Block common page content modifications when assigned to reader (#3582) ### New Features (ENH) - Embellishments to generated text appearance streams (#3571) ### Bug Fixes (BUG) - Do not consider multi-byte BOM-like sequences as BOMs (#3589) ### Robustness (ROB) - Avoid empty FlateDecode outputs without warning (#3579) ### Documentation (DOC) - Add outlines documentation and link it in User Guide (#3511) ### Developer Experience (DEV) - Add PyPy 3.11 to test matrix and benchmarks (#3574) ### Maintenance (MAINT) - Fix compatibility with Pillow >= 12.1.0 (#3590) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.5.0...6.6.0) ## Version 6.5.0, 2025-12-21 ### New Features (ENH) - Limit jbig2dec memory usage (#3576) - FontDescriptor: Initiate from embedded font resource (#3551) ### Robustness (ROB) - Allow fallback to PBM files for jbig2dec without PNG support (#3567) - Use warning instead of error for early EOD for RunLengthDecode (#3548) ### Developer Experience (DEV) - Test with macOS as well (#3401) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.4.2...6.5.0) ## Version 6.4.2, 2025-12-14 ### Bug Fixes (BUG) - Fix KeyError when flattening form field without /Font in resources (#3554) ### Robustness (ROB) - Allow deleting non-existent annotations (#3559) ### Documentation (DOC) - Fix level of attachment heading (#3560) 
[Full Changelog](https://github.com/py-pdf/pypdf/compare/6.4.1...6.4.2) ## Version 6.4.1, 2025-12-07 ### Performance Improvements (PI) - Optimize loop for layout mode text extraction (#3543) ### Bug Fixes (BUG) - Do not fail on choice field without /Opt key (#3540) ### Documentation (DOC) - Document possible issues with merge_page and clipping (#3546) - Add some notes about library security (#3545) ### Maintenance (MAINT) - Use CORE_FONT_METRICS for widths where possible (#3526) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.4.0...6.4.1) ## Version 6.4.0, 2025-11-23 ### Security (SEC) - Reduce default limit for LZW decoding ### New Features (ENH) - Parse and format comb fields in text widget annotations (#3519) ### Robustness (ROB) - Silently ignore Adobe Ascii85 whitespace for suffix detection (#3528) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.3.0...6.4.0) ## Version 6.3.0, 2025-11-16 ### New Features (ENH) - Wrap and align text in flattened PDF forms (#3465) ### Bug Fixes (BUG) - Fix missing "PreventGC" when cloning (#3520) - Preserve JPEG image quality by default (#3516) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.2.0...6.3.0) ## Version 6.2.0, 2025-11-09 ### New Features (ENH) - Add 'strict' parameter to PDFWriter (#3503) ### Bug Fixes (BUG) - PdfWriter.append fails when there are articles being None (#3509) ### Documentation (DOC) - Execute docs examples in CI (#3507) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.1.3...6.2.0) ## Version 6.1.3, 2025-10-22 ### Security (SEC) - Allow limiting size of LZWDecode streams (#3502) - Avoid infinite loop when reading broken DCT-based inline images (#3501) ### Bug Fixes (BUG) - PageObject.scale() scales media box incorrectly (#3489) ### Robustness (ROB) - Fail with explicit exception when image mode is an empty array (#3500) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.1.2...6.1.3) ## Version 6.1.2, 2025-10-19 ### Bug Fixes (BUG) - Fix handling of 
zero-length StreamObject (#3485) ### Robustness (ROB) - Deal with wrong size for incremental PDF files (#3495) - Improve handling for malformed cross-reference tables (#3483) ### Developer Experience (DEV) - Use released Python 3.14 - Use Mapping instead of dict in type hint of update_page_form_field_values (#3490) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.1.1...6.1.2) ## Version 6.1.1, 2025-09-28 ### Bug Fixes (BUG) - Insert new embedded files in a sorted manner (#3477) - Fix name tree handling for embedded files with Kids-based inputs (#3475) - Make embedding files not break PDF/A-3 compliance (#3472) ### Documentation (DOC) - Document AFRelationship handling for PDF/A and provide constants (#3478) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.1.0...6.1.1) ## Version 6.1.0, 2025-09-21 ### New Features (ENH) - Enhance XMP metadata handling with creation and setter methods (#3410) - Add all font metrics for base 14 Type 1 PDF fonts (#3363) - Allow deleting embedded files (#3461) - Add support for Python in FIPS mode for document identifier (#3438) ### Bug Fixes (BUG) - Fix handling of UTF-16 encoded destination titles (#3463) - Guard empty input to prevent IndexError (#3448) ### Developer Experience (DEV) - Fix type hint for XMP metadata setter to add bytes type (#3464) [Full Changelog](https://github.com/py-pdf/pypdf/compare/6.0.0...6.1.0) ## Version 6.0.0, 2025-08-11 ### Security (SEC) - Limit decompressed size for FlateDecode filter (#3430) ### Deprecations (DEP) - Drop Python 3.8 support (#3412) ### New Features (ENH) - Move BlackIs1 functionality to tiff_header (#3421) ### Robustness (ROB) - Skip Go-To actions without a destination (#3420) ### Developer Experience (DEV) - Update code style related libraries (#3414) - Update mypy to 1.17.0 (#3413) - Stop testing on Python 3.8 and start testing on Python 3.14 (#3411) ### Maintenance (MAINT) - Cleanup deprecations (#3424) [Full 
Changelog](https://github.com/py-pdf/pypdf/compare/5.9.0...6.0.0) ## Version 5.9.0, 2025-07-27 ### New Features (ENH) - Automatically preserve links in added pages (#3298) - Allow writing/updating all properties of an embedded file (#3374) ### Bug Fixes (BUG) - Fix XMP handling dropping indirect references (#3392) ### Robustness (ROB) - Deal with DecodeParms being empty list (#3388) ### Documentation (DOC) - Document how to read and modify XMP metadata (#3383) [Full Changelog](https://github.com/py-pdf/pypdf/compare/5.8.0...5.9.0) ## Version 5.8.0, 2025-07-13 ### New Features (ENH) - Implement flattening for writer (#3312) ### Bug Fixes (BUG) - Unterminated object when using PdfWriter with incremental=True (#3345) ### Robustness (ROB) - Resolve some image extraction edge cases (#3371) - Ignore faulty trailing newline during RLE decoding (#3355) - Gracefully handle odd-length strings in parse_bfchar (#3348) ### Developer Experience (DEV) - Modernize license specifiers (#3338) ### Maintenance (MAINT) - Reduce max-complexity of tool.ruff.lint.mccabe (#3365) - Refactor text extraction code [Full Changelog](https://github.com/py-pdf/pypdf/compare/5.7.0...5.8.0) ## Version 5.7.0, 2025-06-29 ### Performance Improvements (PI) - Performance optimization for LZW decoding (#3329) ### Robustness (ROB) - Flate decoding for streams with faulty tail bytes (#3332) - dc_creator could be a Bag as well (#3333) - Handle tree being NullObject when retrieving named destinations (#3331) ### Maintenance (MAINT) - Move inline-image mappings to constants (#3328) [Full Changelog](https://github.com/py-pdf/pypdf/compare/5.6.1...5.7.0) ## Version 5.6.1, 2025-06-22 ### New Features (ENH) - Add PDF/A XMP metadata support (#3314) ### Robustness (ROB) - Deal with annotations not being lists on merge (#3321) - Handle NullObject for cmap encoding Differences entry (#3317) ### Developer Experience (DEV) - Update ruff to 0.12.0 (#3316) [Full 
Changelog](https://github.com/py-pdf/pypdf/compare/5.6.0...5.6.1) ## Version 5.6.0, 2025-06-01 ### New Features (ENH) - Add basic support for JBIG2 by using jbig2dec (#3163) ### Bug Fixes (BUG) - Fix crashes by removing unnecessary line (#3293) - Add delimiters to NameObject.renumber_table (#3286) ### Robustness (ROB) - Handle DecodeParms being a NullObject (#3285) ### Code Style (STY) - Update to mypy 1.16.0 (#3300) [Full Changelog](https://github.com/py-pdf/pypdf/compare/5.5.0...5.6.0) ## Version 5.5.0, 2025-05-11 ### New Features (ENH) - Add support for IndirectObject.__iter__ (#3228) - Allow filtering by font when removing text (#3216) ### Bug Fixes (BUG) - Add missing named destinations being ByteStringObjects (#3282) - Get font information more reliably when removing text (#3252) - T* 2D Translation consistent with PDF 1.7 Spec (#3250) - Add font stack to q/Q operations in layout mode (#3225) - Avoid completely hiding image loading issues like exceeding image size limits (#3221) - Using compress_identical_objects on transformed content duplicates differing content (#3197) - Consider BlackIs1 parameter for CCITTFaxDecode filter (#3196) ### Robustness (ROB) - Deal with insufficient cm matrix during text extraction (#3283) - Allow merging when annotations miss D entry (#3281) - Fix merging documents if there are no Dests (#3280) - Fix crash on malformed action in outline (#3278) - Fix compression issues for removed images which might be None (#3246) - Attempt to deal with non-rectangular FlateDecode streams (#3245) - Handle some None values for broken PDF files (#3230) ### Developer Experience (DEV) - Multiple style improvements - Update ruff to 0.11.0 ### Maintenance (MAINT) - Conform ASCIIHexDecode implementation to specification (#3274) - Modify comments of filters that do not use decode_parms (#3260) ### Code Style (STY) - Simplify warnings & debugging in layout mode text extraction (#3271) - Standardize mypy assert statements (#3276) [Full 
Changelog](https://github.com/py-pdf/pypdf/compare/5.4.0...5.5.0) ## Version 5.4.0, 2025-03-16 ### New Features (ENH) - Add support for `IndirectObject.__contains__` (#3155) ### Bug Fixes (BUG) - Fix detection of inline images followed by names or numbers (#3173) ### Robustness (ROB) - Consider root objects without catalog type as fallback (#3175) - Raise proper error on infinite loop when reading objects (#3169) ### Documentation (DOC) - Mention memory consumption of text extraction (#3168) ### Developer Experience (DEV) - Upgrade to ruff 0.10.0 (#3191) [Full Changelog](https://github.com/py-pdf/pypdf/compare/5.3.1...5.4.0) ## Version 5.3.1, 2025-03-02 ### Bug Fixes (BUG) - Use the correct name StandardEncoding for the predefined cmap (#3156) - Handle inline images containing `EI ` sequences (#3152) - Fix check box value which should be name object (#3124) - Fix stream position on inline image fallback extraction (#3120) - Fix object count for incremental writer (#3117) ### Robustness (ROB) - Avoid index errors on empty lines in xref table (#3162) - Improve handling of LZW decoder table overflow (#3159) - Ignore non-numbers for width when building font width map (#3158) - Avoid negative seek values when reading partially broken files (#3157) ### Documentation (DOC) - Fixed PageObject.images example usage for replacing image (#3149) [Full Changelog](https://github.com/py-pdf/pypdf/compare/5.3.0...5.3.1) ## Version 5.3.0, 2025-02-09 ### New Features (ENH) - Handle attachments in /Kids and provide object-oriented API (#3108) ### Bug Fixes (BUG) - Handle annotations being None on merging (#3111) ### Robustness (ROB) - Prevent excessive layout mode text output from Type3 fonts (#3082) ### Documentation (DOC) - stefan6419846 becomes BDFL of pypdf (#3078) - Tidy the visitor function description (#3086) ### Developer Experience (DEV) - Remove ignoring multiple Ruff rules - Remove unused mutmut configuration (#3092) ### Testing (TST) - Fix warning assertions to use 
`pytest.warns()` (#3083) [Full Changelog](https://github.com/py-pdf/pypdf/compare/5.2.0...5.3.0) ## Version 5.2.0, 2025-01-26 ### Deprecations (DEP) - Deprecate with replacement CCITParameters (#3019) - Correct deprecation of interiour_color (#2947) ### New Features (ENH) - Support alternative (U)F names for embedded file retrieval (#3072) - Adding support for reading .metadata.keywords (#2939) ### Bug Fixes (BUG) - Handle further Tf operators in text extraction layout mode (#3073) - Ensure `add_metadata` can deal with `_info = None` (#3040) - Handle IndirectObject in CCITTFaxDecode filter (#2965) - Handle chained colorspace for inline images when no filter is set (#3008) - Avoid extracting inline images twice and dropping other operators (#3002) - Fixed reference of value with `str.__new__` in TextStringObject (#2952) - Handle indirect objects in font width calculations (#2967) - Title sometimes is bytes and not str (#2930) - Fix undefined variable for text extraction (regression) (#2934) - Don't close stream passed to PdfWriter.write() (#2909) ### Robustness (ROB) - Handle zero height fonts when extracting text (#3075) - Deal with content streams not containing streams (#3005) - Gracefully handle some text operators when the operands are missing (#3006) - Fall back to non-Adobe Ascii85 format for missing end markers (#3007) - Ignore odd-length strings when processing cmap lines (#3009) - Skip annotation destination being NullObject in PdfWriter (#2964) - Skip destination page being None in PdfWriter (#2963) - Fix infinite loop case when reading null objects within an Array - Fixing infinite loop in ArrayObject read_from_stream (#2928) ### Documentation (DOC) - Add note about default line colors (#3014) ### Developer Experience (DEV) - Remove ignoring Ruff rule PGH004 (#3071) - Tidy ignore array in tool.ruff.lint (#3069) - Move Windows CI to Python 3.13 (#3003) - Move to Ubuntu 22.04 (#3004) ### Maintenance (MAINT) - Fix formatting of warning message and include 
exception message (#3076) - Narrow return type for `ContentStream.operations` (#2941) ### Testing (TST) - Fix image similarity for upcoming Ubuntu 24.04 (#3039) - Replace broken Apache Tika Corpora urls (#3041) ### Code Style (STY) - Add form feed to WHITESPACES (#3054) - Lots of small internal changes [Full Changelog](https://github.com/py-pdf/pypdf/compare/5.1.0...5.2.0) ## Version 5.1.0, 2024-10-27 ### New Features (ENH) - Add `layout_mode_font_height_weight` argument to `PageObject.extract_text()` (#2920) ### Bug Fixes (BUG) - Fix font specifier for FreeText annotation (#2893) - Line breaks are not generated due to incorrect calculation of text leading (#2890) - Improve handling of spaces in text extraction (#2882) ### Robustness (ROB) - Soft failure for flate encode image mode 1 with wrong LUT size (#2900) ### Documentation (DOC) - Use latest package versions (#2907) - Correct example of reading FileAttachment annotation (#2906) ### Developer Experience (DEV) - Update pinned requirements (#2918) - Make make_release.py compatible with Windows environment (#2894) ### Maintenance (MAINT) - Remove references to outdated Python versions (#2919) - Generalize the method of obtaining space_code (#2891) - Unnecessary character mapping process (#2888) - New LZW decoding implementation (#2887) ### Testing (TST) - Add LzwCodec for encoding (#2883) ### Code Style (STY) - Capitalize error messages (#2903) - Modify error messages in PdfWriter (#2902) [Full Changelog](https://github.com/py-pdf/pypdf/compare/5.0.1...5.1.0) ## Version 5.0.1, 2024-09-29 ### New Features (ENH) - Add `full` parameter to PdfWriter constructor (#2865) ### Bug Fixes (BUG) - Update pyproject.toml with minimum Python version of 3.8 (#2859) - Cope with unbalanced delimiters in dictionary object (#2878) - Cope with encoding with too many differences (#2873) - Missing spaces in extract_text() method (#1328) (#2868) - Tolerate truncated files and no warning when jumping startxref (#2855) ### Robustness
(ROB) - Repair PDF with invalid Root object (#2880) - Continue parsing dictionary object when error is detected (#2872) - Merge documents with invalid pages in named destinations (#2857) - Tolerate comments in arrays (#2856) ### Developer Experience (DEV) - Use latest Python version for benchmarking (#2879) ### Maintenance (MAINT) - Add tests to source distributions (#2874) - Refactor _update_field_annotation (#2862) [Full Changelog](https://github.com/py-pdf/pypdf/compare/5.0.0...5.0.1) ## Version 5.0.0, 2024-09-15 This version drops support for Python 3.7 (not maintained since July 2023), PdfMerger (use PdfWriter instead) and AnnotationBuilder (use annotations instead). ### Deprecations (DEP) - Remove the deprecated PdfMerger and AnnotationBuilder classes and other deprecations cleanup (#2813) - Drop Python 3.7 support (#2793) ### New Features (ENH) - Add capability to remove /Info from PDF (#2820) - Add incremental capability to PdfWriter (#2811) - Add UniGB-UTF16 encodings (#2819) - Accept utf strings for metadata (#2802) - Report PdfReadError instead of RecursionError (#2800) - Compress PDF files merging identical objects (#2795) ### Bug Fixes (BUG) - Fix sheared image (#2801) ### Robustness (ROB) - Robustify .set_data() (#2821) - Raise PdfReadError when missing /Root in trailer (#2808) - Fix extract_text() issues on damaged PDFs (#2760) - Handle images with empty data when processing an image from bytes (#2786) ### Developer Experience (DEV) - Fix coverage uploads (#2832) - Test against Python 3.13 (#2776) [Full Changelog](https://github.com/py-pdf/pypdf/compare/4.3.1...5.0.0) ## Version 4.3.1, 2024-07-21 ### Bug Fixes (BUG) - Cope with Matrix entry in field annotations (#2736) ### Robustness (ROB) - Cope with fields with upside down box/rectangle (#2729) ### Maintenance (MAINT) - Add deprecate_with_replacement to StreamObject.initializeFromD… (#2728) - Deal with cryptography>=43 moving ARC4 (#2765) [Full 
Changelog](https://github.com/py-pdf/pypdf/compare/4.3.0...4.3.1) ## Version 4.3.0, 2024-06-23 ### New Features (ENH) - Accept ETen-B5 and UniCNS-UTF16 encodings (#2721) - Add decode_as_image() to ContentStreams (#2615) - context manager for PdfReader (#2666) - Add capability to set font and size in fields (#2636) - Allow to pass input file without named argument (#2576) ### Bug Fixes (BUG) - Fix deprecation for Ressources when using old constants (#2705) - Fix images issue 4 bits encoding and LUT starting with UTF16_BOM (#2675) - Reading large compressed images takes huge time to process (#2644) - Highlighted Text Cannot Be Printed (#2604) - Fix UnboundLocalError on malformed pdf (#2619) ### Robustness (ROB) - Cope with missing Standard 14 fonts in fields (#2677) - Improve inline image extraction (#2622) - Cope with loops in Fields tree (#2656) - Discard /I in choice fields for compatibility with Acrobat (#2614) - Cope with some issues in pillow (#2595) - Cope with some image extraction issues (#2591) ### Documentation (DOC) - Various improvements on docstrings and examples ### Maintenance (MAINT) - Deprecate interiour_color with replacement interior_color (#2706) - Add deprecate_with_replacement to PdfWriter.find_bookmark (#2674) ### Code Style (STY) - Change Link to be a non-markup annotation (#2714) [Full Changelog](https://github.com/py-pdf/pypdf/compare/4.2.0...4.3.0) ## Version 4.2.0, 2024-04-07 ### New Features (ENH) - Allow multiple charsets for NameObject.read_from_stream (#2585) - Add support for /Kids in page labels (#2562) - Allow to update fields on many pages (#2571) - Tolerate PDF with invalid xref pointed objects (#2335) - Add Enforce from PDF2.0 in viewer_preferences (#2511) - Add += and -= operators to ArrayObject (#2510) ### Bug Fixes (BUG) - Fix merge_page sometimes generating unknown operator 'QQ' (#2588) - Fix fields update where annotations are kids of field (#2570) - Process CMYK images without a filter correctly (#2557) - Extract text in 
layout mode without finding resources (#2555) - Prevent recursive loop in some PDF files (#2505) ### Robustness (ROB) - Tolerate "truncated" xref (#2580) - Replace error by warning for EOD in RunLengthDecode/ASCIIHexDecode (#2334) - Rebuild xref table if one entry is invalid (#2528) - Robustify stream extraction (#2526) ### Documentation (DOC) - Update release process for latest changes (#2564) - Encryption/decryption: Clone document instead of copying all pages (#2546) - Minor improvements (#2542) - Update annotation list (#2534) - Update references and formatting (#2529) - Correct threads reference, plus minor changes (#2521) - Minor readability increases (#2515) - Simplify PaperSize examples (#2504) - Minor improvements (#2501) ### Developer Experience (DEV) - Remove unused dependencies (#2572) - Remove page labels PR link from message (#2561) - Fix changelog generator regarding whitespace and handling of "Other" group (#2492) - Add REL to known PR prefixes (#2554) - Release using the REL commit instead of git tag (#2500) - Unify code between PdfReader and PdfWriter (#2497) - Bump softprops/action-gh-release from 1 to 2 (#2514) ### Maintenance (MAINT) - Ressources → Resources (and internal name childs) (#2550) - Fix typos found by codespell (#2549) - Update Read the Docs configuration (#2538) - Add root_object, _info and _ID to PdfReader (#2495) ### Testing (TST) - Allow loading truncated images if required (#2586) - Fix download issues from #2562 (#2578) - Improve test_get_contents_from_nullobject to show real use-case (#2524) - Add missing test annotations (#2507) [Full Changelog](https://github.com/py-pdf/pypdf/compare/4.1.0...4.2.0) ## Version 4.1.0, 2024-03-03 Generating name objects (`NameObject`) without a leading slash is considered deprecated now. Previously, just a plain warning would be logged, leading to possibly invalid PDF files. According to our deprecation policy, this will log a *DeprecationWarning* for now. 
### New Features (ENH) - Add get_pages_from_field (#2494) - Add reattach_fields function (#2480) - Automatic access to pointed object for IndirectObject (#2464) ### Bug Fixes (BUG) - Missing error on name without leading / (#2387) - encode_pdfdocencoding() always returns bytes (#2440) - BI in text content identified as image tag (#2459) ### Robustness (ROB) - Missing basefont entry in type 3 font (#2469) ### Documentation (DOC) - Improve lossless compression example (#2488) - Amend robustness documentation (#2479) ### Developer Experience (DEV) - Fix changelog for UTF-8 characters (#2462) ### Maintenance (MAINT) - Add _get_page_number_from_indirect in writer (#2493) - Remove user assignment for feature requests (#2483) - Remove reference to old 2.0.0 branch (#2482) ### Testing (TST) - Fix benchmark failures (#2481) - Broken test due to expired test file URL (#2468) - Resolve file naming conflict in test_iss1767 (#2445) [Full Changelog](https://github.com/py-pdf/pypdf/compare/4.0.2...4.1.0) ## Version 4.0.2, 2024-02-18 ### Bug Fixes (BUG) - Use NumberObject for /Border elements of annotations (#2451) [Full Changelog](https://github.com/py-pdf/pypdf/compare/4.0.1...4.0.2) ## Version 4.0.1, 2024-01-28 ### Bug Fixes (BUG) - layout mode text extraction ZeroDivisionError (#2417) ### Testing (TST) - Skip tests using fpdf2 if it's not installed (#2419) [Full Changelog](https://github.com/py-pdf/pypdf/compare/4.0.0...4.0.1) ## Version 4.0.0, 2024-01-19 ### Deprecations (DEP) - Drop Python 3.6 support (#2369) - Remove deprecated code (#2367) - Remove deprecated XMP properties (#2386) ### New Features (ENH) - Add "layout" mode for text extraction (#2388) - Add Jupyter Notebook integration for PdfReader (#2375) - Improve/rewrite PDF permission retrieval (#2400) ### Bug Fixes (BUG) - PdfWriter.add_uri was setting the wrong type (#2406) - Add support for GBK2K cmaps (#2385) ### Maintenance (MAINT) - Return None instead of -1 when page is not attached (#2376) - Complete 
FileSpecificationDictionaryEntries constants (#2416) - Replace warning with logging.error (#2377) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.17.4...4.0.0) ## Version 3.17.4, 2023-12-24 ### Bug Fixes (BUG) - Handle IndirectObject as image filter (#2355) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.17.3...3.17.4) ## Version 3.17.3, 2023-12-17 ### Robustness (ROB) - Out-of-bounds issue in handle_tj (text extraction) (#2342) ### Developer Experience (DEV) - Make make_release.py easier to configure (#2348) ### Maintenance (MAINT) - Bump actions/download-artifact from 3 to 4 (#2344) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.17.2...3.17.3) ## Version 3.17.2, 2023-12-10 ### Bug Fixes (BUG) - Cope with deflated images with CMYK Black Only (#2322) - Handle indirect objects as parameters for CCITTFaxDecode (#2307) - check words length in _cmap type1_alternative function (#2310) ### Robustness (ROB) - Relax flate decoding for too many lookup values (#2331) - Let _build_destination skip in case of missing /D key (#2018) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.17.1...3.17.2) ## Version 3.17.1, 2023-11-14 ### Bug Fixes (BUG) - Mediabox expansion size when applying non-right angle rotation (#2282) ### Robustness (ROB) - MissingWidth is IndirectObject (#2288) - Initialize states array with an empty value (#2280) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.17.0...3.17.1) ## Version 3.17.0, 2023-10-29 ### Security (SEC) - Infinite recursion when using PdfWriter(clone_from=reader) (#2264) ### New Features (ENH) - Add parameter to select images to be removed (#2214) ### Bug Fixes (BUG) - Correctly handle image mode 1 with FlateDecode (#2249) - Error when filling a value with parentheses #2268 (#2269) - Handle empty root outline (#2239) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.16.4...3.17.0) ## Version 3.16.4, 2023-10-10 ### Bug Fixes (BUG) - Avoid exceeding recursion depth when retrieving 
image mode (#2251) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.16.3...3.16.4) ## Version 3.16.3, 2023-10-08 ### Bug Fixes (BUG) - Invalid cm/tm in visitor functions (#2206) - Encrypt / decrypt Stream object dictionaries (#2228) - Support nested color spaces for the /DeviceN color space (#2241) - Images property fails if NullObject in list (#2215) ### Developer Experience (DEV) - Unify mypy options and warn redundant workarounds (#2223) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.16.2...3.16.3) ## Version 3.16.2, 2023-09-24 ### Bug Fixes (BUG) - PDF size increases because of too high float writing precision (#2213) - Fix test_watermarking_reportlab_rendering() (#2203) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.16.1...3.16.2) ## Version 3.16.1, 2023-09-17 ⚠️ The 'rename PdfWriter.create_viewer_preference to PdfWriter.create_viewer_preferences (#2190)' could be a breaking change for you, if you use it. As it was only introduced last week I'm confident enough that nobody will be affected though. Hence only the patch update. 
### Bug Fixes (BUG) - Missing new line in extract_text with cm operations (#2142) - _get_fonts not processing properly CIDFonts and annotations (#2194) ### Maintenance (MAINT) - Rename PdfWriter.create_viewer_preference to PdfWriter.create_viewer_preferences (#2190) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.16.0...3.16.1) ## Version 3.16.0, 2023-09-10 ### Security (SEC) - Infinite recursion caused by IndirectObject clone (#2156) ### New Features (ENH) - Ease access to ViewerPreferences (#2144) ### Bug Fixes (BUG) - Catch the case where w[0] is an IndirectObject instead of an int (#2154) - Cope with indirect objects in filters and remove deprecated code (#2177) - Accept tabs in cmaps (#2174) / cope with extra space (#2151) - Merge pages without resources (#2150) - getcontents() shall return None if contents is NullObject (#2161) - Fix conversion from 1 to LA (#2175) ### Robustness (ROB) - Accept XYZ with no arguments (#2178) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.15.5...3.16.0) ## Version 3.15.5, 2023-09-03 ### Bug Fixes (BUG) - Cope with missing /I in articles (#2134) - Fix image look-up table in EncodedStreamObject (#2128) - remove_images not operating in sub level forms (#2133) ### Robustness (ROB) - Cope with damaged PDF (#2129) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.15.4...3.15.5) ## Version 3.15.4, 2023-08-27 ### Performance Improvements (PI) - Making pypdf as fast as pdfrw (#2086) ### Maintenance (MAINT) - Relax typing_extensions version (#2104) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.15.3...3.15.4) ## Version 3.15.3, 2023-08-26 ### Bug Fixes (BUG) - Check version of crypt provider (#2115) - TypeError: can't concat str to bytes (#2114) - Require flit_core >= 3.9 (#2091) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.15.2...3.15.3) ## Version 3.15.2, 2023-08-20 ### Security (SEC) - Avoid endless recursion of reading damaged PDF file (#2093) ### Performance Improvements (PI) 
- Reuse content stream (#2101) ### Maintenance (MAINT) - Make ParseError inherit from PyPdfError (#2097) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.15.1...3.15.2) ## Version 3.15.1, 2023-08-13 ### Performance Improvements (PI) - optimize _decode_png_prediction (#2068) ### Bug Fixes (BUG) - Fix incorrect tm_matrix in call to visitor_text (#2060) - Writing German characters into form fields (#2047) - Prevent stall when accessing image in corrupted pdf (#2081) - append() fails when articles do not have /T (#2080) ### Robustness (ROB) - Cope with xref not followed by separator (#2083) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.15.0...3.15.1) ## Version 3.15.0, 2023-08-06 ### New Features (ENH) - Add `level` parameter to compress_content_streams (#2044) - Process /uniHHHH for text_extract (#2043) ### Bug Fixes (BUG) - Fix AnnotationBuilder.link (#2066) - JPX image without ColorSpace (#2062) - Added check for field /Info when cloning reader document (#2055) - Fix indexed/CMYK images (#2039) ### Maintenance (MAINT) - Cryptography as primary dependency (#2053) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.14.0...3.15.0) ## Version 3.14.0, 2023-07-29 ### New Features (ENH) - Accelerate image list keys generation (#2014) - Use `cryptography` for encryption/decryption as a fallback for PyCryptodome (#2000) - Extract LaTeX characters (#2016) - ASCIIHexDecode.decode now returns bytes instead of str (#1994) ### Bug Fixes (BUG) - Add RunLengthDecode filter (#2012) - Process /Separation ColorSpace (#2007) - Handle single element ColorSpace list (#2026) - Process lookup decoded as TextStringObjects (#2008) ### Robustness (ROB) - Cope with garbage collector during cloning (#1841) ### Maintenance (MAINT) - Cleanup of annotations (#1745) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.13.0...3.14.0) ## Version 3.13.0, 2023-07-23 ### New Features (ENH) - Add is_open in outlines in PdfReader and PdfWriter (#1960) ### Bug Fixes 
(BUG) - Search /DA in hierarchy fields (#2002) - Cope with different ISO date length (#1999) - Decode Black only/CMYK deviceN images (#1984) - Process CMYK in deflate images (#1977) ### Developer Experience (DEV) - Add mypy to pre-commit (#2001) - Release automation (#1991, #1985) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.12.2...3.13.0) ## Version 3.12.2, 2023-07-16 ### Bug Fixes (BUG) - Accept calRGB and calGray color_spaces (#1968) - Process 2bits and 4bits images (#1967) - Check for AcroForm and ensure it is not None (#1965) ### Developer Experience (DEV) - Automate the release process (#1970) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.12.1...3.12.2) ## Version 3.12.1, 2023-07-09 ### Bug Fixes (BUG) - Prevent updating page contents after merging page (stamping/watermarking) (#1952) - % to be hex encoded in names (#1958) - Inverse color in CMYK images (#1947) - Dates conversion not working with Z00\'00\' (#1946) - Support UTF-16-LE Strings (#1884) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.12.0...3.12.1) ## Version 3.12.0, 2023-07-02 ### New Features (ENH) - Add AES support for encrypting PDF files (#1918, #1935, #1936, #1938) - Add page deletion feature to PdfWriter (#1843) ### Bug Fixes (BUG) - PdfReader.get_fields() attempts to delete non-existing index "/Off" (#1933) - Remove unused objects when cloning_from (#1926) - Add the TK.SIZE into the trailer (#1911) - add_named_destination() maintains named destination list sort order (#1930) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.11.1...3.12.0) ## Version 3.11.1, 2023-06-25 ### Bug Fixes (BUG) - Cascaded filters in image objects (#1913) - Append pdf with named destination using numbers for pages (#1858) - Ignore "/B" fields only on pages in PdfWriter.append() (#1875) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.11.0...3.11.1) ## Version 3.11.0, 2023-06-23 ### New Features (ENH) - Add page_number property (#1856) ### Bug Fixes (BUG) - 
File expansion when updating with Page Contents (#1906) - Missing Alternate in indexed/ICCbased colorspaces (#1896) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.10.0...3.11.0) ## Version 3.10.0, 2023-06-18 ### New Features (ENH) - Extraction of inline images (#1850) - Add capability to replace image (#1849) - Extend images interface by returning an ImageFile(File) class (#1848) - Add set_data to EncodedStreamObject (#1854) ### Bug Fixes (BUG) - Fix RGB FlateEncode Images(PNG) and transparency (#1834) - Generate static appearance for fields (#1864) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.9.1...3.10.0) ## Version 3.9.1, 2023-06-04 ### Deprecations (DEP) - Deprecate PdfMerger (#1866) ### Bug Fixes (BUG) - Ignore UTF-8 decode errors (#1865) ### Robustness (ROB) - Handle missing /Type entry in Page tree (#1859) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.9.0...3.9.1) ## Version 3.9.0, 2023-05-21 ### New Features (ENH) - Simplify metadata input (Document Information Dictionary) (#1851) - Extend cmap compatibility to GBK_EUC_H/V (#1812) ### Bug Fixes (BUG) - Prevent infinite loop when no character follows after a comment (#1828) - get_contents does not return ContentStream (#1847) - Accept XYZ destination with zoom missing (default to zoom=0.0) (#1844) - Cope with 1 Bit images (#1815) ### Robustness (ROB) - Handle missing /Type entry in Page tree (#1845) ### Documentation (DOC) - Expand file size explanations (#1835) - Add comparison with pdfplumber (#1837) - Clarify that PyPDF2 is dead (#1827) - Add Hunter King as Contributor for #1806 ### Maintenance (MAINT) - Refactor internal Encryption class (#1821) - Add R parameter to generate_values (#1820) - Make encryption_key parameter of write_to_stream optional (#1819) - Prepare for adding AES encryption support (#1818) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.8.1...3.9.0) ## Version 3.8.1, 2023-04-23 ### Bug Fixes (BUG) - Convert color space before saving 
(#1802) ### Documentation (DOC) - PDF/A (#1807) - Use append instead of add_page - Document core mechanics of pypdf (#1783) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.8.0...3.8.1) ## Version 3.8.0, 2023-04-16 ### New Features (ENH) - Add transform method to Transformation class (#1765) - Cope with UC2 fonts in text_extraction (#1785) ### Robustness (ROB) - Invalid startxref pointing 1 char before (#1784) ### Maintenance (MAINT) - Mark code handling old parameters as deprecated (#1798) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.7.1...3.8.0) ## Version 3.7.1, 2023-04-09 ### Security (SEC) - Warn about PDF encryption security (#1755) ### Robustness (ROB) - Prevent loop in Cloning (#1770) - Capture UnicodeDecodeError at PdfReader.pdf_header (#1768) ### Documentation (DOC) - Add .readthedocs.yaml and bump docs dependencies using `tox -e deps` (#1750, #1752) ### Developer Experience (DEV) - Make make_changelog.py idempotent ### Maintenance (MAINT) - Move generation of file identifiers to a method (#1760) ### Testing (TST) - Add xmp test (#1775) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.7.0...3.7.1) ## Version 3.7.0, 2023-03-26 ### Security (SEC) - Use Python's secrets module instead of random module (#1748) ### New Features (ENH) - Add AnnotationBuilder.highlight text markup annotation (#1740) - Add AnnotationBuilder.popup (#1665) - Add AnnotationBuilder.polyline annotation support (#1726) - Add clone_from parameter in PdfWriter constructor (#1703) ### Bug Fixes (BUG) - 'DictionaryObject' object has no attribute 'indirect_reference' (#1729) ### Robustness (ROB) - Handle params NullObject in decode_stream_data (#1738) ### Documentation (DOC) - Project scope (#1743) ### Maintenance (MAINT) - Add AnnotationFlag (#1746) - Add LazyDict.__str__ (#1727) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.6.0...3.7.0) ## Version 3.6.0, 2023-03-18 ### New Features (ENH) - Extend PdfWriter.append() to PageObjects (#1704) - 
Support qualified names in update_page_form_field_values (#1695) ### Robustness (ROB) - Tolerate streams without length field (#1717) - Accept DictionaryObject in /D of NamedDestination (#1720) - Widths def in cmap calls IndirectObject (#1719) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.5.2...3.6.0) ## Version 3.5.2, 2023-03-12 ⚠️ We discovered that compress_content_stream has to be applied to a page of the PdfWriter. It may not be applied to a page of the PdfReader! ### Bug Fixes (BUG) - compress_content_stream not readable in Adobe Acrobat (#1698) - Pass logging parameters correctly in set_need_appearances_writer (#1697) - Write /Root/AcroForm in set_need_appearances_writer (#1639) ### Robustness (ROB) - Allow more whitespaces within linearized file (#1701) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.5.1...3.5.2) ## Version 3.5.1, 2023-03-05 ### Robustness (ROB) - Some attributes not copied in DictionaryObject._clone (#1635) - Allow merging multiple time pages with annots (#1624) ### Testing (TST) - Replace pytest.mark.external by enable_socket (#1657) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.5.0...3.5.1) ## Version 3.5.0, 2023-02-26 ### New Features (ENH) - Add reader.attachments public interface (#1611, #1661) - Add PdfWriter.remove_objects_from_page(page: PageObject, to_delete: ObjectDeletionFlag) (#1648) - Allow free-text annotation to have transparent border/background (#1664) ### Bug Fixes (BUG) - Allow decryption with empty password for AlgV5 (#1663) - Let PdfWriter.pages return PageObject after calling `clone_document_from_reader()` (#1613) - Invalid font pointed during merge_resources (#1641) ### Robustness (ROB) - Cope with invalid objects in IndirectObject.clone (#1637) - Improve tolerance to invalid Names/Dests (#1658) - Decode encoded values in get_fields (#1636) - Let PdfWriter.merge cope with missing "/Fields" (#1628) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.4.1...3.5.0) ## Version 
3.4.1, 2023-02-12 ### Bug Fixes (BUG) - Switch from trimbox to cropbox when merging pages (#1622) - Text extraction not working with one glyph to char sequence (#1620) ### Robustness (ROB) - Fix 2 cases of "object has no attribute 'indirect_reference'" (#1616) ### Testing (TST) - Add multiple retry on get_url for external PDF downloads (#1626) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.4.0...3.4.1) ## Version 3.4.0, 2023-02-05 NOTICE: pypdf changed the way it represents numbers parsed from PDF files. pypdf<3.4.0 represented numbers as Decimal, pypdf>=3.4.0 represents them as floats. Several other PDF libraries do this, as well as many PDF viewers. We hope to fix issues with too high precision like this and get a speed boost. In case your PDF documents rely on more than 18 decimals of precision you should check if they still work as expected. To clarify: This does not affect the text shown in PDF documents. It affects numbers, e.g. when graphics are drawn on the PDF or very exact positions are used. Typically, 5 decimals should be enough. 
### New Features (ENH) - Enable merging forms with overlapping names (#1553) - Add 'over' parameter to merge_transformed_page & co (#1567) ### Bug Fixes (BUG) - Fix getter of the PageObject.rotation property with an indirect object (#1602) - Restore merge_transformed_page & co (#1567) - Replace decimal by float (#1563) ### Robustness (ROB) - PdfWriter.remove_images: /Contents might not be in page_ref (#1598) ### Developer Experience (DEV) - Introduce ruff (#1586, #1609) ### Maintenance (MAINT) - Remove decimal (#1608) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.3.0...3.4.0) ## Version 3.3.0, 2023-01-22 ### New Features (ENH) - Add page label support to PdfWriter (#1558) - Accept inline images with space before EI (#1552) - Add circle annotation support (#1556) - Add polygon annotation support (#1557) - Make merging pages produce a deterministic PDF (#1542, #1543) ### Bug Fixes (BUG) - Fix error in cmap extraction (#1544) - Remove erroneous assertion check (#1564) - Fix dictionary access of optional page label keys (#1562) ### Robustness (ROB) - Set ignore_eof=True for read_until_regex (#1521) ### Documentation (DOC) - Paper size (#1550) ### Developer Experience (DEV) - Fix broken combination of dependencies of docs.txt - Annotate tests appropriately (#1551) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.2.1...3.3.0) ## Version 3.2.1, 2023-01-08 ### Bug Fixes (BUG) - Accept hierarchical fields (#1529) ### Documentation (DOC) - Use google style docstrings (#1534) - Fix linked markdown documents (#1537) ### Developer Experience (DEV) - Update docs config (#1535) ## Version 3.2.0, 2022-12-31 ### Performance Improvements (PI) - Help the specializing adaptive interpreter (#1522) ### New Features (ENH) - Add support for page labels (#1519) ### Bug Fixes (BUG) - Upgrade clone_document_root (#1520) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.1.0...3.2.0) ## Version 3.1.0, 2022-12-23 Move PyPDF2 to pypdf (#1513). 
The name is now all lowercase, with no number in it, both for installation and for import. PyPDF2 will no longer receive updates. The community should move back to its roots. If you were still using pyPdf or PyPDF2 < 2.0.0, I recommend reading the migration guide: https://pypdf.readthedocs.io/en/latest/user/migration-1-to-2.html pypdf==3.1.0 is only different from PyPDF2==3.0.0 in the package name. Replacing "PyPDF2" by "pypdf" should be enough if you migrate from `PyPDF2==3.0.0` to `pypdf==3.1.0`. [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.0.0...3.1.0) ## Version 3.0.0, 2022-12-22 ### BREAKING CHANGES ⚠️ - Deprecate features with PyPDF2==3.0.0 (#1489) - Refactor Fit / Zoom parameters (#1437) ### New Features (ENH) - Add Cloning (#1371) - Allow int for indirect_reference in PdfWriter.get_object (#1490) ### Documentation (DOC) - How to read PDFs from S3 (#1509) - Make MyST parse all links as simple hyperlinks (#1506) - Changed 'latest' for 'stable' generated docs (#1495) - Adjust deprecation procedure (#1487) ### Maintenance (MAINT) - Use typing.IO for file streams (#1498) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.12.1...3.0.0) ## Version 2.12.1, 2022-12-10 ### Documentation (DOC) - Deduplicate extract_text docstring (#1485) - How to cite PyPDF2 (#1476) ### Maintenance (MAINT) Consistency changes: - indirect_ref/ido ➔ indirect_reference, dest ➔ page_destination (#1467) - owner_pwd/user_pwd ➔ owner_password/user_password (#1483) - position ➜ page_number in Merger.merge (#1482) - indirect_ref ➜ indirect_reference (#1484) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.12.0...2.12.1) ## Version 2.12.0, 2022-12-10 ### New Features (ENH) - Add support to extract gray scale images (#1460) - Add 'threads' property to PdfWriter (#1458) - Add 'open_destination' property to PdfWriter (#1431) - Make PdfReader.get_object accept integer arguments (#1459) ### Bug Fixes (BUG) - Scale PDF annotations (#1479) ### Robustness (ROB) - Padding 
issue with AES encryption (#1469) - Accept empty object as null objects (#1477) ### Documentation (DOC) - Add module documentation the PaperSize class (#1447) ### Maintenance (MAINT) - Use 'page_number' instead of 'pagenum' (#1365) - Add List of pages to PageRangeSpec (#1456) ### Testing (TST) - Cleanup temporary files (#1454) - Mark test_tounicode_is_identity as external (#1449) - Use Ubuntu 20.04 for running CI test suite (#1452) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.11.2...2.12.0) ## Version 2.11.2, 2022-11-20 ### New Features (ENH) - Add remove_from_tree (#1432) - Add AnnotationBuilder.rectangle (#1388) ### Bug Fixes (BUG) - JavaScript executed twice (#1439) - ToUnicode stores /Identity-H instead of stream (#1433) - Declare Pillow as optional dependency (#1392) ### Developer Experience (DEV) - Link 'Full Changelog' automatically - Modify read_string_from_stream to a benchmark (#1415) - Improve error reporting of read_object (#1412) - Test Python 3.11 (#1404) - Extend Flake8 ignore list (#1410) - Use correct pytest markers (#1407) - Move project configuration to pyproject.toml (#1382) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.11.1...2.11.2) ## Version 2.11.1, 2022-10-09 ### Bug Fixes (BUG) - td matrix (#1373) - Cope with cmap from #1322 (#1372) ### Robustness (ROB) - Cope with str returned from get_data in cmap (#1380) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.11.0...2.11.1) ## Version 2.11.0, 2022-09-25 ### New Features (ENH) - Addition of optional visitor-functions in extract_text() (#1252) - Add metadata.creation_date and modification_date (#1364) - Add PageObject.images attribute (#1330) ### Bug Fixes (BUG) - Lookup index in _xobj_to_image can be ByteStringObject (#1366) - 'IndexError: index out of range' when using extract_text (#1361) - Errors in transfer_rotation_to_content() (#1356) ### Robustness (ROB) - Ensure update_page_form_field_values does not fail if no fields (#1346) [Full 
Changelog](https://github.com/py-pdf/PyPDF2/compare/2.10.9...2.11.0) ## Version 2.10.9, 2022-09-18 ### New Features (ENH) - Add rotation property and transfer_rotate_to_content (#1348) ### Performance Improvements (PI) - Avoid string concatenation with large embedded base64-encoded images (#1350) ### Bug Fixes (BUG) - Format floats using their intrinsic decimal precision (#1267) ### Robustness (ROB) - Fix merge_page for pages without resources (#1349) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.10.8...2.10.9) ## Version 2.10.8, 2022-09-14 ### New Features (ENH) - Add PageObject.user_unit property (#1336) ### Robustness (ROB) - Improve NameObject reading/writing (#1345) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.10.7...2.10.8) ## Version 2.10.7, 2022-09-11 ### Bug Fixes (BUG) - Fix Error in transformations (#1341) - Decode #23 in NameObject (#1342) ### Testing (TST) - Use pytest.warns() for warnings, and .raises() for exceptions (#1325) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.10.6...2.10.7) ## Version 2.10.6, 2022-09-09 ### Robustness (ROB) - Fix infinite loop due to Invalid object (#1331) - Fix image extraction issue with superfluous whitespaces (#1327) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.10.5...2.10.6) ## Version 2.10.5, 2022-09-04 ### New Features (ENH) - Process XRefStm (#1297) - Auto-detect RTL for text extraction (#1309) ### Bug Fixes (BUG) - Avoid scaling cropbox twice (#1314) ### Robustness (ROB) - Fix offset correction in revised PDF (#1318) - Crop data of /U and /O in encryption dictionary to 48 bytes (#1317) - MultiLine bfrange in cmap (#1299) - Cope with 2 digit codes in bfchar (#1310) - Accept '/annn' charset as ASCII code (#1316) - Log errors during Float / NumberObject initialization (#1315) - Cope with corrupted entries in xref table (#1300) ### Documentation (DOC) - Migration guide (PyPDF2 1.x ➔ 2.x) (#1324) - Creating a coverage report (#1319) - Fix 
AnnotationBuilder.free_text example (#1311) - Fix usage of page.scale by replacing it with page.scale_by (#1313) ### Maintenance (MAINT) - PdfReaderProtocol (#1303) - Throw PdfReadError if Trailer can't be read (#1298) - Remove catching OverflowException (#1302) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.10.4...2.10.5) ## Version 2.10.4, 2022-08-28 ### Robustness (ROB) - Fix errors/warnings on no /Resources within extract_text (#1276) - Add required line separators in ContentStream ArrayObjects (#1281) ### Maintenance (MAINT) - Use NameObject idempotency (#1290) ### Testing (TST) - Rectangle deletion (#1289) - Add workflow tests (#1287) - Remove files after tests ran (#1286) ### Packaging (PKG) - Add minimum version for typing_extensions requirement (#1277) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.10.3...2.10.4) ## Version 2.10.3, 2022-08-21 ### Robustness (ROB) - Decrypt returns empty bytestring (#1258) ### Developer Experience (DEV) - Modify CI to better verify built package contents (#1244) ### Maintenance (MAINT) - Remove 'mine' as PdfMerger always creates the stream (#1261) - Let PdfMerger._create_stream raise NotImplemented (#1251) - password param of _security._alg32(...) 
is only a string, not bytes (#1259) - Remove unreachable code in read_block_backwards (#1250) and sign function in _extract_text (#1262) ### Testing (TST) - Delete annotations (#1263) - Close PdfMerger in tests (#1260) - PdfReader.xmp_metadata workflow (#1257) - Various PdfWriter (Layout, Bookmark deprecation) (#1249) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.10.2...2.10.3) ## Version 2.10.2, 2022-08-15 BUG: Add PyPDF2.generic to PyPI distribution ## Version 2.10.1, 2022-08-15 ### Bug Fixes (BUG) - TreeObject.remove_child had a non-PdfObject assignment for Count (#1233, #1234) - Fix stream truncated prematurely (#1223) ### Documentation (DOC) - Fix docstring formatting (#1228) ### Maintenance (MAINT) - Split generic.py (#1229) ### Testing (TST) - Decrypt AlgV4 with owner password (#1239) - AlgV5.generate_values (#1238) - TreeObject.remove_child / empty_tree (#1235, #1236) - create_string_object (#1232) - Free-Text annotations (#1231) - generic._base (#1230) - Strict get fonts (#1226) - Increase PdfReader coverage (#1219, #1225) - Increase PdfWriter coverage (#1237) - 100% coverage for utils.py (#1217) - PdfWriter exception non-binary stream (#1218) - Don't check coverage for deprecated code (#1216) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.10.0...2.10.1) ## Version 2.10.0, 2022-08-07 ### New Features (ENH) - "with" support for PdfMerger and PdfWriter (#1193) - Add AnnotationBuilder.text(...) 
to build text annotations (#1202) ### Bug Fixes (BUG) - Allow IndirectObjects as stream filters (#1211) ### Documentation (DOC) - Font scrambling - Page vs Content scaling (#1208) - Example for orientation parameter of extract_text (#1206) - Fix AnnotationBuilder parameter formatting (#1204) ### Developer Experience (DEV) - Add flake8-print (#1203) ### Maintenance (MAINT) - Introduce WrongPasswordError / FileNotDecryptedError / EmptyFileError (#1201) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.9.0...2.10.0) ## Version 2.9.0, 2022-07-31 ### New Features (ENH) - Add ability to add hex encoded colors to outline items (#1186) - Add support for pathlib.Path in PdfMerger.merge (#1190) - Add link annotation (#1189) - Add capability to filter text extraction by orientation (#1175) ### Bug Fixes (BUG) - Named Dest in PDF1.1 (#1174) - Incomplete Graphic State save/restore (#1172) ### Documentation (DOC) - Update changelog url in package metadata (#1180) - Mention camelot for table extraction (#1179) - Mention pyHanko for signing PDF documents (#1178) - We have had CMAP support for a while (#1177) ### Maintenance (MAINT) - Consistent usage of warnings / log messages (#1164) - Consistent terminology for outline items (#1156) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.8.1...2.9.0) ## Version 2.8.1, 2022-07-25 ### Bug Fixes (BUG) - u_hash in AlgV4.compute_key (#1170) ### Robustness (ROB) - Fix loading of file from #134 (#1167) - Cope with empty DecodeParams (#1165) ### Documentation (DOC) - Typo in merger deprecation warning message (#1166) ### Maintenance (MAINT) - Package updates; solve mypy strict remarks (#1163) ### Testing (TST) - Add test from #325 (#1169) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.8.0...2.8.1) ## Version 2.8.0, 2022-07-24 ### New Features (ENH) - Add writer.add_annotation, page.annotations, and generic.AnnotationBuilder (#1120) ### Bug Fixes (BUG) - Set /AS for /Btn form fields in writer (#1161) - Ignore 
if /Perms verify failed (#1157) ### Robustness (ROB) - Cope with utf16 character for space calculation (#1155) - Cope with null params for FitH / FitV destination (#1152) - Handle outlines without valid destination (#1076) ### Developer Experience (DEV) - Introduce _utils.logger_warning (#1148) ### Maintenance (MAINT) - Break up parse_to_unicode (#1162) - Add diagnostic output to exception in read_from_stream (#1159) - Reduce PdfReader.read complexity (#1151) ### Testing (TST) - Add workflow tests found by arc testing (#1154) - Decrypt file which is not encrypted (#1149) - Test CryptRC4 encryption class; test image extraction filters (#1147) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.7.0...2.8.0) ## Version 2.7.0, 2022-07-21 ### New Features (ENH) - Add `outline_count` property (#1129) ### Bug Fixes (BUG) - Make reader.get_fields also return dropdowns with options (#1114) - Add deprecated EncodedStreamObject functions back until PyPDF2==3.0.0 (#1139) ### Robustness (ROB) - Cope with missing /W entry (#1136) - Cope with invalid parent xref (#1133) ### Documentation (DOC) - Contributors file (#1132) - Fix type in signature of PdfWriter.add_uri (#1131) ### Developer Experience (DEV) - Add .git-blame-ignore-revs (#1141) ### Code Style (STY) - Fixing typos (#1137) - Reuse code via get_outlines_property in tests (#1130) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.6.0...2.7.0) ## Version 2.6.0, 2022-07-17 ### New Features (ENH) - Add color and font_format to PdfReader.outlines[i] (#1104) - Extract Text Enhancement (whitespaces) (#1084) ### Bug Fixes (BUG) - Use `build_destination` for named destination outlines (#1128) - Avoid a crash when a ToUnicode CMap has an empty dstString in beginbfchar (#1118) - Prevent deduplication of PageObject (#1105) - None-check in DictionaryObject.read_from_stream (#1113) - Avoid IndexError in _cmap.parse_to_unicode (#1110) ### Documentation (DOC) - Explanation for git submodule - Watermark and stamp (#1095) 
### Maintenance (MAINT) - Text extraction improvements (#1126) - Destination.color returns ArrayObject instead of tuple as fallback (#1119) - Use add_bookmark_destination in add_bookmark (#1100) - Use add_bookmark_destination in add_bookmark_dict (#1099) ### Testing (TST) - Add test for arab text (#1127) - Add xfail for decryption fail (#1125) - Add xfail test for IndexError when extracting text (#1124) - Add MCVE showing outline title issue (#1123) ### Code Style (STY) - Use IntFlag for permissions_flag / update_page_form_field_values (#1094) - Simplify code (#1101) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.5.0...2.6.0) ## Version 2.5.0, 2022-07-10 ### New Features (ENH) - Add support for indexed color spaces / BitsPerComponent for decoding PNGs (#1067) - Add PageObject._get_fonts (#1083) ### Performance Improvements (PI) - Use iterative DFS in PdfWriter._sweep_indirect_references (#1072) ### Bug Fixes (BUG) - Let Page.scale also scale the crop-/trim-/bleed-/artbox (#1066) - Column default for CCITTFaxDecode (#1079) ### Robustness (ROB) - Guard against None-value in _get_outlines (#1060) ### Documentation (DOC) - Stamps and watermarks (#1082) - OCR vs PDF text extraction (#1081) - Python Version support - Formatting of CHANGELOG ### Developer Experience (DEV) - Cache downloaded files (#1070) - Speed-up for CI (#1069) ### Maintenance (MAINT) - Set page.rotate(angle: int) (#1092) - Issue #416 was fixed by #1015 (#1078) ### Testing (TST) - Image extraction (#1080) - Image extraction (#1077) ### Code Style (STY) - Apply black - Typo in Changelog [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.4.2...2.5.0) ## Version 2.4.2, 2022-07-05 ### New Features (ENH) - Add PdfReader.xfa attribute (#1026) ### Bug Fixes (BUG) - Wrong page inserted when PdfMerger.merge is done (#1063) - Resolve IndirectObject when it refers to a free entry (#1054) ### Developer Experience (DEV) - Added {posargs} to tox.ini (#1055) ### Maintenance (MAINT) - Remove 
PyPDF2._utils.bytes_type (#1053) ### Testing (TST) - Scale page (indirect rect object) (#1057) - Simplify pathlib PdfReader test (#1056) - IndexError of VirtualList (#1052) - Invalid XML in xmp information (#1051) - No pycryptodome (#1050) - Increase test coverage (#1045) ### Code Style (STY) - DOC of compress_content_streams (#1061) - Minimize diff for #879 (#1049) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.4.1...2.4.2) ## Version 2.4.1, 2022-06-30 ### New Features (ENH) - Add writer.pdf_header property (getter and setter) (#1038) ### Performance Improvements (PI) - Remove b_ call in FloatObject.write_to_stream (#1044) - Check duplicate objects in writer._sweep_indirect_references (#207) ### Documentation (DOC) - How to suppress exceptions/warnings/log messages (#1037) - Remove hyphen from lossless (#1041) - Compression of content streams (#1040) - Fix inconsistent variable names in add-watermark.md (#1039) - File size reduction - Add CHANGELOG to the rendered docs (#1023) ### Maintenance (MAINT) - Handle XML error when reading XmpInformation (#1030) - Deduplicate Code / add mutmut config (#1022) ### Code Style (STY) - Use unnecessary one-line function / class attribute (#1043) - Docstring formatting (#1033) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.4.0...2.4.1) ## Version 2.4.0, 2022-06-26 ### New Features (ENH): - Support R6 decrypting (#1015) - Add PdfReader.pdf_header (#1013) ### Performance Improvements (PI): - Remove ord_ calls (#1014) ### Bug Fixes (BUG): - Fix missing page for bookmark (#1016) ### Robustness (ROB): - Deal with invalid Destinations (#1028) ### Documentation (DOC): - get_form_text_fields does not extract dropdown data (#1029) - Adjust PdfWriter.add_uri docstring - Mention crypto extra_requires for installation (#1017) ### Developer Experience (DEV): - Use `\n` line endings everywhere (#1027) - Adjust string formatting to be able to use mutmut (#1020) - Update Bug report template [Full 
Changelog](https://github.com/py-pdf/PyPDF2/compare/2.3.1...2.4.0) ## Version 2.3.1, 2022-06-19 BUG: Forgot to add the internal `_codecs` subpackage. [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.3.0...2.3.1) ## Version 2.3.0, 2022-06-19 The highlight of this release is improved support for file encryption (AES-128 and AES-256, R5 only). See #749 for the amazing work of @exiledkingcc 🎊 Thank you 🤗 ### Deprecations (DEP) - Rename names to be PEP8-compliant (#967) - `PdfWriter.get_page`: the pageNumber parameter is renamed to page_number - `PyPDF2.filters`: * For all classes, a parameter rename: decodeParms ➔ decode_parms * decodeStreamData ➔ decode_stream_data - `PyPDF2.xmp`: * XmpInformation.rdfRoot ➔ XmpInformation.rdf_root * XmpInformation.xmp_createDate ➔ XmpInformation.xmp_create_date * XmpInformation.xmp_creatorTool ➔ XmpInformation.xmp_creator_tool * XmpInformation.xmp_metadataDate ➔ XmpInformation.xmp_metadata_date * XmpInformation.xmp_modifyDate ➔ XmpInformation.xmp_modify_date * XmpInformation.xmpMetadata ➔ XmpInformation.xmp_metadata * XmpInformation.xmpmm_documentId ➔ XmpInformation.xmpmm_document_id * XmpInformation.xmpmm_instanceId ➔ XmpInformation.xmpmm_instance_id - `PyPDF2.generic`: * readHexStringFromStream ➔ read_hex_string_from_stream * initializeFromDictionary ➔ initialize_from_dictionary * createStringObject ➔ create_string_object * TreeObject.hasChildren ➔ TreeObject.has_children * TreeObject.emptyTree ➔ TreeObject.empty_tree ### New Features (ENH) - Add decrypt support for V5 and AES-128, AES-256 (R5 only) (#749) ### Robustness (ROB) - Fix corrupted (wrongly) linear PDF (#1008) ### Maintenance (MAINT) - Move PDF_Samples folder into resources - Fix typos (#1007) ### Testing (TST) - Improve encryption/decryption test (#1009) - Add merger test cases with real PDFs (#1006) - Add mutmut config ### Code Style (STY) - Put pure data mappings in separate files (#1005) - Make encryption module private, apply pre-commit (#1010) [Full 
Changelog](https://github.com/py-pdf/PyPDF2/compare/2.2.1...2.3.0) ## Version 2.2.1, 2022-06-17 ### Performance Improvements (PI) - Remove b_ calls (#992, #986) - Apply improvements to _utils suggested by perflint (#993) ### Robustness (ROB) - utf-16-be codec can't decode (...) (#995) ### Documentation (DOC) - Remove reference to Scripts (#987) ### Developer Experience (DEV) - Fix type annotations for add_bookmarks (#1000) ### Testing (TST) - Add test for PdfMerger (#1001) - Add tests for XMP information (#996) - reader.get_fields / zlib issue / LZW decode issue (#1004) - reader.get_fields with report generation (#1002) - Improve test coverage by extracting texts (#998) ### Code Style (STY) - Apply fixes suggested by pylint (#999) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.2.0...2.2.1) ## Version 2.2.0, 2022-06-13 The 2.2.0 release improves text extraction again via (#969): * Improvements around /Encoding / /ToUnicode * Extraction of CMaps improved * Fallback for font def missing * Support for /Identity-H and /Identity-V: utf-16-be * Support for /GB-EUC-H / /GB-EUC-V / GBp/c-EUC-H / /GBpc-EUC-V (beta release for evaluation) * Arabic (for evaluation) * Whitespace extraction improvements Those changes should mainly improve the text extraction for non-ASCII alphabets, e.g. Russian / Chinese / Japanese / Korean / Arabic. [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.1.1...2.2.0) ## Version 2.1.1, 2022-06-12 ### New Features (ENH) - Add support for pathlib as input for PdfReader (#979) ### Performance Improvements (PI) - Optimize read_next_end_line (#646) ### Bug Fixes (BUG) - Adobe Acrobat 'Would you like to save this file?' 
(#970) ### Documentation (DOC) - Notes on annotations (#982) - Who uses PyPDF2 - intendet \xe2\x9e\x94 in robustness page (#958) ### Maintenance (MAINT) - pre-commit / requirements.txt updates (#977) - Mark read_next_end_line as deprecated (#965) - Export `PageObject` in PyPDF2 root (#960) ### Testing (TST) - Add MCVE of issue #416 (#980) - FlateDecode.decode decodeParms (#964) - Xmp module (#962) - utils.paeth_predictor (#959) ### Code Style (STY) - Use more tuples and list/dict comprehensions (#976) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.1.0...2.1.1) ## Version 2.1.0, 2022-06-06 The highlight of the 2.1.0 release is the most massive improvement to the text extraction capabilities of PyPDF2 since 2016 🥳🎊 A very big thank you goes to [pubpub-zz](https://github.com/pubpub-zz) who took a lot of time and knowledge about the PDF format to finally get those improvements into PyPDF2. Thank you 🤗💚 In case the new function causes any issues, you can use `_extract_text_old` for the old functionality. Please also open a bug ticket in that case. There were several people who have attempted to bring similar improvements to PyPDF2. All of those were valuable. The main reason why they didn't get merged is the big amount of open PRs / issues. pubpub-zz was the most comprehensive PR which also incorporated the latest changes of PyPDF2 2.0.0. Thank you to [VictorCarlquist](https://github.com/VictorCarlquist) for #858 and [asabramo](https://github.com/asabramo) for #464 🤗 ### New Features (ENH) - Massive text extraction improvement (#924). 
Closed many open issues: - Exceptions / missing spaces in extract_text() method (#17) 🕺 - Whitespace issues in extract_text() (#42) 💃 - pypdf2 reads the hifenated words in a new line (#246) - PyPDF2 failing to read unicode character (#37) - Unable to read bullets (#230) - ExtractText yields nothing for apparently good PDF (#168) 🎉 - Encoding issue in extract_text() (#235) - extractText() doesn't work on Chinese PDF (#252) - encoding error (#260) - Trouble with apostophes in names in text "O'Doul" (#384) - extract_text works for some PDF files, but not the others (#437) - Euro sign not being recognized by extractText (#443) - Failed extracting text from French texts (#524) - extract_text doesn't extract ligatures correctly (#598) - reading spanish text - mark convert issue (#635) - Read PDF changed from text to random symbols (#654) - .extractText() reads / as 1. (#789) - Update glyphlist (#947) - inspired by #464 - Allow adding PageRange objects (#948) ### Bug Fixes (BUG) - Delete .python-version file (#944) - Compare StreamObject.decoded_self with None (#931) ### Robustness (ROB) - Fix some conversion errors on non conform PDF (#932) ### Documentation (DOC) - Elaborate on PDF text extraction difficulties (#939) - Add logo (#942) - rotate vs Transformation().rotate (#937) - Example how to use PyPDF2 with AWS S3 (#938) - How to deprecate (#930) - Fix typos on robustness page (#935) - Remove scripts (pdfcat) from docs (#934) ### Developer Experience (DEV) - Ignore .python-version file - Mark deprecated code with no-cover (#943) - Automatically create Github releases from tags (#870) ### Testing (TST) - Text extraction for non-latin alphabets (#954) - Ignore PdfReadWarning in benchmark (#949) - writer.remove_text (#946) - Add test for Tree and _security (#945) ### Code Style (STY) - black, isort, Flake8, splitting buildCharMap (#950) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/2.0.0...2.1.0) ## Version 2.0.0, 2022-06-01 The 2.0.0 release of PyPDF2 
includes three core changes: 1. Dropping support for Python 3.5 and older. 2. Introducing type annotations. 3. Interface changes, mostly to have PEP8-compliant names We introduced a [deprecation process](https://github.com/py-pdf/PyPDF2/pull/930) that hopefully helps users to avoid unexpected breaking changes. ### Breaking Changes (DEP) - PyPDF2 2.0 requires Python 3.6+. Python 2.7 and 3.5 support were dropped. - PdfFileReader: The "warndest" parameter was removed - PdfFileReader and PdfFileMerger no longer have the `overwriteWarnings` parameter. The new behavior is `overwriteWarnings=False`. - merger: OutlinesObject was removed without replacement. - merger.py ➔ _merger.py: You must import PdfFileMerger from PyPDF2 directly. - utils: * `ConvertFunctionsToVirtualList` was removed * `formatWarning` was removed * `isInt(obj)`: Use `isinstance(obj, int)` instead * `u_(s)`: Use `s` directly * `chr_(c)`: Use `chr(c)` instead * `barray(b)`: Use `bytearray(b)` instead * `isBytes(b)`: Use `isinstance(b, type(bytes()))` instead * `xrange_fn`: Use `range` instead * `string_type`: Use `str` instead * `isString(s)`: Use `isinstance(s, str)` instead * `_basestring`: Use `str` instead * All Exceptions are now in `PyPDF2.errors`: - PageSizeNotDefinedError - PdfReadError - PdfReadWarning - PyPdfError - `PyPDF2.pdf` (the `pdf` module) no longer exists. The contents were moved with the library. You should most likely import directly from `PyPDF2` instead. The `RectangleObject` is in `PyPDF2.generic`. - The `Resources`, `Scripts`, and `Tests` will no longer be part of the distribution files on PyPI. This should have little to no impact on most people. The `Tests` are renamed to `tests`, the `Resources` are renamed to `resources`. Both are still in the git repository. The `Scripts` are now in [cpdf](https://github.com/py-pdf/cpdf). `Sample_Code` was moved to the `docs`. For a full list of deprecated functions, please see the changelog of version 1.28.0. 
### New Features (ENH) - Improve space setting for text extraction (#922) - Allow setting the decryption password in `PdfReader.__init__` (#920) - Add Page.add_transformation (#883) ### Bug Fixes (BUG) - Fix error adding transformation to page without /Contents (#908) ### Robustness (ROB) - Cope with invalid length in streams (#861) ### Documentation (DOC) - Fix style of 1.25 and 1.27 patch notes (#927) - Transformation (#907) ### Developer Experience (DEV) - Create flake8 config file (#916) - Use relative imports (#875) ### Maintenance (MAINT) - Use Python 3.6 language features (#849) - Add wrapper function for PendingDeprecationWarnings (#928) - Use new PEP8 compliant names (#884) - Explicitly represent transformation matrix (#878) - Inline PAGE_RANGE_HELP string (#874) - Remove unnecessary generics imports (#873) - Remove star imports (#865) - merger.py ➔ _merger.py (#864) - Type annotations for all functions/methods (#854) - Add initial type support with mypy (#853) ### Testing (TST) - Regression test for xmp_metadata converter (#923) - Checkout submodule sample-files for benchmark - Add text extracting performance benchmark - Use new PyPDF2 API in benchmark (#902) - Make test suite fail for uncaught warnings (#892) - Remove -OO testrun from CI (#901) - Improve tests for convert_to_int (#899) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.28.4...2.0.0) ## PyPDF2 1.X See [CHANGELOG PyPDF2 1.X](changelog-v1.md) ================================================ FILE: CONTRIBUTING.md ================================================ Please check the [documentation page dedicated to development](https://pypdf.readthedocs.io/en/stable/dev/intro.html). ## Creating issues / tickets Please go here: https://github.com/py-pdf/pypdf/issues Typically you should not send e-mails. E-mails might only reach one person and it could go into spam or that person might be busy. Please create issues on GitHub instead. Please use the templates provided. 
Keep in mind that although PDF has an official specification, there are tons of variations which might require special handling. Thus, please always provide a reproducing example file for us to work with. Otherwise, we have to guess possible issues, leading to unnecessary overhead - especially since most of the contributions happen during our free time. If you already know a fix, consider opening a pull request after reporting the issue to make life easier for everyone. ## Creating Pull Requests We appreciate if people make PRs, but please be aware that pypdf is used by many people. That means: * We rarely make breaking changes and have a [deprecation process](https://pypdf.readthedocs.io/en/latest/dev/deprecations.html). * New features, especially adding to the public interface, typically need to be discussed first. Before you make bigger changes, open an issue to make the suggestion. Note which interface changes you want to make. ================================================ FILE: CONTRIBUTORS.md ================================================ # Contributors pypdf had a lot of contributors since it started as pyPdf in 2005. We are a free software project without any company affiliation. We cannot pay contributors, but we do value their contributions. A lot of time, effort, and expertise went into this project. With this list, we recognize these awesome people 🤗 The list is definitely not complete. You can find more contributors via the git history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/graphs/contributors). 
## Contributors to the pypdf (formerly pyPdf / PyPDF2) project * [abyesilyurt](https://github.com/abyesilyurt) * [ArkieCoder](https://github.com/ArkieCoder) * [Beers, PJ](https://github.com/PJBrs) * [Clauss, Christian](https://github.com/cclauss) * [DL6ER](https://github.com/DL6ER) * [Duy, Phan Thanh](https://github.com/zuypt) * [ediamondscience](https://github.com/ediamondscience) * [Ermeson, Felipe](https://github.com/FelipeErmeson) * [Freitag, François](https://github.com/francoisfreitag) * [Gagnon, William G.](https://github.com/williamgagnon) * [Gillard, James](https://github.com/jgillard) * [Górny, Michał](https://github.com/mgorny) * [Grillo, Miguel](https://github.com/Ineffable22) * [Gutteridge, David H.](https://github.com/dhgutteridge) * [Hale, Joseph](https://github.com/thehale) * [harshhes](https://github.com/harshhes) * [Jackowitz, Noah](https://github.com/hackowitz-af) | [LinkedIn](https://www.linkedin.com/in/noah-jackowitz/) * [JianzhengLuo](https://github.com/JianzhengLuo) * [Karvonen, Harry](https://github.com/Hatell/) * [King, Hunter](https://github.com/neversphere) * [Kotler, Mitchell](https://github.com/mitchelljkotler) * [KourFrost](https://github.com/KourFrost) * [Lightup1](https://github.com/Lightup1) * [Majumder, Jonah](https://github.com/jonahmajumder) * [Manini, Lorenzo](https://github.com/lorenzomanini) * [maxbeer99](https://github.com/maxbeer99) * [McNeil, Karen](https://github.com/karenlmcneil): Arabic Language Support * [Mérino, Antoine](https://github.com/Merinorus) * [Murphy, Kevin](https://github.com/kmurphy4) * [nalin-udhaar](https://github.com/nalin-udhaar) * [Noah-Houghton](https://github.com/Noah-Houghton) | [LinkedIn](https://www.linkedin.com/in/noah-h-554992a0/) * [Paramonov, Alexey](https://github.com/alexey-v-paramonov) * [Paternault, Louis](https://framagit.org/spalax) * [Perrensen, Olsen](https://github.com/olsonperrensen) * [pilotandy](https://github.com/pilotandy) * [Pinheiro, Arthur](https://github.com/xilopaint) * 
[pmiller66](https://github.com/pmiller66) * [Poddar, Arka](https://github.com/postmeback) * [programmarchy](https://github.com/programmarchy) * [pubpub-zz](https://github.com/pubpub-zz): involved in community development * [Ramos, Leodanis Pozo](https://github.com/lpozo) * [RitchieP](https://github.com/RitchieP) | [LinkedIn](https://www.linkedin.com/in/ritchie-p-892b31115/) | [StackOverflow](https://stackoverflow.com/users/13328625/casual-r?tab=profile) * [robbiebusinessacc](https://github.com/robbiebusinessacc) * [Roder, Thomas](https://github.com/MrTomRod) * [Rogmann, Sascha](https://github.com/srogmann) * [Röthenbacher, Thomas](https://github.com/troethe) * [shartzog](https://github.com/shartzog) * [stefan6419846](https://github.com/stefan6419846): Maintainer of pypdf since January 2025 * [sietzeberends](https://github.com/sietzeberends) * [Stober, Marc](https://github.com/marcstober) * [Stüber, Timo](https://github.com/omit66) * [Thoma, Martin](https://github.com/MartinThoma): Maintainer of pypdf from April 2022 to January 2025. I hope to build a great community with many awesome contributors. [LinkedIn](https://www.linkedin.com/in/martin-thoma/) | [StackOverflow](https://stackoverflow.com/users/562769/martin-thoma) | [Blog](https://martin-thoma.com/) * [Thomas, Reuben](https://github.com/rrthomas) * [Tobeabellwether](https://github.com/Tobeabellwether) * [van Alst, Ludo](https://github.com/LudovA) * [WevertonGomes](https://github.com/WevertonGomesCosta) * [Wilson, Huon](https://github.com/huonw) * ztravis ## Adding a new contributor Contributors are: * Anybody who has a commit in `main` - no matter how small or how many. Also if it's via *co-authored-by*. * People who opened helpful issues: 1. Bugs: with complete MCVE 2. Well-described feature requests 3. Potentially some more. The maintainers of pypdf have the last call on that one. * Community work: This is exceptional. 
If the maintainers of pypdf see people being super helpful in answering issues / discussions or being very active on Stackoverflow, we also consider them being contributors to pypdf. Contributors can add themselves or ask via a GitHub Issue to be added. Please use the following format: ``` * Last name, First name: 140-characters of text; links to LinkedIn / GitHub / other profiles and personal pages are ok ``` OR ``` * GitHub Username: 140-characters of text; links to LinkedIn / GitHub / other profiles and personal pages are ok ``` and add the entry in the alphabetical order. The 140 characters are everything visible after the `Name:`. Please don't use images. ================================================ FILE: LICENSE ================================================ Copyright (c) 2006-2008, Mathieu Fenniak Some contributions copyright (c) 2007, Ashish Kulkarni Some contributions copyright (c) 2014, Steve Witham All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: Makefile ================================================ maint: pre-commit autoupdate pip-compile -U requirements/ci.in pip-compile -U requirements/dev.in pip-compile -U requirements/docs.in release: python make_release.py git commit -eF RELEASE_COMMIT_MSG.md clean: python -m pip install pyclean pyclean . rm -rf tests/__pycache__ pypdf/__pycache__ htmlcov docs/_build dist pypdf.egg-info .pytest_cache .mypy_cache .benchmarks test: pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=60 pypdf testtype: pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30 --typeguard-packages=pypdf benchmark: pytest tests/bench.py mypy: mypy pypdf --ignore-missing-imports --check-untyped --strict ruff: ruff check pypdf tests make_release.py ================================================ FILE: README.md ================================================ [![PyPI version](https://badge.fury.io/py/pypdf.svg)](https://badge.fury.io/py/pypdf) [![Python Support](https://img.shields.io/pypi/pyversions/pypdf.svg)](https://pypi.org/project/pypdf/) [![](https://img.shields.io/badge/-documentation-green)](https://pypdf.readthedocs.io/en/stable/) [![GitHub last commit](https://img.shields.io/github/last-commit/py-pdf/pypdf)](https://github.com/py-pdf/pypdf) 
[![codecov](https://codecov.io/gh/py-pdf/pypdf/branch/main/graph/badge.svg?token=id42cGNZ5Z)](https://codecov.io/gh/py-pdf/pypdf) # pypdf pypdf is a free and open-source pure-python PDF library capable of splitting, [merging](https://pypdf.readthedocs.io/en/stable/user/merging-pdfs.html), [cropping, and transforming](https://pypdf.readthedocs.io/en/stable/user/cropping-and-transforming.html) the pages of PDF files. It can also add custom data, viewing options, and [passwords](https://pypdf.readthedocs.io/en/stable/user/encryption-decryption.html) to PDF files. pypdf can [retrieve text](https://pypdf.readthedocs.io/en/stable/user/extract-text.html) and [metadata](https://pypdf.readthedocs.io/en/stable/user/metadata.html) from PDFs as well. See [pdfly](https://github.com/py-pdf/pdfly) for a CLI application that uses pypdf to interact with PDFs. ## Installation Install pypdf using pip: ``` pip install pypdf ``` For using pypdf with AES encryption or decryption, install extra dependencies: ``` pip install pypdf[crypto] ``` > **NOTE**: `pypdf` 3.1.0 and above include significant improvements compared to > previous versions. Please refer to [the migration > guide](https://pypdf.readthedocs.io/en/latest/user/migration-1-to-2.html) for > more information. ## Usage ```python from pypdf import PdfReader reader = PdfReader("example.pdf") number_of_pages = len(reader.pages) page = reader.pages[0] text = page.extract_text() ``` pypdf can do a lot more, e.g. splitting, merging, reading and creating annotations, decrypting and encrypting. Check out the [documentation](https://pypdf.readthedocs.io/en/stable/) for additional usage examples! For questions and answers, visit [StackOverflow](https://stackoverflow.com/questions/tagged/pypdf) (tagged with [pypdf](https://stackoverflow.com/questions/tagged/pypdf)). ## Contributions Maintaining pypdf is a collaborative effort. You can support the project by writing documentation, helping to narrow down issues, and submitting code. 
See the [CONTRIBUTING.md](https://github.com/py-pdf/pypdf/blob/main/CONTRIBUTING.md) file for more information. ### Q&A The experience pypdf users have covers the whole range from beginner to expert. You can contribute to the pypdf community by answering questions on [StackOverflow](https://stackoverflow.com/questions/tagged/pypdf), helping in [discussions](https://github.com/py-pdf/pypdf/discussions), and asking users who report issues for [MCVE](https://stackoverflow.com/help/minimal-reproducible-example)'s (Code + example PDF!). ### Issues A good bug ticket includes a MCVE - a minimal complete verifiable example. For pypdf, this means that you must upload a PDF that causes the bug to occur as well as the code you're executing with all of the output. Use `print(pypdf.__version__)` to tell us which version you're using. ### Code All code contributions are welcome, but smaller ones have a better chance to get included in a timely manner. Adding unit tests for new features or test cases for bugs you've fixed help us to ensure that the Pull Request (PR) is fine. pypdf includes a test suite which can be executed with `pytest`: ```bash $ pytest ===================== test session starts ===================== platform linux -- Python 3.6.15, pytest-7.0.1, pluggy-1.0.0 rootdir: /home/moose/GitHub/Martin/pypdf plugins: cov-3.0.0 collected 233 items tests/test_basic_features.py .. [ 0%] tests/test_constants.py . [ 1%] tests/test_filters.py .................x..... [ 11%] tests/test_generic.py ................................. [ 25%] ............. [ 30%] tests/test_javascript.py .. [ 31%] tests/test_merger.py . [ 32%] tests/test_page.py ......................... [ 42%] tests/test_pagerange.py ................ [ 49%] tests/test_papersizes.py .................. [ 57%] tests/test_reader.py .................................. [ 72%] ............... [ 78%] tests/test_utils.py .................... [ 87%] tests/test_workflows.py .......... 
[ 91%] tests/test_writer.py ................. [ 98%] tests/test_xmp.py ... [100%] ========== 232 passed, 1 xfailed, 1 warning in 4.52s ========== ``` ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/_static/releasing.drawio ================================================ ================================================ FILE: docs/conf.py ================================================ """ Configuration file for the Sphinx documentation builder. This file only contains a selection of the most common options. For a full list see the documentation: https://www.sphinx-doc.org/en/master/usage/configuration.html """ # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
import datetime import os import shutil import sys from pathlib import Path sys.path.insert(0, os.path.abspath(".")) sys.path.insert(0, os.path.abspath("../")) import pypdf as py_pkg shutil.copyfile("../CHANGELOG.md", "meta/CHANGELOG.md") shutil.copyfile("../CONTRIBUTORS.md", "meta/CONTRIBUTORS.md") # -- Project information ----------------------------------------------------- project = py_pkg.__name__ copyright = f"2006 - {datetime.datetime.now(tz=datetime.timezone.utc).year}, Mathieu Fenniak and pypdf contributors" author = "Mathieu Fenniak" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = py_pkg.__version__ # The full version, including alpha/beta/rc tags. release = py_pkg.__version__ # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. needs_sphinx = "4.0.0" # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.autosummary", "sphinx.ext.coverage", "sphinx.ext.mathjax", "sphinx.ext.viewcode", "sphinx.ext.napoleon", "sphinx.ext.doctest", # External "myst_parser", ] python_version = ".".join(map(str, sys.version_info[:2])) intersphinx_mapping = { "python": (f"https://docs.python.org/{python_version}", None), "Pillow": ("https://pillow.readthedocs.io/en/latest/", None), } nitpick_ignore_regex = [ # For reasons unclear at this stage, the io module prefixes everything with _io # and this confuses sphinx ( r"py:class", r"(_io.(FileIO|BytesIO|Buffered(Reader|Writer))|pypdf.*PdfDocCommon)", ), ] autodoc_default_options = { "member-order": "bysource", "members": True, "show-inheritance": True, "undoc-members": True, } autodoc_inherit_docstrings = False autodoc_typehints_format = "short" python_use_unqualified_type_names = True # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # Configure MyST extension. myst_all_links_external = False myst_heading_anchors = 3 # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. 
html_theme_options = { "canonical_url": "", "analytics_id": "", "logo_only": True, "prev_next_buttons_location": "bottom", "style_external_links": False, # Toc options "collapse_navigation": True, "sticky_navigation": True, "navigation_depth": 4, "includehidden": True, "titles_only": False, } html_logo = "_static/logo.png" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] # -- Options for Napoleon ----------------------------------------------------- napoleon_google_docstring = True napoleon_numpy_docstring = False # Explicitly prefer Google style docstring napoleon_use_param = True # for type hint support napoleon_use_rtype = False # False, so the return type is inline with the description. # -- Options for Doctest ------------------------------------------------------ # Most of doc examples use hardcoded input and output file names. # To execute these examples real files need to be read and written. # # By default, documentation examples run with the working directory set to where # "sphinx-build" command was invoked. To avoid relative paths in docs and to # allow to run "sphinx-build" command from any directory, we modify the current # working directory in each tested file. Tests are executed against our # temporary directory where we have copied all necessary resources. # # Each doc page that requires file operations must use "testsetup" directive # to call "pypdf_test_setup" function to prepare the test environment for that # page. # # def pypdf_test_setup(group: str, resources: dict[str, str] = {}) -> None # # Args: # group: A unique name for group of tests. Typically we group tests by doc page. # For each doc page we create a test folder under # "_build/doctest/pypdf_test/". This allows to avoid file name conflicts # between different doc pages. 
# resources: A dictionary of source files to copy into the test folder. # Key is the destination file name (relative to the test folder). # Value is the source file path (relative to the root folder). # # Examples: # ```{testsetup} # pypdf_test_setup("user/add-javascript", { # "example.pdf": "../resources/example.pdf", # }) # ``` pypdf_test_src_root_dir = os.path.abspath(".") pypdf_test_dst_root_dir = os.path.abspath("_build/doctest/pypdf_test") if Path(pypdf_test_dst_root_dir).exists(): shutil.rmtree(pypdf_test_dst_root_dir) Path(pypdf_test_dst_root_dir).mkdir(parents=True) doctest_global_setup = f""" def pypdf_test_global_setup(): import os import shutil from pathlib import Path src_root_dir = {pypdf_test_src_root_dir.__repr__()} dst_root_dir = {pypdf_test_dst_root_dir.__repr__()} global pypdf_test_orig_dir pypdf_test_orig_dir = os.getcwd() os.chdir(dst_root_dir) global pypdf_test_setup def pypdf_test_setup(group: str, resources: dict[str, str] = {{}}) -> None: dst_dir = os.path.join(dst_root_dir, group) Path(dst_dir).mkdir(parents=True) os.chdir(dst_dir) for (dst_path, src_path) in resources.items(): src = os.path.normpath(os.path.join(src_root_dir, src_path)) dst = os.path.join(dst_dir, dst_path) shutil.copyfile(src, dst) pypdf_test_global_setup() """ doctest_global_cleanup = f""" def pypdf_test_global_cleanup(): import os dst_root_dir = {pypdf_test_dst_root_dir.__repr__()} os.chdir(pypdf_test_orig_dir) has_files = False for name in os.listdir(dst_root_dir): file_name = os.path.join(dst_root_dir, name) if os.path.isfile(file_name): if not has_files: print("Docs page was not configured propery for running code examples") print("Please use 'pypdf_test_setup' function in 'testsetup' directive") print("Deleting unexpected file(s) in " + dst_root_dir) has_files = True print(f"- {{name}}") os.remove(file_name) # Avoid side effects on other tests pypdf_test_global_cleanup() """ ================================================ FILE: docs/dev/cmaps.md 
================================================ # CMaps Looking at the cmap of "crazyones": ```bash pdftk crazyones.pdf output crazyones-uncomp.pdf uncompress ``` You can see this: ```text begincmap /CMapName /T1Encoding-UTF16 def /CMapType 2 def /CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def 1 begincodespacerange <00> <FF> endcodespacerange 1 beginbfchar <1B> <FB00> endbfchar endcmap CMapName currentdict /CMap defineresource pop ``` ## codespacerange A codespacerange maps a complete sequence of bytes to a range of Unicode glyphs. It defines a starting point: ```text 1 beginbfchar <1B> <FB00> ``` That means that `1B` (Hex for 27) maps to the Unicode character [`FB00`](https://unicode-table.com/en/FB00/) - the ligature ff (two lowercase f's). The two numbers in `begincodespacerange` mean that it starts with an offset of 0 (hence from `1B ➜ FB00`) up to an offset of FF (dec: 255), hence 1B+FF = 282 ➜ [FBFF](https://www.compart.com/de/unicode/U+FBFF). Within the text stream, there is ```text (The)-342(mis\034ts.) ``` `\034 ` is octal for the decimal value 28. ================================================ FILE: docs/dev/deprecations.md ================================================ # The Deprecation Process pypdf strives to be an excellent library for its current users and for new ones. We are careful with introducing potentially breaking changes, but we will do them if they provide value for the community in the long run. We hope and think that deprecations will not happen frequently. If they do, users can rely on the following procedure. ## Semantic Versioning pypdf uses [semantic versioning](https://semver.org/). If you want to avoid breaking changes, please use dependency pinning (also known as version pinning). In Python, this is done by specifying the exact version you want to use in a `requirements.txt` file. A tool that can support you is `pip-compile` from [`pip-tools`](https://pypi.org/project/pip-tools/). 
If you are using [Poetry](https://pypi.org/project/poetry/) it is done with the `poetry.lock` file. ## How pypdf deprecates features Assume the current version of pypdf is `x.y.z`. After a discussion (e.g., via GitHub issues), we decided to remove a class / function / method. This is how we do it: 1. `x.y.(z+1)`: Add a DeprecationWarning. If there is a replacement, the replacement is also introduced and the warning informs about the change and when it will happen. The docs let users know about the deprecation and when it will happen and the new function. The CHANGELOG informs about it. 2. `(x+1).0.0`: Remove / change the code in the breaking way by replacing DeprecationWarnings by DeprecationErrors. We do this to help people who didn't look at the warnings before. The CHANGELOG informs about it. 3. `(x+2).0.0`: The DeprecationErrors are removed. This means the users have three warnings in the CHANGELOG, a DeprecationWarning until the next major release and a DeprecationError until the major release after that. Please note that adding warnings can be a breaking change for some users; most likely just in the CI. This means it needs to be properly documented. ================================================ FILE: docs/dev/documentation.md ================================================ # Documentation This documentation is built with [Sphinx](https://www.sphinx-doc.org/) and hosted by [Read the Docs](https://about.readthedocs.com/). ## Testing code snippets Almost all Python code snippets in the documentation are tested using the Sphinx extension [sphinx.ext.doctest](https://www.sphinx-doc.org/en/master/usage/extensions/doctest.html). This allows us to make sure that we have no typos, missed imports and other problems in: - code snippets marked with `testcode` directive in `*.md` files - code snippets from python's docstrings imported via `autoclass` directive in `*.rst` files The CI pipeline is configured to run Sphinx's `doctest` build automatically for each PR. 
It is also possible to run it locally: 1. First you need to install docs requirements ```bash pip install -r requirements/docs.txt ``` 2. Change current directory ```bash cd docs ``` 3. Run `doctest` build. It indirectly uses the `sphinx-build` command line tool installed with the docs requirements. See [Sphinx's docs](https://www.sphinx-doc.org/en/master/usage/quickstart.html#running-the-build) for details. ```bash make doctest ``` 4. If everything is okay you should see in output `Doctest summary` without failures ## API Reference ### Method / Function Docstrings We use Google-Style Docstrings: ``` def example(param1: int, param2: str) -> bool: """ Example function with PEP 484 type annotations. Args: param1: The first parameter. param2: The second parameter. Returns: The return value. True for success, False otherwise. Raises: AttributeError: The ``Raises`` section is a list of all exceptions that are relevant to the interface. ValueError: If `param2` is equal to `param1`. Examples: Examples should be written in doctest format, and should illustrate how to use the function. >>> print([i for i in example_generator(4)]) [0, 1, 2, 3] """ ``` * The order of sections is (1) Args (2) Returns (3) Raises (4) Examples * If there is no return value, remove the 'Returns' block * Properties should not have any sections ## Issues and PRs An issue can be used to discuss what we want to achieve. A PR can be used to discuss how we achieve it. ## Commit Messages We want to have descriptive commits in the `main` branch. For this reason, every pull request (PR) is squashed. That means no matter how many commits a PR has, in the end only one combined commit will be in `main`. The title of the PR will be used as the first line of that combined commit message. The first comment within the commit will be used as the message body. See [developer intro](intro.md#commit-messages) for more details.
================================================ FILE: docs/dev/intro.md ================================================ # Developer Intro pypdf is a library and hence its users are developers. This document is not for the users, but for people who want to work on pypdf itself. ```{note} Our CI (continuous integration) validates that relevant standards are met with your contribution. Especially for regular contributors or larger changes, it is highly recommended that you set up your own development environment to already cover the most important aspects locally. This greatly helps us to reduce the noise compared to when you open an untested PR early and use our CI to do your debugging and improvements from there. The maintainers usually receive a notification on every push to a branch where a corresponding PR is open, possibly hiding important notifications. ``` ## Installing Requirements ``` pip install -r requirements/dev.txt ``` ## Running Tests See [testing pypdf with pytest](testing.md). ## The sample-files git submodule The reason for having the submodule `sample-files` is that we want to keep the size of the pypdf repository small while we also want to have an extensive test suite. Those two goals contradict each other. The `resources` folder should contain a select set of core examples that cover most cases we typically want to test for. The `sample-files` might cover a lot more edge cases, the behavior we get when file sizes get bigger, different PDF producers. To get the sample-files folder, you need to execute: ``` git submodule update --init ``` ## Tools: git and pre-commit Git is a command line application for version control. If you don't know it, you can [play ohmygit](https://ohmygit.org/) to learn it. GitHub is the service where the pypdf project is hosted. While git is free and open source, GitHub is a paid service by Microsoft, but free in a lot of cases. 
[pre-commit](https://pypi.org/project/pre-commit/) is a command line application that uses git hooks to automatically execute code. This allows you to avoid style issues and other code quality issues. After you entered `pre-commit install` once in your local copy of pypdf, it will automatically be executed when you `git commit`. ## Commit Messages Having a clean commit message helps people to quickly understand what the commit is about, without actually looking at the changes. The first line of the commit message is used to [auto-generate the CHANGELOG](https://github.com/py-pdf/pypdf/blob/main/make_release.py). For this reason, the format should be: ``` PREFIX: DESCRIPTION BODY ``` The `PREFIX` can be: * `SEC`: Security improvements. Typically, an infinite loop that was possible. * `BUG`: A bug was fixed. Likely there are one or multiple issues. Then write in the `BODY`: `Closes #123` where 123 is the issue number on GitHub. It would be absolutely amazing if you could write a regression test in those cases. That is a test that would fail without the fix. A bug is always an issue for pypdf users - test code or CI that was fixed is not considered a bug here. * `ENH`: A new feature! Describe in the body what it can be used for. * `DEP`: Deprecation. Either marking something as "this is going to be removed" or actually removing it. * `PI`: A performance improvement. This could also be a reduction in the file size of PDF files generated by pypdf. * `ROB`: A robustness change. Dealing better with broken PDF files. * `DOC`: A documentation change. * `TST`: Adding or adjusting tests. * `DEV`: Developer experience improvements, e.g., pre-commit or setting up CI. * `MAINT`: Quite a lot of different stuff. Performance improvements are, for sure, the most interesting changes in here. Refactorings as well. * `STY`: A style change. Something that makes pypdf code more consistent. Typically, a small change. It could also be better error messages for end users. 
The prefix is used to generate the CHANGELOG. Every PR must have exactly one - if you feel like several match, take the top one from this list that matches for your PR. ## Pull Request Size Smaller Pull Requests (PRs) are preferred as it's typically easier to merge them. For example, if you have some typos, a few code-style changes, a new feature, and a bug-fix, that could be three or four PRs. A PR must be complete. That means if you introduce a new feature, it must be finished within the PR and have a test for that feature. ## Benchmarks We need to keep an eye on performance, and thus we have a few benchmarks. See [py-pdf.github.io/pypdf/dev/bench](https://py-pdf.github.io/pypdf/dev/bench/) ================================================ FILE: docs/dev/pdf-format.md ================================================ # The PDF Format It is recommended to look in the PDF specification for details and clarifications. * [PDF Specification Archive](https://pdfa.org/resource/pdf-specification-archive/) * [Portable Document Format Reference Manual, 1993. ISBN 0-201-62628-4](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.0.pdf) * [ISO 32000-1:2008 (PDF 1.7)](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf) * ISO 32000-2:2020 (PDF 2.0) ```{note} We currently generate files with a header for PDF 1.3 by default. At the same time, we strive to support the PDF 1.7 specification. Features specific to PDF 2.0 might be available, but we always ensure that older versions do not break due to the rather limited general PDF 2.0 support in the wild and to not break for old PDF files. For this reason, some historical aspects (like insecure encryption algorithms) are required to be supported, although PDF 2.0 deprecates most of them and allows more secure variants. ``` Below is only intended to give a very rough overview of the format. ## Overall Structure A PDF consists of: 1. Header: Contains the version of the PDF, e.g. 
`%PDF-1.7` 2. Body: Contains a sequence of indirect objects 3. Cross-reference table (xref): Contains a list of the indirect objects in the body 4. Trailer ## The xref table A cross-reference table (xref) is a table of the indirect objects in the body. It allows quick access to those objects by pointing to their location in the file. It looks like this: ```text xref 42 5 0000001000 65535 f 0000001234 00000 n 0000001987 00000 n 0000011987 00000 n 0000031987 00000 n ``` Let's go through it step-by-step: * `xref` is just a keyword that specifies the start of the xref table. * `42` is the numerical ID of the first object in this xref section; `5` is the number of entries in the xref table. * Now every object has 3 entries `nnnnnnnnnn ggggg n`: a 10-digit byte offset, a 5-digit generation number, and a literal keyword which is either `n` or `f`. * `nnnnnnnnnn` is the byte offset of the object. It tells the reader where the object is in the file. * `ggggg` is the generation number. It tells the reader how old the object is. * `n` means that the object is a normal in-use object, `f` means that the object is a free object. * The first free object always has a generation number of 65535. It forms the head of a linked-list of all free objects. * The generation number of a normal object is always 0. The generation number allows the PDF format to contain multiple versions of the same object. This is a version history mechanism. ## The body The body is a sequence of indirect objects: `counter generation_number << the_object >> endobj` * `counter` (integer) is a unique identifier for the object. * `generation_number` (integer) is the generation number of the object. * `the_object` is the object itself. It can be empty. Starts with `/Keyword` to specify which kind of object it is. * `endobj` marks the end of the object. 
A concrete example can be found in `test_reader.py::test_get_images_raw`: ```text 1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj 2 0 obj << >> endobj 3 0 obj << >> endobj 4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0] /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R /Resources << /Font << >> >> /Rotate 0 /Type /Page >> endobj 5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj ``` ## The trailer The trailer looks like this: ```text trailer << /Root 5 0 R /Size 6 >> startxref 1234 %%EOF ``` Let's go through it: * `trailer <<` indicates that the *trailer dictionary* starts. It ends with `>>`. * `startxref` is a keyword followed by the byte-location of the `xref` keyword. As the trailer is always at the bottom of the file, this allows readers to quickly find the xref table. * `%%EOF` is the end-of-file marker. The trailer dictionary is a key-value list. The keys are specified in Table 15 of the PDF Reference 1.7, e.g. `/Root` and `/Size` (both are required). * `/Root` (dictionary) contains the document catalog. * The `5` is the object number of the catalog dictionary. * `0` is the generation number of the catalog dictionary. * `R` is the keyword that indicates that the object is a reference to the catalog dictionary. * `/Size` (integer) contains the total number of entries in the file's xref table. ## Reading PDF files Most PDF files are compressed. If you want to read them, first uncompress them: ```bash pdftk crazyones.pdf output crazyones-uncomp.pdf uncompress ``` Then rename `crazyones-uncomp.pdf` to `crazyones-uncomp.txt` and open it in your favorite IDE / text editor. ================================================ FILE: docs/dev/pypdf-parsing.md ================================================ # How pypdf parses PDF files pypdf uses {class}`~pypdf.PdfReader` to parse PDF files. The method {py:meth}`PdfReader.read ` shows the basic structure of parsing: 1.
**Finding and reading the cross-reference tables / trailer**: The cross-reference table (xref table) is a table of byte offsets that indicate the locations of objects within the file. The trailer provides additional information such as the root object (Catalog) and the Info object containing metadata. 2. **Parsing the objects**: After locating the xref table and the trailer, pypdf proceeds to parse the objects in the PDF. Objects in a PDF can be of various types such as dictionaries, arrays, streams, and simple data types (e.g., integers, strings). pypdf parses these objects and stores them in {py:meth}`PdfReader.resolved_objects `, populated by {py:meth}`cache_indirect_object `. 3. **Decoding content streams**: The content of a PDF is typically stored in content streams, which are sequences of PDF operators and operands. pypdf decodes these content streams by applying filters (e.g., `FlateDecode`, `LZWDecode`) specified in the stream's dictionary. This is only done when the object is requested by {py:meth}`PdfReader.get_object ` which uses the `PdfReader._get_object_from_stream` method. ## References [PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf): * 7.5 File Structure * 7.5.4 Cross-Reference Table * 7.8 Content Streams and Resources ================================================ FILE: docs/dev/pypdf-writing.md ================================================ # How pypdf writes PDF files pypdf uses {py:class}`PdfWriter ` to write PDF files. pypdf has {py:class}`PdfObject ` and several subclasses with the {py:meth}`write_to_stream ` method. The {py:meth}`PdfWriter.write ` method uses the `write_to_stream` methods of the referenced objects. The {py:meth}`PdfWriter.write_stream ` method has the following core steps: 1. `_sweep_indirect_references`: This step ensures that any circular references to objects are correctly handled. 
It adds the object reference numbers of any circularly referenced objects to an external reference map, so that self-page-referencing trees can reference the correct new object location, rather than copying in a new copy of the page object. 2. **Write the File Header and Body** with `_write_pdf_structure`: In this step, the PDF header and objects are written to the output stream. This includes the PDF version (e.g., %PDF-1.7) and the objects that make up the content of the PDF, such as pages, annotations, and form fields. The locations (byte offsets) of these objects are stored for later use in generating the xref table. 3. **Write the Cross-Reference Table** with `_write_xref_table`: Using the stored object locations, this step generates and writes the cross-reference table (xref table) to the output stream. The cross-reference table contains the byte offsets for each object in the PDF file, allowing for quick random access to objects when reading the PDF. 4. **Write the File Trailer** with `_write_trailer`: The trailer is written to the output stream in this step. The trailer contains essential information, such as the number of objects in the PDF, the location of the root object (Catalog), and the Info object containing metadata. The trailer also specifies the location of the xref table. ## How others do it Looking at alternative software designs and implementations can help to improve our choices. ### fpdf2 [fpdf2](https://pypi.org/project/fpdf2/) has a [`PDFObject` class](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py) with a serialize method which roughly maps to `pypdf.PdfObject.write_to_stream`. Some other similarities include: * [fpdf.output.OutputProducer.buffersize](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/output.py#L370-L485) vs. {py:meth}`pypdf.PdfWriter.write_stream ` * [fpdf.syntax.Name](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py#L124) vs.
{py:class}`pypdf.generic.NameObject ` * [fpdf.syntax.build_obj_dict](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py#L222) vs. {py:class}`pypdf.generic.DictionaryObject ` * [fpdf.structure_tree.NumberTree](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/structure_tree.py#L17) vs. {py:class}`pypdf.generic.TreeObject ` ### pdfrw [pdfrw](https://pypi.org/project/pdfrw/), in contrast, seems to work more with the standard Python objects (bool, float, string) and not wrap them in custom objects, if possible. It still has: * [PdfArray](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfarray.py#L13) * [PdfDict](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfdict.py#L49) * [PdfName](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfname.py#L65) * [PdfString](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfstring.py#L322) * [PdfIndirect](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfindirect.py#L10) The core classes of pdfrw are [PdfReader](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/pdfreader.py#L26) and [PdfWriter](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/pdfwriter.py#L224) ================================================ FILE: docs/dev/releasing.md ================================================ # Releasing A `pypdf` release contains the following artifacts: * A new [release on PyPI](https://pypi.org/project/pypdf/) * A [release commit](https://github.com/py-pdf/pypdf/commit/91391b18bb8ec9e6e561e2795d988e8634a01a50) * Containing a changelog update * A new [git tag](https://github.com/py-pdf/pypdf/tags) * A [GitHub release](https://github.com/py-pdf/pypdf/releases/tag/3.15.0) ## Who does it? `pypdf` should typically only be released by one of the core maintainers / the core maintainer. At the moment, this usually is stefan6419846. Any owner of the py-pdf organization also has the technical permissions to release. ## How is it done? 
### With direct push permissions This is the typical way for the core maintainer/benevolent dictator. The release contains the following steps: 1. Update the CHANGELOG.md and the _version.py via `python make_release.py`. This also prepares the release commit message. 2. Create a release commit: `git commit -eF RELEASE_COMMIT_MSG.md`. 3. Push commit: `git push`. 4. Create the tag: `git tag -s 6.7.1 -eF RELEASE_COMMIT_MSG.md`. 5. Push the tag: `git push origin 6.7.1`. 6. CI now builds a source and a wheels package which it pushes to PyPI. It also creates the corresponding GitHub release. ![](../_static/releasing.drawio.png) ### Using a Pull Request This is the typical way for collaborators which do not have direct push permissions for the `main` branch. The release contains the following steps: 1. Update the CHANGELOG.md and the _version.py via `python make_release.py`. This also prepares the release commit message. 2. Push the changes to a dedicated branch. 3. Open a pull request starting with `REL: `, followed by the new version number. 4. Wait for the approval of another eligible maintainer. 5. Merge the pull request with the name being the PR title and the body being the content of `RELEASE_COMMIT_MSG.md`. 6. Create the tag: `git tag -s 6.7.1 -eF RELEASE_COMMIT_MSG.md`. 7. Push the tag: `git push origin 6.7.1`. 8. CI now builds a source and a wheels package which it pushes to PyPI. It also creates the corresponding GitHub release. ### The Release Tag * Use the release version as the tag name. No need for a leading "v". * Use the changelog entry as the body. ## When are releases done? There is no need to wait for anything. If the CI is green (all tests succeeded), we can release. At the moment, there is no fixed release cycle - except that we usually release on Sunday. 
================================================ FILE: docs/dev/testing.md ================================================ # Testing pypdf uses [`pytest`](https://docs.pytest.org/en/7.1.x/) for testing. To run the tests, you need to install the CI (Continuous Integration) requirements by running `pip install -r requirements/ci.txt` or `pip install -r requirements/ci-3.11.txt` if running Python ≥ 3.11. ## Deselecting groups of tests pypdf makes use of the following pytest markers: * `slow`: Tests that require more than 5 seconds. * `samples`: Tests that require [the `sample-files` git submodule](https://github.com/py-pdf/sample-files) to be initialized. As of October 2022, this is about 25 MB. * `enable_socket`: Tests that download PDF documents. They are stored locally and thus only need to be downloaded once. As of October 2022, this is about 200 MB. * To successfully run the tests, please download most of the documents beforehand: `python -c "from tests import download_test_pdfs; download_test_pdfs()"` You can disable them by `pytest -m "not enable_socket"` or `pytest -m "not samples"`. You can even disable all of them: `pytest -m "not enable_socket" -m "not samples" -m "not slow"`. Please note that this reduces test coverage. The CI will always test all files. ## Docstrings in Unit tests The first line of a docstring in a unit test should be written in a way that you could prefix it with "This test ensures that ...", e.g. * Invalid XML in xmp_metadata is gracefully handled. * The identity is returning its input. * xmp_modify_date is extracted correctly. This way, plugins like [`pytest-testdox`](https://pypi.org/project/pytest-testdox/) can generate really nice output when the tests are running. This looks similar to the output of [mocha.js](https://mochajs.org/). If the test is a regression test, write > This test is a regression test for issue #1234 If the regression test is just one parameter of other tests, then add it as a comment for that parameter.
## Evaluate a PR in-progress version You may want to test a version from a PR which has not been released yet. The easiest way is to use pip and install a version from git: a) Go to the PR and identify the repository and branch. Example from below: repository: __pubpub-zz__ / branch: __iss2200__ : ![PR Header example](PR_Header_example.png) b) You can then install the version using pip from git: Example: ``` pip install git+https://github.com/pubpub-zz/pypdf.git@iss2200 ``` ================================================ FILE: docs/index.rst ================================================ .. pypdf documentation main file, created by sphinx-quickstart on Thu Apr 7 20:13:19 2022. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to pypdf ================= pypdf is a `free `_ and open source pure-python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files. It can also add custom data, viewing options, and passwords to PDF files. pypdf can retrieve text and metadata from PDFs as well. See `pdfly `_ for a CLI application that uses pypdf to interact with PDFs. You can contribute to `pypdf on GitHub `_. .. toctree:: :caption: User Guide :maxdepth: 1 user/installation user/robustness user/security user/suppress-warnings user/metadata user/extract-text user/post-processing-in-text-extraction user/extract-images user/handle-attachments user/encryption-decryption user/merging-pdfs user/cropping-and-transforming user/reading-pdf-annotations user/adding-pdf-annotations user/add-watermark user/add-javascript user/viewer-preferences user/forms user/handling-outlines user/streaming-data user/file-size user/pdf-version-support user/pdfa-compliance ..
toctree:: :caption: API Reference :maxdepth: 1 modules/PdfReader modules/PdfWriter modules/Destination modules/DocumentInformation modules/Field modules/Fit modules/PageObject modules/PageRange modules/PaperSize modules/RectangleObject modules/Transformation modules/XmpInformation modules/annotations modules/constants modules/errors modules/generic modules/PdfDocCommon .. toctree:: :caption: Developer Guide :maxdepth: 1 dev/intro dev/pdf-format dev/pypdf-parsing dev/pypdf-writing dev/cmaps dev/deprecations dev/documentation dev/testing dev/releasing .. toctree:: :caption: About pypdf :maxdepth: 1 meta/CHANGELOG meta/changelog-v1 meta/migration-1-to-2 meta/project-governance meta/taking-ownership meta/history meta/CONTRIBUTORS meta/scope-of-pypdf meta/comparisons meta/faq Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` ================================================ FILE: docs/make.bat ================================================ @ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=. set BUILDDIR=_build %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. 
echo.If you don't have Sphinx installed, grab it from echo.https://www.sphinx-doc.org/ exit /b 1 ) if "%1" == "" goto help %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd ================================================ FILE: docs/meta/changelog-v1.md ================================================ # Changelog of PyPDF2 1.X ## Version 1.28.4, 2022-05-29 Bug Fixes (BUG): - XmpInformation._converter_date was unusable (#921) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.28.3...1.28.4) ## Version 1.28.3, 2022-05-28 ### Deprecations (DEP) - PEP8 renaming (#905) ### Bug Fixes (BUG) - XmpInformation missing method _getText (#917) - Fix PendingDeprecationWarning on _merge_page (#904) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.28.2...1.28.3) ## Version 1.28.2, 2022-05-23 ### Bug Fixes (BUG) - PendingDeprecationWarning for getContents (#893) - PendingDeprecationWarning on using PdfMerger (#891) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.28.1...1.28.2) ## Version 1.28.1, 2022-05-22 ### Bug Fixes (BUG) - Incorrectly show deprecation warnings on internal usage (#887) ### Maintenance (MAINT) - Add stacklevel=2 to deprecation warnings (#889) - Remove duplicate warnings imports (#888) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.28.0...1.28.1) ## Version 1.28.0, 2022-05-22 This release adds a lot of deprecation warnings in preparation of the PyPDF2 2.0.0 release. The changes are mostly using snake_case function-, method-, and variable-names as well as using properties instead of getter-methods. 
Maintenance (MAINT): - Remove IronPython Fallback for zlib (#868) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.12...1.27.13) ### Deprecations (DEP) * Make the `PyPDF2.utils` module private * Rename of core classes: * PdfFileReader ➔ PdfReader * PdfFileWriter ➔ PdfWriter * PdfFileMerger ➔ PdfMerger * Use PEP8 conventions for function names and parameters * If a property and a getter-method are both present, use the property #### Details In many places: - getObject ➔ get_object - writeToStream ➔ write_to_stream - readFromStream ➔ read_from_stream PyPDF2.generic - readObject ➔ read_object - convertToInt ➔ convert_to_int - DocumentInformation.getText ➔ DocumentInformation._get_text : This method should typically not be used; please let me know if you need it. PdfReader class: - `reader.getPage(pageNumber)` ➔ `reader.pages[page_number]` - `reader.getNumPages()` / `reader.numPages` ➔ `len(reader.pages)` - getDocumentInfo ➔ metadata - flattenedPages attribute ➔ flattened_pages - resolvedObjects attribute ➔ resolved_objects - xrefIndex attribute ➔ xref_index - getNamedDestinations / namedDestinations attribute ➔ named_destinations - getPageLayout / pageLayout ➔ page_layout attribute - getPageMode / pageMode ➔ page_mode attribute - getIsEncrypted / isEncrypted ➔ is_encrypted attribute - getOutlines ➔ get_outlines - readObjectHeader ➔ read_object_header - cacheGetIndirectObject ➔ cache_get_indirect_object - cacheIndirectObject ➔ cache_indirect_object - getDestinationPageNumber ➔ get_destination_page_number - readNextEndLine ➔ read_next_end_line - _zeroXref ➔ _zero_xref - _authenticateUserPassword ➔ _authenticate_user_password - _pageId2Num attribute ➔ _page_id2num - _buildDestination ➔ _build_destination - _buildOutline ➔ _build_outline - _getPageNumberByIndirect(indirectRef) ➔ _get_page_number_by_indirect(indirect_ref) - _getObjectFromStream ➔ _get_object_from_stream - _decryptObject ➔ _decrypt_object - _flatten(..., indirectRef) ➔ _flatten(..., 
indirect_ref) - _buildField ➔ _build_field - _checkKids ➔ _check_kids - _writeField ➔ _write_field - _write_field(..., fieldAttributes) ➔ _write_field(..., field_attributes) - _read_xref_subsections(..., getEntry, ...) ➔ _read_xref_subsections(..., get_entry, ...) PdfWriter class: - `writer.getPage(pageNumber)` ➔ `writer.pages[page_number]` - `writer.getNumPages()` ➔ `len(writer.pages)` - addMetadata ➔ add_metadata - addPage ➔ add_page - addBlankPage ➔ add_blank_page - addAttachment(fname, fdata) ➔ add_attachment(filename, data) - insertPage ➔ insert_page - insertBlankPage ➔ insert_blank_page - appendPagesFromReader ➔ append_pages_from_reader - updatePageFormFieldValues ➔ update_page_form_field_values - cloneReaderDocumentRoot ➔ clone_reader_document_root - cloneDocumentFromReader ➔ clone_document_from_reader - getReference ➔ get_reference - getOutlineRoot ➔ get_outline_root - getNamedDestRoot ➔ get_named_dest_root - addBookmarkDestination ➔ add_bookmark_destination - addBookmarkDict ➔ add_bookmark_dict - addBookmark ➔ add_bookmark - addNamedDestinationObject ➔ add_named_destination_object - addNamedDestination ➔ add_named_destination - removeLinks ➔ remove_links - removeImages(ignoreByteStringObject) ➔ remove_images(ignore_byte_string_object) - removeText(ignoreByteStringObject) ➔ remove_text(ignore_byte_string_object) - addURI ➔ add_uri - addLink ➔ add_link - getPage(pageNumber) ➔ get_page(page_number) - getPageLayout / setPageLayout / pageLayout ➔ page_layout attribute - getPageMode / setPageMode / pageMode ➔ page_mode attribute - _addObject ➔ _add_object - _addPage ➔ _add_page - _sweepIndirectReferences ➔ _sweep_indirect_references PdfMerger class - `__init__` parameter: strict=True ➔ strict=False (the PdfFileMerger still has the old default) - addMetadata ➔ add_metadata - addNamedDestination ➔ add_named_destination - setPageLayout ➔ set_page_layout - setPageMode ➔ set_page_mode Page class: - artBox / bleedBox/ cropBox/ mediaBox / trimBox ➔ artbox / bleedbox/ 
cropbox/ mediabox / trimbox - getWidth, getHeight ➔ width / height - getLowerLeft_x / getUpperLeft_x ➔ left - getUpperRight_x / getLowerRight_x ➔ right - getLowerLeft_y / getLowerRight_y ➔ bottom - getUpperRight_y / getUpperLeft_y ➔ top - getLowerLeft / setLowerLeft ➔ lower_left property - upperRight ➔ upper_right - mergePage ➔ merge_page - rotateClockwise / rotateCounterClockwise ➔ rotate_clockwise - _mergeResources ➔ _merge_resources - _contentStreamRename ➔ _content_stream_rename - _pushPopGS ➔ _push_pop_gs - _addTransformationMatrix ➔ _add_transformation_matrix - _mergePage ➔ _merge_page XmpInformation class: - getElement(..., aboutUri, ...) ➔ get_element(..., about_uri, ...) - getNodesInNamespace(..., aboutUri, ...) ➔ get_nodes_in_namespace(..., aboutUri, ...) - _getText ➔ _get_text utils.py: - matrixMultiply ➔ matrix_multiply - RC4_encrypt is moved to the security module ## Version 1.27.12, 2022-05-02 ### Bug Fixes (BUG) - _rebuild_xref_table expects trailer to be a dict (#857) ### Documentation (DOC) - Security Policy [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.11...1.27.12) ## Version 1.27.11, 2022-05-02 ### Bug Fixes (BUG) - Incorrectly issued xref warning/exception (#855) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.10...1.27.11) ## Version 1.27.10, 2022-05-01 ### Robustness (ROB) - Handle missing destinations in reader (#840) - warn-only in readStringFromStream (#837) - Fix corruption in startxref or xref table (#788 and #830) ### Documentation (DOC) - Project Governance (#799) - History of PyPDF2 - PDF feature/version support (#816) - More details on text parsing issues (#815) ### Developer Experience (DEV) - Add benchmark command to Makefile - Ignore IronPython parts for code coverage (#826) ### Maintenance (MAINT) - Split pdf module (#836) - Separated CCITTFax param parsing/decoding (#841) - Update requirements files ### Testing (TST) - Use external repository for larger/more PDFs for testing (#820) - Swap 
incorrect test names (#838) - Add test for PdfFileReader and page properties (#835) - Add tests for PyPDF2.generic (#831) - Add tests for utils, form fields, PageRange (#827) - Add test for ASCII85Decode (#825) - Add test for FlateDecode (#823) - Add test for filters.ASCIIHexDecode (#822) ### Code Style (STY) - Apply pre-commit (black, isort) + use snake_case variables (#832) - Remove debug code (#828) - Documentation, Variable names (#839) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.9...1.27.10) ## Version 1.27.9, 2022-04-24 A change I would like to highlight is the performance improvement for large PDF files (#808) 🎉 ### New Features (ENH) - Add papersizes (#800) - Allow setting permission flags when encrypting (#803) - Allow setting form field flags (#802) ### Bug Fixes (BUG) - TypeError in xmp._converter_date (#813) - Improve spacing for text extraction (#806) - Fix PDFDocEncoding Character Set (#809) ### Robustness (ROB) - Use null ID when encrypted but no ID given (#812) - Handle recursion error (#804) ### Documentation (DOC) - CMaps (#811) - The PDF Format + commit prefixes (#810) - Add compression example (#792) ### Developer Experience (DEV) - Add Benchmark for Performance Testing (#781) ### Maintenance (MAINT) - Validate PDF magic byte in strict mode (#814) - Make PdfFileMerger.addBookmark() behave like PdfFileWriter's (#339) - Quadratic runtime while parsing reduced to linear (#808) ### Testing (TST) - Newlines in text extraction (#807) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.8...1.27.9) ## Version 1.27.8, 2022-04-21 ### Bug Fixes (BUG) - Use 1MB as offset for readNextEndLine (#321) - 'PdfFileWriter' object has no attribute 'stream' (#787) ### Robustness (ROB) - Invalid float object; use 0 as fallback (#782) ### Documentation (DOC) - Robustness (#785) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.7...1.27.8) ## Version 1.27.7, 2022-04-19 ### Bug Fixes (BUG) - Import exceptions from PyPDF2.errors
in PyPDF2.utils (#780) ### Code Style (STY) - Naming in 'make_changelog.py' ## Version 1.27.6, 2022-04-18 ### Deprecations (DEP) - Remove support for Python 2.6 and older (#776) ### New Features (ENH) - Extract document permissions (#320) ### Bug Fixes (BUG) - Clip by trimBox when merging pages, which would otherwise be ignored (#240) - Add overwriteWarnings parameter PdfFileMerger (#243) - IndexError for getPage() of decrypted file (#359) - Handle cases where decodeParms is an ArrayObject (#405) - Updated PDF fields don't show up when page is written (#412) - Set Linked Form Value (#414) - Fix zlib -5 error for corrupt files (#603) - Fix reading more than last1K for EOF (#642) - Accidental import ### Robustness (ROB) - Allow extra whitespace before "obj" in readObjectHeader (#567) ### Documentation (DOC) - Link to pdftoc in Sample_Code (#628) - Working with annotations (#764) - Structure history ### Developer Experience (DEV) - Add issue templates (#765) - Add tool to generate changelog ### Maintenance (MAINT) - Use grouped constants instead of string literals (#745) - Add error module (#768) - Use decorators for @staticmethod (#775) - Split long functions (#777) ### Testing (TST) - Run tests in CI once with -OO Flags (#770) - Filling out forms (#771) - Add tests for Writer (#772) - Error cases (#773) - Check Error messages (#769) - Regression test for issue #88 - Regression test for issue #327 ### Code Style (STY) - Make variable naming more consistent in tests [Full changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.5...1.27.6) ## Version 1.27.5, 2022-04-15 ### Security (SEC) - ContentStream_readInlineImage had potential infinite loop (#740) ### Bug fixes (BUG) - Fix merging encrypted files (#757) - CCITTFaxDecode decodeParms can be an ArrayObject (#756) ### Robustness improvements (ROBUST) - title sometimes None (#744) ### Documentation (DOC) - Adjust short description of the package ### Tests and Test setup (TST) - Rewrite JS tests from unittest to 
pytest (#746) - Increase Test coverage, mainly with filters (#756) - Add test for inline images (#758) ### Developer Experience Improvements (DEV) - Remove unused Travis-CI configuration (#747) - Show code coverage (#754, #755) - Add mutmut (#760) ### Miscellaneous - STY: Closing file handles, explicit exports, ... (#743) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.4...1.27.5) ## Version 1.27.4, 2022-04-12 ### Bug fixes (BUG) - Guard formatting of `__init__.__doc__` string (#738) ### Packaging (PKG) - Add more precise license field to setup (#733) ### Testing (TST) - Add test for issue #297 ### Miscellaneous - DOC: Miscallenious ➔ Miscellaneous (Typo) - TST: Fix CI triggering (master ➔ main) (#739) - STY: Fix various style issues (#742) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.3...1.27.4) ## Version 1.27.3, 2022-04-10 - PKG: Make Tests not a subpackage (#728) - BUG: Fix ASCII85Decode.decode assertion (#729) - BUG: Error in Chinese character encoding (#463) - BUG: Code duplication in Scripts/2-up.py - ROBUST: Guard 'obj.writeToStream' with 'if obj is not None' - ROBUST: Ignore a /Prev entry with value 0 in the trailer - MAINT: Remove Sample_Code (#726) - TST: Close file handle in test_writer (#722) - TST: Fix test_get_images (#730) - DEV: Make tox use pytest and add more Python versions (#721) - DOC: Many (#720, #723-725, #469) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.2...1.27.3) ## Version 1.27.2, 2022-04-09 - Add Scripts (including `pdfcat`), Resources, Tests, and Sample_Code back to PyPDF2. It was removed by accident in 1.27.0, but might get removed with 2.0.0 See [discussions/718](https://github.com/py-pdf/PyPDF2/discussions/718). 
[Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.1...1.27.2) ## Version 1.27.1, 2022-04-08 - Fixed project links on PyPI page after migration from mstamy2 to MartinThoma to the py-pdf organization on GitHub - Documentation is now at [pypdf2.readthedocs.io](https://pypdf2.readthedocs.io/en/latest/) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.27.0...1.27.1) ## Version 1.27.0, 2022-04-07 Features: - Add alpha channel support for png files in Script (#614) ### Bug fixes (BUG) - Fix formatWarning for filename without slash (#612) - Add whitespace between words for extractText() (#569, #334) - "invalid escape sequence" SyntaxError (#522) - Avoid error when printing warning in pythonw (#486) - Stream operations can be List or Dict (#665) ### Documentation (DOC) - Added Scripts/pdf-image-extractor.py - Documentation improvements (#550, #538, #324, #426, #394) ### Tests and Test setup (TST) - Add GitHub Action which automatically runs unit tests via pytest and static code analysis with Flake8 (#660) - Add several unit tests (#661, #663) - Add .coveragerc to create coverage reports ### Developer Experience Improvements (DEV) - Pre commit: Developers can now `pre-commit install` to avoid tiny issues like trailing whitespaces ### Miscellaneous - Add the LICENSE file to the distributed packages (#288) - Use setuptools instead of distutils (#599) - Improvements for the PyPI page (#644) - Python 3 changes (#504, #366) [Full Changelog](https://github.com/py-pdf/PyPDF2/compare/1.26.0...1.27.0) ## Version 1.26.0, 2016-05-18 - NOTE: Active maintenance on PyPDF2 is resuming after a hiatus - Fixed a bug where image resources were incorrectly overwritten when merging pages - Added dictionary for JavaScript actions to the root (louib) - Added unit tests for the JS functionality (louib) - Add more Python 3 compatibility when reading inline images (im2703 and VyacheslavHashov) - Return NullObject instead of raising error when failing to resolve object
(ctate) - Don't output warning for non-zeroed xref table when strict=False (BenRussert) - Remove extraneous zeroes from output formatting (speedplane) - Fix bug where reading an inline image would cut off prematurely in certain cases (speedplane) ## Version 1.25.1, 2015-07-20 - Fix bug when parsing inline images. Occurred when merging certain pages with inline images - Fixed type error when creating outlines by utilizing the isString() test ## Version 1.25, 2015-07-07 BUGFIXES: - Added Python 3 algorithm for ASCII85Decode. Fixes issue when reading reportlab-generated files with Py 3 (jerickbixly) - Recognize more escape sequence which would otherwise throw an exception (manuelzs, robertsoakes) - Fixed overflow error in generic.py. Occurred when reading a too-large int in Python 2 (by Raja Jamwal) - Allow access to files which were encrypted with an empty password. Previously threw a "File has not been decrypted" exception (Elena Williams) - Do not attempt to decode an empty data stream. Previously would cause an error in decode algorithms (vladir) - Fixed some type issues specific to Py 2 or Py 3 - Fix issue when stream data begins with whitespace (soloma83) - Recognize abbreviated filter names (AlmightyOatmeal and Matthew Weiss) - Copy decryption key from PdfFileReader to PdfFileMerger. Allows usage of PdfFileMerger with encrypted files (twolfson) - Fixed bug which occurred when a NameObject is present at end of a file stream. Threw a "Stream has ended unexpectedly" exception (speedplane) FEATURES: - Initial work on a test suite; to be expanded in future. Tests and Resources directory added, README updated (robertsoakes) - Added document cloning methods to PdfFileWriter: appendPagesFromReader, cloneReaderDocumentRoot, and cloneDocumentFromReader. See official documentation (robertsoakes) - Added method for writing to form fields: updatePageFormFieldValues. This will be enhanced in the future. See official documentation (robertsoakes) - New addAttachment method. 
See documentation. Support for adding and extracting embedded files to be enhanced in the future (moshekaplan) - Added methods to get page number of given PageObject or Destination: getPageNumber and getDestinationPageNumber. See documentation (mozbugbox) OTHER ENHANCEMENTS: - Enhanced type handling (Brent Amrhein) - Enhanced exception handling in NameObject (sbywater) - Enhanced extractText method output (peircej) - Better exception handling - Enhanced regex usage in NameObject class (speedplane) ## Version 1.24, 2014-12-31 - Bugfixes for reading files in Python 3 (by Anthony Tuininga and pqqp) - Appropriate errors are now raised instead of infinite loops (by naure and Cyrus Vafadari) - Bugfix for parsing number tokens with leading spaces (by Maxim Kamenkov) - Don't crash on bad /Outlines reference (by eshellman) - Conform tabs/spaces and blank lines to PEP 8 standards - Utilize the readUntilRegex method when reading Number Objects (by Brendan Jurd) - More bugfixes for Python 3 and clearer exception handling - Fixed encoding issue in merger (with eshellman) - Created separate folder for scripts ## Version 1.23, 2014-08-11 - Documentation now available at pythonhosted.org - Bugfix in pagerange.py for when `__init__.__doc__` has no value (by Vladir Cruz) - Fix typos in OutlinesObject().add() (by shilluc) - Re-added a missing return statement in a utils.py method - Corrected viewing mode names (by Jason Scheirer) - New PdfFileWriter method: addJS() (by vfigueiro) - New bookmark features: color, boldness, italics, and page fit (by Joshua Arnott) - New PdfFileReader method: getFields(). Used to extract field information from PDFs with interactive forms. 
See documentation for details - Converted README file to markdown format (by Stephen Bussard) - Several improvements to overall performance and efficiency (by mozbugbox) - Fixed a bug where geospatial information was not scaling along with its page - Fixed a type issue and a Python 3 issue in the decryption algorithms (with Francisco Vieira and koba-ninkigumi) - Fixed a bug causing an infinite loop in the ASCII 85 decoding algorithm (by madmaardigan) - Annotations (links, comment windows, etc.) are now preserved when pages are merged together - Used the Destination class in addLink() and addBookmark() so that the page fit option could be properly customized ## Version 1.22, 2014-05-29 - Added .DS_Store to .gitignore (for Mac users) (by Steve Witham) - Removed `__init__()` implementation in NameObject (by Steve Witham) - Fixed bug (inf. loop) when merging pages in Python 3 (by commx) - Corrected error when calculating height in scaleTo() - Removed unnecessary code from DictionaryObject (by Georges Dubus) - Fixed bug where an exception was thrown upon reading a NULL string (by speedplane) - Allow string literals (non-unicode strings in Python 2) to be passed to PdfFileReader - Allow ConvertFunctionsToVirtualList to be indexed with slices and longs (in Python 2) (by Matt Gilson) - Major improvements and bugfixes to addLink() method (see documentation in source code) (by Henry Keiter) - General code clean-up and improvements (with Steve Witham and Henry Keiter) - Fixed bug that caused crash when comments are present at end of dictionary ## Version 1.21, 2014-04-21 - Fix for when /Type isn't present in the Pages dictionary (by Rob1080) - More tolerance for extra whitespace in Indirect Objects - Improved Exception handling - Fixed error in getHeight() method (by Simon Kaempflein) - implement use of utils.string_type to resolve Py2-3 compatibility issues - Prevent exception for multiple definitions in a dictionary (with carlosfunk) (only when strict = False) - Fixed 
errors when parsing a slice using pdfcat on command line (by Steve Witham) - Tolerance for EOF markers within 1024 bytes of the actual end of the file (with David Wolever) - Added overwriteWarnings parameter to PdfFileReader constructor, if False PyPDF2 will NOT overwrite methods from Python's warnings.py module with a custom implementation. - Fix NumberObject and NameObject constructors for compatibility with PyPy (Rüdiger Jungbeck, Xavier Dupré, shezadkhan137, Steven Witham) - Utilize utils.Str in pdf.py and pagerange.py to resolve type issues (by egbutter) - Improvements in implementing StringIO for Python 2 and BytesIO for Python 3 (by Xavier Dupré) - Added /x00 to Whitespaces, defined utils.WHITESPACES to clarify code (by Maxim Kamenkov) - Bugfix for merging 3 or more resources with the same name (by lucky-user) - Improvements to Xref parsing algorithm (by speedplane) ## Version 1.20, 2014-01-27 - Official Python 3+ support (with contributions from TWAC and cgammans) Support for Python versions 2.6 and 2.7 will be maintained - Command line concatenation (see pdfcat in sample code) (by Steve Witham) - New FAQ; link included in README - Allow more (although unnecessary) escape sequences - Prevent exception when reading a null object in decoding parameters - Corrected error in reading destination types (added a slash since they are name objects) - Corrected TypeError in scaleTo() method - addBookmark() method in PdfFileMerger now returns bookmark (so nested bookmarks can be created) - Additions to Sample Code and Sample PDFs - changes to allow 2up script to work (see sample code) (by Dylan McNamee) - changes to metadata encoding (by Chris Hiestand) - New methods for links: addLink() (by Enrico Lambertini) and removeLinks() - Bugfix to handle nested bookmarks correctly (by Jamie Lentin) - New methods removeImages() and removeText() available for PdfFileWriter (by Tien Haï) - Exception handling for illegal characters in Name Objects ## Version 1.19, 2013-10-08 
BUGFIXES: - Removed pop in sweepIndirectReferences to prevent infinite loop (provided by ian-su-sirca) - Fixed bug caused by whitespace when parsing PDFs generated by AutoCad - Fixed a bug caused by reading a 'null' ASCII value in a dictionary object (primarily in PDFs generated by AutoCad). FEATURES: - Added new folders for PyPDF2 sample code and example PDFs; see README for each folder - Added a method for debugging purposes to show current location while parsing - Ability to create custom metadata (by jamma313) - Ability to access and customize document layout and view mode (by Joshua Arnott) OTHER: - Added and corrected some documentation - Added some more warnings and exception messages - Removed old test/debugging code UPCOMING: - More bugfixes (We have received many problematic PDFs via email, we will work with them) - Documentation - It's time for PyPDF2 to get its own documentation since it has grown much since the original pyPdf - A FAQ to answer common questions ## Version 1.18, 2013-08-19 - Fixed a bug where older versions of objects were incorrectly added to the cache, resulting in outdated or missing pages, images, and other objects (from speedplane) - Fixed a bug in parsing the xref table where new xref values were overwritten; also cleaned up code (from speedplane) - New method mergeRotatedAroundPointPage which merges a page while rotating it around a point (from speedplane) - Updated Destination syntax to respect PDF 1.6 specifications (from jamma313) - Prevented infinite loop when a PdfFileReader object was instantiated with an empty file (from Jerome Nexedi) Other Changes: - Downloads now available via PyPI - Installation through pip library is fixed ## Version 1.17, 2013-07-25 - Removed one (from pdf.py) of the two Destination classes. Both classes had the same name, but were slightly different in content, causing some errors. 
(from Janne Vanhala) - Corrected and Expanded README file to demonstrate PdfFileMerger - Added filter for LZW encoded streams (from Michal Horejsek) - PyPDF2 issue tracker enabled on GitHub to allow community discussion and collaboration ## Versions -1.16, -2013-06-30 - Note: This ChangeLog has not been kept up-to-date for a while. Hopefully we can keep better track of it from now on. Some of the changes listed here come from previous versions 1.14 and 1.15; they were only vaguely defined. With the new _version.py file we should have more structured and better documented versioning from now on. - Defined `PyPDF2.__version__` - Fixed encrypt() method (from Martijn The) - Improved error handling on PDFs with truncated streams (from cecilkorik) - Python 3 support (from kushal-kumaran) - Fixed example code in README (from Jeremy Bethmont) - Fixed a bug caused by DecimalError Exception (from Adam Morris) - Many other bug fixes and features by: jeansch Anton Vlasenko Joseph Walton Jan Oliver Oelerich Fabian Henze And any others I missed. Thanks for contributing! ## Version 1.13, 2010-12-04 - Fixed a typo in code for reading a "\b" escape character in strings. - Improved `__repr__` in FloatObject. - Fixed a bug in reading octal escape sequences in strings. - Added getWidth and getHeight methods to the RectangleObject class. - Fixed compatibility warnings with Python 2.4 and 2.5. - Added addBlankPage and insertBlankPage methods on PdfFileWriter class. - Fixed a bug with circular references in page's object trees (typically annotations) that prevented correctly writing out a copy of those pages. - New merge page functions allow application of a transformation matrix. - To all patch contributors: I did a poor job of keeping this ChangeLog up-to-date for this release, so I am missing attributions here for any changes you submitted. Sorry! I'll do better in the future. ## Version 1.12, 2008-09-02 - Added support for XMP metadata.
- Fix reading files with xref streams with multiple /Index values. - Fix extracting content streams that use graphics operators longer than 2 characters. Affects merging PDF files. ## Version 1.11, 2008-05-09 - Patch from Hartmut Goebel to permit RectangleObjects to accept NumberObject or FloatObject values. - PDF compatibility fixes. - Fix to read object xref stream in correct order. - Fix for comments inside content streams. ## Version 1.10, 2007-10-04 - Text strings from PDF files are returned as Unicode string objects when pyPdf determines that they can be decoded (as UTF-16 strings, or as PDFDocEncoding strings). Unicode objects are also written out when necessary. This means that string objects in pyPdf can be either generic.ByteStringObject instances, or generic.TextStringObject instances. - The extractText method now returns a unicode string object. - All document information properties now return unicode string objects. In the event that a document provides docinfo properties that are not decoded by pyPdf, the raw byte strings can be accessed with an "_raw" property (ie. title_raw rather than title) - generic.DictionaryObject instances have been enhanced to be easier to use. Values coming out of dictionary objects will automatically be de-referenced (.getObject will be called on them), unless accessed by the new "raw_get" method. DictionaryObjects can now only contain PdfObject instances (as keys and values), making it easier to debug where non-PdfObject values (which cannot be written out) are entering dictionaries. - Support for reading named destinations and outlines in PDF files. Original patch by Ashish Kulkarni. - Stream compatibility reading enhancements for malformed PDF files. - Cross reference table reading enhancements for malformed PDF files. - Encryption documentation. - Replace some "assert" statements with error raising. - Minor optimizations to FlateDecode algorithm increase speed when using PNG predictors. 
## Version 1.9, 2006-12-15 - Fix several serious bugs introduced in version 1.8, caused by a failure to run through our PDF test suite before releasing that version. - Fix bug in NullObject reading and writing. ## Version 1.8, 2006-12-14 - Add support for decryption with the standard PDF security handler. This allows for decrypting PDF files given the proper user or owner password. - Add support for encryption with the standard PDF security handler. - Add new pythondoc documentation. - Fix bug in ASCII85 decode that occurs when whitespace exists inside the two terminating characters of the stream. ## Version 1.7, 2006-12-10 - Fix a bug when using a single page object in two PdfFileWriter objects. - Adjust PyPDF to be tolerant of whitespace characters that don't belong during a stream object. - Add documentInfo property to PdfFileReader. - Add numPages property to PdfFileReader. - Add pages property to PdfFileReader. - Add extractText function to PdfFileReader. ## Version 1.6, 2006-06-06 - Add basic support for comments in PDF files. This allows us to read some ReportLab PDFs that could not be read before. - Add "auto-repair" for finding xref table at slightly bad locations. - New StreamObject backend, cleaner and more powerful. Allows the use of stream filters more easily, including compressed streams. - Add a graphics state push/pop around page merges. Improves quality of page merges when one page's content stream leaves the graphics in an abnormal state. - Add PageObject.compressContentStreams function, which filters all content streams and compresses them. This will reduce the size of PDF pages, especially after they could have been decompressed in a mergePage operation. - Support inline images in PDF content streams. - Add support for using .NET framework compression when zlib is not available. This does not make pyPdf compatible with IronPython, but it is a first step. 
- Add support for reading the document information dictionary, and extracting title, author, subject, producer and creator tags. - Add patch to support NullObject and multiple xref streams, from Bradley Lawrence. ## Version 1.5, 2006-01-28 - Fix a bug where merging pages did not work in "no-rename" cases when the second page has an array of content streams. - Remove some debugging output that should not have been present. ## Version 1.4, 2006-01-27 - Add capability to merge pages from multiple PDF files into a single page using the PageObject.mergePage function. See example code (README or web site) for more information. - Add ability to modify a page's MediaBox, CropBox, BleedBox, TrimBox, and ArtBox properties through PageObject. See example code (README or web site) for more information. - Refactor pdf.py into multiple files: generic.py (contains objects like NameObject, DictionaryObject), filters.py (contains filter code), utils.py (various). This does not affect importing PdfFileReader or PdfFileWriter. - Add new decoding functions for standard PDF filters ASCIIHexDecode and ASCII85Decode. - Change url and download_url to refer to new pybrary.net web site. ## Version 1.3, 2006-01-23 - Fix new bug introduced in 1.2 where PDF files with \r line endings did not work properly anymore. A new test suite developed with various PDF files should prevent regression bugs from now on. - Fix a bug where inheriting attributes from page nodes did not work. ## Version 1.2, 2006-01-23 - Improved support for files with CRLF-based line endings, fixing a common reported problem stating "assertion error: assert line == "%%EOF"". - Software author/maintainer is now officially a proud married person, which is sure to result in better software... somehow. ## Version 1.1, 2006-01-18 - Add capability to rotate pages. - Improved PDF reading support to properly manage inherited attributes from /Type=/Pages nodes. 
This means that page groups that are rotated or have different media boxes or whatever will now work properly. - Added PDF 1.5 support. Namely cross-reference streams and object streams. This release can mangle Adobe's PDFReference16.pdf successfully. ## Version 1.0, 2006-01-17 - First distutils-capable true public release. Supports a wide variety of PDF files that I found sitting around on my system. - Does not support some PDF 1.5 features, such as object streams, cross-reference streams. ================================================ FILE: docs/meta/comparisons.md ================================================ # pypdf vs X pypdf is a [free] and open source pure-python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files. It can also add custom data, viewing options, and passwords to PDF files. pypdf can retrieve text and metadata from PDFs as well. ## PyMuPDF and PikePDF [PyMuPDF] is a Python binding to [MuPDF] and [PikePDF] is the Python binding to [QPDF]. While both are excellent libraries for various use-cases, using them is not always possible even when they support the use-case. Both of them are powered by C libraries which make installation harder and might cause security concerns. For MuPDF, you might also need to buy a commercial license. A core feature of pypdf is that it's pure Python. That means there is no C dependency. It has been used for over 10 years and, for this reason, has a lot of support via StackOverflow and examples on the internet. ## pypdf PyPDF2 was merged back into `pypdf`. The development continues at `pypdf`. ## PyPDF3 and PyPDF4 Developing and maintaining open source software is extremely time-intensive and in the case of pypdf not paid at all. Having continuous support is hard. pypdf was initially released in 2012 on PyPI and received releases until 2016. From 2016 to 2022, there was no update - but people were still using it.
As pypdf is free software, there were attempts to fork it and continue the development. PyPDF3 was first released in 2018 and still receives updates. PyPDF4 has only one release from 2018. Martin Thoma has worked on bringing the community back to one path of development. He deprecated PyPDF2 in favor of pypdf already, and pypdf has more features and a cleaner interface than PyPDF2 now. See [history of pypdf](history.md). [free]: https://en.wikipedia.org/wiki/Free_software [PyMuPDF]: https://pypi.org/project/PyMuPDF/ [MuPDF]: https://mupdf.com/ [PikePDF]: https://pypi.org/project/pikepdf/ [QPDF]: https://github.com/qpdf/qpdf ## pdfminer.six and pdfplumber [`pdfminer.six`](https://pypi.org/project/pdfminer.six/) is capable of extracting the [font size](https://stackoverflow.com/a/69962459/562769) / font weight (bold-ness). It has no capabilities for writing PDF files. [`pdfplumber`](https://pypi.org/project/pdfplumber/) is a library focused on extracting data from PDF documents. Since `pdfplumber` is built on top of `pdfminer.six`, there are **no capabilities of exporting or modifying a PDF file** (see [#440 (discussions)](https://github.com/jsvine/pdfplumber/discussions/440#discussioncomment-803880)). However, `pdfplumber` is capable of converting a PDF file into an image, [draw lines and rectangles on the image](https://github.com/jsvine/pdfplumber#drawing-methods), and save it as an image file. Please note that the image conversion is done via ImageMagick (see [`pdfplumber`'s documentation](https://github.com/jsvine/pdfplumber#visual-debugging)). The `pdfplumber` community is active in answering questions and the library is maintained as of May 2023. ## pdfrw / pdfrw2 I don't have experience with any of those libraries. Please add a comparison if you know pypdf and [`pdfrw`](https://pypi.org/project/pdfrw/)! Please be aware that there is also [`pdfminer`](https://pypi.org/project/pdfminer/) which is not maintained. 
Then there is [`pdfrw2`](https://pypi.org/project/pdfrw2/) which doesn't have a large community behind it. ## Document Generation There are (Python) [tools to generate PDF documents](https://github.com/py-pdf/awesome-pdf#generators). pypdf is not one of them. ## CLI applications pypdf is a pure Python PDF library. If you're looking for an application which you can use from the terminal, give [`pdfly`](https://pdfly.readthedocs.io/en/latest/) a shot. ================================================ FILE: docs/meta/faq.md ================================================ # Frequently Asked Questions ## How is pypdf related to PyPDF2? PyPDF2 was a fork from the original pyPdf. After several years, the fork was merged back into `pypdf` (now all lowercase). ## Which Python versions are supported? pypdf 3.0+ supports Python 3.6 and later. PyPDF2 2.0+ supports Python 3.6 and later. PyPDF2 1.27.10 supported Python 2.7 to 3.10. [Matthew]: https://github.com/mstamy2 [source]: https://github.com/py-pdf/PyPDF2/commit/24b270d876518d15773224b5d0d6c2206db29f64#commitcomment-5038317 [this sort of thing]: https://github.com/py-pdf/PyPDF2/issues/24 [GitHub issue]: https://github.com/py-pdf/PyPDF2/issues ## Who uses pypdf? pyPdf is vendored [into](https://github.com/Buyanbat/XacCRM/tree/ee78e8df967182f661b6494a86444501e7d89c8f/report/pyPdf) [several](https://github.com/MyBook/calibre/tree/ca1efe3c21f6553e096dab745b3cdeb36244a5a9/src/pyPdf) [projects](https://github.com/Giacomo-De-Florio-Dev/Make_Your_PDF_Safe/tree/ec439f92243d12d54ae024668792470c6b40ee96/MakeYourPDFsafe_V1.3/PyPDF2). That means the code of pyPdf was copied into that project. 
Projects that depend on pypdf: * [Camelot](https://github.com/camelot-dev/camelot): A Python library to extract tabular data from PDFs * [edi](https://github.com/OCA/edi): Electronic Data Interchange modules * [amazon-textract-textractor](https://github.com/aws-samples/amazon-textract-textractor/blob/42444b08c672607eadbdcd64f3c5adb2d85383de/helper/setup.py): Analyze documents with Amazon Textract and generate output in multiple formats. * [maigret](https://github.com/soxoj/maigret): Collect a dossier on a person by username from thousands of sites * [deda](https://github.com/dfd-tud/deda): tracking Dots Extraction, Decoding and Anonymisation toolkit * [opencanary](https://github.com/thinkst/opencanary) * Document Conversions * [rst2pdf](https://github.com/rst2pdf/rst2pdf) * [xhtml2pdf](https://github.com/xhtml2pdf/xhtml2pdf) * [doc2text](https://github.com/jlsutherland/doc2text) * [pdfalyzer](https://pypi.org/project/pdfalyzer/): A PDF analysis tool for visualizing the inner tree-like data structure of a PDF in spectacularly large and colorful diagrams as well as scanning the binary streams embedded in the PDF for hidden potentially malicious content. ## How do I cite pypdf? In BibTeX format: ``` @misc{pypdf, title = {The {pypdf} library}, author = {Mathieu Fenniak and Matthew Stamy and pubpub-zz and Martin Thoma and Matthew Peveler and exiledkingcc and {pypdf Contributors}}, year = {2024}, url = {https://pypi.org/project/pypdf/}, note = {See https://pypdf.readthedocs.io/en/latest/meta/CONTRIBUTORS.html for all contributors} } ``` ## Which License does pypdf use? `pypdf` uses the [BSD-3-Clause license](https://en.wikipedia.org/wiki/BSD_licenses#3-clause), see the LICENSE file. ================================================ FILE: docs/meta/history.md ================================================ # History of pypdf ## The Origins: pyPdf (2005-2010) In 2005, [Mathieu Fenniak] launched pyPdf "as a PDF toolkit..."
focused on - document manipulation: by-page splitting, concatenation, and merging; - document introspection; - page cropping; and - document encryption and decryption. The last release on PyPI was [pyPdf 1.13](https://pypi.org/project/pyPdf/#history) in 2010. ## PyPDF2 is born (2011-2016) At the end of 2011, after consultation with Mathieu and others, Phaseit sponsored PyPDF2 as a fork of pyPdf on GitHub. The initial impetus was to handle a wider range of input PDF instances; Phaseit\'s commercial work often encounters PDF instances \"in the wild\" that it needs to manage (mostly concatenate and paginate), but that deviate so much from PDF standards that pyPdf can't read them. PyPDF2 reads a considerably wider range of real-world PDF instances. Neither pyPdf nor PyPDF2 aims to be universal, that is, to provide all possible PDF-related functionality. Note that the similar-appearing [pyfpdf] of Mariano Reingart is most comparable to [ReportLab], in that both ReportLab and pyfpdf emphasize document generation. Interestingly enough, pyfpdf builds in a basic HTML→PDF converter while PyPDF2 has no knowledge of HTML. So what is PyPDF2 truly about? Think about the popular [pdftk] for a moment. PyPDF2 does what pdftk does, and it does so within your current Python process, and it handles a wider range of variant PDF formats \[explain\]. PyPDF2 has its own FAQ to answer other questions that have arisen. The Reddit [/r/python crowd chatted] obliquely and briefly about PyPDF2 in March 2012. The core developer / maintainer was Matthew Stamy. ## PyPDF3 and PyPDF4 (2018-2022) Two approaches were made to get PyPDF2 active again: PyPDF3 and PyPDF4. PyPDF3 had its first release in 2018 and its last one in February 2022. It never got the user base from PyPDF2. PyPDF4 only had one release in 2018. ## PyPDF2: Reborn (2022) Martin Thoma took over maintenance of PyPDF2 in April 2022. It had over 100 open PRs and 321 open issues. 
[pubpub-zz](https://github.com/pubpub-zz) was extremely active, especially for text extraction. [Matthew Peveler](https://github.com/MasterOdin) helped a lot with reviews and general project decisions. [exiledkingcc](https://github.com/exiledkingcc) added support for modern encryption schemes. ## pypdf: Back to the Roots (2023-2024) In order to simplify things for beginners, PyPDF2 was merged back into pypdf. Now all lowercase, without a number. We hope that the folks who develop PyPDF3 and PyPDF4 also join us. Compared to `PyPDF2 >= 3.0.0`, `pypdf >= 3.1.0` now offers: * AES reading and writing support. Not only with PyCryptoDome, but also with cryptography. * Text extraction improvements, e.g., for math content. [pypdf is now comparable with Tika, pypdfium2, and PyMuPDF](https://github.com/py-pdf/benchmarks) * Annotation support * Performance Improvements and Bugfixes * Page Label support stefan6419846 made his [first PR for pypdf](https://github.com/py-pdf/pypdf/pull/2022) in July 2023 and joined the project. [Mathieu Fenniak]: https://mathieu.fenniak.net/ [pyfpdf]: https://github.com/reingart/pyfpdf [ReportLab]: https://www.reportlab.com/software/opensource/rl-toolkit/ [pdftk]: https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/ [/r/python crowd chatted]: https://www.reddit.com/r/Python/comments/qsvfm/pypdf2_updates_pypdf_pypdf2_is_an_opensource/ ================================================ FILE: docs/meta/migration-1-to-2.md ================================================ # Migration Guide: 1.x to 2.x `PyPDF2<2.0.0` ([docs](https://pypdf2.readthedocs.io/en/1.27.12/meta/history.html)) is very different from `PyPDF2>=2.0.0` ([docs](../meta/history.md)). Luckily, most changes are simple naming adjustments. This guide helps you to make the step from `PyPDF2 1.x` (or even the original pyPdf) to `PyPDF2>=2.0.0`. 
You can execute your code with the updated version and show deprecation warnings by running `python -W all your_code.py`. ## Imports and Modules * `PyPDF2.utils` no longer exists * `PyPDF2.pdf` no longer exists. You can import from `PyPDF2` directly or from `PyPDF2.generic` ## Naming Adjustments ### Classes The base classes were renamed as they also allow operating with BytesIO streams instead of files. Also, the `strict` parameter changed the default value from `strict=True` to `strict=False`. * `PdfFileReader` ➔ `PdfReader` * `PdfFileWriter` ➔ `PdfWriter` * `PdfFileMerger` ➔ `PdfMerger` PdfFileReader and PdfFileMerger no longer have the `overwriteWarnings` parameter. The new behavior is `overwriteWarnings=False`. ### Function, Method, and Property Names In `PyPDF2.xmp.XmpInformation`: * `rdfRoot` ➔ `rdf_root` * `xmp_createDate` ➔ `xmp_create_date` * `xmp_creatorTool` ➔ `xmp_creator_tool` * `xmp_metadataDate` ➔ `xmp_metadata_date` * `xmp_modifyDate` ➔ `xmp_modify_date` * `xmpMetadata` ➔ `xmp_metadata` * `xmpmm_documentId` ➔ `xmpmm_document_id` * `xmpmm_instanceId` ➔ `xmpmm_instance_id` In `PyPDF2.generic`: * `readObject` ➔ `read_object` * `convertToInt` ➔ `convert_to_int` * `DocumentInformation.getText` ➔ `DocumentInformation._get_text` : This method should typically not be used; please let me know if you need it. 
* `readHexStringFromStream` ➔ `read_hex_string_from_stream` * `initializeFromDictionary` ➔ `initialize_from_dictionary` * `createStringObject` ➔ `create_string_object` * `TreeObject.hasChildren` ➔ `TreeObject.has_children` * `TreeObject.emptyTree` ➔ `TreeObject.empty_tree` In many places: - `getObject` ➔ `get_object` - `writeToStream` ➔ `write_to_stream` - `readFromStream` ➔ `read_from_stream` PdfReader class: - `reader.getPage(pageNumber)` ➔ `reader.pages[page_number]` - `reader.getNumPages()` / `reader.numPages` ➔ `len(reader.pages)` - `getDocumentInfo` ➔ `metadata` - `flattenedPages` attribute ➔ `flattened_pages` - `resolvedObjects` attribute ➔ `resolved_objects` - `xrefIndex` attribute ➔ `xref_index` - `getNamedDestinations` / `namedDestinations` attribute ➔ `named_destinations` - `getPageLayout` / `pageLayout` ➔ `page_layout` attribute - `getPageMode` / `pageMode` ➔ `page_mode` attribute - `getIsEncrypted` / `isEncrypted` ➔ `is_encrypted` attribute - `getOutlines` ➔ `get_outlines` - `readObjectHeader` ➔ `read_object_header` - `cacheGetIndirectObject` ➔ `cache_get_indirect_object` - `cacheIndirectObject` ➔ `cache_indirect_object` - `getDestinationPageNumber` ➔ `get_destination_page_number` - `readNextEndLine` ➔ `read_next_end_line` - `_zeroXref` ➔ `_zero_xref` - `_authenticateUserPassword` ➔ `_authenticate_user_password` - `_pageId2Num` attribute ➔ `_page_id2num` - `_buildDestination` ➔ `_build_destination` - `_buildOutline` ➔ `_build_outline` - `_getPageNumberByIndirect(indirectRef)` ➔ `_get_page_number_by_indirect(indirect_ref)` - `_getObjectFromStream` ➔ `_get_object_from_stream` - `_decryptObject` ➔ `_decrypt_object` - `_flatten(..., indirectRef)` ➔ `_flatten(..., indirect_ref)` - `_buildField` ➔ `_build_field` - `_checkKids` ➔ `_check_kids` - `_writeField` ➔ `_write_field` - `_write_field(..., fieldAttributes)` ➔ `_write_field(..., field_attributes)` - `_read_xref_subsections(..., getEntry, ...)` ➔ `_read_xref_subsections(..., get_entry, ...)` PdfWriter 
class: - `writer.getPage(pageNumber)` ➔ `writer.pages[page_number]` - `writer.getNumPages()` ➔ `len(writer.pages)` - `addMetadata` ➔ `add_metadata` - `addPage` ➔ `add_page` - `addBlankPage` ➔ `add_blank_page` - `addAttachment(fname, fdata)` ➔ `add_attachment(filename, data)` - `insertPage` ➔ `insert_page` - `insertBlankPage` ➔ `insert_blank_page` - `appendPagesFromReader` ➔ `append_pages_from_reader` - `updatePageFormFieldValues` ➔ `update_page_form_field_values` - `cloneReaderDocumentRoot` ➔ `clone_reader_document_root` - `cloneDocumentFromReader` ➔ `clone_document_from_reader` - `getReference` ➔ `get_reference` - `getOutlineRoot` ➔ `get_outline_root` - `getNamedDestRoot` ➔ `get_named_dest_root` - `addBookmarkDestination` ➔ `add_bookmark_destination` - `addBookmarkDict` ➔ `add_bookmark_dict` - `addBookmark` ➔ `add_bookmark` - `addNamedDestinationObject` ➔ `add_named_destination_object` - `addNamedDestination` ➔ `add_named_destination` - `removeLinks` ➔ `remove_links` - `removeImages(ignoreByteStringObject)` ➔ `remove_images(ignore_byte_string_object)` - `removeText(ignoreByteStringObject)` ➔ `remove_text(ignore_byte_string_object)` - `addURI` ➔ `add_uri` - `addLink` ➔ `add_link` - `getPage(pageNumber)` ➔ `get_page(page_number)` - `getPageLayout / setPageLayout / pageLayout` ➔ `page_layout attribute` - `getPageMode / setPageMode / pageMode` ➔ `page_mode attribute` - `_addObject` ➔ `_add_object` - `_addPage` ➔ `_add_page` - `_sweepIndirectReferences` ➔ `_sweep_indirect_references` PdfMerger class - `__init__` parameter: `strict=True` ➔ `strict=False` (the `PdfFileMerger` still has the old default) - `addMetadata` ➔ `add_metadata` - `addNamedDestination` ➔ `add_named_destination` - `setPageLayout` ➔ `set_page_layout` - `setPageMode` ➔ `set_page_mode` Page class: - `artBox` / `bleedBox` / `cropBox` / `mediaBox` / `trimBox` ➔ `artbox` / `bleedbox` / `cropbox` / `mediabox` / `trimbox` - `getWidth`, `getHeight ` ➔ `width` / `height` - `getLowerLeft_x` / `getUpperLeft_x` 
➔ `left` - `getUpperRight_x` / `getLowerRight_x` ➔ `right` - `getLowerLeft_y` / `getLowerRight_y` ➔ `bottom` - `getUpperRight_y` / `getUpperLeft_y` ➔ `top` - `getLowerLeft` / `setLowerLeft` ➔ `lower_left` property - `upperRight` ➔ `upper_right` - `mergePage` ➔ `merge_page` - `rotateClockwise` / `rotateCounterClockwise` ➔ `rotate_clockwise` - `_mergeResources` ➔ `_merge_resources` - `_contentStreamRename` ➔ `_content_stream_rename` - `_pushPopGS` ➔ `_push_pop_gs` - `_addTransformationMatrix` ➔ `_add_transformation_matrix` - `_mergePage` ➔ `_merge_page` XmpInformation class: - `getElement(..., aboutUri, ...)` ➔ `get_element(..., about_uri, ...)` - `getNodesInNamespace(..., aboutUri, ...)` ➔ `get_nodes_in_namespace(..., about_uri, ...)` - `_getText` ➔ `_get_text` utils.py: - `matrixMultiply` ➔ `matrix_multiply` - `RC4_encrypt` is moved to the security module ### Parameter Names * `PdfWriter.get_page`: `pageNumber` ➔ `page_number` * `PyPDF2.filters` (all classes): `decodeParms` ➔ `decode_parms` * `PyPDF2.filters` (all classes): `decodeStreamData` ➔ `decode_stream_data` * `pagenum` ➔ `page_number` * `PdfMerger.merge`: `position` ➔ `page_number` * `PdfWriter.add_outline_item_destination`: `dest` ➔ `page_destination` * `PdfWriter.add_named_destination_object`: `dest` ➔ `page_destination` * `PdfWriter.encrypt`: `user_pwd` ➔ `user_password` * `PdfWriter.encrypt`: `owner_pwd` ➔ `owner_password` ### Deprecations A few classes / functions were deprecated without replacement: * `PyPDF2.utils.ConvertFunctionsToVirtualList` * `PyPDF2.utils.formatWarning` * `PyPDF2.isInt(obj)`: Use `isinstance(obj, int)` instead * `PyPDF2.u_(s)`: Use `s` directly * `PyPDF2.chr_(c)`: Use `chr(c)` instead * `PyPDF2.barray(b)`: Use `bytearray(b)` instead * `PyPDF2.isBytes(b)`: Use `isinstance(b, bytes)` instead * `PyPDF2.xrange_fn`: Use `range` instead * `PyPDF2.string_type`: Use `str` instead * `PyPDF2.isString(s)`: Use `isinstance(s, str)` instead * `PyPDF2._basestring`: Use `str` instead * 
`b_(...)` was removed. You should typically be able to use the bytes object directly, otherwise you can [copy this](https://github.com/py-pdf/PyPDF2/pull/986#issuecomment-1230698069) ================================================ FILE: docs/meta/project-governance.md ================================================ # Project Governance This document describes how the pypdf project is managed. It describes the different actors, their roles, and the responsibilities they have. ## Terminology * The **project** is pypdf - a free and open-source pure-python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files. It includes the [code, issues, and discussions on GitHub](https://github.com/py-pdf/pypdf), and [the documentation on ReadTheDocs](https://pypdf.readthedocs.io/en/latest/), [the package on PyPI](https://pypi.org/project/pypdf/), and [the website on GitHub](https://py-pdf.github.io/pypdf/dev/bench/). * A **maintainer** is a person who has technical permissions to change one or more parts of the projects. It is a person driven to keep the project running and improving. * A **contributor** is a person who contributes to the project. That could be through writing code - in the best case through forking and creating a pull request, but that is up to the maintainer. Other contributors describe issues, help to ask questions on existing issues to make them easier to answer, participate in discussions, and help to improve the documentation. Contributors are similar to maintainers, but without technical permissions. * A **user** is a person who imports pypdf into their code. All pypdf users are developers, but not developers who know the internals of pypdf. They only use the public interface of pypdf. They will likely have less knowledge about PDF than contributors. * The **community** is all of that - the users, the contributors, and the maintainers. 
## Governance, Leadership, and Steering pypdf forward pypdf is a free and open source project with over 100 contributors and likely (way) more than 1000 users. As pypdf does not have any formal relationship with any company and no funding, all the work done by the community is a voluntary contribution. People don't get paid, but choose to spend their free time to create software from which many more profit. This has to be honored and respected. Despite such a big community, the project was dormant from 2016 to 2022. There were still questions asked, issues reported, and pull requests created. But the maintainer didn't have the time to move pypdf forward. During that time, nobody else stepped up to become the new maintainer. For this reason, pypdf has the **Benevolent Dictator** governance model. The benevolent dictator is a maintainer with all technical permissions - most importantly the permission to push new pypdf versions on PyPI. Being benevolent, the benevolent dictator listens to the community when making decisions and tries their best to make decisions from which the overall community profits - the current one and the potential future one. Being a dictator, the benevolent dictator always has the power and the right to make decisions on their own - also against some members of the community. As pypdf is free software, parts of the community can split off (fork the code) and create a new community. This should limit the harm a bad benevolent dictator can do. ## Project Language The project language is (American) English. All documentation and issues must be written in English to ensure that the community can understand it. We appreciate the fact that large parts of the community don't have English as their native language. We try our best to understand others - [automatic translators](https://translate.google.com/) might help. 
## Expectations The community can expect the following: * The **benevolent dictator** tries their best to make decisions from which the overall community profits. The benevolent dictator is aware that his/her decisions can shape the overall community. Once the benevolent dictator notices that she/he doesn't have the time to advance pypdf, he/she looks for a new benevolent dictator. As it is expected that the benevolent dictator will step down at some point of their choice (hopefully before their death), it is NOT a benevolent dictator for life (BDFL). * Every **maintainer** (including the benevolent dictator) is aware of their permissions and the harm they could do. They value security and ensure that the project is not harmed. They give their technical permissions back if they don't need them any longer. Any long-time contributor can become a maintainer. Maintainers can - and should! - step down from their role when they realize that they can no longer commit the time. Their contribution will be honored in the {doc}`history`. * Every **contributor** is aware that the time of maintainers and the benevolent dictator is limited. Short pull requests that briefly describe the solved issue and have a unit test have a higher chance to get merged soon - simply because it's easier for maintainers to see that the contribution will not harm the overall project. Their contributions are documented in the git history and in the public issues. [Let us know](https://github.com/py-pdf/pypdf/discussions/798) if you would appreciate something else! * Every **community member** uses respectful language. We are all human, we get upset about things we care about, and things other than what's visible on the internet go on in our lives. pypdf does not pay its contributors - keep all of that in mind when you interact with others. We are here because we want to help others. 
### Issues and Discussions An issue is any technical description that aims at bringing pypdf forward: * Bug tickets: Something went wrong because pypdf developers made a mistake. * Feature requests: pypdf does not support all features of the PDF specifications. There are certainly also convenience methods that would help users a lot. * Robustness requests: There are many broken PDFs around. In some cases, we can deal with that. It's kind of a mixture between a bug ticket and a feature request. * Performance tickets: pypdf could be faster - let us know about your specific scenario. Any comment that is in those technical descriptions which is not helping the discussion can be deleted. This is especially true for "me too" comments on bugs or "bump" comments for desired features. People can express this with 👍 / 👎 reactions. [Discussions](https://github.com/py-pdf/pypdf/discussions) are open. No comments will be deleted there - except if they are unrelated spam or only try to insult people (luckily, the community has been very respectful so far 🤞) ### Releases The maintainers follow [semantic versioning](https://semver.org/). Most importantly, that means that breaking changes will have a major version bump. Be aware that unintentional breaking changes might still happen. The pypdf maintainers do their best to fix that in a timely manner - please [report such issues](https://github.com/py-pdf/pypdf/issues)! ## People * [stefan6419846](https://github.com/stefan6419846) has been the benevolent dictator since January 2025 * [Martin Thoma](https://github.com/MartinThoma) was the benevolent dictator from April 2022 to January 2025. He still has most of the permissions as a fallback. * Maintainers: * Matthew Stamy (mstamy2) was the benevolent dictator for a long time. He still is around on GitHub once in a while and has permissions on PyPI and GitHub. * Matthew Peveler (MasterOdin) is a maintainer on GitHub. 
================================================ FILE: docs/meta/scope-of-pypdf.md ================================================ # Scope of pypdf What features should pypdf have and which features will it never have? pypdf aims at simplifying interactions with PDF documents. Core tasks that pypdf can perform are: * Document manipulation: Splitting, merging, cropping, and transforming the pages of PDF files * Data Extraction: Extract text and metadata from PDF documents * Security: Decrypt / encrypt PDF documents Typical indicators that pypdf should do something: * The task needs in-depth knowledge of the PDF format * It currently requires a lot of code or even is impossible to do with pypdf * It's neither mentioned in "belongs in user code" nor in "out of scope" * It already is in the issue list with the [is-feature tag](https://github.com/py-pdf/pypdf/labels/is-feature). The [moonshot extensions](https://github.com/py-pdf/pypdf/discussions/1181) are features we would like to have, but are currently not able to add (PRs are welcome 😉) ## Belongs in user code Here are a few indicators that a feature belongs into users' code (and not into pypdf): 1. The use-case is very specific. Most people will not encounter the same need. 2. It can be done without knowledge of the PDF specification 3. It cannot be done without (non-pdf) domain knowledge. Anything that is specific to your industry. ## Out of scope While this list is infinitely long, there are a few topics that are asked multiple times. Those topics are out of scope for pypdf. They will never be part of pypdf: 1. **Optical Character Recognition (OCR)**: OCR is about extracting text from images. That is very different from the kind of text extraction pypdf is doing. Please note that images can be within PDF documents. In the case of scanned documents, the whole page is an image. Some scanners automatically execute OCR and add a text-layer behind the scanned page. That is something pypdf can use if it's present. 
As a rule-of-thumb: If you cannot mark/copy the text, it's likely an image. A noteworthy open source OCR project is [tesseract](https://github.com/tesseract-ocr/tesseract). 2. **Format Conversion**: Converting docx / HTML to PDF or PDF to those formats. You might want to have a look at [`pdfkit`](https://pypi.org/project/pdfkit/) and similar projects. Out of scope for the moment, but might be added if there are enough contributors: * **Digital Signature Support** ([reference ticket](https://github.com/py-pdf/pypdf/issues/302)): Cryptography is complicated. It's important to get it right. pypdf currently doesn't have enough active contributors to properly add digital signature support. For the moment, [pyhanko](https://pypi.org/project/pyHanko/) seems to be the best choice. * **PDF Generation from Scratch**: pypdf can manipulate existing PDF documents, add annotations, combine / split / crop / transform. It can add blank pages. But if you want to generate invoices, you might want to have a look at [`reportlab`](https://pypi.org/project/reportlab/) / [`fpdf2`](https://pypi.org/project/fpdf2/) or document conversion tools like [`pdfkit`](https://pypi.org/project/pdfkit/). * **Replacing words within a PDF**: [Extracting text from PDF is hard](../user/extract-text.md#why-text-extraction-is-hard). Replacing text in a reliable way is even harder. For example, one word might be split into multiple tokens. Hence, it's not a simple "search and replace" in some cases. * **(Not) Extracting headers/footers/page numbers**: While you can apply heuristics, there is no way to always make it work. PDF documents simply don't contain the information what a header/footer/page number is. ### Library vs. Application It's also worth pointing out that `pypdf` is designed to be a library. It is not an application. That has several implications: * Execution: pypdf cannot be executed directly, but only be called from within a program written by a pypdf user. 
In contrast, an application is executed on its own. * Dependencies: pypdf should have a minimal set of dependencies and only restrict them where it is strictly necessary. In contrast, applications should be installed in environments which are isolated from other applications. They can pin their dependencies. If you're looking for a way to interact with PDF files via Shell, you should either write a script using pypdf or use [`pdfly`](https://pypi.org/project/pdfly/). ================================================ FILE: docs/meta/taking-ownership.md ================================================ # Taking Ownership of pypdf pypdf is currently maintained by stefan6419846. We want to avoid that pypdf ever goes unmaintained again. This document serves as a guide to avoid that if I become unavailable, e.g., due to severe health issues. This currently is just an abstract scenario. I'm fine, and I will likely do this for several more years, but I have seen how projects stand still for many years because of the maintainer becoming inactive. ## What belongs to pypdf? The resources needed for maintaining pypdf are: * PyPI: [pypdf](https://pypi.org/project/pypdf/) and [PyPDF2](https://pypi.org/project/PyPDF2/) * GitHub: [pypdf](https://github.com/py-pdf/pypdf) (the repository, not the organization) * ReadTheDocs: [pypdf](https://readthedocs.org/projects/pypdf/) and [PyPDF2](https://readthedocs.org/projects/pypdf2/) ## When may somebody take ownership? **No activity in 180 days**: If I don't answer e-mails (see my GitHub profile) and don't make any commits / merges for half a year, you can consider pypdf "not maintained." ## Who may take ownership? Preferably, one of the owners of the GitHub `py-pdf` organization takes care of that. 
As of 27th of August 2023, the following people might be candidates: * [Lucas-C](https://github.com/Lucas-C): He maintains fpdf2 and is a py-pdf owner * [pubpub-zz](https://github.com/pubpub-zz): He is one of the most active contributors to pypdf * [Matthew Peveler](https://github.com/MasterOdin): Less active, but he is cautious about breaking changes and an experienced software developer. * [exiledkingcc](https://github.com/exiledkingcc): He has contributed the core changes related to encryption. ## How to take ownership? * PyPI: Follow [PEP 541 – Package Index Name Retention](https://peps.python.org/pep-0541/) * GitHub: Talk with one of the other py-pdf organization owners * ReadTheDocs: Follow the [Abandoned projects policy](https://docs.readthedocs.io/en/latest/abandoned-projects.html) ================================================ FILE: docs/modules/Destination.rst ================================================ The Destination Class --------------------- .. autoclass:: pypdf.generic.Destination :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/DocumentInformation.rst ================================================ The DocumentInformation Class ----------------------------- .. autoclass:: pypdf.DocumentInformation :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/Field.rst ================================================ The Field Class --------------- .. autoclass:: pypdf.generic.Field :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/Fit.rst ================================================ The Fit Class ------------- .. autoclass:: pypdf.generic.Fit :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/PageObject.rst ================================================ The PageObject Class -------------------- .. 
autoclass:: pypdf._page.PageObject :members: :undoc-members: :show-inheritance: .. autoclass:: pypdf._page.VirtualListImages :members: :undoc-members: :show-inheritance: .. autoclass:: pypdf._page.ImageFile :members: :inherited-members: File :undoc-members: .. autofunction:: pypdf.mult ================================================ FILE: docs/modules/PageRange.rst ================================================ The PageRange Class ------------------- .. autoclass:: pypdf.PageRange :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/PaperSize.rst ================================================ The PaperSize Class ------------------- .. autoclass:: pypdf.PaperSize :members: :undoc-members: :show-inheritance: Add blank page with PaperSize ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. testsetup :: pypdf_test_setup("modules/PaperSize", { "example.pdf": "../resources/example.pdf", }) .. testcode :: from pypdf import PaperSize, PdfWriter writer = PdfWriter(clone_from="example.pdf") writer.add_blank_page(PaperSize.A8.width, PaperSize.A8.height) writer.write("out-add-page.pdf") Insert blank page with PaperSize ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. testcode :: from pypdf import PaperSize, PdfWriter writer = PdfWriter(clone_from="example.pdf") writer.insert_blank_page(PaperSize.A8.width, PaperSize.A8.height, 1) writer.write("out-insert-page.pdf") ================================================ FILE: docs/modules/PdfDocCommon.rst ================================================ The PdfDocCommon Class ---------------------- **PdfDocCommon** is an abstract class which is inherited by :class:`~pypdf.PdfReader` and :class:`~pypdf.PdfWriter`. Where identified in the API, you can use any of the derived class. .. 
autoclass:: pypdf._doc_common.PdfDocCommon :members: :inherited-members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/PdfReader.rst ================================================ The PdfReader Class ------------------- .. autoclass:: pypdf.PdfReader :members: :inherited-members: :undoc-members: :show-inheritance: .. autoclass:: pypdf.PasswordType :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/PdfWriter.rst ================================================ The PdfWriter Class ------------------- .. autoclass:: pypdf.PdfWriter :members: :inherited-members: :undoc-members: :show-inheritance: .. autoclass:: pypdf.ObjectDeletionFlag :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/RectangleObject.rst ================================================ The RectangleObject Class ------------------------- .. autoclass:: pypdf.generic.RectangleObject :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/Transformation.rst ================================================ The Transformation Class ------------------------ .. autoclass:: pypdf.Transformation :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/XmpInformation.rst ================================================ The XmpInformation Class ------------------------- .. autoclass:: pypdf.xmp.XmpInformation :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/annotations.rst ================================================ The annotations module ---------------------- .. 
automodule:: pypdf.annotations :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/constants.rst ================================================ Constants --------- .. autoclass:: pypdf.constants.AnnotationFlag :members: :undoc-members: :show-inheritance: .. autoclass:: pypdf.constants.ImageType :members: :undoc-members: :show-inheritance: .. autoclass:: pypdf.constants.PageLabelStyle :members: :undoc-members: :show-inheritance: .. autoclass:: pypdf.constants.UserAccessPermissions :members: :undoc-members: :show-inheritance: .. autoclass:: pypdf.constants.FieldDictionaryAttributes :members: :undoc-members: :exclude-members: FT, Parent, Kids, T, TU, TM, V, DV, AA, Opt, attributes, attributes_dict :show-inheritance: ================================================ FILE: docs/modules/errors.rst ================================================ Errors ------ .. automodule:: pypdf.errors :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/modules/generic.rst ================================================ Generic PDF objects ------------------- .. automodule:: pypdf.generic :members: :undoc-members: :show-inheritance: :exclude-members: Destination, Field, Fit, RectangleObject .. autoclass:: pypdf._protocols.PdfObjectProtocol :members: :undoc-members: :show-inheritance: .. autoclass:: pypdf._protocols.XmpInformationProtocol :members: :undoc-members: :show-inheritance: .. autoclass:: pypdf._protocols.PdfCommonDocProtocol :members: :undoc-members: :show-inheritance: .. autoclass:: pypdf._protocols.PdfReaderProtocol :members: :undoc-members: :show-inheritance: .. 
autoclass:: pypdf._protocols.PdfWriterProtocol :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/user/add-javascript.md ================================================ # Adding JavaScript to a PDF PDF readers vary in the extent they support JavaScript, with some not supporting it at all. Adobe has documentation on its support here: [https://opensource.adobe.com/dc-acrobat-sdk-docs/library/jsapiref/index.html](https://opensource.adobe.com/dc-acrobat-sdk-docs/library/jsapiref/index.html) ## Launch print window on opening ```{testsetup} pypdf_test_setup("user/add-javascript", { "example.pdf": "../resources/example.pdf", }) ``` ```{testcode} from pypdf import PdfWriter writer = PdfWriter(clone_from="example.pdf") # Add JavaScript to launch the print window on opening this PDF. writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") writer.write("out-print-window.pdf") ``` ================================================ FILE: docs/user/add-watermark.md ================================================ # Adding a Stamp or Watermark to a PDF Adding stamps or watermarks are two common ways to manipulate PDF files. A stamp is adding something on top of the document, a watermark is in the background of the document. ## Stamp (Overlay) / Watermark (Underlay) The process of stamping and watermarking is the same, you just need to set `over` parameter to `True` for stamping and `False` for watermarking. 
You can use {func}`~pypdf._page.PageObject.merge_page` if you don't need to transform the stamp: ```{testsetup} pypdf_test_setup("user/add-watermark", { "crazyones.pdf": "../resources/crazyones.pdf", "nup-source.png": "../docs/user/nup-source.png", "jpeg.pdf": "../resources/jpeg.pdf", }) ``` ```{testcode} from pypdf import PdfReader, PdfWriter stamp = PdfReader("jpeg.pdf").pages[0] writer = PdfWriter(clone_from="crazyones.pdf") for page in writer.pages: page.merge_page(stamp, over=False) # here set to False for watermarking writer.write("out-watermark.pdf") ``` Otherwise use {func}`~pypdf._page.PageObject.merge_transformed_page` with {class}`~pypdf.Transformation` if you need to translate, rotate, scale, etc. the stamp before merging it to the content page. ```{testcode} from pathlib import Path from typing import List, Union from pypdf import PdfReader, PdfWriter, Transformation def stamp( content_pdf: Union[Path, str], stamp_pdf: Union[Path, str], pdf_result: Union[Path, str], page_indices: Union[None, List[int]] = None, ): stamp_page = PdfReader(stamp_pdf).pages[0] writer = PdfWriter() # page_indices can be a List(array) of page, tuples are for range definition reader = PdfReader(content_pdf) writer.append(reader, pages=page_indices) for content_page in writer.pages: content_page.merge_transformed_page( stamp_page, Transformation().scale(0.5), ) writer.write(pdf_result) stamp("crazyones.pdf", "jpeg.pdf", "out-scale.pdf") ``` If you are experiencing wrongly rotated watermarks/stamps, try to use {func}`~pypdf._page.PageObject.transfer_rotation_to_content` on the corresponding pages beforehand to fix the page boxes. Example of stamp: ![stamp.png](stamp.png) Example of watermark: ![watermark.png](watermark.png) ## Stamping images directly The above code only works for stamps that are already in PDF format. However, you can easily convert an image to PDF image using [Pillow](https://pypi.org/project/Pillow/). 
```{testcode} from io import BytesIO from pathlib import Path from typing import List, Union from PIL import Image from pypdf import PageRange, PdfReader, PdfWriter, Transformation def image_to_pdf(stamp_img: Union[Path, str]) -> PdfReader: img = Image.open(stamp_img) img_as_pdf = BytesIO() img.save(img_as_pdf, "pdf") return PdfReader(img_as_pdf) def stamp_img( content_pdf: Union[Path, str], stamp_img: Union[Path, str], pdf_result: Union[Path, str], page_indices: Union[PageRange, List[int], None] = None, ): # Convert the image to a PDF stamp_pdf = image_to_pdf(stamp_img) # Then use the same stamp code from above stamp_page = stamp_pdf.pages[0] writer = PdfWriter() reader = PdfReader(content_pdf) writer.append(reader, pages=page_indices) for content_page in writer.pages: content_page.merge_transformed_page( stamp_page, Transformation(), ) writer.write(pdf_result) stamp_img("crazyones.pdf", "nup-source.png", "out-image.pdf") ``` ================================================ FILE: docs/user/adding-pdf-annotations.md ================================================ # Adding PDF Annotations ```{note} By default, some annotations might be invisible, for example polylines, as the default color is "transparent". To circumvent this, make sure to add the `/C` entry to the annotation, being an array and each array value being in the range 0.0 to 1.0: * With one element, a grayscale value. * With three elements, a RGB definition. * With four elements, a CMYK definition. 
``` ## Attachments ```{testsetup} pypdf_test_setup("user/adding-pdf-annotations", { "crazyones.pdf": "../resources/crazyones.pdf", }) ``` ```{testcode} from pypdf import PdfWriter writer = PdfWriter() writer.add_blank_page(width=200, height=200) data = b"any bytes - typically read from a file" writer.add_attachment("smile.png", data) writer.write("out-attachment.pdf") ``` ## Free Text If you want to add text in a box like this ![](free-text-annotation.png) you can use {class}`~pypdf.annotations.FreeText`: ```{testcode} from pypdf import PdfReader, PdfWriter from pypdf.annotations import FreeText # Fill the writer with the pages you want reader = PdfReader("crazyones.pdf") page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Create the annotation and add it annotation = FreeText( text="Hello World\nThis is the second line!", rect=(50, 550, 200, 650), font="Arial", bold=True, italic=True, font_size="20pt", font_color="00ff00", border_color="0000ff", background_color="cdcdcd", ) # Set annotation flags to 4 for printable annotations. # See "AnnotationFlag" for other options, e.g. hidden etc. 
annotation.flags = 4 writer.add_annotation(page_number=0, annotation=annotation) # Write the annotated file to disk writer.write("out-free-text.pdf") ``` ## Text A text annotation looks like this: ![](text-annotation.png) ## Line If you want to add a line like this: ![](annotation-line.png) you can use {class}`~pypdf.annotations.Line`: ```{testcode} from pypdf import PdfReader, PdfWriter from pypdf.annotations import Line reader = PdfReader("crazyones.pdf") page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Add the line annotation = Line( text="Hello World\nLine2", rect=(50, 550, 200, 650), p1=(50, 550), p2=(200, 650), ) writer.add_annotation(page_number=0, annotation=annotation) # Write the annotated file to disk writer.write("out-line.pdf") ``` ## PolyLine If you want to add a line like this: ![](annotation-polyline.png) you can use {class}`~pypdf.annotations.PolyLine`: ```{testcode} from pypdf import PdfReader, PdfWriter from pypdf.annotations import PolyLine from pypdf.generic import ArrayObject, FloatObject, NameObject reader = PdfReader("crazyones.pdf") page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Add the polyline # By default, the line will be transparent. Set an explicit color. 
annotation = PolyLine( vertices=[(50, 550), (200, 650), (70, 750), (50, 700)], ) annotation[NameObject("/C")] = ArrayObject( [FloatObject(0.9), FloatObject(0.1), FloatObject(0)] ) writer.add_annotation(page_number=0, annotation=annotation) # Write the annotated file to disk writer.write("out-polyline.pdf") ``` ## Rectangle If you want to add a rectangle like this: ![](annotation-square.png) you can use {class}`~pypdf.annotations.Rectangle`: ```{testcode} from pypdf import PdfReader, PdfWriter from pypdf.annotations import Rectangle reader = PdfReader("crazyones.pdf") page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Add the rectangle annotation = Rectangle( rect=(50, 550, 200, 650), ) writer.add_annotation(page_number=0, annotation=annotation) # Write the annotated file to disk writer.write("out-rectangle.pdf") ``` If you want the rectangle to be filled, use the `interior_color="ff0000"` parameter. This method uses the "square" annotation type of the PDF format. ## Ellipse If you want to add a circle like this: ![](annotation-circle.png) you can use {class}`~pypdf.annotations.Ellipse`: ```{testcode} from pypdf import PdfReader, PdfWriter from pypdf.annotations import Ellipse reader = PdfReader("crazyones.pdf") page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Add the ellipse annotation = Ellipse( rect=(50, 550, 200, 650), ) writer.add_annotation(page_number=0, annotation=annotation) # Write the annotated file to disk writer.write("out-ellipse.pdf") ``` ## Polygon If you want to add a polygon like this: ![](annotation-polygon.png) you can use {class}`~pypdf.annotations.Polygon`: ```{testcode} from pypdf import PdfReader, PdfWriter from pypdf.annotations import Polygon reader = PdfReader("crazyones.pdf") page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Add the polygon annotation = Polygon( vertices=[(50, 550), (200, 650), (70, 750), (50, 700)], ) writer.add_annotation(page_number=0, annotation=annotation) # Write
the annotated file to disk writer.write("out-polygon.pdf") ``` ## Popup Manage the Popup windows for markups, looks like this: ![](annotation-popup.png) you can use {py:class}`~pypdf.annotations.Popup`: ```{testcode} from pypdf import PdfWriter from pypdf.annotations import Popup, Text # Arrange writer = PdfWriter() writer.append("crazyones.pdf", [0]) # Act text_annotation = writer.add_annotation( 0, Text( text="Hello World\nThis is the second line!", rect=(50, 550, 200, 650), open=True, ), ) popup_annotation = Popup( rect=(50, 550, 200, 650), open=True, parent=text_annotation, # use the output of add_annotation ) writer.write("out-popup.pdf") ``` You have to use the returned result from add_annotation() as it is the parent annotation with which this popup annotation shall be associated. ## Link If you want to add a link, you can use {class}`~pypdf.annotations.Link`: ```{testcode} from pypdf import PdfReader, PdfWriter from pypdf.annotations import Link reader = PdfReader("crazyones.pdf") page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Add the link annotation = Link( rect=(50, 550, 200, 650), url="https://martin-thoma.com/", ) writer.add_annotation(page_number=0, annotation=annotation) # Write the annotated file to disk writer.write("out-link.pdf") ``` You can also add internal links: ```{testcode} from pypdf import PdfReader, PdfWriter from pypdf.annotations import Link from pypdf.generic import Fit reader = PdfReader("crazyones.pdf") page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Add the link annotation = Link( rect=(50, 550, 200, 650), target_page_index=3, fit=Fit(fit_type="/FitH", fit_args=(123,)), ) writer.add_annotation(page_number=0, annotation=annotation) # Write the annotated file to disk writer.write("out-internal-link.pdf") ``` ## Text Markup Annotations Text markup annotations refer to a specific piece of text within the document. 
These are a bit more complicated as you need to know exactly where the text is, the so-called "Quad points". ### Highlighting If you want to highlight text like this: ![](annotation-highlight.png) you can use {class}`~pypdf.annotations.Highlight`: ```{testcode} from pypdf import PdfReader, PdfWriter from pypdf.annotations import Highlight from pypdf.generic import ArrayObject, FloatObject reader = PdfReader("crazyones.pdf") page = reader.pages[0] writer = PdfWriter() writer.add_page(page) rect = (50, 550, 200, 650) quad_points = [rect[0], rect[1], rect[2], rect[1], rect[0], rect[3], rect[2], rect[3]] # Add the highlight annotation = Highlight( rect=rect, quad_points=ArrayObject([FloatObject(quad_point) for quad_point in quad_points]), ) writer.add_annotation(page_number=0, annotation=annotation) # Write the annotated file to disk writer.write("out-highlight.pdf") ``` ================================================ FILE: docs/user/cropping-and-transforming.md ================================================ # Cropping and Transforming PDFs ```{note} Just because content is no longer visible, it is not gone. Cropping works by adjusting the viewbox. That means content that was cropped away can still be restored. ``` ```{testsetup} pypdf_test_setup("user/cropping-and-transforming", { "example.pdf": "../resources/example.pdf", "Seige_of_Vicksburg_Sample_OCR.pdf": "../resources/Seige_of_Vicksburg_Sample_OCR.pdf", "labeled-edges-center-image.pdf": "../resources/labeled-edges-center-image.pdf", "side-by-side-subfig.pdf": "../resources/side-by-side-subfig.pdf", "nup-source.pdf": "../resources/box.pdf", "box.pdf": "../resources/box.pdf", }) ``` ```{testcode} from pypdf import PdfReader, PdfWriter reader = PdfReader("Seige_of_Vicksburg_Sample_OCR.pdf") writer = PdfWriter() # Add page 1 from reader to output document, unchanged. writer.add_page(reader.pages[0]) # Add page 2 from reader, but rotated clockwise 90 degrees. 
writer.add_page(reader.pages[1].rotate(90)) # Add page 3 from reader, but crop it to half size. page3 = writer.add_page(reader.pages[2]) page3.mediabox.upper_right = ( page3.mediabox.right / 2, page3.mediabox.top / 2, ) writer.write("out-all-in-one.pdf") ``` ## Page rotation The most typical rotation is a clockwise rotation of the page by multiples of 90 degrees. That is done when the orientation of the page is wrong. You can do that with the {func}`~pypdf._page.PageObject.rotate` method: ```{testcode} from pypdf import PdfReader, PdfWriter reader = PdfReader("example.pdf") writer = PdfWriter() writer.add_page(reader.pages[0]) writer.pages[0].rotate(90) writer.write("out-page-rotation.pdf") ``` The rotate method is typically preferred over the `page.add_transformation(Transformation().rotate())` method, because `rotate` will ensure that the page is still in the mediabox/cropbox. The transformation object operates on the coordinates of the page contents and does not change the mediabox or cropbox. ## Plain Merge ![](plain-merge.png) is the result of ```{testcode} from pypdf import PdfReader, PdfWriter, Transformation # Get the data reader_base = PdfReader("labeled-edges-center-image.pdf") page_base = reader_base.pages[0] reader = PdfReader("box.pdf") page_box = reader.pages[0] # Write the result back writer = PdfWriter() page = writer.add_page(page_base) page.merge_page(page_box) writer.write("out-plain-merge.pdf") ``` ## Merge with Rotation ![](merge-45-deg-rot.png) ```{testcode} from pypdf import PdfReader, PdfWriter, Transformation # Get the data reader_base = PdfReader("labeled-edges-center-image.pdf") page_base = reader_base.pages[0] reader = PdfReader("box.pdf") page_box = reader.pages[0] # Prepare writer writer = PdfWriter() # Add base page. writer_page = writer.add_page(page_base) # Apply the transformation and merge the pages. 
transformation = Transformation().rotate(45) writer_page.merge_transformed_page(page_box, transformation) # Write the result back writer.write("out-merge-with-rotation.pdf") ``` If you add the `expand` parameter: ```{testcode} transformation = Transformation().rotate(45) writer_page.merge_transformed_page(page_box, transformation, expand=True) ``` you get: ![](merge-rotate-expand.png) Alternatively, you can move the merged image a bit to the right by using ```{testcode} op = Transformation().rotate(45).translate(tx=50) ``` ![](merge-translated.png) ## Scaling In pypdf, the content and the page can either be scaled together or separately. Content scaling scales the contents on a page, and page scaling scales just the page size (the canvas). Typically, you want to combine both. ![](scaling.png) ### Scaling both the Page and contents together ```{testcode} from pypdf import PdfReader, PdfWriter # Read the input reader = PdfReader("side-by-side-subfig.pdf") page = reader.pages[0] # Add to the writer writer = PdfWriter() writer_page = writer.add_page(page) # Scale writer_page.scale_by(0.5) # Write the result to a file writer.write("out-scale-all.pdf") ``` ### Scaling the content only The content is scaled around the origin of the coordinate system. Typically, that is the lower-left corner. 
```{testcode} from pypdf import PdfReader, PdfWriter, Transformation # Read the input reader = PdfReader("side-by-side-subfig.pdf") page = reader.pages[0] # Prepare the writer writer = PdfWriter() writer_page = writer.add_page(page) # Scale op = Transformation().scale(sx=0.7, sy=0.7) writer_page.add_transformation(op) # Write the result to a file writer.write("out-scale-content.pdf") ``` ### Scaling the page only To scale the page by `sx` in the X direction and `sy` in the Y direction: ```{testcode} page.mediabox = page.mediabox.scale(sx=0.7, sy=0.7) ``` If you wish to have more control, you can adjust the various page boxes directly: ```{testcode} from pypdf.generic import RectangleObject mb = page.mediabox page.mediabox = RectangleObject((mb.left, mb.bottom, mb.right, mb.top)) page.cropbox = RectangleObject((mb.left, mb.bottom, mb.right, mb.top)) page.trimbox = RectangleObject((mb.left, mb.bottom, mb.right, mb.top)) page.bleedbox = RectangleObject((mb.left, mb.bottom, mb.right, mb.top)) page.artbox = RectangleObject((mb.left, mb.bottom, mb.right, mb.top)) ``` ### pypdf._page.MERGE_CROP_BOX `pypdf<=3.4.0` used to merge the other page with `trimbox`. `pypdf>3.4.0` changes this behavior to `cropbox`. In case anybody has good reasons to use/expect `trimbox`, you can add the following code to get the old behavior: ```{testcode} import pypdf pypdf._page.MERGE_CROP_BOX = "trimbox" ``` ## Transforming several copies of the same page We have designed the following business card (A8 format) to advertise our new startup. ![](nup-source.png) We would like to copy this card sixteen times on an A4 page, to print it, cut it, and give it to all our friends. Having learned about the {func}`~pypdf._page.PageObject.merge_page` method and the {class}`~pypdf.Transformation` class, we run the following code. Notice that we had to tweak the media box of the source page to extend it, which is already a dirty hack (in this case). 
```{testcode} from pypdf import PaperSize, PdfReader, PdfWriter, Transformation # Read source file reader = PdfReader("nup-source.pdf") sourcepage = reader.pages[0] # Create a destination file, and add a blank page to it writer = PdfWriter() destpage = writer.add_blank_page(width=PaperSize.A4.height, height=PaperSize.A4.width) # Copy source page to destination page, several times for x in range(4): for y in range(4): # Translate page transformation = Transformation().translate( x * PaperSize.A8.height, y * PaperSize.A8.width, ) # Merge translated page destpage.merge_transformed_page(sourcepage, transformation) # Write file writer.write("out-nup-dest1.pdf") ``` ![](nup-dest2.png) There is still some work to do, for instance, to insert margins between and around cards, but this is left as an exercise for the reader… ## Possible issues Especially when combining {func}`~pypdf._page.PageObject.merge_page` with transformations, you might end up with a cropped PDF file. In these cases, consider setting `expand=True` to re-calculate the corresponding media box. ================================================ FILE: docs/user/encryption-decryption.md ================================================ # Encryption and Decryption of PDFs PDF encryption makes use of the [`RC4`](https://en.wikipedia.org/wiki/RC4) and [`AES`](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) algorithms with different key lengths. `pypdf` supports all of them up to `PDF-2.0`, which is the latest PDF standard. `pypdf` uses an extra dependency to do encryption or decryption for `AES` algorithms. We recommend [`pyca/cryptography`](https://cryptography.io/en/latest/). Alternatively, you can use [`pycryptodome`](https://pypi.org/project/pycryptodome/). ```{note} Please see the note in the [installation guide](installation.md) for installing the extra dependencies if interacting with PDFs that use AES.
``` ## Encrypt You can encrypt a PDF by using a password: ```{testsetup} pypdf_test_setup("user/encryption-decryption", { "example.pdf": "../resources/example.pdf", "encrypted-file.pdf": "../resources/encrypted-file.pdf", }) ``` ```{testcode} from pypdf import PdfReader, PdfWriter reader = PdfReader("example.pdf") writer = PdfWriter(clone_from=reader) # Add a password to the new PDF writer.encrypt("my-secret-password", algorithm="AES-256") # Save the new PDF to a file writer.write("out-encrypt.pdf") ``` The algorithm can be one of `RC4-40`, `RC4-128`, `AES-128`, `AES-256-R5`, `AES-256`. We recommend using `AES-256-R5`. ```{warning} pypdf uses `RC4` by default for compatibility if you omit the "algorithm" parameter. Since `RC4` is insecure, you should use `AES` algorithms. ``` ## Decrypt You can decrypt a PDF using the appropriate password: ```{testcode} from pypdf import PdfReader, PdfWriter reader = PdfReader("encrypted-file.pdf") if reader.is_encrypted: reader.decrypt("test") # secret password writer = PdfWriter(clone_from=reader) # Save the new PDF to a file writer.write("out-decrypt.pdf") ``` ================================================ FILE: docs/user/extract-images.md ================================================ # Extract Images ```{note} In order to use the following code you need to install optional dependencies, see [installation guide](installation.md). ``` Every page of a PDF document can contain an arbitrary number of images. The names of the files may not be unique. ```{testsetup} pypdf_test_setup("user/extract-images", { "example.pdf": "../resources/example.pdf", }) ``` ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") page = reader.pages[0] for i, image_file_object in enumerate(page.images): file_name = "out-image-" + str(i) + "-" + image_file_object.name image_file_object.image.save(file_name) ``` ## Other images Some other objects can contain images, such as stamp annotations. 
You can extract the image from the annotation with the following code: ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") im = ( reader.pages[0]["/Annots"][4]["/Parent"] .get_object()["/AP"]["/N"]["/Resources"]["/XObject"]["/Im4"] .decode_as_image() ) im.save("out-annotation-image.png") ``` ## Error handling Iterating over `page.images` directly will raise an exception on the first issue. If you expect some more or less broken PDF files, but still want to retrieve as many images as possible, consider making this a multistep process: ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") for page in reader.pages: for name in page.images.keys(): try: # Try to retrieve actual image. image = page.images[name] except Exception as exception: # Handle exceptions. pass ``` ================================================ FILE: docs/user/extract-text.md ================================================ # Extract Text from a PDF You can extract text from a PDF: ```{testsetup} pypdf_test_setup("user/extract-text", { "test Orient.pdf": "../resources/test Orient.pdf", "GeoBase_NHNC1_Data_Model_UML_EN.pdf": "../resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf", }) ``` ```{testcode} from pypdf import PdfReader reader = PdfReader("test Orient.pdf") page = reader.pages[0] print(page.extract_text()) # extract only text oriented up print(page.extract_text(0)) # extract text oriented up and turned left print(page.extract_text((0, 90))) # extract text in a fixed width format that closely adheres to the rendered # layout in the source pdf print(page.extract_text(extraction_mode="layout")) # extract text preserving horizontal positioning without excess vertical # whitespace (removes blank and "whitespace only" lines) print(page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)) # adjust horizontal spacing print(page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0)) # exclude (default) or include (as 
shown below) text rotated w.r.t. the page print(page.extract_text(extraction_mode="layout", layout_mode_strip_rotated=False)) ``` ```{testoutput} :options: +NORMALIZE_WHITESPACE :hide: (T) This is box text at top written down from top (B) This is box text at bottom written up from bottom (L) This is box text on left written vertically to starboard (R) This is box text on righy written vertically to port (T) This is box text at top written down from top (T) This is box text at top written down from top (L) This is box text on left written vertically to starboard (B) This is box text at bottom from bottom upwritten (T) This is box text at top written down from top (B) This is box text at bottom from bottom upwritten (T) This is box text at top written down from top (B) This is box text at bottom from bottom upwritten (T) This is box text at top written down from top (B) This is box text at bottom from bottom upwritten (L) This is box textwritten vertically to starboard on righy on left ) This is box text written vertically to port (R (T) This is box text at top written down from top ``` Refer to {func}`~pypdf._page.PageObject.extract_text` for more details. ```{note} Extracting the text of a page requires parsing its whole content stream. This can require quite a lot of memory - we have seen 10 GB RAM being required for an uncompressed content stream of about 300 MB (which should not occur very often). To limit the size of the content streams to process (and avoid OOM errors in your application), consider checking `len(page.get_contents().get_data())` beforehand. ``` ```{note} If a PDF page appears to contain only an image (e.g., a scanned document), the extracted text may be minimal or visually empty. In such cases, consider using OCR software such as [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) to extract text from images. ``` ## Using a visitor You can use visitor functions to control which part of a page you want to process and extract. 
The visitor functions you provide will get called for each operator or for each text fragment. The function passed as the `visitor_text` argument of `extract_text` receives five arguments: * text: the current text (as long as possible, can be up to a full line) * user_matrix: current matrix to move from user coordinate space (also known as CTM) * tm_matrix: current matrix from text coordinate space * font_dictionary: full font dictionary * font_size: the size (in text coordinate space) Each matrix stores six parameters. The first four provide the rotation/scaling matrix, and the last two provide the translation (horizontal/vertical). It is recommended to use the user_matrix as it takes into account all transformations. Notes: - As indicated in §8.3.3 of the PDF 1.7 or PDF 2.0 specification, the user matrix applies to text space/image space/form space/pattern space. - If you want to get the full transformation from text to user space, you can use the {func}`~.pypdf.mult` function as follows: `txt2user = mult(tm, cm)`. The font size is the raw text size and is affected by the `user_matrix`. The `font_dictionary` may be `None` in case of unknown fonts. If not `None`, it could contain something like the key `"/BaseFont"` with the value `"/Arial,Bold"`. **Caveat**: In complicated documents, the calculated positions may be difficult to determine (if you move from multiple forms to page user space, for example). The function passed as the `visitor_operand_before` argument receives four arguments: operator, operand-arguments, current transformation matrix, and text matrix. ### Example 1: Ignore header and footer The following example reads the text of page four of [this PDF document](https://github.com/py-pdf/pypdf/blob/main/resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf), but ignores the header (y > 720) and footer (y < 50). In this file we also need to include new line characters (y == 0).
```{testcode} from pypdf import PdfReader reader = PdfReader("GeoBase_NHNC1_Data_Model_UML_EN.pdf") page = reader.pages[3] parts = [] def visitor_body(text, cm, tm, font_dict, font_size): y = tm[5] if 50 < y < 720 or y == 0: parts.append(text) page.extract_text(visitor_text=visitor_body) text_body = "".join(parts) print(text_body) ``` ```{testoutput} :options: +NORMALIZE_WHITESPACE :hide: TABLE OF CONTENTS 1 OVERVIEW ............................................................................................................................................ 6 2 LRS ........................................................................................................................................................ 6 2.1 LRS MODEL ...................................................................................................................................... 7 3 MODEL .................................................................................................................................................. 8 3.1 LRS MODEL ...................................................................................................................................... 9 3.1.1 Logical view ............................................................................................................................... 9 3.1.2 Hydro network.......................................................................................................................... 10 3.1.3 Hydro events............................................................................................................................ 11 3.1.4 Hydrographic ........................................................................................................................... 14 3.1.5 Toponymy (external package) ................................................................................................. 
18 3.1.6 Metadata .................................................................................................................................. 19 ``` ### Example 2: Extract rectangles and texts into an SVG file The following example converts page three of [this PDF document](https://github.com/py-pdf/pypdf/blob/main/resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf) into an [SVG file](https://en.wikipedia.org/wiki/Scalable_Vector_Graphics). Such an SVG export may help to understand what is going on in a page. % We prefer not to execute doc examples for unmaintained third-party package "svgwrite" ```{testcode} :skipif: True from pypdf import PdfReader import svgwrite reader = PdfReader("GeoBase_NHNC1_Data_Model_UML_EN.pdf") page = reader.pages[2] dwg = svgwrite.Drawing("GeoBase_test.svg", profile="tiny") def visitor_svg_rect(op, args, cm, tm): if op == b"re": (x, y, w, h) = (args[i].as_numeric() for i in range(4)) dwg.add(dwg.rect((x, y), (w, h), stroke="red", fill_opacity=0.05)) def visitor_svg_text(text, cm, tm, font_dict, font_size): (x, y) = (cm[4], cm[5]) dwg.add(dwg.text(text, insert=(x, y), fill="blue")) page.extract_text( visitor_operand_before=visitor_svg_rect, visitor_text=visitor_svg_text ) dwg.save() ``` The SVG generated here is bottom-up because the coordinate systems of PDF and SVG differ. Unfortunately, in complicated PDF documents the coordinates given to the visitor functions may be wrong. ## Why Text Extraction is hard ### Unclear Objective Extracting text from a PDF can be tricky. In several cases, there is no clear answer to what the expected result should look like: 1. **Paragraphs**: Should the text of a paragraph have line breaks at the same places where the original PDF had them or should it rather be one block of text? 2. **Page numbers**: Should they be included in the extract? 3. **Headers and Footers**: Similar to page numbers - should they be extracted? 4. **Outlines**: Should outlines be extracted at all? 5. 
**Formatting**: If the text is **bold** or *italic*, should that formatting be included in the output? 6. **Tables**: Should the text extraction skip tables? Should it extract just the text? Should the borders be shown in some Markdown-like way or should the structure be present e.g. as an HTML table? How would you deal with merged cells? 7. **Captions**: Should image and table captions be included? 8. **Ligatures**: The Unicode symbol [U+FB00](https://www.compart.com/de/unicode/U+FB00) is a single symbol ﬀ for two lowercase letters 'f'. Should that be parsed as the Unicode symbol 'ﬀ' or as two ASCII symbols 'ff'? 9. **SVG images**: Should the text parts be extracted? 10. **Mathematical Formulas**: Should they be extracted? Formulas have indices and nested fractions. 11. **Whitespace characters**: How many new lines should be extracted for 3 cm of vertical whitespace? How many spaces should be extracted if there is 3 cm of horizontal whitespace? When would you extract tabs and when spaces? 12. **Footnotes**: When the text of multiple pages is extracted, where should footnotes be shown? 13. **Hyperlinks and Metadata**: Should they be extracted at all? Where should they be placed, and in which format? 14. **Linearization**: Assume you have a floating figure in the middle of a paragraph. Do you first finish the paragraph, or do you put the figure text in between? Then there are issues where most people would agree on the correct output, but the way PDF stores information just makes it hard to achieve that: 1. **Tables**: Typically, tables are just absolutely positioned text. In the worst case, every single letter could be absolutely positioned. That makes it hard to tell where columns / rows are. 2. **Images**: Sometimes PDFs do not contain the text as it is displayed, but instead an image. You notice that when you cannot copy the text. Then there are PDF files that contain an image and a text layer in the background. That typically happens when a document was scanned.
Although the scanning software (OCR) is pretty good today, it still fails once in a while. pypdf is no OCR software; it will not be able to detect those failures. pypdf will also never be able to extract text from images. Finally, there are issues that pypdf will deal with. If you find such a text extraction bug, please share the PDF with us so we can work on it! ### Missing Semantic Layer The PDF file format is all about producing the desired visual result for printing. It was not created for parsing the content. PDF files don't contain a semantic layer. Specifically, there is no information about what the header, footer, page numbers, tables, and paragraphs are. The visual appearance is there, and people might find heuristics to make educated guesses, but there is no way of being certain. This is a shortcoming of the PDF file format, not of pypdf. It is possible to apply machine learning on PDF documents to make good heuristics, but that will not be part of pypdf. However, pypdf could be used to feed such a machine learning system with the relevant information. ### Whitespaces The PDF format is meant for printing. It is not designed to be read by machines. The text within a PDF document is absolutely positioned, meaning that every single character could be positioned on the page. The text > This is a test document by Ethan Nelson. can be represented as > [(This is a )9(te)-3(st)9( do)-4(cu)13(m)-4(en)12(t )-3(b)3(y)-3( )9(Et)-2(h)3(an)4( Nels)13(o)-5(n)3(.)] TJ Where the numbers are adjustments of the horizontal space between the surrounding glyphs (in thousandths of a text space unit). This representation used within the PDF file makes it very hard to guarantee correct whitespaces. More information: * [issue #1507](https://github.com/py-pdf/pypdf/issues/1507) * [Negative numbers in PDF content stream text object](https://stackoverflow.com/a/28203655/562769) * Mark Stephens: [Understanding PDF text objects](https://blog.idrsolutions.com/understanding-pdf-text-objects/), 2010. ## OCR vs.
Text Extraction Optical Character Recognition (OCR) is the process of extracting text from images. Software which does this is called *OCR software*. The [tesseract OCR engine](https://github.com/tesseract-ocr/tesseract) is the most commonly known open source OCR software. pypdf is **not** OCR software. ### Digitally-born vs. Scanned PDF files PDF documents can contain images and text. PDF files don't store text in a semantically meaningful way, but in a way that makes it easy to show the text on screen or print it. For this reason, text extraction from PDFs is hard. If you scan a document, the resulting PDF typically shows the image of the scan. Scanners then also run OCR software and put the recognized text in the background of the image. pypdf can extract this result of the scanner's OCR software. However, in such cases, it's recommended to directly use OCR software as errors can accumulate: The OCR software is not perfect in recognizing the text. Then it stores the text in a format that is not meant for text extraction and pypdf might make mistakes parsing that. Hence, I would distinguish three types of PDF documents: * **Digitally born PDF files**: The file was created digitally on the computer. It can contain images, texts, links, outline items (a.k.a., bookmarks), JavaScript, ... If you zoom in a lot, the text still looks sharp. * **Scanned PDF files**: Any number of pages was scanned. The images were then stored in a PDF file. Hence, the file is just a container for those images. You cannot copy the text, you don't have links, outline items, JavaScript. * **OCRed PDF files**: The scanner ran OCR software and put the recognized text in the background of the image. Hence, you can copy the text, but it still looks like a scan. If you zoom in enough, you can recognize pixels. ### Can we just always use OCR? You might now wonder if it makes sense to just always use OCR software. If the PDF file is digitally-born, you can render it to an image.
I would recommend not to do that. Text extraction software like pypdf can use more information from the PDF than just the image. It can know about fonts, encodings, typical character distances and similar topics. That means pypdf has a clear advantage when it comes to characters which are easy to confuse such as `oO0ö`. **pypdf will never confuse characters**. It just reads what is in the file. pypdf also has an edge when it comes to characters which are rare, e.g. 🤰. OCR software will not be able to recognize smileys correctly. ## Attempts to prevent text extraction If people who share PDF documents want to prevent text extraction, they have multiple ways to do so: 1. Store the contents of the PDF as an image 2. [Use a scrambled font](https://stackoverflow.com/a/43466923/562769) However, text extraction cannot be completely prevented if people should still be able to read the document. In the worst case, people can make a screenshot, print it, scan it, and run OCR over it. ================================================ FILE: docs/user/file-size.md ================================================ # Reduce PDF File Size There are multiple ways to reduce the size of a given PDF file. The easiest one is to remove content (e.g., images) or pages. ## Removing duplication Some PDF documents contain the same object multiple times. For example, if an image appears three times in a PDF, it could be embedded three times. Or it can be embedded once and referenced twice. When adding data to a PdfWriter, the data is copied while respecting the original format. For example, if two pages include the same image which is duplicated in the source document, the object will be duplicated in the PdfWriter object. Additionally, when you delete objects in a document, pypdf cannot easily identify whether the objects are used elsewhere or not or if the user wants to keep them in. When writing the PDF file, these objects will be hidden within (part of the file, but not displayed). 
To reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)` * `remove_identicals` enables/disables compression merging identical objects. * `remove_orphans` enables/disables suppression of unused objects. It is recommended to apply this process just before writing to the file/stream. It depends on the PDF how well this works, but we have seen an 86% file reduction (from 5.7 MB to 0.8 MB) within a real PDF. ## Removing Images ```{testsetup} pypdf_test_setup("user/file-size", { "example.pdf": "../resources/example.pdf", }) ``` ```{testcode} from pypdf import PdfWriter writer = PdfWriter(clone_from="example.pdf") writer.remove_images() writer.write("out-no-images.pdf") ``` ## Reducing Image Quality If we reduce the quality of the images within the PDF, we can **sometimes** reduce the file size of the PDF overall. That depends on how well the reduced quality image can be compressed. ```{testcode} from pypdf import PdfWriter writer = PdfWriter(clone_from="example.pdf") for page in writer.pages: for img in page.images: img.replace(img.image, quality=80) writer.write("out-low-quality.pdf") ``` ## Lossless Compression pypdf supports the FlateDecode filter which uses the zlib/deflate compression method. It is a lossless compression, meaning the resulting PDF looks exactly the same. Deflate compression can be applied to a page via {meth}`page.compress_content_streams `: ```{testcode} from pypdf import PdfWriter writer = PdfWriter(clone_from="example.pdf") for page in writer.pages: page.compress_content_streams() # This is CPU intensive! writer.write("out-lossless.pdf") ``` `page.compress_content_streams` uses [`zlib.compress`](https://docs.python.org/3/library/zlib.html#zlib.compress) and supports the `level` parameter: `level=0` means no compression, `level=9` refers to the highest compression. Using this method, we have seen a reduction by 70% (from 11.8 MB to 3.5 MB) with a real PDF. 
## Removing Sources When a page is removed from the page list, its content will still be present in the PDF file. This means that the data may still be used elsewhere. Simply removing a page from the page list will reduce the page count but not the file size. To exclude the content completely, the pages should not be added to the PDF using the PdfWriter.append() function. Instead, only the desired pages should be selected for inclusion (note: [PR #1843](https://github.com/py-pdf/pypdf/pull/1843) will add a page deletion feature). There can be issues with poor PDF formatting, such as when all pages are linked to the same resource. In such cases, dropping references to specific pages becomes useless because there is only one source for all pages. Cropping is an ineffective method for reducing the file size because it only adjusts the viewboxes and not the external parts of the source image. Therefore, the content that is no longer visible will still be present in the PDF. ## Going Further The presentation [Putting a Squeeze on Your PDF](https://youtube.com/watch?v=tgOABUhVwFs) has other suggestions. One takeaway is that most of the significant size optimizations usually come from image and font modification. However, font optimization, such as replacing, merging, and subsetting, is not within the functionality of pypdf at the moment. 
================================================ FILE: docs/user/forms.md ================================================ # Interactions with PDF Forms ## Reading form fields ```{testsetup} pypdf_test_setup("user/forms", { "form.pdf": "../resources/form.pdf", }) ``` ```{testcode} from pypdf import PdfReader reader = PdfReader("form.pdf") fields = reader.get_form_text_fields() fields == {"key": "value", "key2": "value2"} # You can also get all fields: fields = reader.get_fields() ``` ## Filling out forms ```{testcode} from pypdf import PdfReader, PdfWriter reader = PdfReader("form.pdf") writer = PdfWriter() page = reader.pages[0] fields = reader.get_fields() writer.append(reader) writer.update_page_form_field_values( writer.pages[0], {"fieldname": "some filled in text"}, auto_regenerate=False, ) writer.write("out-filled-form.pdf") ``` Generally speaking, you will always want to use `auto_regenerate=False`. The parameter is `True` by default for legacy compatibility, but this flags the PDF processor to recompute the field's rendering, and may trigger a "save changes" dialog for users who open the generated PDF. If you want to flatten your form, that is, keeping all form field contents while removing the form fields themselves, you can set the `flatten` parameter in {func}`~pypdf.PdfWriter.update_page_form_field_values` to `True`. This will convert form field contents to regular PDF content. Afterwards, use {func}`~pypdf.PdfWriter.remove_annotations` with `subtypes="/Widget"` to remove all form fields to get an actual flattened PDF. ## Some notes about form fields and annotations PDF forms have a dual-nature approach to the fields: * Within the root object, an `/AcroForm` structure exists. Inside it, you could find (optional): - some global elements (Fonts, Resources,...) 
- some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_page_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch) - `/XFA` that houses a form in XDP format (very specific XML that describes the form rendered by some viewers); the `/XFA` form overrides the page content - `/Fields` that houses an array of indirect references that reference the upper _Field_ Objects (roots) * Within the page `/Annots`, you will spot `/Widget` annotations that define the visual rendering. To flesh out this overview: * The core-specific properties of a field are: - `/FT`: The field type (Button, Text, Choice, or Signature). - `/T`: The partial field name. - `/V`: The field’s value, whose format varies depending on the field type. - `/DV`: The default value to which the field reverts when a reset-form action is executed. * To streamline readability, _Field_ Objects and _Widget_ Objects can be fused housing all properties. * Fields can be organized hierarchically, id est one field can be placed under another. In such instances, the `/Parent` will have an IndirectObject providing Bottom-Up links and `/Kids` is an array carrying IndirectObjects for Top-Down navigation; _Widget_ Objects are still required for visual rendering. To call upon them, use the *fully qualified field name* (where all the individual names of the parent objects are separated by `.`) For instance, take two (visual) fields both called _city_, but attached below _sender_ and _receiver_; the corresponding full names will be _sender.city_ and _receiver.city_. * When a field is repeated on multiple pages, the Field Object will have many _Widget_ Objects in `/Kids`. These objects are pure _widgets_, containing no _field_ specific data. * If Fields stores only hidden values, no _Widgets_ are required. 
In _pypdf_ fields are extracted from the `/Fields` array: ```{testcode} from pypdf import PdfReader reader = PdfReader("form.pdf") fields = reader.get_fields() ``` ```{testcode} from pypdf import PdfReader from pypdf.constants import AnnotationDictionaryAttributes reader = PdfReader("form.pdf") fields = [] for page in reader.pages: for annot in page.annotations: annot = annot.get_object() if annot[AnnotationDictionaryAttributes.Subtype] == "/Widget": fields.append(annot) ``` However, while similar, there are some relevant differences between the two above blocks of code. Most importantly, the first block will return a list of Field objects, whereas the second will return more generic dictionary-like objects. The object lists will *mostly* reference the same object in the underlying PDF, meaning you'll find that `obj_taken_from_first_list.indirect_reference == obj_taken_from_second_list.indirect_reference`. Field objects are generally more ergonomic, as the exposed data can be accessed via clearly named properties. However, the more generic dictionary-like objects will contain data that the Field object does not expose, such as the Rect (the widget's position on the page). Therefore, the correct approach depends on your use case. However, it is also important to note that the two lists do not *always* refer to the same underlying PDF object. For example, if the form contains radio buttons, you will find that `reader.get_fields()` will get the parent object (the group of radio buttons) whereas `page.annotations` will return all the child objects (the individual radio buttons). ```{note} Remember that fields are not stored in pages; if you use `add_page()` the field structure is not copied. It is recommended to use `.append()` with the proper parameters instead. ``` In case of missing _field_ objects in `/Fields`, `writer.reattach_fields()` will parse page(s) annotations and will reattach them.
This fix cannot guess intermediate fields and will not report fields using the same _name_. ## Identify pages where fields are used To ease locating page fields you can use `get_pages_showing_field` of PdfReader or PdfWriter. This method accepts a field object, a *PdfObject* that represents a field (as extracted from `_root_object["/AcroForm"]["/Fields"]`). The method returns a list of pages, because a field can have multiple widgets as mentioned previously (e.g., radio buttons or text displayed on multiple pages). The page numbers can then be retrieved as usual by using `page.page_number`. ================================================ FILE: docs/user/handle-attachments.md ================================================ # Handle Attachments PDF documents can contain attachments, sometimes also called embedded files. ## Retrieve Attachments Attachments have a name, but it might not be unique. For this reason, the value of `reader.attachments["attachment_name"]` is a list. You can extract all attachments like this: ```{testsetup} pypdf_test_setup("user/handle-attachments", { "example.pdf": "../resources/example.pdf", }) ``` ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") for name, content_list in reader.attachments.items(): for i, content in enumerate(content_list): with open(f"out-attachment-{i}-{name}", "wb") as fp: fp.write(content) ``` Alternatively, you can retrieve them in an object-oriented fashion if you need further details for these files: ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") for attachment in reader.attachment_list: print(attachment.name, attachment.alternative_name, attachment.content) ``` ## Add Attachments To add a new attachment, use the following code: ```{testcode} from pypdf import PdfWriter writer = PdfWriter(clone_from="example.pdf") writer.add_attachment(filename="test.txt", data=b"Hello World!") ``` As you can see, the basic attachment properties are its name and
content. If you want to modify further properties of it, the returned object provides corresponding setters: ```{testcode} import datetime import hashlib from pypdf import PdfWriter from pypdf.generic import create_string_object, ByteStringObject, NameObject, NumberObject writer = PdfWriter(clone_from="example.pdf") embedded_file = writer.add_attachment(filename="test.txt", data=b"Hello World!") embedded_file.size = NumberObject(len(b"Hello World!")) embedded_file.alternative_name = create_string_object("test1.txt") embedded_file.description = create_string_object("My test file") embedded_file.subtype = NameObject("/text/plain") embedded_file.checksum = ByteStringObject(hashlib.md5(b"Hello World!").digest()) embedded_file.modification_date = datetime.datetime.now(tz=datetime.timezone.utc) # embedded_file.content = "My new content." writer.write("out-add-attachment.pdf") ``` The same functionality is available if you iterate over the attachments of a writer using `writer.attachment_list`. ## Delete Attachments To delete an existing attachment, use the following code: ```{testcode} from pypdf import PdfWriter writer = PdfWriter(clone_from="example.pdf") attachment = writer.add_attachment(filename="test.txt", data=b"Hello World!") attachment.delete() assert list(writer.attachment_list) == [] ``` Please note that this will not delete the associated file relationship if it exists. Deleting them as well would require us to know where this has been defined, which requires more complexity. For now, please consider looking for the corresponding definition yourself and delete it from the array. 
## PDF/A compliance The following example shows how to add an attachment to a PDF/A-3B compliant document without breaking compliance: ```{testcode} from pypdf import PdfWriter from pypdf.constants import AFRelationship from pypdf.generic import create_string_object, ArrayObject, NameObject writer = PdfWriter(clone_from="example.pdf") attachment = writer.add_attachment(filename="test.txt", data="Hello World!") attachment.subtype = NameObject("/text/plain") attachment.associated_file_relationship = NameObject(AFRelationship.SUPPLEMENT) attachment.alternative_name = create_string_object(attachment.name) if "/AF" in writer.root_object: af = writer.root_object["/AF"].get_object() else: af = ArrayObject() writer.root_object[NameObject("/AF")] = af af.append(attachment.pdf_object.indirect_reference) writer.write("out-a3b.pdf") ``` This example marks a relationship of the attachment to the whole document. Alternatively, it can be added to most of the other PDF objects as well. For details, see the corresponding PDF specification, like section 14.13 of the PDF 2.0 specification. ================================================ FILE: docs/user/handling-outlines.md ================================================ # Handling Outlines PDF outlines - also known as bookmarks - provide a structured navigation panel in PDF readers. `pypdf` allows you to read, create, and modify both simple and deeply nested outlines. ## Writing PDF Outlines To add outlines, use the {meth}`~pypdf.PdfWriter.add_outline_item` method. This method returns a reference to the created outline, which you can use as a parent to create nested (hierarchical) bookmarks. ### Adding a Simple Outline The following example shows how to add a single top-level bookmark. We add an outline item pointing to the first page (index `0`) and save the result. 
```{testsetup} pypdf_test_setup("user/handling-outlines", { "crazyones.pdf":"../resources/crazyones.pdf", }) ``` ```{testcode} from pypdf import PdfWriter writer = PdfWriter(clone_from="crazyones.pdf") # Add a top-level bookmark writer.add_outline_item( title="Introduction", page_number=0 ) writer.write("simple-example.pdf") ``` ### Adding Nested Outlines You can build hierarchies (like Chapter → Section) by passing the parent outline item to the `parent` parameter of a new item. In the example below, we create a root item "Chapter 1" and nest two sections under it. ```{testcode} from pypdf import PdfWriter writer = PdfWriter(clone_from="crazyones.pdf") # Add parent (Chapter) introduction = writer.add_outline_item( title="Chapter 1", page_number=0 ) # Add children (sections) nested under the introduction writer.add_outline_item( title="Section 1.1", page_number=0, parent=introduction ) writer.add_outline_item( title="Section 1.2", page_number=0, parent=introduction ) writer.write("nested-example.pdf") ``` ### Advanced Styling and View Modes (Fit Options) You can customize the appearance and behavior of bookmarks using optional parameters, such as changing the text color or applying bold and italic styles. For detailed information on all available parameters and their formats, please refer to the {meth}`~pypdf.PdfWriter.add_outline_item` API documentation. The ``fit`` parameter determines how the page is displayed when the user clicks the bookmark. You can use the {class}`~pypdf.generic.Fit` helper to specify modes like {meth}`~pypdf.generic.Fit.fit`, {meth}`~pypdf.generic.Fit.fit_horizontally`, or {meth}`~pypdf.generic.Fit.xyz`.
```{testcode} from pypdf import PdfWriter from pypdf.generic import Fit writer = PdfWriter(clone_from="crazyones.pdf") # Top-level chapter (Points to Page 1, Index 0) chapter2 = writer.add_outline_item( title="Chapter 2", page_number=0, color=(0, 0, 1), bold=True, italic=False, is_open=True, fit=Fit.fit() ) # Section under Chapter 2 (Points to Page 1, Index 0) section2_1 = writer.add_outline_item( title="Section 2.1", page_number=0, parent=chapter2, color=(0, 0.5, 0), bold=False, italic=True, is_open=False, fit=Fit.fit_horizontally(top=800) ) # Section with custom zoom (Points to Page 1, Index 0) section2_2 = writer.add_outline_item( title="Section 2.2", page_number=0, parent=chapter2, color=(1, 0, 0), bold=True, italic=True, is_open=True, fit=Fit.xyz(left=0, top=800, zoom=1.25) ) writer.write("advanced-example.pdf") ``` ```{figure} complete-outlines.png :alt: An annotated screenshot illustrating simple, nested, and advanced PDF bookmarks. An annotated screenshot illustrating simple, nested, and advanced PDF bookmarks in a Table of Contents. ``` ## Reading PDF Outlines `pypdf` represents outlines as a list of {class}`~pypdf.generic.Destination` objects. If an outline has children, they appear as a nested list directly following their parent. To retrieve the page number a bookmark points to, use the {meth}`~pypdf.PdfReader.get_destination_page_number` method, which returns a zero-based page index. ### Reading Simple Outlines To extract only the top-level bookmarks (ignoring nested sections), you can iterate over the {attr}`~pypdf.PdfReader.outline` property. Since nested children appear as lists within the outline structure, you must explicitly check for and skip them (`isinstance(outline, list)`) to avoid errors. The example below reads the file created in the previous section.
```{testcode} from pypdf import PdfReader reader = PdfReader("simple-example.pdf") print("Simple Outline (Top-Level Only):") print("-" * 32) for outline in reader.outline: # Check if the item is a list (which represents nested children) if isinstance(outline, list): continue # Skip the nested parts completely page_number = reader.get_destination_page_number(outline) if page_number is None: print(f"{outline.title} -> No page destination") else: print(f"{outline.title} -> page {page_number + 1}") ``` ```{testoutput} Simple Outline (Top-Level Only): -------------------------------- Introduction -> page 1 ``` ### Reading Nested Outlines When dealing with hierarchical bookmarks, the {attr}`~pypdf.PdfReader.outline` property may contain lists inside lists. You should use a recursive function to traverse the tree. The following example defines a `print_outline` function that handles indentation and nested lists to display the structure of the document we created earlier. ```{testcode} from typing import List, Union from pypdf import PdfReader from pypdf.generic import Destination def print_outline( outlines: List[Union[Destination, List[Destination]]], reader: PdfReader, level: int = 0 ) -> None: """Recursively print all outline items with indentation.""" for item in outlines: if isinstance(item, list): # Recursively handle the nested list of children print_outline(item, reader, level + 1) else: page_number = reader.get_destination_page_number(item) indent = " " * level if page_number is None: print(f"{indent}- {item.title} (No page destination)") else: print(f"{indent}- {item.title} (Page {page_number + 1})") reader = PdfReader("nested-example.pdf") print("Nested Outline Hierarchy:") print("-" * 25) print_outline(reader.outline, reader) ``` ```{testoutput} Nested Outline Hierarchy: ------------------------- - Chapter 1 (Page 1) - Section 1.1 (Page 1) - Section 1.2 (Page 1) ``` ================================================ FILE: docs/user/installation.md 
================================================ # Installation There are several ways to install pypdf. The most common option is to use pip. ## pip pypdf requires Python 3.9+ to run. Typically, Python comes with `pip`, a package installer. Using it, you can install pypdf: ```bash pip install pypdf ``` If you are not a superuser (a system administrator / root), you can also just install pypdf for your current user: ```bash pip install --user pypdf ``` ### Optional dependencies pypdf tries to be as self-contained as possible, but for some tasks, the amount of work to properly maintain the code would be too high. This is especially the case for cryptography and image formats. If you simply want to install all optional dependencies, run: ``` pip install pypdf[full] ``` Alternatively, you can install just some: If you plan to use pypdf for encrypting or decrypting PDFs that use AES, you will need to install some extra dependencies. Encryption using RC4 is supported using the regular installation. ``` pip install pypdf[crypto] ``` If you plan to use image extraction, you need Pillow: ``` pip install pypdf[image] ``` For JBIG2 support, you need to install a global OS-level package as well: [`jbig2dec`](https://github.com/ArtifexSoftware/jbig2dec) The installation procedure depends on your operating system. For Ubuntu, use the following, for example: ``` sudo apt-get install jbig2dec ``` ## Python Version Support Since pypdf 4.0, every release, including point releases, should work with all supported versions of [Python](https://devguide.python.org/versions/). Thus, every point release is designed to work with all existing Python versions, excluding end-of-life versions.
Previous versions of pypdf support the following versions of Python: | Python | 3.11 | 3.10 | 3.9 | 3.8 | 3.7 | 3.6 | 2.7 | |------------------------|:----:|:----:|:---:|:---:|:---:|:---:|:---:| | pypdf 3.x | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | PyPDF2 >= 2.0 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | PyPDF2 1.20.0 - 1.28.4 | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | PyPDF2 1.15.0 - 1.20.0 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ## Anaconda Anaconda users can [install pypdf via conda-forge](https://anaconda.org/conda-forge/pypdf). ## Development Version In case you want to use the current version under development: ```bash pip install git+https://github.com/py-pdf/pypdf.git ``` ================================================ FILE: docs/user/merging-pdfs.md ================================================ # Merging PDF files ## Basic Example ```{testsetup} pypdf_test_setup("user/merging-pdfs", { "example.pdf": "../resources/example.pdf", "hello-world.pdf": "../resources/hello-world.pdf", "jpeg.pdf": "../resources/jpeg.pdf", "GeoBase_NHNC1_Data_Model_UML_EN.pdf": "../resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf", "Seige_of_Vicksburg_Sample_OCR.pdf": "../resources/Seige_of_Vicksburg_Sample_OCR.pdf", "two-different-pages.pdf": "../resources/two-different-pages.pdf", }) ``` ```{testcode} from pypdf import PdfWriter merger = PdfWriter() for pdf in ["example.pdf", "hello-world.pdf", "jpeg.pdf"]: merger.append(pdf) merger.write("out-basic.pdf") ``` For more details, see an excellent answer on [StackOverflow](https://stackoverflow.com/questions/3444645/merge-pdf-files) by Paul Rooney. ````{note} Dealing with large PDF files might reach the recursion limit of the current Python interpreter. In these cases, increasing the limit might help: ```{testcode} import sys # Example: Increase the current limit by factor 5. 
sys.setrecursionlimit(sys.getrecursionlimit() * 5) ``` ```` ## Showing more merging options ```{testcode} from pypdf import PdfWriter merger = PdfWriter() with ( open("Seige_of_Vicksburg_Sample_OCR.pdf", "rb") as input1, open("two-different-pages.pdf", "rb") as input2, open("example.pdf", "rb") as input3 ): # Add the first 3 pages of input1 document to output merger.append(fileobj=input1, pages=(0, 3)) # Insert the first page of input2 into the output beginning after the second page merger.merge(position=2, fileobj=input2, pages=(0, 1)) # Append entire input3 document to the end of the output document merger.append(input3) # Write to an output PDF document merger.write("out-advanced.pdf") ``` ## append `append` has been slightly extended in `PdfWriter`. See {func}`~pypdf.PdfWriter.append` for more details. ### Examples ```{testcode} from pypdf import PdfWriter, PdfReader writer = PdfWriter() source_file_name = "GeoBase_NHNC1_Data_Model_UML_EN.pdf" # Append the first 10 pages from pdf file writer.append(source_file_name, (0, 10)) reader = PdfReader(source_file_name) # Append the first and 10th page from reader and create an outline writer.append(reader, "page 1 and 10", [0, 9]) ``` During merging, the relevant named destination will also be imported. If you want to insert pages in the middle of the destination, use `merge` (which provides an insertion position). You can insert the same page multiple times, if necessary, even using a list-based syntax: ```{testcode} # Insert pages 2 and 3, with page 1 before, between, and after writer.append(reader, [0, 1, 0, 2, 0]) ``` ## add_page / insert_page It is recommended to use `append` or `merge` instead. ## Merging forms When merging forms, some form fields may have the same names, preventing access to some data. A grouping field should be added before adding the source PDF to prevent that. The original fields will be identified by adding the group name. 
For example, after calling `reader.add_form_topname("form1")`, the field previously named `field1` is now identified as `form1.field1` when calling `reader.get_form_text_fields(True)` or `reader.get_fields()`. After that, you can append the input PDF completely or partially using `writer.append` or `writer.merge`. If you insert a set of pages, only those fields will be listed. ## reset_translation During cloning, if an object has been already cloned, it will not be cloned again, and a pointer to this previously cloned object is returned instead. Because of that, if you add/merge a page that has already been added, the same object will be added the second time. If you modify any of these two pages later, both pages can be modified independently. To reset, call `writer.reset_translation(reader)`. ## Advanced cloning To prevent side effects between pages/objects and all objects linked cloning is done during the merge. This process will be automatically applied if you use `PdfWriter.append/merge/add_page/insert_page`. If you want to clone an object before attaching it "manually", use the `clone` method of any *PdfObject*: ```{testcode} from pypdf.generic import NameObject, NumberObject, StreamObject stream_object = StreamObject() cloned_object = stream_object.clone(writer) ``` If you try to clone an object already belonging to the writer, it will return the same object: ```{testcode} assert cloned_object == stream_object.clone(writer) ``` The same holds true if you try to clone an object twice. It will return the previously cloned object: ```{testcode} assert stream_object.clone(writer) == stream_object.clone(writer) ``` Please note that if you clone an object, you will clone all the objects below as well, including the objects pointed by *IndirectObject*. Due to this, if you clone a page that includes some articles (`"/B"`), not only the first article, but also all the chained articles and the pages where those articles can be read will be copied. 
This means that you may copy lots of objects which will be saved in the output PDF as well. To prevent this, you can provide the list of fields in the dictionaries to be ignored: ```{testcode} new_page = writer.add_page(reader.pages[0], excluded_keys=["/B"]) ``` ### Merging rotated pages If you are working with rotated pages, you might want to call {func}`~pypdf._page.PageObject.transfer_rotation_to_content` on the page before merging to avoid wrongly rotated results: ```{testcode} background = PdfReader("jpeg.pdf").pages[0] for page in writer.pages: if page.rotation != 0: page.transfer_rotation_to_content() page.merge_page(background, over=False) ``` ================================================ FILE: docs/user/metadata.md ================================================ # Metadata PDF files can have two types of metadata: "Regular" and XMP ones. They can both exist at the same time. ## Reading metadata ```{testsetup} pypdf_test_setup("user/metadata", { "example.pdf": "../resources/example.pdf", "commented-xmp.pdf": "../resources/commented-xmp.pdf", }) ``` ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") meta = reader.metadata # All the following could be None! print(meta.title) print(meta.author) print(meta.subject) print(meta.creator) print(meta.producer) print(meta.creation_date) print(meta.modification_date) ``` % Two last rows masked to allow to change example.pdf ```{testoutput} :hide: PDF Example Document None None None Skia/PDF m103 Google Docs Renderer ... ... 
``` ## Writing metadata ```{testcode} from datetime import datetime from pypdf import PdfReader, PdfWriter reader = PdfReader("example.pdf") writer = PdfWriter() # Add all pages to the writer for page in reader.pages: writer.add_page(page) # If you want to add the old metadata, include these two lines if reader.metadata is not None: writer.add_metadata(reader.metadata) # Format the current date and time for the metadata utc_time = "-05'00'" # UTC time optional time = datetime.now().strftime(f"D\072%Y%m%d%H%M%S{utc_time}") # Add the new metadata writer.add_metadata( { "/Author": "Martin", "/Producer": "Libre Writer", "/Title": "Title", "/Subject": "Subject", "/Keywords": "Keywords", "/CreationDate": time, "/ModDate": time, "/Creator": "Creator", "/CustomField": "CustomField", } ) # Save the new PDF to a file writer.write("out-meta-create.pdf") ``` ## Updating metadata ```{testcode} from pypdf import PdfWriter writer = PdfWriter(clone_from="example.pdf") # Change some values writer.add_metadata( { "/Author": "Martin", "/Producer": "Libre Writer", "/Title": "Title", } ) # Clear all data but keep the entry in PDF writer.metadata = {} # Replace all entries with new set of entries writer.metadata = { "/Author": "Martin", "/Producer": "Libre Writer", } # Save the new PDF to a file writer.write("out-meta-update.pdf") ``` ## Removing metadata entry ```{testcode} from pypdf import PdfWriter writer = PdfWriter("example.pdf") # Remove Metadata (/Info entry) writer.metadata = None # Save the new PDF to a file writer.write("out-meta-remove.pdf") ``` ## Reading XMP metadata ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") meta = reader.xmp_metadata if meta: print(meta.dc_title) print(meta.dc_description) print(meta.xmp_create_date) ``` ```{testoutput} :hide: {'x-default': 'PDF Example Document'} {} 2025-10-30 09:29:55 ``` ## Creating XMP metadata You can create XMP metadata easily using the `XmpInformation.create()` method: ```{testcode} from pypdf 
import PdfWriter from pypdf.xmp import XmpInformation # Create a new XMP metadata object xmp = XmpInformation.create() # Set metadata fields xmp.dc_title = {"x-default": "My Document Title"} xmp.dc_creator = ["Author One", "Author Two"] xmp.dc_description = {"x-default": "Document description"} xmp.dc_subject = ["keyword1", "keyword2", "keyword3"] xmp.pdf_producer = "pypdf" # Create a writer and add the metadata writer = PdfWriter() writer.add_blank_page(612, 792) # Add a page writer.xmp_metadata = xmp writer.write("out-xmp-create.pdf") ``` ## Setting XMP metadata fields The `XmpInformation` class provides property-based access for all supported metadata fields: ### Dublin Core fields ```{testcode} from datetime import datetime from pypdf.xmp import XmpInformation xmp = XmpInformation.create() # Single value fields xmp.dc_coverage = "Global coverage" xmp.dc_format = "application/pdf" xmp.dc_identifier = "unique-id-123" xmp.dc_source = "Original Source" # Array fields (bags - unordered) xmp.dc_contributor = ["Contributor One", "Contributor Two"] xmp.dc_language = ["en", "fr", "de"] xmp.dc_publisher = ["Publisher One"] xmp.dc_relation = ["Related Doc 1", "Related Doc 2"] xmp.dc_subject = ["keyword1", "keyword2"] xmp.dc_type = ["Document", "Text"] # Sequence fields (ordered arrays) xmp.dc_creator = ["Primary Author", "Secondary Author"] xmp.dc_date = [datetime.now()] # Language alternative fields xmp.dc_title = {"x-default": "Title", "en": "English Title", "fr": "Titre français"} xmp.dc_description = {"x-default": "Description", "en": "English Description"} xmp.dc_rights = {"x-default": "All rights reserved"} ``` ### XMP fields ```{testcode} from datetime import datetime # Date fields accept both datetime objects and strings xmp.xmp_create_date = datetime.now() xmp.xmp_modify_date = datetime.fromisoformat("2023-12-25T10:30:45Z") xmp.xmp_metadata_date = datetime.now() # Text field xmp.xmp_creator_tool = "pypdf" ``` ### PDF fields ```{testcode} xmp.pdf_keywords = 
"keyword1, keyword2, keyword3" xmp.pdf_pdfversion = "1.4" xmp.pdf_producer = "pypdf" ``` ### XMP Media Management fields ```{testcode} xmp.xmpmm_document_id = "uuid:12345678-1234-1234-1234-123456789abc" xmp.xmpmm_instance_id = "uuid:87654321-4321-4321-4321-cba987654321" ``` ### PDF/A fields ```{testcode} xmp.pdfaid_part = "1" xmp.pdfaid_conformance = "B" ``` ### Clearing metadata fields You can clear any field by assigning `None`: ```{testcode} xmp.dc_title = None xmp.dc_creator = None xmp.pdf_producer = None ``` ### Incrementally updating XMP metadata fields When modifying existing XMP metadata, it is often necessary to add or update individual entries while preserving existing values. The XMP properties return standard Python data structures that can be manipulated directly: ```{testcode} from pypdf.xmp import XmpInformation xmp = XmpInformation.create() # Language alternative fields return dictionaries title = xmp.dc_title or {} title["en"] = "English Title" title["fr"] = "Titre français" xmp.dc_title = title # Bag fields (unordered collections) return lists subjects = xmp.dc_subject or [] subjects.append("new_keyword") xmp.dc_subject = subjects # Sequence fields (ordered collections) return lists creators = xmp.dc_creator or [] creators.append("New Author") xmp.dc_creator = creators ``` This approach provides direct control over the data structures while maintaining the property-based interface. ## Modifying XMP metadata Modifying XMP metadata is a bit more complicated. As an example, we want to add the following PDF/UA identifier section to the XMP metadata: ```xml <rdf:Description rdf:about="" xmlns:pdfuaid="http://www.aiim.org/pdfua/ns/id/"> <pdfuaid:part>1</pdfuaid:part> </rdf:Description> ``` This could be written like this: ```{testcode} from pypdf import PdfWriter writer = PdfWriter(clone_from="commented-xmp.pdf") metadata = writer.xmp_metadata assert metadata # Ensure that it is not `None`.
rdf_root = metadata.rdf_root xmp_meta = rdf_root.parentNode xmp_document = xmp_meta.parentNode # Please note that without a text node, the corresponding elements might # be omitted completely. pdfuaid_description = xmp_document.createElement("rdf:Description") pdfuaid_description.setAttribute("rdf:about", "") pdfuaid_description.setAttribute("xmlns:pdfuaid", "http://www.aiim.org/pdfua/ns/id/") pdfuaid_part = xmp_document.createElement("pdfuaid:part") pdfuaid_part_text = xmp_document.createTextNode("1") pdfuaid_part.appendChild(pdfuaid_part_text) pdfuaid_description.appendChild(pdfuaid_part) rdf_root.appendChild(pdfuaid_description) metadata.stream.set_data(xmp_document.toxml().encode("utf-8")) writer.write("out-xmp-update.pdf") ``` For further details on modifying the structure, please refer to {py:mod}`xml.dom.minidom`. ================================================ FILE: docs/user/pdf-version-support.md ================================================ # PDF Version Support PDF comes in the following versions: * 1993: 1.0 * 1994: 1.1 * 1996: 1.2 * 1999: 1.3 * 2001: 1.4 * 2003: 1.5 * 2004: 1.6 * 2008: 1.7, ISO 32000-1:2008 * 2017: 2.0, ISO 32000-2:2017 The general format didn't change, but new features got added. It can be that pypdf can do the operations you want on PDF 2.0 files without fully supporting all features of PDF 2.0. ## PDF Feature Support by pypdf | Feature | PDF Version | pypdf Support | |--------------------------------|:-----------:|:-------------:| | CMaps | 1.4 | ✅ | | Transparent Graphics | 1.4 | ✅ | | Content Stream Compression | 1.5 | ✅ | | Cross-reference Streams | 1.5 | ✅ | | Object Streams | 1.5 | ✅ | | Optional Content Groups (OCGs) | 1.5 | ❓ | | AES Encryption | 1.6 | ✅ | This table is not complete - if in doubt, consider having a look at the API documentation or inside the issues or try with a corresponding PDF file. In general, we are open to add support for missing features. 
Please open a new issue if it does not exist yet, and keep in mind that we rely on external contributors to support us with the implementation. One commonly requested feature is proper support for reading/handling incremental PDF files; see [issue #3304](https://github.com/py-pdf/pypdf/issues/3304). See [History of PDF](https://en.wikipedia.org/wiki/History_of_PDF) for more features. Some PDF features are not supported by pypdf, but other libraries can be used for them: * [pyHanko](https://pyhanko.readthedocs.io/en/latest/index.html): Cryptographically sign a PDF ([#302](https://github.com/py-pdf/pypdf/issues/302)) * [camelot-py](https://pypi.org/project/camelot-py/): Table Extraction ([#231](https://github.com/py-pdf/pypdf/issues/231)) ================================================ FILE: docs/user/pdfa-compliance.md ================================================ # PDF/A Compliance PDF/A is a specialized, ISO-standardized version of the Portable Document Format (PDF) specifically designed for the long-term preservation and archiving of electronic documents. It ensures that files remain accessible, readable, and true to their original appearance by embedding all necessary fonts, images, and metadata within the document itself. By adhering to strict guidelines and minimizing dependencies on external resources or proprietary software, PDF/A ensures the consistent and reliable reproduction of content, safeguarding it against future technological changes and obsolescence. ## PDF/A Versions * **PDF/A-1**: Based on PDF 1.4, PDF/A-1 is the first version of the standard and is divided into two levels: PDF/A-1a (Level A, ensuring accessibility) and PDF/A-1b (Level B, ensuring visual preservation). * **Level B** (Basic): Ensures visual preservation and basic requirements for archiving. * **Level A** (Accessible): Everything from level B, but includes additional requirements for accessibility, such as tagging, Unicode character mapping, and logical structure.
* **PDF/A-2**: Based on PDF 1.7 (ISO 32000-1), PDF/A-2 adds features and improvements over PDF/A-1, while maintaining compatibility with PDF/A-1b (Level B) documents. * **Level B** (Basic): Like PDF/A-1b, but with support for PDF 1.7 features such as transparency layers. * **Level U** (Unicode): Ensures Unicode mapping without the full accessibility requirements of PDF/A-1a (Level A). * **Level A** (Accessible): Similar to PDF/A-1a. * **PDF/A-3**: Based on PDF 1.7 (ISO 32000-1), PDF/A-3 is similar to PDF/A-2 but allows the embedding of non-PDF/A files as attachments, enabling the archiving of source or supplementary data alongside the PDF/A document. This is interesting for invoices, which can embed XML files. * **PDF/A-4**: Based on PDF 2.0 (ISO 32000-2), PDF/A-4 introduces new features and improvements for better archiving and accessibility. The previous levels are replaced by PDF/A-4f (ensuring visual preservation and allowing attachments) and PDF/A-4e (Engineering, allows 3D content). ## PDF/A-1b In contrast to other PDF documents, PDF/A-1b documents must fulfill the following requirements: * **MarkInfo Object**: The MarkInfo object is a dictionary object within a PDF/A file that provides information about the logical structure and tagging of the document. The MarkInfo object indicates whether the document is tagged, contains optional content, or has a structure tree that describes the logical arrangement of content such as headings, paragraphs, lists, and tables. By including the MarkInfo object, PDF/A ensures that electronic documents are accessible to users with disabilities, such as those using screen readers or other assistive technologies. * **Embedded fonts**: All fonts used in the document must be embedded to ensure consistent text rendering across different devices and systems.
* **Color Spaces**: DeviceRGB is a device-dependent color space that relies on the specific characteristics of the output device, which can lead to inconsistent color rendering across various devices. To achieve accurate and consistent color representation, PDF/A requires the use of device-independent color spaces, such as ICC-based color profiles. * **XMP (Extensible Metadata Platform) metadata**: XMP metadata provides a standardized and extensible way to store essential information about a document and its properties. XMP metadata is an XML-based format embedded directly within a PDF/A file. It contains various types of information, such as document title, author, creation and modification dates, keywords, and copyright information, as well as PDF/A-specific details like conformance level and OutputIntent. ## Validation [VeraPDF](https://docs.verapdf.org/install/) is the go-to PDF/A validator. There are several online validators that allow you to simply upload the document: * [pdfen.com](https://www.pdfen.com/pdf-a-validator) * [avepdf.com](https://avepdf.com/pdfa-validation) : Gives an error report * [pdfa.org](https://pdfa.org/pdfa-online-verification-service/) * [visual-paradigm.com](https://online.visual-paradigm.com/de/online-pdf-editor/pdfa-validator/) - can convert the PDF to a PDF/A * [pdf2go.com](https://www.pdf2go.com/validate-pdfa) * [slub-dresden.de](https://www.slub-dresden.de/veroeffentlichen/dissertationen-habilitationen/elektronische-veroeffentlichung/slub-pdfa-validator) links to relevant parts in the specification. ## pypdf and PDF/A At the moment, pypdf does not make any guarantees regarding PDF/A. [Support is very welcome](https://github.com/py-pdf/pypdf/labels/is-pdf%2Fa-compliance). ================================================ FILE: docs/user/post-processing-in-text-extraction.md ================================================ # Post-Processing of Text Extraction Post-processing can recognizably improve the results of text extraction. 
It is, however, outside the scope of pypdf itself. Hence, the library will not give any direct support for it. It is a natural language processing (NLP) task. This page lists a few examples of what can be done as well as a community recipe that can be used as a general purpose post-processing step. If you know more about the specific domain of your documents, e.g., the language, it is likely that you can find custom solutions that work better in your context. ## Ligature Replacement ```{testcode} def replace_ligatures(text: str) -> str: ligatures = { "ﬀ": "ff", "ﬁ": "fi", "ﬂ": "fl", "ﬃ": "ffi", "ﬄ": "ffl", "ﬅ": "ft", "ﬆ": "st", # "Ꜳ": "AA", # "Æ": "AE", "ꜳ": "aa", } for search, replace in ligatures.items(): text = text.replace(search, replace) return text ``` ## Dehyphenation Hyphens are used to break words up so that the appearance of the page is nicer. ```{testcode} from typing import List def remove_hyphens(text: str) -> str: """ This fails for: * Natural dashes: well-known, self-replication, use-cases, non-semantic, Post-processing, Window-wise, viewpoint-dependent * Trailing math operands: 2 - 4 * Names: Lopez-Ferreras, VGG-19, CIFAR-100 """ lines = [line.rstrip() for line in text.split("\n")] # Find dashes line_numbers = [] for line_no, line in enumerate(lines[:-1]): if line.endswith("-"): line_numbers.append(line_no) # Replace for line_no in line_numbers: lines = dehyphenate(lines, line_no) return "\n".join(lines) def dehyphenate(lines: List[str], line_no: int) -> List[str]: next_line = lines[line_no + 1] word_suffix = next_line.split(" ")[0] lines[line_no] = lines[line_no][:-1] + word_suffix lines[line_no + 1] = lines[line_no + 1][len(word_suffix) :] return lines ``` ## Header/Footer Removal The following header/footer removal has several drawbacks: * False-positives, e.g., for the first page when there is a date like 2024. * False-negatives in many cases: * Dynamic part, e.g., page label is in the header. * Even/odd pages have different headers.
* Some pages, e.g., the first one or chapter pages, do not have a header. ```{testcode} def remove_footer(extracted_texts: list[str], page_labels: list[str]): def remove_page_labels(extracted_texts, page_labels): processed = [] for text, label in zip(extracted_texts, page_labels): text_left = text.lstrip() if text_left.startswith(label): text = text_left[len(label) :] text_right = text.rstrip() if text_right.endswith(label): text = text_right[: -len(label)] processed.append(text) return processed extracted_texts = remove_page_labels(extracted_texts, page_labels) return extracted_texts ``` ## Other ideas * Whitespace in units: There should be a space between a number and its unit ([source](https://tex.stackexchange.com/questions/20962/should-i-put-a-space-between-a-number-and-its-unit)). That means: 42 ms, 42 GHz, 42 GB. * Percent: English style guides prescribe writing the percent sign following the number without any space between (e.g., 50%). * Whitespace before dots: Should typically be removed. * Whitespace after dots: Should typically be added.
================================================ FILE: docs/user/reading-pdf-annotations.md ================================================ # Reading PDF Annotations PDF 2.0 defines the following annotation types: * Text * Link * FreeText * Line * Square * Circle * Polygon * PolyLine * Highlight * Underline * Squiggly * StrikeOut * Caret * Stamp * Ink * Popup * FileAttachment * Sound * Movie * Screen * Widget * PrinterMark * TrapNet * Watermark * 3D * Redact * Projection * RichMedia In general, annotations can be read like this: ```{testsetup} pypdf_test_setup("user/reading-pdf-annotations", { "example.pdf": "../resources/example.pdf", }) ``` ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") for page in reader.pages: if "/Annots" in page: for annotation in page["/Annots"]: obj = annotation.get_object() print({"subtype": obj["/Subtype"], "location": obj["/Rect"]}) ``` ```{testoutput} :hide: {'subtype': '/Highlight', 'location': [376.771, 406.213, 413.78, 422.506]} {'subtype': '/Popup', 'location': [531.053, 327.965, 715.198, 422.219]} {'subtype': '/FileAttachment', 'location': [245.819, 223.288, 252.819, 240.288]} {'subtype': '/Stamp', 'location': [68.7536, 187.259, 151.442, 254.124]} {'subtype': '/Popup', 'location': [612, 631.925, 816, 745.925]} {'subtype': '/Text', 'location': [176.9, 216.719, 200.9, 240.719]} {'subtype': '/Popup', 'location': [596, 709.445, 780, 801.445]} ``` Examples of reading three of the most common annotations: ## Text ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") for page in reader.pages: if "/Annots" in page: for annotation in page["/Annots"]: subtype = annotation.get_object()["/Subtype"] if subtype == "/Text": print(annotation.get_object()["/Contents"]) ``` ```{testoutput} :hide: Text comment ``` ## Highlights ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") for page in reader.pages: if "/Annots" in page: for annotation in page["/Annots"]: subtype = 
annotation.get_object()["/Subtype"] if subtype == "/Highlight": coords = annotation.get_object()["/QuadPoints"] x1, y1, x2, y2, x3, y3, x4, y4 = coords ``` ## Attachments ```{testcode} from pypdf import PdfReader reader = PdfReader("example.pdf") attachments = {} for page in reader.pages: if "/Annots" in page: for annotation in page["/Annots"]: subtype = annotation.get_object()["/Subtype"] if subtype == "/FileAttachment": fileobj = annotation.get_object()["/FS"] attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].get_data() ``` ================================================ FILE: docs/user/robustness.md ================================================ # Robustness and strict=False PDF is [specified in various versions](https://pdfa.org/resource/pdf-specification-archive/). The specification of PDF 2.0 has 1003 pages. This length makes it hard to get everything right. As a consequence, a lot of PDF files are not strictly following the specification. If a PDF file does not follow the specification, it is not always possible to be certain what the intended effect would be. Think of the following broken Python code as an example: ```{testcode} # Broken function (foo, bar): # Potentially intended: def function(foo, bar): ... # Also possible: function = (foo, bar) ``` ```{testoutput} :hide: Traceback (most recent call last): ... SyntaxError: invalid syntax ``` Writing a parser, you can go two paths: Either you try to be forgiving and try to figure out what the user intended, or you are strict and just tell the user that they should fix their stuff. pypdf gives you the option to be strict or not. pypdf has two core objects: * {class}`~pypdf.PdfReader` * {class}`~pypdf.PdfWriter` PdfReader and PdfWriter both have a `strict` parameter. Choosing `strict=True` means that pypdf will raise an exception if a PDF does not follow the specification. Choosing `strict=False` means that pypdf will try to be forgiving and do something reasonable, but it will log a warning message. 
It is a best-effort approach. ================================================ FILE: docs/user/security.md ================================================ # Security We strive to provide a library with secure defaults. ## Configuration ### Filters *pypdf* currently employs output size limits for some filters which are known to possibly have large compression ratios. The usual limit is at 75 MB of uncompressed data during decompression. If this is too low for your use case, and you are aware of the possible side effects, you can modify the following constants which define the desired maximal output size in bytes: * `pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH` for the *FlateDecode* filter (zlib compression) * `pypdf.filters.LZW_MAX_OUTPUT_LENGTH` for the *LZWDecode* filter (LZW compression) * `pypdf.filters.RUN_LENGTH_MAX_OUTPUT_LENGTH` for the *RunLengthDecode* filter (run-length compression) For JBIG2 images, there is a similar parameter to limit the memory usage during decoding: `pypdf.filters.JBIG2_MAX_OUTPUT_LENGTH` It defaults to 75 MB as well. For all streams, the maximum allowed value for the `/Length` field is limited to `pypdf.filters.MAX_DECLARED_STREAM_LENGTH`, which defaults to 75 MB as well. For all array-based streams, the maximum allowed output length is limited to `pypdf.filters.MAX_ARRAY_BASED_STREAM_OUTPUT_LENGTH`, which defaults to 75 MB as well. For the *FlateDecode* filter, the number of bytes to attempt recovery with can be set by `pypdf.filters.ZLIB_MAX_RECOVERY_INPUT_LENGTH`. It defaults to 5 MB due to the much more complex recovery approach. For the *JBIG2Decode* filter, calling the external *jbig2dec* tool can be disabled by setting `pypdf.filters.JBIG2DEC_BINARY = None`. ### Reading *pypdf* currently employs the following reading limits on *PdfReader* instances: * `root_object_recovery_limit` limits the number of objects to read before stopping with Root object recovery in non-strict mode. It defaults to 10 000. 
Setting it to `None` will fully disable this limit. If you want to employ custom limits for the *PdfWriter* as well, the currently preferred way is to initialize it from the reader, i.e., something like `PdfWriter(clone_from=PdfReader("file.pdf", root_object_recovery_limit=42))`. ## Reporting possible vulnerabilities Please refer to our [security policy](https://github.com/py-pdf/pypdf/security/policy). ## Invalid reports ### Exceptions Most exceptions raised by our code are considered bugs or robustness issues and can be reported publicly. We consider it the task of the library user to catch exceptions which could cause their service to crash, although we try to only raise a known set of exception types. ### Cryptographic functions We receive reports about possibly insecure cryptography from time to time. This includes the following aspects: * Using the ARC4 cipher * Using the AES cipher in ECB mode * Using MD5 for hashing These are requirements of the PDF standard, which we need to follow to achieve the greatest compatibility. Although some of them might be deprecated in PDF 2.0, the PDF 2.0 adoption rate is very low and legacy documents need to be supported. ### XML parsing We use `xml.dom.minidom` for parsing XMP information. Given recent Python versions built against recent Expat versions, the usual attacks (exponential entity expansion and external entity expansion) should not be possible. We have corresponding tests in place to ensure this for the platforms our tests run against. For some details, see [the official documentation](https://docs.python.org/3/library/xml.html#xml-security) and the [README for defusedxml](https://github.com/tiran/defusedxml/blob/main/README.md#python-xml-libraries). Please note that automated scanners tend to still flag any direct imports of XML modules from the Python standard library as unsafe. There have been discussions about this being outdated already, but they are still being flagged.
================================================ FILE: docs/user/streaming-data.md ================================================ # Streaming Data with pypdf In some cases, you might want to avoid saving things explicitly as a file to disk, e.g. when you want to store the PDF in a database or AWS S3. pypdf supports streaming data to a file-like object: ```{testsetup} pypdf_test_setup("user/streaming-data", { "example.pdf": "../resources/example.pdf", }) ``` ```{testcode} from io import BytesIO from pypdf import PdfReader, PdfWriter # Prepare example with open("example.pdf", "rb") as fh: bytes_stream = BytesIO(fh.read()) # Read from bytes_stream reader = PdfReader(bytes_stream) # Write to bytes_stream writer = PdfWriter() with BytesIO() as bytes_stream: writer.write(bytes_stream) ``` ## Writing a PDF directly to AWS S3 Suppose you want to manipulate a PDF and write it directly to AWS S3 without having to write the document to a file first. We have the original PDF in `raw_bytes_data` as `bytes` and want to set `my-secret-password`: % We prefer not to execute doc examples which require access to cloud providers ```{testcode} :skipif: True from io import BytesIO import boto3 from pypdf import PdfReader, PdfWriter reader = PdfReader(BytesIO(raw_bytes_data)) writer = PdfWriter() # Add all pages to the writer for page in reader.pages: writer.add_page(page) # Add a password to the new PDF writer.encrypt("my-secret-password") # Save the new PDF to a file with BytesIO() as bytes_stream: writer.write(bytes_stream) bytes_stream.seek(0) s3 = boto3.client("s3") s3.write_get_object_response( Body=bytes_stream, RequestRoute=request_route, RequestToken=request_token ) ``` ## Reading PDFs directly from cloud services One option is to first download the file and then pass the local file path to `PdfReader`. Another option is to get a byte stream. 
For AWS S3 it works like this: % We prefer not to execute doc examples which require access to cloud providers ```{testcode} :skipif: True from io import BytesIO import boto3 from pypdf import PdfReader s3 = boto3.client("s3") obj = s3.get_object(Bucket="my-bucket", Key="my/doc.pdf") reader = PdfReader(BytesIO(obj["Body"].read())) ``` To use with Google Cloud Storage: % We prefer not to execute doc examples which require access to cloud providers ```{testcode} :skipif: True from io import BytesIO from google.cloud import storage from pypdf import PdfReader # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] must be set storage_client = storage.Client() blob = storage_client.bucket("my-bucket").blob("mydoc.pdf") file_stream = BytesIO() blob.download_to_file(file_stream) reader = PdfReader(file_stream) ``` ================================================ FILE: docs/user/suppress-warnings.md ================================================ # Exceptions, Warnings, and Log messages pypdf makes use of three mechanisms to show if something went wrong: * **Exceptions** are error cases that pypdf users should explicitly handle. In the `strict=True` mode, most log messages with the warning level will become exceptions. This can be useful in applications where you can require a user to fix the broken PDF. * **Warnings** are avoidable issues, such as using deprecated classes / functions / parameters. Another example is missing capabilities of pypdf. In those cases, pypdf users should adjust their code. Warnings are issued by the `warnings` module - those are different from the log-level "warning." * **Log messages** are informative messages that can be used for post-mortem analysis. Most of the time, users can ignore them. They come in different *levels*, such as info / warning / error indicating the severity. Examples are non-standard compliant PDF files which pypdf can deal with or a missing implementation that leads to a part of the text not being extracted.
## Exceptions Exceptions need to be caught if you want to handle them. For example, you might want to read the text from a PDF as a part of a search function. Most PDF files do not follow the specification. In this case, pypdf needs to guess which kinds of mistakes were potentially made when the PDF file was created. See [the robustness page](robustness.md) for the related issues. As a user, you likely do not care about it. If it is readable in any way, you want the text. You might use pdfminer.six as a fallback and do this: % We prefer not to execute doc examples for third-party package "pdfminer.six" used in one code snippet only ```{testcode} :skipif: True from pypdf import PdfReader from pdfminer.high_level import extract_text as fallback_text_extraction text = "" try: reader = PdfReader("example.pdf") for page in reader.pages: text += page.extract_text() except Exception as exc: text = fallback_text_extraction("example.pdf") ``` You could also catch [`pypdf.errors.PyPdfError`](https://github.com/py-pdf/pypdf/blob/main/pypdf/errors.py) if you prefer something more specific. ## Warnings The [`warnings` module](https://docs.python.org/3/library/warnings.html) allows you to ignore warnings: ```{testcode} import warnings warnings.filterwarnings("ignore") ``` In many cases, you actually want to start Python with the `-W` flag so that you see all warnings. This is especially true for Continuous Integration (CI). ## Log messages Log messages can be noisy in some cases.
pypdf hopefully has a reasonable level of log messages, but you can restrict which types of messages you see: ```{testcode} import logging logger = logging.getLogger("pypdf") logger.setLevel(logging.ERROR) ``` The [`logging` module](https://docs.python.org/3/library/logging.html#logging-levels) defines six log levels: * CRITICAL * ERROR * WARNING * INFO * DEBUG * NOTSET ================================================ FILE: docs/user/viewer-preferences.md ================================================ # Adding Viewer Preferences It is possible to set viewer preferences of a PDF file. They are described in §12.2 of the [PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf). Note that the `/ViewerPreferences` dictionary does not exist by default. If it is not already present, it must be created by calling the {func}`~pypdf.PdfWriter.create_viewer_preferences` method. If viewer preferences exist in a PDF file being read with {class}`~pypdf.PdfReader`, you can access them as properties of {attr}`~pypdf.PdfReader.viewer_preferences`. Otherwise, the {attr}`~pypdf.PdfReader.viewer_preferences` property will be set to `None`.
## Example ```{testsetup} pypdf_test_setup("user/viewer-preferences") ``` ```{testcode} from pypdf import PdfWriter from pypdf.generic import ArrayObject, NumberObject writer = PdfWriter() writer.create_viewer_preferences() # /HideToolbar writer.viewer_preferences.hide_toolbar = True # /HideMenubar writer.viewer_preferences.hide_menubar = True # /HideWindowUI writer.viewer_preferences.hide_windowui = True # /FitWindow writer.viewer_preferences.fit_window = True # /CenterWindow writer.viewer_preferences.center_window = True # /DisplayDocTitle writer.viewer_preferences.display_doctitle = True # /NonFullScreenPageMode writer.viewer_preferences.non_fullscreen_pagemode = "/UseNone" # default writer.viewer_preferences.non_fullscreen_pagemode = "/UseOutlines" writer.viewer_preferences.non_fullscreen_pagemode = "/UseThumbs" writer.viewer_preferences.non_fullscreen_pagemode = "/UseOC" # /Direction writer.viewer_preferences.direction = "/L2R" # default writer.viewer_preferences.direction = "/R2L" # /ViewArea writer.viewer_preferences.view_area = "/CropBox" # /ViewClip writer.viewer_preferences.view_clip = "/CropBox" # /PrintArea writer.viewer_preferences.print_area = "/CropBox" # /PrintClip writer.viewer_preferences.print_clip = "/CropBox" # /PrintScaling writer.viewer_preferences.print_scaling = "/None" writer.viewer_preferences.print_scaling = "/AppDefault" # default according to PDF spec # /Duplex writer.viewer_preferences.duplex = "/Simplex" writer.viewer_preferences.duplex = "/DuplexFlipShortEdge" writer.viewer_preferences.duplex = "/DuplexFlipLongEdge" # /PickTrayByPDFSize writer.viewer_preferences.pick_tray_by_pdfsize = True # /PrintPageRange writer.viewer_preferences.print_pagerange = ArrayObject( [NumberObject("1"), NumberObject("10"), NumberObject("20"), NumberObject("30")] ) # /NumCopies writer.viewer_preferences.num_copies = 2 for i in range(40): writer.add_blank_page(10, 10) writer.write("out.pdf") ``` The names beginning with a slash character are part of the 
"""Internal tool to update the CHANGELOG."""

import json
import subprocess
import urllib.request
from dataclasses import dataclass
from datetime import datetime, timezone

GH_ORG = "py-pdf"
GH_PROJECT = "pypdf"

VERSION_FILE_PATH = "pypdf/_version.py"
CHANGELOG_FILE_PATH = "CHANGELOG.md"


@dataclass(frozen=True)
class Change:
    """Capture the data of a git commit."""

    commit_hash: str
    prefix: str
    message: str
    author: str
    author_login: str


def main(changelog_path: str) -> None:
    """
    Create a changelog.

    Args:
        changelog_path: The location of the CHANGELOG file

    """
    changelog = get_changelog(changelog_path)
    git_tag = get_most_recent_git_tag()
    changes, changes_with_author = get_formatted_changes(git_tag)
    if changes == "":
        print("No changes")
        return

    new_version = version_bump(git_tag)
    new_version = get_version_interactive(new_version, changes)
    adjust_version_py(new_version)

    today = datetime.now(tz=timezone.utc)
    header = f"## Version {new_version}, {today:%Y-%m-%d}\n"
    url = f"https://github.com/{GH_ORG}/{GH_PROJECT}/compare/{git_tag}...{new_version}"
    trailer = f"\n[Full Changelog]({url})\n\n"
    new_entry = header + changes + trailer
    print(new_entry)
    write_commit_msg_file(new_version, changes_with_author + trailer)
    # write_release_msg_file(new_version, changes_with_author + trailer, today)

    # Make the script idempotent by checking if the new entry is already in the changelog
    if new_entry in changelog:
        print("Changelog is already up-to-date!")
        return

    new_changelog = "# CHANGELOG\n\n" + new_entry + strip_header(changelog)
    write_changelog(new_changelog, changelog_path)
    print_instructions(new_version)


def print_instructions(new_version: str) -> None:
    """Print release instructions."""
    print("=" * 80)
    print(f"☑ {VERSION_FILE_PATH} was adjusted to '{new_version}'")
    print(f"☑ {CHANGELOG_FILE_PATH} was adjusted")
    print()
    print("Now run:")
    print(" git commit -eF RELEASE_COMMIT_MSG.md")
    print(" git push")


def adjust_version_py(version: str) -> None:
    """Adjust the __version__ string."""
    # Explicit encoding keeps the file identical across platforms, matching
    # how the changelog is read and written.
    with open(VERSION_FILE_PATH, "w", encoding="utf-8") as fp:
        fp.write(f'__version__ = "{version}"\n')


def get_version_interactive(new_version: str, changes: str) -> str:
    """Get the new __version__ interactively."""
    from rich.prompt import Prompt  # noqa: PLC0415

    print("The changes are:")
    print(changes)
    orig = new_version
    new_version = Prompt.ask("New semantic version", default=orig)
    while not is_semantic_version(new_version):
        new_version = Prompt.ask(
            "That was not a semantic version. Please enter a semantic version",
            default=orig,
        )
    return new_version


def is_semantic_version(version: str) -> bool:
    """
    Check if the given version is a semantic version.

    Only plain ``MAJOR.MINOR.PATCH`` versions made of ASCII digits are
    accepted; signs, surrounding whitespace and underscores are rejected.

    Args:
        version: The version string to check

    Returns:
        True if the string consists of exactly three numeric parts.

    """
    # This doesn't cover the edge-cases like pre-releases
    parts = version.split(".")
    if len(parts) != 3:
        return False
    return all(part.isascii() and part.isdigit() for part in parts)


def write_commit_msg_file(new_version: str, commit_changes: str) -> None:
    """
    Write a file that can be used as a commit message.

    Like this:

        git commit -eF RELEASE_COMMIT_MSG.md && git push
    """
    # Commit subjects may contain non-ASCII characters, so do not rely on
    # the platform default encoding.
    with open("RELEASE_COMMIT_MSG.md", "w", encoding="utf-8") as fp:
        fp.write(f"REL: {new_version}\n\n")
        fp.write("## What's new\n")
        fp.write(commit_changes)


def write_release_msg_file(
    new_version: str, commit_changes: str, today: datetime
) -> None:
    """
    Write a file that can be used as a git tag message.

    Like this:

        git tag -eF RELEASE_TAG_MSG.md && git push
    """
    with open("RELEASE_TAG_MSG.md", "w", encoding="utf-8") as fp:
        fp.write(f"Version {new_version}, {today:%Y-%m-%d}\n\n")
        fp.write("## What's new\n")
        fp.write(commit_changes)


def strip_header(md: str) -> str:
    """Remove the 'CHANGELOG' header."""
    return md.removeprefix("# CHANGELOG").lstrip()


def version_bump(git_tag: str) -> str:
    """
    Increase the patch version of the git tag by one.

    Args:
        git_tag: Old version tag

    Returns:
        The new version where the patch version is bumped.

    """
    # just assume a patch version change
    major, minor, patch = git_tag.split(".")
    return f"{major}.{minor}.{int(patch) + 1}"


def get_changelog(changelog_path: str) -> str:
    """
    Read the changelog.

    Args:
        changelog_path: Path to the CHANGELOG file

    Returns:
        Data of the CHANGELOG

    """
    with open(changelog_path, encoding="utf-8") as fh:
        return fh.read()


def write_changelog(new_changelog: str, changelog_path: str) -> None:
    """
    Write the changelog.

    Args:
        new_changelog: Contents of the new CHANGELOG
        changelog_path: Path where the CHANGELOG file is

    """
    with open(changelog_path, "w", encoding="utf-8") as fh:
        fh.write(new_changelog)


def get_formatted_changes(git_tag: str) -> tuple[str, str]:
    """
    Format the changes done since the last tag.

    Args:
        git_tag: the reference tag

    Returns:
        Changes done since git_tag, once without and once with the
        author handle appended to every entry.

    """
    commits = get_git_commits_since_tag(git_tag)

    # Group by prefix
    grouped: dict[str, list[dict[str, str]]] = {}
    for commit in commits:
        grouped.setdefault(commit.prefix, []).append(
            {"msg": commit.message, "author": commit.author_login}
        )

    # Order prefixes
    order = [
        "SEC",
        "DEP",
        "ENH",
        "PI",
        "BUG",
        "ROB",
        "DOC",
        "DEV",
        "CI",
        "MAINT",
        "TST",
        "STY",
    ]

    abbrev2long = {
        "SEC": "Security",
        "DEP": "Deprecations",
        "ENH": "New Features",
        "BUG": "Bug Fixes",
        "ROB": "Robustness",
        "DOC": "Documentation",
        "DEV": "Developer Experience",
        "CI": "Continuous Integration",
        "MAINT": "Maintenance",
        "TST": "Testing",
        "STY": "Code Style",
        "PI": "Performance Improvements",
    }

    # Create output
    output = ""
    output_with_user = ""
    for prefix in order:
        if prefix not in grouped:
            continue
        section_header = f"\n### {abbrev2long[prefix]} ({prefix})\n"
        output += section_header
        output_with_user += section_header
        for entry in grouped[prefix]:
            output += f"- {entry['msg']}\n"
            output_with_user += f"- {entry['msg']} by @{entry['author']}\n"
        # Whatever remains afterwards has an unknown prefix.
        del grouped[prefix]

    if grouped:
        output += "\n### Other\n"
        output_with_user += "\n### Other\n"
        for prefix, entries in grouped.items():
            for entry in entries:
                output += f"- {prefix}: {entry['msg']}\n"
                output_with_user += (
                    f"- {prefix}: {entry['msg']} by @{entry['author']}\n"
                )

    return output, output_with_user


def get_most_recent_git_tag() -> str:
    """
    Get the git tag most recently created.

    Returns:
        Most recently created git tag.

    """
    return subprocess.check_output(
        ["git", "describe", "--tag", "--abbrev=0"], stderr=subprocess.STDOUT, text=True
    ).strip()


def get_author_mapping(line_count: int) -> dict[str, str]:
    """
    Get the authors for each commit.

    Args:
        line_count: Number of lines from Git log output. Used for determining how
            many commits to fetch.

    Returns:
        A mapping of long commit hashes to author login handles.

    """
    # Without commits there is nothing to fetch. This guard also avoids
    # ``range()`` being called with a step of zero below.
    if line_count <= 0:
        return {}

    per_page = min(line_count, 100)
    mapping: dict[str, str] = {}
    for page, _ in enumerate(range(0, line_count, per_page), start=1):
        with urllib.request.urlopen(
            f"https://api.github.com/repos/{GH_ORG}/{GH_PROJECT}/commits?per_page={per_page}&page={page}"
        ) as response:
            commits = json.loads(response.read())
        for commit in commits:
            mapping[commit["sha"]] = commit["author"]["login"]
    return mapping


def get_git_commits_since_tag(git_tag: str) -> list[Change]:
    """
    Get all commits since the last tag.

    Args:
        git_tag: Reference tag from which the changes to the current commit are
            fetched.

    Returns:
        List of all changes since git_tag.

    """
    commits = (
        subprocess.check_output(
            [
                "git",
                "--no-pager",
                "log",
                f"{git_tag}..HEAD",
                '--pretty=format:"%H:::%s:::%aN"',
            ],
            stderr=subprocess.STDOUT,
        )
        .decode("UTF-8")
        .strip()
    )
    lines = commits.splitlines()
    authors = get_author_mapping(len(lines))
    return [parse_commit_line(line, authors) for line in lines if line != ""]


def parse_commit_line(line: str, authors: dict[str, str]) -> Change:
    """
    Parse the first line of a git commit message.

    Args:
        line: The first line of a git commit message.
        authors: Mapping of long commit hashes to author login handles.

    Returns:
        The parsed Change object

    Raises:
        ValueError: The commit line is not well-structured

    """
    parts = line.strip().strip('"\\').split(":::")
    if len(parts) != 3:
        raise ValueError(f"Invalid commit line: '{line}'")
    commit_hash, rest, author = parts
    # Test for the exact separator that is split on. A bare ":" inside the
    # subject (for example "Fix a:b") must not be treated as a prefix marker.
    if ": " in rest:
        prefix, message = rest.split(": ", 1)
    else:
        prefix = ""
        message = rest

    # Standardize
    message = message.strip()
    commit_hash = commit_hash.strip()

    author_login = authors[commit_hash]

    prefix = prefix.strip()
    if prefix == "DOCS":
        prefix = "DOC"

    return Change(
        commit_hash=commit_hash,
        prefix=prefix,
        message=message,
        author=author,
        author_login=author_login,
    )


if __name__ == "__main__":
    main(CHANGELOG_FILE_PATH)
# Python codec names for the predefined CMap names handled here.
_predefined_cmap: dict[str, str] = {
    "/Identity-H": "utf-16-be",
    "/Identity-V": "utf-16-be",
    "/GB-EUC-H": "gbk",
    "/GB-EUC-V": "gbk",
    "/GBpc-EUC-H": "gb2312",
    "/GBpc-EUC-V": "gb2312",
    "/GBK-EUC-H": "gbk",
    "/GBK-EUC-V": "gbk",
    "/GBK2K-H": "gb18030",
    "/GBK2K-V": "gb18030",
    "/ETen-B5-H": "cp950",
    "/ETen-B5-V": "cp950",
    "/ETenms-B5-H": "cp950",
    "/ETenms-B5-V": "cp950",
    "/UniCNS-UTF16-H": "utf-16-be",
    "/UniCNS-UTF16-V": "utf-16-be",
    "/UniGB-UTF16-H": "gb18030",
    "/UniGB-UTF16-V": "gb18030",
    # UCS2 in code
}


def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Build the encoding and the /ToUnicode mapping of a font dictionary.

    Args:
        ft: The font dictionary.

    Returns:
        A tuple ``(encoding, map_dict)``: the encoding is either a codec
        name or a mapping from character code to string, and ``map_dict``
        is the translation table parsed from the CMap.

    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
    #   if cmap not empty encoding should be discarded
    #   (here transformed into identity for those characters)
    # If encoding is a string, it is expected to be an identity translation.
    if isinstance(encoding, dict):
        for x in int_entry:
            if x <= 255:
                encoding[x] = chr(x)

    return encoding, map_dict


def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Determine the character encoding described by the font's /Encoding entry.

    Args:
        ft: The font dictionary.

    Returns:
        A codec name, or a mapping from character code to string.

    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(
                "Advanced encoding %(encoding)s not implemented yet",
                source=__name__,
                encoding=enc,
            )
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        try:
            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
        except Exception:
            # Best effort: report the base encoding that could not be
            # resolved, not the still-empty placeholder, then fall back.
            logger_error(
                "Advanced encoding %(encoding)s not implemented yet",
                source=__name__,
                encoding=enc["/BaseEncoding"],
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # Integers set the next character code; every name that follows is
        # assigned to consecutive codes from there.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    if x < len(encoding):
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    encoding[x] = o  # type: ignore
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding


def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Parse the /ToUnicode CMap of a font dictionary.

    Args:
        ft: The font dictionary.

    Returns:
        A tuple ``(map_dict, int_entry)``: the translation table and the
        list of mapped character codes as integers.

    """
    # will store all translation code
    # and map_dict[-1] we will have the number of bytes to convert
    map_dict: dict[Any, Any] = {}

    # will provide the list of cmap keys as int to correct encoding
    int_entry: list[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, map_dict, int_entry)
        return {}, []
    process_rg: bool = False
    process_char: bool = False
    multiline_rg: Union[
        None, tuple[int, int]
    ] = None  # tuple = (current_char, remaining size) ; cf #1285 for example of file
    cm = prepare_cm(ft)
    for line in cm.split(b"\n"):
        process_rg, process_char, multiline_rg = process_cm_line(
            line.strip(b" \t"),
            process_rg,
            process_char,
            multiline_rg,
            map_dict,
            int_entry,
        )

    return map_dict, int_entry


def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Normalize the /ToUnicode CMap data so it can be parsed line by line.

    Args:
        ft: The font dictionary; must contain a ``/ToUnicode`` entry.

    Returns:
        The CMap data with section keywords on their own lines and the
        angle brackets around hexadecimal strings removed.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # the full range 0000-FFFF will be processed
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()
    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
        .replace(b">>", b"\n}\n")  # some solution to find it back
    )
    ll = cm.split(b"<")
    for i, chunk in enumerate(ll):
        j = chunk.find(b">")
        if j >= 0:
            if j == 0:
                # string is empty: stash a placeholder here (see below)
                # see https://github.com/py-pdf/pypdf/issues/1111
                content = b"."
            else:
                content = chunk[:j].replace(b" ", b"")
            ll[i] = content + b" " + chunk[j + 1 :]
    cm = (
        (b" ".join(ll))
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
    return cm


def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Process one prepared CMap line and update the parser state.

    Args:
        line: The line to process.
        process_rg: Whether the parser is inside a ``bfrange`` section.
        process_char: Whether the parser is inside a ``bfchar`` section.
        multiline_rg: State of a ``bfrange`` array continued over several
            lines, or None.
        map_dict: Translation table, updated in place.
        int_entry: List of mapped character codes, updated in place.

    Returns:
        The updated ``(process_rg, process_char, multiline_rg)`` state.

    """
    if line == b"" or line[0] == 37:  # 37 = %
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    if b"beginbfrange" in line:
        process_rg = True
    elif b"endbfrange" in line:
        process_rg = False
    elif b"beginbfchar" in line:
        process_char = True
    elif b"endbfchar" in line:
        process_char = False
    elif process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg


# Usual values should be up to 65_536.
MAPPING_DICTIONARY_SIZE_LIMIT = 100_000


def _check_mapping_size(size: int) -> None:
    """
    Ensure that the number of /ToUnicode entries stays within the limit.

    Args:
        size: The number of entries to validate.

    Raises:
        LimitReachedError: ``size`` exceeds MAPPING_DICTIONARY_SIZE_LIMIT.

    """
    if size > MAPPING_DICTIONARY_SIZE_LIMIT:
        raise LimitReachedError(f"Maximum /ToUnicode size limit reached: {size} > {MAPPING_DICTIONARY_SIZE_LIMIT}.")


def _decode_source_code(code_hex: bytes, byte_width: int) -> str:
    """Decode a hexadecimal source code into the key used in the mapping."""
    return unhexlify(code_hex).decode(
        "charmap" if byte_width == 1 else "utf-16-be",
        "surrogatepass",
    )


def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one line of a ``bfrange`` section.

    Args:
        line: The prepared line, with tokens separated by spaces.
        map_dict: Translation table, updated in place. ``map_dict[-1]``
            holds the number of bytes per source code.
        int_entry: List of mapped character codes, updated in place.
        multiline_rg: ``(next code, last code)`` of an array started on a
            previous line, or None when the line starts a new range.

    Returns:
        None when the range is complete, otherwise ``(next code, last code)``
        to continue the array on the following line.

    """
    lst = [x for x in line.split(b" ") if x]
    closure_found = False
    entry_count = len(int_entry)
    _check_mapping_size(entry_count)
    if multiline_rg is not None:
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        a, b = multiline_rg  # a, b not in the current line
        targets = lst
    else:
        a = int(lst[0], 16)
        b = int(lst[1], 16)
        nbi = max(len(lst[0]), len(lst[1]))
        map_dict[-1] = ceil(nbi / 2)
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if lst[2] != b"[":  # case without list
            c = int(lst[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
            range_size = max(0, b - a + 1)
            _check_mapping_size(entry_count + range_size)  # This can be checked beforehand.
            for offset in range(range_size):
                map_dict[_decode_source_code(fmt % (a + offset), map_dict[-1])] = unhexlify(
                    fmt2 % (c + offset)
                ).decode("utf-16-be", "surrogatepass")
                int_entry.append(a + offset)
            # A plain range is always complete on its own line.
            return None
        targets = lst[3:]

    # Array form: one destination string per consecutive source code.
    for sq in targets:
        if sq == b"]":
            closure_found = True
            break
        entry_count += 1
        _check_mapping_size(entry_count)
        map_dict[_decode_source_code(fmt % a, map_dict[-1])] = unhexlify(sq).decode(
            "utf-16-be", "surrogatepass"
        )
        int_entry.append(a)
        a += 1
    return None if closure_found else (a, b)


def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one line of a ``bfchar`` section.

    Args:
        line: The prepared line holding pairs of source code and
            destination string.
        map_dict: Translation table, updated in place. ``map_dict[-1]``
            is set to the number of bytes of the first source code.
        int_entry: List of mapped character codes, updated in place.

    """
    lst = [x for x in line.split(b" ") if x]
    new_count = len(lst) // 2
    _check_mapping_size(len(int_entry) + new_count)  # This can be checked beforehand.
    map_dict[-1] = len(lst[0]) // 2
    # A trailing token without a partner is ignored.
    for source, target in zip(lst[::2], lst[1::2]):
        map_to = ""
        # placeholder (see above) means empty string
        if target != b".":
            try:
                map_to = unhexlify(target).decode(
                    "charmap" if len(target) < 4 else "utf-16-be", "surrogatepass"
                )  # join is here as some cases where the code was split
            except BinasciiError as exception:
                logger_warning(f"Got invalid hex string: {exception!s} ({target!r})", __name__)
        map_dict[_decode_source_code(source, map_dict[-1])] = map_to
        int_entry.append(int(source, 16))


def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Derive a mapping from the encoding array of an embedded Type 1 font.

    Args:
        ft: The font dictionary.
        map_dict: Translation table, updated in place.
        int_entry: List of mapped character codes, updated in place.

    Returns:
        The ``(map_dict, int_entry)`` pair that was passed in. Both are left
        untouched when no usable encoding section is found.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # No encoding section in the clear part: nothing to derive.
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if li.startswith(b"dup"):
            words = [_w for _w in li.split(b" ") if _w != b""]
            if len(words) < 3:
                # Truncated entry without code and glyph name.
                continue
            if len(words) > 3 and words[3] != b"put":
                continue
            try:
                i = int(words[1])
            except ValueError:  # pragma: no cover
                continue
            try:
                v = adobe_glyphs[words[2].decode()]
            except KeyError:
                if words[2].startswith(b"/uni"):
                    try:
                        v = chr(int(words[2][4:], 16))
                    except ValueError:  # pragma: no cover
                        continue
                else:
                    continue
            map_dict[chr(i)] = v
            int_entry.append(i)
    return map_dict, int_entry
class LzwCodec(Codec):
    """
    Lempel-Ziv-Welch (LZW) adaptive compression codec.

    Codes are packed high-order bit first. The code width starts at
    INITIAL_BITS_PER_CODE bits and grows up to MAX_BITS_PER_CODE bits.
    """

    CLEAR_TABLE_MARKER = 256  # Special code to indicate table reset
    EOD_MARKER = 257  # End-of-data marker
    INITIAL_BITS_PER_CODE = 9  # Initial code bit width
    MAX_BITS_PER_CODE = 12  # Maximum code bit width

    def __init__(self, max_output_length: int = 75_000_000) -> None:
        """
        Create the codec.

        Args:
            max_output_length: Maximum number of bytes ``decode`` may produce
                before it raises LimitReachedError.

        """
        self.max_output_length = max_output_length

    def _initialize_encoding_table(self) -> None:
        """Initialize the encoding table and state to initial conditions."""
        # Every single byte value maps to its own code.
        self.encoding_table: dict[bytes, int] = {bytes([i]): i for i in range(256)}
        # The first free code follows the two special markers.
        self.next_code = self.EOD_MARKER + 1
        self.bits_per_code = self.INITIAL_BITS_PER_CODE
        self.max_code_value = (1 << self.bits_per_code) - 1

    def _increase_next_code(self) -> None:
        """Update bits_per_code and max_code_value if necessary."""
        self.next_code += 1
        # Widen the code once the next code no longer fits, but never
        # beyond the maximum width.
        if (
            self.next_code > self.max_code_value
            and self.bits_per_code < self.MAX_BITS_PER_CODE
        ):
            self.bits_per_code += 1
            self.max_code_value = (1 << self.bits_per_code) - 1

    def encode(self, data: bytes) -> bytes:
        """
        Encode data using the LZW compression algorithm.

        Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding".

        Args:
            data: Data to encode.

        Returns:
            The packed code stream, starting with a clear-table code and
            ending with the end-of-data code.

        """
        result_codes: list[int] = []

        # The encoder shall begin by issuing a clear-table code
        result_codes.append(self.CLEAR_TABLE_MARKER)
        self._initialize_encoding_table()

        current_sequence = b""
        for byte in data:
            next_sequence = current_sequence + bytes([byte])

            if next_sequence in self.encoding_table:
                # Extend current sequence if already in the table
                current_sequence = next_sequence
            else:
                # Output code for the current sequence
                result_codes.append(self.encoding_table[current_sequence])

                # Add the new sequence to the table if there's room
                if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
                    self.encoding_table[next_sequence] = self.next_code
                    self._increase_next_code()
                else:
                    # If the table is full, emit a clear-table command
                    result_codes.append(self.CLEAR_TABLE_MARKER)
                    self._initialize_encoding_table()

                # Start new sequence
                current_sequence = bytes([byte])

        # Ensure everything actually is encoded
        if current_sequence:
            result_codes.append(self.encoding_table[current_sequence])
        result_codes.append(self.EOD_MARKER)

        return self._pack_codes_into_bytes(result_codes)

    def _pack_codes_into_bytes(self, codes: list[int]) -> bytes:
        """
        Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width.
        The bit-width starts at 9 bits and expands as needed.

        Args:
            codes: The codes to pack, in output order.

        Returns:
            The packed bytes; a final partial byte is padded with zero bits.

        """
        # Replay the table state so that the code width used for packing
        # follows the same progression as during encoding.
        self._initialize_encoding_table()
        buffer = 0
        bits_in_buffer = 0
        output = bytearray()

        for code in codes:
            buffer = (buffer << self.bits_per_code) | code
            bits_in_buffer += self.bits_per_code

            # Codes shall be packed into a continuous bit stream, high-order bit
            # first. This stream shall then be divided into bytes, high-order bit
            # first.
            while bits_in_buffer >= 8:
                bits_in_buffer -= 8
                output.append((buffer >> bits_in_buffer) & 0xFF)

            if code == self.CLEAR_TABLE_MARKER:
                self._initialize_encoding_table()
            elif code == self.EOD_MARKER:
                continue
            else:
                self._increase_next_code()

        # Flush any remaining bits in the buffer
        if bits_in_buffer > 0:
            output.append((buffer << (8 - bits_in_buffer)) & 0xFF)

        return bytes(output)

    def _initialize_decoding_table(self) -> None:
        """Reset the decoding table, the next free index and the code width."""
        self.max_code_value = (1 << self.MAX_BITS_PER_CODE) - 1
        # Single-byte entries for the codes 0-255, followed by empty
        # placeholders for all remaining codes up to the maximum code value.
        self.decoding_table = [bytes([i]) for i in range(self.CLEAR_TABLE_MARKER)] + [
            b""
        ] * (self.max_code_value - self.CLEAR_TABLE_MARKER + 1)
        self._table_index = self.EOD_MARKER + 1
        self._bits_to_get = 9

    def _next_code_decode(self, data: bytes) -> int:
        """
        Read the next code from the input at the current bit position.

        Args:
            data: The complete encoded input.

        Returns:
            The next code, or EOD_MARKER when the input is exhausted.

        """
        self._next_data: int
        try:
            # Pull in whole bytes until enough bits for one code are buffered.
            while self._next_bits < self._bits_to_get:
                self._next_data = (self._next_data << 8) | (
                    data[self._byte_pointer]
                )
                self._byte_pointer += 1
                self._next_bits += 8

            # Take the topmost buffered bits and mask them to the code width.
            code = (
                self._next_data >> (self._next_bits - self._bits_to_get)
            ) & self._and_table[self._bits_to_get - 9]
            self._next_bits -= self._bits_to_get

            # Reduce data to get rid of the overhead,
            # which increases performance on large streams significantly.
            self._next_data = self._next_data & 0xFFFFF

            return code
        except IndexError:
            # Reading past the end of the input is treated as end of data.
            return self.EOD_MARKER

    # The following method has been converted to Python from PDFsharp:
    # https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
    #
    # Original license:
    #
    # -------------------------------------------------------------------------
    # Copyright (c) 2001-2024 empira Software GmbH, Troisdorf (Cologne Area),
    # Germany
    #
    # http://docs.pdfsharp.net
    #
    # MIT License
    #
    # Permission is hereby granted, free of charge, to any person obtaining a
    # copy of this software and associated documentation files (the "Software"),
    # to deal in the Software without restriction, including without limitation
    # the rights to use, copy, modify, merge, publish, distribute, sublicense,
    # and/or sell copies of the Software, and to permit persons to whom the
    # Software is furnished to do so, subject to the following conditions:
    #
    # The above copyright notice and this permission notice shall be included
    # in all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    # DEALINGS IN THE SOFTWARE.
    # --------------------------------------------------------------------------
    def decode(self, data: bytes) -> bytes:
        """
        Decode LZW-compressed data.

        The following code was converted to Python from the following code:
        https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs

        Args:
            data: Data to decode.

        Returns:
            Decoded data.

        Raises:
            LimitReachedError: The decoded output exceeds ``max_output_length``.

        """
        # Bit masks for the code widths of 9, 10, 11 and 12 bits.
        self._and_table = [511, 1023, 2047, 4095]
        self._table_index = 0
        self._bits_to_get = 9
        self._byte_pointer = 0
        self._next_data = 0
        self._next_bits = 0

        output_stream = io.BytesIO()
        output_length = 0

        self._initialize_decoding_table()
        self._byte_pointer = 0
        self._next_data = 0
        self._next_bits = 0
        # The clear-table marker doubles as "no previous code yet".
        old_code = self.CLEAR_TABLE_MARKER

        while True:
            code = self._next_code_decode(data)
            if code == self.EOD_MARKER:
                break

            if code == self.CLEAR_TABLE_MARKER:
                # Reset the table; the code after a reset is written as is.
                self._initialize_decoding_table()
                code = self._next_code_decode(data)
                if code == self.EOD_MARKER:
                    break
                output_stream.write(decoded := self.decoding_table[code])
                old_code = code
            elif code < self._table_index:
                # Known code: write its entry and register the previous
                # entry extended by the first byte of this one.
                decoded = self.decoding_table[code]
                output_stream.write(decoded)
                if old_code != self.CLEAR_TABLE_MARKER:
                    self._add_entry_decode(self.decoding_table[old_code], decoded[0])
                old_code = code
            else:
                # The code is not in the table and not one of the special codes
                decoded = (
                    self.decoding_table[old_code] + self.decoding_table[old_code][:1]
                )
                output_stream.write(decoded)
                self._add_entry_decode(self.decoding_table[old_code], decoded[0])
                old_code = code

            output_length += len(decoded)
            if output_length > self.max_output_length:
                raise LimitReachedError(
                    f"Limit reached while decompressing: {output_length} > {self.max_output_length}"
                )

        return output_stream.getvalue()

    def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
        """
        Append ``old_string`` extended by ``new_char`` to the decoding table.

        Args:
            old_string: The entry of the previously decoded code.
            new_char: The byte value to append to it.

        """
        new_string = old_string + bytes([new_char])
        if self._table_index > self.max_code_value:
            # The table is full: the entry is dropped after a warning.
            logger_warning("Ignoring too large LZW table index.", __name__)
            return
        self.decoding_table[self._table_index] = new_string
        self._table_index += 1

        # Update the number of bits to get based on the table index
        if self._table_index == 511:
            self._bits_to_get = 10
        elif self._table_index == 1023:
            self._bits_to_get = 11
        elif self._table_index == 2047:
            self._bits_to_get = 12
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ----------------------------------------------------------- # Name: Adobe Glyph List # Table version: 2.0 # Date: September 20, 2002 # URL: https://github.com/adobe-type-tools/agl-aglfn # # Format: two semicolon-delimited fields: # (1) glyph name--upper/lowercase letters and digits # (2) Unicode scalar value--four uppercase hexadecimal digits # adobe_glyphs = { "/A": "\u0041", "/AA": "\uA732", "/AE": "\u00C6", "/AEacute": "\u01FC", "/AEmacron": "\u01E2", "/AEsmall": "\uF7E6", "/AO": "\uA734", "/AU": "\uA736", "/AV": "\uA738", "/AVhorizontalbar": "\uA73A", "/AY": "\uA73C", "/Aacute": "\u00C1", "/Aacutesmall": "\uF7E1", "/Abreve": "\u0102", "/Abreveacute": "\u1EAE", "/Abrevecyr": "\u04D0", "/Abrevecyrillic": "\u04D0", "/Abrevedotbelow": "\u1EB6", "/Abrevegrave": "\u1EB0", "/Abrevehoi": "\u1EB2", "/Abrevehookabove": "\u1EB2", "/Abrevetilde": "\u1EB4", "/Acaron": "\u01CD", "/Acircle": "\u24B6", "/Acircleblack": "\u1F150", "/Acircumflex": "\u00C2", "/Acircumflexacute": "\u1EA4", "/Acircumflexdotbelow": "\u1EAC", "/Acircumflexgrave": "\u1EA6", "/Acircumflexhoi": "\u1EA8", "/Acircumflexhookabove": "\u1EA8", "/Acircumflexsmall": "\uF7E2", "/Acircumflextilde": "\u1EAA", "/Acute": "\uF6C9", "/Acutesmall": "\uF7B4", "/Acyr": "\u0410", "/Acyrillic": "\u0410", "/Adblgrave": "\u0200", "/Adieresis": "\u00C4", "/Adieresiscyr": "\u04D2", "/Adieresiscyrillic": "\u04D2", "/Adieresismacron": "\u01DE", "/Adieresissmall": "\uF7E4", "/Adot": "\u0226", "/Adotbelow": 
"\u1EA0", "/Adotmacron": "\u01E0", "/Agrave": "\u00C0", "/Agravedbl": "\u0200", "/Agravesmall": "\uF7E0", "/Ahoi": "\u1EA2", "/Ahookabove": "\u1EA2", "/Aiecyr": "\u04D4", "/Aiecyrillic": "\u04D4", "/Ainvertedbreve": "\u0202", "/Akbar": "\uFDF3", "/Alayhe": "\uFDF7", "/Allah": "\uFDF2", "/Alpha": "\u0391", "/Alphaacute": "\u1FBB", "/Alphaasper": "\u1F09", "/Alphaasperacute": "\u1F0D", "/Alphaasperacuteiotasub": "\u1F8D", "/Alphaaspergrave": "\u1F0B", "/Alphaaspergraveiotasub": "\u1F8B", "/Alphaasperiotasub": "\u1F89", "/Alphaaspertilde": "\u1F0F", "/Alphaaspertildeiotasub": "\u1F8F", "/Alphabreve": "\u1FB8", "/Alphagrave": "\u1FBA", "/Alphaiotasub": "\u1FBC", "/Alphalenis": "\u1F08", "/Alphalenisacute": "\u1F0C", "/Alphalenisacuteiotasub": "\u1F8C", "/Alphalenisgrave": "\u1F0A", "/Alphalenisgraveiotasub": "\u1F8A", "/Alphalenisiotasub": "\u1F88", "/Alphalenistilde": "\u1F0E", "/Alphalenistildeiotasub": "\u1F8E", "/Alphatonos": "\u0386", "/Alphawithmacron": "\u1FB9", "/Amacron": "\u0100", "/Amonospace": "\uFF21", "/Aogonek": "\u0104", "/Aparens": "\u1F110", "/Aring": "\u00C5", "/Aringacute": "\u01FA", "/Aringbelow": "\u1E00", "/Aringsmall": "\uF7E5", "/Asmall": "\uF761", "/Asquare": "\u1F130", "/Asquareblack": "\u1F170", "/Astroke": "\u023A", "/Atilde": "\u00C3", "/Atildesmall": "\uF7E3", "/Aturned": "\u2C6F", "/Ayahend": "\u06DD", "/Aybarmenian": "\u0531", "/B": "\u0042", "/Bcircle": "\u24B7", "/Bcircleblack": "\u1F151", "/Bdot": "\u1E02", "/Bdotaccent": "\u1E02", "/Bdotbelow": "\u1E04", "/Becyr": "\u0411", "/Becyrillic": "\u0411", "/Benarmenian": "\u0532", "/Beta": "\u0392", "/Bflourish": "\uA796", "/Bhook": "\u0181", "/BismillahArRahmanArRaheem": "\uFDFD", "/Blinebelow": "\u1E06", "/Bmonospace": "\uFF22", "/Bparens": "\u1F111", "/Brevesmall": "\uF6F4", "/Bscript": "\u212C", "/Bsmall": "\uF762", "/Bsquare": "\u1F131", "/Bsquareblack": "\u1F171", "/Bstroke": "\u0243", "/Btopbar": "\u0182", "/C": "\u0043", "/CDcircle": "\u1F12D", "/Caarmenian": "\u053E", "/Cacute": 
"\u0106", "/Caron": "\uF6CA", "/Caronsmall": "\uF6F5", "/Cbar": "\uA792", "/Ccaron": "\u010C", "/Ccedilla": "\u00C7", "/Ccedillaacute": "\u1E08", "/Ccedillasmall": "\uF7E7", "/Ccircle": "\u24B8", "/Ccircleblack": "\u1F152", "/Ccircumflex": "\u0108", "/Cdblstruck": "\u2102", "/Cdot": "\u010A", "/Cdotaccent": "\u010A", "/Cdotreversed": "\uA73E", "/Cedillasmall": "\uF7B8", "/Cfraktur": "\u212D", "/Chaarmenian": "\u0549", "/Cheabkhasiancyrillic": "\u04BC", "/Cheabkhcyr": "\u04BC", "/Cheabkhtailcyr": "\u04BE", "/Checyr": "\u0427", "/Checyrillic": "\u0427", "/Chedescenderabkhasiancyrillic": "\u04BE", "/Chedescendercyrillic": "\u04B6", "/Chedieresiscyr": "\u04F4", "/Chedieresiscyrillic": "\u04F4", "/Cheharmenian": "\u0543", "/Chekhakascyr": "\u04CB", "/Chekhakassiancyrillic": "\u04CB", "/Chetailcyr": "\u04B6", "/Chevertcyr": "\u04B8", "/Cheverticalstrokecyrillic": "\u04B8", "/Chi": "\u03A7", "/Chook": "\u0187", "/Circumflexsmall": "\uF6F6", "/Citaliccircle": "\u1F12B", "/Cmonospace": "\uFF23", "/Coarmenian": "\u0551", "/Con": "\uA76E", "/Cparens": "\u1F112", "/Csmall": "\uF763", "/Csquare": "\u1F132", "/Csquareblack": "\u1F172", "/Cstretched": "\u0297", "/Cstroke": "\u023B", "/Cuatrillo": "\uA72C", "/Cuatrillocomma": "\uA72E", "/D": "\u0044", "/DZ": "\u01F1", "/DZcaron": "\u01C4", "/Daarmenian": "\u0534", "/Dafrican": "\u0189", "/Dcaron": "\u010E", "/Dcedilla": "\u1E10", "/Dchecyr": "\u052C", "/Dcircle": "\u24B9", "/Dcircleblack": "\u1F153", "/Dcircumflexbelow": "\u1E12", "/Dcroat": "\u0110", "/Ddblstruckitalic": "\u2145", "/Ddot": "\u1E0A", "/Ddotaccent": "\u1E0A", "/Ddotbelow": "\u1E0C", "/Decyr": "\u0414", "/Decyrillic": "\u0414", "/Deicoptic": "\u03EE", "/Dekomicyr": "\u0500", "/Delta": "\u2206", "/Deltagreek": "\u0394", "/Dhook": "\u018A", "/Dieresis": "\uF6CB", "/DieresisAcute": "\uF6CC", "/DieresisGrave": "\uF6CD", "/Dieresissmall": "\uF7A8", "/Digamma": "\u03DC", "/Digammagreek": "\u03DC", "/Digammapamphylian": "\u0376", "/Dinsular": "\uA779", "/Djecyr": "\u0402", 
"/Djecyrillic": "\u0402", "/Djekomicyr": "\u0502", "/Dlinebelow": "\u1E0E", "/Dmonospace": "\uFF24", "/Dotaccentsmall": "\uF6F7", "/Dparens": "\u1F113", "/Dslash": "\u0110", "/Dsmall": "\uF764", "/Dsquare": "\u1F133", "/Dsquareblack": "\u1F173", "/Dtopbar": "\u018B", "/Dz": "\u01F2", "/Dzcaron": "\u01C5", "/Dzeabkhasiancyrillic": "\u04E0", "/Dzeabkhcyr": "\u04E0", "/Dzecyr": "\u0405", "/Dzecyrillic": "\u0405", "/Dzhecyr": "\u040F", "/Dzhecyrillic": "\u040F", "/Dzjekomicyr": "\u0506", "/Dzzhecyr": "\u052A", "/E": "\u0045", "/Eacute": "\u00C9", "/Eacutesmall": "\uF7E9", "/Ebreve": "\u0114", "/Ecaron": "\u011A", "/Ecedilla": "\u0228", "/Ecedillabreve": "\u1E1C", "/Echarmenian": "\u0535", "/Ecircle": "\u24BA", "/Ecircleblack": "\u1F154", "/Ecircumflex": "\u00CA", "/Ecircumflexacute": "\u1EBE", "/Ecircumflexbelow": "\u1E18", "/Ecircumflexdotbelow": "\u1EC6", "/Ecircumflexgrave": "\u1EC0", "/Ecircumflexhoi": "\u1EC2", "/Ecircumflexhookabove": "\u1EC2", "/Ecircumflexsmall": "\uF7EA", "/Ecircumflextilde": "\u1EC4", "/Ecyrillic": "\u0404", "/Edblgrave": "\u0204", "/Edieresis": "\u00CB", "/Edieresissmall": "\uF7EB", "/Edot": "\u0116", "/Edotaccent": "\u0116", "/Edotbelow": "\u1EB8", "/Efcyr": "\u0424", "/Efcyrillic": "\u0424", "/Egrave": "\u00C8", "/Egravedbl": "\u0204", "/Egravesmall": "\uF7E8", "/Egyptain": "\uA724", "/Egyptalef": "\uA722", "/Eharmenian": "\u0537", "/Ehoi": "\u1EBA", "/Ehookabove": "\u1EBA", "/Eightroman": "\u2167", "/Einvertedbreve": "\u0206", "/Eiotifiedcyr": "\u0464", "/Eiotifiedcyrillic": "\u0464", "/Elcyr": "\u041B", "/Elcyrillic": "\u041B", "/Elevenroman": "\u216A", "/Elhookcyr": "\u0512", "/Elmiddlehookcyr": "\u0520", "/Elsharptailcyr": "\u04C5", "/Eltailcyr": "\u052E", "/Emacron": "\u0112", "/Emacronacute": "\u1E16", "/Emacrongrave": "\u1E14", "/Emcyr": "\u041C", "/Emcyrillic": "\u041C", "/Emonospace": "\uFF25", "/Emsharptailcyr": "\u04CD", "/Encyr": "\u041D", "/Encyrillic": "\u041D", "/Endescendercyrillic": "\u04A2", "/Eng": "\u014A", "/Engecyr": 
"\u04A4", "/Enghecyrillic": "\u04A4", "/Enhookcyr": "\u04C7", "/Enhookcyrillic": "\u04C7", "/Enhookleftcyr": "\u0528", "/Enmiddlehookcyr": "\u0522", "/Ensharptailcyr": "\u04C9", "/Entailcyr": "\u04A2", "/Eogonek": "\u0118", "/Eopen": "\u0190", "/Eparens": "\u1F114", "/Epsilon": "\u0395", "/Epsilonacute": "\u1FC9", "/Epsilonasper": "\u1F19", "/Epsilonasperacute": "\u1F1D", "/Epsilonaspergrave": "\u1F1B", "/Epsilongrave": "\u1FC8", "/Epsilonlenis": "\u1F18", "/Epsilonlenisacute": "\u1F1C", "/Epsilonlenisgrave": "\u1F1A", "/Epsilontonos": "\u0388", "/Ercyr": "\u0420", "/Ercyrillic": "\u0420", "/Ereversed": "\u018E", "/Ereversedcyr": "\u042D", "/Ereversedcyrillic": "\u042D", "/Ereverseddieresiscyr": "\u04EC", "/Ereversedopen": "\uA7AB", "/Ertickcyr": "\u048E", "/Escript": "\u2130", "/Escyr": "\u0421", "/Escyrillic": "\u0421", "/Esdescendercyrillic": "\u04AA", "/Esh": "\u01A9", "/Esmall": "\uF765", "/Esmallturned": "\u2C7B", "/Esquare": "\u1F134", "/Esquareblack": "\u1F174", "/Estailcyr": "\u04AA", "/Estroke": "\u0246", "/Et": "\uA76A", "/Eta": "\u0397", "/Etaacute": "\u1FCB", "/Etaasper": "\u1F29", "/Etaasperacute": "\u1F2D", "/Etaasperacuteiotasub": "\u1F9D", "/Etaaspergrave": "\u1F2B", "/Etaaspergraveiotasub": "\u1F9B", "/Etaasperiotasub": "\u1F99", "/Etaaspertilde": "\u1F2F", "/Etaaspertildeiotasub": "\u1F9F", "/Etagrave": "\u1FCA", "/Etaiotasub": "\u1FCC", "/Etalenis": "\u1F28", "/Etalenisacute": "\u1F2C", "/Etalenisacuteiotasub": "\u1F9C", "/Etalenisgrave": "\u1F2A", "/Etalenisgraveiotasub": "\u1F9A", "/Etalenisiotasub": "\u1F98", "/Etalenistilde": "\u1F2E", "/Etalenistildeiotasub": "\u1F9E", "/Etarmenian": "\u0538", "/Etatonos": "\u0389", "/Eth": "\u00D0", "/Ethsmall": "\uF7F0", "/Etilde": "\u1EBC", "/Etildebelow": "\u1E1A", "/Eukrcyr": "\u0404", "/Euro": "\u20AC", "/Ezh": "\u01B7", "/Ezhcaron": "\u01EE", "/Ezhreversed": "\u01B8", "/F": "\u0046", "/Fcircle": "\u24BB", "/Fcircleblack": "\u1F155", "/Fdot": "\u1E1E", "/Fdotaccent": "\u1E1E", "/Feharmenian": 
"\u0556", "/Feicoptic": "\u03E4", "/Fhook": "\u0191", "/Finsular": "\uA77B", "/Fitacyr": "\u0472", "/Fitacyrillic": "\u0472", "/Fiveroman": "\u2164", "/Fmonospace": "\uFF26", "/Fourroman": "\u2163", "/Fparens": "\u1F115", "/Fscript": "\u2131", "/Fsmall": "\uF766", "/Fsquare": "\u1F135", "/Fsquareblack": "\u1F175", "/Fstroke": "\uA798", "/Fturned": "\u2132", "/G": "\u0047", "/GBsquare": "\u3387", "/Gacute": "\u01F4", "/Gamma": "\u0393", "/Gammaafrican": "\u0194", "/Gammadblstruck": "\u213E", "/Gangiacoptic": "\u03EA", "/Gbreve": "\u011E", "/Gcaron": "\u01E6", "/Gcedilla": "\u0122", "/Gcircle": "\u24BC", "/Gcircleblack": "\u1F156", "/Gcircumflex": "\u011C", "/Gcommaaccent": "\u0122", "/Gdot": "\u0120", "/Gdotaccent": "\u0120", "/Gecyr": "\u0413", "/Gecyrillic": "\u0413", "/Gehookcyr": "\u0494", "/Gehookstrokecyr": "\u04FA", "/Germandbls": "\u1E9E", "/Gestrokecyr": "\u0492", "/Getailcyr": "\u04F6", "/Geupcyr": "\u0490", "/Ghadarmenian": "\u0542", "/Ghemiddlehookcyrillic": "\u0494", "/Ghestrokecyrillic": "\u0492", "/Gheupturncyrillic": "\u0490", "/Ghook": "\u0193", "/Ghooksmall": "\u029B", "/Gimarmenian": "\u0533", "/Ginsular": "\uA77D", "/Ginsularturned": "\uA77E", "/Gjecyr": "\u0403", "/Gjecyrillic": "\u0403", "/Glottalstop": "\u0241", "/Gmacron": "\u1E20", "/Gmonospace": "\uFF27", "/Gobliquestroke": "\uA7A0", "/Gparens": "\u1F116", "/Grave": "\uF6CE", "/Gravesmall": "\uF760", "/Gsmall": "\uF767", "/Gsmallhook": "\u029B", "/Gsquare": "\u1F136", "/Gsquareblack": "\u1F176", "/Gstroke": "\u01E4", "/Gturnedsans": "\u2141", "/H": "\u0048", "/H18533": "\u25CF", "/H18543": "\u25AA", "/H18551": "\u25AB", "/H22073": "\u25A1", "/HPsquare": "\u33CB", "/HVsquare": "\u1F14A", "/Haabkhasiancyrillic": "\u04A8", "/Haabkhcyr": "\u04A8", "/Hacyr": "\u0425", "/Hadescendercyrillic": "\u04B2", "/Hahookcyr": "\u04FC", "/Hardcyr": "\u042A", "/Hardsigncyrillic": "\u042A", "/Hastrokecyr": "\u04FE", "/Hbar": "\u0126", "/Hbrevebelow": "\u1E2A", "/Hcaron": "\u021E", "/Hcedilla": "\u1E28", 
"/Hcircle": "\u24BD", "/Hcircleblack": "\u1F157", "/Hcircumflex": "\u0124", "/Hdblstruck": "\u210D", "/Hdescender": "\u2C67", "/Hdieresis": "\u1E26", "/Hdot": "\u1E22", "/Hdotaccent": "\u1E22", "/Hdotbelow": "\u1E24", "/Heng": "\uA726", "/Heta": "\u0370", "/Hfraktur": "\u210C", "/Hgfullwidth": "\u32CC", "/Hhalf": "\u2C75", "/Hhook": "\uA7AA", "/Hmonospace": "\uFF28", "/Hoarmenian": "\u0540", "/HonAA": "\u0611", "/HonRA": "\u0612", "/HonSAW": "\u0610", "/Horicoptic": "\u03E8", "/Hparens": "\u1F117", "/Hscript": "\u210B", "/Hsmall": "\uF768", "/Hsquare": "\u1F137", "/Hsquareblack": "\u1F177", "/Hstrokemod": "\uA7F8", "/Hturned": "\uA78D", "/Hungarumlaut": "\uF6CF", "/Hungarumlautsmall": "\uF6F8", "/Hwair": "\u01F6", "/Hzsquare": "\u3390", "/I": "\u0049", "/IAcyrillic": "\u042F", "/ICsquareblack": "\u1F18B", "/IJ": "\u0132", "/IUcyrillic": "\u042E", "/Iacute": "\u00CD", "/Iacutesmall": "\uF7ED", "/Ibreve": "\u012C", "/Icaron": "\u01CF", "/Icircle": "\u24BE", "/Icircleblack": "\u1F158", "/Icircumflex": "\u00CE", "/Icircumflexsmall": "\uF7EE", "/Icyr": "\u0418", "/Icyrillic": "\u0406", "/Idblgrave": "\u0208", "/Idieresis": "\u00CF", "/Idieresisacute": "\u1E2E", "/Idieresiscyr": "\u04E4", "/Idieresiscyrillic": "\u04E4", "/Idieresissmall": "\uF7EF", "/Idot": "\u0130", "/Idotaccent": "\u0130", "/Idotbelow": "\u1ECA", "/Iebrevecyr": "\u04D6", "/Iebrevecyrillic": "\u04D6", "/Iecyr": "\u0415", "/Iecyrillic": "\u0415", "/Iegravecyr": "\u0400", "/Ifraktur": "\u2111", "/Igrave": "\u00CC", "/Igravecyr": "\u040D", "/Igravedbl": "\u0208", "/Igravesmall": "\uF7EC", "/Ihoi": "\u1EC8", "/Ihookabove": "\u1EC8", "/Iicyrillic": "\u0418", "/Iinvertedbreve": "\u020A", "/Iishortcyrillic": "\u0419", "/Imacron": "\u012A", "/Imacroncyr": "\u04E2", "/Imacroncyrillic": "\u04E2", "/Imonospace": "\uFF29", "/Iniarmenian": "\u053B", "/Iocyr": "\u0401", "/Iocyrillic": "\u0401", "/Iogonek": "\u012E", "/Iota": "\u0399", "/Iotaacute": "\u1FDB", "/Iotaafrican": "\u0196", "/Iotaasper": "\u1F39", 
"/Iotaasperacute": "\u1F3D", "/Iotaaspergrave": "\u1F3B", "/Iotaaspertilde": "\u1F3F", "/Iotabreve": "\u1FD8", "/Iotadieresis": "\u03AA", "/Iotagrave": "\u1FDA", "/Iotalenis": "\u1F38", "/Iotalenisacute": "\u1F3C", "/Iotalenisgrave": "\u1F3A", "/Iotalenistilde": "\u1F3E", "/Iotatonos": "\u038A", "/Iotawithmacron": "\u1FD9", "/Iparens": "\u1F118", "/Is": "\uA76C", "/Iscript": "\u2110", "/Ishortcyr": "\u0419", "/Ishortsharptailcyr": "\u048A", "/Ismall": "\uF769", "/Isquare": "\u1F138", "/Isquareblack": "\u1F178", "/Istroke": "\u0197", "/Itilde": "\u0128", "/Itildebelow": "\u1E2C", "/Iukrcyr": "\u0406", "/Izhitsacyr": "\u0474", "/Izhitsacyrillic": "\u0474", "/Izhitsadblgravecyrillic": "\u0476", "/Izhitsagravedblcyr": "\u0476", "/J": "\u004A", "/Jaarmenian": "\u0541", "/Jallajalalouhou": "\uFDFB", "/Jcircle": "\u24BF", "/Jcircleblack": "\u1F159", "/Jcircumflex": "\u0134", "/Jcrossed-tail": "\uA7B2", "/Jecyr": "\u0408", "/Jecyrillic": "\u0408", "/Jheharmenian": "\u054B", "/Jmonospace": "\uFF2A", "/Jparens": "\u1F119", "/Jsmall": "\uF76A", "/Jsquare": "\u1F139", "/Jsquareblack": "\u1F179", "/Jstroke": "\u0248", "/K": "\u004B", "/KBsquare": "\u3385", "/KKsquare": "\u33CD", "/KORONIS": "\u1FBD", "/Kaaleutcyr": "\u051E", "/Kabashkcyr": "\u04A0", "/Kabashkircyrillic": "\u04A0", "/Kacute": "\u1E30", "/Kacyr": "\u041A", "/Kacyrillic": "\u041A", "/Kadescendercyrillic": "\u049A", "/Kahookcyr": "\u04C3", "/Kahookcyrillic": "\u04C3", "/Kaisymbol": "\u03CF", "/Kappa": "\u039A", "/Kastrokecyr": "\u049E", "/Kastrokecyrillic": "\u049E", "/Katailcyr": "\u049A", "/Kaverticalstrokecyr": "\u049C", "/Kaverticalstrokecyrillic": "\u049C", "/Kcaron": "\u01E8", "/Kcedilla": "\u0136", "/Kcircle": "\u24C0", "/Kcircleblack": "\u1F15A", "/Kcommaaccent": "\u0136", "/Kdescender": "\u2C69", "/Kdiagonalstroke": "\uA742", "/Kdotbelow": "\u1E32", "/Keharmenian": "\u0554", "/Kenarmenian": "\u053F", "/Khacyrillic": "\u0425", "/Kheicoptic": "\u03E6", "/Khook": "\u0198", "/Kjecyr": "\u040C", "/Kjecyrillic": 
"\u040C", "/Klinebelow": "\u1E34", "/Kmonospace": "\uFF2B", "/Kobliquestroke": "\uA7A2", "/Koppa": "\u03DE", "/Koppaarchaic": "\u03D8", "/Koppacyr": "\u0480", "/Koppacyrillic": "\u0480", "/Koppagreek": "\u03DE", "/Kparens": "\u1F11A", "/Ksicyr": "\u046E", "/Ksicyrillic": "\u046E", "/Ksmall": "\uF76B", "/Ksquare": "\u1F13A", "/Ksquareblack": "\u1F17A", "/Kstroke": "\uA740", "/Kstrokediagonalstroke": "\uA744", "/Kturned": "\uA7B0", "/L": "\u004C", "/LJ": "\u01C7", "/LL": "\uF6BF", "/LLwelsh": "\u1EFA", "/LTDfullwidth": "\u32CF", "/Lacute": "\u0139", "/Lambda": "\u039B", "/Lbar": "\u023D", "/Lbelt": "\uA7AD", "/Lbroken": "\uA746", "/Lcaron": "\u013D", "/Lcedilla": "\u013B", "/Lcircle": "\u24C1", "/Lcircleblack": "\u1F15B", "/Lcircumflexbelow": "\u1E3C", "/Lcommaaccent": "\u013B", "/Ldblbar": "\u2C60", "/Ldot": "\u013F", "/Ldotaccent": "\u013F", "/Ldotbelow": "\u1E36", "/Ldotbelowmacron": "\u1E38", "/Lhacyr": "\u0514", "/Liwnarmenian": "\u053C", "/Lj": "\u01C8", "/Ljecyr": "\u0409", "/Ljecyrillic": "\u0409", "/Ljekomicyr": "\u0508", "/Llinebelow": "\u1E3A", "/Lmacrondot": "\u1E38", "/Lmiddletilde": "\u2C62", "/Lmonospace": "\uFF2C", "/Lparens": "\u1F11B", "/Lreversedsans": "\u2143", "/Lscript": "\u2112", "/Lslash": "\u0141", "/Lslashsmall": "\uF6F9", "/Lsmall": "\uF76C", "/Lsquare": "\u1F13B", "/Lsquareblack": "\u1F17B", "/Lstroke": "\uA748", "/Lturned": "\uA780", "/Lturnedsans": "\u2142", "/M": "\u004D", "/MBsquare": "\u3386", "/MVsquare": "\u1F14B", "/Macron": "\uF6D0", "/Macronsmall": "\uF7AF", "/Macute": "\u1E3E", "/Mcircle": "\u24C2", "/Mcircleblack": "\u1F15C", "/Mdot": "\u1E40", "/Mdotaccent": "\u1E40", "/Mdotbelow": "\u1E42", "/Menarmenian": "\u0544", "/Mhook": "\u2C6E", "/Mmonospace": "\uFF2D", "/Mohammad": "\uFDF4", "/Mparens": "\u1F11C", "/Mscript": "\u2133", "/Msmall": "\uF76D", "/Msquare": "\u1F13C", "/Msquareblack": "\u1F17C", "/Mturned": "\u019C", "/Mturnedsmall": "\uA7FA", "/Mu": "\u039C", "/N": "\u004E", "/NJ": "\u01CA", "/Nacute": "\u0143", "/Ncaron": 
"\u0147", "/Ncedilla": "\u0145", "/Ncircle": "\u24C3", "/Ncircleblack": "\u1F15D", "/Ncircumflexbelow": "\u1E4A", "/Ncommaaccent": "\u0145", "/Ndblstruck": "\u2115", "/Ndescender": "\uA790", "/Ndot": "\u1E44", "/Ndotaccent": "\u1E44", "/Ndotbelow": "\u1E46", "/Ngrave": "\u01F8", "/Nhookleft": "\u019D", "/Nineroman": "\u2168", "/Nj": "\u01CB", "/Njecyr": "\u040A", "/Njecyrillic": "\u040A", "/Njekomicyr": "\u050A", "/Nlinebelow": "\u1E48", "/Nlongrightleg": "\u0220", "/Nmonospace": "\uFF2E", "/Nobliquestroke": "\uA7A4", "/Nowarmenian": "\u0546", "/Nparens": "\u1F11D", "/Nsmall": "\uF76E", "/Nsquare": "\u1F13D", "/Nsquareblack": "\u1F17D", "/Ntilde": "\u00D1", "/Ntildesmall": "\uF7F1", "/Nu": "\u039D", "/O": "\u004F", "/OE": "\u0152", "/OEsmall": "\uF6FA", "/OO": "\uA74E", "/Oacute": "\u00D3", "/Oacutesmall": "\uF7F3", "/Obar": "\u019F", "/Obarcyr": "\u04E8", "/Obardieresiscyr": "\u04EA", "/Obarredcyrillic": "\u04E8", "/Obarreddieresiscyrillic": "\u04EA", "/Obreve": "\u014E", "/Ocaron": "\u01D1", "/Ocenteredtilde": "\u019F", "/Ocircle": "\u24C4", "/Ocircleblack": "\u1F15E", "/Ocircumflex": "\u00D4", "/Ocircumflexacute": "\u1ED0", "/Ocircumflexdotbelow": "\u1ED8", "/Ocircumflexgrave": "\u1ED2", "/Ocircumflexhoi": "\u1ED4", "/Ocircumflexhookabove": "\u1ED4", "/Ocircumflexsmall": "\uF7F4", "/Ocircumflextilde": "\u1ED6", "/Ocyr": "\u041E", "/Ocyrillic": "\u041E", "/Odblacute": "\u0150", "/Odblgrave": "\u020C", "/Odieresis": "\u00D6", "/Odieresiscyr": "\u04E6", "/Odieresiscyrillic": "\u04E6", "/Odieresismacron": "\u022A", "/Odieresissmall": "\uF7F6", "/Odot": "\u022E", "/Odotbelow": "\u1ECC", "/Odotmacron": "\u0230", "/Ogoneksmall": "\uF6FB", "/Ograve": "\u00D2", "/Ogravedbl": "\u020C", "/Ogravesmall": "\uF7F2", "/Oharmenian": "\u0555", "/Ohm": "\u2126", "/Ohoi": "\u1ECE", "/Ohookabove": "\u1ECE", "/Ohorn": "\u01A0", "/Ohornacute": "\u1EDA", "/Ohorndotbelow": "\u1EE2", "/Ohorngrave": "\u1EDC", "/Ohornhoi": "\u1EDE", "/Ohornhookabove": "\u1EDE", "/Ohorntilde": "\u1EE0", 
"/Ohungarumlaut": "\u0150", "/Oi": "\u01A2", "/Oinvertedbreve": "\u020E", "/Oloop": "\uA74C", "/Omacron": "\u014C", "/Omacronacute": "\u1E52", "/Omacrongrave": "\u1E50", "/Omega": "\u2126", "/Omegaacute": "\u1FFB", "/Omegaasper": "\u1F69", "/Omegaasperacute": "\u1F6D", "/Omegaasperacuteiotasub": "\u1FAD", "/Omegaaspergrave": "\u1F6B", "/Omegaaspergraveiotasub": "\u1FAB", "/Omegaasperiotasub": "\u1FA9", "/Omegaaspertilde": "\u1F6F", "/Omegaaspertildeiotasub": "\u1FAF", "/Omegacyr": "\u0460", "/Omegacyrillic": "\u0460", "/Omegagrave": "\u1FFA", "/Omegagreek": "\u03A9", "/Omegaiotasub": "\u1FFC", "/Omegalenis": "\u1F68", "/Omegalenisacute": "\u1F6C", "/Omegalenisacuteiotasub": "\u1FAC", "/Omegalenisgrave": "\u1F6A", "/Omegalenisgraveiotasub": "\u1FAA", "/Omegalenisiotasub": "\u1FA8", "/Omegalenistilde": "\u1F6E", "/Omegalenistildeiotasub": "\u1FAE", "/Omegaroundcyr": "\u047A", "/Omegaroundcyrillic": "\u047A", "/Omegatitlocyr": "\u047C", "/Omegatitlocyrillic": "\u047C", "/Omegatonos": "\u038F", "/Omicron": "\u039F", "/Omicronacute": "\u1FF9", "/Omicronasper": "\u1F49", "/Omicronasperacute": "\u1F4D", "/Omicronaspergrave": "\u1F4B", "/Omicrongrave": "\u1FF8", "/Omicronlenis": "\u1F48", "/Omicronlenisacute": "\u1F4C", "/Omicronlenisgrave": "\u1F4A", "/Omicrontonos": "\u038C", "/Omonospace": "\uFF2F", "/Oneroman": "\u2160", "/Oogonek": "\u01EA", "/Oogonekmacron": "\u01EC", "/Oopen": "\u0186", "/Oparens": "\u1F11E", "/Oslash": "\u00D8", "/Oslashacute": "\u01FE", "/Oslashsmall": "\uF7F8", "/Osmall": "\uF76F", "/Osquare": "\u1F13E", "/Osquareblack": "\u1F17E", "/Ostroke": "\uA74A", "/Ostrokeacute": "\u01FE", "/Otcyr": "\u047E", "/Otcyrillic": "\u047E", "/Otilde": "\u00D5", "/Otildeacute": "\u1E4C", "/Otildedieresis": "\u1E4E", "/Otildemacron": "\u022C", "/Otildesmall": "\uF7F5", "/Ou": "\u0222", "/P": "\u0050", "/PAsquareblack": "\u1F18C", "/PPVsquare": "\u1F14E", "/Pacute": "\u1E54", "/Palochkacyr": "\u04C0", "/Pcircle": "\u24C5", "/Pcircleblack": "\u1F15F", 
"/Pcrosssquareblack": "\u1F18A", "/Pdblstruck": "\u2119", "/Pdot": "\u1E56", "/Pdotaccent": "\u1E56", "/Pecyr": "\u041F", "/Pecyrillic": "\u041F", "/Peharmenian": "\u054A", "/Pehookcyr": "\u04A6", "/Pemiddlehookcyrillic": "\u04A6", "/Petailcyr": "\u0524", "/Pflourish": "\uA752", "/Phi": "\u03A6", "/Phook": "\u01A4", "/Pi": "\u03A0", "/Pidblstruck": "\u213F", "/Piwrarmenian": "\u0553", "/Pmonospace": "\uFF30", "/Pparens": "\u1F11F", "/Psi": "\u03A8", "/Psicyr": "\u0470", "/Psicyrillic": "\u0470", "/Psmall": "\uF770", "/Psquare": "\u1F13F", "/Psquareblack": "\u1F17F", "/Pstroke": "\u2C63", "/Pstrokedescender": "\uA750", "/Ptail": "\uA754", "/Q": "\u0051", "/Qacyr": "\u051A", "/QalaUsedAsKoranicStopSign": "\uFDF1", "/Qcircle": "\u24C6", "/Qcircleblack": "\u1F160", "/Qdblstruck": "\u211A", "/Qdiagonalstroke": "\uA758", "/Qmonospace": "\uFF31", "/Qparens": "\u1F120", "/Qrotated": "\u213A", "/Qsmall": "\uF771", "/Qsmallhooktail": "\u024A", "/Qsquare": "\u1F140", "/Qsquareblack": "\u1F180", "/Qstrokedescender": "\uA756", "/R": "\u0052", "/Raarmenian": "\u054C", "/Racute": "\u0154", "/Rasoul": "\uFDF6", "/Rcaron": "\u0158", "/Rcedilla": "\u0156", "/Rcircle": "\u24C7", "/Rcircleblack": "\u1F161", "/Rcommaaccent": "\u0156", "/Rdblgrave": "\u0210", "/Rdblstruck": "\u211D", "/Rdot": "\u1E58", "/Rdotaccent": "\u1E58", "/Rdotbelow": "\u1E5A", "/Rdotbelowmacron": "\u1E5C", "/Reharmenian": "\u0550", "/Reverseddottedsigmalunatesymbol": "\u03FF", "/Reversedzecyr": "\u0510", "/Rfraktur": "\u211C", "/Rgravedbl": "\u0210", "/Rhacyr": "\u0516", "/Rho": "\u03A1", "/Rhoasper": "\u1FEC", "/Ringsmall": "\uF6FC", "/Rinsular": "\uA782", "/Rinvertedbreve": "\u0212", "/Rinvertedsmall": "\u0281", "/Ritaliccircle": "\u1F12C", "/Rlinebelow": "\u1E5E", "/Rmacrondot": "\u1E5C", "/Rmonospace": "\uFF32", "/Robliquestroke": "\uA7A6", "/Rparens": "\u1F121", "/Rrotunda": "\uA75A", "/Rscript": "\u211B", "/Rsmall": "\uF772", "/Rsmallinverted": "\u0281", "/Rsmallinvertedsuperior": "\u02B6", "/Rsquare": 
"\u1F141", "/Rsquareblack": "\u1F181", "/Rstroke": "\u024C", "/Rsupinvertedmod": "\u02B6", "/Rtail": "\u2C64", "/RubElHizbstart": "\u06DE", "/Rumrotunda": "\uA75C", "/Rumsmall": "\uA776", "/S": "\u0053", "/SAsquareblack": "\u1F18D", "/SDsquare": "\u1F14C", "/SF010000": "\u250C", "/SF020000": "\u2514", "/SF030000": "\u2510", "/SF040000": "\u2518", "/SF050000": "\u253C", "/SF060000": "\u252C", "/SF070000": "\u2534", "/SF080000": "\u251C", "/SF090000": "\u2524", "/SF100000": "\u2500", "/SF110000": "\u2502", "/SF190000": "\u2561", "/SF200000": "\u2562", "/SF210000": "\u2556", "/SF220000": "\u2555", "/SF230000": "\u2563", "/SF240000": "\u2551", "/SF250000": "\u2557", "/SF260000": "\u255D", "/SF270000": "\u255C", "/SF280000": "\u255B", "/SF360000": "\u255E", "/SF370000": "\u255F", "/SF380000": "\u255A", "/SF390000": "\u2554", "/SF400000": "\u2569", "/SF410000": "\u2566", "/SF420000": "\u2560", "/SF430000": "\u2550", "/SF440000": "\u256C", "/SF450000": "\u2567", "/SF460000": "\u2568", "/SF470000": "\u2564", "/SF480000": "\u2565", "/SF490000": "\u2559", "/SF500000": "\u2558", "/SF510000": "\u2552", "/SF520000": "\u2553", "/SF530000": "\u256B", "/SF540000": "\u256A", "/SSsquare": "\u1F14D", "/Sacute": "\u015A", "/Sacutedotaccent": "\u1E64", "/Safha": "\u0603", "/Sajdah": "\u06E9", "/Salam": "\uFDF5", "/Salla": "\uFDF9", "/SallaUsedAsKoranicStopSign": "\uFDF0", "/SallallahouAlayheWasallam": "\uFDFA", "/Saltillo": "\uA78B", "/Sampi": "\u03E0", "/Sampiarchaic": "\u0372", "/Sampigreek": "\u03E0", "/San": "\u03FA", "/Sanah": "\u0601", "/Scaron": "\u0160", "/Scarondot": "\u1E66", "/Scarondotaccent": "\u1E66", "/Scaronsmall": "\uF6FD", "/Scedilla": "\u015E", "/Schwa": "\u018F", "/Schwacyr": "\u04D8", "/Schwacyrillic": "\u04D8", "/Schwadieresiscyr": "\u04DA", "/Schwadieresiscyrillic": "\u04DA", "/Scircle": "\u24C8", "/Scircleblack": "\u1F162", "/Scircumflex": "\u015C", "/Scommaaccent": "\u0218", "/Scriptg": "\uA7AC", "/Sdot": "\u1E60", "/Sdotaccent": "\u1E60", "/Sdotbelow": 
"\u1E62", "/Sdotbelowdotabove": "\u1E68", "/Sdotbelowdotaccent": "\u1E68", "/Seharmenian": "\u054D", "/Semisoftcyr": "\u048C", "/Sevenroman": "\u2166", "/Shaarmenian": "\u0547", "/Shacyr": "\u0428", "/Shacyrillic": "\u0428", "/Shchacyr": "\u0429", "/Shchacyrillic": "\u0429", "/Sheicoptic": "\u03E2", "/SheneGerishin:hb": "\u059E", "/Shhacyr": "\u04BA", "/Shhacyrillic": "\u04BA", "/Shhatailcyr": "\u0526", "/Shimacoptic": "\u03EC", "/Sho": "\u03F7", "/Sigma": "\u03A3", "/Sigmalunatesymbol": "\u03F9", "/Sigmalunatesymboldotted": "\u03FE", "/Sigmareversedlunatesymbol": "\u03FD", "/Sinsular": "\uA784", "/Sixroman": "\u2165", "/Sjekomicyr": "\u050C", "/Smonospace": "\uFF33", "/Sobliquestroke": "\uA7A8", "/Softcyr": "\u042C", "/Softsigncyrillic": "\u042C", "/Sparens": "\u1F122", "/Sshell": "\u1F12A", "/Ssmall": "\uF773", "/Ssquare": "\u1F142", "/Ssquareblack": "\u1F182", "/Sswashtail": "\u2C7E", "/Stigma": "\u03DA", "/Stigmagreek": "\u03DA", "/T": "\u0054", "/Tau": "\u03A4", "/Tbar": "\u0166", "/Tcaron": "\u0164", "/Tcedilla": "\u0162", "/Tcircle": "\u24C9", "/Tcircleblack": "\u1F163", "/Tcircumflexbelow": "\u1E70", "/Tcommaaccent": "\u0162", "/Tdot": "\u1E6A", "/Tdotaccent": "\u1E6A", "/Tdotbelow": "\u1E6C", "/Tecyr": "\u0422", "/Tecyrillic": "\u0422", "/Tedescendercyrillic": "\u04AC", "/Tenroman": "\u2169", "/Tetailcyr": "\u04AC", "/Tetsecyr": "\u04B4", "/Tetsecyrillic": "\u04B4", "/Theta": "\u0398", "/Thetasymbol": "\u03F4", "/Thook": "\u01AC", "/Thorn": "\u00DE", "/Thornsmall": "\uF7FE", "/Thornstroke": "\uA764", "/Thornstrokedescender": "\uA766", "/Threeroman": "\u2162", "/Tildesmall": "\uF6FE", "/Tinsular": "\uA786", "/Tiwnarmenian": "\u054F", "/Tjekomicyr": "\u050E", "/Tlinebelow": "\u1E6E", "/Tmonospace": "\uFF34", "/Toarmenian": "\u0539", "/Tonefive": "\u01BC", "/Tonesix": "\u0184", "/Tonetwo": "\u01A7", "/Tparens": "\u1F123", "/Tresillo": "\uA72A", "/Tretroflexhook": "\u01AE", "/Tsecyr": "\u0426", "/Tsecyrillic": "\u0426", "/Tshecyr": "\u040B", "/Tshecyrillic": 
"\u040B", "/Tsmall": "\uF774", "/Tsquare": "\u1F143", "/Tsquareblack": "\u1F183", "/Tturned": "\uA7B1", "/Twelveroman": "\u216B", "/Twithdiagonalstroke": "\u023E", "/Tworoman": "\u2161", "/Tz": "\uA728", "/U": "\u0055", "/Uacute": "\u00DA", "/Uacutedblcyr": "\u04F2", "/Uacutesmall": "\uF7FA", "/Ubar": "\u0244", "/Ubreve": "\u016C", "/Ucaron": "\u01D3", "/Ucircle": "\u24CA", "/Ucircleblack": "\u1F164", "/Ucircumflex": "\u00DB", "/Ucircumflexbelow": "\u1E76", "/Ucircumflexsmall": "\uF7FB", "/Ucyr": "\u0423", "/Ucyrillic": "\u0423", "/Udblacute": "\u0170", "/Udblgrave": "\u0214", "/Udieresis": "\u00DC", "/Udieresisacute": "\u01D7", "/Udieresisbelow": "\u1E72", "/Udieresiscaron": "\u01D9", "/Udieresiscyr": "\u04F0", "/Udieresiscyrillic": "\u04F0", "/Udieresisgrave": "\u01DB", "/Udieresismacron": "\u01D5", "/Udieresissmall": "\uF7FC", "/Udotbelow": "\u1EE4", "/Ugrave": "\u00D9", "/Ugravedbl": "\u0214", "/Ugravesmall": "\uF7F9", "/Uhoi": "\u1EE6", "/Uhookabove": "\u1EE6", "/Uhorn": "\u01AF", "/Uhornacute": "\u1EE8", "/Uhorndotbelow": "\u1EF0", "/Uhorngrave": "\u1EEA", "/Uhornhoi": "\u1EEC", "/Uhornhookabove": "\u1EEC", "/Uhorntilde": "\u1EEE", "/Uhungarumlaut": "\u0170", "/Uhungarumlautcyrillic": "\u04F2", "/Uinvertedbreve": "\u0216", "/Ukcyr": "\u0478", "/Ukcyrillic": "\u0478", "/Umacron": "\u016A", "/Umacroncyr": "\u04EE", "/Umacroncyrillic": "\u04EE", "/Umacrondieresis": "\u1E7A", "/Umonospace": "\uFF35", "/Uogonek": "\u0172", "/Uparens": "\u1F124", "/Upsilon": "\u03A5", "/Upsilon1": "\u03D2", "/Upsilonacute": "\u1FEB", "/Upsilonacutehooksymbol": "\u03D3", "/Upsilonacutehooksymbolgreek": "\u03D3", "/Upsilonadieresishooksymbol": "\u03D4", "/Upsilonafrican": "\u01B1", "/Upsilonasper": "\u1F59", "/Upsilonasperacute": "\u1F5D", "/Upsilonaspergrave": "\u1F5B", "/Upsilonaspertilde": "\u1F5F", "/Upsilonbreve": "\u1FE8", "/Upsilondieresis": "\u03AB", "/Upsilondieresishooksymbolgreek": "\u03D4", "/Upsilongrave": "\u1FEA", "/Upsilonhooksymbol": "\u03D2", "/Upsilontonos": 
"\u038E", "/Upsilonwithmacron": "\u1FE9", "/Uring": "\u016E", "/Ushortcyr": "\u040E", "/Ushortcyrillic": "\u040E", "/Usmall": "\uF775", "/Usquare": "\u1F144", "/Usquareblack": "\u1F184", "/Ustraightcyr": "\u04AE", "/Ustraightcyrillic": "\u04AE", "/Ustraightstrokecyr": "\u04B0", "/Ustraightstrokecyrillic": "\u04B0", "/Utilde": "\u0168", "/Utildeacute": "\u1E78", "/Utildebelow": "\u1E74", "/V": "\u0056", "/Vcircle": "\u24CB", "/Vcircleblack": "\u1F165", "/Vdiagonalstroke": "\uA75E", "/Vdotbelow": "\u1E7E", "/Vecyr": "\u0412", "/Vecyrillic": "\u0412", "/Vend": "\uA768", "/Vewarmenian": "\u054E", "/Vhook": "\u01B2", "/Visigothicz": "\uA762", "/Vmod": "\u2C7D", "/Vmonospace": "\uFF36", "/Voarmenian": "\u0548", "/Volapukae": "\uA79A", "/Volapukoe": "\uA79C", "/Volapukue": "\uA79E", "/Vparens": "\u1F125", "/Vsmall": "\uF776", "/Vsquare": "\u1F145", "/Vsquareblack": "\u1F185", "/Vtilde": "\u1E7C", "/Vturned": "\u0245", "/Vwelsh": "\u1EFC", "/Vy": "\uA760", "/W": "\u0057", "/WZcircle": "\u1F12E", "/Wacute": "\u1E82", "/Wasallam": "\uFDF8", "/Wcircle": "\u24CC", "/Wcircleblack": "\u1F166", "/Wcircumflex": "\u0174", "/Wdieresis": "\u1E84", "/Wdot": "\u1E86", "/Wdotaccent": "\u1E86", "/Wdotbelow": "\u1E88", "/Wecyr": "\u051C", "/Wgrave": "\u1E80", "/Whook": "\u2C72", "/Wmonospace": "\uFF37", "/Wparens": "\u1F126", "/Wsmall": "\uF777", "/Wsquare": "\u1F146", "/Wsquareblack": "\u1F186", "/Wynn": "\u01F7", "/X": "\u0058", "/Xatailcyr": "\u04B2", "/Xcircle": "\u24CD", "/Xcircleblack": "\u1F167", "/Xdieresis": "\u1E8C", "/Xdot": "\u1E8A", "/Xdotaccent": "\u1E8A", "/Xeharmenian": "\u053D", "/Xi": "\u039E", "/Xmonospace": "\uFF38", "/Xparens": "\u1F127", "/Xsmall": "\uF778", "/Xsquare": "\u1F147", "/Xsquareblack": "\u1F187", "/Y": "\u0059", "/Yacute": "\u00DD", "/Yacutesmall": "\uF7FD", "/Yacyr": "\u042F", "/Yaecyr": "\u0518", "/Yatcyr": "\u0462", "/Yatcyrillic": "\u0462", "/Ycircle": "\u24CE", "/Ycircleblack": "\u1F168", "/Ycircumflex": "\u0176", "/Ydieresis": "\u0178", 
"/Ydieresissmall": "\uF7FF", "/Ydot": "\u1E8E", "/Ydotaccent": "\u1E8E", "/Ydotbelow": "\u1EF4", "/Yericyrillic": "\u042B", "/Yerudieresiscyrillic": "\u04F8", "/Ygrave": "\u1EF2", "/Yhoi": "\u1EF6", "/Yhook": "\u01B3", "/Yhookabove": "\u1EF6", "/Yiarmenian": "\u0545", "/Yicyrillic": "\u0407", "/Yiwnarmenian": "\u0552", "/Ylongcyr": "\u042B", "/Ylongdieresiscyr": "\u04F8", "/Yloop": "\u1EFE", "/Ymacron": "\u0232", "/Ymonospace": "\uFF39", "/Yogh": "\u021C", "/Yot": "\u037F", "/Yparens": "\u1F128", "/Ysmall": "\uF779", "/Ysquare": "\u1F148", "/Ysquareblack": "\u1F188", "/Ystroke": "\u024E", "/Ytilde": "\u1EF8", "/Yturnedsans": "\u2144", "/Yucyr": "\u042E", "/Yukrcyr": "\u0407", "/Yusbigcyr": "\u046A", "/Yusbigcyrillic": "\u046A", "/Yusbigiotifiedcyr": "\u046C", "/Yusbigiotifiedcyrillic": "\u046C", "/Yuslittlecyr": "\u0466", "/Yuslittlecyrillic": "\u0466", "/Yuslittleiotifiedcyr": "\u0468", "/Yuslittleiotifiedcyrillic": "\u0468", "/Z": "\u005A", "/Zaarmenian": "\u0536", "/Zacute": "\u0179", "/Zcaron": "\u017D", "/Zcaronsmall": "\uF6FF", "/Zcircle": "\u24CF", "/Zcircleblack": "\u1F169", "/Zcircumflex": "\u1E90", "/Zdblstruck": "\u2124", "/Zdescender": "\u2C6B", "/Zdot": "\u017B", "/Zdotaccent": "\u017B", "/Zdotbelow": "\u1E92", "/Zecyr": "\u0417", "/Zecyrillic": "\u0417", "/Zedescendercyrillic": "\u0498", "/Zedieresiscyr": "\u04DE", "/Zedieresiscyrillic": "\u04DE", "/Zeta": "\u0396", "/Zetailcyr": "\u0498", "/Zfraktur": "\u2128", "/Zhearmenian": "\u053A", "/Zhebrevecyr": "\u04C1", "/Zhebrevecyrillic": "\u04C1", "/Zhecyr": "\u0416", "/Zhecyrillic": "\u0416", "/Zhedescendercyrillic": "\u0496", "/Zhedieresiscyr": "\u04DC", "/Zhedieresiscyrillic": "\u04DC", "/Zhetailcyr": "\u0496", "/Zhook": "\u0224", "/Zjekomicyr": "\u0504", "/Zlinebelow": "\u1E94", "/Zmonospace": "\uFF3A", "/Zparens": "\u1F129", "/Zsmall": "\uF77A", "/Zsquare": "\u1F149", "/Zsquareblack": "\u1F189", "/Zstroke": "\u01B5", "/Zswashtail": "\u2C7F", "/a": "\u0061", "/a.inferior": "\u2090", "/aHonRAA": 
"\u0613", "/aa": "\uA733", "/aabengali": "\u0986", "/aacute": "\u00E1", "/aadeva": "\u0906", "/aagujarati": "\u0A86", "/aagurmukhi": "\u0A06", "/aamatragurmukhi": "\u0A3E", "/aarusquare": "\u3303", "/aavowelsignbengali": "\u09BE", "/aavowelsigndeva": "\u093E", "/aavowelsigngujarati": "\u0ABE", "/abbreviationmarkarmenian": "\u055F", "/abbreviationsigndeva": "\u0970", "/abengali": "\u0985", "/abopomofo": "\u311A", "/abreve": "\u0103", "/abreveacute": "\u1EAF", "/abrevecyr": "\u04D1", "/abrevecyrillic": "\u04D1", "/abrevedotbelow": "\u1EB7", "/abrevegrave": "\u1EB1", "/abrevehoi": "\u1EB3", "/abrevehookabove": "\u1EB3", "/abrevetilde": "\u1EB5", "/absquareblack": "\u1F18E", "/acaron": "\u01CE", "/accountof": "\u2100", "/accurrent": "\u23E6", "/acircle": "\u24D0", "/acirclekatakana": "\u32D0", "/acircumflex": "\u00E2", "/acircumflexacute": "\u1EA5", "/acircumflexdotbelow": "\u1EAD", "/acircumflexgrave": "\u1EA7", "/acircumflexhoi": "\u1EA9", "/acircumflexhookabove": "\u1EA9", "/acircumflextilde": "\u1EAB", "/activatearabicformshaping": "\u206D", "/activatesymmetricswapping": "\u206B", "/acute": "\u00B4", "/acutebelowcmb": "\u0317", "/acutecmb": "\u0301", "/acutecomb": "\u0301", "/acutedblmiddlemod": "\u02F6", "/acutedeva": "\u0954", "/acutelowmod": "\u02CF", "/acutemod": "\u02CA", "/acutetonecmb": "\u0341", "/acyr": "\u0430", "/acyrillic": "\u0430", "/adblgrave": "\u0201", "/addakgurmukhi": "\u0A71", "/addressedsubject": "\u2101", "/adegadegpada": "\uA9CB", "/adegpada": "\uA9CA", "/adeva": "\u0905", "/adieresis": "\u00E4", "/adieresiscyr": "\u04D3", "/adieresiscyrillic": "\u04D3", "/adieresismacron": "\u01DF", "/adishakti": "\u262C", "/admissionTickets": "\u1F39F", "/adot": "\u0227", "/adotbelow": "\u1EA1", "/adotmacron": "\u01E1", "/ae": "\u00E6", "/aeacute": "\u01FD", "/aekorean": "\u3150", "/aemacron": "\u01E3", "/aerialTramway": "\u1F6A1", "/afghani": "\u060B", "/afii00208": "\u2015", "/afii08941": "\u20A4", "/afii10017": "\u0410", "/afii10018": "\u0411", 
"/afii10019": "\u0412", "/afii10020": "\u0413", "/afii10021": "\u0414", "/afii10022": "\u0415", "/afii10023": "\u0401", "/afii10024": "\u0416", "/afii10025": "\u0417", "/afii10026": "\u0418", "/afii10027": "\u0419", "/afii10028": "\u041A", "/afii10029": "\u041B", "/afii10030": "\u041C", "/afii10031": "\u041D", "/afii10032": "\u041E", "/afii10033": "\u041F", "/afii10034": "\u0420", "/afii10035": "\u0421", "/afii10036": "\u0422", "/afii10037": "\u0423", "/afii10038": "\u0424", "/afii10039": "\u0425", "/afii10040": "\u0426", "/afii10041": "\u0427", "/afii10042": "\u0428", "/afii10043": "\u0429", "/afii10044": "\u042A", "/afii10045": "\u042B", "/afii10046": "\u042C", "/afii10047": "\u042D", "/afii10048": "\u042E", "/afii10049": "\u042F", "/afii10050": "\u0490", "/afii10051": "\u0402", "/afii10052": "\u0403", "/afii10053": "\u0404", "/afii10054": "\u0405", "/afii10055": "\u0406", "/afii10056": "\u0407", "/afii10057": "\u0408", "/afii10058": "\u0409", "/afii10059": "\u040A", "/afii10060": "\u040B", "/afii10061": "\u040C", "/afii10062": "\u040E", "/afii10063": "\uF6C4", "/afii10064": "\uF6C5", "/afii10065": "\u0430", "/afii10066": "\u0431", "/afii10067": "\u0432", "/afii10068": "\u0433", "/afii10069": "\u0434", "/afii10070": "\u0435", "/afii10071": "\u0451", "/afii10072": "\u0436", "/afii10073": "\u0437", "/afii10074": "\u0438", "/afii10075": "\u0439", "/afii10076": "\u043A", "/afii10077": "\u043B", "/afii10078": "\u043C", "/afii10079": "\u043D", "/afii10080": "\u043E", "/afii10081": "\u043F", "/afii10082": "\u0440", "/afii10083": "\u0441", "/afii10084": "\u0442", "/afii10085": "\u0443", "/afii10086": "\u0444", "/afii10087": "\u0445", "/afii10088": "\u0446", "/afii10089": "\u0447", "/afii10090": "\u0448", "/afii10091": "\u0449", "/afii10092": "\u044A", "/afii10093": "\u044B", "/afii10094": "\u044C", "/afii10095": "\u044D", "/afii10096": "\u044E", "/afii10097": "\u044F", "/afii10098": "\u0491", "/afii10099": "\u0452", "/afii10100": "\u0453", "/afii10101": "\u0454", 
"/afii10102": "\u0455", "/afii10103": "\u0456", "/afii10104": "\u0457", "/afii10105": "\u0458", "/afii10106": "\u0459", "/afii10107": "\u045A", "/afii10108": "\u045B", "/afii10109": "\u045C", "/afii10110": "\u045E", "/afii10145": "\u040F", "/afii10146": "\u0462", "/afii10147": "\u0472", "/afii10148": "\u0474", "/afii10192": "\uF6C6", "/afii10193": "\u045F", "/afii10194": "\u0463", "/afii10195": "\u0473", "/afii10196": "\u0475", "/afii10831": "\uF6C7", "/afii10832": "\uF6C8", "/afii10846": "\u04D9", "/afii299": "\u200E", "/afii300": "\u200F", "/afii301": "\u200D", "/afii57381": "\u066A", "/afii57388": "\u060C", "/afii57392": "\u0660", "/afii57393": "\u0661", "/afii57394": "\u0662", "/afii57395": "\u0663", "/afii57396": "\u0664", "/afii57397": "\u0665", "/afii57398": "\u0666", "/afii57399": "\u0667", "/afii57400": "\u0668", "/afii57401": "\u0669", "/afii57403": "\u061B", "/afii57407": "\u061F", "/afii57409": "\u0621", "/afii57410": "\u0622", "/afii57411": "\u0623", "/afii57412": "\u0624", "/afii57413": "\u0625", "/afii57414": "\u0626", "/afii57415": "\u0627", "/afii57416": "\u0628", "/afii57417": "\u0629", "/afii57418": "\u062A", "/afii57419": "\u062B", "/afii57420": "\u062C", "/afii57421": "\u062D", "/afii57422": "\u062E", "/afii57423": "\u062F", "/afii57424": "\u0630", "/afii57425": "\u0631", "/afii57426": "\u0632", "/afii57427": "\u0633", "/afii57428": "\u0634", "/afii57429": "\u0635", "/afii57430": "\u0636", "/afii57431": "\u0637", "/afii57432": "\u0638", "/afii57433": "\u0639", "/afii57434": "\u063A", "/afii57440": "\u0640", "/afii57441": "\u0641", "/afii57442": "\u0642", "/afii57443": "\u0643", "/afii57444": "\u0644", "/afii57445": "\u0645", "/afii57446": "\u0646", "/afii57448": "\u0648", "/afii57449": "\u0649", "/afii57450": "\u064A", "/afii57451": "\u064B", "/afii57452": "\u064C", "/afii57453": "\u064D", "/afii57454": "\u064E", "/afii57455": "\u064F", "/afii57456": "\u0650", "/afii57457": "\u0651", "/afii57458": "\u0652", "/afii57470": "\u0647", "/afii57505": 
"\u06A4", "/afii57506": "\u067E", "/afii57507": "\u0686", "/afii57508": "\u0698", "/afii57509": "\u06AF", "/afii57511": "\u0679", "/afii57512": "\u0688", "/afii57513": "\u0691", "/afii57514": "\u06BA", "/afii57519": "\u06D2", "/afii57534": "\u06D5", "/afii57636": "\u20AA", "/afii57645": "\u05BE", "/afii57658": "\u05C3", "/afii57664": "\u05D0", "/afii57665": "\u05D1", "/afii57666": "\u05D2", "/afii57667": "\u05D3", "/afii57668": "\u05D4", "/afii57669": "\u05D5", "/afii57670": "\u05D6", "/afii57671": "\u05D7", "/afii57672": "\u05D8", "/afii57673": "\u05D9", "/afii57674": "\u05DA", "/afii57675": "\u05DB", "/afii57676": "\u05DC", "/afii57677": "\u05DD", "/afii57678": "\u05DE", "/afii57679": "\u05DF", "/afii57680": "\u05E0", "/afii57681": "\u05E1", "/afii57682": "\u05E2", "/afii57683": "\u05E3", "/afii57684": "\u05E4", "/afii57685": "\u05E5", "/afii57686": "\u05E6", "/afii57687": "\u05E7", "/afii57688": "\u05E8", "/afii57689": "\u05E9", "/afii57690": "\u05EA", "/afii57694": "\uFB2A", "/afii57695": "\uFB2B", "/afii57700": "\uFB4B", "/afii57705": "\uFB1F", "/afii57716": "\u05F0", "/afii57717": "\u05F1", "/afii57718": "\u05F2", "/afii57723": "\uFB35", "/afii57793": "\u05B4", "/afii57794": "\u05B5", "/afii57795": "\u05B6", "/afii57796": "\u05BB", "/afii57797": "\u05B8", "/afii57798": "\u05B7", "/afii57799": "\u05B0", "/afii57800": "\u05B2", "/afii57801": "\u05B1", "/afii57802": "\u05B3", "/afii57803": "\u05C2", "/afii57804": "\u05C1", "/afii57806": "\u05B9", "/afii57807": "\u05BC", "/afii57839": "\u05BD", "/afii57841": "\u05BF", "/afii57842": "\u05C0", "/afii57929": "\u02BC", "/afii61248": "\u2105", "/afii61289": "\u2113", "/afii61352": "\u2116", "/afii61573": "\u202C", "/afii61574": "\u202D", "/afii61575": "\u202E", "/afii61664": "\u200C", "/afii63167": "\u066D", "/afii64937": "\u02BD", "/agrave": "\u00E0", "/agravedbl": "\u0201", "/agujarati": "\u0A85", "/agurmukhi": "\u0A05", "/ahiragana": "\u3042", "/ahoi": "\u1EA3", "/ahookabove": "\u1EA3", "/aibengali": "\u0990", 
"/aibopomofo": "\u311E", "/aideva": "\u0910", "/aiecyr": "\u04D5", "/aiecyrillic": "\u04D5", "/aigujarati": "\u0A90", "/aigurmukhi": "\u0A10", "/aimatragurmukhi": "\u0A48", "/ain.fina": "\uFECA", "/ain.init": "\uFECB", "/ain.init_alefmaksura.fina": "\uFCF7", "/ain.init_jeem.fina": "\uFC29", "/ain.init_jeem.medi": "\uFCBA", "/ain.init_jeem.medi_meem.medi": "\uFDC4", "/ain.init_meem.fina": "\uFC2A", "/ain.init_meem.medi": "\uFCBB", "/ain.init_meem.medi_meem.medi": "\uFD77", "/ain.init_yeh.fina": "\uFCF8", "/ain.isol": "\uFEC9", "/ain.medi": "\uFECC", "/ain.medi_alefmaksura.fina": "\uFD13", "/ain.medi_jeem.medi_meem.fina": "\uFD75", "/ain.medi_meem.medi_alefmaksura.fina": "\uFD78", "/ain.medi_meem.medi_meem.fina": "\uFD76", "/ain.medi_meem.medi_yeh.fina": "\uFDB6", "/ain.medi_yeh.fina": "\uFD14", "/ainThreeDotsDownAbove": "\u075E", "/ainTwoDotsAbove": "\u075D", "/ainTwoDotsVerticallyAbove": "\u075F", "/ainarabic": "\u0639", "/ainfinalarabic": "\uFECA", "/aininitialarabic": "\uFECB", "/ainmedialarabic": "\uFECC", "/ainthreedotsabove": "\u06A0", "/ainvertedbreve": "\u0203", "/airplaneArriving": "\u1F6EC", "/airplaneDeparture": "\u1F6EB", "/aivowelsignbengali": "\u09C8", "/aivowelsigndeva": "\u0948", "/aivowelsigngujarati": "\u0AC8", "/akatakana": "\u30A2", "/akatakanahalfwidth": "\uFF71", "/akorean": "\u314F", "/aktieselskab": "\u214D", "/alarmclock": "\u23F0", "/alef": "\u05D0", "/alef.fina": "\uFE8E", "/alef.init_fathatan.fina": "\uFD3D", "/alef.isol": "\uFE8D", "/alef.medi_fathatan.fina": "\uFD3C", "/alef:hb": "\u05D0", "/alefDigitThreeAbove": "\u0774", "/alefDigitTwoAbove": "\u0773", "/alefLamYehabove": "\u0616", "/alefabove": "\u0670", "/alefarabic": "\u0627", "/alefdageshhebrew": "\uFB30", "/aleffinalarabic": "\uFE8E", "/alefhamza": "\u0623", "/alefhamza.fina": "\uFE84", "/alefhamza.isol": "\uFE83", "/alefhamzaabovearabic": "\u0623", "/alefhamzaabovefinalarabic": "\uFE84", "/alefhamzabelow": "\u0625", "/alefhamzabelow.fina": "\uFE88", "/alefhamzabelow.isol": 
"\uFE87", "/alefhamzabelowarabic": "\u0625", "/alefhamzabelowfinalarabic": "\uFE88", "/alefhebrew": "\u05D0", "/alefhighhamza": "\u0675", "/aleflamedhebrew": "\uFB4F", "/alefmadda": "\u0622", "/alefmadda.fina": "\uFE82", "/alefmadda.isol": "\uFE81", "/alefmaddaabovearabic": "\u0622", "/alefmaddaabovefinalarabic": "\uFE82", "/alefmaksura": "\u0649", "/alefmaksura.fina": "\uFEF0", "/alefmaksura.init_superscriptalef.fina": "\uFC5D", "/alefmaksura.isol": "\uFEEF", "/alefmaksura.medi_superscriptalef.fina": "\uFC90", "/alefmaksuraarabic": "\u0649", "/alefmaksurafinalarabic": "\uFEF0", "/alefmaksurainitialarabic": "\uFEF3", "/alefmaksuramedialarabic": "\uFEF4", "/alefpatahhebrew": "\uFB2E", "/alefqamatshebrew": "\uFB2F", "/alefwasla": "\u0671", "/alefwasla.fina": "\uFB51", "/alefwasla.isol": "\uFB50", "/alefwavyhamza": "\u0672", "/alefwavyhamzabelow": "\u0673", "/alefwide:hb": "\uFB21", "/alefwithmapiq:hb": "\uFB30", "/alefwithpatah:hb": "\uFB2E", "/alefwithqamats:hb": "\uFB2F", "/alembic": "\u2697", "/aleph": "\u2135", "/alienMonster": "\u1F47E", "/allaroundprofile": "\u232E", "/allequal": "\u224C", "/allianceideographiccircled": "\u32AF", "/allianceideographicparen": "\u323F", "/almostequalorequal": "\u224A", "/alpha": "\u03B1", "/alphaacute": "\u1F71", "/alphaacuteiotasub": "\u1FB4", "/alphaasper": "\u1F01", "/alphaasperacute": "\u1F05", "/alphaasperacuteiotasub": "\u1F85", "/alphaaspergrave": "\u1F03", "/alphaaspergraveiotasub": "\u1F83", "/alphaasperiotasub": "\u1F81", "/alphaaspertilde": "\u1F07", "/alphaaspertildeiotasub": "\u1F87", "/alphabreve": "\u1FB0", "/alphafunc": "\u237A", "/alphagrave": "\u1F70", "/alphagraveiotasub": "\u1FB2", "/alphaiotasub": "\u1FB3", "/alphalenis": "\u1F00", "/alphalenisacute": "\u1F04", "/alphalenisacuteiotasub": "\u1F84", "/alphalenisgrave": "\u1F02", "/alphalenisgraveiotasub": "\u1F82", "/alphalenisiotasub": "\u1F80", "/alphalenistilde": "\u1F06", "/alphalenistildeiotasub": "\u1F86", "/alphatilde": "\u1FB6", "/alphatildeiotasub": 
"\u1FB7", "/alphatonos": "\u03AC", "/alphaturned": "\u0252", "/alphaunderlinefunc": "\u2376", "/alphawithmacron": "\u1FB1", "/alternateonewayleftwaytraffic": "\u26D5", "/alternative": "\u2387", "/amacron": "\u0101", "/ambulance": "\u1F691", "/americanFootball": "\u1F3C8", "/amfullwidth": "\u33C2", "/amonospace": "\uFF41", "/amountofcheck": "\u2447", "/ampersand": "\u0026", "/ampersandSindhi": "\u06FD", "/ampersandmonospace": "\uFF06", "/ampersandsmall": "\uF726", "/ampersandturned": "\u214B", "/amphora": "\u1F3FA", "/amsquare": "\u33C2", "/anbopomofo": "\u3122", "/anchor": "\u2693", "/ancoradown": "\u2E14", "/ancoraup": "\u2E15", "/andappada": "\uA9C3", "/angbopomofo": "\u3124", "/anger": "\u1F4A2", "/angkhankhuthai": "\u0E5A", "/angle": "\u2220", "/anglearcright": "\u22BE", "/anglebracketleft": "\u3008", "/anglebracketleftvertical": "\uFE3F", "/anglebracketright": "\u3009", "/anglebracketrightvertical": "\uFE40", "/angledottedright": "\u2E16", "/angleleft": "\u2329", "/anglemarkerdottedsubstitutionright": "\u2E01", "/anglemarkersubstitutionright": "\u2E00", "/angleright": "\u232A", "/anglezigzagarrowdownright": "\u237C", "/angryFace": "\u1F620", "/angstrom": "\u212B", "/anguishedFace": "\u1F627", "/ankh": "\u2625", "/anoteleia": "\u0387", "/anpeasquare": "\u3302", "/ant": "\u1F41C", "/antennaBars": "\u1F4F6", "/anticlockwiseDownwardsAndUpwardsOpenCircleArrows": "\u1F504", "/anudattadeva": "\u0952", "/anusvarabengali": "\u0982", "/anusvaradeva": "\u0902", "/anusvaragujarati": "\u0A82", "/ao": "\uA735", "/aogonek": "\u0105", "/aovermfullwidth": "\u33DF", "/apaatosquare": "\u3300", "/aparen": "\u249C", "/aparenthesized": "\u249C", "/apostrophearmenian": "\u055A", "/apostrophedblmod": "\u02EE", "/apostrophemod": "\u02BC", "/apple": "\uF8FF", "/approaches": "\u2250", "/approacheslimit": "\u2250", "/approxequal": "\u2248", "/approxequalorimage": "\u2252", "/approximatelybutnotactuallyequal": "\u2246", "/approximatelyequal": "\u2245", "/approximatelyequalorimage": 
"\u2252", "/apriltelegraph": "\u32C3", "/aquarius": "\u2652", "/ar:ae": "\u06D5", "/ar:ain": "\u0639", "/ar:alef": "\u0627", "/ar:comma": "\u060C", "/ar:cuberoot": "\u0606", "/ar:decimalseparator": "\u066B", "/ar:e": "\u06D0", "/ar:eight": "\u0668", "/ar:feh": "\u0641", "/ar:five": "\u0665", "/ar:four": "\u0664", "/ar:fourthroot": "\u0607", "/ar:kaf": "\u0643", "/ar:ng": "\u06AD", "/ar:nine": "\u0669", "/ar:numbersign": "\u0600", "/ar:oe": "\u06C6", "/ar:one": "\u0661", "/ar:peh": "\u067E", "/ar:percent": "\u066A", "/ar:perthousand": "\u060A", "/ar:question": "\u061F", "/ar:reh": "\u0631", "/ar:semicolon": "\u061B", "/ar:seven": "\u0667", "/ar:shadda": "\u0651", "/ar:six": "\u0666", "/ar:sukun": "\u0652", "/ar:three": "\u0663", "/ar:two": "\u0662", "/ar:u": "\u06C7", "/ar:ve": "\u06CB", "/ar:yu": "\u06C8", "/ar:zero": "\u0660", "/araeaekorean": "\u318E", "/araeakorean": "\u318D", "/arc": "\u2312", "/archaicmepigraphic": "\uA7FF", "/aries": "\u2648", "/arighthalfring": "\u1E9A", "/aring": "\u00E5", "/aringacute": "\u01FB", "/aringbelow": "\u1E01", "/armn:Ayb": "\u0531", "/armn:Ben": "\u0532", "/armn:Ca": "\u053E", "/armn:Cha": "\u0549", "/armn:Cheh": "\u0543", "/armn:Co": "\u0551", "/armn:DRAMSIGN": "\u058F", "/armn:Da": "\u0534", "/armn:Ech": "\u0535", "/armn:Eh": "\u0537", "/armn:Et": "\u0538", "/armn:Feh": "\u0556", "/armn:Ghad": "\u0542", "/armn:Gim": "\u0533", "/armn:Ho": "\u0540", "/armn:Ini": "\u053B", "/armn:Ja": "\u0541", "/armn:Jheh": "\u054B", "/armn:Keh": "\u0554", "/armn:Ken": "\u053F", "/armn:Liwn": "\u053C", "/armn:Men": "\u0544", "/armn:Now": "\u0546", "/armn:Oh": "\u0555", "/armn:Peh": "\u054A", "/armn:Piwr": "\u0553", "/armn:Ra": "\u054C", "/armn:Reh": "\u0550", "/armn:Seh": "\u054D", "/armn:Sha": "\u0547", "/armn:Tiwn": "\u054F", "/armn:To": "\u0539", "/armn:Vew": "\u054E", "/armn:Vo": "\u0548", "/armn:Xeh": "\u053D", "/armn:Yi": "\u0545", "/armn:Yiwn": "\u0552", "/armn:Za": "\u0536", "/armn:Zhe": "\u053A", "/armn:abbreviationmark": "\u055F", 
"/armn:apostrophe": "\u055A", "/armn:ayb": "\u0561", "/armn:ben": "\u0562", "/armn:ca": "\u056E", "/armn:cha": "\u0579", "/armn:cheh": "\u0573", "/armn:co": "\u0581", "/armn:comma": "\u055D", "/armn:da": "\u0564", "/armn:ech": "\u0565", "/armn:ech_yiwn": "\u0587", "/armn:eh": "\u0567", "/armn:emphasismark": "\u055B", "/armn:et": "\u0568", "/armn:exclam": "\u055C", "/armn:feh": "\u0586", "/armn:ghad": "\u0572", "/armn:gim": "\u0563", "/armn:ho": "\u0570", "/armn:hyphen": "\u058A", "/armn:ini": "\u056B", "/armn:ja": "\u0571", "/armn:jheh": "\u057B", "/armn:keh": "\u0584", "/armn:ken": "\u056F", "/armn:leftfacingeternitysign": "\u058E", "/armn:liwn": "\u056C", "/armn:men": "\u0574", "/armn:men_ech": "\uFB14", "/armn:men_ini": "\uFB15", "/armn:men_now": "\uFB13", "/armn:men_xeh": "\uFB17", "/armn:now": "\u0576", "/armn:oh": "\u0585", "/armn:peh": "\u057A", "/armn:period": "\u0589", "/armn:piwr": "\u0583", "/armn:question": "\u055E", "/armn:ra": "\u057C", "/armn:reh": "\u0580", "/armn:rightfacingeternitysign": "\u058D", "/armn:ringhalfleft": "\u0559", "/armn:seh": "\u057D", "/armn:sha": "\u0577", "/armn:tiwn": "\u057F", "/armn:to": "\u0569", "/armn:vew": "\u057E", "/armn:vew_now": "\uFB16", "/armn:vo": "\u0578", "/armn:xeh": "\u056D", "/armn:yi": "\u0575", "/armn:yiwn": "\u0582", "/armn:za": "\u0566", "/armn:zhe": "\u056A", "/arrowNE": "\u2197", "/arrowNW": "\u2196", "/arrowSE": "\u2198", "/arrowSW": "\u2199", "/arrowanticlockwiseopencircle": "\u21BA", "/arrowanticlockwisesemicircle": "\u21B6", "/arrowboth": "\u2194", "/arrowclockwiseopencircle": "\u21BB", "/arrowclockwisesemicircle": "\u21B7", "/arrowdashdown": "\u21E3", "/arrowdashleft": "\u21E0", "/arrowdashright": "\u21E2", "/arrowdashup": "\u21E1", "/arrowdblboth": "\u21D4", "/arrowdbldown": "\u21D3", "/arrowdblleft": "\u21D0", "/arrowdblright": "\u21D2", "/arrowdblup": "\u21D1", "/arrowdown": "\u2193", "/arrowdowndashed": "\u21E3", "/arrowdownfrombar": "\u21A7", "/arrowdownleft": "\u2199", "/arrowdownright": 
"\u2198", "/arrowdowntwoheaded": "\u21A1", "/arrowdownwhite": "\u21E9", "/arrowdownzigzag": "\u21AF", "/arrowheaddown": "\u2304", "/arrowheaddownlowmod": "\u02EF", "/arrowheaddownmod": "\u02C5", "/arrowheadleftlowmod": "\u02F1", "/arrowheadleftmod": "\u02C2", "/arrowheadrightlowmod": "\u02F2", "/arrowheadrightmod": "\u02C3", "/arrowheadtwobarsuphorizontal": "\u2324", "/arrowheadup": "\u2303", "/arrowheaduplowmod": "\u02F0", "/arrowheadupmod": "\u02C4", "/arrowhorizex": "\uF8E7", "/arrowleft": "\u2190", "/arrowleftdashed": "\u21E0", "/arrowleftdbl": "\u21D0", "/arrowleftdblstroke": "\u21CD", "/arrowleftdowncorner": "\u21B5", "/arrowleftdowntip": "\u21B2", "/arrowleftfrombar": "\u21A4", "/arrowlefthook": "\u21A9", "/arrowleftloop": "\u21AB", "/arrowleftlowmod": "\u02FF", "/arrowleftoverright": "\u21C6", "/arrowleftoverrighttobar": "\u21B9", "/arrowleftright": "\u2194", "/arrowleftrightstroke": "\u21AE", "/arrowleftrightwave": "\u21AD", "/arrowleftsquiggle": "\u21DC", "/arrowleftstroke": "\u219A", "/arrowlefttail": "\u21A2", "/arrowlefttobar": "\u21E4", "/arrowlefttwoheaded": "\u219E", "/arrowleftuptip": "\u21B0", "/arrowleftwave": "\u219C", "/arrowleftwhite": "\u21E6", "/arrowlongNWtobar": "\u21B8", "/arrowright": "\u2192", "/arrowrightdashed": "\u21E2", "/arrowrightdblstroke": "\u21CF", "/arrowrightdowncorner": "\u21B4", "/arrowrightdowntip": "\u21B3", "/arrowrightfrombar": "\u21A6", "/arrowrightheavy": "\u279E", "/arrowrighthook": "\u21AA", "/arrowrightloop": "\u21AC", "/arrowrightoverleft": "\u21C4", "/arrowrightsmallcircle": "\u21F4", "/arrowrightsquiggle": "\u21DD", "/arrowrightstroke": "\u219B", "/arrowrighttail": "\u21A3", "/arrowrighttobar": "\u21E5", "/arrowrighttwoheaded": "\u21A0", "/arrowrightwave": "\u219D", "/arrowrightwhite": "\u21E8", "/arrowspaireddown": "\u21CA", "/arrowspairedleft": "\u21C7", "/arrowspairedright": "\u21C9", "/arrowspairedup": "\u21C8", "/arrowtableft": "\u21E4", "/arrowtabright": "\u21E5", "/arrowup": "\u2191", "/arrowupdashed": 
"\u21E1", "/arrowupdn": "\u2195", "/arrowupdnbse": "\u21A8", "/arrowupdown": "\u2195", "/arrowupdownbase": "\u21A8", "/arrowupdownwithbase": "\u21A8", "/arrowupfrombar": "\u21A5", "/arrowupleft": "\u2196", "/arrowupleftofdown": "\u21C5", "/arrowupright": "\u2197", "/arrowuprighttip": "\u21B1", "/arrowuptwoheaded": "\u219F", "/arrowupwhite": "\u21E7", "/arrowvertex": "\uF8E6", "/articulatedLorry": "\u1F69B", "/artistPalette": "\u1F3A8", "/aruhuasquare": "\u3301", "/asciicircum": "\u005E", "/asciicircummonospace": "\uFF3E", "/asciitilde": "\u007E", "/asciitildemonospace": "\uFF5E", "/ascript": "\u0251", "/ascriptturned": "\u0252", "/asmallhiragana": "\u3041", "/asmallkatakana": "\u30A1", "/asmallkatakanahalfwidth": "\uFF67", "/asper": "\u1FFE", "/asperacute": "\u1FDE", "/aspergrave": "\u1FDD", "/aspertilde": "\u1FDF", "/assertion": "\u22A6", "/asterisk": "\u002A", "/asteriskaltonearabic": "\u066D", "/asteriskarabic": "\u066D", "/asteriskmath": "\u2217", "/asteriskmonospace": "\uFF0A", "/asterisksmall": "\uFE61", "/asterism": "\u2042", "/astonishedFace": "\u1F632", "/astroke": "\u2C65", "/astronomicaluranus": "\u26E2", "/asuperior": "\uF6E9", "/asympticallyequal": "\u2243", "/asymptoticallyequal": "\u2243", "/at": "\u0040", "/athleticShoe": "\u1F45F", "/atilde": "\u00E3", "/atmonospace": "\uFF20", "/atnachHafukh:hb": "\u05A2", "/atom": "\u269B", "/atsmall": "\uFE6B", "/attentionideographiccircled": "\u329F", "/aturned": "\u0250", "/au": "\uA737", "/aubengali": "\u0994", "/aubergine": "\u1F346", "/aubopomofo": "\u3120", "/audeva": "\u0914", "/aufullwidth": "\u3373", "/augujarati": "\u0A94", "/augurmukhi": "\u0A14", "/augusttelegraph": "\u32C7", "/aulengthmarkbengali": "\u09D7", "/aumatragurmukhi": "\u0A4C", "/austral": "\u20B3", "/automatedTellerMachine": "\u1F3E7", "/automobile": "\u1F697", "/auvowelsignbengali": "\u09CC", "/auvowelsigndeva": "\u094C", "/auvowelsigngujarati": "\u0ACC", "/av": "\uA739", "/avagrahadeva": "\u093D", "/avhorizontalbar": "\uA73B", "/ay": 
"\uA73D", "/aybarmenian": "\u0561", "/ayin": "\u05E2", "/ayin:hb": "\u05E2", "/ayinalt:hb": "\uFB20", "/ayinaltonehebrew": "\uFB20", "/ayinhebrew": "\u05E2", "/azla:hb": "\u059C", "/b": "\u0062", "/baarerusquare": "\u332D", "/babengali": "\u09AC", "/babyAngel": "\u1F47C", "/babyBottle": "\u1F37C", "/babyChick": "\u1F424", "/backLeftwardsArrowAbove": "\u1F519", "/backOfEnvelope": "\u1F582", "/backslash": "\u005C", "/backslashbarfunc": "\u2340", "/backslashdbl": "\u244A", "/backslashmonospace": "\uFF3C", "/bactrianCamel": "\u1F42B", "/badeva": "\u092C", "/badmintonRacquetAndShuttlecock": "\u1F3F8", "/bagdelimitersshapeleft": "\u27C5", "/bagdelimitersshaperight": "\u27C6", "/baggageClaim": "\u1F6C4", "/bagujarati": "\u0AAC", "/bagurmukhi": "\u0A2C", "/bahiragana": "\u3070", "/bahtthai": "\u0E3F", "/bakatakana": "\u30D0", "/balloon": "\u1F388", "/ballotBoldScriptX": "\u1F5F6", "/ballotBoxBallot": "\u1F5F3", "/ballotBoxBoldCheck": "\u1F5F9", "/ballotBoxBoldScriptX": "\u1F5F7", "/ballotBoxScriptX": "\u1F5F5", "/ballotScriptX": "\u1F5F4", "/bamurda": "\uA9A8", "/banana": "\u1F34C", "/bank": "\u1F3E6", "/banknoteDollarSign": "\u1F4B5", "/banknoteEuroSign": "\u1F4B6", "/banknotePoundSign": "\u1F4B7", "/banknoteYenSign": "\u1F4B4", "/bar": "\u007C", "/barChart": "\u1F4CA", "/barberPole": "\u1F488", "/barfullwidth": "\u3374", "/barmonospace": "\uFF5C", "/barquillverticalleft": "\u2E20", "/barquillverticalright": "\u2E21", "/baseball": "\u26BE", "/basketballAndHoop": "\u1F3C0", "/bath": "\u1F6C0", "/bathtub": "\u1F6C1", "/battery": "\u1F50B", "/bbopomofo": "\u3105", "/bcircle": "\u24D1", "/bdot": "\u1E03", "/bdotaccent": "\u1E03", "/bdotbelow": "\u1E05", "/beachUmbrella": "\u1F3D6", "/beamedAscendingMusicalNotes": "\u1F39C", "/beamedDescendingMusicalNotes": "\u1F39D", "/beamedeighthnotes": "\u266B", "/beamedsixteenthnotes": "\u266C", "/beamfunc": "\u2336", "/bearFace": "\u1F43B", "/beatingHeart": "\u1F493", "/because": "\u2235", "/becyr": "\u0431", "/becyrillic": "\u0431", 
"/bed": "\u1F6CF", "/beeh": "\u067B", "/beeh.fina": "\uFB53", "/beeh.init": "\uFB54", "/beeh.isol": "\uFB52", "/beeh.medi": "\uFB55", "/beerMug": "\u1F37A", "/beetasquare": "\u333C", "/beh": "\u0628", "/beh.fina": "\uFE90", "/beh.init": "\uFE91", "/beh.init_alefmaksura.fina": "\uFC09", "/beh.init_hah.fina": "\uFC06", "/beh.init_hah.medi": "\uFC9D", "/beh.init_heh.medi": "\uFCA0", "/beh.init_jeem.fina": "\uFC05", "/beh.init_jeem.medi": "\uFC9C", "/beh.init_khah.fina": "\uFC07", "/beh.init_khah.medi": "\uFC9E", "/beh.init_meem.fina": "\uFC08", "/beh.init_meem.medi": "\uFC9F", "/beh.init_yeh.fina": "\uFC0A", "/beh.isol": "\uFE8F", "/beh.medi": "\uFE92", "/beh.medi_alefmaksura.fina": "\uFC6E", "/beh.medi_hah.medi_yeh.fina": "\uFDC2", "/beh.medi_heh.medi": "\uFCE2", "/beh.medi_khah.medi_yeh.fina": "\uFD9E", "/beh.medi_meem.fina": "\uFC6C", "/beh.medi_meem.medi": "\uFCE1", "/beh.medi_noon.fina": "\uFC6D", "/beh.medi_reh.fina": "\uFC6A", "/beh.medi_yeh.fina": "\uFC6F", "/beh.medi_zain.fina": "\uFC6B", "/behDotBelowThreeDotsAbove": "\u0751", "/behInvertedSmallVBelow": "\u0755", "/behSmallV": "\u0756", "/behThreeDotsHorizontallyBelow": "\u0750", "/behThreeDotsUpBelow": "\u0752", "/behThreeDotsUpBelowTwoDotsAbove": "\u0753", "/behTwoDotsBelowDotAbove": "\u0754", "/beharabic": "\u0628", "/beheh": "\u0680", "/beheh.fina": "\uFB5B", "/beheh.init": "\uFB5C", "/beheh.isol": "\uFB5A", "/beheh.medi": "\uFB5D", "/behfinalarabic": "\uFE90", "/behinitialarabic": "\uFE91", "/behiragana": "\u3079", "/behmedialarabic": "\uFE92", "/behmeeminitialarabic": "\uFC9F", "/behmeemisolatedarabic": "\uFC08", "/behnoonfinalarabic": "\uFC6D", "/bekatakana": "\u30D9", "/bellCancellationStroke": "\u1F515", "/bellhopBell": "\u1F6CE", "/beltbuckle": "\u2444", "/benarmenian": "\u0562", "/beng:a": "\u0985", "/beng:aa": "\u0986", "/beng:aasign": "\u09BE", "/beng:abbreviationsign": "\u09FD", "/beng:ai": "\u0990", "/beng:aisign": "\u09C8", "/beng:anji": "\u0980", "/beng:anusvara": "\u0982", "/beng:au": 
"\u0994", "/beng:aulengthmark": "\u09D7", "/beng:ausign": "\u09CC", "/beng:avagraha": "\u09BD", "/beng:ba": "\u09AC", "/beng:bha": "\u09AD", "/beng:ca": "\u099A", "/beng:candrabindu": "\u0981", "/beng:cha": "\u099B", "/beng:currencyoneless": "\u09F8", "/beng:da": "\u09A6", "/beng:dda": "\u09A1", "/beng:ddha": "\u09A2", "/beng:dha": "\u09A7", "/beng:e": "\u098F", "/beng:eight": "\u09EE", "/beng:esign": "\u09C7", "/beng:five": "\u09EB", "/beng:four": "\u09EA", "/beng:fourcurrencynumerator": "\u09F7", "/beng:ga": "\u0997", "/beng:gandamark": "\u09FB", "/beng:gha": "\u0998", "/beng:ha": "\u09B9", "/beng:i": "\u0987", "/beng:ii": "\u0988", "/beng:iisign": "\u09C0", "/beng:isign": "\u09BF", "/beng:isshar": "\u09FA", "/beng:ja": "\u099C", "/beng:jha": "\u099D", "/beng:ka": "\u0995", "/beng:kha": "\u0996", "/beng:khandata": "\u09CE", "/beng:la": "\u09B2", "/beng:llvocal": "\u09E1", "/beng:llvocalsign": "\u09E3", "/beng:lvocal": "\u098C", "/beng:lvocalsign": "\u09E2", "/beng:ma": "\u09AE", "/beng:na": "\u09A8", "/beng:nga": "\u0999", "/beng:nine": "\u09EF", "/beng:nna": "\u09A3", "/beng:nukta": "\u09BC", "/beng:nya": "\u099E", "/beng:o": "\u0993", "/beng:one": "\u09E7", "/beng:onecurrencynumerator": "\u09F4", "/beng:osign": "\u09CB", "/beng:pa": "\u09AA", "/beng:pha": "\u09AB", "/beng:ra": "\u09B0", "/beng:ralowdiagonal": "\u09F1", "/beng:ramiddiagonal": "\u09F0", "/beng:rha": "\u09DD", "/beng:rra": "\u09DC", "/beng:rrvocal": "\u09E0", "/beng:rrvocalsign": "\u09C4", "/beng:rupee": "\u09F3", "/beng:rupeemark": "\u09F2", "/beng:rvocal": "\u098B", "/beng:rvocalsign": "\u09C3", "/beng:sa": "\u09B8", "/beng:seven": "\u09ED", "/beng:sha": "\u09B6", "/beng:six": "\u09EC", "/beng:sixteencurrencydenominator": "\u09F9", "/beng:ssa": "\u09B7", "/beng:ta": "\u09A4", "/beng:tha": "\u09A5", "/beng:three": "\u09E9", "/beng:threecurrencynumerator": "\u09F6", "/beng:tta": "\u099F", "/beng:ttha": "\u09A0", "/beng:two": "\u09E8", "/beng:twocurrencynumerator": "\u09F5", "/beng:u": "\u0989", 
"/beng:usign": "\u09C1", "/beng:uu": "\u098A", "/beng:uusign": "\u09C2", "/beng:vedicanusvara": "\u09FC", "/beng:virama": "\u09CD", "/beng:visarga": "\u0983", "/beng:ya": "\u09AF", "/beng:yya": "\u09DF", "/beng:zero": "\u09E6", "/bentoBox": "\u1F371", "/benzenering": "\u232C", "/benzeneringcircle": "\u23E3", "/bet": "\u05D1", "/bet:hb": "\u05D1", "/beta": "\u03B2", "/betasymbol": "\u03D0", "/betasymbolgreek": "\u03D0", "/betdagesh": "\uFB31", "/betdageshhebrew": "\uFB31", "/bethebrew": "\u05D1", "/betrafehebrew": "\uFB4C", "/between": "\u226C", "/betwithdagesh:hb": "\uFB31", "/betwithrafe:hb": "\uFB4C", "/bflourish": "\uA797", "/bhabengali": "\u09AD", "/bhadeva": "\u092D", "/bhagujarati": "\u0AAD", "/bhagurmukhi": "\u0A2D", "/bhook": "\u0253", "/bicycle": "\u1F6B2", "/bicyclist": "\u1F6B4", "/bihiragana": "\u3073", "/bikatakana": "\u30D3", "/bikini": "\u1F459", "/bilabialclick": "\u0298", "/billiards": "\u1F3B1", "/bindigurmukhi": "\u0A02", "/biohazard": "\u2623", "/bird": "\u1F426", "/birthdayCake": "\u1F382", "/birusquare": "\u3331", "/bishopblack": "\u265D", "/bishopwhite": "\u2657", "/bitcoin": "\u20BF", "/blackDownPointingBackhandIndex": "\u1F5A3", "/blackDroplet": "\u1F322", "/blackFolder": "\u1F5BF", "/blackHardShellFloppyDisk": "\u1F5AA", "/blackHeart": "\u1F5A4", "/blackLeftPointingBackhandIndex": "\u1F59C", "/blackPennant": "\u1F3F2", "/blackPushpin": "\u1F588", "/blackRightPointingBackhandIndex": "\u1F59D", "/blackRosette": "\u1F3F6", "/blackSkullAndCrossbones": "\u1F571", "/blackSquareButton": "\u1F532", "/blackTouchtoneTelephone": "\u1F57F", "/blackUpPointingBackhandIndex": "\u1F5A2", "/blackcircle": "\u25CF", "/blackcircleforrecord": "\u23FA", "/blackdiamond": "\u25C6", "/blackdownpointingtriangle": "\u25BC", "/blackforstopsquare": "\u23F9", "/blackleftpointingpointer": "\u25C4", "/blackleftpointingtriangle": "\u25C0", "/blacklenticularbracketleft": "\u3010", "/blacklenticularbracketleftvertical": "\uFE3B", "/blacklenticularbracketright": "\u3011", 
"/blacklenticularbracketrightvertical": "\uFE3C", "/blacklowerlefttriangle": "\u25E3", "/blacklowerrighttriangle": "\u25E2", "/blackmediumpointingtriangledown": "\u23F7", "/blackmediumpointingtriangleleft": "\u23F4", "/blackmediumpointingtriangleright": "\u23F5", "/blackmediumpointingtriangleup": "\u23F6", "/blackpointingdoubletrianglebarverticalleft": "\u23EE", "/blackpointingdoubletrianglebarverticalright": "\u23ED", "/blackpointingdoubletriangledown": "\u23EC", "/blackpointingdoubletriangleleft": "\u23EA", "/blackpointingdoubletriangleright": "\u23E9", "/blackpointingdoubletriangleup": "\u23EB", "/blackpointingtriangledoublebarverticalright": "\u23EF", "/blackrectangle": "\u25AC", "/blackrightpointingpointer": "\u25BA", "/blackrightpointingtriangle": "\u25B6", "/blacksmallsquare": "\u25AA", "/blacksmilingface": "\u263B", "/blacksquare": "\u25A0", "/blackstar": "\u2605", "/blackupperlefttriangle": "\u25E4", "/blackupperrighttriangle": "\u25E5", "/blackuppointingsmalltriangle": "\u25B4", "/blackuppointingtriangle": "\u25B2", "/blackwardsbulletleft": "\u204C", "/blackwardsbulletright": "\u204D", "/blank": "\u2423", "/blinebelow": "\u1E07", "/block": "\u2588", "/blossom": "\u1F33C", "/blowfish": "\u1F421", "/blueBook": "\u1F4D8", "/blueHeart": "\u1F499", "/bmonospace": "\uFF42", "/boar": "\u1F417", "/board": "\u2328", "/bobaimaithai": "\u0E1A", "/bohiragana": "\u307C", "/bokatakana": "\u30DC", "/bomb": "\u1F4A3", "/book": "\u1F56E", "/bookmark": "\u1F516", "/bookmarkTabs": "\u1F4D1", "/books": "\u1F4DA", "/bopo:a": "\u311A", "/bopo:ai": "\u311E", "/bopo:an": "\u3122", "/bopo:ang": "\u3124", "/bopo:au": "\u3120", "/bopo:b": "\u3105", "/bopo:c": "\u3118", "/bopo:ch": "\u3114", "/bopo:d": "\u3109", "/bopo:e": "\u311C", "/bopo:eh": "\u311D", "/bopo:ei": "\u311F", "/bopo:en": "\u3123", "/bopo:eng": "\u3125", "/bopo:er": "\u3126", "/bopo:f": "\u3108", "/bopo:g": "\u310D", "/bopo:gn": "\u312C", "/bopo:h": "\u310F", "/bopo:i": "\u3127", "/bopo:ih": "\u312D", "/bopo:iu": 
"\u3129", "/bopo:j": "\u3110", "/bopo:k": "\u310E", "/bopo:l": "\u310C", "/bopo:m": "\u3107", "/bopo:n": "\u310B", "/bopo:ng": "\u312B", "/bopo:o": "\u311B", "/bopo:ou": "\u3121", "/bopo:owithdotabove": "\u312E", "/bopo:p": "\u3106", "/bopo:q": "\u3111", "/bopo:r": "\u3116", "/bopo:s": "\u3119", "/bopo:sh": "\u3115", "/bopo:t": "\u310A", "/bopo:u": "\u3128", "/bopo:v": "\u312A", "/bopo:x": "\u3112", "/bopo:z": "\u3117", "/bopo:zh": "\u3113", "/borutosquare": "\u333E", "/bottlePoppingCork": "\u1F37E", "/bouquet": "\u1F490", "/bouquetOfFlowers": "\u1F395", "/bowAndArrow": "\u1F3F9", "/bowlOfHygieia": "\u1F54F", "/bowling": "\u1F3B3", "/boxlineverticalleft": "\u23B8", "/boxlineverticalright": "\u23B9", "/boy": "\u1F466", "/boys": "\u1F6C9", "/bparen": "\u249D", "/bparenthesized": "\u249D", "/bqfullwidth": "\u33C3", "/bqsquare": "\u33C3", "/braceex": "\uF8F4", "/braceleft": "\u007B", "/braceleftbt": "\uF8F3", "/braceleftmid": "\uF8F2", "/braceleftmonospace": "\uFF5B", "/braceleftsmall": "\uFE5B", "/bracelefttp": "\uF8F1", "/braceleftvertical": "\uFE37", "/braceright": "\u007D", "/bracerightbt": "\uF8FE", "/bracerightmid": "\uF8FD", "/bracerightmonospace": "\uFF5D", "/bracerightsmall": "\uFE5C", "/bracerighttp": "\uF8FC", "/bracerightvertical": "\uFE38", "/bracketangledblleft": "\u27EA", "/bracketangledblright": "\u27EB", "/bracketangleleft": "\u27E8", "/bracketangleright": "\u27E9", "/bracketbottomcurly": "\u23DF", "/bracketbottomsquare": "\u23B5", "/bracketcornerupleftsquare": "\u23A1", "/bracketcorneruprightsquare": "\u23A4", "/bracketdottedsubstitutionleft": "\u2E04", "/bracketdottedsubstitutionright": "\u2E05", "/bracketextensioncurly": "\u23AA", "/bracketextensionleftsquare": "\u23A2", "/bracketextensionrightsquare": "\u23A5", "/brackethalfbottomleft": "\u2E24", "/brackethalfbottomright": "\u2E25", "/brackethalftopleft": "\u2E22", "/brackethalftopright": "\u2E23", "/brackethookupleftcurly": "\u23A7", "/brackethookuprightcurly": "\u23AB", "/bracketleft": "\u005B", 
"/bracketleftbt": "\uF8F0", "/bracketleftex": "\uF8EF", "/bracketleftmonospace": "\uFF3B", "/bracketleftsquarequill": "\u2045", "/bracketlefttp": "\uF8EE", "/bracketlowercornerleftsquare": "\u23A3", "/bracketlowercornerrightsquare": "\u23A6", "/bracketlowerhookleftcurly": "\u23A9", "/bracketlowerhookrightcurly": "\u23AD", "/bracketmiddlepieceleftcurly": "\u23A8", "/bracketmiddlepiecerightcurly": "\u23AC", "/bracketoverbrackettopbottomsquare": "\u23B6", "/bracketparaphraselowleft": "\u2E1C", "/bracketparaphraselowright": "\u2E1D", "/bracketraisedleft": "\u2E0C", "/bracketraisedright": "\u2E0D", "/bracketright": "\u005D", "/bracketrightbt": "\uF8FB", "/bracketrightex": "\uF8FA", "/bracketrightmonospace": "\uFF3D", "/bracketrightsquarequill": "\u2046", "/bracketrighttp": "\uF8F9", "/bracketsectionupleftlowerrightcurly": "\u23B0", "/bracketsectionuprightlowerleftcurly": "\u23B1", "/bracketshellbottom": "\u23E1", "/bracketshelltop": "\u23E0", "/bracketshellwhiteleft": "\u27EC", "/bracketshellwhiteright": "\u27ED", "/bracketsubstitutionleft": "\u2E02", "/bracketsubstitutionright": "\u2E03", "/brackettopcurly": "\u23DE", "/brackettopsquare": "\u23B4", "/brackettranspositionleft": "\u2E09", "/brackettranspositionright": "\u2E0A", "/bracketwhitesquareleft": "\u27E6", "/bracketwhitesquareright": "\u27E7", "/branchbankidentification": "\u2446", "/bread": "\u1F35E", "/breve": "\u02D8", "/brevebelowcmb": "\u032E", "/brevecmb": "\u0306", "/breveinvertedbelowcmb": "\u032F", "/breveinvertedcmb": "\u0311", "/breveinverteddoublecmb": "\u0361", "/brevemetrical": "\u23D1", "/brideVeil": "\u1F470", "/bridgeAtNight": "\u1F309", "/bridgebelowcmb": "\u032A", "/bridgeinvertedbelowcmb": "\u033A", "/briefcase": "\u1F4BC", "/brll:blank": "\u2800", "/brokenHeart": "\u1F494", "/brokenbar": "\u00A6", "/brokencirclenorthwestarrow": "\u238B", "/bstroke": "\u0180", "/bsuperior": "\uF6EA", "/btopbar": "\u0183", "/bug": "\u1F41B", "/buhiragana": "\u3076", "/buildingConstruction": "\u1F3D7", 
"/bukatakana": "\u30D6", "/bullet": "\u2022", "/bulletinverse": "\u25D8", "/bulletoperator": "\u2219", "/bullhorn": "\u1F56B", "/bullhornSoundWaves": "\u1F56C", "/bullseye": "\u25CE", "/burrito": "\u1F32F", "/bus": "\u1F68C", "/busStop": "\u1F68F", "/bussyerusquare": "\u3334", "/bustInSilhouette": "\u1F464", "/bustsInSilhouette": "\u1F465", "/c": "\u0063", "/caarmenian": "\u056E", "/cabengali": "\u099A", "/cactus": "\u1F335", "/cacute": "\u0107", "/cadauna": "\u2106", "/cadeva": "\u091A", "/caduceus": "\u2624", "/cagujarati": "\u0A9A", "/cagurmukhi": "\u0A1A", "/cakraconsonant": "\uA9BF", "/calendar": "\u1F4C5", "/calfullwidth": "\u3388", "/callideographicparen": "\u323A", "/calsquare": "\u3388", "/camera": "\u1F4F7", "/cameraFlash": "\u1F4F8", "/camping": "\u1F3D5", "/camurda": "\uA996", "/cancellationX": "\u1F5D9", "/cancer": "\u264B", "/candle": "\u1F56F", "/candrabindubengali": "\u0981", "/candrabinducmb": "\u0310", "/candrabindudeva": "\u0901", "/candrabindugujarati": "\u0A81", "/candy": "\u1F36C", "/canoe": "\u1F6F6", "/capitulum": "\u2E3F", "/capricorn": "\u2651", "/capslock": "\u21EA", "/cardFileBox": "\u1F5C3", "/cardIndex": "\u1F4C7", "/cardIndexDividers": "\u1F5C2", "/careof": "\u2105", "/caret": "\u2038", "/caretinsertionpoint": "\u2041", "/carettildedownfunc": "\u2371", "/carettildeupfunc": "\u2372", "/caron": "\u02C7", "/caronbelowcmb": "\u032C", "/caroncmb": "\u030C", "/carouselHorse": "\u1F3A0", "/carpStreamer": "\u1F38F", "/carriagereturn": "\u21B5", "/carsliding": "\u26D0", "/castle": "\u26EB", "/cat": "\u1F408", "/catFace": "\u1F431", "/catFaceWithTearsOfJoy": "\u1F639", "/catFaceWithWrySmile": "\u1F63C", "/caution": "\u2621", "/cbar": "\uA793", "/cbopomofo": "\u3118", "/ccaron": "\u010D", "/ccedilla": "\u00E7", "/ccedillaacute": "\u1E09", "/ccfullwidth": "\u33C4", "/ccircle": "\u24D2", "/ccircumflex": "\u0109", "/ccurl": "\u0255", "/cdfullwidth": "\u33C5", "/cdot": "\u010B", "/cdotaccent": "\u010B", "/cdotreversed": "\uA73F", "/cdsquare": 
"\u33C5", "/cecak": "\uA981", "/cecaktelu": "\uA9B3", "/cedi": "\u20B5", "/cedilla": "\u00B8", "/cedillacmb": "\u0327", "/ceilingleft": "\u2308", "/ceilingright": "\u2309", "/celticCross": "\u1F548", "/cent": "\u00A2", "/centigrade": "\u2103", "/centinferior": "\uF6DF", "/centmonospace": "\uFFE0", "/centoldstyle": "\uF7A2", "/centreddotwhitediamond": "\u27D0", "/centreideographiccircled": "\u32A5", "/centreline": "\u2104", "/centrelineverticalsquarewhite": "\u2385", "/centsuperior": "\uF6E0", "/ceres": "\u26B3", "/chaarmenian": "\u0579", "/chabengali": "\u099B", "/chadeva": "\u091B", "/chagujarati": "\u0A9B", "/chagurmukhi": "\u0A1B", "/chains": "\u26D3", "/chair": "\u2441", "/chamkocircle": "\u327C", "/charactertie": "\u2040", "/chartDownwardsTrend": "\u1F4C9", "/chartUpwardsTrend": "\u1F4C8", "/chartUpwardsTrendAndYenSign": "\u1F4B9", "/chbopomofo": "\u3114", "/cheabkhasiancyrillic": "\u04BD", "/cheabkhcyr": "\u04BD", "/cheabkhtailcyr": "\u04BF", "/checkbox": "\u2610", "/checkboxchecked": "\u2611", "/checkboxx": "\u2612", "/checkmark": "\u2713", "/checyr": "\u0447", "/checyrillic": "\u0447", "/chedescenderabkhasiancyrillic": "\u04BF", "/chedescendercyrillic": "\u04B7", "/chedieresiscyr": "\u04F5", "/chedieresiscyrillic": "\u04F5", "/cheeringMegaphone": "\u1F4E3", "/cheharmenian": "\u0573", "/chekhakascyr": "\u04CC", "/chekhakassiancyrillic": "\u04CC", "/chequeredFlag": "\u1F3C1", "/cherries": "\u1F352", "/cherryBlossom": "\u1F338", "/chestnut": "\u1F330", "/chetailcyr": "\u04B7", "/chevertcyr": "\u04B9", "/cheverticalstrokecyrillic": "\u04B9", "/chi": "\u03C7", "/chicken": "\u1F414", "/chieuchacirclekorean": "\u3277", "/chieuchaparenkorean": "\u3217", "/chieuchcirclekorean": "\u3269", "/chieuchkorean": "\u314A", "/chieuchparenkorean": "\u3209", "/childrenCrossing": "\u1F6B8", "/chipmunk": "\u1F43F", "/chirho": "\u2627", "/chiron": "\u26B7", "/chochangthai": "\u0E0A", "/chochanthai": "\u0E08", "/chochingthai": "\u0E09", "/chochoethai": "\u0E0C", "/chocolateBar": 
"\u1F36B", "/chook": "\u0188", "/christmasTree": "\u1F384", "/church": "\u26EA", "/cieucacirclekorean": "\u3276", "/cieucaparenkorean": "\u3216", "/cieuccirclekorean": "\u3268", "/cieuckorean": "\u3148", "/cieucparenkorean": "\u3208", "/cieucuparenkorean": "\u321C", "/cinema": "\u1F3A6", "/circle": "\u25CB", "/circleallbutupperquadrantleftblack": "\u25D5", "/circlebackslashfunc": "\u2349", "/circleblack": "\u25CF", "/circledCrossPommee": "\u1F540", "/circledInformationSource": "\u1F6C8", "/circledasteriskoperator": "\u229B", "/circledbarnotchhorizontal": "\u2389", "/circledcrossinglanes": "\u26D2", "/circleddash": "\u229D", "/circleddivisionslash": "\u2298", "/circleddotoperator": "\u2299", "/circledequals": "\u229C", "/circlediaeresisfunc": "\u2365", "/circledminus": "\u2296", "/circledot": "\u2299", "/circledotrightwhite": "\u2686", "/circledotted": "\u25CC", "/circledringoperator": "\u229A", "/circledtriangledown": "\u238A", "/circlehalfleftblack": "\u25D0", "/circlehalfrightblack": "\u25D1", "/circleinversewhite": "\u25D9", "/circlejotfunc": "\u233E", "/circlelowerhalfblack": "\u25D2", "/circlelowerquadrantleftwhite": "\u25F5", "/circlelowerquadrantrightwhite": "\u25F6", "/circlemultiply": "\u2297", "/circleot": "\u2299", "/circleplus": "\u2295", "/circlepostalmark": "\u3036", "/circlestarfunc": "\u235F", "/circlestilefunc": "\u233D", "/circlestroketwodotsaboveheavy": "\u26E3", "/circletwodotsblackwhite": "\u2689", "/circletwodotswhite": "\u2687", "/circleunderlinefunc": "\u235C", "/circleupperhalfblack": "\u25D3", "/circleupperquadrantleftwhite": "\u25F4", "/circleupperquadrantrightblack": "\u25D4", "/circleupperquadrantrightwhite": "\u25F7", "/circleverticalfill": "\u25CD", "/circlewhite": "\u25CB", "/circlewhitedotrightblack": "\u2688", "/circlewithlefthalfblack": "\u25D0", "/circlewithrighthalfblack": "\u25D1", "/circumflex": "\u02C6", "/circumflexbelowcmb": "\u032D", "/circumflexcmb": "\u0302", "/circumflexlow": "\uA788", "/circusTent": "\u1F3AA", 
"/cityscape": "\u1F3D9", "/cityscapeAtDusk": "\u1F306", "/cjk:ideographiccomma": "\u3001", "/cjk:tortoiseshellbracketleft": "\u3014", "/cjk:tortoiseshellbracketright": "\u3015", "/clamshellMobilePhone": "\u1F581", "/clapperBoard": "\u1F3AC", "/clappingHandsSign": "\u1F44F", "/classicalBuilding": "\u1F3DB", "/clear": "\u2327", "/clearscreen": "\u239A", "/clickalveolar": "\u01C2", "/clickbilabial": "\u0298", "/clickdental": "\u01C0", "/clicklateral": "\u01C1", "/clickretroflex": "\u01C3", "/clinkingBeerMugs": "\u1F37B", "/clipboard": "\u1F4CB", "/clockFaceEight-thirty": "\u1F563", "/clockFaceEightOclock": "\u1F557", "/clockFaceEleven-thirty": "\u1F566", "/clockFaceElevenOclock": "\u1F55A", "/clockFaceFive-thirty": "\u1F560", "/clockFaceFiveOclock": "\u1F554", "/clockFaceFour-thirty": "\u1F55F", "/clockFaceFourOclock": "\u1F553", "/clockFaceNine-thirty": "\u1F564", "/clockFaceNineOclock": "\u1F558", "/clockFaceOne-thirty": "\u1F55C", "/clockFaceOneOclock": "\u1F550", "/clockFaceSeven-thirty": "\u1F562", "/clockFaceSevenOclock": "\u1F556", "/clockFaceSix-thirty": "\u1F561", "/clockFaceSixOclock": "\u1F555", "/clockFaceTen-thirty": "\u1F565", "/clockFaceTenOclock": "\u1F559", "/clockFaceThree-thirty": "\u1F55E", "/clockFaceThreeOclock": "\u1F552", "/clockFaceTwelve-thirty": "\u1F567", "/clockFaceTwelveOclock": "\u1F55B", "/clockFaceTwo-thirty": "\u1F55D", "/clockFaceTwoOclock": "\u1F551", "/clockwiseDownwardsAndUpwardsOpenCircleArrows": "\u1F503", "/clockwiseRightAndLeftSemicircleArrows": "\u1F5D8", "/clockwiseRightwardsAndLeftwardsOpenCircleArrows": "\u1F501", "/clockwiseRightwardsAndLeftwardsOpenCircleArrowsCircledOneOverlay": "\u1F502", "/closedBook": "\u1F4D5", "/closedLockKey": "\u1F510", "/closedMailboxLoweredFlag": "\u1F4EA", "/closedMailboxRaisedFlag": "\u1F4EB", "/closedUmbrella": "\u1F302", "/closedentryleft": "\u26DC", "/closeup": "\u2050", "/cloud": "\u2601", "/cloudLightning": "\u1F329", "/cloudRain": "\u1F327", "/cloudSnow": "\u1F328", "/cloudTornado": 
"\u1F32A", "/clsquare": "\u1F191", "/club": "\u2663", "/clubblack": "\u2663", "/clubsuitblack": "\u2663", "/clubsuitwhite": "\u2667", "/clubwhite": "\u2667", "/cm2fullwidth": "\u33A0", "/cm3fullwidth": "\u33A4", "/cmb:a": "\u0363", "/cmb:aaboveflat": "\u1DD3", "/cmb:aboveogonek": "\u1DCE", "/cmb:acute": "\u0301", "/cmb:acutebelow": "\u0317", "/cmb:acutegraveacute": "\u1DC9", "/cmb:acutemacron": "\u1DC7", "/cmb:acutetone": "\u0341", "/cmb:adieresis": "\u1DF2", "/cmb:ae": "\u1DD4", "/cmb:almostequalabove": "\u034C", "/cmb:almostequaltobelow": "\u1DFD", "/cmb:alpha": "\u1DE7", "/cmb:ao": "\u1DD5", "/cmb:arrowheadleftbelow": "\u0354", "/cmb:arrowheadrightabove": "\u0350", "/cmb:arrowheadrightarrowheadupbelow": "\u0356", "/cmb:arrowheadrightbelow": "\u0355", "/cmb:arrowleftrightbelow": "\u034D", "/cmb:arrowrightdoublebelow": "\u0362", "/cmb:arrowupbelow": "\u034E", "/cmb:asteriskbelow": "\u0359", "/cmb:av": "\u1DD6", "/cmb:b": "\u1DE8", "/cmb:belowbreve": "\u032E", "/cmb:beta": "\u1DE9", "/cmb:breve": "\u0306", "/cmb:brevemacron": "\u1DCB", "/cmb:bridgeabove": "\u0346", "/cmb:bridgebelow": "\u032A", "/cmb:c": "\u0368", "/cmb:candrabindu": "\u0310", "/cmb:caron": "\u030C", "/cmb:caronbelow": "\u032C", "/cmb:ccedilla": "\u1DD7", "/cmb:cedilla": "\u0327", "/cmb:circumflex": "\u0302", "/cmb:circumflexbelow": "\u032D", "/cmb:commaaccentbelow": "\u0326", "/cmb:commaturnedabove": "\u0312", "/cmb:d": "\u0369", "/cmb:dblarchinvertedbelow": "\u032B", "/cmb:dbloverline": "\u033F", "/cmb:dblverticallineabove": "\u030E", "/cmb:dblverticallinebelow": "\u0348", "/cmb:deletionmark": "\u1DFB", "/cmb:dialytikatonos": "\u0344", "/cmb:dieresis": "\u0308", "/cmb:dieresisbelow": "\u0324", "/cmb:dotaboveleft": "\u1DF8", "/cmb:dotaccent": "\u0307", "/cmb:dotbelowcomb": "\u0323", "/cmb:dotrightabove": "\u0358", "/cmb:dottedacute": "\u1DC1", "/cmb:dottedgrave": "\u1DC0", "/cmb:doubleabovecircumflex": "\u1DCD", "/cmb:doublebelowbreve": "\u035C", "/cmb:doublebreve": "\u035D", 
"/cmb:doubleinvertedbelowbreve": "\u1DFC", "/cmb:doubleringbelow": "\u035A", "/cmb:downtackbelow": "\u031E", "/cmb:e": "\u0364", "/cmb:equalbelow": "\u0347", "/cmb:esh": "\u1DEF", "/cmb:eth": "\u1DD9", "/cmb:f": "\u1DEB", "/cmb:fermata": "\u0352", "/cmb:g": "\u1DDA", "/cmb:graphemejoiner": "\u034F", "/cmb:grave": "\u0300", "/cmb:graveacutegrave": "\u1DC8", "/cmb:gravebelow": "\u0316", "/cmb:gravedouble": "\u030F", "/cmb:gravemacron": "\u1DC5", "/cmb:gravetone": "\u0340", "/cmb:gsmall": "\u1DDB", "/cmb:h": "\u036A", "/cmb:halfleftringabove": "\u0351", "/cmb:halfleftringbelow": "\u031C", "/cmb:halfrightringabove": "\u0357", "/cmb:halfrightringbelow": "\u0339", "/cmb:homotheticabove": "\u034B", "/cmb:hookabove": "\u0309", "/cmb:horn": "\u031B", "/cmb:hungarumlaut": "\u030B", "/cmb:i": "\u0365", "/cmb:insulard": "\u1DD8", "/cmb:invertedbelowbreve": "\u032F", "/cmb:invertedbreve": "\u0311", "/cmb:invertedbridgebelow": "\u033A", "/cmb:inverteddoublebreve": "\u0361", "/cmb:iotasub": "\u0345", "/cmb:isbelow": "\u1DD0", "/cmb:k": "\u1DDC", "/cmb:kavykaaboveleft": "\u1DF7", "/cmb:kavykaaboveright": "\u1DF6", "/cmb:koronis": "\u0343", "/cmb:l": "\u1DDD", "/cmb:leftangleabove": "\u031A", "/cmb:leftanglebelow": "\u0349", "/cmb:leftarrowheadabove": "\u1DFE", "/cmb:lefttackbelow": "\u0318", "/cmb:lineverticalabove": "\u030D", "/cmb:lineverticalbelow": "\u0329", "/cmb:longs": "\u1DE5", "/cmb:lowline": "\u0332", "/cmb:lowlinedouble": "\u0333", "/cmb:lsmall": "\u1DDE", "/cmb:lwithdoublemiddletilde": "\u1DEC", "/cmb:m": "\u036B", "/cmb:macron": "\u0304", "/cmb:macronacute": "\u1DC4", "/cmb:macronbelow": "\u0331", "/cmb:macronbreve": "\u1DCC", "/cmb:macrondouble": "\u035E", "/cmb:macrondoublebelow": "\u035F", "/cmb:macrongrave": "\u1DC6", "/cmb:minusbelow": "\u0320", "/cmb:msmall": "\u1DDF", "/cmb:n": "\u1DE0", "/cmb:nottildeabove": "\u034A", "/cmb:nsmall": "\u1DE1", "/cmb:o": "\u0366", "/cmb:odieresis": "\u1DF3", "/cmb:ogonek": "\u0328", "/cmb:overlaystrokelong": "\u0336", 
"/cmb:overlaystrokeshort": "\u0335", "/cmb:overline": "\u0305", "/cmb:owithlightcentralizationstroke": "\u1DED", "/cmb:p": "\u1DEE", "/cmb:palatalizedhookbelow": "\u0321", "/cmb:perispomeni": "\u0342", "/cmb:plusbelow": "\u031F", "/cmb:r": "\u036C", "/cmb:rbelow": "\u1DCA", "/cmb:retroflexhookbelow": "\u0322", "/cmb:reversedcommaabove": "\u0314", "/cmb:rightarrowheadanddownarrowheadbelow": "\u1DFF", "/cmb:righttackbelow": "\u0319", "/cmb:ringabove": "\u030A", "/cmb:ringbelow": "\u0325", "/cmb:rrotunda": "\u1DE3", "/cmb:rsmall": "\u1DE2", "/cmb:s": "\u1DE4", "/cmb:schwa": "\u1DEA", "/cmb:seagullbelow": "\u033C", "/cmb:snakebelow": "\u1DC2", "/cmb:soliduslongoverlay": "\u0338", "/cmb:solidusshortoverlay": "\u0337", "/cmb:squarebelow": "\u033B", "/cmb:suspensionmark": "\u1DC3", "/cmb:t": "\u036D", "/cmb:tilde": "\u0303", "/cmb:tildebelow": "\u0330", "/cmb:tildedouble": "\u0360", "/cmb:tildeoverlay": "\u0334", "/cmb:tildevertical": "\u033E", "/cmb:turnedabove": "\u0313", "/cmb:turnedcommaabove": "\u0315", "/cmb:u": "\u0367", "/cmb:udieresis": "\u1DF4", "/cmb:uptackabove": "\u1DF5", "/cmb:uptackbelow": "\u031D", "/cmb:urabove": "\u1DD1", "/cmb:usabove": "\u1DD2", "/cmb:uwithlightcentralizationstroke": "\u1DF0", "/cmb:v": "\u036E", "/cmb:w": "\u1DF1", "/cmb:wideinvertedbridgebelow": "\u1DF9", "/cmb:x": "\u036F", "/cmb:xabove": "\u033D", "/cmb:xbelow": "\u0353", "/cmb:z": "\u1DE6", "/cmb:zigzagabove": "\u035B", "/cmb:zigzagbelow": "\u1DCF", "/cmcubedsquare": "\u33A4", "/cmfullwidth": "\u339D", "/cmonospace": "\uFF43", "/cmsquaredsquare": "\u33A0", "/cntr:acknowledge": "\u2406", "/cntr:backspace": "\u2408", "/cntr:bell": "\u2407", "/cntr:blank": "\u2422", "/cntr:cancel": "\u2418", "/cntr:carriagereturn": "\u240D", "/cntr:datalinkescape": "\u2410", "/cntr:delete": "\u2421", "/cntr:deleteformtwo": "\u2425", "/cntr:devicecontrolfour": "\u2414", "/cntr:devicecontrolone": "\u2411", "/cntr:devicecontrolthree": "\u2413", "/cntr:devicecontroltwo": "\u2412", "/cntr:endofmedium": 
"\u2419", "/cntr:endoftext": "\u2403", "/cntr:endoftransmission": "\u2404", "/cntr:endoftransmissionblock": "\u2417", "/cntr:enquiry": "\u2405", "/cntr:escape": "\u241B", "/cntr:fileseparator": "\u241C", "/cntr:formfeed": "\u240C", "/cntr:groupseparator": "\u241D", "/cntr:horizontaltab": "\u2409", "/cntr:linefeed": "\u240A", "/cntr:negativeacknowledge": "\u2415", "/cntr:newline": "\u2424", "/cntr:null": "\u2400", "/cntr:openbox": "\u2423", "/cntr:recordseparator": "\u241E", "/cntr:shiftin": "\u240F", "/cntr:shiftout": "\u240E", "/cntr:space": "\u2420", "/cntr:startofheading": "\u2401", "/cntr:startoftext": "\u2402", "/cntr:substitute": "\u241A", "/cntr:substituteformtwo": "\u2426", "/cntr:synchronousidle": "\u2416", "/cntr:unitseparator": "\u241F", "/cntr:verticaltab": "\u240B", "/coarmenian": "\u0581", "/cocktailGlass": "\u1F378", "/coffin": "\u26B0", "/cofullwidth": "\u33C7", "/collision": "\u1F4A5", "/colon": "\u003A", "/colonequals": "\u2254", "/colonmod": "\uA789", "/colonmonetary": "\u20A1", "/colonmonospace": "\uFF1A", "/colonraisedmod": "\u02F8", "/colonsign": "\u20A1", "/colonsmall": "\uFE55", "/colontriangularhalfmod": "\u02D1", "/colontriangularmod": "\u02D0", "/comet": "\u2604", "/comma": "\u002C", "/commaabovecmb": "\u0313", "/commaaboverightcmb": "\u0315", "/commaaccent": "\uF6C3", "/commaarabic": "\u060C", "/commaarmenian": "\u055D", "/commabarfunc": "\u236A", "/commainferior": "\uF6E1", "/commamonospace": "\uFF0C", "/commaraised": "\u2E34", "/commareversed": "\u2E41", "/commareversedabovecmb": "\u0314", "/commareversedmod": "\u02BD", "/commasmall": "\uFE50", "/commasuperior": "\uF6E2", "/commaturnedabovecmb": "\u0312", "/commaturnedmod": "\u02BB", "/commercialat": "\uFE6B", "/commercialminussign": "\u2052", "/compass": "\u263C", "/complement": "\u2201", "/composition": "\u2384", "/compression": "\u1F5DC", "/con": "\uA76F", "/confettiBall": "\u1F38A", "/confoundedFace": "\u1F616", "/confusedFace": "\u1F615", "/congratulationideographiccircled": 
"\u3297", "/congratulationideographicparen": "\u3237", "/congruent": "\u2245", "/conicaltaper": "\u2332", "/conjunction": "\u260C", "/consquareupblack": "\u26FE", "/constructionSign": "\u1F6A7", "/constructionWorker": "\u1F477", "/containsasmembersmall": "\u220D", "/containsasnormalsubgroorequalup": "\u22B5", "/containsasnormalsubgroup": "\u22B3", "/containslonghorizontalstroke": "\u22FA", "/containsoverbar": "\u22FD", "/containsoverbarsmall": "\u22FE", "/containssmallverticalbarhorizontalstroke": "\u22FC", "/containsverticalbarhorizontalstroke": "\u22FB", "/continuousunderline": "\u2381", "/contourintegral": "\u222E", "/control": "\u2303", "/controlACK": "\u0006", "/controlBEL": "\u0007", "/controlBS": "\u0008", "/controlCAN": "\u0018", "/controlCR": "\u000D", "/controlDC1": "\u0011", "/controlDC2": "\u0012", "/controlDC3": "\u0013", "/controlDC4": "\u0014", "/controlDEL": "\u007F", "/controlDLE": "\u0010", "/controlEM": "\u0019", "/controlENQ": "\u0005", "/controlEOT": "\u0004", "/controlESC": "\u001B", "/controlETB": "\u0017", "/controlETX": "\u0003", "/controlFF": "\u000C", "/controlFS": "\u001C", "/controlGS": "\u001D", "/controlHT": "\u0009", "/controlKnobs": "\u1F39B", "/controlLF": "\u000A", "/controlNAK": "\u0015", "/controlRS": "\u001E", "/controlSI": "\u000F", "/controlSO": "\u000E", "/controlSOT": "\u0002", "/controlSTX": "\u0001", "/controlSUB": "\u001A", "/controlSYN": "\u0016", "/controlUS": "\u001F", "/controlVT": "\u000B", "/convavediamondwhite": "\u27E1", "/convenienceStore": "\u1F3EA", "/cookedRice": "\u1F35A", "/cookie": "\u1F36A", "/cooking": "\u1F373", "/coolsquare": "\u1F192", "/coproductarray": "\u2210", "/copyideographiccircled": "\u32A2", "/copyright": "\u00A9", "/copyrightsans": "\uF8E9", "/copyrightserif": "\uF6D9", "/cornerbottomleft": "\u231E", "/cornerbottomright": "\u231F", "/cornerbracketleft": "\u300C", "/cornerbracketlefthalfwidth": "\uFF62", "/cornerbracketleftvertical": "\uFE41", "/cornerbracketright": "\u300D", 
"/cornerbracketrighthalfwidth": "\uFF63", "/cornerbracketrightvertical": "\uFE42", "/cornerdotupleft": "\u27D4", "/cornertopleft": "\u231C", "/cornertopright": "\u231D", "/coroniseditorial": "\u2E0E", "/corporationsquare": "\u337F", "/correctideographiccircled": "\u32A3", "/corresponds": "\u2258", "/cosquare": "\u33C7", "/couchAndLamp": "\u1F6CB", "/counterbore": "\u2334", "/countersink": "\u2335", "/coupleHeart": "\u1F491", "/coverkgfullwidth": "\u33C6", "/coverkgsquare": "\u33C6", "/cow": "\u1F404", "/cowFace": "\u1F42E", "/cpalatalhook": "\uA794", "/cparen": "\u249E", "/cparenthesized": "\u249E", "/creditCard": "\u1F4B3", "/crescentMoon": "\u1F319", "/creversed": "\u2184", "/cricketBatAndBall": "\u1F3CF", "/crocodile": "\u1F40A", "/cropbottomleft": "\u230D", "/cropbottomright": "\u230C", "/croptopleft": "\u230F", "/croptopright": "\u230E", "/crossPommee": "\u1F542", "/crossPommeeHalf-circleBelow": "\u1F541", "/crossedFlags": "\u1F38C", "/crossedswords": "\u2694", "/crossinglanes": "\u26CC", "/crossmod": "\u02DF", "/crossofjerusalem": "\u2629", "/crossoflorraine": "\u2628", "/crossonshieldblack": "\u26E8", "/crown": "\u1F451", "/crrn:rupee": "\u20A8", "/cruzeiro": "\u20A2", "/cryingCatFace": "\u1F63F", "/cryingFace": "\u1F622", "/crystalBall": "\u1F52E", "/cstretched": "\u0297", "/cstroke": "\u023C", "/cuatrillo": "\uA72D", "/cuatrillocomma": "\uA72F", "/curlyand": "\u22CF", "/curlylogicaland": "\u22CF", "/curlylogicalor": "\u22CE", "/curlyor": "\u22CE", "/currency": "\u00A4", "/currencyExchange": "\u1F4B1", "/curryAndRice": "\u1F35B", "/custard": "\u1F36E", "/customeraccountnumber": "\u2449", "/customs": "\u1F6C3", "/cyclone": "\u1F300", "/cylindricity": "\u232D", "/cyrBreve": "\uF6D1", "/cyrFlex": "\uF6D2", "/cyrbreve": "\uF6D4", "/cyrflex": "\uF6D5", "/d": "\u0064", "/daarmenian": "\u0564", "/daasusquare": "\u3324", "/dabengali": "\u09A6", "/dad": "\u0636", "/dad.fina": "\uFEBE", "/dad.init": "\uFEBF", "/dad.init_alefmaksura.fina": "\uFD07", 
"/dad.init_hah.fina": "\uFC23", "/dad.init_hah.medi": "\uFCB5", "/dad.init_jeem.fina": "\uFC22", "/dad.init_jeem.medi": "\uFCB4", "/dad.init_khah.fina": "\uFC24", "/dad.init_khah.medi": "\uFCB6", "/dad.init_khah.medi_meem.medi": "\uFD70", "/dad.init_meem.fina": "\uFC25", "/dad.init_meem.medi": "\uFCB7", "/dad.init_reh.fina": "\uFD10", "/dad.init_yeh.fina": "\uFD08", "/dad.isol": "\uFEBD", "/dad.medi": "\uFEC0", "/dad.medi_alefmaksura.fina": "\uFD23", "/dad.medi_hah.medi_alefmaksura.fina": "\uFD6E", "/dad.medi_hah.medi_yeh.fina": "\uFDAB", "/dad.medi_khah.medi_meem.fina": "\uFD6F", "/dad.medi_reh.fina": "\uFD2C", "/dad.medi_yeh.fina": "\uFD24", "/dadarabic": "\u0636", "/daddotbelow": "\u06FB", "/dadeva": "\u0926", "/dadfinalarabic": "\uFEBE", "/dadinitialarabic": "\uFEBF", "/dadmedialarabic": "\uFEC0", "/dafullwidth": "\u3372", "/dagesh": "\u05BC", "/dagesh:hb": "\u05BC", "/dageshhebrew": "\u05BC", "/dagger": "\u2020", "/daggerKnife": "\u1F5E1", "/daggerdbl": "\u2021", "/daggerwithguardleft": "\u2E36", "/daggerwithguardright": "\u2E37", "/dagujarati": "\u0AA6", "/dagurmukhi": "\u0A26", "/dahal": "\u068C", "/dahal.fina": "\uFB85", "/dahal.isol": "\uFB84", "/dahiragana": "\u3060", "/dakatakana": "\u30C0", "/dal": "\u062F", "/dal.fina": "\uFEAA", "/dal.isol": "\uFEA9", "/dalInvertedSmallVBelow": "\u075A", "/dalTwoDotsVerticallyBelowSmallTah": "\u0759", "/dalarabic": "\u062F", "/daldotbelow": "\u068A", "/daldotbelowtahsmall": "\u068B", "/daldownthreedotsabove": "\u068F", "/dalet": "\u05D3", "/dalet:hb": "\u05D3", "/daletdagesh": "\uFB33", "/daletdageshhebrew": "\uFB33", "/dalethatafpatah": "\u05D3", "/dalethatafpatahhebrew": "\u05D3", "/dalethatafsegol": "\u05D3", "/dalethatafsegolhebrew": "\u05D3", "/dalethebrew": "\u05D3", "/dalethiriq": "\u05D3", "/dalethiriqhebrew": "\u05D3", "/daletholam": "\u05D3", "/daletholamhebrew": "\u05D3", "/daletpatah": "\u05D3", "/daletpatahhebrew": "\u05D3", "/daletqamats": "\u05D3", "/daletqamatshebrew": "\u05D3", "/daletqubuts": 
"\u05D3", "/daletqubutshebrew": "\u05D3", "/daletsegol": "\u05D3", "/daletsegolhebrew": "\u05D3", "/daletsheva": "\u05D3", "/daletshevahebrew": "\u05D3", "/dalettsere": "\u05D3", "/dalettserehebrew": "\u05D3", "/daletwide:hb": "\uFB22", "/daletwithdagesh:hb": "\uFB33", "/dalfinalarabic": "\uFEAA", "/dalfourdotsabove": "\u0690", "/dalinvertedV": "\u06EE", "/dalring": "\u0689", "/damahaprana": "\uA9A3", "/damma": "\u064F", "/dammaIsol": "\uFE78", "/dammaMedi": "\uFE79", "/dammaarabic": "\u064F", "/dammalowarabic": "\u064F", "/dammareversed": "\u065D", "/dammasmall": "\u0619", "/dammatan": "\u064C", "/dammatanIsol": "\uFE72", "/dammatanaltonearabic": "\u064C", "/dammatanarabic": "\u064C", "/dancer": "\u1F483", "/danda": "\u0964", "/dango": "\u1F361", "/darga:hb": "\u05A7", "/dargahebrew": "\u05A7", "/dargalefthebrew": "\u05A7", "/darkShade": "\u2593", "/darkSunglasses": "\u1F576", "/dashwithupturnleft": "\u2E43", "/dasiacmbcyr": "\u0485", "/dasiapneumatacyrilliccmb": "\u0485", "/dateseparator": "\u060D", "/dayeighteentelegraph": "\u33F1", "/dayeighttelegraph": "\u33E7", "/dayeleventelegraph": "\u33EA", "/dayfifteentelegraph": "\u33EE", "/dayfivetelegraph": "\u33E4", "/dayfourteentelegraph": "\u33ED", "/dayfourtelegraph": "\u33E3", "/daynineteentelegraph": "\u33F2", "/dayninetelegraph": "\u33E8", "/dayonetelegraph": "\u33E0", "/dayseventeentelegraph": "\u33F0", "/dayseventelegraph": "\u33E6", "/daysixteentelegraph": "\u33EF", "/daysixtelegraph": "\u33E5", "/daytentelegraph": "\u33E9", "/daythirteentelegraph": "\u33EC", "/daythirtyonetelegraph": "\u33FE", "/daythirtytelegraph": "\u33FD", "/daythreetelegraph": "\u33E2", "/daytwelvetelegraph": "\u33EB", "/daytwentyeighttelegraph": "\u33FB", "/daytwentyfivetelegraph": "\u33F8", "/daytwentyfourtelegraph": "\u33F7", "/daytwentyninetelegraph": "\u33FC", "/daytwentyonetelegraph": "\u33F4", "/daytwentyseventelegraph": "\u33FA", "/daytwentysixtelegraph": "\u33F9", "/daytwentytelegraph": "\u33F3", "/daytwentythreetelegraph": 
"\u33F6", "/daytwentytwotelegraph": "\u33F5", "/daytwotelegraph": "\u33E1", "/dbdigraph": "\u0238", "/dbfullwidth": "\u33C8", "/dblGrave": "\uF6D3", "/dblanglebracketleft": "\u300A", "/dblanglebracketleftvertical": "\uFE3D", "/dblanglebracketright": "\u300B", "/dblanglebracketrightvertical": "\uFE3E", "/dblarchinvertedbelowcmb": "\u032B", "/dblarrowNE": "\u21D7", "/dblarrowNW": "\u21D6", "/dblarrowSE": "\u21D8", "/dblarrowSW": "\u21D9", "/dblarrowdown": "\u21D3", "/dblarrowleft": "\u21D4", "/dblarrowleftright": "\u21D4", "/dblarrowleftrightstroke": "\u21CE", "/dblarrowleftstroke": "\u21CD", "/dblarrowright": "\u21D2", "/dblarrowrightstroke": "\u21CF", "/dblarrowup": "\u21D1", "/dblarrowupdown": "\u21D5", "/dbldanda": "\u0965", "/dbldnhorz": "\u2566", "/dbldnleft": "\u2557", "/dbldnright": "\u2554", "/dblgrave": "\uF6D6", "/dblgravecmb": "\u030F", "/dblhorz": "\u2550", "/dblintegral": "\u222C", "/dbllowline": "\u2017", "/dbllowlinecmb": "\u0333", "/dbloverlinecmb": "\u033F", "/dblprimemod": "\u02BA", "/dblstrokearrowdown": "\u21DF", "/dblstrokearrowup": "\u21DE", "/dbluphorz": "\u2569", "/dblupleft": "\u255D", "/dblupright": "\u255A", "/dblvert": "\u2551", "/dblverthorz": "\u256C", "/dblverticalbar": "\u2016", "/dblverticallineabovecmb": "\u030E", "/dblvertleft": "\u2563", "/dblvertright": "\u2560", "/dbopomofo": "\u3109", "/dbsquare": "\u33C8", "/dcaron": "\u010F", "/dcedilla": "\u1E11", "/dchecyr": "\u052D", "/dcircle": "\u24D3", "/dcircumflexbelow": "\u1E13", "/dcroat": "\u0111", "/dcurl": "\u0221", "/ddabengali": "\u09A1", "/ddadeva": "\u0921", "/ddagujarati": "\u0AA1", "/ddagurmukhi": "\u0A21", "/ddahal": "\u068D", "/ddahal.fina": "\uFB83", "/ddahal.isol": "\uFB82", "/ddal": "\u0688", "/ddal.fina": "\uFB89", "/ddal.isol": "\uFB88", "/ddalarabic": "\u0688", "/ddalfinalarabic": "\uFB89", "/ddamahaprana": "\uA99E", "/ddblstruckitalic": "\u2146", "/dddhadeva": "\u095C", "/ddhabengali": "\u09A2", "/ddhadeva": "\u0922", "/ddhagujarati": "\u0AA2", "/ddhagurmukhi": 
"\u0A22", "/ddot": "\u1E0B", "/ddotaccent": "\u1E0B", "/ddotbelow": "\u1E0D", "/decembertelegraph": "\u32CB", "/deciduousTree": "\u1F333", "/decimalexponent": "\u23E8", "/decimalseparatorarabic": "\u066B", "/decimalseparatorpersian": "\u066B", "/decreaseFontSize": "\u1F5DB", "/decyr": "\u0434", "/decyrillic": "\u0434", "/degree": "\u00B0", "/degreecelsius": "\u2103", "/degreefahrenheit": "\u2109", "/dehi:hb": "\u05AD", "/dehihebrew": "\u05AD", "/dehiragana": "\u3067", "/deicoptic": "\u03EF", "/dekatakana": "\u30C7", "/dekomicyr": "\u0501", "/deldiaeresisfunc": "\u2362", "/deleteleft": "\u232B", "/deleteright": "\u2326", "/deliveryTruck": "\u1F69A", "/delstilefunc": "\u2352", "/delta": "\u03B4", "/deltaequal": "\u225C", "/deltastilefunc": "\u234B", "/deltaturned": "\u018D", "/deltaunderlinefunc": "\u2359", "/deltildefunc": "\u236B", "/denominatorminusonenumeratorbengali": "\u09F8", "/dentistrybottomverticalleft": "\u23CC", "/dentistrybottomverticalright": "\u23BF", "/dentistrycircledownhorizontal": "\u23C1", "/dentistrycircleuphorizontal": "\u23C2", "/dentistrycirclevertical": "\u23C0", "/dentistrydownhorizontal": "\u23C9", "/dentistrytopverticalleft": "\u23CB", "/dentistrytopverticalright": "\u23BE", "/dentistrytriangledownhorizontal": "\u23C4", "/dentistrytriangleuphorizontal": "\u23C5", "/dentistrytrianglevertical": "\u23C3", "/dentistryuphorizontal": "\u23CA", "/dentistrywavedownhorizontal": "\u23C7", "/dentistrywaveuphorizontal": "\u23C8", "/dentistrywavevertical": "\u23C6", "/departmentStore": "\u1F3EC", "/derelictHouseBuilding": "\u1F3DA", "/desert": "\u1F3DC", "/desertIsland": "\u1F3DD", "/desisquare": "\u3325", "/desktopComputer": "\u1F5A5", "/desktopWindow": "\u1F5D4", "/deva:a": "\u0905", "/deva:aa": "\u0906", "/deva:aasign": "\u093E", "/deva:abbreviation": "\u0970", "/deva:acandra": "\u0972", "/deva:acute": "\u0954", "/deva:ai": "\u0910", "/deva:aisign": "\u0948", "/deva:anudatta": "\u0952", "/deva:anusvara": "\u0902", "/deva:ashort": "\u0904", 
"/deva:au": "\u0914", "/deva:ausign": "\u094C", "/deva:avagraha": "\u093D", "/deva:aw": "\u0975", "/deva:awsign": "\u094F", "/deva:ba": "\u092C", "/deva:bba": "\u097F", "/deva:bha": "\u092D", "/deva:ca": "\u091A", "/deva:candrabindu": "\u0901", "/deva:candrabinduinverted": "\u0900", "/deva:cha": "\u091B", "/deva:da": "\u0926", "/deva:danda": "\u0964", "/deva:dbldanda": "\u0965", "/deva:dda": "\u0921", "/deva:ddda": "\u097E", "/deva:dddha": "\u095C", "/deva:ddha": "\u0922", "/deva:dha": "\u0927", "/deva:dothigh": "\u0971", "/deva:e": "\u090F", "/deva:ecandra": "\u090D", "/deva:eight": "\u096E", "/deva:eshort": "\u090E", "/deva:esign": "\u0947", "/deva:esigncandra": "\u0945", "/deva:esignprishthamatra": "\u094E", "/deva:esignshort": "\u0946", "/deva:fa": "\u095E", "/deva:five": "\u096B", "/deva:four": "\u096A", "/deva:ga": "\u0917", "/deva:gga": "\u097B", "/deva:gha": "\u0918", "/deva:ghha": "\u095A", "/deva:glottalstop": "\u097D", "/deva:grave": "\u0953", "/deva:ha": "\u0939", "/deva:i": "\u0907", "/deva:ii": "\u0908", "/deva:iisign": "\u0940", "/deva:isign": "\u093F", "/deva:ja": "\u091C", "/deva:jha": "\u091D", "/deva:jja": "\u097C", "/deva:ka": "\u0915", "/deva:kha": "\u0916", "/deva:khha": "\u0959", "/deva:la": "\u0932", "/deva:lla": "\u0933", "/deva:llla": "\u0934", "/deva:llvocal": "\u0961", "/deva:llvocalsign": "\u0963", "/deva:lvocal": "\u090C", "/deva:lvocalsign": "\u0962", "/deva:ma": "\u092E", "/deva:marwaridda": "\u0978", "/deva:na": "\u0928", "/deva:nga": "\u0919", "/deva:nine": "\u096F", "/deva:nna": "\u0923", "/deva:nnna": "\u0929", "/deva:nukta": "\u093C", "/deva:nya": "\u091E", "/deva:o": "\u0913", "/deva:ocandra": "\u0911", "/deva:oe": "\u0973", "/deva:oesign": "\u093A", "/deva:om": "\u0950", "/deva:one": "\u0967", "/deva:ooe": "\u0974", "/deva:ooesign": "\u093B", "/deva:oshort": "\u0912", "/deva:osign": "\u094B", "/deva:osigncandra": "\u0949", "/deva:osignshort": "\u094A", "/deva:pa": "\u092A", "/deva:pha": "\u092B", "/deva:qa": "\u0958", 
"/deva:ra": "\u0930", "/deva:rha": "\u095D", "/deva:rra": "\u0931", "/deva:rrvocal": "\u0960", "/deva:rrvocalsign": "\u0944", "/deva:rvocal": "\u090B", "/deva:rvocalsign": "\u0943", "/deva:sa": "\u0938", "/deva:seven": "\u096D", "/deva:sha": "\u0936", "/deva:signelongcandra": "\u0955", "/deva:six": "\u096C", "/deva:ssa": "\u0937", "/deva:ta": "\u0924", "/deva:tha": "\u0925", "/deva:three": "\u0969", "/deva:tta": "\u091F", "/deva:ttha": "\u0920", "/deva:two": "\u0968", "/deva:u": "\u0909", "/deva:udatta": "\u0951", "/deva:ue": "\u0976", "/deva:uesign": "\u0956", "/deva:usign": "\u0941", "/deva:uu": "\u090A", "/deva:uue": "\u0977", "/deva:uuesign": "\u0957", "/deva:uusign": "\u0942", "/deva:va": "\u0935", "/deva:virama": "\u094D", "/deva:visarga": "\u0903", "/deva:ya": "\u092F", "/deva:yaheavy": "\u097A", "/deva:yya": "\u095F", "/deva:za": "\u095B", "/deva:zero": "\u0966", "/deva:zha": "\u0979", "/dezh": "\u02A4", "/dfemaledbl": "\u26A2", "/dhabengali": "\u09A7", "/dhadeva": "\u0927", "/dhagujarati": "\u0AA7", "/dhagurmukhi": "\u0A27", "/dhook": "\u0257", "/diaeresisgreaterfunc": "\u2369", "/dialytikatonos": "\u0385", "/dialytikatonoscmb": "\u0344", "/diametersign": "\u2300", "/diamond": "\u2666", "/diamondShapeADotInside": "\u1F4A0", "/diamondinsquarewhite": "\u26CB", "/diamondoperator": "\u22C4", "/diamondsuitwhite": "\u2662", "/diamondunderlinefunc": "\u235A", "/diamondwhitewithdiamondsmallblack": "\u25C8", "/diefive": "\u2684", "/diefour": "\u2683", "/dieone": "\u2680", "/dieresis": "\u00A8", "/dieresisacute": "\uF6D7", "/dieresisbelowcmb": "\u0324", "/dieresiscmb": "\u0308", "/dieresisgrave": "\uF6D8", "/dieresistilde": "\u1FC1", "/dieresistonos": "\u0385", "/dieselLocomotive": "\u1F6F2", "/diesix": "\u2685", "/diethree": "\u2682", "/dietwo": "\u2681", "/differencebetween": "\u224F", "/digamma": "\u03DD", "/digammapamphylian": "\u0377", "/digramgreateryang": "\u268C", "/digramgreateryin": "\u268F", "/digramlesseryang": "\u268E", "/digramlesseryin": "\u268D", 
"/dihiragana": "\u3062", "/dikatakana": "\u30C2", "/dimensionorigin": "\u2331", "/dingbatSAns-serifzerocircle": "\u1F10B", "/dingbatSAns-serifzerocircleblack": "\u1F10C", "/dinsular": "\uA77A", "/directHit": "\u1F3AF", "/directcurrentformtwo": "\u2393", "/dirgamurevowel": "\uA9BB", "/disabledcar": "\u26CD", "/disappointedButRelievedFace": "\u1F625", "/disappointedFace": "\u1F61E", "/discontinuousunderline": "\u2382", "/dittomark": "\u3003", "/divide": "\u00F7", "/divides": "\u2223", "/divisionslash": "\u2215", "/divisiontimes": "\u22C7", "/divorce": "\u26AE", "/dizzy": "\u1F4AB", "/dizzyFace": "\u1F635", "/djecyr": "\u0452", "/djecyrillic": "\u0452", "/djekomicyr": "\u0503", "/dkshade": "\u2593", "/dlfullwidth": "\u3397", "/dlinebelow": "\u1E0F", "/dlogicalorsquare": "\u27CF", "/dlogicalsquare": "\u27CE", "/dlsquare": "\u3397", "/dm2fullwidth": "\u3378", "/dm3fullwidth": "\u3379", "/dmacron": "\u0111", "/dmaledbl": "\u26A3", "/dmfullwidth": "\u3377", "/dmonospace": "\uFF44", "/dnblock": "\u2584", "/dndblhorzsng": "\u2565", "/dndblleftsng": "\u2556", "/dndblrightsng": "\u2553", "/dngb:airplane": "\u2708", "/dngb:arrowfeatheredblackNE": "\u27B6", "/dngb:arrowfeatheredblackSE": "\u27B4", "/dngb:arrowfeatheredblackheavyNE": "\u27B9", "/dngb:arrowfeatheredblackheavySE": "\u27B7", "/dngb:arrowheadrightblack": "\u27A4", "/dngb:arrowheadrightthreeDbottomlight": "\u27A3", "/dngb:arrowheadrightthreeDtoplight": "\u27A2", "/dngb:arrowheavyNE": "\u279A", "/dngb:arrowheavySE": "\u2798", "/dngb:arrowrightbacktiltedshadowedwhite": "\u27AB", "/dngb:arrowrightblack": "\u27A1", "/dngb:arrowrightcircledwhiteheavy": "\u27B2", "/dngb:arrowrightcurvedownblackheavy": "\u27A5", "/dngb:arrowrightcurveupblackheavy": "\u27A6", "/dngb:arrowrightfeatheredblack": "\u27B5", "/dngb:arrowrightfeatheredblackheavy": "\u27B8", "/dngb:arrowrightfeatheredwhite": "\u27B3", "/dngb:arrowrightfronttiltedshadowedwhite": "\u27AC", "/dngb:arrowrightheavy": "\u2799", "/dngb:arrowrightleftshadedwhite": "\u27AA", 
"/dngb:arrowrightoutlinedopen": "\u27BE", "/dngb:arrowrightpointed": "\u279B", "/dngb:arrowrightpointedblackheavy": "\u27A8", "/dngb:arrowrightrightshadedwhite": "\u27A9", "/dngb:arrowrightroundheavy": "\u279C", "/dngb:arrowrightsquatblack": "\u27A7", "/dngb:arrowrighttriangle": "\u279D", "/dngb:arrowrighttriangledashed": "\u279F", "/dngb:arrowrighttriangledashedheavy": "\u27A0", "/dngb:arrowrighttriangleheavy": "\u279E", "/dngb:arrowrightwedge": "\u27BC", "/dngb:arrowrightwedgeheavy": "\u27BD", "/dngb:arrowrightwideheavy": "\u2794", "/dngb:arrowshadowrightlowerwhiteheavy": "\u27AD", "/dngb:arrowshadowrightnotchedlowerwhite": "\u27AF", "/dngb:arrowshadowrightnotchedupperwhite": "\u27B1", "/dngb:arrowshadowrightupperwhiteheavy": "\u27AE", "/dngb:arrowteardropright": "\u27BA", "/dngb:arrowteardroprightheavy": "\u27BB", "/dngb:asteriskballoon": "\u2749", "/dngb:asteriskballoonfour": "\u2723", "/dngb:asteriskballoonheavyfour": "\u2724", "/dngb:asteriskcentreopen": "\u2732", "/dngb:asteriskclubfour": "\u2725", "/dngb:asteriskheavy": "\u2731", "/dngb:asteriskpointedsixteen": "\u273A", "/dngb:asteriskteardrop": "\u273B", "/dngb:asteriskteardropcentreopen": "\u273C", "/dngb:asteriskteardropfour": "\u2722", "/dngb:asteriskteardropheavy": "\u273D", "/dngb:asteriskteardroppinwheelheavy": "\u2743", "/dngb:asteriskteardroppropellereight": "\u274A", "/dngb:asteriskteardroppropellerheavyeight": "\u274B", "/dngb:ballotx": "\u2717", "/dngb:ballotxheavy": "\u2718", "/dngb:bracketleftpointedangleheavyornament": "\u2770", "/dngb:bracketleftpointedanglemediumornament": "\u276C", "/dngb:bracketrightpointedangleheavyornament": "\u2771", "/dngb:bracketrightpointedanglemediumornament": "\u276D", "/dngb:bracketshellleftlightornament": "\u2772", "/dngb:bracketshellrightlightornament": "\u2773", "/dngb:check": "\u2713", "/dngb:checkheavy": "\u2714", "/dngb:checkwhiteheavy": "\u2705", "/dngb:chevronsnowflakeheavy": "\u2746", "/dngb:circleshadowedwhite": "\u274D", 
"/dngb:commaheavydoubleornament": "\u275E", "/dngb:commaheavydoubleturnedornament": "\u275D", "/dngb:commaheavyornament": "\u275C", "/dngb:commaheavyturnedornament": "\u275B", "/dngb:compasstarpointedblackeight": "\u2737", "/dngb:compasstarpointedblackheavyeight": "\u2738", "/dngb:cross": "\u274C", "/dngb:crosscentreopen": "\u271B", "/dngb:crosscentreopenheavy": "\u271C", "/dngb:curlybracketleftmediumornament": "\u2774", "/dngb:curlybracketrightmediumornament": "\u2775", "/dngb:curlyloop": "\u27B0", "/dngb:curlyloopdouble": "\u27BF", "/dngb:curvedstemparagraphsignornament": "\u2761", "/dngb:diamondminusxblackwhite": "\u2756", "/dngb:divisionsignheavy": "\u2797", "/dngb:eightnegativecircled": "\u277D", "/dngb:eightsanscircled": "\u2787", "/dngb:eightsansnegativecircled": "\u2791", "/dngb:envelope": "\u2709", "/dngb:exclamationheavy": "\u2757", "/dngb:exclamationheavyornament": "\u2762", "/dngb:exclamationwhiteornament": "\u2755", "/dngb:fivenegativecircled": "\u277A", "/dngb:fivesanscircled": "\u2784", "/dngb:fivesansnegativecircled": "\u278E", "/dngb:floralheart": "\u2766", "/dngb:floralheartbulletrotated": "\u2767", "/dngb:floretteblack": "\u273F", "/dngb:floretteoutlinedpetalledblackeight": "\u2741", "/dngb:florettepetalledblackwhitesix": "\u273E", "/dngb:florettewhite": "\u2740", "/dngb:fournegativecircled": "\u2779", "/dngb:foursanscircled": "\u2783", "/dngb:foursansnegativecircled": "\u278D", "/dngb:greekcrossheavy": "\u271A", "/dngb:greekcrossoutlined": "\u2719", "/dngb:heartblackheavy": "\u2764", "/dngb:heartbulletrotatedblackheavy": "\u2765", "/dngb:heartexclamationheavyornament": "\u2763", "/dngb:hvictory": "\u270C", "/dngb:hwriting": "\u270D", "/dngb:latincross": "\u271D", "/dngb:latincrossoutlined": "\u271F", "/dngb:latincrossshadowedwhite": "\u271E", "/dngb:lowcommaheavydoubleornament": "\u2760", "/dngb:lowcommaheavyornament": "\u275F", "/dngb:maltesecross": "\u2720", "/dngb:minussignheavy": "\u2796", "/dngb:multiplicationx": "\u2715", 
"/dngb:multiplicationxheavy": "\u2716", "/dngb:nibblack": "\u2712", "/dngb:nibwhite": "\u2711", "/dngb:ninenegativecircled": "\u277E", "/dngb:ninesanscircled": "\u2788", "/dngb:ninesansnegativecircled": "\u2792", "/dngb:onenegativecircled": "\u2776", "/dngb:onesanscircled": "\u2780", "/dngb:onesansnegativecircled": "\u278A", "/dngb:parenthesisleftflattenedmediumornament": "\u276A", "/dngb:parenthesisleftmediumornament": "\u2768", "/dngb:parenthesisrightflattenedmediumornament": "\u276B", "/dngb:parenthesisrightmediumornament": "\u2769", "/dngb:pencil": "\u270F", "/dngb:pencillowerright": "\u270E", "/dngb:pencilupperright": "\u2710", "/dngb:plussignheavy": "\u2795", "/dngb:questionblackornament": "\u2753", "/dngb:questionwhiteornament": "\u2754", "/dngb:quotationleftpointedangleheavyornament": "\u276E", "/dngb:quotationrightpointedangleheavyornament": "\u276F", "/dngb:raisedfist": "\u270A", "/dngb:raisedh": "\u270B", "/dngb:safetyscissorsblack": "\u2700", "/dngb:scissorsblack": "\u2702", "/dngb:scissorslowerblade": "\u2703", "/dngb:scissorsupperblade": "\u2701", "/dngb:scissorswhite": "\u2704", "/dngb:sevennegativecircled": "\u277C", "/dngb:sevensanscircled": "\u2786", "/dngb:sevensansnegativecircled": "\u2790", "/dngb:sixnegativecircled": "\u277B", "/dngb:sixsanscircled": "\u2785", "/dngb:sixsansnegativecircled": "\u278F", "/dngb:snowflake": "\u2744", "/dngb:snowflaketight": "\u2745", "/dngb:sparkle": "\u2747", "/dngb:sparkleheavy": "\u2748", "/dngb:sparkles": "\u2728", "/dngb:spokedasteriskeight": "\u2733", "/dngb:squaredcrossnegative": "\u274E", "/dngb:squarelowerrightshadowedwhite": "\u2751", "/dngb:squareshadowlowerrightwhite": "\u274F", "/dngb:squareshadowupperrightwhite": "\u2750", "/dngb:squareupperrightshadowedwhite": "\u2752", "/dngb:starcentreblackwhite": "\u272C", "/dngb:starcentreopenblack": "\u272B", "/dngb:starcentreopenpointedcircledeight": "\u2742", "/dngb:starcircledwhite": "\u272A", "/dngb:starofdavid": "\u2721", "/dngb:staroutlinedblack": 
"\u272D", "/dngb:staroutlinedblackheavy": "\u272E", "/dngb:staroutlinedstresswhite": "\u2729", "/dngb:starpinwheel": "\u272F", "/dngb:starpointedblackeight": "\u2734", "/dngb:starpointedblackfour": "\u2726", "/dngb:starpointedblacksix": "\u2736", "/dngb:starpointedblacktwelve": "\u2739", "/dngb:starpointedpinwheeleight": "\u2735", "/dngb:starpointedwhitefour": "\u2727", "/dngb:starshadowedwhite": "\u2730", "/dngb:tapedrive": "\u2707", "/dngb:telephonelocationsign": "\u2706", "/dngb:tennegativecircled": "\u277F", "/dngb:tensanscircled": "\u2789", "/dngb:tensansnegativecircled": "\u2793", "/dngb:threenegativecircled": "\u2778", "/dngb:threesanscircled": "\u2782", "/dngb:threesansnegativecircled": "\u278C", "/dngb:twonegativecircled": "\u2777", "/dngb:twosanscircled": "\u2781", "/dngb:twosansnegativecircled": "\u278B", "/dngb:verticalbarheavy": "\u275A", "/dngb:verticalbarlight": "\u2758", "/dngb:verticalbarmedium": "\u2759", "/dnheavyhorzlight": "\u2530", "/dnheavyleftlight": "\u2512", "/dnheavyleftuplight": "\u2527", "/dnheavyrightlight": "\u250E", "/dnheavyrightuplight": "\u251F", "/dnheavyuphorzlight": "\u2541", "/dnlighthorzheavy": "\u252F", "/dnlightleftheavy": "\u2511", "/dnlightleftupheavy": "\u2529", "/dnlightrightheavy": "\u250D", "/dnlightrightupheavy": "\u2521", "/dnlightuphorzheavy": "\u2547", "/dnsnghorzdbl": "\u2564", "/dnsngleftdbl": "\u2555", "/dnsngrightdbl": "\u2552", "/doNotLitter": "\u1F6AF", "/dochadathai": "\u0E0E", "/document": "\u1F5CE", "/documentPicture": "\u1F5BB", "/documentText": "\u1F5B9", "/documentTextAndPicture": "\u1F5BA", "/dodekthai": "\u0E14", "/doesnotcontainasnormalsubgroorequalup": "\u22ED", "/doesnotcontainasnormalsubgroup": "\u22EB", "/doesnotdivide": "\u2224", "/doesnotforce": "\u22AE", "/doesnotprecede": "\u2280", "/doesnotprecedeorequal": "\u22E0", "/doesnotprove": "\u22AC", "/doesnotsucceed": "\u2281", "/doesnotsucceedorequal": "\u22E1", "/dog": "\u1F415", "/dogFace": "\u1F436", "/dohiragana": "\u3069", "/dokatakana": 
"\u30C9", "/dollar": "\u0024", "/dollarinferior": "\uF6E3", "/dollarmonospace": "\uFF04", "/dollaroldstyle": "\uF724", "/dollarsmall": "\uFE69", "/dollarsuperior": "\uF6E4", "/dolphin": "\u1F42C", "/dominohorizontal_00_00": "\u1F031", "/dominohorizontal_00_01": "\u1F032", "/dominohorizontal_00_02": "\u1F033", "/dominohorizontal_00_03": "\u1F034", "/dominohorizontal_00_04": "\u1F035", "/dominohorizontal_00_05": "\u1F036", "/dominohorizontal_00_06": "\u1F037", "/dominohorizontal_01_00": "\u1F038", "/dominohorizontal_01_01": "\u1F039", "/dominohorizontal_01_02": "\u1F03A", "/dominohorizontal_01_03": "\u1F03B", "/dominohorizontal_01_04": "\u1F03C", "/dominohorizontal_01_05": "\u1F03D", "/dominohorizontal_01_06": "\u1F03E", "/dominohorizontal_02_00": "\u1F03F", "/dominohorizontal_02_01": "\u1F040", "/dominohorizontal_02_02": "\u1F041", "/dominohorizontal_02_03": "\u1F042", "/dominohorizontal_02_04": "\u1F043", "/dominohorizontal_02_05": "\u1F044", "/dominohorizontal_02_06": "\u1F045", "/dominohorizontal_03_00": "\u1F046", "/dominohorizontal_03_01": "\u1F047", "/dominohorizontal_03_02": "\u1F048", "/dominohorizontal_03_03": "\u1F049", "/dominohorizontal_03_04": "\u1F04A", "/dominohorizontal_03_05": "\u1F04B", "/dominohorizontal_03_06": "\u1F04C", "/dominohorizontal_04_00": "\u1F04D", "/dominohorizontal_04_01": "\u1F04E", "/dominohorizontal_04_02": "\u1F04F", "/dominohorizontal_04_03": "\u1F050", "/dominohorizontal_04_04": "\u1F051", "/dominohorizontal_04_05": "\u1F052", "/dominohorizontal_04_06": "\u1F053", "/dominohorizontal_05_00": "\u1F054", "/dominohorizontal_05_01": "\u1F055", "/dominohorizontal_05_02": "\u1F056", "/dominohorizontal_05_03": "\u1F057", "/dominohorizontal_05_04": "\u1F058", "/dominohorizontal_05_05": "\u1F059", "/dominohorizontal_05_06": "\u1F05A", "/dominohorizontal_06_00": "\u1F05B", "/dominohorizontal_06_01": "\u1F05C", "/dominohorizontal_06_02": "\u1F05D", "/dominohorizontal_06_03": "\u1F05E", "/dominohorizontal_06_04": "\u1F05F", 
"/dominohorizontal_06_05": "\u1F060", "/dominohorizontal_06_06": "\u1F061", "/dominohorizontalback": "\u1F030", "/dominovertical_00_00": "\u1F063", "/dominovertical_00_01": "\u1F064", "/dominovertical_00_02": "\u1F065", "/dominovertical_00_03": "\u1F066", "/dominovertical_00_04": "\u1F067", "/dominovertical_00_05": "\u1F068", "/dominovertical_00_06": "\u1F069", "/dominovertical_01_00": "\u1F06A", "/dominovertical_01_01": "\u1F06B", "/dominovertical_01_02": "\u1F06C", "/dominovertical_01_03": "\u1F06D", "/dominovertical_01_04": "\u1F06E", "/dominovertical_01_05": "\u1F06F", "/dominovertical_01_06": "\u1F070", "/dominovertical_02_00": "\u1F071", "/dominovertical_02_01": "\u1F072", "/dominovertical_02_02": "\u1F073", "/dominovertical_02_03": "\u1F074", "/dominovertical_02_04": "\u1F075", "/dominovertical_02_05": "\u1F076", "/dominovertical_02_06": "\u1F077", "/dominovertical_03_00": "\u1F078", "/dominovertical_03_01": "\u1F079", "/dominovertical_03_02": "\u1F07A", "/dominovertical_03_03": "\u1F07B", "/dominovertical_03_04": "\u1F07C", "/dominovertical_03_05": "\u1F07D", "/dominovertical_03_06": "\u1F07E", "/dominovertical_04_00": "\u1F07F", "/dominovertical_04_01": "\u1F080", "/dominovertical_04_02": "\u1F081", "/dominovertical_04_03": "\u1F082", "/dominovertical_04_04": "\u1F083", "/dominovertical_04_05": "\u1F084", "/dominovertical_04_06": "\u1F085", "/dominovertical_05_00": "\u1F086", "/dominovertical_05_01": "\u1F087", "/dominovertical_05_02": "\u1F088", "/dominovertical_05_03": "\u1F089", "/dominovertical_05_04": "\u1F08A", "/dominovertical_05_05": "\u1F08B", "/dominovertical_05_06": "\u1F08C", "/dominovertical_06_00": "\u1F08D", "/dominovertical_06_01": "\u1F08E", "/dominovertical_06_02": "\u1F08F", "/dominovertical_06_03": "\u1F090", "/dominovertical_06_04": "\u1F091", "/dominovertical_06_05": "\u1F092", "/dominovertical_06_06": "\u1F093", "/dominoverticalback": "\u1F062", "/dong": "\u20AB", "/door": "\u1F6AA", "/dorusquare": "\u3326", "/dot": "\u27D1", 
"/dotaccent": "\u02D9", "/dotaccentcmb": "\u0307", "/dotbelowcmb": "\u0323", "/dotbelowcomb": "\u0323", "/dotkatakana": "\u30FB", "/dotlessbeh": "\u066E", "/dotlessfeh": "\u06A1", "/dotlessi": "\u0131", "/dotlessj": "\uF6BE", "/dotlessjstroke": "\u025F", "/dotlessjstrokehook": "\u0284", "/dotlesskhahabove": "\u06E1", "/dotlessqaf": "\u066F", "/dotlower:hb": "\u05C5", "/dotmath": "\u22C5", "/dotminus": "\u2238", "/dotplus": "\u2214", "/dotraised": "\u2E33", "/dots1": "\u2801", "/dots12": "\u2803", "/dots123": "\u2807", "/dots1234": "\u280F", "/dots12345": "\u281F", "/dots123456": "\u283F", "/dots1234567": "\u287F", "/dots12345678": "\u28FF", "/dots1234568": "\u28BF", "/dots123457": "\u285F", "/dots1234578": "\u28DF", "/dots123458": "\u289F", "/dots12346": "\u282F", "/dots123467": "\u286F", "/dots1234678": "\u28EF", "/dots123468": "\u28AF", "/dots12347": "\u284F", "/dots123478": "\u28CF", "/dots12348": "\u288F", "/dots1235": "\u2817", "/dots12356": "\u2837", "/dots123567": "\u2877", "/dots1235678": "\u28F7", "/dots123568": "\u28B7", "/dots12357": "\u2857", "/dots123578": "\u28D7", "/dots12358": "\u2897", "/dots1236": "\u2827", "/dots12367": "\u2867", "/dots123678": "\u28E7", "/dots12368": "\u28A7", "/dots1237": "\u2847", "/dots12378": "\u28C7", "/dots1238": "\u2887", "/dots124": "\u280B", "/dots1245": "\u281B", "/dots12456": "\u283B", "/dots124567": "\u287B", "/dots1245678": "\u28FB", "/dots124568": "\u28BB", "/dots12457": "\u285B", "/dots124578": "\u28DB", "/dots12458": "\u289B", "/dots1246": "\u282B", "/dots12467": "\u286B", "/dots124678": "\u28EB", "/dots12468": "\u28AB", "/dots1247": "\u284B", "/dots12478": "\u28CB", "/dots1248": "\u288B", "/dots125": "\u2813", "/dots1256": "\u2833", "/dots12567": "\u2873", "/dots125678": "\u28F3", "/dots12568": "\u28B3", "/dots1257": "\u2853", "/dots12578": "\u28D3", "/dots1258": "\u2893", "/dots126": "\u2823", "/dots1267": "\u2863", "/dots12678": "\u28E3", "/dots1268": "\u28A3", "/dots127": "\u2843", "/dots1278": "\u28C3", 
"/dots128": "\u2883", "/dots13": "\u2805", "/dots134": "\u280D", "/dots1345": "\u281D", "/dots13456": "\u283D", "/dots134567": "\u287D", "/dots1345678": "\u28FD", "/dots134568": "\u28BD", "/dots13457": "\u285D", "/dots134578": "\u28DD", "/dots13458": "\u289D", "/dots1346": "\u282D", "/dots13467": "\u286D", "/dots134678": "\u28ED", "/dots13468": "\u28AD", "/dots1347": "\u284D", "/dots13478": "\u28CD", "/dots1348": "\u288D", "/dots135": "\u2815", "/dots1356": "\u2835", "/dots13567": "\u2875", "/dots135678": "\u28F5", "/dots13568": "\u28B5", "/dots1357": "\u2855", "/dots13578": "\u28D5", "/dots1358": "\u2895", "/dots136": "\u2825", "/dots1367": "\u2865", "/dots13678": "\u28E5", "/dots1368": "\u28A5", "/dots137": "\u2845", "/dots1378": "\u28C5", "/dots138": "\u2885", "/dots14": "\u2809", "/dots145": "\u2819", "/dots1456": "\u2839", "/dots14567": "\u2879", "/dots145678": "\u28F9", "/dots14568": "\u28B9", "/dots1457": "\u2859", "/dots14578": "\u28D9", "/dots1458": "\u2899", "/dots146": "\u2829", "/dots1467": "\u2869", "/dots14678": "\u28E9", "/dots1468": "\u28A9", "/dots147": "\u2849", "/dots1478": "\u28C9", "/dots148": "\u2889", "/dots15": "\u2811", "/dots156": "\u2831", "/dots1567": "\u2871", "/dots15678": "\u28F1", "/dots1568": "\u28B1", "/dots157": "\u2851", "/dots1578": "\u28D1", "/dots158": "\u2891", "/dots16": "\u2821", "/dots167": "\u2861", "/dots1678": "\u28E1", "/dots168": "\u28A1", "/dots17": "\u2841", "/dots178": "\u28C1", "/dots18": "\u2881", "/dots2": "\u2802", "/dots23": "\u2806", "/dots234": "\u280E", "/dots2345": "\u281E", "/dots23456": "\u283E", "/dots234567": "\u287E", "/dots2345678": "\u28FE", "/dots234568": "\u28BE", "/dots23457": "\u285E", "/dots234578": "\u28DE", "/dots23458": "\u289E", "/dots2346": "\u282E", "/dots23467": "\u286E", "/dots234678": "\u28EE", "/dots23468": "\u28AE", "/dots2347": "\u284E", "/dots23478": "\u28CE", "/dots2348": "\u288E", "/dots235": "\u2816", "/dots2356": "\u2836", "/dots23567": "\u2876", "/dots235678": "\u28F6", 
"/dots23568": "\u28B6", "/dots2357": "\u2856", "/dots23578": "\u28D6", "/dots2358": "\u2896", "/dots236": "\u2826", "/dots2367": "\u2866", "/dots23678": "\u28E6", "/dots2368": "\u28A6", "/dots237": "\u2846", "/dots2378": "\u28C6", "/dots238": "\u2886", "/dots24": "\u280A", "/dots245": "\u281A", "/dots2456": "\u283A", "/dots24567": "\u287A", "/dots245678": "\u28FA", "/dots24568": "\u28BA", "/dots2457": "\u285A", "/dots24578": "\u28DA", "/dots2458": "\u289A", "/dots246": "\u282A", "/dots2467": "\u286A", "/dots24678": "\u28EA", "/dots2468": "\u28AA", "/dots247": "\u284A", "/dots2478": "\u28CA", "/dots248": "\u288A", "/dots25": "\u2812", "/dots256": "\u2832", "/dots2567": "\u2872", "/dots25678": "\u28F2", "/dots2568": "\u28B2", "/dots257": "\u2852", "/dots2578": "\u28D2", "/dots258": "\u2892", "/dots26": "\u2822", "/dots267": "\u2862", "/dots2678": "\u28E2", "/dots268": "\u28A2", "/dots27": "\u2842", "/dots278": "\u28C2", "/dots28": "\u2882", "/dots3": "\u2804", "/dots34": "\u280C", "/dots345": "\u281C", "/dots3456": "\u283C", "/dots34567": "\u287C", "/dots345678": "\u28FC", "/dots34568": "\u28BC", "/dots3457": "\u285C", "/dots34578": "\u28DC", "/dots3458": "\u289C", "/dots346": "\u282C", "/dots3467": "\u286C", "/dots34678": "\u28EC", "/dots3468": "\u28AC", "/dots347": "\u284C", "/dots3478": "\u28CC", "/dots348": "\u288C", "/dots35": "\u2814", "/dots356": "\u2834", "/dots3567": "\u2874", "/dots35678": "\u28F4", "/dots3568": "\u28B4", "/dots357": "\u2854", "/dots3578": "\u28D4", "/dots358": "\u2894", "/dots36": "\u2824", "/dots367": "\u2864", "/dots3678": "\u28E4", "/dots368": "\u28A4", "/dots37": "\u2844", "/dots378": "\u28C4", "/dots38": "\u2884", "/dots4": "\u2808", "/dots45": "\u2818", "/dots456": "\u2838", "/dots4567": "\u2878", "/dots45678": "\u28F8", "/dots4568": "\u28B8", "/dots457": "\u2858", "/dots4578": "\u28D8", "/dots458": "\u2898", "/dots46": "\u2828", "/dots467": "\u2868", "/dots4678": "\u28E8", "/dots468": "\u28A8", "/dots47": "\u2848", "/dots478": 
"\u28C8", "/dots48": "\u2888", "/dots5": "\u2810", "/dots56": "\u2830", "/dots567": "\u2870", "/dots5678": "\u28F0", "/dots568": "\u28B0", "/dots57": "\u2850", "/dots578": "\u28D0", "/dots58": "\u2890", "/dots6": "\u2820", "/dots67": "\u2860", "/dots678": "\u28E0", "/dots68": "\u28A0", "/dots7": "\u2840", "/dots78": "\u28C0", "/dots8": "\u2880", "/dotsquarefour": "\u2E2C", "/dottedcircle": "\u25CC", "/dottedcross": "\u205C", "/dotupper:hb": "\u05C4", "/doublebarvertical": "\u23F8", "/doubleyodpatah": "\uFB1F", "/doubleyodpatahhebrew": "\uFB1F", "/doughnut": "\u1F369", "/doveOfPeace": "\u1F54A", "/downtackbelowcmb": "\u031E", "/downtackmod": "\u02D5", "/downwarrowleftofuparrow": "\u21F5", "/dparen": "\u249F", "/dparenthesized": "\u249F", "/drachma": "\u20AF", "/dragon": "\u1F409", "/dragonFace": "\u1F432", "/draughtskingblack": "\u26C3", "/draughtskingwhite": "\u26C1", "/draughtsmanblack": "\u26C2", "/draughtsmanwhite": "\u26C0", "/dress": "\u1F457", "/driveslow": "\u26DA", "/dromedaryCamel": "\u1F42A", "/droplet": "\u1F4A7", "/dsquare": "\u1F1A5", "/dsuperior": "\uF6EB", "/dtail": "\u0256", "/dtopbar": "\u018C", "/duhiragana": "\u3065", "/dukatakana": "\u30C5", "/dul": "\u068E", "/dul.fina": "\uFB87", "/dul.isol": "\uFB86", "/dum": "\uA771", "/dvd": "\u1F4C0", "/dyeh": "\u0684", "/dyeh.fina": "\uFB73", "/dyeh.init": "\uFB74", "/dyeh.isol": "\uFB72", "/dyeh.medi": "\uFB75", "/dz": "\u01F3", "/dzaltone": "\u02A3", "/dzcaron": "\u01C6", "/dzcurl": "\u02A5", "/dzeabkhasiancyrillic": "\u04E1", "/dzeabkhcyr": "\u04E1", "/dzecyr": "\u0455", "/dzecyrillic": "\u0455", "/dzed": "\u02A3", "/dzedcurl": "\u02A5", "/dzhecyr": "\u045F", "/dzhecyrillic": "\u045F", "/dzjekomicyr": "\u0507", "/dzzhecyr": "\u052B", "/e": "\u0065", "/e-mail": "\u1F4E7", "/e.fina": "\uFBE5", "/e.inferior": "\u2091", "/e.init": "\uFBE6", "/e.isol": "\uFBE4", "/e.medi": "\uFBE7", "/eVfullwidth": "\u32CE", "/eacute": "\u00E9", "/earOfMaize": "\u1F33D", "/earOfRice": "\u1F33E", "/earth": "\u2641", 
"/earthGlobeAmericas": "\u1F30E", "/earthGlobeAsiaAustralia": "\u1F30F", "/earthGlobeEuropeAfrica": "\u1F30D", "/earthground": "\u23DA", "/earthideographiccircled": "\u328F", "/earthideographicparen": "\u322F", "/eastsyriaccross": "\u2671", "/ebengali": "\u098F", "/ebopomofo": "\u311C", "/ebreve": "\u0115", "/ecandradeva": "\u090D", "/ecandragujarati": "\u0A8D", "/ecandravowelsigndeva": "\u0945", "/ecandravowelsigngujarati": "\u0AC5", "/ecaron": "\u011B", "/ecedilla": "\u0229", "/ecedillabreve": "\u1E1D", "/echarmenian": "\u0565", "/echyiwnarmenian": "\u0587", "/ecircle": "\u24D4", "/ecirclekatakana": "\u32D3", "/ecircumflex": "\u00EA", "/ecircumflexacute": "\u1EBF", "/ecircumflexbelow": "\u1E19", "/ecircumflexdotbelow": "\u1EC7", "/ecircumflexgrave": "\u1EC1", "/ecircumflexhoi": "\u1EC3", "/ecircumflexhookabove": "\u1EC3", "/ecircumflextilde": "\u1EC5", "/ecyrillic": "\u0454", "/edblgrave": "\u0205", "/edblstruckitalic": "\u2147", "/edeva": "\u090F", "/edieresis": "\u00EB", "/edot": "\u0117", "/edotaccent": "\u0117", "/edotbelow": "\u1EB9", "/eegurmukhi": "\u0A0F", "/eekaasquare": "\u3308", "/eematragurmukhi": "\u0A47", "/efcyr": "\u0444", "/efcyrillic": "\u0444", "/egrave": "\u00E8", "/egravedbl": "\u0205", "/egujarati": "\u0A8F", "/egyptain": "\uA725", "/egyptalef": "\uA723", "/eharmenian": "\u0567", "/ehbopomofo": "\u311D", "/ehiragana": "\u3048", "/ehoi": "\u1EBB", "/ehookabove": "\u1EBB", "/eibopomofo": "\u311F", "/eight": "\u0038", "/eight.inferior": "\u2088", "/eight.roman": "\u2167", "/eight.romansmall": "\u2177", "/eight.superior": "\u2078", "/eightarabic": "\u0668", "/eightbengali": "\u09EE", "/eightcircle": "\u2467", "/eightcircledbl": "\u24FC", "/eightcircleinversesansserif": "\u2791", "/eightcomma": "\u1F109", "/eightdeva": "\u096E", "/eighteencircle": "\u2471", "/eighteencircleblack": "\u24F2", "/eighteenparen": "\u2485", "/eighteenparenthesized": "\u2485", "/eighteenperiod": "\u2499", "/eightfar": "\u06F8", "/eightgujarati": "\u0AEE", 
"/eightgurmukhi": "\u0A6E", "/eighthackarabic": "\u0668", "/eighthangzhou": "\u3028", "/eighthnote": "\u266A", "/eighthnotebeamed": "\u266B", "/eightideographiccircled": "\u3287", "/eightideographicparen": "\u3227", "/eightinferior": "\u2088", "/eightksquare": "\u1F19F", "/eightmonospace": "\uFF18", "/eightoldstyle": "\uF738", "/eightparen": "\u247B", "/eightparenthesized": "\u247B", "/eightperiod": "\u248F", "/eightpersian": "\u06F8", "/eightroman": "\u2177", "/eightsuperior": "\u2078", "/eightthai": "\u0E58", "/eightycirclesquare": "\u324F", "/einvertedbreve": "\u0207", "/eiotifiedcyr": "\u0465", "/eiotifiedcyrillic": "\u0465", "/eject": "\u23CF", "/ekatakana": "\u30A8", "/ekatakanahalfwidth": "\uFF74", "/ekonkargurmukhi": "\u0A74", "/ekorean": "\u3154", "/elcyr": "\u043B", "/elcyrillic": "\u043B", "/electricLightBulb": "\u1F4A1", "/electricPlug": "\u1F50C", "/electricTorch": "\u1F526", "/electricalintersection": "\u23E7", "/electricarrow": "\u2301", "/element": "\u2208", "/elementdotabove": "\u22F5", "/elementlonghorizontalstroke": "\u22F2", "/elementopeningup": "\u27D2", "/elementoverbar": "\u22F6", "/elementoverbarsmall": "\u22F7", "/elementsmall": "\u220A", "/elementsmallverticalbarhorizontalstroke": "\u22F4", "/elementtwoshorizontalstroke": "\u22F9", "/elementunderbar": "\u22F8", "/elementverticalbarhorizontalstroke": "\u22F3", "/elephant": "\u1F418", "/eleven.roman": "\u216A", "/eleven.romansmall": "\u217A", "/elevencircle": "\u246A", "/elevencircleblack": "\u24EB", "/elevenparen": "\u247E", "/elevenparenthesized": "\u247E", "/elevenperiod": "\u2492", "/elevenroman": "\u217A", "/elhookcyr": "\u0513", "/ellipsis": "\u2026", "/ellipsisdiagonaldownright": "\u22F1", "/ellipsisdiagonalupright": "\u22F0", "/ellipsismidhorizontal": "\u22EF", "/ellipsisvertical": "\u22EE", "/elmiddlehookcyr": "\u0521", "/elsharptailcyr": "\u04C6", "/eltailcyr": "\u052F", "/emacron": "\u0113", "/emacronacute": "\u1E17", "/emacrongrave": "\u1E15", "/emcyr": "\u043C", "/emcyrillic": 
"\u043C", "/emdash": "\u2014", "/emdashdbl": "\u2E3A", "/emdashtpl": "\u2E3B", "/emdashvertical": "\uFE31", "/emojiModifierFitzpatrickType-1-2": "\u1F3FB", "/emojiModifierFitzpatrickType-3": "\u1F3FC", "/emojiModifierFitzpatrickType-4": "\u1F3FD", "/emojiModifierFitzpatrickType-5": "\u1F3FE", "/emojiModifierFitzpatrickType-6": "\u1F3FF", "/emonospace": "\uFF45", "/emphasis": "\u2383", "/emphasismarkarmenian": "\u055B", "/emptyDocument": "\u1F5CB", "/emptyNote": "\u1F5C5", "/emptyNotePad": "\u1F5C7", "/emptyNotePage": "\u1F5C6", "/emptyPage": "\u1F5CC", "/emptyPages": "\u1F5CD", "/emptyset": "\u2205", "/emquad": "\u2001", "/emsharptailcyr": "\u04CE", "/emspace": "\u2003", "/enbopomofo": "\u3123", "/encyr": "\u043D", "/encyrillic": "\u043D", "/endLeftwardsArrowAbove": "\u1F51A", "/endash": "\u2013", "/endashvertical": "\uFE32", "/endescendercyrillic": "\u04A3", "/endpro": "\u220E", "/eng": "\u014B", "/engbopomofo": "\u3125", "/engecyr": "\u04A5", "/enghecyrillic": "\u04A5", "/enhookcyr": "\u04C8", "/enhookcyrillic": "\u04C8", "/enhookleftcyr": "\u0529", "/enmiddlehookcyr": "\u0523", "/enotch": "\u2C78", "/enquad": "\u2000", "/ensharptailcyr": "\u04CA", "/enspace": "\u2002", "/entailcyr": "\u04A3", "/enter": "\u2386", "/enterpriseideographiccircled": "\u32AD", "/enterpriseideographicparen": "\u323D", "/envelopeDownwardsArrowAbove": "\u1F4E9", "/envelopeLightning": "\u1F584", "/eogonek": "\u0119", "/eokorean": "\u3153", "/eopen": "\u025B", "/eopenclosed": "\u029A", "/eopenreversed": "\u025C", "/eopenreversedclosed": "\u025E", "/eopenreversedhook": "\u025D", "/eparen": "\u24A0", "/eparenthesized": "\u24A0", "/epsilon": "\u03B5", "/epsilonacute": "\u1F73", "/epsilonasper": "\u1F11", "/epsilonasperacute": "\u1F15", "/epsilonaspergrave": "\u1F13", "/epsilongrave": "\u1F72", "/epsilonlenis": "\u1F10", "/epsilonlenisacute": "\u1F14", "/epsilonlenisgrave": "\u1F12", "/epsilonlunatesymbol": "\u03F5", "/epsilonreversedlunatesymbol": "\u03F6", "/epsilontonos": "\u03AD", 
"/epsilonunderlinefunc": "\u2377", "/equal": "\u003D", "/equal.inferior": "\u208C", "/equal.superior": "\u207C", "/equalandparallel": "\u22D5", "/equalbydefinition": "\u225D", "/equalmonospace": "\uFF1D", "/equalorgreater": "\u22DD", "/equalorless": "\u22DC", "/equalorprecedes": "\u22DE", "/equalorsucceeds": "\u22DF", "/equalscolon": "\u2255", "/equalsmall": "\uFE66", "/equalsuperior": "\u207C", "/equiangular": "\u225A", "/equivalence": "\u2261", "/equivalent": "\u224D", "/eranameheiseisquare": "\u337B", "/eranamemeizisquare": "\u337E", "/eranamesyouwasquare": "\u337C", "/eranametaisyousquare": "\u337D", "/eraseleft": "\u232B", "/eraseright": "\u2326", "/erbopomofo": "\u3126", "/ercyr": "\u0440", "/ercyrillic": "\u0440", "/ereversed": "\u0258", "/ereversedcyr": "\u044D", "/ereversedcyrillic": "\u044D", "/ereverseddieresiscyr": "\u04ED", "/ergfullwidth": "\u32CD", "/ertickcyr": "\u048F", "/escript": "\u212F", "/escyr": "\u0441", "/escyrillic": "\u0441", "/esdescendercyrillic": "\u04AB", "/esh": "\u0283", "/eshcurl": "\u0286", "/eshortdeva": "\u090E", "/eshortvowelsigndeva": "\u0946", "/eshreversedloop": "\u01AA", "/eshsquatreversed": "\u0285", "/esmallhiragana": "\u3047", "/esmallkatakana": "\u30A7", "/esmallkatakanahalfwidth": "\uFF6A", "/estailcyr": "\u04AB", "/estimated": "\u212E", "/estimates": "\u2259", "/estroke": "\u0247", "/esukuudosquare": "\u3307", "/esuperior": "\uF6EC", "/et": "\uA76B", "/eta": "\u03B7", "/etaacute": "\u1F75", "/etaacuteiotasub": "\u1FC4", "/etaasper": "\u1F21", "/etaasperacute": "\u1F25", "/etaasperacuteiotasub": "\u1F95", "/etaaspergrave": "\u1F23", "/etaaspergraveiotasub": "\u1F93", "/etaasperiotasub": "\u1F91", "/etaaspertilde": "\u1F27", "/etaaspertildeiotasub": "\u1F97", "/etagrave": "\u1F74", "/etagraveiotasub": "\u1FC2", "/etaiotasub": "\u1FC3", "/etalenis": "\u1F20", "/etalenisacute": "\u1F24", "/etalenisacuteiotasub": "\u1F94", "/etalenisgrave": "\u1F22", "/etalenisgraveiotasub": "\u1F92", "/etalenisiotasub": "\u1F90", 
"/etalenistilde": "\u1F26", "/etalenistildeiotasub": "\u1F96", "/etarmenian": "\u0568", "/etatilde": "\u1FC6", "/etatildeiotasub": "\u1FC7", "/etatonos": "\u03AE", "/eth": "\u00F0", "/ethi:aaglottal": "\u12A3", "/ethi:aglottal": "\u12A0", "/ethi:ba": "\u1260", "/ethi:baa": "\u1263", "/ethi:be": "\u1265", "/ethi:bee": "\u1264", "/ethi:bi": "\u1262", "/ethi:bo": "\u1266", "/ethi:bu": "\u1261", "/ethi:bwa": "\u1267", "/ethi:ca": "\u1278", "/ethi:caa": "\u127B", "/ethi:ce": "\u127D", "/ethi:cee": "\u127C", "/ethi:cha": "\u1328", "/ethi:chaa": "\u132B", "/ethi:che": "\u132D", "/ethi:chee": "\u132C", "/ethi:chi": "\u132A", "/ethi:cho": "\u132E", "/ethi:chu": "\u1329", "/ethi:chwa": "\u132F", "/ethi:ci": "\u127A", "/ethi:co": "\u127E", "/ethi:colon": "\u1365", "/ethi:comma": "\u1363", "/ethi:cu": "\u1279", "/ethi:cwa": "\u127F", "/ethi:da": "\u12F0", "/ethi:daa": "\u12F3", "/ethi:dda": "\u12F8", "/ethi:ddaa": "\u12FB", "/ethi:dde": "\u12FD", "/ethi:ddee": "\u12FC", "/ethi:ddi": "\u12FA", "/ethi:ddo": "\u12FE", "/ethi:ddu": "\u12F9", "/ethi:ddwa": "\u12FF", "/ethi:de": "\u12F5", "/ethi:dee": "\u12F4", "/ethi:di": "\u12F2", "/ethi:do": "\u12F6", "/ethi:du": "\u12F1", "/ethi:dwa": "\u12F7", "/ethi:eeglottal": "\u12A4", "/ethi:eglottal": "\u12A5", "/ethi:eight": "\u1370", "/ethi:eighty": "\u1379", "/ethi:fa": "\u1348", "/ethi:faa": "\u134B", "/ethi:fe": "\u134D", "/ethi:fee": "\u134C", "/ethi:fi": "\u134A", "/ethi:fifty": "\u1376", "/ethi:five": "\u136D", "/ethi:fo": "\u134E", "/ethi:forty": "\u1375", "/ethi:four": "\u136C", "/ethi:fu": "\u1349", "/ethi:fullstop": "\u1362", "/ethi:fwa": "\u134F", "/ethi:fya": "\u135A", "/ethi:ga": "\u1308", "/ethi:gaa": "\u130B", "/ethi:ge": "\u130D", "/ethi:gee": "\u130C", "/ethi:geminationandvowellengthmarkcmb": "\u135D", "/ethi:geminationmarkcmb": "\u135F", "/ethi:gga": "\u1318", "/ethi:ggaa": "\u131B", "/ethi:gge": "\u131D", "/ethi:ggee": "\u131C", "/ethi:ggi": "\u131A", "/ethi:ggo": "\u131E", "/ethi:ggu": "\u1319", "/ethi:ggwaa": 
"\u131F", "/ethi:gi": "\u130A", "/ethi:go": "\u130E", "/ethi:goa": "\u130F", "/ethi:gu": "\u1309", "/ethi:gwa": "\u1310", "/ethi:gwaa": "\u1313", "/ethi:gwe": "\u1315", "/ethi:gwee": "\u1314", "/ethi:gwi": "\u1312", "/ethi:ha": "\u1200", "/ethi:haa": "\u1203", "/ethi:he": "\u1205", "/ethi:hee": "\u1204", "/ethi:hha": "\u1210", "/ethi:hhaa": "\u1213", "/ethi:hhe": "\u1215", "/ethi:hhee": "\u1214", "/ethi:hhi": "\u1212", "/ethi:hho": "\u1216", "/ethi:hhu": "\u1211", "/ethi:hhwa": "\u1217", "/ethi:hi": "\u1202", "/ethi:ho": "\u1206", "/ethi:hoa": "\u1207", "/ethi:hu": "\u1201", "/ethi:hundred": "\u137B", "/ethi:iglottal": "\u12A2", "/ethi:ja": "\u1300", "/ethi:jaa": "\u1303", "/ethi:je": "\u1305", "/ethi:jee": "\u1304", "/ethi:ji": "\u1302", "/ethi:jo": "\u1306", "/ethi:ju": "\u1301", "/ethi:jwa": "\u1307", "/ethi:ka": "\u12A8", "/ethi:kaa": "\u12AB", "/ethi:ke": "\u12AD", "/ethi:kee": "\u12AC", "/ethi:ki": "\u12AA", "/ethi:ko": "\u12AE", "/ethi:koa": "\u12AF", "/ethi:ku": "\u12A9", "/ethi:kwa": "\u12B0", "/ethi:kwaa": "\u12B3", "/ethi:kwe": "\u12B5", "/ethi:kwee": "\u12B4", "/ethi:kwi": "\u12B2", "/ethi:kxa": "\u12B8", "/ethi:kxaa": "\u12BB", "/ethi:kxe": "\u12BD", "/ethi:kxee": "\u12BC", "/ethi:kxi": "\u12BA", "/ethi:kxo": "\u12BE", "/ethi:kxu": "\u12B9", "/ethi:kxwa": "\u12C0", "/ethi:kxwaa": "\u12C3", "/ethi:kxwe": "\u12C5", "/ethi:kxwee": "\u12C4", "/ethi:kxwi": "\u12C2", "/ethi:la": "\u1208", "/ethi:laa": "\u120B", "/ethi:le": "\u120D", "/ethi:lee": "\u120C", "/ethi:li": "\u120A", "/ethi:lo": "\u120E", "/ethi:lu": "\u1209", "/ethi:lwa": "\u120F", "/ethi:ma": "\u1218", "/ethi:maa": "\u121B", "/ethi:me": "\u121D", "/ethi:mee": "\u121C", "/ethi:mi": "\u121A", "/ethi:mo": "\u121E", "/ethi:mu": "\u1219", "/ethi:mwa": "\u121F", "/ethi:mya": "\u1359", "/ethi:na": "\u1290", "/ethi:naa": "\u1293", "/ethi:ne": "\u1295", "/ethi:nee": "\u1294", "/ethi:ni": "\u1292", "/ethi:nine": "\u1371", "/ethi:ninety": "\u137A", "/ethi:no": "\u1296", "/ethi:nu": "\u1291", "/ethi:nwa": 
"\u1297", "/ethi:nya": "\u1298", "/ethi:nyaa": "\u129B", "/ethi:nye": "\u129D", "/ethi:nyee": "\u129C", "/ethi:nyi": "\u129A", "/ethi:nyo": "\u129E", "/ethi:nyu": "\u1299", "/ethi:nywa": "\u129F", "/ethi:oglottal": "\u12A6", "/ethi:one": "\u1369", "/ethi:pa": "\u1350", "/ethi:paa": "\u1353", "/ethi:paragraphseparator": "\u1368", "/ethi:pe": "\u1355", "/ethi:pee": "\u1354", "/ethi:pha": "\u1330", "/ethi:phaa": "\u1333", "/ethi:pharyngeala": "\u12D0", "/ethi:pharyngealaa": "\u12D3", "/ethi:pharyngeale": "\u12D5", "/ethi:pharyngealee": "\u12D4", "/ethi:pharyngeali": "\u12D2", "/ethi:pharyngealo": "\u12D6", "/ethi:pharyngealu": "\u12D1", "/ethi:phe": "\u1335", "/ethi:phee": "\u1334", "/ethi:phi": "\u1332", "/ethi:pho": "\u1336", "/ethi:phu": "\u1331", "/ethi:phwa": "\u1337", "/ethi:pi": "\u1352", "/ethi:po": "\u1356", "/ethi:prefacecolon": "\u1366", "/ethi:pu": "\u1351", "/ethi:pwa": "\u1357", "/ethi:qa": "\u1240", "/ethi:qaa": "\u1243", "/ethi:qe": "\u1245", "/ethi:qee": "\u1244", "/ethi:qha": "\u1250", "/ethi:qhaa": "\u1253", "/ethi:qhe": "\u1255", "/ethi:qhee": "\u1254", "/ethi:qhi": "\u1252", "/ethi:qho": "\u1256", "/ethi:qhu": "\u1251", "/ethi:qhwa": "\u1258", "/ethi:qhwaa": "\u125B", "/ethi:qhwe": "\u125D", "/ethi:qhwee": "\u125C", "/ethi:qhwi": "\u125A", "/ethi:qi": "\u1242", "/ethi:qo": "\u1246", "/ethi:qoa": "\u1247", "/ethi:qu": "\u1241", "/ethi:questionmark": "\u1367", "/ethi:qwa": "\u1248", "/ethi:qwaa": "\u124B", "/ethi:qwe": "\u124D", "/ethi:qwee": "\u124C", "/ethi:qwi": "\u124A", "/ethi:ra": "\u1228", "/ethi:raa": "\u122B", "/ethi:re": "\u122D", "/ethi:ree": "\u122C", "/ethi:ri": "\u122A", "/ethi:ro": "\u122E", "/ethi:ru": "\u1229", "/ethi:rwa": "\u122F", "/ethi:rya": "\u1358", "/ethi:sa": "\u1230", "/ethi:saa": "\u1233", "/ethi:se": "\u1235", "/ethi:sectionmark": "\u1360", "/ethi:see": "\u1234", "/ethi:semicolon": "\u1364", "/ethi:seven": "\u136F", "/ethi:seventy": "\u1378", "/ethi:sha": "\u1238", "/ethi:shaa": "\u123B", "/ethi:she": "\u123D", 
"/ethi:shee": "\u123C", "/ethi:shi": "\u123A", "/ethi:sho": "\u123E", "/ethi:shu": "\u1239", "/ethi:shwa": "\u123F", "/ethi:si": "\u1232", "/ethi:six": "\u136E", "/ethi:sixty": "\u1377", "/ethi:so": "\u1236", "/ethi:su": "\u1231", "/ethi:swa": "\u1237", "/ethi:sza": "\u1220", "/ethi:szaa": "\u1223", "/ethi:sze": "\u1225", "/ethi:szee": "\u1224", "/ethi:szi": "\u1222", "/ethi:szo": "\u1226", "/ethi:szu": "\u1221", "/ethi:szwa": "\u1227", "/ethi:ta": "\u1270", "/ethi:taa": "\u1273", "/ethi:te": "\u1275", "/ethi:tee": "\u1274", "/ethi:ten": "\u1372", "/ethi:tenthousand": "\u137C", "/ethi:tha": "\u1320", "/ethi:thaa": "\u1323", "/ethi:the": "\u1325", "/ethi:thee": "\u1324", "/ethi:thi": "\u1322", "/ethi:thirty": "\u1374", "/ethi:tho": "\u1326", "/ethi:three": "\u136B", "/ethi:thu": "\u1321", "/ethi:thwa": "\u1327", "/ethi:ti": "\u1272", "/ethi:to": "\u1276", "/ethi:tsa": "\u1338", "/ethi:tsaa": "\u133B", "/ethi:tse": "\u133D", "/ethi:tsee": "\u133C", "/ethi:tsi": "\u133A", "/ethi:tso": "\u133E", "/ethi:tsu": "\u1339", "/ethi:tswa": "\u133F", "/ethi:tu": "\u1271", "/ethi:twa": "\u1277", "/ethi:twenty": "\u1373", "/ethi:two": "\u136A", "/ethi:tza": "\u1340", "/ethi:tzaa": "\u1343", "/ethi:tze": "\u1345", "/ethi:tzee": "\u1344", "/ethi:tzi": "\u1342", "/ethi:tzo": "\u1346", "/ethi:tzoa": "\u1347", "/ethi:tzu": "\u1341", "/ethi:uglottal": "\u12A1", "/ethi:va": "\u1268", "/ethi:vaa": "\u126B", "/ethi:ve": "\u126D", "/ethi:vee": "\u126C", "/ethi:vi": "\u126A", "/ethi:vo": "\u126E", "/ethi:vowellengthmarkcmb": "\u135E", "/ethi:vu": "\u1269", "/ethi:vwa": "\u126F", "/ethi:wa": "\u12C8", "/ethi:waa": "\u12CB", "/ethi:waglottal": "\u12A7", "/ethi:we": "\u12CD", "/ethi:wee": "\u12CC", "/ethi:wi": "\u12CA", "/ethi:wo": "\u12CE", "/ethi:woa": "\u12CF", "/ethi:wordspace": "\u1361", "/ethi:wu": "\u12C9", "/ethi:xa": "\u1280", "/ethi:xaa": "\u1283", "/ethi:xe": "\u1285", "/ethi:xee": "\u1284", "/ethi:xi": "\u1282", "/ethi:xo": "\u1286", "/ethi:xoa": "\u1287", "/ethi:xu": "\u1281", 
"/ethi:xwa": "\u1288", "/ethi:xwaa": "\u128B", "/ethi:xwe": "\u128D", "/ethi:xwee": "\u128C", "/ethi:xwi": "\u128A", "/ethi:ya": "\u12E8", "/ethi:yaa": "\u12EB", "/ethi:ye": "\u12ED", "/ethi:yee": "\u12EC", "/ethi:yi": "\u12EA", "/ethi:yo": "\u12EE", "/ethi:yoa": "\u12EF", "/ethi:yu": "\u12E9", "/ethi:za": "\u12D8", "/ethi:zaa": "\u12DB", "/ethi:ze": "\u12DD", "/ethi:zee": "\u12DC", "/ethi:zha": "\u12E0", "/ethi:zhaa": "\u12E3", "/ethi:zhe": "\u12E5", "/ethi:zhee": "\u12E4", "/ethi:zhi": "\u12E2", "/ethi:zho": "\u12E6", "/ethi:zhu": "\u12E1", "/ethi:zhwa": "\u12E7", "/ethi:zi": "\u12DA", "/ethi:zo": "\u12DE", "/ethi:zu": "\u12D9", "/ethi:zwa": "\u12DF", "/etilde": "\u1EBD", "/etildebelow": "\u1E1B", "/etnahta:hb": "\u0591", "/etnahtafoukhhebrew": "\u0591", "/etnahtafoukhlefthebrew": "\u0591", "/etnahtahebrew": "\u0591", "/etnahtalefthebrew": "\u0591", "/eturned": "\u01DD", "/eukorean": "\u3161", "/eukrcyr": "\u0454", "/euler": "\u2107", "/euro": "\u20AC", "/euroarchaic": "\u20A0", "/europeanCastle": "\u1F3F0", "/europeanPostOffice": "\u1F3E4", "/evergreenTree": "\u1F332", "/evowelsignbengali": "\u09C7", "/evowelsigndeva": "\u0947", "/evowelsigngujarati": "\u0AC7", "/excellentideographiccircled": "\u329D", "/excess": "\u2239", "/exclam": "\u0021", "/exclamarmenian": "\u055C", "/exclamationquestion": "\u2049", "/exclamdbl": "\u203C", "/exclamdown": "\u00A1", "/exclamdownsmall": "\uF7A1", "/exclammonospace": "\uFF01", "/exclamsmall": "\uF721", "/existential": "\u2203", "/expressionlessFace": "\u1F611", "/extraterrestrialAlien": "\u1F47D", "/eye": "\u1F441", "/eyeglasses": "\u1F453", "/eyes": "\u1F440", "/ezh": "\u0292", "/ezhcaron": "\u01EF", "/ezhcurl": "\u0293", "/ezhreversed": "\u01B9", "/ezhtail": "\u01BA", "/f": "\u0066", "/f_f": "\uFB00", "/f_f_i": "\uFB03", "/f_f_l": "\uFB04", "/faceMassage": "\u1F486", "/faceSavouringDeliciousFood": "\u1F60B", "/faceScreamingInFear": "\u1F631", "/faceThrowingAKiss": "\u1F618", "/faceWithColdSweat": "\u1F613", 
"/faceWithLookOfTriumph": "\u1F624", "/faceWithMedicalMask": "\u1F637", "/faceWithNoGoodGesture": "\u1F645", "/faceWithOkGesture": "\u1F646", "/faceWithOpenMouth": "\u1F62E", "/faceWithOpenMouthAndColdSweat": "\u1F630", "/faceWithRollingEyes": "\u1F644", "/faceWithStuckOutTongue": "\u1F61B", "/faceWithStuckOutTongueAndTightlyClosedEyes": "\u1F61D", "/faceWithStuckOutTongueAndWinkingEye": "\u1F61C", "/faceWithTearsOfJoy": "\u1F602", "/faceWithoutMouth": "\u1F636", "/facsimile": "\u213B", "/factory": "\u1F3ED", "/fadeva": "\u095E", "/fagurmukhi": "\u0A5E", "/fahrenheit": "\u2109", "/fallenLeaf": "\u1F342", "/fallingdiagonal": "\u27CD", "/fallingdiagonalincircleinsquareblackwhite": "\u26DE", "/family": "\u1F46A", "/farsi": "\u262B", "/farsiYehDigitFourBelow": "\u0777", "/farsiYehDigitThreeAbove": "\u0776", "/farsiYehDigitTwoAbove": "\u0775", "/fatha": "\u064E", "/fathaIsol": "\uFE76", "/fathaMedi": "\uFE77", "/fathaarabic": "\u064E", "/fathalowarabic": "\u064E", "/fathasmall": "\u0618", "/fathatan": "\u064B", "/fathatanIsol": "\uFE70", "/fathatanarabic": "\u064B", "/fathatwodotsdots": "\u065E", "/fatherChristmas": "\u1F385", "/faxIcon": "\u1F5B7", "/faxMachine": "\u1F4E0", "/fbopomofo": "\u3108", "/fcircle": "\u24D5", "/fdot": "\u1E1F", "/fdotaccent": "\u1E1F", "/fearfulFace": "\u1F628", "/februarytelegraph": "\u32C1", "/feh.fina": "\uFED2", "/feh.init": "\uFED3", "/feh.init_alefmaksura.fina": "\uFC31", "/feh.init_hah.fina": "\uFC2E", "/feh.init_hah.medi": "\uFCBF", "/feh.init_jeem.fina": "\uFC2D", "/feh.init_jeem.medi": "\uFCBE", "/feh.init_khah.fina": "\uFC2F", "/feh.init_khah.medi": "\uFCC0", "/feh.init_khah.medi_meem.medi": "\uFD7D", "/feh.init_meem.fina": "\uFC30", "/feh.init_meem.medi": "\uFCC1", "/feh.init_yeh.fina": "\uFC32", "/feh.isol": "\uFED1", "/feh.medi": "\uFED4", "/feh.medi_alefmaksura.fina": "\uFC7C", "/feh.medi_khah.medi_meem.fina": "\uFD7C", "/feh.medi_meem.medi_yeh.fina": "\uFDC1", "/feh.medi_yeh.fina": "\uFC7D", "/fehThreeDotsUpBelow": "\u0761", 
"/fehTwoDotsBelow": "\u0760", "/feharabic": "\u0641", "/feharmenian": "\u0586", "/fehdotbelow": "\u06A3", "/fehdotbelowright": "\u06A2", "/fehfinalarabic": "\uFED2", "/fehinitialarabic": "\uFED3", "/fehmedialarabic": "\uFED4", "/fehthreedotsbelow": "\u06A5", "/feicoptic": "\u03E5", "/female": "\u2640", "/femaleideographiccircled": "\u329B", "/feng": "\u02A9", "/ferrisWheel": "\u1F3A1", "/ferry": "\u26F4", "/festivalideographicparen": "\u3240", "/ff": "\uFB00", "/ffi": "\uFB03", "/ffl": "\uFB04", "/fhook": "\u0192", "/fi": "\uFB01", # ligature "fi" "/fieldHockeyStickAndBall": "\u1F3D1", "/fifteencircle": "\u246E", "/fifteencircleblack": "\u24EF", "/fifteenparen": "\u2482", "/fifteenparenthesized": "\u2482", "/fifteenperiod": "\u2496", "/fifty.roman": "\u216C", "/fifty.romansmall": "\u217C", "/fiftycircle": "\u32BF", "/fiftycirclesquare": "\u324C", "/fiftyearlyform.roman": "\u2186", "/fiftythousand.roman": "\u2187", "/figuredash": "\u2012", "/figurespace": "\u2007", "/fileCabinet": "\u1F5C4", "/fileFolder": "\u1F4C1", "/filledbox": "\u25A0", "/filledrect": "\u25AC", "/filledstopabove": "\u06EC", "/filmFrames": "\u1F39E", "/filmProjector": "\u1F4FD", "/finalkaf": "\u05DA", "/finalkaf:hb": "\u05DA", "/finalkafdagesh": "\uFB3A", "/finalkafdageshhebrew": "\uFB3A", "/finalkafhebrew": "\u05DA", "/finalkafqamats": "\u05DA", "/finalkafqamatshebrew": "\u05DA", "/finalkafsheva": "\u05DA", "/finalkafshevahebrew": "\u05DA", "/finalkafwithdagesh:hb": "\uFB3A", "/finalmem": "\u05DD", "/finalmem:hb": "\u05DD", "/finalmemhebrew": "\u05DD", "/finalmemwide:hb": "\uFB26", "/finalnun": "\u05DF", "/finalnun:hb": "\u05DF", "/finalnunhebrew": "\u05DF", "/finalpe": "\u05E3", "/finalpe:hb": "\u05E3", "/finalpehebrew": "\u05E3", "/finalpewithdagesh:hb": "\uFB43", "/finalsigma": "\u03C2", "/finaltsadi": "\u05E5", "/finaltsadi:hb": "\u05E5", "/finaltsadihebrew": "\u05E5", "/financialideographiccircled": "\u3296", "/financialideographicparen": "\u3236", "/finsular": "\uA77C", "/fire": "\u1F525", 
"/fireEngine": "\u1F692", "/fireideographiccircled": "\u328B", "/fireideographicparen": "\u322B", "/fireworkSparkler": "\u1F387", "/fireworks": "\u1F386", "/firstQuarterMoon": "\u1F313", "/firstQuarterMoonFace": "\u1F31B", "/firstquartermoon": "\u263D", "/firststrongisolate": "\u2068", "/firsttonechinese": "\u02C9", "/fish": "\u1F41F", "/fishCakeSwirlDesign": "\u1F365", "/fisheye": "\u25C9", "/fishingPoleAndFish": "\u1F3A3", "/fistedHandSign": "\u1F44A", "/fitacyr": "\u0473", "/fitacyrillic": "\u0473", "/five": "\u0035", "/five.inferior": "\u2085", "/five.roman": "\u2164", "/five.romansmall": "\u2174", "/five.superior": "\u2075", "/fivearabic": "\u0665", "/fivebengali": "\u09EB", "/fivecircle": "\u2464", "/fivecircledbl": "\u24F9", "/fivecircleinversesansserif": "\u278E", "/fivecomma": "\u1F106", "/fivedeva": "\u096B", "/fivedot": "\u2E2D", "/fivedotpunctuation": "\u2059", "/fiveeighths": "\u215D", "/fivefar": "\u06F5", "/fivegujarati": "\u0AEB", "/fivegurmukhi": "\u0A6B", "/fivehackarabic": "\u0665", "/fivehangzhou": "\u3025", "/fivehundred.roman": "\u216E", "/fivehundred.romansmall": "\u217E", "/fiveideographiccircled": "\u3284", "/fiveideographicparen": "\u3224", "/fiveinferior": "\u2085", "/fivemonospace": "\uFF15", "/fiveoldstyle": "\uF735", "/fiveparen": "\u2478", "/fiveparenthesized": "\u2478", "/fiveperiod": "\u248C", "/fivepersian": "\u06F5", "/fivepointedstar": "\u066D", "/fivepointonesquare": "\u1F1A0", "/fiveroman": "\u2174", "/fivesixths": "\u215A", "/fivesuperior": "\u2075", "/fivethai": "\u0E55", "/fivethousand.roman": "\u2181", "/fl": "\uFB02", "/flagblack": "\u2691", "/flaghorizontalmiddlestripeblackwhite": "\u26FF", "/flaginhole": "\u26F3", "/flagwhite": "\u2690", "/flatness": "\u23E5", "/fleurdelis": "\u269C", "/flexedBiceps": "\u1F4AA", "/floorleft": "\u230A", "/floorright": "\u230B", "/floppyDisk": "\u1F4BE", "/floralheartbulletreversedrotated": "\u2619", "/florin": "\u0192", "/flower": "\u2698", "/flowerPlayingCards": "\u1F3B4", 
"/flowerpunctuationmark": "\u2055", "/flushedFace": "\u1F633", "/flyingEnvelope": "\u1F585", "/flyingSaucer": "\u1F6F8", "/fmfullwidth": "\u3399", "/fmonospace": "\uFF46", "/fmsquare": "\u3399", "/fofanthai": "\u0E1F", "/fofathai": "\u0E1D", "/fog": "\u1F32B", "/foggy": "\u1F301", "/folder": "\u1F5C0", "/fongmanthai": "\u0E4F", "/footnote": "\u0602", "/footprints": "\u1F463", "/footsquare": "\u23CD", "/forall": "\u2200", "/forces": "\u22A9", "/fork": "\u2442", "/forkKnife": "\u1F374", "/forkKnifePlate": "\u1F37D", "/forsamaritan": "\u214F", "/fortycircle": "\u32B5", "/fortycirclesquare": "\u324B", "/fortyeightcircle": "\u32BD", "/fortyfivecircle": "\u32BA", "/fortyfourcircle": "\u32B9", "/fortyninecircle": "\u32BE", "/fortyonecircle": "\u32B6", "/fortysevencircle": "\u32BC", "/fortysixcircle": "\u32BB", "/fortythreecircle": "\u32B8", "/fortytwocircle": "\u32B7", "/fountain": "\u26F2", "/four": "\u0034", "/four.inferior": "\u2084", "/four.roman": "\u2163", "/four.romansmall": "\u2173", "/four.superior": "\u2074", "/fourLeafClover": "\u1F340", "/fourarabic": "\u0664", "/fourbengali": "\u09EA", "/fourcircle": "\u2463", "/fourcircledbl": "\u24F8", "/fourcircleinversesansserif": "\u278D", "/fourcomma": "\u1F105", "/fourdeva": "\u096A", "/fourdotmark": "\u205B", "/fourdotpunctuation": "\u2058", "/fourfar": "\u06F4", "/fourfifths": "\u2158", "/fourgujarati": "\u0AEA", "/fourgurmukhi": "\u0A6A", "/fourhackarabic": "\u0664", "/fourhangzhou": "\u3024", "/fourideographiccircled": "\u3283", "/fourideographicparen": "\u3223", "/fourinferior": "\u2084", "/fourksquare": "\u1F19E", "/fourmonospace": "\uFF14", "/fournumeratorbengali": "\u09F7", "/fouroldstyle": "\uF734", "/fourparen": "\u2477", "/fourparenthesized": "\u2477", "/fourperemspace": "\u2005", "/fourperiod": "\u248B", "/fourpersian": "\u06F4", "/fourroman": "\u2173", "/foursuperior": "\u2074", "/fourteencircle": "\u246D", "/fourteencircleblack": "\u24EE", "/fourteenparen": "\u2481", "/fourteenparenthesized": "\u2481", 
"/fourteenperiod": "\u2495", "/fourthai": "\u0E54", "/fourthtonechinese": "\u02CB", "/fparen": "\u24A1", "/fparenthesized": "\u24A1", "/fraction": "\u2044", "/frameAnX": "\u1F5BE", "/framePicture": "\u1F5BC", "/frameTiles": "\u1F5BD", "/franc": "\u20A3", "/freesquare": "\u1F193", "/frenchFries": "\u1F35F", "/freversedepigraphic": "\uA7FB", "/friedShrimp": "\u1F364", "/frogFace": "\u1F438", "/front-facingBabyChick": "\u1F425", "/frown": "\u2322", "/frowningFaceWithOpenMouth": "\u1F626", "/frowningfacewhite": "\u2639", "/fstroke": "\uA799", "/fturned": "\u214E", "/fuelpump": "\u26FD", "/fullBlock": "\u2588", "/fullMoon": "\u1F315", "/fullMoonFace": "\u1F31D", "/functionapplication": "\u2061", "/funeralurn": "\u26B1", "/fuse": "\u23DB", "/fwd:A": "\uFF21", "/fwd:B": "\uFF22", "/fwd:C": "\uFF23", "/fwd:D": "\uFF24", "/fwd:E": "\uFF25", "/fwd:F": "\uFF26", "/fwd:G": "\uFF27", "/fwd:H": "\uFF28", "/fwd:I": "\uFF29", "/fwd:J": "\uFF2A", "/fwd:K": "\uFF2B", "/fwd:L": "\uFF2C", "/fwd:M": "\uFF2D", "/fwd:N": "\uFF2E", "/fwd:O": "\uFF2F", "/fwd:P": "\uFF30", "/fwd:Q": "\uFF31", "/fwd:R": "\uFF32", "/fwd:S": "\uFF33", "/fwd:T": "\uFF34", "/fwd:U": "\uFF35", "/fwd:V": "\uFF36", "/fwd:W": "\uFF37", "/fwd:X": "\uFF38", "/fwd:Y": "\uFF39", "/fwd:Z": "\uFF3A", "/fwd:a": "\uFF41", "/fwd:ampersand": "\uFF06", "/fwd:asciicircum": "\uFF3E", "/fwd:asciitilde": "\uFF5E", "/fwd:asterisk": "\uFF0A", "/fwd:at": "\uFF20", "/fwd:b": "\uFF42", "/fwd:backslash": "\uFF3C", "/fwd:bar": "\uFF5C", "/fwd:braceleft": "\uFF5B", "/fwd:braceright": "\uFF5D", "/fwd:bracketleft": "\uFF3B", "/fwd:bracketright": "\uFF3D", "/fwd:brokenbar": "\uFFE4", "/fwd:c": "\uFF43", "/fwd:centsign": "\uFFE0", "/fwd:colon": "\uFF1A", "/fwd:comma": "\uFF0C", "/fwd:d": "\uFF44", "/fwd:dollar": "\uFF04", "/fwd:e": "\uFF45", "/fwd:eight": "\uFF18", "/fwd:equal": "\uFF1D", "/fwd:exclam": "\uFF01", "/fwd:f": "\uFF46", "/fwd:five": "\uFF15", "/fwd:four": "\uFF14", "/fwd:g": "\uFF47", "/fwd:grave": "\uFF40", "/fwd:greater": 
"\uFF1E", "/fwd:h": "\uFF48", "/fwd:hyphen": "\uFF0D", "/fwd:i": "\uFF49", "/fwd:j": "\uFF4A", "/fwd:k": "\uFF4B", "/fwd:l": "\uFF4C", "/fwd:leftwhiteparenthesis": "\uFF5F", "/fwd:less": "\uFF1C", "/fwd:m": "\uFF4D", "/fwd:macron": "\uFFE3", "/fwd:n": "\uFF4E", "/fwd:nine": "\uFF19", "/fwd:notsign": "\uFFE2", "/fwd:numbersign": "\uFF03", "/fwd:o": "\uFF4F", "/fwd:one": "\uFF11", "/fwd:p": "\uFF50", "/fwd:parenthesisleft": "\uFF08", "/fwd:parenthesisright": "\uFF09", "/fwd:percent": "\uFF05", "/fwd:period": "\uFF0E", "/fwd:plus": "\uFF0B", "/fwd:poundsign": "\uFFE1", "/fwd:q": "\uFF51", "/fwd:question": "\uFF1F", "/fwd:quotedbl": "\uFF02", "/fwd:quotesingle": "\uFF07", "/fwd:r": "\uFF52", "/fwd:rightwhiteparenthesis": "\uFF60", "/fwd:s": "\uFF53", "/fwd:semicolon": "\uFF1B", "/fwd:seven": "\uFF17", "/fwd:six": "\uFF16", "/fwd:slash": "\uFF0F", "/fwd:t": "\uFF54", "/fwd:three": "\uFF13", "/fwd:two": "\uFF12", "/fwd:u": "\uFF55", "/fwd:underscore": "\uFF3F", "/fwd:v": "\uFF56", "/fwd:w": "\uFF57", "/fwd:wonsign": "\uFFE6", "/fwd:x": "\uFF58", "/fwd:y": "\uFF59", "/fwd:yensign": "\uFFE5", "/fwd:z": "\uFF5A", "/fwd:zero": "\uFF10", "/g": "\u0067", "/gabengali": "\u0997", "/gacute": "\u01F5", "/gadeva": "\u0917", "/gaf": "\u06AF", "/gaf.fina": "\uFB93", "/gaf.init": "\uFB94", "/gaf.isol": "\uFB92", "/gaf.medi": "\uFB95", "/gafarabic": "\u06AF", "/gaffinalarabic": "\uFB93", "/gafinitialarabic": "\uFB94", "/gafmedialarabic": "\uFB95", "/gafring": "\u06B0", "/gafthreedotsabove": "\u06B4", "/gaftwodotsbelow": "\u06B2", "/gagujarati": "\u0A97", "/gagurmukhi": "\u0A17", "/gahiragana": "\u304C", "/gakatakana": "\u30AC", "/galsquare": "\u33FF", "/gameDie": "\u1F3B2", "/gamma": "\u03B3", "/gammadblstruck": "\u213D", "/gammalatinsmall": "\u0263", "/gammasuperior": "\u02E0", "/gammasupmod": "\u02E0", "/gamurda": "\uA993", "/gangiacoptic": "\u03EB", "/ganmasquare": "\u330F", "/garonsquare": "\u330E", "/gbfullwidth": "\u3387", "/gbopomofo": "\u310D", "/gbreve": "\u011F", "/gcaron": 
"\u01E7", "/gcedilla": "\u0123", "/gcircle": "\u24D6", "/gcircumflex": "\u011D", "/gcommaaccent": "\u0123", "/gdot": "\u0121", "/gdotaccent": "\u0121", "/gear": "\u2699", "/gearhles": "\u26EE", "/gearouthub": "\u26ED", "/gecyr": "\u0433", "/gecyrillic": "\u0433", "/gehiragana": "\u3052", "/gehookcyr": "\u0495", "/gehookstrokecyr": "\u04FB", "/gekatakana": "\u30B2", "/gemStone": "\u1F48E", "/gemini": "\u264A", "/geometricallyequal": "\u2251", "/geometricallyequivalent": "\u224E", "/geometricproportion": "\u223A", "/geresh:hb": "\u05F3", "/gereshMuqdam:hb": "\u059D", "/gereshaccenthebrew": "\u059C", "/gereshhebrew": "\u05F3", "/gereshmuqdamhebrew": "\u059D", "/germandbls": "\u00DF", "/germanpenny": "\u20B0", "/gershayim:hb": "\u05F4", "/gershayimaccenthebrew": "\u059E", "/gershayimhebrew": "\u05F4", "/gestrokecyr": "\u0493", "/getailcyr": "\u04F7", "/getamark": "\u3013", "/geupcyr": "\u0491", "/ghabengali": "\u0998", "/ghadarmenian": "\u0572", "/ghadeva": "\u0918", "/ghagujarati": "\u0A98", "/ghagurmukhi": "\u0A18", "/ghain": "\u063A", "/ghain.fina": "\uFECE", "/ghain.init": "\uFECF", "/ghain.init_alefmaksura.fina": "\uFCF9", "/ghain.init_jeem.fina": "\uFC2B", "/ghain.init_jeem.medi": "\uFCBC", "/ghain.init_meem.fina": "\uFC2C", "/ghain.init_meem.medi": "\uFCBD", "/ghain.init_yeh.fina": "\uFCFA", "/ghain.isol": "\uFECD", "/ghain.medi": "\uFED0", "/ghain.medi_alefmaksura.fina": "\uFD15", "/ghain.medi_meem.medi_alefmaksura.fina": "\uFD7B", "/ghain.medi_meem.medi_meem.fina": "\uFD79", "/ghain.medi_meem.medi_yeh.fina": "\uFD7A", "/ghain.medi_yeh.fina": "\uFD16", "/ghainarabic": "\u063A", "/ghaindotbelow": "\u06FC", "/ghainfinalarabic": "\uFECE", "/ghaininitialarabic": "\uFECF", "/ghainmedialarabic": "\uFED0", "/ghemiddlehookcyrillic": "\u0495", "/ghestrokecyrillic": "\u0493", "/gheupturncyrillic": "\u0491", "/ghhadeva": "\u095A", "/ghhagurmukhi": "\u0A5A", "/ghook": "\u0260", "/ghost": "\u1F47B", "/ghzfullwidth": "\u3393", "/ghzsquare": "\u3393", "/gigasquare": "\u3310", 
"/gihiragana": "\u304E", "/gikatakana": "\u30AE", "/gimarmenian": "\u0563", "/gimel": "\u05D2", "/gimel:hb": "\u05D2", "/gimeldagesh": "\uFB32", "/gimeldageshhebrew": "\uFB32", "/gimelhebrew": "\u05D2", "/gimelwithdagesh:hb": "\uFB32", "/giniisquare": "\u3311", "/ginsularturned": "\uA77F", "/girl": "\u1F467", "/girls": "\u1F6CA", "/girudaasquare": "\u3313", "/gjecyr": "\u0453", "/gjecyrillic": "\u0453", "/globeMeridians": "\u1F310", "/glottalinvertedstroke": "\u01BE", "/glottalstop": "\u0294", "/glottalstopinverted": "\u0296", "/glottalstopmod": "\u02C0", "/glottalstopreversed": "\u0295", "/glottalstopreversedmod": "\u02C1", "/glottalstopreversedsuperior": "\u02E4", "/glottalstopstroke": "\u02A1", "/glottalstopstrokereversed": "\u02A2", "/glottalstopsupreversedmod": "\u02E4", "/glowingStar": "\u1F31F", "/gmacron": "\u1E21", "/gmonospace": "\uFF47", "/gmtr:diamondblack": "\u25C6", "/gmtr:diamondwhite": "\u25C7", "/gnrl:hyphen": "\u2010", "/goat": "\u1F410", "/gobliquestroke": "\uA7A1", "/gohiragana": "\u3054", "/gokatakana": "\u30B4", "/golfer": "\u1F3CC", "/gpafullwidth": "\u33AC", "/gparen": "\u24A2", "/gparenthesized": "\u24A2", "/gpasquare": "\u33AC", "/gr:acute": "\u1FFD", "/gr:grave": "\u1FEF", "/gr:question": "\u037E", "/gr:tilde": "\u1FC0", "/gradient": "\u2207", "/graduationCap": "\u1F393", "/grapes": "\u1F347", "/grave": "\u0060", "/gravebelowcmb": "\u0316", "/gravecmb": "\u0300", "/gravecomb": "\u0300", "/gravedblmiddlemod": "\u02F5", "/gravedeva": "\u0953", "/gravelowmod": "\u02CE", "/gravemiddlemod": "\u02F4", "/gravemod": "\u02CB", "/gravemonospace": "\uFF40", "/gravetonecmb": "\u0340", "/greater": "\u003E", "/greaterbutnotequal": "\u2269", "/greaterbutnotequivalent": "\u22E7", "/greaterdot": "\u22D7", "/greaterequal": "\u2265", "/greaterequalorless": "\u22DB", "/greatermonospace": "\uFF1E", "/greaterorequivalent": "\u2273", "/greaterorless": "\u2277", "/greateroverequal": "\u2267", "/greatersmall": "\uFE65", "/greenApple": "\u1F34F", "/greenBook": 
"\u1F4D7", "/greenHeart": "\u1F49A", "/grimacingFace": "\u1F62C", "/grinningCatFaceWithSmilingEyes": "\u1F638", "/grinningFace": "\u1F600", "/grinningFaceWithSmilingEyes": "\u1F601", "/growingHeart": "\u1F497", "/gscript": "\u0261", "/gstroke": "\u01E5", "/guarani": "\u20B2", "/guardsman": "\u1F482", "/gueh": "\u06B3", "/gueh.fina": "\uFB97", "/gueh.init": "\uFB98", "/gueh.isol": "\uFB96", "/gueh.medi": "\uFB99", "/guhiragana": "\u3050", "/guillemetleft": "\u00AB", "/guillemetright": "\u00BB", "/guillemotleft": "\u00AB", "/guillemotright": "\u00BB", "/guilsinglleft": "\u2039", "/guilsinglright": "\u203A", "/guitar": "\u1F3B8", "/gujr:a": "\u0A85", "/gujr:aa": "\u0A86", "/gujr:aasign": "\u0ABE", "/gujr:abbreviation": "\u0AF0", "/gujr:ai": "\u0A90", "/gujr:aisign": "\u0AC8", "/gujr:anusvara": "\u0A82", "/gujr:au": "\u0A94", "/gujr:ausign": "\u0ACC", "/gujr:avagraha": "\u0ABD", "/gujr:ba": "\u0AAC", "/gujr:bha": "\u0AAD", "/gujr:binducandra": "\u0A81", "/gujr:ca": "\u0A9A", "/gujr:cha": "\u0A9B", "/gujr:circlenuktaabove": "\u0AFE", "/gujr:da": "\u0AA6", "/gujr:dda": "\u0AA1", "/gujr:ddha": "\u0AA2", "/gujr:dha": "\u0AA7", "/gujr:e": "\u0A8F", "/gujr:ecandra": "\u0A8D", "/gujr:eight": "\u0AEE", "/gujr:esign": "\u0AC7", "/gujr:esigncandra": "\u0AC5", "/gujr:five": "\u0AEB", "/gujr:four": "\u0AEA", "/gujr:ga": "\u0A97", "/gujr:gha": "\u0A98", "/gujr:ha": "\u0AB9", "/gujr:i": "\u0A87", "/gujr:ii": "\u0A88", "/gujr:iisign": "\u0AC0", "/gujr:isign": "\u0ABF", "/gujr:ja": "\u0A9C", "/gujr:jha": "\u0A9D", "/gujr:ka": "\u0A95", "/gujr:kha": "\u0A96", "/gujr:la": "\u0AB2", "/gujr:lla": "\u0AB3", "/gujr:llvocal": "\u0AE1", "/gujr:llvocalsign": "\u0AE3", "/gujr:lvocal": "\u0A8C", "/gujr:lvocalsign": "\u0AE2", "/gujr:ma": "\u0AAE", "/gujr:maddah": "\u0AFC", "/gujr:na": "\u0AA8", "/gujr:nga": "\u0A99", "/gujr:nine": "\u0AEF", "/gujr:nna": "\u0AA3", "/gujr:nukta": "\u0ABC", "/gujr:nya": "\u0A9E", "/gujr:o": "\u0A93", "/gujr:ocandra": "\u0A91", "/gujr:om": "\u0AD0", "/gujr:one": 
"\u0AE7", "/gujr:osign": "\u0ACB", "/gujr:osigncandra": "\u0AC9", "/gujr:pa": "\u0AAA", "/gujr:pha": "\u0AAB", "/gujr:ra": "\u0AB0", "/gujr:rrvocal": "\u0AE0", "/gujr:rrvocalsign": "\u0AC4", "/gujr:rupee": "\u0AF1", "/gujr:rvocal": "\u0A8B", "/gujr:rvocalsign": "\u0AC3", "/gujr:sa": "\u0AB8", "/gujr:seven": "\u0AED", "/gujr:sha": "\u0AB6", "/gujr:shadda": "\u0AFB", "/gujr:six": "\u0AEC", "/gujr:ssa": "\u0AB7", "/gujr:sukun": "\u0AFA", "/gujr:ta": "\u0AA4", "/gujr:tha": "\u0AA5", "/gujr:three": "\u0AE9", "/gujr:three-dotnuktaabove": "\u0AFD", "/gujr:tta": "\u0A9F", "/gujr:ttha": "\u0AA0", "/gujr:two": "\u0AE8", "/gujr:two-circlenuktaabove": "\u0AFF", "/gujr:u": "\u0A89", "/gujr:usign": "\u0AC1", "/gujr:uu": "\u0A8A", "/gujr:uusign": "\u0AC2", "/gujr:va": "\u0AB5", "/gujr:virama": "\u0ACD", "/gujr:visarga": "\u0A83", "/gujr:ya": "\u0AAF", "/gujr:zero": "\u0AE6", "/gujr:zha": "\u0AF9", "/gukatakana": "\u30B0", "/guramusquare": "\u3318", "/guramutonsquare": "\u3319", "/guru:a": "\u0A05", "/guru:aa": "\u0A06", "/guru:aasign": "\u0A3E", "/guru:adakbindisign": "\u0A01", "/guru:addak": "\u0A71", "/guru:ai": "\u0A10", "/guru:aisign": "\u0A48", "/guru:au": "\u0A14", "/guru:ausign": "\u0A4C", "/guru:ba": "\u0A2C", "/guru:bha": "\u0A2D", "/guru:bindisign": "\u0A02", "/guru:ca": "\u0A1A", "/guru:cha": "\u0A1B", "/guru:da": "\u0A26", "/guru:dda": "\u0A21", "/guru:ddha": "\u0A22", "/guru:dha": "\u0A27", "/guru:ee": "\u0A0F", "/guru:eesign": "\u0A47", "/guru:eight": "\u0A6E", "/guru:ekonkar": "\u0A74", "/guru:fa": "\u0A5E", "/guru:five": "\u0A6B", "/guru:four": "\u0A6A", "/guru:ga": "\u0A17", "/guru:gha": "\u0A18", "/guru:ghha": "\u0A5A", "/guru:ha": "\u0A39", "/guru:i": "\u0A07", "/guru:ii": "\u0A08", "/guru:iisign": "\u0A40", "/guru:iri": "\u0A72", "/guru:isign": "\u0A3F", "/guru:ja": "\u0A1C", "/guru:jha": "\u0A1D", "/guru:ka": "\u0A15", "/guru:kha": "\u0A16", "/guru:khha": "\u0A59", "/guru:la": "\u0A32", "/guru:lla": "\u0A33", "/guru:ma": "\u0A2E", "/guru:na": "\u0A28", 
"/guru:nga": "\u0A19", "/guru:nine": "\u0A6F", "/guru:nna": "\u0A23", "/guru:nukta": "\u0A3C", "/guru:nya": "\u0A1E", "/guru:one": "\u0A67", "/guru:oo": "\u0A13", "/guru:oosign": "\u0A4B", "/guru:pa": "\u0A2A", "/guru:pha": "\u0A2B", "/guru:ra": "\u0A30", "/guru:rra": "\u0A5C", "/guru:sa": "\u0A38", "/guru:seven": "\u0A6D", "/guru:sha": "\u0A36", "/guru:six": "\u0A6C", "/guru:ta": "\u0A24", "/guru:tha": "\u0A25", "/guru:three": "\u0A69", "/guru:tippi": "\u0A70", "/guru:tta": "\u0A1F", "/guru:ttha": "\u0A20", "/guru:two": "\u0A68", "/guru:u": "\u0A09", "/guru:udaatsign": "\u0A51", "/guru:ura": "\u0A73", "/guru:usign": "\u0A41", "/guru:uu": "\u0A0A", "/guru:uusign": "\u0A42", "/guru:va": "\u0A35", "/guru:virama": "\u0A4D", "/guru:visarga": "\u0A03", "/guru:ya": "\u0A2F", "/guru:yakashsign": "\u0A75", "/guru:za": "\u0A5B", "/guru:zero": "\u0A66", "/gyfullwidth": "\u33C9", "/gysquare": "\u33C9", "/h": "\u0068", "/h.inferior": "\u2095", "/haabkhasiancyrillic": "\u04A9", "/haabkhcyr": "\u04A9", "/haaltonearabic": "\u06C1", "/habengali": "\u09B9", "/hacirclekatakana": "\u32E9", "/hacyr": "\u0445", "/hadescendercyrillic": "\u04B3", "/hadeva": "\u0939", "/hafullwidth": "\u33CA", "/hagujarati": "\u0AB9", "/hagurmukhi": "\u0A39", "/hah": "\u062D", "/hah.fina": "\uFEA2", "/hah.init": "\uFEA3", "/hah.init_alefmaksura.fina": "\uFCFF", "/hah.init_jeem.fina": "\uFC17", "/hah.init_jeem.medi": "\uFCA9", "/hah.init_meem.fina": "\uFC18", "/hah.init_meem.medi": "\uFCAA", "/hah.init_yeh.fina": "\uFD00", "/hah.isol": "\uFEA1", "/hah.medi": "\uFEA4", "/hah.medi_alefmaksura.fina": "\uFD1B", "/hah.medi_jeem.medi_yeh.fina": "\uFDBF", "/hah.medi_meem.medi_alefmaksura.fina": "\uFD5B", "/hah.medi_meem.medi_yeh.fina": "\uFD5A", "/hah.medi_yeh.fina": "\uFD1C", "/hahDigitFourBelow": "\u077C", "/hahSmallTahAbove": "\u0772", "/hahSmallTahBelow": "\u076E", "/hahSmallTahTwoDots": "\u076F", "/hahThreeDotsUpBelow": "\u0758", "/hahTwoDotsAbove": "\u0757", "/haharabic": "\u062D", "/hahfinalarabic": 
"\uFEA2", "/hahhamza": "\u0681", "/hahinitialarabic": "\uFEA3", "/hahiragana": "\u306F", "/hahmedialarabic": "\uFEA4", "/hahookcyr": "\u04FD", "/hahthreedotsabove": "\u0685", "/hahtwodotsvertical": "\u0682", "/haircut": "\u1F487", "/hairspace": "\u200A", "/haitusquare": "\u332A", "/hakatakana": "\u30CF", "/hakatakanahalfwidth": "\uFF8A", "/halantgurmukhi": "\u0A4D", "/halfcircleleftblack": "\u25D6", "/halfcirclerightblack": "\u25D7", "/hamburger": "\u1F354", "/hammer": "\u1F528", "/hammerAndWrench": "\u1F6E0", "/hammerpick": "\u2692", "/hammersickle": "\u262D", "/hamsterFace": "\u1F439", "/hamza": "\u0621", "/hamzaIsol": "\uFE80", "/hamzaabove": "\u0654", "/hamzaarabic": "\u0621", "/hamzabelow": "\u0655", "/hamzadammaarabic": "\u0621", "/hamzadammatanarabic": "\u0621", "/hamzafathaarabic": "\u0621", "/hamzafathatanarabic": "\u0621", "/hamzalowarabic": "\u0621", "/hamzalowkasraarabic": "\u0621", "/hamzalowkasratanarabic": "\u0621", "/hamzasukunarabic": "\u0621", "/handbag": "\u1F45C", "/handtailfishhookturned": "\u02AF", "/hangulchieuchaparen": "\u3217", "/hangulchieuchparen": "\u3209", "/hangulcieucaparen": "\u3216", "/hangulcieucparen": "\u3208", "/hangulcieucuparen": "\u321C", "/hanguldottonemarkdbl": "\u302F", "/hangulfiller": "\u3164", "/hangulhieuhaparen": "\u321B", "/hangulhieuhparen": "\u320D", "/hangulieungaparen": "\u3215", "/hangulieungparen": "\u3207", "/hangulkhieukhaparen": "\u3218", "/hangulkhieukhparen": "\u320A", "/hangulkiyeokaparen": "\u320E", "/hangulkiyeokparen": "\u3200", "/hangulmieumaparen": "\u3212", "/hangulmieumparen": "\u3204", "/hangulnieunaparen": "\u320F", "/hangulnieunparen": "\u3201", "/hangulphieuphaparen": "\u321A", "/hangulphieuphparen": "\u320C", "/hangulpieupaparen": "\u3213", "/hangulpieupparen": "\u3205", "/hangulrieulaparen": "\u3211", "/hangulrieulparen": "\u3203", "/hangulsingledottonemark": "\u302E", "/hangulsiosaparen": "\u3214", "/hangulsiosparen": "\u3206", "/hangulthieuthaparen": "\u3219", "/hangulthieuthparen": 
"\u320B", "/hangultikeutaparen": "\u3210", "/hangultikeutparen": "\u3202", "/happyPersonRaisingOneHand": "\u1F64B", "/hardDisk": "\u1F5B4", "/hardcyr": "\u044A", "/hardsigncyrillic": "\u044A", "/harpoondownbarbleft": "\u21C3", "/harpoondownbarbright": "\u21C2", "/harpoonleftbarbdown": "\u21BD", "/harpoonleftbarbup": "\u21BC", "/harpoonrightbarbdown": "\u21C1", "/harpoonrightbarbup": "\u21C0", "/harpoonupbarbleft": "\u21BF", "/harpoonupbarbright": "\u21BE", "/hasquare": "\u33CA", "/hastrokecyr": "\u04FF", "/hatafPatah:hb": "\u05B2", "/hatafQamats:hb": "\u05B3", "/hatafSegol:hb": "\u05B1", "/hatafpatah": "\u05B2", "/hatafpatah16": "\u05B2", "/hatafpatah23": "\u05B2", "/hatafpatah2f": "\u05B2", "/hatafpatahhebrew": "\u05B2", "/hatafpatahnarrowhebrew": "\u05B2", "/hatafpatahquarterhebrew": "\u05B2", "/hatafpatahwidehebrew": "\u05B2", "/hatafqamats": "\u05B3", "/hatafqamats1b": "\u05B3", "/hatafqamats28": "\u05B3", "/hatafqamats34": "\u05B3", "/hatafqamatshebrew": "\u05B3", "/hatafqamatsnarrowhebrew": "\u05B3", "/hatafqamatsquarterhebrew": "\u05B3", "/hatafqamatswidehebrew": "\u05B3", "/hatafsegol": "\u05B1", "/hatafsegol17": "\u05B1", "/hatafsegol24": "\u05B1", "/hatafsegol30": "\u05B1", "/hatafsegolhebrew": "\u05B1", "/hatafsegolnarrowhebrew": "\u05B1", "/hatafsegolquarterhebrew": "\u05B1", "/hatafsegolwidehebrew": "\u05B1", "/hatchingChick": "\u1F423", "/haveideographiccircled": "\u3292", "/haveideographicparen": "\u3232", "/hbar": "\u0127", "/hbopomofo": "\u310F", "/hbrevebelow": "\u1E2B", "/hcaron": "\u021F", "/hcedilla": "\u1E29", "/hcircle": "\u24D7", "/hcircumflex": "\u0125", "/hcsquare": "\u1F1A6", "/hdescender": "\u2C68", "/hdieresis": "\u1E27", "/hdot": "\u1E23", "/hdotaccent": "\u1E23", "/hdotbelow": "\u1E25", "/hdrsquare": "\u1F1A7", "/he": "\u05D4", "/he:hb": "\u05D4", "/headphone": "\u1F3A7", "/headstonegraveyard": "\u26FC", "/hearNoEvilMonkey": "\u1F649", "/heart": "\u2665", "/heartArrow": "\u1F498", "/heartDecoration": "\u1F49F", "/heartRibbon": 
"\u1F49D", "/heartTipOnTheLeft": "\u1F394", "/heartblack": "\u2665", "/heartsuitblack": "\u2665", "/heartsuitwhite": "\u2661", "/heartwhite": "\u2661", "/heavyDollarSign": "\u1F4B2", "/heavyLatinCross": "\u1F547", "/heavydbldashhorz": "\u254D", "/heavydbldashvert": "\u254F", "/heavydn": "\u257B", "/heavydnhorz": "\u2533", "/heavydnleft": "\u2513", "/heavydnright": "\u250F", "/heavyhorz": "\u2501", "/heavyleft": "\u2578", "/heavyleftlightright": "\u257E", "/heavyquaddashhorz": "\u2509", "/heavyquaddashvert": "\u250B", "/heavyright": "\u257A", "/heavytrpldashhorz": "\u2505", "/heavytrpldashvert": "\u2507", "/heavyup": "\u2579", "/heavyuphorz": "\u253B", "/heavyupleft": "\u251B", "/heavyuplightdn": "\u257F", "/heavyupright": "\u2517", "/heavyvert": "\u2503", "/heavyverthorz": "\u254B", "/heavyvertleft": "\u252B", "/heavyvertright": "\u2523", "/hecirclekatakana": "\u32EC", "/hedagesh": "\uFB34", "/hedageshhebrew": "\uFB34", "/hedinterlacedpentagramleft": "\u26E6", "/hedinterlacedpentagramright": "\u26E5", "/heh": "\u0647", "/heh.fina": "\uFEEA", "/heh.init": "\uFEEB", "/heh.init_alefmaksura.fina": "\uFC53", "/heh.init_jeem.fina": "\uFC51", "/heh.init_jeem.medi": "\uFCD7", "/heh.init_meem.fina": "\uFC52", "/heh.init_meem.medi": "\uFCD8", "/heh.init_meem.medi_jeem.medi": "\uFD93", "/heh.init_meem.medi_meem.medi": "\uFD94", "/heh.init_superscriptalef.medi": "\uFCD9", "/heh.init_yeh.fina": "\uFC54", "/heh.isol": "\uFEE9", "/heh.medi": "\uFEEC", "/hehaltonearabic": "\u06C1", "/heharabic": "\u0647", "/hehdoachashmee": "\u06BE", "/hehdoachashmee.fina": "\uFBAB", "/hehdoachashmee.init": "\uFBAC", "/hehdoachashmee.isol": "\uFBAA", "/hehdoachashmee.medi": "\uFBAD", "/hehebrew": "\u05D4", "/hehfinalaltonearabic": "\uFBA7", "/hehfinalalttwoarabic": "\uFEEA", "/hehfinalarabic": "\uFEEA", "/hehgoal": "\u06C1", "/hehgoal.fina": "\uFBA7", "/hehgoal.init": "\uFBA8", "/hehgoal.isol": "\uFBA6", "/hehgoal.medi": "\uFBA9", "/hehgoalhamza": "\u06C2", "/hehhamzaabovefinalarabic": "\uFBA5", 
"/hehhamzaaboveisolatedarabic": "\uFBA4", "/hehinitialaltonearabic": "\uFBA8", "/hehinitialarabic": "\uFEEB", "/hehinvertedV": "\u06FF", "/hehiragana": "\u3078", "/hehmedialaltonearabic": "\uFBA9", "/hehmedialarabic": "\uFEEC", "/hehyeh": "\u06C0", "/hehyeh.fina": "\uFBA5", "/hehyeh.isol": "\uFBA4", "/heiseierasquare": "\u337B", "/hekatakana": "\u30D8", "/hekatakanahalfwidth": "\uFF8D", "/hekutaarusquare": "\u3336", "/helicopter": "\u1F681", "/helm": "\u2388", "/helmetcrosswhite": "\u26D1", "/heng": "\uA727", "/henghook": "\u0267", "/herb": "\u1F33F", "/hermitianconjugatematrix": "\u22B9", "/herutusquare": "\u3339", "/het": "\u05D7", "/het:hb": "\u05D7", "/heta": "\u0371", "/hethebrew": "\u05D7", "/hewide:hb": "\uFB23", "/hewithmapiq:hb": "\uFB34", "/hfishhookturned": "\u02AE", "/hhalf": "\u2C76", "/hhook": "\u0266", "/hhooksuperior": "\u02B1", "/hhooksupmod": "\u02B1", "/hi-ressquare": "\u1F1A8", "/hibiscus": "\u1F33A", "/hicirclekatakana": "\u32EA", "/hieuhacirclekorean": "\u327B", "/hieuhaparenkorean": "\u321B", "/hieuhcirclekorean": "\u326D", "/hieuhkorean": "\u314E", "/hieuhparenkorean": "\u320D", "/high-heeledShoe": "\u1F460", "/highBrightness": "\u1F506", "/highSpeedTrain": "\u1F684", "/highSpeedTrainWithBulletNose": "\u1F685", "/highhamza": "\u0674", "/highideographiccircled": "\u32A4", "/highvoltage": "\u26A1", "/hihiragana": "\u3072", "/hikatakana": "\u30D2", "/hikatakanahalfwidth": "\uFF8B", "/hira:a": "\u3042", "/hira:asmall": "\u3041", "/hira:ba": "\u3070", "/hira:be": "\u3079", "/hira:bi": "\u3073", "/hira:bo": "\u307C", "/hira:bu": "\u3076", "/hira:da": "\u3060", "/hira:de": "\u3067", "/hira:di": "\u3062", "/hira:digraphyori": "\u309F", "/hira:do": "\u3069", "/hira:du": "\u3065", "/hira:e": "\u3048", "/hira:esmall": "\u3047", "/hira:ga": "\u304C", "/hira:ge": "\u3052", "/hira:gi": "\u304E", "/hira:go": "\u3054", "/hira:gu": "\u3050", "/hira:ha": "\u306F", "/hira:he": "\u3078", "/hira:hi": "\u3072", "/hira:ho": "\u307B", "/hira:hu": "\u3075", 
"/hira:i": "\u3044", "/hira:ismall": "\u3043", "/hira:iterationhiragana": "\u309D", "/hira:ka": "\u304B", "/hira:kasmall": "\u3095", "/hira:ke": "\u3051", "/hira:kesmall": "\u3096", "/hira:ki": "\u304D", "/hira:ko": "\u3053", "/hira:ku": "\u304F", "/hira:ma": "\u307E", "/hira:me": "\u3081", "/hira:mi": "\u307F", "/hira:mo": "\u3082", "/hira:mu": "\u3080", "/hira:n": "\u3093", "/hira:na": "\u306A", "/hira:ne": "\u306D", "/hira:ni": "\u306B", "/hira:no": "\u306E", "/hira:nu": "\u306C", "/hira:o": "\u304A", "/hira:osmall": "\u3049", "/hira:pa": "\u3071", "/hira:pe": "\u307A", "/hira:pi": "\u3074", "/hira:po": "\u307D", "/hira:pu": "\u3077", "/hira:ra": "\u3089", "/hira:re": "\u308C", "/hira:ri": "\u308A", "/hira:ro": "\u308D", "/hira:ru": "\u308B", "/hira:sa": "\u3055", "/hira:se": "\u305B", "/hira:semivoicedmarkkana": "\u309C", "/hira:semivoicedmarkkanacmb": "\u309A", "/hira:si": "\u3057", "/hira:so": "\u305D", "/hira:su": "\u3059", "/hira:ta": "\u305F", "/hira:te": "\u3066", "/hira:ti": "\u3061", "/hira:to": "\u3068", "/hira:tu": "\u3064", "/hira:tusmall": "\u3063", "/hira:u": "\u3046", "/hira:usmall": "\u3045", "/hira:voicediterationhiragana": "\u309E", "/hira:voicedmarkkana": "\u309B", "/hira:voicedmarkkanacmb": "\u3099", "/hira:vu": "\u3094", "/hira:wa": "\u308F", "/hira:wasmall": "\u308E", "/hira:we": "\u3091", "/hira:wi": "\u3090", "/hira:wo": "\u3092", "/hira:ya": "\u3084", "/hira:yasmall": "\u3083", "/hira:yo": "\u3088", "/hira:yosmall": "\u3087", "/hira:yu": "\u3086", "/hira:yusmall": "\u3085", "/hira:za": "\u3056", "/hira:ze": "\u305C", "/hira:zi": "\u3058", "/hira:zo": "\u305E", "/hira:zu": "\u305A", "/hiriq": "\u05B4", "/hiriq14": "\u05B4", "/hiriq21": "\u05B4", "/hiriq2d": "\u05B4", "/hiriq:hb": "\u05B4", "/hiriqhebrew": "\u05B4", "/hiriqnarrowhebrew": "\u05B4", "/hiriqquarterhebrew": "\u05B4", "/hiriqwidehebrew": "\u05B4", "/historicsite": "\u26EC", "/hlinebelow": "\u1E96", "/hmonospace": "\uFF48", "/hoarmenian": "\u0570", "/hocho": "\u1F52A", 
"/hocirclekatakana": "\u32ED", "/hohipthai": "\u0E2B", "/hohiragana": "\u307B", "/hokatakana": "\u30DB", "/hokatakanahalfwidth": "\uFF8E", "/holam": "\u05B9", "/holam19": "\u05B9", "/holam26": "\u05B9", "/holam32": "\u05B9", "/holam:hb": "\u05B9", "/holamHaser:hb": "\u05BA", "/holamhebrew": "\u05B9", "/holamnarrowhebrew": "\u05B9", "/holamquarterhebrew": "\u05B9", "/holamwidehebrew": "\u05B9", "/hole": "\u1F573", "/homotic": "\u223B", "/honeyPot": "\u1F36F", "/honeybee": "\u1F41D", "/honokhukthai": "\u0E2E", "/honsquare": "\u333F", "/hook": "\u2440", "/hookabovecomb": "\u0309", "/hookcmb": "\u0309", "/hookpalatalizedbelowcmb": "\u0321", "/hookretroflexbelowcmb": "\u0322", "/hoonsquare": "\u3342", "/hoorusquare": "\u3341", "/horicoptic": "\u03E9", "/horizontalTrafficLight": "\u1F6A5", "/horizontalbar": "\u2015", "/horizontalbarwhitearrowonpedestalup": "\u21EC", "/horizontalmalestroke": "\u26A9", "/horncmb": "\u031B", "/horse": "\u1F40E", "/horseFace": "\u1F434", "/horseRacing": "\u1F3C7", "/hospital": "\u1F3E5", "/hotDog": "\u1F32D", "/hotPepper": "\u1F336", "/hotbeverage": "\u2615", "/hotel": "\u1F3E8", "/hotsprings": "\u2668", "/hourglass": "\u231B", "/hourglassflowings": "\u23F3", "/house": "\u2302", "/houseBuilding": "\u1F3E0", "/houseBuildings": "\u1F3D8", "/houseGarden": "\u1F3E1", "/hpafullwidth": "\u3371", "/hpalatalhook": "\uA795", "/hparen": "\u24A3", "/hparenthesized": "\u24A3", "/hpfullwidth": "\u33CB", "/hryvnia": "\u20B4", "/hsuperior": "\u02B0", "/hsupmod": "\u02B0", "/hturned": "\u0265", "/htypeopencircuit": "\u238F", "/huaraddosquare": "\u3332", "/hucirclekatakana": "\u32EB", "/huhiragana": "\u3075", "/huiitosquare": "\u3333", "/hukatakana": "\u30D5", "/hukatakanahalfwidth": "\uFF8C", "/hundredPoints": "\u1F4AF", "/hundredthousandscmbcyr": "\u0488", "/hungarumlaut": "\u02DD", "/hungarumlautcmb": "\u030B", "/huransquare": "\u3335", "/hushedFace": "\u1F62F", "/hv": "\u0195", "/hwd:a": "\uFFC2", "/hwd:ae": "\uFFC3", "/hwd:blacksquare": "\uFFED", 
"/hwd:chieuch": "\uFFBA", "/hwd:cieuc": "\uFFB8", "/hwd:downwardsarrow": "\uFFEC", "/hwd:e": "\uFFC7", "/hwd:eo": "\uFFC6", "/hwd:eu": "\uFFDA", "/hwd:formslightvertical": "\uFFE8", "/hwd:hangulfiller": "\uFFA0", "/hwd:hieuh": "\uFFBE", "/hwd:i": "\uFFDC", "/hwd:ideographiccomma": "\uFF64", "/hwd:ideographicfullstop": "\uFF61", "/hwd:ieung": "\uFFB7", "/hwd:kata:a": "\uFF71", "/hwd:kata:asmall": "\uFF67", "/hwd:kata:e": "\uFF74", "/hwd:kata:esmall": "\uFF6A", "/hwd:kata:ha": "\uFF8A", "/hwd:kata:he": "\uFF8D", "/hwd:kata:hi": "\uFF8B", "/hwd:kata:ho": "\uFF8E", "/hwd:kata:hu": "\uFF8C", "/hwd:kata:i": "\uFF72", "/hwd:kata:ismall": "\uFF68", "/hwd:kata:ka": "\uFF76", "/hwd:kata:ke": "\uFF79", "/hwd:kata:ki": "\uFF77", "/hwd:kata:ko": "\uFF7A", "/hwd:kata:ku": "\uFF78", "/hwd:kata:ma": "\uFF8F", "/hwd:kata:me": "\uFF92", "/hwd:kata:mi": "\uFF90", "/hwd:kata:middledot": "\uFF65", "/hwd:kata:mo": "\uFF93", "/hwd:kata:mu": "\uFF91", "/hwd:kata:n": "\uFF9D", "/hwd:kata:na": "\uFF85", "/hwd:kata:ne": "\uFF88", "/hwd:kata:ni": "\uFF86", "/hwd:kata:no": "\uFF89", "/hwd:kata:nu": "\uFF87", "/hwd:kata:o": "\uFF75", "/hwd:kata:osmall": "\uFF6B", "/hwd:kata:prolongedkana": "\uFF70", "/hwd:kata:ra": "\uFF97", "/hwd:kata:re": "\uFF9A", "/hwd:kata:ri": "\uFF98", "/hwd:kata:ro": "\uFF9B", "/hwd:kata:ru": "\uFF99", "/hwd:kata:sa": "\uFF7B", "/hwd:kata:se": "\uFF7E", "/hwd:kata:semi-voiced": "\uFF9F", "/hwd:kata:si": "\uFF7C", "/hwd:kata:so": "\uFF7F", "/hwd:kata:su": "\uFF7D", "/hwd:kata:ta": "\uFF80", "/hwd:kata:te": "\uFF83", "/hwd:kata:ti": "\uFF81", "/hwd:kata:to": "\uFF84", "/hwd:kata:tu": "\uFF82", "/hwd:kata:tusmall": "\uFF6F", "/hwd:kata:u": "\uFF73", "/hwd:kata:usmall": "\uFF69", "/hwd:kata:voiced": "\uFF9E", "/hwd:kata:wa": "\uFF9C", "/hwd:kata:wo": "\uFF66", "/hwd:kata:ya": "\uFF94", "/hwd:kata:yasmall": "\uFF6C", "/hwd:kata:yo": "\uFF96", "/hwd:kata:yosmall": "\uFF6E", "/hwd:kata:yu": "\uFF95", "/hwd:kata:yusmall": "\uFF6D", "/hwd:khieukh": "\uFFBB", "/hwd:kiyeok": 
"\uFFA1", "/hwd:kiyeoksios": "\uFFA3", "/hwd:leftcornerbracket": "\uFF62", "/hwd:leftwardsarrow": "\uFFE9", "/hwd:mieum": "\uFFB1", "/hwd:nieun": "\uFFA4", "/hwd:nieuncieuc": "\uFFA5", "/hwd:nieunhieuh": "\uFFA6", "/hwd:o": "\uFFCC", "/hwd:oe": "\uFFCF", "/hwd:phieuph": "\uFFBD", "/hwd:pieup": "\uFFB2", "/hwd:pieupsios": "\uFFB4", "/hwd:rieul": "\uFFA9", "/hwd:rieulhieuh": "\uFFB0", "/hwd:rieulkiyeok": "\uFFAA", "/hwd:rieulmieum": "\uFFAB", "/hwd:rieulphieuph": "\uFFAF", "/hwd:rieulpieup": "\uFFAC", "/hwd:rieulsios": "\uFFAD", "/hwd:rieulthieuth": "\uFFAE", "/hwd:rightcornerbracket": "\uFF63", "/hwd:rightwardsarrow": "\uFFEB", "/hwd:sios": "\uFFB5", "/hwd:ssangcieuc": "\uFFB9", "/hwd:ssangkiyeok": "\uFFA2", "/hwd:ssangpieup": "\uFFB3", "/hwd:ssangsios": "\uFFB6", "/hwd:ssangtikeut": "\uFFA8", "/hwd:thieuth": "\uFFBC", "/hwd:tikeut": "\uFFA7", "/hwd:u": "\uFFD3", "/hwd:upwardsarrow": "\uFFEA", "/hwd:wa": "\uFFCD", "/hwd:wae": "\uFFCE", "/hwd:we": "\uFFD5", "/hwd:weo": "\uFFD4", "/hwd:whitecircle": "\uFFEE", "/hwd:wi": "\uFFD6", "/hwd:ya": "\uFFC4", "/hwd:yae": "\uFFC5", "/hwd:ye": "\uFFCB", "/hwd:yeo": "\uFFCA", "/hwd:yi": "\uFFDB", "/hwd:yo": "\uFFD2", "/hwd:yu": "\uFFD7", "/hyphen": "\u002D", "/hyphenationpoint": "\u2027", "/hyphenbullet": "\u2043", "/hyphendbl": "\u2E40", "/hyphendbloblique": "\u2E17", "/hyphendieresis": "\u2E1A", "/hypheninferior": "\uF6E5", "/hyphenminus": "\u002D", "/hyphenmonospace": "\uFF0D", "/hyphensmall": "\uFE63", "/hyphensoft": "\u00AD", "/hyphensuperior": "\uF6E6", "/hyphentwo": "\u2010", "/hypodiastole": "\u2E12", "/hysteresis": "\u238E", "/hzfullwidth": "\u3390", "/i": "\u0069", "/i.superior": "\u2071", "/iacute": "\u00ED", "/iacyrillic": "\u044F", "/iaepigraphic": "\uA7FE", "/ibengali": "\u0987", "/ibopomofo": "\u3127", "/ibreve": "\u012D", "/icaron": "\u01D0", "/iceCream": "\u1F368", "/iceHockeyStickAndPuck": "\u1F3D2", "/iceskate": "\u26F8", "/icircle": "\u24D8", "/icirclekatakana": "\u32D1", "/icircumflex": "\u00EE", "/icyr": 
"\u0438", "/icyrillic": "\u0456", "/idblgrave": "\u0209", "/idblstruckitalic": "\u2148", "/ideographearthcircle": "\u328F", "/ideographfirecircle": "\u328B", "/ideographicallianceparen": "\u323F", "/ideographiccallparen": "\u323A", "/ideographiccentrecircle": "\u32A5", "/ideographicclose": "\u3006", "/ideographiccomma": "\u3001", "/ideographiccommaleft": "\uFF64", "/ideographiccongratulationparen": "\u3237", "/ideographiccorrectcircle": "\u32A3", "/ideographicdepartingtonemark": "\u302C", "/ideographicearthparen": "\u322F", "/ideographicenteringtonemark": "\u302D", "/ideographicenterpriseparen": "\u323D", "/ideographicexcellentcircle": "\u329D", "/ideographicfestivalparen": "\u3240", "/ideographicfinancialcircle": "\u3296", "/ideographicfinancialparen": "\u3236", "/ideographicfireparen": "\u322B", "/ideographichalffillspace": "\u303F", "/ideographichaveparen": "\u3232", "/ideographichighcircle": "\u32A4", "/ideographiciterationmark": "\u3005", "/ideographiclaborcircle": "\u3298", "/ideographiclaborparen": "\u3238", "/ideographicleftcircle": "\u32A7", "/ideographicleveltonemark": "\u302A", "/ideographiclowcircle": "\u32A6", "/ideographicmedicinecircle": "\u32A9", "/ideographicmetalparen": "\u322E", "/ideographicmoonparen": "\u322A", "/ideographicnameparen": "\u3234", "/ideographicperiod": "\u3002", "/ideographicprintcircle": "\u329E", "/ideographicreachparen": "\u3243", "/ideographicrepresentparen": "\u3239", "/ideographicresourceparen": "\u323E", "/ideographicrightcircle": "\u32A8", "/ideographicrisingtonemark": "\u302B", "/ideographicsecretcircle": "\u3299", "/ideographicselfparen": "\u3242", "/ideographicsocietyparen": "\u3233", "/ideographicspace": "\u3000", "/ideographicspecialparen": "\u3235", "/ideographicstockparen": "\u3231", "/ideographicstudyparen": "\u323B", "/ideographicsunparen": "\u3230", "/ideographicsuperviseparen": "\u323C", "/ideographictelegraphlinefeedseparatorsymbol": "\u3037", "/ideographictelegraphsymbolforhoureight": "\u3360", 
"/ideographictelegraphsymbolforhoureighteen": "\u336A", "/ideographictelegraphsymbolforhoureleven": "\u3363", "/ideographictelegraphsymbolforhourfifteen": "\u3367", "/ideographictelegraphsymbolforhourfive": "\u335D", "/ideographictelegraphsymbolforhourfour": "\u335C", "/ideographictelegraphsymbolforhourfourteen": "\u3366", "/ideographictelegraphsymbolforhournine": "\u3361", "/ideographictelegraphsymbolforhournineteen": "\u336B", "/ideographictelegraphsymbolforhourone": "\u3359", "/ideographictelegraphsymbolforhourseven": "\u335F", "/ideographictelegraphsymbolforhourseventeen": "\u3369", "/ideographictelegraphsymbolforhoursix": "\u335E", "/ideographictelegraphsymbolforhoursixteen": "\u3368", "/ideographictelegraphsymbolforhourten": "\u3362", "/ideographictelegraphsymbolforhourthirteen": "\u3365", "/ideographictelegraphsymbolforhourthree": "\u335B", "/ideographictelegraphsymbolforhourtwelve": "\u3364", "/ideographictelegraphsymbolforhourtwenty": "\u336C", "/ideographictelegraphsymbolforhourtwentyfour": "\u3370", "/ideographictelegraphsymbolforhourtwentyone": "\u336D", "/ideographictelegraphsymbolforhourtwentythree": "\u336F", "/ideographictelegraphsymbolforhourtwentytwo": "\u336E", "/ideographictelegraphsymbolforhourtwo": "\u335A", "/ideographictelegraphsymbolforhourzero": "\u3358", "/ideographicvariationindicator": "\u303E", "/ideographicwaterparen": "\u322C", "/ideographicwoodparen": "\u322D", "/ideographiczero": "\u3007", "/ideographmetalcircle": "\u328E", "/ideographmooncircle": "\u328A", "/ideographnamecircle": "\u3294", "/ideographsuncircle": "\u3290", "/ideographwatercircle": "\u328C", "/ideographwoodcircle": "\u328D", "/ideva": "\u0907", "/idieresis": "\u00EF", "/idieresisacute": "\u1E2F", "/idieresiscyr": "\u04E5", "/idieresiscyrillic": "\u04E5", "/idotbelow": "\u1ECB", "/idsquare": "\u1F194", "/iebrevecyr": "\u04D7", "/iebrevecyrillic": "\u04D7", "/iecyr": "\u0435", "/iecyrillic": "\u0435", "/iegravecyr": "\u0450", "/iepigraphicsideways": "\uA7F7", 
"/ieungacirclekorean": "\u3275", "/ieungaparenkorean": "\u3215", "/ieungcirclekorean": "\u3267", "/ieungkorean": "\u3147", "/ieungparenkorean": "\u3207", "/ieungucirclekorean": "\u327E", "/igrave": "\u00EC", "/igravecyr": "\u045D", "/igravedbl": "\u0209", "/igujarati": "\u0A87", "/igurmukhi": "\u0A07", "/ihiragana": "\u3044", "/ihoi": "\u1EC9", "/ihookabove": "\u1EC9", "/iibengali": "\u0988", "/iicyrillic": "\u0438", "/iideva": "\u0908", "/iigujarati": "\u0A88", "/iigurmukhi": "\u0A08", "/iimatragurmukhi": "\u0A40", "/iinvertedbreve": "\u020B", "/iishortcyrillic": "\u0439", "/iivowelsignbengali": "\u09C0", "/iivowelsigndeva": "\u0940", "/iivowelsigngujarati": "\u0AC0", "/ij": "\u0133", "/ikatakana": "\u30A4", "/ikatakanahalfwidth": "\uFF72", "/ikawi": "\uA985", "/ikorean": "\u3163", "/ilde": "\u02DC", "/iluy:hb": "\u05AC", "/iluyhebrew": "\u05AC", "/imacron": "\u012B", "/imacroncyr": "\u04E3", "/imacroncyrillic": "\u04E3", "/image": "\u22B7", "/imageorapproximatelyequal": "\u2253", "/imatragurmukhi": "\u0A3F", "/imonospace": "\uFF49", "/imp": "\u1F47F", "/inboxTray": "\u1F4E5", "/incomingEnvelope": "\u1F4E8", "/increaseFontSize": "\u1F5DA", "/increment": "\u2206", "/indianrupee": "\u20B9", "/infinity": "\u221E", "/information": "\u2139", "/infullwidth": "\u33CC", "/inhibitarabicformshaping": "\u206C", "/inhibitsymmetricswapping": "\u206A", "/iniarmenian": "\u056B", "/iningusquare": "\u3304", "/inmationDeskPerson": "\u1F481", "/inputLatinCapitalLetters": "\u1F520", "/inputLatinLetters": "\u1F524", "/inputLatinSmallLetters": "\u1F521", "/inputNumbers": "\u1F522", "/inputS": "\u1F523", "/insertion": "\u2380", "/integral": "\u222B", "/integralbottom": "\u2321", "/integralbt": "\u2321", "/integralclockwise": "\u2231", "/integralcontour": "\u222E", "/integralcontouranticlockwise": "\u2233", "/integralcontourclockwise": "\u2232", "/integraldbl": "\u222C", "/integralex": "\uF8F5", "/integralextension": "\u23AE", "/integralsurface": "\u222F", "/integraltop": "\u2320", 
"/integraltp": "\u2320", "/integraltpl": "\u222D", "/integralvolume": "\u2230", "/intercalate": "\u22BA", "/interlinearanchor": "\uFFF9", "/interlinearseparator": "\uFFFA", "/interlinearterminator": "\uFFFB", "/interlockedfemalemale": "\u26A4", "/interrobang": "\u203D", "/interrobanginverted": "\u2E18", "/intersection": "\u2229", "/intersectionarray": "\u22C2", "/intersectiondbl": "\u22D2", "/intisquare": "\u3305", "/invbullet": "\u25D8", "/invcircle": "\u25D9", "/inverteddamma": "\u0657", "/invertedfork": "\u2443", "/invertedpentagram": "\u26E7", "/invertedundertie": "\u2054", "/invisibleplus": "\u2064", "/invisibleseparator": "\u2063", "/invisibletimes": "\u2062", "/invsmileface": "\u263B", "/iocyr": "\u0451", "/iocyrillic": "\u0451", "/iogonek": "\u012F", "/iota": "\u03B9", "/iotaacute": "\u1F77", "/iotaadscript": "\u1FBE", "/iotaasper": "\u1F31", "/iotaasperacute": "\u1F35", "/iotaaspergrave": "\u1F33", "/iotaaspertilde": "\u1F37", "/iotabreve": "\u1FD0", "/iotadieresis": "\u03CA", "/iotadieresisacute": "\u1FD3", "/iotadieresisgrave": "\u1FD2", "/iotadieresistilde": "\u1FD7", "/iotadieresistonos": "\u0390", "/iotafunc": "\u2373", "/iotagrave": "\u1F76", "/iotalatin": "\u0269", "/iotalenis": "\u1F30", "/iotalenisacute": "\u1F34", "/iotalenisgrave": "\u1F32", "/iotalenistilde": "\u1F36", "/iotasub": "\u037A", "/iotatilde": "\u1FD6", "/iotatonos": "\u03AF", "/iotaturned": "\u2129", "/iotaunderlinefunc": "\u2378", "/iotawithmacron": "\u1FD1", "/ipa:Ismall": "\u026A", "/ipa:alpha": "\u0251", "/ipa:ereversed": "\u0258", "/ipa:esh": "\u0283", "/ipa:gamma": "\u0263", "/ipa:glottalstop": "\u0294", "/ipa:gscript": "\u0261", "/ipa:iota": "\u0269", "/ipa:phi": "\u0278", "/ipa:rtail": "\u027D", "/ipa:schwa": "\u0259", "/ipa:upsilon": "\u028A", "/iparen": "\u24A4", "/iparenthesized": "\u24A4", "/irigurmukhi": "\u0A72", "/is": "\uA76D", "/isen-isenpada": "\uA9DF", "/ishortcyr": "\u0439", "/ishortsharptailcyr": "\u048B", "/ismallhiragana": "\u3043", "/ismallkatakana": 
"\u30A3", "/ismallkatakanahalfwidth": "\uFF68", "/issharbengali": "\u09FA", "/istroke": "\u0268", "/isuperior": "\uF6ED", "/itemideographiccircled": "\u32A0", "/iterationhiragana": "\u309D", "/iterationkatakana": "\u30FD", "/itilde": "\u0129", "/itildebelow": "\u1E2D", "/iubopomofo": "\u3129", "/iucyrillic": "\u044E", "/iufullwidth": "\u337A", "/iukrcyr": "\u0456", "/ivowelsignbengali": "\u09BF", "/ivowelsigndeva": "\u093F", "/ivowelsigngujarati": "\u0ABF", "/izakayaLantern": "\u1F3EE", "/izhitsacyr": "\u0475", "/izhitsacyrillic": "\u0475", "/izhitsadblgravecyrillic": "\u0477", "/izhitsagravedblcyr": "\u0477", "/j": "\u006A", "/j.inferior": "\u2C7C", "/jaarmenian": "\u0571", "/jabengali": "\u099C", "/jackOLantern": "\u1F383", "/jadeva": "\u091C", "/jagujarati": "\u0A9C", "/jagurmukhi": "\u0A1C", "/jamahaprana": "\uA999", "/januarytelegraph": "\u32C0", "/japaneseBeginner": "\u1F530", "/japaneseCastle": "\u1F3EF", "/japaneseDolls": "\u1F38E", "/japaneseGoblin": "\u1F47A", "/japaneseOgre": "\u1F479", "/japanesePostOffice": "\u1F3E3", "/japanesebank": "\u26FB", "/java:a": "\uA984", "/java:ai": "\uA98D", "/java:ba": "\uA9A7", "/java:ca": "\uA995", "/java:da": "\uA9A2", "/java:dda": "\uA99D", "/java:e": "\uA98C", "/java:eight": "\uA9D8", "/java:five": "\uA9D5", "/java:four": "\uA9D4", "/java:ga": "\uA992", "/java:ha": "\uA9B2", "/java:i": "\uA986", "/java:ii": "\uA987", "/java:ja": "\uA997", "/java:ka": "\uA98F", "/java:la": "\uA9AD", "/java:ma": "\uA9A9", "/java:na": "\uA9A4", "/java:nga": "\uA994", "/java:nine": "\uA9D9", "/java:nya": "\uA99A", "/java:o": "\uA98E", "/java:one": "\uA9D1", "/java:pa": "\uA9A5", "/java:ra": "\uA9AB", "/java:sa": "\uA9B1", "/java:seven": "\uA9D7", "/java:six": "\uA9D6", "/java:ta": "\uA9A0", "/java:three": "\uA9D3", "/java:tta": "\uA99B", "/java:two": "\uA9D2", "/java:u": "\uA988", "/java:wa": "\uA9AE", "/java:ya": "\uA9AA", "/java:zero": "\uA9D0", "/jbopomofo": "\u3110", "/jcaron": "\u01F0", "/jcircle": "\u24D9", "/jcircumflex": "\u0135", 
"/jcrossedtail": "\u029D", "/jdblstruckitalic": "\u2149", "/jdotlessstroke": "\u025F", "/jeans": "\u1F456", "/jecyr": "\u0458", "/jecyrillic": "\u0458", "/jeem": "\u062C", "/jeem.fina": "\uFE9E", "/jeem.init": "\uFE9F", "/jeem.init_alefmaksura.fina": "\uFD01", "/jeem.init_hah.fina": "\uFC15", "/jeem.init_hah.medi": "\uFCA7", "/jeem.init_meem.fina": "\uFC16", "/jeem.init_meem.medi": "\uFCA8", "/jeem.init_meem.medi_hah.medi": "\uFD59", "/jeem.init_yeh.fina": "\uFD02", "/jeem.isol": "\uFE9D", "/jeem.medi": "\uFEA0", "/jeem.medi_alefmaksura.fina": "\uFD1D", "/jeem.medi_hah.medi_alefmaksura.fina": "\uFDA6", "/jeem.medi_hah.medi_yeh.fina": "\uFDBE", "/jeem.medi_meem.medi_alefmaksura.fina": "\uFDA7", "/jeem.medi_meem.medi_hah.fina": "\uFD58", "/jeem.medi_meem.medi_yeh.fina": "\uFDA5", "/jeem.medi_yeh.fina": "\uFD1E", "/jeemabove": "\u06DA", "/jeemarabic": "\u062C", "/jeemfinalarabic": "\uFE9E", "/jeeminitialarabic": "\uFE9F", "/jeemmedialarabic": "\uFEA0", "/jeh": "\u0698", "/jeh.fina": "\uFB8B", "/jeh.isol": "\uFB8A", "/jeharabic": "\u0698", "/jehfinalarabic": "\uFB8B", "/jhabengali": "\u099D", "/jhadeva": "\u091D", "/jhagujarati": "\u0A9D", "/jhagurmukhi": "\u0A1D", "/jheharmenian": "\u057B", "/jis": "\u3004", "/jiterup": "\u2643", "/jmonospace": "\uFF4A", "/jotdiaeresisfunc": "\u2364", "/jotunderlinefunc": "\u235B", "/joystick": "\u1F579", "/jparen": "\u24A5", "/jparenthesized": "\u24A5", "/jstroke": "\u0249", "/jsuperior": "\u02B2", "/jsupmod": "\u02B2", "/jueuicircle": "\u327D", "/julytelegraph": "\u32C6", "/junetelegraph": "\u32C5", "/juno": "\u26B5", "/k": "\u006B", "/k.inferior": "\u2096", "/kaaba": "\u1F54B", "/kaaleutcyr": "\u051F", "/kabashkcyr": "\u04A1", "/kabashkircyrillic": "\u04A1", "/kabengali": "\u0995", "/kacirclekatakana": "\u32D5", "/kacute": "\u1E31", "/kacyr": "\u043A", "/kacyrillic": "\u043A", "/kadescendercyrillic": "\u049B", "/kadeva": "\u0915", "/kaf": "\u05DB", "/kaf.fina": "\uFEDA", "/kaf.init": "\uFEDB", "/kaf.init_alef.fina": "\uFC37", 
"/kaf.init_alefmaksura.fina": "\uFC3D", "/kaf.init_hah.fina": "\uFC39", "/kaf.init_hah.medi": "\uFCC5", "/kaf.init_jeem.fina": "\uFC38", "/kaf.init_jeem.medi": "\uFCC4", "/kaf.init_khah.fina": "\uFC3A", "/kaf.init_khah.medi": "\uFCC6", "/kaf.init_lam.fina": "\uFC3B", "/kaf.init_lam.medi": "\uFCC7", "/kaf.init_meem.fina": "\uFC3C", "/kaf.init_meem.medi": "\uFCC8", "/kaf.init_meem.medi_meem.medi": "\uFDC3", "/kaf.init_yeh.fina": "\uFC3E", "/kaf.isol": "\uFED9", "/kaf.medi": "\uFEDC", "/kaf.medi_alef.fina": "\uFC80", "/kaf.medi_alefmaksura.fina": "\uFC83", "/kaf.medi_lam.fina": "\uFC81", "/kaf.medi_lam.medi": "\uFCEB", "/kaf.medi_meem.fina": "\uFC82", "/kaf.medi_meem.medi": "\uFCEC", "/kaf.medi_meem.medi_meem.fina": "\uFDBB", "/kaf.medi_meem.medi_yeh.fina": "\uFDB7", "/kaf.medi_yeh.fina": "\uFC84", "/kaf:hb": "\u05DB", "/kafTwoDotsAbove": "\u077F", "/kafarabic": "\u0643", "/kafdagesh": "\uFB3B", "/kafdageshhebrew": "\uFB3B", "/kafdotabove": "\u06AC", "/kaffinalarabic": "\uFEDA", "/kafhebrew": "\u05DB", "/kafinitialarabic": "\uFEDB", "/kafmedialarabic": "\uFEDC", "/kafrafehebrew": "\uFB4D", "/kafring": "\u06AB", "/kafswash": "\u06AA", "/kafthreedotsbelow": "\u06AE", "/kafullwidth": "\u3384", "/kafwide:hb": "\uFB24", "/kafwithdagesh:hb": "\uFB3B", "/kafwithrafe:hb": "\uFB4D", "/kagujarati": "\u0A95", "/kagurmukhi": "\u0A15", "/kahiragana": "\u304B", "/kahookcyr": "\u04C4", "/kahookcyrillic": "\u04C4", "/kairisquare": "\u330B", "/kaisymbol": "\u03D7", "/kakatakana": "\u30AB", "/kakatakanahalfwidth": "\uFF76", "/kamurda": "\uA991", "/kappa": "\u03BA", "/kappa.math": "\u03F0", "/kappasymbolgreek": "\u03F0", "/kapyeounmieumkorean": "\u3171", "/kapyeounphieuphkorean": "\u3184", "/kapyeounpieupkorean": "\u3178", "/kapyeounssangpieupkorean": "\u3179", "/karattosquare": "\u330C", "/karoriisquare": "\u330D", "/kasasak": "\uA990", "/kashida": "\u0640", "/kashidaFina": "\uFE73", "/kashidaautoarabic": "\u0640", "/kashidaautonosidebearingarabic": "\u0640", "/kashmiriyeh": "\u0620", 
"/kasmallkatakana": "\u30F5", "/kasquare": "\u3384", "/kasra": "\u0650", "/kasraIsol": "\uFE7A", "/kasraMedi": "\uFE7B", "/kasraarabic": "\u0650", "/kasrasmall": "\u061A", "/kasratan": "\u064D", "/kasratanIsol": "\uFE74", "/kasratanarabic": "\u064D", "/kastrokecyr": "\u049F", "/kastrokecyrillic": "\u049F", "/kata:a": "\u30A2", "/kata:asmall": "\u30A1", "/kata:ba": "\u30D0", "/kata:be": "\u30D9", "/kata:bi": "\u30D3", "/kata:bo": "\u30DC", "/kata:bu": "\u30D6", "/kata:da": "\u30C0", "/kata:de": "\u30C7", "/kata:di": "\u30C2", "/kata:digraphkoto": "\u30FF", "/kata:do": "\u30C9", "/kata:doublehyphenkana": "\u30A0", "/kata:du": "\u30C5", "/kata:e": "\u30A8", "/kata:esmall": "\u30A7", "/kata:ga": "\u30AC", "/kata:ge": "\u30B2", "/kata:gi": "\u30AE", "/kata:go": "\u30B4", "/kata:gu": "\u30B0", "/kata:ha": "\u30CF", "/kata:he": "\u30D8", "/kata:hi": "\u30D2", "/kata:ho": "\u30DB", "/kata:hu": "\u30D5", "/kata:i": "\u30A4", "/kata:ismall": "\u30A3", "/kata:iteration": "\u30FD", "/kata:ka": "\u30AB", "/kata:kasmall": "\u30F5", "/kata:ke": "\u30B1", "/kata:kesmall": "\u30F6", "/kata:ki": "\u30AD", "/kata:ko": "\u30B3", "/kata:ku": "\u30AF", "/kata:ma": "\u30DE", "/kata:me": "\u30E1", "/kata:mi": "\u30DF", "/kata:middledot": "\u30FB", "/kata:mo": "\u30E2", "/kata:mu": "\u30E0", "/kata:n": "\u30F3", "/kata:na": "\u30CA", "/kata:ne": "\u30CD", "/kata:ni": "\u30CB", "/kata:no": "\u30CE", "/kata:nu": "\u30CC", "/kata:o": "\u30AA", "/kata:osmall": "\u30A9", "/kata:pa": "\u30D1", "/kata:pe": "\u30DA", "/kata:pi": "\u30D4", "/kata:po": "\u30DD", "/kata:prolongedkana": "\u30FC", "/kata:pu": "\u30D7", "/kata:ra": "\u30E9", "/kata:re": "\u30EC", "/kata:ri": "\u30EA", "/kata:ro": "\u30ED", "/kata:ru": "\u30EB", "/kata:sa": "\u30B5", "/kata:se": "\u30BB", "/kata:si": "\u30B7", "/kata:so": "\u30BD", "/kata:su": "\u30B9", "/kata:ta": "\u30BF", "/kata:te": "\u30C6", "/kata:ti": "\u30C1", "/kata:to": "\u30C8", "/kata:tu": "\u30C4", "/kata:tusmall": "\u30C3", "/kata:u": "\u30A6", 
"/kata:usmall": "\u30A5", "/kata:va": "\u30F7", "/kata:ve": "\u30F9", "/kata:vi": "\u30F8", "/kata:vo": "\u30FA", "/kata:voicediteration": "\u30FE", "/kata:vu": "\u30F4", "/kata:wa": "\u30EF", "/kata:wasmall": "\u30EE", "/kata:we": "\u30F1", "/kata:wi": "\u30F0", "/kata:wo": "\u30F2", "/kata:ya": "\u30E4", "/kata:yasmall": "\u30E3", "/kata:yo": "\u30E8", "/kata:yosmall": "\u30E7", "/kata:yu": "\u30E6", "/kata:yusmall": "\u30E5", "/kata:za": "\u30B6", "/kata:ze": "\u30BC", "/kata:zi": "\u30B8", "/kata:zo": "\u30BE", "/kata:zu": "\u30BA", "/katahiraprolongmarkhalfwidth": "\uFF70", "/katailcyr": "\u049B", "/kaverticalstrokecyr": "\u049D", "/kaverticalstrokecyrillic": "\u049D", "/kavykainvertedlow": "\u2E45", "/kavykalow": "\u2E47", "/kavykawithdotlow": "\u2E48", "/kavykawithkavykaaboveinvertedlow": "\u2E46", "/kbfullwidth": "\u3385", "/kbopomofo": "\u310E", "/kcalfullwidth": "\u3389", "/kcalsquare": "\u3389", "/kcaron": "\u01E9", "/kcedilla": "\u0137", "/kcircle": "\u24DA", "/kcommaaccent": "\u0137", "/kdescender": "\u2C6A", "/kdiagonalstroke": "\uA743", "/kdotbelow": "\u1E33", "/kecirclekatakana": "\u32D8", "/keesusquare": "\u331C", "/keharmenian": "\u0584", "/keheh": "\u06A9", "/keheh.fina": "\uFB8F", "/keheh.init": "\uFB90", "/keheh.isol": "\uFB8E", "/keheh.medi": "\uFB91", "/kehehDotAbove": "\u0762", "/kehehThreeDotsAbove": "\u0763", "/kehehThreeDotsUpBelow": "\u0764", "/kehehthreedotsbelow": "\u063C", "/kehehtwodotsabove": "\u063B", "/kehiragana": "\u3051", "/kekatakana": "\u30B1", "/kekatakanahalfwidth": "\uFF79", "/kelvin": "\u212A", "/kenarmenian": "\u056F", "/keretconsonant": "\uA9BD", "/kesmallkatakana": "\u30F6", "/key": "\u1F511", "/keyboardAndMouse": "\u1F5A6", "/keycapTen": "\u1F51F", "/kgfullwidth": "\u338F", "/kgreenlandic": "\u0138", "/khabengali": "\u0996", "/khacyrillic": "\u0445", "/khadeva": "\u0916", "/khagujarati": "\u0A96", "/khagurmukhi": "\u0A16", "/khah": "\u062E", "/khah.fina": "\uFEA6", "/khah.init": "\uFEA7", 
"/khah.init_alefmaksura.fina": "\uFD03", "/khah.init_hah.fina": "\uFC1A", "/khah.init_jeem.fina": "\uFC19", "/khah.init_jeem.medi": "\uFCAB", "/khah.init_meem.fina": "\uFC1B", "/khah.init_meem.medi": "\uFCAC", "/khah.init_yeh.fina": "\uFD04", "/khah.isol": "\uFEA5", "/khah.medi": "\uFEA8", "/khah.medi_alefmaksura.fina": "\uFD1F", "/khah.medi_yeh.fina": "\uFD20", "/khaharabic": "\u062E", "/khahfinalarabic": "\uFEA6", "/khahinitialarabic": "\uFEA7", "/khahmedialarabic": "\uFEA8", "/kheicoptic": "\u03E7", "/khhadeva": "\u0959", "/khhagurmukhi": "\u0A59", "/khieukhacirclekorean": "\u3278", "/khieukhaparenkorean": "\u3218", "/khieukhcirclekorean": "\u326A", "/khieukhkorean": "\u314B", "/khieukhparenkorean": "\u320A", "/khokhaithai": "\u0E02", "/khokhonthai": "\u0E05", "/khokhuatthai": "\u0E03", "/khokhwaithai": "\u0E04", "/khomutthai": "\u0E5B", "/khook": "\u0199", "/khorakhangthai": "\u0E06", "/khzfullwidth": "\u3391", "/khzsquare": "\u3391", "/kicirclekatakana": "\u32D6", "/kihiragana": "\u304D", "/kikatakana": "\u30AD", "/kikatakanahalfwidth": "\uFF77", "/kimono": "\u1F458", "/kindergartenideographiccircled": "\u3245", "/kingblack": "\u265A", "/kingwhite": "\u2654", "/kip": "\u20AD", "/kiroguramusquare": "\u3315", "/kiromeetorusquare": "\u3316", "/kirosquare": "\u3314", "/kirowattosquare": "\u3317", "/kiss": "\u1F48F", "/kissMark": "\u1F48B", "/kissingCatFaceWithClosedEyes": "\u1F63D", "/kissingFace": "\u1F617", "/kissingFaceWithClosedEyes": "\u1F61A", "/kissingFaceWithSmilingEyes": "\u1F619", "/kiyeokacirclekorean": "\u326E", "/kiyeokaparenkorean": "\u320E", "/kiyeokcirclekorean": "\u3260", "/kiyeokkorean": "\u3131", "/kiyeokparenkorean": "\u3200", "/kiyeoksioskorean": "\u3133", "/kjecyr": "\u045C", "/kjecyrillic": "\u045C", "/kkfullwidth": "\u33CD", "/klfullwidth": "\u3398", "/klinebelow": "\u1E35", "/klsquare": "\u3398", "/km2fullwidth": "\u33A2", "/km3fullwidth": "\u33A6", "/kmcapitalfullwidth": "\u33CE", "/kmcubedsquare": "\u33A6", "/kmfullwidth": "\u339E", 
"/kmonospace": "\uFF4B", "/kmsquaredsquare": "\u33A2", "/knda:a": "\u0C85", "/knda:aa": "\u0C86", "/knda:aasign": "\u0CBE", "/knda:ai": "\u0C90", "/knda:ailength": "\u0CD6", "/knda:aisign": "\u0CC8", "/knda:anusvara": "\u0C82", "/knda:au": "\u0C94", "/knda:ausign": "\u0CCC", "/knda:avagraha": "\u0CBD", "/knda:ba": "\u0CAC", "/knda:bha": "\u0CAD", "/knda:ca": "\u0C9A", "/knda:cha": "\u0C9B", "/knda:da": "\u0CA6", "/knda:dda": "\u0CA1", "/knda:ddha": "\u0CA2", "/knda:dha": "\u0CA7", "/knda:e": "\u0C8E", "/knda:ee": "\u0C8F", "/knda:eesign": "\u0CC7", "/knda:eight": "\u0CEE", "/knda:esign": "\u0CC6", "/knda:fa": "\u0CDE", "/knda:five": "\u0CEB", "/knda:four": "\u0CEA", "/knda:ga": "\u0C97", "/knda:gha": "\u0C98", "/knda:ha": "\u0CB9", "/knda:i": "\u0C87", "/knda:ii": "\u0C88", "/knda:iisign": "\u0CC0", "/knda:isign": "\u0CBF", "/knda:ja": "\u0C9C", "/knda:jha": "\u0C9D", "/knda:jihvamuliya": "\u0CF1", "/knda:ka": "\u0C95", "/knda:kha": "\u0C96", "/knda:la": "\u0CB2", "/knda:length": "\u0CD5", "/knda:lla": "\u0CB3", "/knda:llvocal": "\u0CE1", "/knda:llvocalsign": "\u0CE3", "/knda:lvocal": "\u0C8C", "/knda:lvocalsign": "\u0CE2", "/knda:ma": "\u0CAE", "/knda:na": "\u0CA8", "/knda:nga": "\u0C99", "/knda:nine": "\u0CEF", "/knda:nna": "\u0CA3", "/knda:nukta": "\u0CBC", "/knda:nya": "\u0C9E", "/knda:o": "\u0C92", "/knda:one": "\u0CE7", "/knda:oo": "\u0C93", "/knda:oosign": "\u0CCB", "/knda:osign": "\u0CCA", "/knda:pa": "\u0CAA", "/knda:pha": "\u0CAB", "/knda:ra": "\u0CB0", "/knda:rra": "\u0CB1", "/knda:rrvocal": "\u0CE0", "/knda:rrvocalsign": "\u0CC4", "/knda:rvocal": "\u0C8B", "/knda:rvocalsign": "\u0CC3", "/knda:sa": "\u0CB8", "/knda:seven": "\u0CED", "/knda:sha": "\u0CB6", "/knda:signcandrabindu": "\u0C81", "/knda:signspacingcandrabindu": "\u0C80", "/knda:six": "\u0CEC", "/knda:ssa": "\u0CB7", "/knda:ta": "\u0CA4", "/knda:tha": "\u0CA5", "/knda:three": "\u0CE9", "/knda:tta": "\u0C9F", "/knda:ttha": "\u0CA0", "/knda:two": "\u0CE8", "/knda:u": "\u0C89", "/knda:upadhmaniya": 
"\u0CF2", "/knda:usign": "\u0CC1", "/knda:uu": "\u0C8A", "/knda:uusign": "\u0CC2", "/knda:va": "\u0CB5", "/knda:virama": "\u0CCD", "/knda:visarga": "\u0C83", "/knda:ya": "\u0CAF", "/knda:zero": "\u0CE6", "/knightblack": "\u265E", "/knightwhite": "\u2658", "/ko:a": "\u314F", "/ko:ae": "\u3150", "/ko:aejungseong": "\u1162", "/ko:aeujungseong": "\u11A3", "/ko:ajungseong": "\u1161", "/ko:aojungseong": "\u1176", "/ko:araea": "\u318D", "/ko:araeae": "\u318E", "/ko:araeaeojungseong": "\u119F", "/ko:araeaijungseong": "\u11A1", "/ko:araeajungseong": "\u119E", "/ko:araeaujungseong": "\u11A0", "/ko:aujungseong": "\u1177", "/ko:ceongchieumchieuchchoseong": "\u1155", "/ko:ceongchieumcieucchoseong": "\u1150", "/ko:ceongchieumsioschoseong": "\u113E", "/ko:ceongchieumssangcieucchoseong": "\u1151", "/ko:ceongchieumssangsioschoseong": "\u113F", "/ko:chieuch": "\u314A", "/ko:chieuchchoseong": "\u110E", "/ko:chieuchhieuhchoseong": "\u1153", "/ko:chieuchjongseong": "\u11BE", "/ko:chieuchkhieukhchoseong": "\u1152", "/ko:chitueumchieuchchoseong": "\u1154", "/ko:chitueumcieucchoseong": "\u114E", "/ko:chitueumsioschoseong": "\u113C", "/ko:chitueumssangcieucchoseong": "\u114F", "/ko:chitueumssangsioschoseong": "\u113D", "/ko:cieuc": "\u3148", "/ko:cieucchoseong": "\u110C", "/ko:cieucieungchoseong": "\u114D", "/ko:cieucjongseong": "\u11BD", "/ko:e": "\u3154", "/ko:ejungseong": "\u1166", "/ko:eo": "\u3153", "/ko:eo_eujungseong": "\u117C", "/ko:eojungseong": "\u1165", "/ko:eoojungseong": "\u117A", "/ko:eoujungseong": "\u117B", "/ko:eu": "\u3161", "/ko:eueujungseong": "\u1196", "/ko:eujungseong": "\u1173", "/ko:euujungseong": "\u1195", "/ko:filler": "\u3164", "/ko:fillerchoseong": "\u115F", "/ko:fillerjungseong": "\u1160", "/ko:hieuh": "\u314E", "/ko:hieuhchoseong": "\u1112", "/ko:hieuhjongseong": "\u11C2", "/ko:hieuhmieumjongseong": "\u11F7", "/ko:hieuhnieunjongseong": "\u11F5", "/ko:hieuhpieupjongseong": "\u11F8", "/ko:hieuhrieuljongseong": "\u11F6", "/ko:i": "\u3163", "/ko:iajungseong": 
"\u1198", "/ko:iaraeajungseong": "\u119D", "/ko:ieujungseong": "\u119C", "/ko:ieung": "\u3147", "/ko:ieungchieuchchoseong": "\u1149", "/ko:ieungchoseong": "\u110B", "/ko:ieungcieucchoseong": "\u1148", "/ko:ieungjongseong": "\u11BC", "/ko:ieungkhieukhjongseong": "\u11EF", "/ko:ieungkiyeokchoseong": "\u1141", "/ko:ieungkiyeokjongseong": "\u11EC", "/ko:ieungmieumchoseong": "\u1143", "/ko:ieungpansioschoseong": "\u1146", "/ko:ieungphieuphchoseong": "\u114B", "/ko:ieungpieupchoseong": "\u1144", "/ko:ieungsioschoseong": "\u1145", "/ko:ieungssangkiyeokjongseong": "\u11ED", "/ko:ieungthieuthchoseong": "\u114A", "/ko:ieungtikeutchoseong": "\u1142", "/ko:ijungseong": "\u1175", "/ko:iojungseong": "\u119A", "/ko:iujungseong": "\u119B", "/ko:iyajungseong": "\u1199", "/ko:kapyeounmieum": "\u3171", "/ko:kapyeounmieumchoseong": "\u111D", "/ko:kapyeounmieumjongseong": "\u11E2", "/ko:kapyeounphieuph": "\u3184", "/ko:kapyeounphieuphchoseong": "\u1157", "/ko:kapyeounphieuphjongseong": "\u11F4", "/ko:kapyeounpieup": "\u3178", "/ko:kapyeounpieupchoseong": "\u112B", "/ko:kapyeounpieupjongseong": "\u11E6", "/ko:kapyeounrieulchoseong": "\u111B", "/ko:kapyeounssangpieup": "\u3179", "/ko:kapyeounssangpieupchoseong": "\u112C", "/ko:khieukh": "\u314B", "/ko:khieukhchoseong": "\u110F", "/ko:khieukhjongseong": "\u11BF", "/ko:kiyeok": "\u3131", "/ko:kiyeokchieuchjongseong": "\u11FC", "/ko:kiyeokchoseong": "\u1100", "/ko:kiyeokhieuhjongseong": "\u11FE", "/ko:kiyeokjongseong": "\u11A8", "/ko:kiyeokkhieukhjongseong": "\u11FD", "/ko:kiyeoknieunjongseong": "\u11FA", "/ko:kiyeokpieupjongseong": "\u11FB", "/ko:kiyeokrieuljongseong": "\u11C3", "/ko:kiyeoksios": "\u3133", "/ko:kiyeoksiosjongseong": "\u11AA", "/ko:kiyeoksioskiyeokjongseong": "\u11C4", "/ko:kiyeoktikeutchoseong": "\u115A", "/ko:mieum": "\u3141", "/ko:mieumchieuchjongseong": "\u11E0", "/ko:mieumchoseong": "\u1106", "/ko:mieumhieuhjongseong": "\u11E1", "/ko:mieumjongseong": "\u11B7", "/ko:mieumkiyeokjongseong": "\u11DA", "/ko:mieumpansios": 
"\u3170", "/ko:mieumpansiosjongseong": "\u11DF", "/ko:mieumpieup": "\u316E", "/ko:mieumpieupchoseong": "\u111C", "/ko:mieumpieupjongseong": "\u11DC", "/ko:mieumrieuljongseong": "\u11DB", "/ko:mieumsios": "\u316F", "/ko:mieumsiosjongseong": "\u11DD", "/ko:mieumssangsiosjongseong": "\u11DE", "/ko:nieun": "\u3134", "/ko:nieunchoseong": "\u1102", "/ko:nieuncieuc": "\u3135", "/ko:nieuncieucchoseong": "\u115C", "/ko:nieuncieucjongseong": "\u11AC", "/ko:nieunhieuh": "\u3136", "/ko:nieunhieuhchoseong": "\u115D", "/ko:nieunhieuhjongseong": "\u11AD", "/ko:nieunjongseong": "\u11AB", "/ko:nieunkiyeokchoseong": "\u1113", "/ko:nieunkiyeokjongseong": "\u11C5", "/ko:nieunpansios": "\u3168", "/ko:nieunpansiosjongseong": "\u11C8", "/ko:nieunpieupchoseong": "\u1116", "/ko:nieunsios": "\u3167", "/ko:nieunsioschoseong": "\u115B", "/ko:nieunsiosjongseong": "\u11C7", "/ko:nieunthieuthjongseong": "\u11C9", "/ko:nieuntikeut": "\u3166", "/ko:nieuntikeutchoseong": "\u1115", "/ko:nieuntikeutjongseong": "\u11C6", "/ko:o": "\u3157", "/ko:o_ejungseong": "\u1180", "/ko:o_eojungseong": "\u117F", "/ko:oe": "\u315A", "/ko:oejungseong": "\u116C", "/ko:ojungseong": "\u1169", "/ko:oojungseong": "\u1182", "/ko:oujungseong": "\u1183", "/ko:oyaejungseong": "\u11A7", "/ko:oyajungseong": "\u11A6", "/ko:oyejungseong": "\u1181", "/ko:pansios": "\u317F", "/ko:pansioschoseong": "\u1140", "/ko:pansiosjongseong": "\u11EB", "/ko:phieuph": "\u314D", "/ko:phieuphchoseong": "\u1111", "/ko:phieuphjongseong": "\u11C1", "/ko:phieuphpieupchoseong": "\u1156", "/ko:phieuphpieupjongseong": "\u11F3", "/ko:pieup": "\u3142", "/ko:pieupchieuchchoseong": "\u1128", "/ko:pieupchoseong": "\u1107", "/ko:pieupcieuc": "\u3176", "/ko:pieupcieucchoseong": "\u1127", "/ko:pieuphieuhjongseong": "\u11E5", "/ko:pieupjongseong": "\u11B8", "/ko:pieupkiyeok": "\u3172", "/ko:pieupkiyeokchoseong": "\u111E", "/ko:pieupnieunchoseong": "\u111F", "/ko:pieupphieuphchoseong": "\u112A", "/ko:pieupphieuphjongseong": "\u11E4", "/ko:pieuprieuljongseong": 
"\u11E3", "/ko:pieupsios": "\u3144", "/ko:pieupsioschoseong": "\u1121", "/ko:pieupsioscieucchoseong": "\u1126", "/ko:pieupsiosjongseong": "\u11B9", "/ko:pieupsioskiyeok": "\u3174", "/ko:pieupsioskiyeokchoseong": "\u1122", "/ko:pieupsiospieupchoseong": "\u1124", "/ko:pieupsiostikeut": "\u3175", "/ko:pieupsiostikeutchoseong": "\u1123", "/ko:pieupssangsioschoseong": "\u1125", "/ko:pieupthieuth": "\u3177", "/ko:pieupthieuthchoseong": "\u1129", "/ko:pieuptikeut": "\u3173", "/ko:pieuptikeutchoseong": "\u1120", "/ko:rieul": "\u3139", "/ko:rieulchoseong": "\u1105", "/ko:rieulhieuh": "\u3140", "/ko:rieulhieuhchoseong": "\u111A", "/ko:rieulhieuhjongseong": "\u11B6", "/ko:rieuljongseong": "\u11AF", "/ko:rieulkapyeounpieupjongseong": "\u11D5", "/ko:rieulkhieukhjongseong": "\u11D8", "/ko:rieulkiyeok": "\u313A", "/ko:rieulkiyeokjongseong": "\u11B0", "/ko:rieulkiyeoksios": "\u3169", "/ko:rieulkiyeoksiosjongseong": "\u11CC", "/ko:rieulmieum": "\u313B", "/ko:rieulmieumjongseong": "\u11B1", "/ko:rieulmieumkiyeokjongseong": "\u11D1", "/ko:rieulmieumsiosjongseong": "\u11D2", "/ko:rieulnieunchoseong": "\u1118", "/ko:rieulnieunjongseong": "\u11CD", "/ko:rieulpansios": "\u316C", "/ko:rieulpansiosjongseong": "\u11D7", "/ko:rieulphieuph": "\u313F", "/ko:rieulphieuphjongseong": "\u11B5", "/ko:rieulpieup": "\u313C", "/ko:rieulpieuphieuhjongseong": "\u11D4", "/ko:rieulpieupjongseong": "\u11B2", "/ko:rieulpieupsios": "\u316B", "/ko:rieulpieupsiosjongseong": "\u11D3", "/ko:rieulsios": "\u313D", "/ko:rieulsiosjongseong": "\u11B3", "/ko:rieulssangsiosjongseong": "\u11D6", "/ko:rieulthieuth": "\u313E", "/ko:rieulthieuthjongseong": "\u11B4", "/ko:rieultikeut": "\u316A", "/ko:rieultikeuthieuhjongseong": "\u11CF", "/ko:rieultikeutjongseong": "\u11CE", "/ko:rieulyeorinhieuh": "\u316D", "/ko:rieulyeorinhieuhjongseong": "\u11D9", "/ko:sios": "\u3145", "/ko:sioschieuchchoseong": "\u1137", "/ko:sioschoseong": "\u1109", "/ko:sioscieuc": "\u317E", "/ko:sioscieucchoseong": "\u1136", "/ko:sioshieuhchoseong": 
"\u113B", "/ko:siosieungchoseong": "\u1135", "/ko:siosjongseong": "\u11BA", "/ko:sioskhieukhchoseong": "\u1138", "/ko:sioskiyeok": "\u317A", "/ko:sioskiyeokchoseong": "\u112D", "/ko:sioskiyeokjongseong": "\u11E7", "/ko:siosmieumchoseong": "\u1131", "/ko:siosnieun": "\u317B", "/ko:siosnieunchoseong": "\u112E", "/ko:siosphieuphchoseong": "\u113A", "/ko:siospieup": "\u317D", "/ko:siospieupchoseong": "\u1132", "/ko:siospieupjongseong": "\u11EA", "/ko:siospieupkiyeokchoseong": "\u1133", "/ko:siosrieulchoseong": "\u1130", "/ko:siosrieuljongseong": "\u11E9", "/ko:siosssangsioschoseong": "\u1134", "/ko:siosthieuthchoseong": "\u1139", "/ko:siostikeut": "\u317C", "/ko:siostikeutchoseong": "\u112F", "/ko:siostikeutjongseong": "\u11E8", "/ko:ssangaraeajungseong": "\u11A2", "/ko:ssangcieuc": "\u3149", "/ko:ssangcieucchoseong": "\u110D", "/ko:ssanghieuh": "\u3185", "/ko:ssanghieuhchoseong": "\u1158", "/ko:ssangieung": "\u3180", "/ko:ssangieungchoseong": "\u1147", "/ko:ssangieungjongseong": "\u11EE", "/ko:ssangkiyeok": "\u3132", "/ko:ssangkiyeokchoseong": "\u1101", "/ko:ssangkiyeokjongseong": "\u11A9", "/ko:ssangnieun": "\u3165", "/ko:ssangnieunchoseong": "\u1114", "/ko:ssangnieunjongseong": "\u11FF", "/ko:ssangpieup": "\u3143", "/ko:ssangpieupchoseong": "\u1108", "/ko:ssangrieulchoseong": "\u1119", "/ko:ssangrieuljongseong": "\u11D0", "/ko:ssangsios": "\u3146", "/ko:ssangsioschoseong": "\u110A", "/ko:ssangsiosjongseong": "\u11BB", "/ko:ssangtikeut": "\u3138", "/ko:ssangtikeutchoseong": "\u1104", "/ko:thieuth": "\u314C", "/ko:thieuthchoseong": "\u1110", "/ko:thieuthjongseong": "\u11C0", "/ko:tikeut": "\u3137", "/ko:tikeutchoseong": "\u1103", "/ko:tikeutjongseong": "\u11AE", "/ko:tikeutkiyeokchoseong": "\u1117", "/ko:tikeutkiyeokjongseong": "\u11CA", "/ko:tikeutrieulchoseong": "\u115E", "/ko:tikeutrieuljongseong": "\u11CB", "/ko:u": "\u315C", "/ko:uaejungseong": "\u118A", "/ko:uajungseong": "\u1189", "/ko:ueo_eujungseong": "\u118B", "/ko:ujungseong": "\u116E", "/ko:uujungseong": 
"\u118D", "/ko:uyejungseong": "\u118C", "/ko:wa": "\u3158", "/ko:wae": "\u3159", "/ko:waejungseong": "\u116B", "/ko:wajungseong": "\u116A", "/ko:we": "\u315E", "/ko:wejungseong": "\u1170", "/ko:weo": "\u315D", "/ko:weojungseong": "\u116F", "/ko:wi": "\u315F", "/ko:wijungseong": "\u1171", "/ko:ya": "\u3151", "/ko:yae": "\u3152", "/ko:yaejungseong": "\u1164", "/ko:yajungseong": "\u1163", "/ko:yaojungseong": "\u1178", "/ko:yaujungseong": "\u11A4", "/ko:yayojungseong": "\u1179", "/ko:ye": "\u3156", "/ko:yejungseong": "\u1168", "/ko:yeo": "\u3155", "/ko:yeojungseong": "\u1167", "/ko:yeoojungseong": "\u117D", "/ko:yeorinhieuh": "\u3186", "/ko:yeorinhieuhchoseong": "\u1159", "/ko:yeorinhieuhjongseong": "\u11F9", "/ko:yeoujungseong": "\u117E", "/ko:yeoyajungseong": "\u11A5", "/ko:yesieung": "\u3181", "/ko:yesieungchoseong": "\u114C", "/ko:yesieungjongseong": "\u11F0", "/ko:yesieungpansios": "\u3183", "/ko:yesieungpansiosjongseong": "\u11F2", "/ko:yesieungsios": "\u3182", "/ko:yesieungsiosjongseong": "\u11F1", "/ko:yi": "\u3162", "/ko:yijungseong": "\u1174", "/ko:yiujungseong": "\u1197", "/ko:yo": "\u315B", "/ko:yoi": "\u3189", "/ko:yoijungseong": "\u1188", "/ko:yojungseong": "\u116D", "/ko:yoojungseong": "\u1187", "/ko:yoya": "\u3187", "/ko:yoyae": "\u3188", "/ko:yoyaejungseong": "\u1185", "/ko:yoyajungseong": "\u1184", "/ko:yoyeojungseong": "\u1186", "/ko:yu": "\u3160", "/ko:yuajungseong": "\u118E", "/ko:yuejungseong": "\u1190", "/ko:yueojungseong": "\u118F", "/ko:yui": "\u318C", "/ko:yuijungseong": "\u1194", "/ko:yujungseong": "\u1172", "/ko:yuujungseong": "\u1193", "/ko:yuye": "\u318B", "/ko:yuyejungseong": "\u1192", "/ko:yuyeo": "\u318A", "/ko:yuyeojungseong": "\u1191", "/koala": "\u1F428", "/kobliquestroke": "\uA7A3", "/kocirclekatakana": "\u32D9", "/kohiragana": "\u3053", "/kohmfullwidth": "\u33C0", "/kohmsquare": "\u33C0", "/kokaithai": "\u0E01", "/kokatakana": "\u30B3", "/kokatakanahalfwidth": "\uFF7A", "/kooposquare": "\u331E", "/koppa": "\u03DF", "/koppaarchaic": 
"\u03D9", "/koppacyr": "\u0481", "/koppacyrillic": "\u0481", "/koreanstandardsymbol": "\u327F", "/koroniscmb": "\u0343", "/korunasquare": "\u331D", "/kotoideographiccircled": "\u3247", "/kpafullwidth": "\u33AA", "/kparen": "\u24A6", "/kparenthesized": "\u24A6", "/kpasquare": "\u33AA", "/kra": "\u0138", "/ksicyr": "\u046F", "/ksicyrillic": "\u046F", "/kstroke": "\uA741", "/kstrokediagonalstroke": "\uA745", "/ktfullwidth": "\u33CF", "/ktsquare": "\u33CF", "/kturned": "\u029E", "/kucirclekatakana": "\u32D7", "/kuhiragana": "\u304F", "/kukatakana": "\u30AF", "/kukatakanahalfwidth": "\uFF78", "/kuroonesquare": "\u331B", "/kuruzeirosquare": "\u331A", "/kvfullwidth": "\u33B8", "/kvsquare": "\u33B8", "/kwfullwidth": "\u33BE", "/kwsquare": "\u33BE", "/kyuriisquare": "\u3312", "/l": "\u006C", "/l.inferior": "\u2097", "/label": "\u1F3F7", "/labengali": "\u09B2", "/laborideographiccircled": "\u3298", "/laborideographicparen": "\u3238", "/lacute": "\u013A", "/ladeva": "\u0932", "/ladyBeetle": "\u1F41E", "/lagujarati": "\u0AB2", "/lagurmukhi": "\u0A32", "/lakkhangyaothai": "\u0E45", "/lam": "\u0644", "/lam.fina": "\uFEDE", "/lam.init": "\uFEDF", "/lam.init_alef.fina": "\uFEFB", "/lam.init_alef.medi_hamzaabove.fina": "\uFEF7", "/lam.init_alef.medi_hamzabelow.fina": "\uFEF9", "/lam.init_alef.medi_maddaabove.fina": "\uFEF5", "/lam.init_alefmaksura.fina": "\uFC43", "/lam.init_hah.fina": "\uFC40", "/lam.init_hah.medi": "\uFCCA", "/lam.init_hah.medi_meem.medi": "\uFDB5", "/lam.init_heh.medi": "\uFCCD", "/lam.init_jeem.fina": "\uFC3F", "/lam.init_jeem.medi": "\uFCC9", "/lam.init_jeem.medi_jeem.medi": "\uFD83", "/lam.init_jeem.medi_meem.medi": "\uFDBA", "/lam.init_khah.fina": "\uFC41", "/lam.init_khah.medi": "\uFCCB", "/lam.init_khah.medi_meem.medi": "\uFD86", "/lam.init_meem.fina": "\uFC42", "/lam.init_meem.medi": "\uFCCC", "/lam.init_meem.medi_hah.medi": "\uFD88", "/lam.init_yeh.fina": "\uFC44", "/lam.isol": "\uFEDD", "/lam.medi": "\uFEE0", "/lam.medi_alef.fina": "\uFEFC", 
"/lam.medi_alef.medi_hamzaabove.fina": "\uFEF8", "/lam.medi_alef.medi_hamzabelow.fina": "\uFEFA", "/lam.medi_alef.medi_maddaabove.fina": "\uFEF6", "/lam.medi_alefmaksura.fina": "\uFC86", "/lam.medi_hah.medi_alefmaksura.fina": "\uFD82", "/lam.medi_hah.medi_meem.fina": "\uFD80", "/lam.medi_hah.medi_yeh.fina": "\uFD81", "/lam.medi_jeem.medi_jeem.fina": "\uFD84", "/lam.medi_jeem.medi_meem.fina": "\uFDBC", "/lam.medi_jeem.medi_yeh.fina": "\uFDAC", "/lam.medi_khah.medi_meem.fina": "\uFD85", "/lam.medi_meem.fina": "\uFC85", "/lam.medi_meem.medi": "\uFCED", "/lam.medi_meem.medi_hah.fina": "\uFD87", "/lam.medi_meem.medi_yeh.fina": "\uFDAD", "/lam.medi_yeh.fina": "\uFC87", "/lamBar": "\u076A", "/lamVabove": "\u06B5", "/lamalefabove": "\u06D9", "/lamaleffinalarabic": "\uFEFC", "/lamalefhamzaabovefinalarabic": "\uFEF8", "/lamalefhamzaaboveisolatedarabic": "\uFEF7", "/lamalefhamzabelowfinalarabic": "\uFEFA", "/lamalefhamzabelowisolatedarabic": "\uFEF9", "/lamalefisolatedarabic": "\uFEFB", "/lamalefmaddaabovefinalarabic": "\uFEF6", "/lamalefmaddaaboveisolatedarabic": "\uFEF5", "/lamarabic": "\u0644", "/lambda": "\u03BB", "/lambdastroke": "\u019B", "/lamdotabove": "\u06B6", "/lamed": "\u05DC", "/lamed:hb": "\u05DC", "/lameddagesh": "\uFB3C", "/lameddageshhebrew": "\uFB3C", "/lamedhebrew": "\u05DC", "/lamedholam": "\u05DC", "/lamedholamdagesh": "\u05DC", "/lamedholamdageshhebrew": "\u05DC", "/lamedholamhebrew": "\u05DC", "/lamedwide:hb": "\uFB25", "/lamedwithdagesh:hb": "\uFB3C", "/lamfinalarabic": "\uFEDE", "/lamhahinitialarabic": "\uFCCA", "/laminitialarabic": "\uFEDF", "/lamjeeminitialarabic": "\uFCC9", "/lamkhahinitialarabic": "\uFCCB", "/lamlamhehisolatedarabic": "\uFDF2", "/lammedialarabic": "\uFEE0", "/lammeemhahinitialarabic": "\uFD88", "/lammeeminitialarabic": "\uFCCC", "/lammeemjeeminitialarabic": "\uFEDF", "/lammeemkhahinitialarabic": "\uFEDF", "/lamthreedotsabove": "\u06B7", "/lamthreedotsbelow": "\u06B8", "/lanemergeleftblack": "\u26D8", "/lanemergeleftwhite": 
"\u26D9", "/largeBlueCircle": "\u1F535", "/largeBlueDiamond": "\u1F537", "/largeOrangeDiamond": "\u1F536", "/largeRedCircle": "\u1F534", "/largecircle": "\u25EF", "/largetackdown": "\u27D9", "/largetackup": "\u27D8", "/lari": "\u20BE", "/lastQuarterMoon": "\u1F317", "/lastQuarterMoonFace": "\u1F31C", "/lastquartermoon": "\u263E", "/layar": "\uA982", "/lazysinverted": "\u223E", "/lbar": "\u019A", "/lbbar": "\u2114", "/lbelt": "\u026C", "/lbeltretroflex": "\uA78E", "/lbopomofo": "\u310C", "/lbroken": "\uA747", "/lcaron": "\u013E", "/lcedilla": "\u013C", "/lcircle": "\u24DB", "/lcircumflexbelow": "\u1E3D", "/lcommaaccent": "\u013C", "/lcurl": "\u0234", "/ldblbar": "\u2C61", "/ldot": "\u0140", "/ldotaccent": "\u0140", "/ldotbelow": "\u1E37", "/ldotbelowmacron": "\u1E39", "/leafFlutteringInWind": "\u1F343", "/ledger": "\u1F4D2", "/left-pointingMagnifyingGlass": "\u1F50D", "/leftAngerBubble": "\u1F5EE", "/leftFiveEighthsBlock": "\u258B", "/leftHalfBlock": "\u258C", "/leftHandTelephoneReceiver": "\u1F57B", "/leftLuggage": "\u1F6C5", "/leftOneEighthBlock": "\u258F", "/leftOneQuarterBlock": "\u258E", "/leftSevenEighthsBlock": "\u2589", "/leftSpeechBubble": "\u1F5E8", "/leftThoughtBubble": "\u1F5EC", "/leftThreeEighthsBlock": "\u258D", "/leftThreeQuartersBlock": "\u258A", "/leftWritingHand": "\u1F58E", "/leftangleabovecmb": "\u031A", "/leftarrowoverrightarrow": "\u21C6", "/leftdnheavyrightuplight": "\u2545", "/leftharpoonoverrightharpoon": "\u21CB", "/leftheavyrightdnlight": "\u252D", "/leftheavyrightuplight": "\u2535", "/leftheavyrightvertlight": "\u253D", "/leftideographiccircled": "\u32A7", "/leftlightrightdnheavy": "\u2532", "/leftlightrightupheavy": "\u253A", "/leftlightrightvertheavy": "\u254A", "/lefttackbelowcmb": "\u0318", "/lefttorightembed": "\u202A", "/lefttorightisolate": "\u2066", "/lefttorightmark": "\u200E", "/lefttorightoverride": "\u202D", "/leftupheavyrightdnlight": "\u2543", "/lemon": "\u1F34B", "/lenis": "\u1FBF", "/lenisacute": "\u1FCE", "/lenisgrave": 
"\u1FCD", "/lenistilde": "\u1FCF", "/leo": "\u264C", "/leopard": "\u1F406", "/less": "\u003C", "/lessbutnotequal": "\u2268", "/lessbutnotequivalent": "\u22E6", "/lessdot": "\u22D6", "/lessequal": "\u2264", "/lessequalorgreater": "\u22DA", "/lessmonospace": "\uFF1C", "/lessorequivalent": "\u2272", "/lessorgreater": "\u2276", "/lessoverequal": "\u2266", "/lesssmall": "\uFE64", "/levelSlider": "\u1F39A", "/lezh": "\u026E", "/lfblock": "\u258C", "/lhacyr": "\u0515", "/lhookretroflex": "\u026D", "/libra": "\u264E", "/ligaturealeflamed:hb": "\uFB4F", "/ligatureoemod": "\uA7F9", "/lightCheckMark": "\u1F5F8", "/lightRail": "\u1F688", "/lightShade": "\u2591", "/lightarcdnleft": "\u256E", "/lightarcdnright": "\u256D", "/lightarcupleft": "\u256F", "/lightarcupright": "\u2570", "/lightdbldashhorz": "\u254C", "/lightdbldashvert": "\u254E", "/lightdiagcross": "\u2573", "/lightdiagupleftdnright": "\u2572", "/lightdiaguprightdnleft": "\u2571", "/lightdn": "\u2577", "/lightdnhorz": "\u252C", "/lightdnleft": "\u2510", "/lightdnright": "\u250C", "/lighthorz": "\u2500", "/lightleft": "\u2574", "/lightleftheavyright": "\u257C", "/lightning": "\u2607", "/lightningMood": "\u1F5F2", "/lightningMoodBubble": "\u1F5F1", "/lightquaddashhorz": "\u2508", "/lightquaddashvert": "\u250A", "/lightright": "\u2576", "/lighttrpldashhorz": "\u2504", "/lighttrpldashvert": "\u2506", "/lightup": "\u2575", "/lightupheavydn": "\u257D", "/lightuphorz": "\u2534", "/lightupleft": "\u2518", "/lightupright": "\u2514", "/lightvert": "\u2502", "/lightverthorz": "\u253C", "/lightvertleft": "\u2524", "/lightvertright": "\u251C", "/lineextensionhorizontal": "\u23AF", "/lineextensionvertical": "\u23D0", "/linemiddledotvertical": "\u237F", "/lineseparator": "\u2028", "/lingsapada": "\uA9C8", "/link": "\u1F517", "/linkedPaperclips": "\u1F587", "/lips": "\u1F5E2", "/lipstick": "\u1F484", "/lira": "\u20A4", "/litre": "\u2113", "/livretournois": "\u20B6", "/liwnarmenian": "\u056C", "/lj": "\u01C9", "/ljecyr": "\u0459", 
"/ljecyrillic": "\u0459", "/ljekomicyr": "\u0509", "/ll": "\uF6C0", "/lladeva": "\u0933", "/llagujarati": "\u0AB3", "/llinebelow": "\u1E3B", "/llladeva": "\u0934", "/llvocalicbengali": "\u09E1", "/llvocalicdeva": "\u0961", "/llvocalicvowelsignbengali": "\u09E3", "/llvocalicvowelsigndeva": "\u0963", "/llwelsh": "\u1EFB", "/lmacrondot": "\u1E39", "/lmfullwidth": "\u33D0", "/lmiddletilde": "\u026B", "/lmonospace": "\uFF4C", "/lmsquare": "\u33D0", "/lnfullwidth": "\u33D1", "/lochulathai": "\u0E2C", "/lock": "\u1F512", "/lockInkPen": "\u1F50F", "/logfullwidth": "\u33D2", "/logicaland": "\u2227", "/logicalandarray": "\u22C0", "/logicalnot": "\u00AC", "/logicalnotreversed": "\u2310", "/logicalor": "\u2228", "/logicalorarray": "\u22C1", "/lolingthai": "\u0E25", "/lollipop": "\u1F36D", "/longdivision": "\u27CC", "/longovershortmetrical": "\u23D2", "/longovertwoshortsmetrical": "\u23D4", "/longs": "\u017F", "/longs_t": "\uFB05", "/longsdot": "\u1E9B", "/longswithdiagonalstroke": "\u1E9C", "/longswithhighstroke": "\u1E9D", "/longtackleft": "\u27DE", "/longtackright": "\u27DD", "/losslesssquare": "\u1F1A9", "/loudlyCryingFace": "\u1F62D", "/loveHotel": "\u1F3E9", "/loveLetter": "\u1F48C", "/lowBrightness": "\u1F505", "/lowasterisk": "\u204E", "/lowerFiveEighthsBlock": "\u2585", "/lowerHalfBlock": "\u2584", "/lowerLeftBallpointPen": "\u1F58A", "/lowerLeftCrayon": "\u1F58D", "/lowerLeftFountainPen": "\u1F58B", "/lowerLeftPaintbrush": "\u1F58C", "/lowerLeftPencil": "\u1F589", "/lowerOneEighthBlock": "\u2581", "/lowerOneQuarterBlock": "\u2582", "/lowerRightShadowedWhiteCircle": "\u1F53E", "/lowerSevenEighthsBlock": "\u2587", "/lowerThreeEighthsBlock": "\u2583", "/lowerThreeQuartersBlock": "\u2586", "/lowercornerdotright": "\u27D3", "/lowerhalfcircle": "\u25E1", "/lowerhalfcircleinversewhite": "\u25DB", "/lowerquadrantcirculararcleft": "\u25DF", "/lowerquadrantcirculararcright": "\u25DE", "/lowertriangleleft": "\u25FA", "/lowertriangleleftblack": "\u25E3", "/lowertriangleright": 
"\u25FF", "/lowertrianglerightblack": "\u25E2", "/lowideographiccircled": "\u32A6", "/lowlinecenterline": "\uFE4E", "/lowlinecmb": "\u0332", "/lowlinedashed": "\uFE4D", "/lownumeralsign": "\u0375", "/lowquotedblprime": "\u301F", "/lozenge": "\u25CA", "/lozengedividedbyrulehorizontal": "\u27E0", "/lozengesquare": "\u2311", "/lparen": "\u24A7", "/lparenthesized": "\u24A7", "/lretroflex": "\u026D", "/ls": "\u02AA", "/lslash": "\u0142", "/lsquare": "\u2113", "/lstroke": "\uA749", "/lsuperior": "\uF6EE", "/lsupmod": "\u02E1", "/lt:Alpha": "\u2C6D", "/lt:Alphaturned": "\u2C70", "/lt:Beta": "\uA7B4", "/lt:Chi": "\uA7B3", "/lt:Gamma": "\u0194", "/lt:Iota": "\u0196", "/lt:Omega": "\uA7B6", "/lt:Upsilon": "\u01B1", "/lt:beta": "\uA7B5", "/lt:delta": "\u1E9F", "/lt:omega": "\uA7B7", "/ltshade": "\u2591", "/lttr:bet": "\u2136", "/lttr:dalet": "\u2138", "/lttr:gimel": "\u2137", "/lttr:gscript": "\u210A", "/lturned": "\uA781", "/ltypeopencircuit": "\u2390", "/luhurpada": "\uA9C5", "/lum": "\uA772", "/lungsipada": "\uA9C9", "/luthai": "\u0E26", "/lvocalicbengali": "\u098C", "/lvocalicdeva": "\u090C", "/lvocalicvowelsignbengali": "\u09E2", "/lvocalicvowelsigndeva": "\u0962", "/lxfullwidth": "\u33D3", "/lxsquare": "\u33D3", "/lzed": "\u02AB", "/m": "\u006D", "/m.inferior": "\u2098", "/m2fullwidth": "\u33A1", "/m3fullwidth": "\u33A5", "/mabengali": "\u09AE", "/macirclekatakana": "\u32EE", "/macron": "\u00AF", "/macronbelowcmb": "\u0331", "/macroncmb": "\u0304", "/macronlowmod": "\u02CD", "/macronmod": "\u02C9", "/macronmonospace": "\uFFE3", "/macute": "\u1E3F", "/madda": "\u0653", "/maddaabove": "\u06E4", "/madeva": "\u092E", "/madyapada": "\uA9C4", "/mafullwidth": "\u3383", "/magujarati": "\u0AAE", "/magurmukhi": "\u0A2E", "/mahapakhhebrew": "\u05A4", "/mahapakhlefthebrew": "\u05A4", "/mahhasquare": "\u3345", "/mahiragana": "\u307E", "/mahpach:hb": "\u05A4", "/maichattawalowleftthai": "\uF895", "/maichattawalowrightthai": "\uF894", "/maichattawathai": "\u0E4B", 
"/maichattawaupperleftthai": "\uF893", "/maieklowleftthai": "\uF88C", "/maieklowrightthai": "\uF88B", "/maiekthai": "\u0E48", "/maiekupperleftthai": "\uF88A", "/maihanakatleftthai": "\uF884", "/maihanakatthai": "\u0E31", "/maikurosquare": "\u3343", "/mairusquare": "\u3344", "/maitaikhuleftthai": "\uF889", "/maitaikhuthai": "\u0E47", "/maitholowleftthai": "\uF88F", "/maitholowrightthai": "\uF88E", "/maithothai": "\u0E49", "/maithoupperleftthai": "\uF88D", "/maitrilowleftthai": "\uF892", "/maitrilowrightthai": "\uF891", "/maitrithai": "\u0E4A", "/maitriupperleftthai": "\uF890", "/maiyamokthai": "\u0E46", "/makatakana": "\u30DE", "/makatakanahalfwidth": "\uFF8F", "/male": "\u2642", "/malefemale": "\u26A5", "/maleideographiccircled": "\u329A", "/malestroke": "\u26A6", "/malestrokemalefemale": "\u26A7", "/man": "\u1F468", "/manAndWomanHoldingHands": "\u1F46B", "/manDancing": "\u1F57A", "/manGuaPiMao": "\u1F472", "/manInBusinessSuitLevitating": "\u1F574", "/manTurban": "\u1F473", "/manat": "\u20BC", "/mansShoe": "\u1F45E", "/mansyonsquare": "\u3347", "/mantelpieceClock": "\u1F570", "/mapleLeaf": "\u1F341", "/maplighthouse": "\u26EF", "/maqaf:hb": "\u05BE", "/maqafhebrew": "\u05BE", "/marchtelegraph": "\u32C2", "/mark": "\u061C", "/markerdottedraisedinterpolation": "\u2E07", "/markerdottedtransposition": "\u2E08", "/markerraisedinterpolation": "\u2E06", "/marknoonghunna": "\u0658", "/marksChapter": "\u1F545", "/marriage": "\u26AD", "/mars": "\u2642", "/marukusquare": "\u3346", "/masoraCircle:hb": "\u05AF", "/masoracirclehebrew": "\u05AF", "/masquare": "\u3383", "/masumark": "\u303C", "/math:bowtie": "\u22C8", "/math:cuberoot": "\u221B", "/math:fourthroot": "\u221C", "/maximize": "\u1F5D6", "/maytelegraph": "\u32C4", "/mbfullwidth": "\u3386", "/mbopomofo": "\u3107", "/mbsmallfullwidth": "\u33D4", "/mbsquare": "\u33D4", "/mcircle": "\u24DC", "/mcubedsquare": "\u33A5", "/mdot": "\u1E41", "/mdotaccent": "\u1E41", "/mdotbelow": "\u1E43", "/measuredangle": "\u2221", 
"/measuredby": "\u225E", "/meatOnBone": "\u1F356", "/mecirclekatakana": "\u32F1", "/medicineideographiccircled": "\u32A9", "/mediumShade": "\u2592", "/mediumcircleblack": "\u26AB", "/mediumcirclewhite": "\u26AA", "/mediummathematicalspace": "\u205F", "/mediumsmallcirclewhite": "\u26AC", "/meem": "\u0645", "/meem.fina": "\uFEE2", "/meem.init": "\uFEE3", "/meem.init_alefmaksura.fina": "\uFC49", "/meem.init_hah.fina": "\uFC46", "/meem.init_hah.medi": "\uFCCF", "/meem.init_hah.medi_jeem.medi": "\uFD89", "/meem.init_hah.medi_meem.medi": "\uFD8A", "/meem.init_jeem.fina": "\uFC45", "/meem.init_jeem.medi": "\uFCCE", "/meem.init_jeem.medi_hah.medi": "\uFD8C", "/meem.init_jeem.medi_khah.medi": "\uFD92", "/meem.init_jeem.medi_meem.medi": "\uFD8D", "/meem.init_khah.fina": "\uFC47", "/meem.init_khah.medi": "\uFCD0", "/meem.init_khah.medi_jeem.medi": "\uFD8E", "/meem.init_khah.medi_meem.medi": "\uFD8F", "/meem.init_meem.fina": "\uFC48", "/meem.init_meem.medi": "\uFCD1", "/meem.init_yeh.fina": "\uFC4A", "/meem.isol": "\uFEE1", "/meem.medi": "\uFEE4", "/meem.medi_alef.fina": "\uFC88", "/meem.medi_hah.medi_yeh.fina": "\uFD8B", "/meem.medi_jeem.medi_yeh.fina": "\uFDC0", "/meem.medi_khah.medi_yeh.fina": "\uFDB9", "/meem.medi_meem.fina": "\uFC89", "/meem.medi_meem.medi_yeh.fina": "\uFDB1", "/meemDotAbove": "\u0765", "/meemDotBelow": "\u0766", "/meemabove": "\u06E2", "/meemabove.init": "\u06D8", "/meemarabic": "\u0645", "/meembelow": "\u06ED", "/meemfinalarabic": "\uFEE2", "/meeminitialarabic": "\uFEE3", "/meemmedialarabic": "\uFEE4", "/meemmeeminitialarabic": "\uFCD1", "/meemmeemisolatedarabic": "\uFC48", "/meetorusquare": "\u334D", "/megasquare": "\u334B", "/megatonsquare": "\u334C", "/mehiragana": "\u3081", "/meizierasquare": "\u337E", "/mekatakana": "\u30E1", "/mekatakanahalfwidth": "\uFF92", "/melon": "\u1F348", "/mem": "\u05DE", "/mem:hb": "\u05DE", "/memdagesh": "\uFB3E", "/memdageshhebrew": "\uFB3E", "/memhebrew": "\u05DE", "/memo": "\u1F4DD", "/memwithdagesh:hb": "\uFB3E", 
"/menarmenian": "\u0574", "/menorahNineBranches": "\u1F54E", "/menpostSindhi": "\u06FE", "/mens": "\u1F6B9", "/mepigraphicinverted": "\uA7FD", "/mercha:hb": "\u05A5", "/merchaKefulah:hb": "\u05A6", "/mercury": "\u263F", "/merkhahebrew": "\u05A5", "/merkhakefulahebrew": "\u05A6", "/merkhakefulalefthebrew": "\u05A6", "/merkhalefthebrew": "\u05A5", "/metalideographiccircled": "\u328E", "/metalideographicparen": "\u322E", "/meteg:hb": "\u05BD", "/metro": "\u1F687", "/mgfullwidth": "\u338E", "/mhook": "\u0271", "/mhzfullwidth": "\u3392", "/mhzsquare": "\u3392", "/micirclekatakana": "\u32EF", "/microphone": "\u1F3A4", "/microscope": "\u1F52C", "/middledotkatakanahalfwidth": "\uFF65", "/middot": "\u00B7", "/mieumacirclekorean": "\u3272", "/mieumaparenkorean": "\u3212", "/mieumcirclekorean": "\u3264", "/mieumkorean": "\u3141", "/mieumpansioskorean": "\u3170", "/mieumparenkorean": "\u3204", "/mieumpieupkorean": "\u316E", "/mieumsioskorean": "\u316F", "/mihiragana": "\u307F", "/mikatakana": "\u30DF", "/mikatakanahalfwidth": "\uFF90", "/mikuronsquare": "\u3348", "/milfullwidth": "\u33D5", "/militaryMedal": "\u1F396", "/milkyWay": "\u1F30C", "/mill": "\u20A5", "/millionscmbcyr": "\u0489", "/millisecond": "\u2034", "/millisecondreversed": "\u2037", "/minibus": "\u1F690", "/minidisc": "\u1F4BD", "/minimize": "\u1F5D5", "/minus": "\u2212", "/minus.inferior": "\u208B", "/minus.superior": "\u207B", "/minusbelowcmb": "\u0320", "/minuscircle": "\u2296", "/minusmod": "\u02D7", "/minusplus": "\u2213", "/minussignmod": "\u02D7", "/minustilde": "\u2242", "/minute": "\u2032", "/minutereversed": "\u2035", "/miribaarusquare": "\u334A", "/mirisquare": "\u3349", "/misc:baby": "\u1F476", "/misc:bell": "\u1F514", "/misc:dash": "\u1F4A8", "/misc:decimalseparator": "\u2396", "/misc:diamondblack": "\u2666", "/misc:diamondwhite": "\u2662", "/misc:ear": "\u1F442", "/misc:om": "\u1F549", "/misc:ring": "\u1F48D", "/misra": "\u060F", "/mlfullwidth": "\u3396", "/mlonglegturned": "\u0270", "/mlsquare": 
"\u3396", "/mlym:a": "\u0D05", "/mlym:aa": "\u0D06", "/mlym:aasign": "\u0D3E", "/mlym:ai": "\u0D10", "/mlym:aisign": "\u0D48", "/mlym:anusvarasign": "\u0D02", "/mlym:archaicii": "\u0D5F", "/mlym:au": "\u0D14", "/mlym:aulength": "\u0D57", "/mlym:ausign": "\u0D4C", "/mlym:avagrahasign": "\u0D3D", "/mlym:ba": "\u0D2C", "/mlym:bha": "\u0D2D", "/mlym:ca": "\u0D1A", "/mlym:candrabindusign": "\u0D01", "/mlym:cha": "\u0D1B", "/mlym:circularviramasign": "\u0D3C", "/mlym:combininganusvaraabovesign": "\u0D00", "/mlym:da": "\u0D26", "/mlym:date": "\u0D79", "/mlym:dda": "\u0D21", "/mlym:ddha": "\u0D22", "/mlym:dha": "\u0D27", "/mlym:dotreph": "\u0D4E", "/mlym:e": "\u0D0E", "/mlym:ee": "\u0D0F", "/mlym:eesign": "\u0D47", "/mlym:eight": "\u0D6E", "/mlym:esign": "\u0D46", "/mlym:five": "\u0D6B", "/mlym:four": "\u0D6A", "/mlym:ga": "\u0D17", "/mlym:gha": "\u0D18", "/mlym:ha": "\u0D39", "/mlym:i": "\u0D07", "/mlym:ii": "\u0D08", "/mlym:iisign": "\u0D40", "/mlym:isign": "\u0D3F", "/mlym:ja": "\u0D1C", "/mlym:jha": "\u0D1D", "/mlym:ka": "\u0D15", "/mlym:kchillu": "\u0D7F", "/mlym:kha": "\u0D16", "/mlym:la": "\u0D32", "/mlym:lchillu": "\u0D7D", "/mlym:lla": "\u0D33", "/mlym:llchillu": "\u0D7E", "/mlym:llla": "\u0D34", "/mlym:lllchillu": "\u0D56", "/mlym:llvocal": "\u0D61", "/mlym:llvocalsign": "\u0D63", "/mlym:lvocal": "\u0D0C", "/mlym:lvocalsign": "\u0D62", "/mlym:ma": "\u0D2E", "/mlym:mchillu": "\u0D54", "/mlym:na": "\u0D28", "/mlym:nchillu": "\u0D7B", "/mlym:nga": "\u0D19", "/mlym:nine": "\u0D6F", "/mlym:nna": "\u0D23", "/mlym:nnchillu": "\u0D7A", "/mlym:nnna": "\u0D29", "/mlym:nya": "\u0D1E", "/mlym:o": "\u0D12", "/mlym:one": "\u0D67", "/mlym:oneeighth": "\u0D77", "/mlym:onefifth": "\u0D5E", "/mlym:onefortieth": "\u0D59", "/mlym:onehalf": "\u0D74", "/mlym:onehundred": "\u0D71", "/mlym:oneone-hundred-and-sixtieth": "\u0D58", "/mlym:onequarter": "\u0D73", "/mlym:onesixteenth": "\u0D76", "/mlym:onetenth": "\u0D5C", "/mlym:onethousand": "\u0D72", "/mlym:onetwentieth": "\u0D5B", 
"/mlym:oo": "\u0D13", "/mlym:oosign": "\u0D4B", "/mlym:osign": "\u0D4A", "/mlym:pa": "\u0D2A", "/mlym:parasign": "\u0D4F", "/mlym:pha": "\u0D2B", "/mlym:ra": "\u0D30", "/mlym:rra": "\u0D31", "/mlym:rrchillu": "\u0D7C", "/mlym:rrvocal": "\u0D60", "/mlym:rrvocalsign": "\u0D44", "/mlym:rvocal": "\u0D0B", "/mlym:rvocalsign": "\u0D43", "/mlym:sa": "\u0D38", "/mlym:seven": "\u0D6D", "/mlym:sha": "\u0D36", "/mlym:six": "\u0D6C", "/mlym:ssa": "\u0D37", "/mlym:ta": "\u0D24", "/mlym:ten": "\u0D70", "/mlym:tha": "\u0D25", "/mlym:three": "\u0D69", "/mlym:threeeightieths": "\u0D5A", "/mlym:threequarters": "\u0D75", "/mlym:threesixteenths": "\u0D78", "/mlym:threetwentieths": "\u0D5D", "/mlym:tta": "\u0D1F", "/mlym:ttha": "\u0D20", "/mlym:ttta": "\u0D3A", "/mlym:two": "\u0D68", "/mlym:u": "\u0D09", "/mlym:usign": "\u0D41", "/mlym:uu": "\u0D0A", "/mlym:uusign": "\u0D42", "/mlym:va": "\u0D35", "/mlym:verticalbarviramasign": "\u0D3B", "/mlym:viramasign": "\u0D4D", "/mlym:visargasign": "\u0D03", "/mlym:ya": "\u0D2F", "/mlym:ychillu": "\u0D55", "/mlym:zero": "\u0D66", "/mm2fullwidth": "\u339F", "/mm3fullwidth": "\u33A3", "/mmcubedsquare": "\u33A3", "/mmfullwidth": "\u339C", "/mmonospace": "\uFF4D", "/mmsquaredsquare": "\u339F", "/mobilePhone": "\u1F4F1", "/mobilePhoneOff": "\u1F4F4", "/mobilePhoneRightwardsArrowAtLeft": "\u1F4F2", "/mocirclekatakana": "\u32F2", "/models": "\u22A7", "/mohiragana": "\u3082", "/mohmfullwidth": "\u33C1", "/mohmsquare": "\u33C1", "/mokatakana": "\u30E2", "/mokatakanahalfwidth": "\uFF93", "/molfullwidth": "\u33D6", "/molsquare": "\u33D6", "/momathai": "\u0E21", "/moneyBag": "\u1F4B0", "/moneyWings": "\u1F4B8", "/mong:a": "\u1820", "/mong:aaligali": "\u1887", "/mong:ahaligali": "\u1897", "/mong:ang": "\u1829", "/mong:angsibe": "\u1862", "/mong:angtodo": "\u184A", "/mong:anusvaraonealigali": "\u1880", "/mong:ba": "\u182A", "/mong:baludaaligali": "\u1885", "/mong:baludaaligalithree": "\u1886", "/mong:batodo": "\u184B", "/mong:bhamanchualigali": "\u18A8", 
"/mong:birga": "\u1800", "/mong:caaligali": "\u188B", "/mong:camanchualigali": "\u189C", "/mong:cha": "\u1834", "/mong:chasibe": "\u1871", "/mong:chatodo": "\u1852", "/mong:chi": "\u1842", "/mong:colon": "\u1804", "/mong:comma": "\u1802", "/mong:commamanchu": "\u1808", "/mong:cyamanchualigali": "\u18A3", "/mong:da": "\u1833", "/mong:daaligali": "\u1891", "/mong:dagalgaaligali": "\u18A9", "/mong:damarualigali": "\u1882", "/mong:dasibe": "\u1869", "/mong:datodo": "\u1851", "/mong:ddaaligali": "\u188E", "/mong:ddhamanchualigali": "\u189F", "/mong:dhamanchualigali": "\u18A1", "/mong:dzatodo": "\u185C", "/mong:e": "\u1821", "/mong:ee": "\u1827", "/mong:eight": "\u1818", "/mong:ellipsis": "\u1801", "/mong:esibe": "\u185D", "/mong:etodo": "\u1844", "/mong:fa": "\u1839", "/mong:famanchu": "\u1876", "/mong:fasibe": "\u186B", "/mong:five": "\u1815", "/mong:four": "\u1814", "/mong:fourdots": "\u1805", "/mong:freevariationselectorone": "\u180B", "/mong:freevariationselectorthree": "\u180D", "/mong:freevariationselectortwo": "\u180C", "/mong:ga": "\u182D", "/mong:gaasibe": "\u186C", "/mong:gaatodo": "\u1858", "/mong:gasibe": "\u1864", "/mong:gatodo": "\u184E", "/mong:ghamanchualigali": "\u189A", "/mong:haa": "\u183E", "/mong:haasibe": "\u186D", "/mong:haatodo": "\u1859", "/mong:hasibe": "\u1865", "/mong:i": "\u1822", "/mong:ialigali": "\u1888", "/mong:imanchu": "\u1873", "/mong:isibe": "\u185E", "/mong:itodo": "\u1845", "/mong:iysibe": "\u185F", "/mong:ja": "\u1835", "/mong:jasibe": "\u186A", "/mong:jatodo": "\u1853", "/mong:jhamanchualigali": "\u189D", "/mong:jiatodo": "\u185A", "/mong:ka": "\u183A", "/mong:kaaligali": "\u1889", "/mong:kamanchu": "\u1874", "/mong:kasibe": "\u1863", "/mong:katodo": "\u1857", "/mong:kha": "\u183B", "/mong:la": "\u182F", "/mong:lha": "\u1840", "/mong:lhamanchualigali": "\u18AA", "/mong:longvowelsigntodo": "\u1843", "/mong:ma": "\u182E", "/mong:matodo": "\u184F", "/mong:na": "\u1828", "/mong:ngaaligali": "\u188A", "/mong:ngamanchualigali": 
"\u189B", "/mong:niatodo": "\u185B", "/mong:nine": "\u1819", "/mong:nirugu": "\u180A", "/mong:nnaaligali": "\u188F", "/mong:o": "\u1823", "/mong:oe": "\u1825", "/mong:oetodo": "\u1848", "/mong:one": "\u1811", "/mong:otodo": "\u1846", "/mong:pa": "\u182B", "/mong:paaligali": "\u1892", "/mong:pasibe": "\u1866", "/mong:patodo": "\u184C", "/mong:period": "\u1803", "/mong:periodmanchu": "\u1809", "/mong:phaaligali": "\u1893", "/mong:qa": "\u182C", "/mong:qatodo": "\u184D", "/mong:ra": "\u1837", "/mong:raasibe": "\u1870", "/mong:ramanchu": "\u1875", "/mong:sa": "\u1830", "/mong:seven": "\u1817", "/mong:sha": "\u1831", "/mong:shasibe": "\u1867", "/mong:six": "\u1816", "/mong:softhyphentodo": "\u1806", "/mong:ssaaligali": "\u1894", "/mong:ssamanchualigali": "\u18A2", "/mong:syllableboundarymarkersibe": "\u1807", "/mong:ta": "\u1832", "/mong:taaligali": "\u1890", "/mong:tamanchualigali": "\u18A0", "/mong:tasibe": "\u1868", "/mong:tatodo": "\u1850", "/mong:tatodoaligali": "\u1898", "/mong:three": "\u1813", "/mong:tsa": "\u183C", "/mong:tsasibe": "\u186E", "/mong:tsatodo": "\u1854", "/mong:ttaaligali": "\u188C", "/mong:ttamanchualigali": "\u189E", "/mong:tthaaligali": "\u188D", "/mong:two": "\u1812", "/mong:u": "\u1824", "/mong:ualigalihalf": "\u18A6", "/mong:ubadamaaligali": "\u1883", "/mong:ubadamaaligaliinverted": "\u1884", "/mong:ue": "\u1826", "/mong:uesibe": "\u1860", "/mong:uetodo": "\u1849", "/mong:usibe": "\u1861", "/mong:utodo": "\u1847", "/mong:visargaonealigali": "\u1881", "/mong:vowelseparator": "\u180E", "/mong:wa": "\u1838", "/mong:watodo": "\u1856", "/mong:ya": "\u1836", "/mong:yaaligalihalf": "\u18A7", "/mong:yatodo": "\u1855", "/mong:za": "\u183D", "/mong:zaaligali": "\u1896", "/mong:zamanchualigali": "\u18A5", "/mong:zasibe": "\u186F", "/mong:zero": "\u1810", "/mong:zhaaligali": "\u1895", "/mong:zhamanchu": "\u1877", "/mong:zhamanchualigali": "\u18A4", "/mong:zhasibe": "\u1872", "/mong:zhatodoaligali": "\u1899", "/mong:zhi": "\u1841", "/mong:zra": "\u183F", 
"/monkey": "\u1F412", "/monkeyFace": "\u1F435", "/monogramyang": "\u268A", "/monogramyin": "\u268B", "/monorail": "\u1F69D", "/monostable": "\u238D", "/moodBubble": "\u1F5F0", "/moonViewingCeremony": "\u1F391", "/moonideographiccircled": "\u328A", "/moonideographicparen": "\u322A", "/moonlilithblack": "\u26B8", "/mosque": "\u1F54C", "/motorBoat": "\u1F6E5", "/motorScooter": "\u1F6F5", "/motorway": "\u1F6E3", "/mountFuji": "\u1F5FB", "/mountain": "\u26F0", "/mountainBicyclist": "\u1F6B5", "/mountainCableway": "\u1F6A0", "/mountainRailway": "\u1F69E", "/mouse": "\u1F401", "/mouseFace": "\u1F42D", "/mouth": "\u1F444", "/movers2fullwidth": "\u33A8", "/moversfullwidth": "\u33A7", "/moverssquare": "\u33A7", "/moverssquaredsquare": "\u33A8", "/movieCamera": "\u1F3A5", "/moyai": "\u1F5FF", "/mpafullwidth": "\u33AB", "/mparen": "\u24A8", "/mparenthesized": "\u24A8", "/mpasquare": "\u33AB", "/msfullwidth": "\u33B3", "/mssquare": "\u33B3", "/msuperior": "\uF6EF", "/mturned": "\u026F", "/mu": "\u00B5", "/mu.math": "\u00B5", "/mu1": "\u00B5", "/muafullwidth": "\u3382", "/muasquare": "\u3382", "/muchgreater": "\u226B", "/muchless": "\u226A", "/mucirclekatakana": "\u32F0", "/muffullwidth": "\u338C", "/mufsquare": "\u338C", "/mugfullwidth": "\u338D", "/mugreek": "\u03BC", "/mugsquare": "\u338D", "/muhiragana": "\u3080", "/mukatakana": "\u30E0", "/mukatakanahalfwidth": "\uFF91", "/mulfullwidth": "\u3395", "/mulsquare": "\u3395", "/multimap": "\u22B8", "/multimapleft": "\u27DC", "/multipleMusicalNotes": "\u1F3B6", "/multiply": "\u00D7", "/multiset": "\u228C", "/multisetmultiplication": "\u228D", "/multisetunion": "\u228E", "/mum": "\uA773", "/mumfullwidth": "\u339B", "/mumsquare": "\u339B", "/munach:hb": "\u05A3", "/munahhebrew": "\u05A3", "/munahlefthebrew": "\u05A3", "/musfullwidth": "\u33B2", "/mushroom": "\u1F344", "/musicalKeyboard": "\u1F3B9", "/musicalKeyboardJacks": "\u1F398", "/musicalNote": "\u1F3B5", "/musicalScore": "\u1F3BC", "/musicalnote": "\u266A", "/musicalnotedbl": 
"\u266B", "/musicflat": "\u266D", "/musicflatsign": "\u266D", "/musicnatural": "\u266E", "/musicsharp": "\u266F", "/musicsharpsign": "\u266F", "/mussquare": "\u33B2", "/muvfullwidth": "\u33B6", "/muvsquare": "\u33B6", "/muwfullwidth": "\u33BC", "/muwsquare": "\u33BC", "/mvfullwidth": "\u33B7", "/mvmegafullwidth": "\u33B9", "/mvmegasquare": "\u33B9", "/mvsquare": "\u33B7", "/mwfullwidth": "\u33BD", "/mwmegafullwidth": "\u33BF", "/mwmegasquare": "\u33BF", "/mwsquare": "\u33BD", "/n": "\u006E", "/n.inferior": "\u2099", "/n.superior": "\u207F", "/nabengali": "\u09A8", "/nabla": "\u2207", "/nacirclekatakana": "\u32E4", "/nacute": "\u0144", "/nadeva": "\u0928", "/nafullwidth": "\u3381", "/nagujarati": "\u0AA8", "/nagurmukhi": "\u0A28", "/nahiragana": "\u306A", "/nailPolish": "\u1F485", "/naira": "\u20A6", "/nakatakana": "\u30CA", "/nakatakanahalfwidth": "\uFF85", "/nameBadge": "\u1F4DB", "/nameideographiccircled": "\u3294", "/nameideographicparen": "\u3234", "/namurda": "\uA99F", "/nand": "\u22BC", "/nanosquare": "\u3328", "/napostrophe": "\u0149", "/narrownobreakspace": "\u202F", "/nasquare": "\u3381", "/nationalPark": "\u1F3DE", "/nationaldigitshapes": "\u206E", "/nbopomofo": "\u310B", "/nbspace": "\u00A0", "/ncaron": "\u0148", "/ncedilla": "\u0146", "/ncircle": "\u24DD", "/ncircumflexbelow": "\u1E4B", "/ncommaaccent": "\u0146", "/ncurl": "\u0235", "/ndescender": "\uA791", "/ndot": "\u1E45", "/ndotaccent": "\u1E45", "/ndotbelow": "\u1E47", "/necirclekatakana": "\u32E7", "/necktie": "\u1F454", "/negatedturnstiledblverticalbarright": "\u22AF", "/nehiragana": "\u306D", "/neirapproximatelynoractuallyequal": "\u2247", "/neirasersetnorequalup": "\u2289", "/neirasubsetnorequal": "\u2288", "/neirgreaternorequal": "\u2271", "/neirgreaternorequivalent": "\u2275", "/neirgreaternorless": "\u2279", "/neirlessnorequal": "\u2270", "/neirlessnorequivalent": "\u2274", "/neirlessnorgreater": "\u2278", "/nekatakana": "\u30CD", "/nekatakanahalfwidth": "\uFF88", "/neptune": "\u2646", 
"/neuter": "\u26B2", "/neutralFace": "\u1F610", "/newMoon": "\u1F311", "/newMoonFace": "\u1F31A", "/newsheqel": "\u20AA", "/newsheqelsign": "\u20AA", "/newspaper": "\u1F4F0", "/newsquare": "\u1F195", "/nextpage": "\u2398", "/nffullwidth": "\u338B", "/nfsquare": "\u338B", "/ng.fina": "\uFBD4", "/ng.init": "\uFBD5", "/ng.isol": "\uFBD3", "/ng.medi": "\uFBD6", "/ngabengali": "\u0999", "/ngadeva": "\u0919", "/ngagujarati": "\u0A99", "/ngagurmukhi": "\u0A19", "/ngalelet": "\uA98A", "/ngaleletraswadi": "\uA98B", "/ngoeh": "\u06B1", "/ngoeh.fina": "\uFB9B", "/ngoeh.init": "\uFB9C", "/ngoeh.isol": "\uFB9A", "/ngoeh.medi": "\uFB9D", "/ngonguthai": "\u0E07", "/ngrave": "\u01F9", "/ngsquare": "\u1F196", "/nhiragana": "\u3093", "/nhookleft": "\u0272", "/nhookretroflex": "\u0273", "/nicirclekatakana": "\u32E5", "/nieunacirclekorean": "\u326F", "/nieunaparenkorean": "\u320F", "/nieuncieuckorean": "\u3135", "/nieuncirclekorean": "\u3261", "/nieunhieuhkorean": "\u3136", "/nieunkorean": "\u3134", "/nieunpansioskorean": "\u3168", "/nieunparenkorean": "\u3201", "/nieunsioskorean": "\u3167", "/nieuntikeutkorean": "\u3166", "/nightStars": "\u1F303", "/nightideographiccircled": "\u32B0", "/nihiragana": "\u306B", "/nikatakana": "\u30CB", "/nikatakanahalfwidth": "\uFF86", "/nikhahitleftthai": "\uF899", "/nikhahitthai": "\u0E4D", "/nine": "\u0039", "/nine.inferior": "\u2089", "/nine.roman": "\u2168", "/nine.romansmall": "\u2178", "/nine.superior": "\u2079", "/ninearabic": "\u0669", "/ninebengali": "\u09EF", "/ninecircle": "\u2468", "/ninecircledbl": "\u24FD", "/ninecircleinversesansserif": "\u2792", "/ninecomma": "\u1F10A", "/ninedeva": "\u096F", "/ninefar": "\u06F9", "/ninegujarati": "\u0AEF", "/ninegurmukhi": "\u0A6F", "/ninehackarabic": "\u0669", "/ninehangzhou": "\u3029", "/nineideographiccircled": "\u3288", "/nineideographicparen": "\u3228", "/nineinferior": "\u2089", "/ninemonospace": "\uFF19", "/nineoldstyle": "\uF739", "/nineparen": "\u247C", "/nineparenthesized": "\u247C", 
"/nineperiod": "\u2490", "/ninepersian": "\u06F9", "/nineroman": "\u2178", "/ninesuperior": "\u2079", "/nineteencircle": "\u2472", "/nineteencircleblack": "\u24F3", "/nineteenparen": "\u2486", "/nineteenparenthesized": "\u2486", "/nineteenperiod": "\u249A", "/ninethai": "\u0E59", "/nj": "\u01CC", "/njecyr": "\u045A", "/njecyrillic": "\u045A", "/njekomicyr": "\u050B", "/nkatakana": "\u30F3", "/nkatakanahalfwidth": "\uFF9D", "/nlegrightlong": "\u019E", "/nlinebelow": "\u1E49", "/nlongrightleg": "\u019E", "/nmbr:oneeighth": "\u215B", "/nmbr:onefifth": "\u2155", "/nmbr:onetenth": "\u2152", "/nmfullwidth": "\u339A", "/nmonospace": "\uFF4E", "/nmsquare": "\u339A", "/nnabengali": "\u09A3", "/nnadeva": "\u0923", "/nnagujarati": "\u0AA3", "/nnagurmukhi": "\u0A23", "/nnnadeva": "\u0929", "/noBicycles": "\u1F6B3", "/noEntrySign": "\u1F6AB", "/noMobilePhones": "\u1F4F5", "/noOneUnderEighteen": "\u1F51E", "/noPedestrians": "\u1F6B7", "/noPiracy": "\u1F572", "/noSmoking": "\u1F6AD", "/nobliquestroke": "\uA7A5", "/nocirclekatakana": "\u32E8", "/nodeascending": "\u260A", "/nodedescending": "\u260B", "/noentry": "\u26D4", "/nohiragana": "\u306E", "/nokatakana": "\u30CE", "/nokatakanahalfwidth": "\uFF89", "/nominaldigitshapes": "\u206F", "/nonPotableWater": "\u1F6B1", "/nonbreakinghyphen": "\u2011", "/nonbreakingspace": "\u00A0", "/nonenthai": "\u0E13", "/nonuthai": "\u0E19", "/noon": "\u0646", "/noon.fina": "\uFEE6", "/noon.init": "\uFEE7", "/noon.init_alefmaksura.fina": "\uFC4F", "/noon.init_hah.fina": "\uFC4C", "/noon.init_hah.medi": "\uFCD3", "/noon.init_hah.medi_meem.medi": "\uFD95", "/noon.init_heh.medi": "\uFCD6", "/noon.init_jeem.fina": "\uFC4B", "/noon.init_jeem.medi": "\uFCD2", "/noon.init_jeem.medi_hah.medi": "\uFDB8", "/noon.init_jeem.medi_meem.medi": "\uFD98", "/noon.init_khah.fina": "\uFC4D", "/noon.init_khah.medi": "\uFCD4", "/noon.init_meem.fina": "\uFC4E", "/noon.init_meem.medi": "\uFCD5", "/noon.init_yeh.fina": "\uFC50", "/noon.isol": "\uFEE5", "/noon.medi": 
"\uFEE8", "/noon.medi_alefmaksura.fina": "\uFC8E", "/noon.medi_hah.medi_alefmaksura.fina": "\uFD96", "/noon.medi_hah.medi_yeh.fina": "\uFDB3", "/noon.medi_heh.medi": "\uFCEF", "/noon.medi_jeem.medi_alefmaksura.fina": "\uFD99", "/noon.medi_jeem.medi_hah.fina": "\uFDBD", "/noon.medi_jeem.medi_meem.fina": "\uFD97", "/noon.medi_jeem.medi_yeh.fina": "\uFDC7", "/noon.medi_meem.fina": "\uFC8C", "/noon.medi_meem.medi": "\uFCEE", "/noon.medi_meem.medi_alefmaksura.fina": "\uFD9B", "/noon.medi_meem.medi_yeh.fina": "\uFD9A", "/noon.medi_noon.fina": "\uFC8D", "/noon.medi_reh.fina": "\uFC8A", "/noon.medi_yeh.fina": "\uFC8F", "/noon.medi_zain.fina": "\uFC8B", "/noonSmallTah": "\u0768", "/noonSmallV": "\u0769", "/noonTwoDotsBelow": "\u0767", "/noonabove": "\u06E8", "/noonarabic": "\u0646", "/noondotbelow": "\u06B9", "/noonfinalarabic": "\uFEE6", "/noonghunna": "\u06BA", "/noonghunna.fina": "\uFB9F", "/noonghunna.isol": "\uFB9E", "/noonghunnaarabic": "\u06BA", "/noonghunnafinalarabic": "\uFB9F", "/noonhehinitialarabic": "\uFEE7", "/nooninitialarabic": "\uFEE7", "/noonjeeminitialarabic": "\uFCD2", "/noonjeemisolatedarabic": "\uFC4B", "/noonmedialarabic": "\uFEE8", "/noonmeeminitialarabic": "\uFCD5", "/noonmeemisolatedarabic": "\uFC4E", "/noonnoonfinalarabic": "\uFC8D", "/noonring": "\u06BC", "/noonthreedotsabove": "\u06BD", "/nor": "\u22BD", "/nordicmark": "\u20BB", "/normalfacrsemidirectproductleft": "\u22C9", "/normalfacrsemidirectproductright": "\u22CA", "/normalsubgroorequalup": "\u22B4", "/normalsubgroup": "\u22B2", "/northeastPointingAirplane": "\u1F6EA", "/nose": "\u1F443", "/notalmostequal": "\u2249", "/notasersetup": "\u2285", "/notasympticallyequal": "\u2244", "/notcheckmark": "\u237B", "/notchedLeftSemicircleThreeDots": "\u1F543", "/notchedRightSemicircleThreeDots": "\u1F544", "/notcontains": "\u220C", "/note": "\u1F5C8", "/notePad": "\u1F5CA", "/notePage": "\u1F5C9", "/notebook": "\u1F4D3", "/notebookDecorativeCover": "\u1F4D4", "/notelement": "\u2209", "/notelementof": 
"\u2209", "/notequal": "\u2260", "/notequivalent": "\u226D", "/notexistential": "\u2204", "/notgreater": "\u226F", "/notgreaternorequal": "\u2271", "/notgreaternorless": "\u2279", "/notidentical": "\u2262", "/notless": "\u226E", "/notlessnorequal": "\u2270", "/notnormalsubgroorequalup": "\u22EC", "/notnormalsubgroup": "\u22EA", "/notparallel": "\u2226", "/notprecedes": "\u2280", "/notsignturned": "\u2319", "/notsquareimageorequal": "\u22E2", "/notsquareoriginalorequal": "\u22E3", "/notsubset": "\u2284", "/notsucceeds": "\u2281", "/notsuperset": "\u2285", "/nottilde": "\u2241", "/nottosquare": "\u3329", "/nottrue": "\u22AD", "/novembertelegraph": "\u32CA", "/nowarmenian": "\u0576", "/nparen": "\u24A9", "/nparenthesized": "\u24A9", "/nretroflex": "\u0273", "/nsfullwidth": "\u33B1", "/nssquare": "\u33B1", "/nsuperior": "\u207F", "/ntilde": "\u00F1", "/nu": "\u03BD", "/nucirclekatakana": "\u32E6", "/nuhiragana": "\u306C", "/nukatakana": "\u30CC", "/nukatakanahalfwidth": "\uFF87", "/nuktabengali": "\u09BC", "/nuktadeva": "\u093C", "/nuktagujarati": "\u0ABC", "/nuktagurmukhi": "\u0A3C", "/num": "\uA774", "/numbermarkabove": "\u0605", "/numbersign": "\u0023", "/numbersignmonospace": "\uFF03", "/numbersignsmall": "\uFE5F", "/numeralsign": "\u0374", "/numeralsigngreek": "\u0374", "/numeralsignlowergreek": "\u0375", "/numero": "\u2116", "/nun": "\u05E0", "/nun:hb": "\u05E0", "/nunHafukha:hb": "\u05C6", "/nundagesh": "\uFB40", "/nundageshhebrew": "\uFB40", "/nunhebrew": "\u05E0", "/nunwithdagesh:hb": "\uFB40", "/nutAndBolt": "\u1F529", "/nvfullwidth": "\u33B5", "/nvsquare": "\u33B5", "/nwfullwidth": "\u33BB", "/nwsquare": "\u33BB", "/nyabengali": "\u099E", "/nyadeva": "\u091E", "/nyagujarati": "\u0A9E", "/nyagurmukhi": "\u0A1E", "/nyamurda": "\uA998", "/nyeh": "\u0683", "/nyeh.fina": "\uFB77", "/nyeh.init": "\uFB78", "/nyeh.isol": "\uFB76", "/nyeh.medi": "\uFB79", "/o": "\u006F", "/o.inferior": "\u2092", "/oacute": "\u00F3", "/oangthai": "\u0E2D", "/obarcyr": "\u04E9", 
"/obardieresiscyr": "\u04EB", "/obarred": "\u0275", "/obarredcyrillic": "\u04E9", "/obarreddieresiscyrillic": "\u04EB", "/obelosdotted": "\u2E13", "/obengali": "\u0993", "/obopomofo": "\u311B", "/obreve": "\u014F", "/observereye": "\u23FF", "/ocandradeva": "\u0911", "/ocandragujarati": "\u0A91", "/ocandravowelsigndeva": "\u0949", "/ocandravowelsigngujarati": "\u0AC9", "/ocaron": "\u01D2", "/ocircle": "\u24DE", "/ocirclekatakana": "\u32D4", "/ocircumflex": "\u00F4", "/ocircumflexacute": "\u1ED1", "/ocircumflexdotbelow": "\u1ED9", "/ocircumflexgrave": "\u1ED3", "/ocircumflexhoi": "\u1ED5", "/ocircumflexhookabove": "\u1ED5", "/ocircumflextilde": "\u1ED7", "/ocr:bowtie": "\u2445", "/ocr:dash": "\u2448", "/octagonalSign": "\u1F6D1", "/octobertelegraph": "\u32C9", "/octopus": "\u1F419", "/ocyr": "\u043E", "/ocyrillic": "\u043E", "/odblacute": "\u0151", "/odblgrave": "\u020D", "/oden": "\u1F362", "/odeva": "\u0913", "/odieresis": "\u00F6", "/odieresiscyr": "\u04E7", "/odieresiscyrillic": "\u04E7", "/odieresismacron": "\u022B", "/odot": "\u022F", "/odotbelow": "\u1ECD", "/odotmacron": "\u0231", "/oe": "\u0153", "/oe.fina": "\uFBDA", "/oe.isol": "\uFBD9", "/oekirghiz": "\u06C5", "/oekirghiz.fina": "\uFBE1", "/oekirghiz.isol": "\uFBE0", "/oekorean": "\u315A", "/officeBuilding": "\u1F3E2", "/ogonek": "\u02DB", "/ogonekcmb": "\u0328", "/ograve": "\u00F2", "/ogravedbl": "\u020D", "/ogujarati": "\u0A93", "/oharmenian": "\u0585", "/ohiragana": "\u304A", "/ohm": "\u2126", "/ohminverted": "\u2127", "/ohoi": "\u1ECF", "/ohookabove": "\u1ECF", "/ohorn": "\u01A1", "/ohornacute": "\u1EDB", "/ohorndotbelow": "\u1EE3", "/ohorngrave": "\u1EDD", "/ohornhoi": "\u1EDF", "/ohornhookabove": "\u1EDF", "/ohorntilde": "\u1EE1", "/ohungarumlaut": "\u0151", "/ohuparen": "\u321E", "/oi": "\u01A3", "/oilDrum": "\u1F6E2", "/oinvertedbreve": "\u020F", "/ojeonparen": "\u321D", "/okHandSign": "\u1F44C", "/okatakana": "\u30AA", "/okatakanahalfwidth": "\uFF75", "/okorean": "\u3157", "/oksquare": "\u1F197", 
"/oldKey": "\u1F5DD", "/oldPersonalComputer": "\u1F5B3", "/olderMan": "\u1F474", "/olderWoman": "\u1F475", "/ole:hb": "\u05AB", "/olehebrew": "\u05AB", "/oloop": "\uA74D", "/olowringinside": "\u2C7A", "/omacron": "\u014D", "/omacronacute": "\u1E53", "/omacrongrave": "\u1E51", "/omdeva": "\u0950", "/omega": "\u03C9", "/omega1": "\u03D6", "/omegaacute": "\u1F7D", "/omegaacuteiotasub": "\u1FF4", "/omegaasper": "\u1F61", "/omegaasperacute": "\u1F65", "/omegaasperacuteiotasub": "\u1FA5", "/omegaaspergrave": "\u1F63", "/omegaaspergraveiotasub": "\u1FA3", "/omegaasperiotasub": "\u1FA1", "/omegaaspertilde": "\u1F67", "/omegaaspertildeiotasub": "\u1FA7", "/omegaclosed": "\u0277", "/omegacyr": "\u0461", "/omegacyrillic": "\u0461", "/omegafunc": "\u2375", "/omegagrave": "\u1F7C", "/omegagraveiotasub": "\u1FF2", "/omegaiotasub": "\u1FF3", "/omegalatinclosed": "\u0277", "/omegalenis": "\u1F60", "/omegalenisacute": "\u1F64", "/omegalenisacuteiotasub": "\u1FA4", "/omegalenisgrave": "\u1F62", "/omegalenisgraveiotasub": "\u1FA2", "/omegalenisiotasub": "\u1FA0", "/omegalenistilde": "\u1F66", "/omegalenistildeiotasub": "\u1FA6", "/omegaroundcyr": "\u047B", "/omegaroundcyrillic": "\u047B", "/omegatilde": "\u1FF6", "/omegatildeiotasub": "\u1FF7", "/omegatitlocyr": "\u047D", "/omegatitlocyrillic": "\u047D", "/omegatonos": "\u03CE", "/omegaunderlinefunc": "\u2379", "/omgujarati": "\u0AD0", "/omicron": "\u03BF", "/omicronacute": "\u1F79", "/omicronasper": "\u1F41", "/omicronasperacute": "\u1F45", "/omicronaspergrave": "\u1F43", "/omicrongrave": "\u1F78", "/omicronlenis": "\u1F40", "/omicronlenisacute": "\u1F44", "/omicronlenisgrave": "\u1F42", "/omicrontonos": "\u03CC", "/omonospace": "\uFF4F", "/onExclamationMarkLeftRightArrowAbove": "\u1F51B", "/oncomingAutomobile": "\u1F698", "/oncomingBus": "\u1F68D", "/oncomingFireEngine": "\u1F6F1", "/oncomingPoliceCar": "\u1F694", "/oncomingTaxi": "\u1F696", "/one": "\u0031", "/one.inferior": "\u2081", "/one.roman": "\u2160", "/one.romansmall": 
"\u2170", "/oneButtonMouse": "\u1F5AF", "/onearabic": "\u0661", "/onebengali": "\u09E7", "/onecircle": "\u2460", "/onecircledbl": "\u24F5", "/onecircleinversesansserif": "\u278A", "/onecomma": "\u1F102", "/onedeva": "\u0967", "/onedotenleader": "\u2024", "/onedotovertwodots": "\u2E2B", "/oneeighth": "\u215B", "/onefar": "\u06F1", "/onefitted": "\uF6DC", "/onefraction": "\u215F", "/onegujarati": "\u0AE7", "/onegurmukhi": "\u0A67", "/onehackarabic": "\u0661", "/onehalf": "\u00BD", "/onehangzhou": "\u3021", "/onehundred.roman": "\u216D", "/onehundred.romansmall": "\u217D", "/onehundredthousand.roman": "\u2188", "/onehundredtwentypsquare": "\u1F1A4", "/oneideographiccircled": "\u3280", "/oneideographicparen": "\u3220", "/oneinferior": "\u2081", "/onemonospace": "\uFF11", "/oneninth": "\u2151", "/onenumeratorbengali": "\u09F4", "/oneoldstyle": "\uF731", "/oneparen": "\u2474", "/oneparenthesized": "\u2474", "/oneperiod": "\u2488", "/onepersian": "\u06F1", "/onequarter": "\u00BC", "/oneroman": "\u2170", "/oneseventh": "\u2150", "/onesixth": "\u2159", "/onesuperior": "\u00B9", "/onethai": "\u0E51", "/onethird": "\u2153", "/onethousand.roman": "\u216F", "/onethousand.romansmall": "\u217F", "/onethousandcd.roman": "\u2180", "/onsusquare": "\u3309", "/oo": "\uA74F", "/oogonek": "\u01EB", "/oogonekmacron": "\u01ED", "/oogurmukhi": "\u0A13", "/oomatragurmukhi": "\u0A4B", "/oomusquare": "\u330A", "/oopen": "\u0254", "/oparen": "\u24AA", "/oparenthesized": "\u24AA", "/openBook": "\u1F4D6", "/openFileFolder": "\u1F4C2", "/openFolder": "\u1F5C1", "/openHandsSign": "\u1F450", "/openLock": "\u1F513", "/openMailboxLoweredFlag": "\u1F4ED", "/openMailboxRaisedFlag": "\u1F4EC", "/openbullet": "\u25E6", "/openheadarrowleft": "\u21FD", "/openheadarrowleftright": "\u21FF", "/openheadarrowright": "\u21FE", "/opensubset": "\u27C3", "/opensuperset": "\u27C4", "/ophiuchus": "\u26CE", "/opposition": "\u260D", "/opticalDisc": "\u1F4BF", "/opticalDiscIcon": "\u1F5B8", "/option": "\u2325", 
"/orangeBook": "\u1F4D9", "/ordfeminine": "\u00AA", "/ordmasculine": "\u00BA", "/ordotinside": "\u27C7", "/original": "\u22B6", "/ornateleftparenthesis": "\uFD3E", "/ornaterightparenthesis": "\uFD3F", "/orthodoxcross": "\u2626", "/orthogonal": "\u221F", "/orya:a": "\u0B05", "/orya:aa": "\u0B06", "/orya:aasign": "\u0B3E", "/orya:ai": "\u0B10", "/orya:ailengthmark": "\u0B56", "/orya:aisign": "\u0B48", "/orya:anusvara": "\u0B02", "/orya:au": "\u0B14", "/orya:aulengthmark": "\u0B57", "/orya:ausign": "\u0B4C", "/orya:avagraha": "\u0B3D", "/orya:ba": "\u0B2C", "/orya:bha": "\u0B2D", "/orya:ca": "\u0B1A", "/orya:candrabindu": "\u0B01", "/orya:cha": "\u0B1B", "/orya:da": "\u0B26", "/orya:dda": "\u0B21", "/orya:ddha": "\u0B22", "/orya:dha": "\u0B27", "/orya:e": "\u0B0F", "/orya:eight": "\u0B6E", "/orya:esign": "\u0B47", "/orya:five": "\u0B6B", "/orya:four": "\u0B6A", "/orya:fractiononeeighth": "\u0B76", "/orya:fractiononehalf": "\u0B73", "/orya:fractiononequarter": "\u0B72", "/orya:fractiononesixteenth": "\u0B75", "/orya:fractionthreequarters": "\u0B74", "/orya:fractionthreesixteenths": "\u0B77", "/orya:ga": "\u0B17", "/orya:gha": "\u0B18", "/orya:ha": "\u0B39", "/orya:i": "\u0B07", "/orya:ii": "\u0B08", "/orya:iisign": "\u0B40", "/orya:isign": "\u0B3F", "/orya:isshar": "\u0B70", "/orya:ja": "\u0B1C", "/orya:jha": "\u0B1D", "/orya:ka": "\u0B15", "/orya:kha": "\u0B16", "/orya:la": "\u0B32", "/orya:lla": "\u0B33", "/orya:llvocal": "\u0B61", "/orya:llvocalsign": "\u0B63", "/orya:lvocal": "\u0B0C", "/orya:lvocalsign": "\u0B62", "/orya:ma": "\u0B2E", "/orya:na": "\u0B28", "/orya:nga": "\u0B19", "/orya:nine": "\u0B6F", "/orya:nna": "\u0B23", "/orya:nukta": "\u0B3C", "/orya:nya": "\u0B1E", "/orya:o": "\u0B13", "/orya:one": "\u0B67", "/orya:osign": "\u0B4B", "/orya:pa": "\u0B2A", "/orya:pha": "\u0B2B", "/orya:ra": "\u0B30", "/orya:rha": "\u0B5D", "/orya:rra": "\u0B5C", "/orya:rrvocal": "\u0B60", "/orya:rrvocalsign": "\u0B44", "/orya:rvocal": "\u0B0B", "/orya:rvocalsign": "\u0B43", 
"/orya:sa": "\u0B38", "/orya:seven": "\u0B6D", "/orya:sha": "\u0B36", "/orya:six": "\u0B6C", "/orya:ssa": "\u0B37", "/orya:ta": "\u0B24", "/orya:tha": "\u0B25", "/orya:three": "\u0B69", "/orya:tta": "\u0B1F", "/orya:ttha": "\u0B20", "/orya:two": "\u0B68", "/orya:u": "\u0B09", "/orya:usign": "\u0B41", "/orya:uu": "\u0B0A", "/orya:uusign": "\u0B42", "/orya:va": "\u0B35", "/orya:virama": "\u0B4D", "/orya:visarga": "\u0B03", "/orya:wa": "\u0B71", "/orya:ya": "\u0B2F", "/orya:yya": "\u0B5F", "/orya:zero": "\u0B66", "/oscript": "\u2134", "/oshortdeva": "\u0912", "/oshortvowelsigndeva": "\u094A", "/oslash": "\u00F8", "/oslashacute": "\u01FF", "/osmallhiragana": "\u3049", "/osmallkatakana": "\u30A9", "/osmallkatakanahalfwidth": "\uFF6B", "/ostroke": "\uA74B", "/ostrokeacute": "\u01FF", "/osuperior": "\uF6F0", "/otcyr": "\u047F", "/otcyrillic": "\u047F", "/otilde": "\u00F5", "/otildeacute": "\u1E4D", "/otildedieresis": "\u1E4F", "/otildemacron": "\u022D", "/ou": "\u0223", "/oubopomofo": "\u3121", "/ounce": "\u2125", "/outboxTray": "\u1F4E4", "/outerjoinfull": "\u27D7", "/outerjoinleft": "\u27D5", "/outerjoinright": "\u27D6", "/outputpassiveup": "\u2392", "/overlap": "\u1F5D7", "/overline": "\u203E", "/overlinecenterline": "\uFE4A", "/overlinecmb": "\u0305", "/overlinedashed": "\uFE49", "/overlinedblwavy": "\uFE4C", "/overlinewavy": "\uFE4B", "/overscore": "\u00AF", "/ovfullwidth": "\u3375", "/ovowelsignbengali": "\u09CB", "/ovowelsigndeva": "\u094B", "/ovowelsigngujarati": "\u0ACB", "/ox": "\u1F402", "/p": "\u0070", "/p.inferior": "\u209A", "/paampsfullwidth": "\u3380", "/paampssquare": "\u3380", "/paasentosquare": "\u332B", "/paatusquare": "\u332C", "/pabengali": "\u09AA", "/pacerek": "\uA989", "/package": "\u1F4E6", "/pacute": "\u1E55", "/padeva": "\u092A", "/pafullwidth": "\u33A9", "/page": "\u1F5CF", "/pageCircledText": "\u1F5DF", "/pageCurl": "\u1F4C3", "/pageFacingUp": "\u1F4C4", "/pagedown": "\u21DF", "/pager": "\u1F4DF", "/pages": "\u1F5D0", "/pageup": "\u21DE", 
"/pagoda": "\u1F6D4", "/pagujarati": "\u0AAA", "/pagurmukhi": "\u0A2A", "/pahiragana": "\u3071", "/paiyannoithai": "\u0E2F", "/pakatakana": "\u30D1", "/palatalizationcyrilliccmb": "\u0484", "/palatcmbcyr": "\u0484", "/pallas": "\u26B4", "/palmTree": "\u1F334", "/palmbranch": "\u2E19", "/palochkacyr": "\u04CF", "/palochkacyrillic": "\u04C0", "/pamurda": "\uA9A6", "/pandaFace": "\u1F43C", "/pangkatpada": "\uA9C7", "/pangkon": "\uA9C0", "/pangrangkep": "\uA9CF", "/pansioskorean": "\u317F", "/panyangga": "\uA980", "/paperclip": "\u1F4CE", "/paragraph": "\u00B6", "/paragraphos": "\u2E0F", "/paragraphosforked": "\u2E10", "/paragraphosforkedreversed": "\u2E11", "/paragraphseparator": "\u2029", "/parallel": "\u2225", "/parallelogramblack": "\u25B0", "/parallelogramwhite": "\u25B1", "/parenbottom": "\u23DD", "/parendblleft": "\u2E28", "/parendblright": "\u2E29", "/parenextensionleft": "\u239C", "/parenextensionright": "\u239F", "/parenflatleft": "\u27EE", "/parenflatright": "\u27EF", "/parenhookupleft": "\u239B", "/parenhookupright": "\u239E", "/parenleft": "\u0028", "/parenleft.inferior": "\u208D", "/parenleft.superior": "\u207D", "/parenleftaltonearabic": "\uFD3E", "/parenleftbt": "\uF8ED", "/parenleftex": "\uF8EC", "/parenleftinferior": "\u208D", "/parenleftmonospace": "\uFF08", "/parenleftsmall": "\uFE59", "/parenleftsuperior": "\u207D", "/parenlefttp": "\uF8EB", "/parenleftvertical": "\uFE35", "/parenlowerhookleft": "\u239D", "/parenlowerhookright": "\u23A0", "/parenright": "\u0029", "/parenright.inferior": "\u208E", "/parenright.superior": "\u207E", "/parenrightaltonearabic": "\uFD3F", "/parenrightbt": "\uF8F8", "/parenrightex": "\uF8F7", "/parenrightinferior": "\u208E", "/parenrightmonospace": "\uFF09", "/parenrightsmall": "\uFE5A", "/parenrightsuperior": "\u207E", "/parenrighttp": "\uF8F6", "/parenrightvertical": "\uFE36", "/parentop": "\u23DC", "/partalternationmark": "\u303D", "/partialdiff": "\u2202", "/partnership": "\u3250", "/partyPopper": "\u1F389", 
"/paseq:hb": "\u05C0", "/paseqhebrew": "\u05C0", "/pashta:hb": "\u0599", "/pashtahebrew": "\u0599", "/pasquare": "\u33A9", "/passengerShip": "\u1F6F3", "/passivedown": "\u2391", "/passportControl": "\u1F6C2", "/patah": "\u05B7", "/patah11": "\u05B7", "/patah1d": "\u05B7", "/patah2a": "\u05B7", "/patah:hb": "\u05B7", "/patahhebrew": "\u05B7", "/patahnarrowhebrew": "\u05B7", "/patahquarterhebrew": "\u05B7", "/patahwidehebrew": "\u05B7", "/pawPrints": "\u1F43E", "/pawnblack": "\u265F", "/pawnwhite": "\u2659", "/pazer:hb": "\u05A1", "/pazerhebrew": "\u05A1", "/pbopomofo": "\u3106", "/pcfullwidth": "\u3376", "/pcircle": "\u24DF", "/pdot": "\u1E57", "/pdotaccent": "\u1E57", "/pe": "\u05E4", "/pe:hb": "\u05E4", "/peace": "\u262E", "/peach": "\u1F351", "/pear": "\u1F350", "/pecyr": "\u043F", "/pecyrillic": "\u043F", "/pedagesh": "\uFB44", "/pedageshhebrew": "\uFB44", "/pedestrian": "\u1F6B6", "/peezisquare": "\u333B", "/pefinaldageshhebrew": "\uFB43", "/peh.fina": "\uFB57", "/peh.init": "\uFB58", "/peh.isol": "\uFB56", "/peh.medi": "\uFB59", "/peharabic": "\u067E", "/peharmenian": "\u057A", "/pehebrew": "\u05E4", "/peheh": "\u06A6", "/peheh.fina": "\uFB6F", "/peheh.init": "\uFB70", "/peheh.isol": "\uFB6E", "/peheh.medi": "\uFB71", "/pehfinalarabic": "\uFB57", "/pehinitialarabic": "\uFB58", "/pehiragana": "\u307A", "/pehmedialarabic": "\uFB59", "/pehookcyr": "\u04A7", "/pekatakana": "\u30DA", "/pemiddlehookcyrillic": "\u04A7", "/penOverStampedEnvelope": "\u1F586", "/pengkalconsonant": "\uA9BE", "/penguin": "\u1F427", "/penihisquare": "\u3338", "/pensiveFace": "\u1F614", "/pensusquare": "\u333A", "/pentagram": "\u26E4", "/pentasememetrical": "\u23D9", "/pepetvowel": "\uA9BC", "/per": "\u214C", "/perafehebrew": "\uFB4E", "/percent": "\u0025", "/percentarabic": "\u066A", "/percentmonospace": "\uFF05", "/percentsmall": "\uFE6A", "/percussivebidental": "\u02AD", "/percussivebilabial": "\u02AC", "/performingArts": "\u1F3AD", "/period": "\u002E", "/periodarmenian": "\u0589", 
"/periodcentered": "\u00B7", "/periodhalfwidth": "\uFF61", "/periodinferior": "\uF6E7", "/periodmonospace": "\uFF0E", "/periodsmall": "\uFE52", "/periodsuperior": "\uF6E8", "/periodurdu": "\u06D4", "/perispomenigreekcmb": "\u0342", "/permanentpaper": "\u267E", "/permille": "\u0609", "/perpendicular": "\u22A5", "/perseveringFace": "\u1F623", "/personBlondHair": "\u1F471", "/personBowingDeeply": "\u1F647", "/personFrowning": "\u1F64D", "/personRaisingBothHandsInCelebration": "\u1F64C", "/personWithFoldedHands": "\u1F64F", "/personWithPoutingFace": "\u1F64E", "/personalComputer": "\u1F4BB", "/personball": "\u26F9", "/perspective": "\u2306", "/pertenthousandsign": "\u2031", "/perthousand": "\u2030", "/peseta": "\u20A7", "/peso": "\u20B1", "/pesosquare": "\u3337", "/petailcyr": "\u0525", "/pewithdagesh:hb": "\uFB44", "/pewithrafe:hb": "\uFB4E", "/pffullwidth": "\u338A", "/pflourish": "\uA753", "/pfsquare": "\u338A", "/phabengali": "\u09AB", "/phadeva": "\u092B", "/phagujarati": "\u0AAB", "/phagurmukhi": "\u0A2B", "/pharyngealvoicedfricative": "\u0295", "/phfullwidth": "\u33D7", "/phi": "\u03C6", "/phi.math": "\u03D5", "/phi1": "\u03D5", "/phieuphacirclekorean": "\u327A", "/phieuphaparenkorean": "\u321A", "/phieuphcirclekorean": "\u326C", "/phieuphkorean": "\u314D", "/phieuphparenkorean": "\u320C", "/philatin": "\u0278", "/phinthuthai": "\u0E3A", "/phisymbolgreek": "\u03D5", "/phitailless": "\u2C77", "/phon:AEsmall": "\u1D01", "/phon:Aemod": "\u1D2D", "/phon:Amod": "\u1D2C", "/phon:Asmall": "\u1D00", "/phon:Bbarmod": "\u1D2F", "/phon:Bbarsmall": "\u1D03", "/phon:Bmod": "\u1D2E", "/phon:Csmall": "\u1D04", "/phon:Dmod": "\u1D30", "/phon:Dsmall": "\u1D05", "/phon:ENcyrmod": "\u1D78", "/phon:Elsmallcyr": "\u1D2B", "/phon:Emod": "\u1D31", "/phon:Ereversedmod": "\u1D32", "/phon:Esmall": "\u1D07", "/phon:Ethsmall": "\u1D06", "/phon:Ezhsmall": "\u1D23", "/phon:Gmod": "\u1D33", "/phon:Hmod": "\u1D34", "/phon:Imod": "\u1D35", "/phon:Ismallmod": "\u1DA6", "/phon:Ismallstroke": 
"\u1D7B", "/phon:Istrokesmallmod": "\u1DA7", "/phon:Jmod": "\u1D36", "/phon:Jsmall": "\u1D0A", "/phon:Kmod": "\u1D37", "/phon:Ksmall": "\u1D0B", "/phon:Lmod": "\u1D38", "/phon:Lsmallmod": "\u1DAB", "/phon:Lsmallstroke": "\u1D0C", "/phon:Mmod": "\u1D39", "/phon:Msmall": "\u1D0D", "/phon:Nmod": "\u1D3A", "/phon:Nreversedmod": "\u1D3B", "/phon:Nsmallmod": "\u1DB0", "/phon:Nsmallreversed": "\u1D0E", "/phon:OUsmall": "\u1D15", "/phon:Omod": "\u1D3C", "/phon:Oopensmall": "\u1D10", "/phon:Osmall": "\u1D0F", "/phon:Oumod": "\u1D3D", "/phon:Pmod": "\u1D3E", "/phon:Psmall": "\u1D18", "/phon:Rmod": "\u1D3F", "/phon:Rsmallreversed": "\u1D19", "/phon:Rsmallturned": "\u1D1A", "/phon:Tmod": "\u1D40", "/phon:Tsmall": "\u1D1B", "/phon:Umod": "\u1D41", "/phon:Usmall": "\u1D1C", "/phon:Usmallmod": "\u1DB8", "/phon:Usmallstroke": "\u1D7E", "/phon:Vsmall": "\u1D20", "/phon:Wmod": "\u1D42", "/phon:Wsmall": "\u1D21", "/phon:Zsmall": "\u1D22", "/phon:aeturned": "\u1D02", "/phon:aeturnedmod": "\u1D46", "/phon:ain": "\u1D25", "/phon:ainmod": "\u1D5C", "/phon:alphamod": "\u1D45", "/phon:alpharetroflexhook": "\u1D90", "/phon:alphaturnedmod": "\u1D9B", "/phon:amod": "\u1D43", "/phon:aretroflexhook": "\u1D8F", "/phon:aturnedmod": "\u1D44", "/phon:betamod": "\u1D5D", "/phon:bmiddletilde": "\u1D6C", "/phon:bmod": "\u1D47", "/phon:bpalatalhook": "\u1D80", "/phon:ccurlmod": "\u1D9D", "/phon:chimod": "\u1D61", "/phon:cmod": "\u1D9C", "/phon:deltamod": "\u1D5F", "/phon:dhooktail": "\u1D91", "/phon:dmiddletilde": "\u1D6D", "/phon:dmod": "\u1D48", "/phon:dotlessjstrokemod": "\u1DA1", "/phon:dpalatalhook": "\u1D81", "/phon:emod": "\u1D49", "/phon:engmod": "\u1D51", "/phon:eopenmod": "\u1D4B", "/phon:eopenretroflexhook": "\u1D93", "/phon:eopenreversedmod": "\u1D9F", "/phon:eopenreversedretroflexhook": "\u1D94", "/phon:eopenturned": "\u1D08", "/phon:eopenturnedmod": "\u1D4C", "/phon:eretroflexhook": "\u1D92", "/phon:eshmod": "\u1DB4", "/phon:eshpalatalhook": "\u1D8B", "/phon:eshretroflexhook": "\u1D98", 
"/phon:ethmod": "\u1D9E", "/phon:ezhmod": "\u1DBE", "/phon:ezhretroflexhook": "\u1D9A", "/phon:fmiddletilde": "\u1D6E", "/phon:fmod": "\u1DA0", "/phon:fpalatalhook": "\u1D82", "/phon:ginsular": "\u1D79", "/phon:gmod": "\u1D4D", "/phon:gpalatalhook": "\u1D83", "/phon:gr:Gammasmall": "\u1D26", "/phon:gr:Lambdasmall": "\u1D27", "/phon:gr:Pismall": "\u1D28", "/phon:gr:Psismall": "\u1D2A", "/phon:gr:RsmallHO": "\u1D29", "/phon:gr:betasubscript": "\u1D66", "/phon:gr:chisubscript": "\u1D6A", "/phon:gr:gammamod": "\u1D5E", "/phon:gr:gammasubscript": "\u1D67", "/phon:gr:phimod": "\u1D60", "/phon:gr:phisubscript": "\u1D69", "/phon:gr:rhosubscript": "\u1D68", "/phon:gscriptmod": "\u1DA2", "/phon:gturned": "\u1D77", "/phon:hturnedmod": "\u1DA3", "/phon:iotamod": "\u1DA5", "/phon:iotastroke": "\u1D7C", "/phon:iretroflexhook": "\u1D96", "/phon:istrokemod": "\u1DA4", "/phon:isubscript": "\u1D62", "/phon:iturned": "\u1D09", "/phon:iturnedmod": "\u1D4E", "/phon:jcrossedtailmod": "\u1DA8", "/phon:kmod": "\u1D4F", "/phon:kpalatalhook": "\u1D84", "/phon:lpalatalhook": "\u1D85", "/phon:lpalatalhookmod": "\u1DAA", "/phon:lretroflexhookmod": "\u1DA9", "/phon:mhookmod": "\u1DAC", "/phon:mlonglegturnedmod": "\u1DAD", "/phon:mmiddletilde": "\u1D6F", "/phon:mmod": "\u1D50", "/phon:mpalatalhook": "\u1D86", "/phon:mturnedmod": "\u1D5A", "/phon:mturnedsideways": "\u1D1F", "/phon:nlefthookmod": "\u1DAE", "/phon:nmiddletilde": "\u1D70", "/phon:npalatalhook": "\u1D87", "/phon:nretroflexhookmod": "\u1DAF", "/phon:obarmod": "\u1DB1", "/phon:obottomhalf": "\u1D17", "/phon:obottomhalfmod": "\u1D55", "/phon:oeturned": "\u1D14", "/phon:omod": "\u1D52", "/phon:oopenmod": "\u1D53", "/phon:oopenretroflexhook": "\u1D97", "/phon:oopensideways": "\u1D12", "/phon:osideways": "\u1D11", "/phon:ostrokesideways": "\u1D13", "/phon:otophalf": "\u1D16", "/phon:otophalfmod": "\u1D54", "/phon:phimod": "\u1DB2", "/phon:pmiddletilde": "\u1D71", "/phon:pmod": "\u1D56", "/phon:ppalatalhook": "\u1D88", "/phon:pstroke": 
"\u1D7D", "/phon:rfishmiddletilde": "\u1D73", "/phon:rmiddletilde": "\u1D72", "/phon:rpalatalhook": "\u1D89", "/phon:rsubscript": "\u1D63", "/phon:schwamod": "\u1D4A", "/phon:schwaretroflexhook": "\u1D95", "/phon:shookmod": "\u1DB3", "/phon:smiddletilde": "\u1D74", "/phon:spalatalhook": "\u1D8A", "/phon:spirantvoicedlaryngeal": "\u1D24", "/phon:thetamod": "\u1DBF", "/phon:thstrike": "\u1D7A", "/phon:tmiddletilde": "\u1D75", "/phon:tmod": "\u1D57", "/phon:tpalatalhookmod": "\u1DB5", "/phon:ubarmod": "\u1DB6", "/phon:ue": "\u1D6B", "/phon:umod": "\u1D58", "/phon:upsilonmod": "\u1DB7", "/phon:upsilonstroke": "\u1D7F", "/phon:uretroflexhook": "\u1D99", "/phon:usideways": "\u1D1D", "/phon:usidewaysdieresised": "\u1D1E", "/phon:usidewaysmod": "\u1D59", "/phon:usubscript": "\u1D64", "/phon:vhookmod": "\u1DB9", "/phon:vmod": "\u1D5B", "/phon:vpalatalhook": "\u1D8C", "/phon:vsubscript": "\u1D65", "/phon:vturnedmod": "\u1DBA", "/phon:xpalatalhook": "\u1D8D", "/phon:zcurlmod": "\u1DBD", "/phon:zmiddletilde": "\u1D76", "/phon:zmod": "\u1DBB", "/phon:zpalatalhook": "\u1D8E", "/phon:zretroflexhookmod": "\u1DBC", "/phook": "\u01A5", "/phophanthai": "\u0E1E", "/phophungthai": "\u0E1C", "/phosamphaothai": "\u0E20", "/pi": "\u03C0", "/pi.math": "\u03D6", "/piasutorusquare": "\u332E", "/pick": "\u26CF", "/pidblstruck": "\u213C", "/pieupacirclekorean": "\u3273", "/pieupaparenkorean": "\u3213", "/pieupcieuckorean": "\u3176", "/pieupcirclekorean": "\u3265", "/pieupkiyeokkorean": "\u3172", "/pieupkorean": "\u3142", "/pieupparenkorean": "\u3205", "/pieupsioskiyeokkorean": "\u3174", "/pieupsioskorean": "\u3144", "/pieupsiostikeutkorean": "\u3175", "/pieupthieuthkorean": "\u3177", "/pieuptikeutkorean": "\u3173", "/pig": "\u1F416", "/pigFace": "\u1F437", "/pigNose": "\u1F43D", "/pihiragana": "\u3074", "/pikatakana": "\u30D4", "/pikosquare": "\u3330", "/pikurusquare": "\u332F", "/pilcrowsignreversed": "\u204B", "/pileOfPoo": "\u1F4A9", "/pill": "\u1F48A", "/pineDecoration": "\u1F38D", 
"/pineapple": "\u1F34D", "/pisces": "\u2653", "/piselehpada": "\uA9CC", "/pistol": "\u1F52B", "/pisymbolgreek": "\u03D6", "/pitchfork": "\u22D4", "/piwrarmenian": "\u0583", "/placeOfWorship": "\u1F6D0", "/placeofinterestsign": "\u2318", "/planck": "\u210E", "/plancktwopi": "\u210F", "/plus": "\u002B", "/plus.inferior": "\u208A", "/plus.superior": "\u207A", "/plusbelowcmb": "\u031F", "/pluscircle": "\u2295", "/plusminus": "\u00B1", "/plusmod": "\u02D6", "/plusmonospace": "\uFF0B", "/plussignalt:hb": "\uFB29", "/plussignmod": "\u02D6", "/plussmall": "\uFE62", "/plussuperior": "\u207A", "/pluto": "\u2647", "/pmfullwidth": "\u33D8", "/pmonospace": "\uFF50", "/pmsquare": "\u33D8", "/pocketCalculator": "\u1F5A9", "/poeticverse": "\u060E", "/pohiragana": "\u307D", "/pointerleftblack": "\u25C4", "/pointerleftwhite": "\u25C5", "/pointerrightblack": "\u25BA", "/pointerrightwhite": "\u25BB", "/pointingindexdownwhite": "\u261F", "/pointingindexleftblack": "\u261A", "/pointingindexleftwhite": "\u261C", "/pointingindexrightblack": "\u261B", "/pointingindexrightwhite": "\u261E", "/pointingindexupwhite": "\u261D", "/pointingtriangledownheavywhite": "\u26DB", "/pointosquare": "\u333D", "/pointring": "\u2E30", "/pokatakana": "\u30DD", "/pokrytiecmbcyr": "\u0487", "/policeCar": "\u1F693", "/policeCarsRevolvingLight": "\u1F6A8", "/policeOfficer": "\u1F46E", "/pondosquare": "\u3340", "/poodle": "\u1F429", "/popcorn": "\u1F37F", "/popdirectionalformatting": "\u202C", "/popdirectionalisolate": "\u2069", "/poplathai": "\u0E1B", "/portableStereo": "\u1F4FE", "/positionindicator": "\u2316", "/postalHorn": "\u1F4EF", "/postalmark": "\u3012", "/postalmarkface": "\u3020", "/postbox": "\u1F4EE", "/potOfFood": "\u1F372", "/potableWater": "\u1F6B0", "/pouch": "\u1F45D", "/poultryLeg": "\u1F357", "/poutingCatFace": "\u1F63E", "/poutingFace": "\u1F621", "/power": "\u23FB", "/poweron": "\u23FD", "/poweronoff": "\u23FC", "/powersleep": "\u23FE", "/pparen": "\u24AB", "/pparenthesized": "\u24AB", 
"/ppmfullwidth": "\u33D9", "/prayerBeads": "\u1F4FF", "/precedes": "\u227A", "/precedesbutnotequivalent": "\u22E8", "/precedesorequal": "\u227C", "/precedesorequivalent": "\u227E", "/precedesunderrelation": "\u22B0", "/prescription": "\u211E", "/preversedepigraphic": "\uA7FC", "/previouspage": "\u2397", "/prfullwidth": "\u33DA", "/primedblmod": "\u02BA", "/primemod": "\u02B9", "/primereversed": "\u2035", "/princess": "\u1F478", "/printer": "\u1F5A8", "/printerIcon": "\u1F5B6", "/printideographiccircled": "\u329E", "/printscreen": "\u2399", "/product": "\u220F", "/prohibitedSign": "\u1F6C7", "/projective": "\u2305", "/prolongedkana": "\u30FC", "/propellor": "\u2318", "/propersubset": "\u2282", "/propersuperset": "\u2283", "/propertyline": "\u214A", "/proportion": "\u2237", "/proportional": "\u221D", "/psfullwidth": "\u33B0", "/psi": "\u03C8", "/psicyr": "\u0471", "/psicyrillic": "\u0471", "/psilicmbcyr": "\u0486", "/psilipneumatacyrilliccmb": "\u0486", "/pssquare": "\u33B0", "/pstrokedescender": "\uA751", "/ptail": "\uA755", "/publicAddressLoudspeaker": "\u1F4E2", "/puhiragana": "\u3077", "/pukatakana": "\u30D7", "/punctuationspace": "\u2008", "/purpleHeart": "\u1F49C", "/purse": "\u1F45B", "/pushpin": "\u1F4CC", "/putLitterInItsPlace": "\u1F6AE", "/pvfullwidth": "\u33B4", "/pvsquare": "\u33B4", "/pwfullwidth": "\u33BA", "/pwsquare": "\u33BA", "/q": "\u0071", "/qacyr": "\u051B", "/qadeva": "\u0958", "/qadma:hb": "\u05A8", "/qadmahebrew": "\u05A8", "/qaf": "\u0642", "/qaf.fina": "\uFED6", "/qaf.init": "\uFED7", "/qaf.init_alefmaksura.fina": "\uFC35", "/qaf.init_hah.fina": "\uFC33", "/qaf.init_hah.medi": "\uFCC2", "/qaf.init_meem.fina": "\uFC34", "/qaf.init_meem.medi": "\uFCC3", "/qaf.init_meem.medi_hah.medi": "\uFDB4", "/qaf.init_yeh.fina": "\uFC36", "/qaf.isol": "\uFED5", "/qaf.medi": "\uFED8", "/qaf.medi_alefmaksura.fina": "\uFC7E", "/qaf.medi_meem.medi_hah.fina": "\uFD7E", "/qaf.medi_meem.medi_meem.fina": "\uFD7F", "/qaf.medi_meem.medi_yeh.fina": "\uFDB2", 
"/qaf.medi_yeh.fina": "\uFC7F", "/qaf_lam_alefmaksuraabove": "\u06D7", "/qafarabic": "\u0642", "/qafdotabove": "\u06A7", "/qaffinalarabic": "\uFED6", "/qafinitialarabic": "\uFED7", "/qafmedialarabic": "\uFED8", "/qafthreedotsabove": "\u06A8", "/qamats": "\u05B8", "/qamats10": "\u05B8", "/qamats1a": "\u05B8", "/qamats1c": "\u05B8", "/qamats27": "\u05B8", "/qamats29": "\u05B8", "/qamats33": "\u05B8", "/qamats:hb": "\u05B8", "/qamatsQatan:hb": "\u05C7", "/qamatsde": "\u05B8", "/qamatshebrew": "\u05B8", "/qamatsnarrowhebrew": "\u05B8", "/qamatsqatanhebrew": "\u05B8", "/qamatsqatannarrowhebrew": "\u05B8", "/qamatsqatanquarterhebrew": "\u05B8", "/qamatsqatanwidehebrew": "\u05B8", "/qamatsquarterhebrew": "\u05B8", "/qamatswidehebrew": "\u05B8", "/qarneFarah:hb": "\u059F", "/qarneyparahebrew": "\u059F", "/qbopomofo": "\u3111", "/qcircle": "\u24E0", "/qdiagonalstroke": "\uA759", "/qhook": "\u02A0", "/qhooktail": "\u024B", "/qmonospace": "\uFF51", "/qof": "\u05E7", "/qof:hb": "\u05E7", "/qofdagesh": "\uFB47", "/qofdageshhebrew": "\uFB47", "/qofhatafpatah": "\u05E7", "/qofhatafpatahhebrew": "\u05E7", "/qofhatafsegol": "\u05E7", "/qofhatafsegolhebrew": "\u05E7", "/qofhebrew": "\u05E7", "/qofhiriq": "\u05E7", "/qofhiriqhebrew": "\u05E7", "/qofholam": "\u05E7", "/qofholamhebrew": "\u05E7", "/qofpatah": "\u05E7", "/qofpatahhebrew": "\u05E7", "/qofqamats": "\u05E7", "/qofqamatshebrew": "\u05E7", "/qofqubuts": "\u05E7", "/qofqubutshebrew": "\u05E7", "/qofsegol": "\u05E7", "/qofsegolhebrew": "\u05E7", "/qofsheva": "\u05E7", "/qofshevahebrew": "\u05E7", "/qoftsere": "\u05E7", "/qoftserehebrew": "\u05E7", "/qofwithdagesh:hb": "\uFB47", "/qparen": "\u24AC", "/qparenthesized": "\u24AC", "/qpdigraph": "\u0239", "/qstrokedescender": "\uA757", "/quadarrowdownfunc": "\u2357", "/quadarrowleftfunc": "\u2347", "/quadarrowrightfunc": "\u2348", "/quadarrowupfunc": "\u2350", "/quadbackslashfunc": "\u2342", "/quadcaretdownfunc": "\u234C", "/quadcaretupfunc": "\u2353", "/quadcirclefunc": "\u233C", 
"/quadcolonfunc": "\u2360", "/quaddelfunc": "\u2354", "/quaddeltafunc": "\u234D", "/quaddiamondfunc": "\u233A", "/quaddividefunc": "\u2339", "/quadequalfunc": "\u2338", "/quadfunc": "\u2395", "/quadgreaterfunc": "\u2344", "/quadjotfunc": "\u233B", "/quadlessfunc": "\u2343", "/quadnotequalfunc": "\u236F", "/quadquestionfunc": "\u2370", "/quadrantLowerLeft": "\u2596", "/quadrantLowerRight": "\u2597", "/quadrantUpperLeft": "\u2598", "/quadrantUpperLeftAndLowerLeftAndLowerRight": "\u2599", "/quadrantUpperLeftAndLowerRight": "\u259A", "/quadrantUpperLeftAndUpperRightAndLowerLeft": "\u259B", "/quadrantUpperLeftAndUpperRightAndLowerRight": "\u259C", "/quadrantUpperRight": "\u259D", "/quadrantUpperRightAndLowerLeft": "\u259E", "/quadrantUpperRightAndLowerLeftAndLowerRight": "\u259F", "/quadrupleminute": "\u2057", "/quadslashfunc": "\u2341", "/quarternote": "\u2669", "/qubuts": "\u05BB", "/qubuts18": "\u05BB", "/qubuts25": "\u05BB", "/qubuts31": "\u05BB", "/qubuts:hb": "\u05BB", "/qubutshebrew": "\u05BB", "/qubutsnarrowhebrew": "\u05BB", "/qubutsquarterhebrew": "\u05BB", "/qubutswidehebrew": "\u05BB", "/queenblack": "\u265B", "/queenwhite": "\u2655", "/question": "\u003F", "/questionarabic": "\u061F", "/questionarmenian": "\u055E", "/questiondbl": "\u2047", "/questiondown": "\u00BF", "/questiondownsmall": "\uF7BF", "/questionedequal": "\u225F", "/questionexclamationmark": "\u2048", "/questiongreek": "\u037E", "/questionideographiccircled": "\u3244", "/questionmonospace": "\uFF1F", "/questionreversed": "\u2E2E", "/questionsmall": "\uF73F", "/quincunx": "\u26BB", "/quotedbl": "\u0022", "/quotedblbase": "\u201E", "/quotedblleft": "\u201C", "/quotedbllowreversed": "\u2E42", "/quotedblmonospace": "\uFF02", "/quotedblprime": "\u301E", "/quotedblprimereversed": "\u301D", "/quotedblreversed": "\u201F", "/quotedblright": "\u201D", "/quoteleft": "\u2018", "/quoteleftreversed": "\u201B", "/quotequadfunc": "\u235E", "/quotereversed": "\u201B", "/quoteright": "\u2019", "/quoterightn": 
"\u0149", "/quotesinglbase": "\u201A", "/quotesingle": "\u0027", "/quotesinglemonospace": "\uFF07", "/quoteunderlinefunc": "\u2358", "/r": "\u0072", "/raagung": "\uA9AC", "/raarmenian": "\u057C", "/rabbit": "\u1F407", "/rabbitFace": "\u1F430", "/rabengali": "\u09B0", "/racingCar": "\u1F3CE", "/racingMotorcycle": "\u1F3CD", "/racirclekatakana": "\u32F6", "/racute": "\u0155", "/radeva": "\u0930", "/radfullwidth": "\u33AD", "/radical": "\u221A", "/radicalbottom": "\u23B7", "/radicalex": "\uF8E5", "/radio": "\u1F4FB", "/radioButton": "\u1F518", "/radioactive": "\u2622", "/radovers2fullwidth": "\u33AF", "/radoversfullwidth": "\u33AE", "/radoverssquare": "\u33AE", "/radoverssquaredsquare": "\u33AF", "/radsquare": "\u33AD", "/rafe": "\u05BF", "/rafe:hb": "\u05BF", "/rafehebrew": "\u05BF", "/ragujarati": "\u0AB0", "/ragurmukhi": "\u0A30", "/rahiragana": "\u3089", "/railwayCar": "\u1F683", "/railwayTrack": "\u1F6E4", "/rain": "\u26C6", "/rainbow": "\u1F308", "/raisedHandFingersSplayed": "\u1F590", "/raisedHandPartBetweenMiddleAndRingFingers": "\u1F596", "/raisedmcsign": "\u1F16A", "/raisedmdsign": "\u1F16B", "/rakatakana": "\u30E9", "/rakatakanahalfwidth": "\uFF97", "/ralowerdiagonalbengali": "\u09F1", "/ram": "\u1F40F", "/ramiddlediagonalbengali": "\u09F0", "/ramshorn": "\u0264", "/rat": "\u1F400", "/ratio": "\u2236", "/ray": "\u0608", "/rbopomofo": "\u3116", "/rcaron": "\u0159", "/rcedilla": "\u0157", "/rcircle": "\u24E1", "/rcommaaccent": "\u0157", "/rdblgrave": "\u0211", "/rdot": "\u1E59", "/rdotaccent": "\u1E59", "/rdotbelow": "\u1E5B", "/rdotbelowmacron": "\u1E5D", "/reachideographicparen": "\u3243", "/recirclekatakana": "\u32F9", "/recreationalVehicle": "\u1F699", "/rectangleblack": "\u25AC", "/rectangleverticalblack": "\u25AE", "/rectangleverticalwhite": "\u25AF", "/rectanglewhite": "\u25AD", "/recycledpaper": "\u267C", "/recyclefiveplastics": "\u2677", "/recyclefourplastics": "\u2676", "/recyclegeneric": "\u267A", "/recycleoneplastics": "\u2673", 
"/recyclepartiallypaper": "\u267D", "/recyclesevenplastics": "\u2679", "/recyclesixplastics": "\u2678", "/recyclethreeplastics": "\u2675", "/recycletwoplastics": "\u2674", "/recycleuniversal": "\u2672", "/recycleuniversalblack": "\u267B", "/redApple": "\u1F34E", "/redTriangleDOwn": "\u1F53B", "/redTriangleUp": "\u1F53A", "/referencemark": "\u203B", "/reflexsubset": "\u2286", "/reflexsuperset": "\u2287", "/regionalindicatorsymbollettera": "\u1F1E6", "/regionalindicatorsymbolletterb": "\u1F1E7", "/regionalindicatorsymbolletterc": "\u1F1E8", "/regionalindicatorsymbolletterd": "\u1F1E9", "/regionalindicatorsymbollettere": "\u1F1EA", "/regionalindicatorsymbolletterf": "\u1F1EB", "/regionalindicatorsymbolletterg": "\u1F1EC", "/regionalindicatorsymbolletterh": "\u1F1ED", "/regionalindicatorsymbolletteri": "\u1F1EE", "/regionalindicatorsymbolletterj": "\u1F1EF", "/regionalindicatorsymbolletterk": "\u1F1F0", "/regionalindicatorsymbolletterl": "\u1F1F1", "/regionalindicatorsymbolletterm": "\u1F1F2", "/regionalindicatorsymbollettern": "\u1F1F3", "/regionalindicatorsymbollettero": "\u1F1F4", "/regionalindicatorsymbolletterp": "\u1F1F5", "/regionalindicatorsymbolletterq": "\u1F1F6", "/regionalindicatorsymbolletterr": "\u1F1F7", "/regionalindicatorsymbolletters": "\u1F1F8", "/regionalindicatorsymbollettert": "\u1F1F9", "/regionalindicatorsymbolletteru": "\u1F1FA", "/regionalindicatorsymbolletterv": "\u1F1FB", "/regionalindicatorsymbolletterw": "\u1F1FC", "/regionalindicatorsymbolletterx": "\u1F1FD", "/regionalindicatorsymbollettery": "\u1F1FE", "/regionalindicatorsymbolletterz": "\u1F1FF", "/registered": "\u00AE", "/registersans": "\uF8E8", "/registerserif": "\uF6DA", "/reh.fina": "\uFEAE", "/reh.init_superscriptalef.fina": "\uFC5C", "/reh.isol": "\uFEAD", "/rehHamzaAbove": "\u076C", "/rehSmallTahTwoDots": "\u0771", "/rehStroke": "\u075B", "/rehTwoDotsVerticallyAbove": "\u076B", "/rehVabove": "\u0692", "/rehVbelow": "\u0695", "/reharabic": "\u0631", "/reharmenian": "\u0580", 
"/rehdotbelow": "\u0694", "/rehdotbelowdotabove": "\u0696", "/rehfinalarabic": "\uFEAE", "/rehfourdotsabove": "\u0699", "/rehinvertedV": "\u06EF", "/rehiragana": "\u308C", "/rehring": "\u0693", "/rehtwodotsabove": "\u0697", "/rehyehaleflamarabic": "\u0631", "/rekatakana": "\u30EC", "/rekatakanahalfwidth": "\uFF9A", "/relievedFace": "\u1F60C", "/religionideographiccircled": "\u32AA", "/reminderRibbon": "\u1F397", "/remusquare": "\u3355", "/rentogensquare": "\u3356", "/replacementchar": "\uFFFD", "/replacementcharobj": "\uFFFC", "/representideographicparen": "\u3239", "/rerengganleft": "\uA9C1", "/rerengganright": "\uA9C2", "/resh": "\u05E8", "/resh:hb": "\u05E8", "/reshdageshhebrew": "\uFB48", "/reshhatafpatah": "\u05E8", "/reshhatafpatahhebrew": "\u05E8", "/reshhatafsegol": "\u05E8", "/reshhatafsegolhebrew": "\u05E8", "/reshhebrew": "\u05E8", "/reshhiriq": "\u05E8", "/reshhiriqhebrew": "\u05E8", "/reshholam": "\u05E8", "/reshholamhebrew": "\u05E8", "/reshpatah": "\u05E8", "/reshpatahhebrew": "\u05E8", "/reshqamats": "\u05E8", "/reshqamatshebrew": "\u05E8", "/reshqubuts": "\u05E8", "/reshqubutshebrew": "\u05E8", "/reshsegol": "\u05E8", "/reshsegolhebrew": "\u05E8", "/reshsheva": "\u05E8", "/reshshevahebrew": "\u05E8", "/reshtsere": "\u05E8", "/reshtserehebrew": "\u05E8", "/reshwide:hb": "\uFB27", "/reshwithdagesh:hb": "\uFB48", "/resourceideographiccircled": "\u32AE", "/resourceideographicparen": "\u323E", "/response": "\u211F", "/restideographiccircled": "\u32A1", "/restideographicparen": "\u3241", "/restrictedentryoneleft": "\u26E0", "/restrictedentrytwoleft": "\u26E1", "/restroom": "\u1F6BB", "/return": "\u23CE", "/reversedHandMiddleFingerExtended": "\u1F595", "/reversedRaisedHandFingersSplayed": "\u1F591", "/reversedThumbsDownSign": "\u1F593", "/reversedThumbsUpSign": "\u1F592", "/reversedVictoryHand": "\u1F594", "/reversedonehundred.roman": "\u2183", "/reversedtilde": "\u223D", "/reversedzecyr": "\u0511", "/revia:hb": "\u0597", "/reviahebrew": "\u0597", 
"/reviamugrashhebrew": "\u0597", "/revlogicalnot": "\u2310", "/revolvingHearts": "\u1F49E", "/rfishhook": "\u027E", "/rfishhookreversed": "\u027F", "/rgravedbl": "\u0211", "/rhabengali": "\u09DD", "/rhacyr": "\u0517", "/rhadeva": "\u095D", "/rho": "\u03C1", "/rhoasper": "\u1FE5", "/rhofunc": "\u2374", "/rholenis": "\u1FE4", "/rhook": "\u027D", "/rhookturned": "\u027B", "/rhookturnedsuperior": "\u02B5", "/rhookturnedsupmod": "\u02B5", "/rhostrokesymbol": "\u03FC", "/rhosymbol": "\u03F1", "/rhosymbolgreek": "\u03F1", "/rhotichookmod": "\u02DE", "/rial": "\uFDFC", "/ribbon": "\u1F380", "/riceBall": "\u1F359", "/riceCracker": "\u1F358", "/ricirclekatakana": "\u32F7", "/rieulacirclekorean": "\u3271", "/rieulaparenkorean": "\u3211", "/rieulcirclekorean": "\u3263", "/rieulhieuhkorean": "\u3140", "/rieulkiyeokkorean": "\u313A", "/rieulkiyeoksioskorean": "\u3169", "/rieulkorean": "\u3139", "/rieulmieumkorean": "\u313B", "/rieulpansioskorean": "\u316C", "/rieulparenkorean": "\u3203", "/rieulphieuphkorean": "\u313F", "/rieulpieupkorean": "\u313C", "/rieulpieupsioskorean": "\u316B", "/rieulsioskorean": "\u313D", "/rieulthieuthkorean": "\u313E", "/rieultikeutkorean": "\u316A", "/rieulyeorinhieuhkorean": "\u316D", "/right-pointingMagnifyingGlass": "\u1F50E", "/rightAngerBubble": "\u1F5EF", "/rightHalfBlock": "\u2590", "/rightHandTelephoneReceiver": "\u1F57D", "/rightOneEighthBlock": "\u2595", "/rightSpeaker": "\u1F568", "/rightSpeakerOneSoundWave": "\u1F569", "/rightSpeakerThreeSoundWaves": "\u1F56A", "/rightSpeechBubble": "\u1F5E9", "/rightThoughtBubble": "\u1F5ED", "/rightangle": "\u221F", "/rightarrowoverleftarrow": "\u21C4", "/rightdnheavyleftuplight": "\u2546", "/rightharpoonoverleftharpoon": "\u21CC", "/rightheavyleftdnlight": "\u252E", "/rightheavyleftuplight": "\u2536", "/rightheavyleftvertlight": "\u253E", "/rightideographiccircled": "\u32A8", "/rightlightleftdnheavy": "\u2531", "/rightlightleftupheavy": "\u2539", "/rightlightleftvertheavy": "\u2549", 
"/righttackbelowcmb": "\u0319", "/righttoleftembed": "\u202B", "/righttoleftisolate": "\u2067", "/righttoleftmark": "\u200F", "/righttoleftoverride": "\u202E", "/righttriangle": "\u22BF", "/rightupheavyleftdnlight": "\u2544", "/rihiragana": "\u308A", "/rikatakana": "\u30EA", "/rikatakanahalfwidth": "\uFF98", "/ring": "\u02DA", "/ringbelowcmb": "\u0325", "/ringcmb": "\u030A", "/ringequal": "\u2257", "/ringhalfleft": "\u02BF", "/ringhalfleftarmenian": "\u0559", "/ringhalfleftbelowcmb": "\u031C", "/ringhalfleftcentered": "\u02D3", "/ringhalfleftcentredmod": "\u02D3", "/ringhalfleftmod": "\u02BF", "/ringhalfright": "\u02BE", "/ringhalfrightbelowcmb": "\u0339", "/ringhalfrightcentered": "\u02D2", "/ringhalfrightcentredmod": "\u02D2", "/ringhalfrightmod": "\u02BE", "/ringinequal": "\u2256", "/ringingBell": "\u1F56D", "/ringlowmod": "\u02F3", "/ringoperator": "\u2218", "/rinsular": "\uA783", "/rinvertedbreve": "\u0213", "/rirasquare": "\u3352", "/risingdiagonal": "\u27CB", "/rittorusquare": "\u3351", "/rlinebelow": "\u1E5F", "/rlongleg": "\u027C", "/rlonglegturned": "\u027A", "/rmacrondot": "\u1E5D", "/rmonospace": "\uFF52", "/rnoon": "\u06BB", "/rnoon.fina": "\uFBA1", "/rnoon.init": "\uFBA2", "/rnoon.isol": "\uFBA0", "/rnoon.medi": "\uFBA3", "/roastedSweetPotato": "\u1F360", "/robliquestroke": "\uA7A7", "/rocirclekatakana": "\u32FA", "/rocket": "\u1F680", "/rohiragana": "\u308D", "/rokatakana": "\u30ED", "/rokatakanahalfwidth": "\uFF9B", "/rolled-upNewspaper": "\u1F5DE", "/rollerCoaster": "\u1F3A2", "/rookblack": "\u265C", "/rookwhite": "\u2656", "/rooster": "\u1F413", "/roruathai": "\u0E23", "/rose": "\u1F339", "/rosette": "\u1F3F5", "/roundPushpin": "\u1F4CD", "/roundedzeroabove": "\u06DF", "/rowboat": "\u1F6A3", "/rparen": "\u24AD", "/rparenthesized": "\u24AD", "/rrabengali": "\u09DC", "/rradeva": "\u0931", "/rragurmukhi": "\u0A5C", "/rreh": "\u0691", "/rreh.fina": "\uFB8D", "/rreh.isol": "\uFB8C", "/rreharabic": "\u0691", "/rrehfinalarabic": "\uFB8D", "/rrotunda": 
"\uA75B", "/rrvocalicbengali": "\u09E0", "/rrvocalicdeva": "\u0960", "/rrvocalicgujarati": "\u0AE0", "/rrvocalicvowelsignbengali": "\u09C4", "/rrvocalicvowelsigndeva": "\u0944", "/rrvocalicvowelsigngujarati": "\u0AC4", "/rstroke": "\u024D", "/rsuperior": "\uF6F1", "/rsupmod": "\u02B3", "/rtailturned": "\u2C79", "/rtblock": "\u2590", "/rturned": "\u0279", "/rturnedsuperior": "\u02B4", "/rturnedsupmod": "\u02B4", "/ruble": "\u20BD", "/rucirclekatakana": "\u32F8", "/rugbyFootball": "\u1F3C9", "/ruhiragana": "\u308B", "/rukatakana": "\u30EB", "/rukatakanahalfwidth": "\uFF99", "/rum": "\uA775", "/rumrotunda": "\uA75D", "/runner": "\u1F3C3", "/runningShirtSash": "\u1F3BD", "/rupeemarkbengali": "\u09F2", "/rupeesignbengali": "\u09F3", "/rupiah": "\uF6DD", "/rupiisquare": "\u3353", "/ruthai": "\u0E24", "/ruuburusquare": "\u3354", "/rvocalicbengali": "\u098B", "/rvocalicdeva": "\u090B", "/rvocalicgujarati": "\u0A8B", "/rvocalicvowelsignbengali": "\u09C3", "/rvocalicvowelsigndeva": "\u0943", "/rvocalicvowelsigngujarati": "\u0AC3", "/s": "\u0073", "/s.inferior": "\u209B", "/s_t": "\uFB06", "/sabengali": "\u09B8", "/sacirclekatakana": "\u32DA", "/sacute": "\u015B", "/sacutedotaccent": "\u1E65", "/sad": "\u0635", "/sad.fina": "\uFEBA", "/sad.init": "\uFEBB", "/sad.init_alefmaksura.fina": "\uFD05", "/sad.init_hah.fina": "\uFC20", "/sad.init_hah.medi": "\uFCB1", "/sad.init_hah.medi_hah.medi": "\uFD65", "/sad.init_khah.medi": "\uFCB2", "/sad.init_meem.fina": "\uFC21", "/sad.init_meem.medi": "\uFCB3", "/sad.init_meem.medi_meem.medi": "\uFDC5", "/sad.init_reh.fina": "\uFD0F", "/sad.init_yeh.fina": "\uFD06", "/sad.isol": "\uFEB9", "/sad.medi": "\uFEBC", "/sad.medi_alefmaksura.fina": "\uFD21", "/sad.medi_hah.medi_hah.fina": "\uFD64", "/sad.medi_hah.medi_yeh.fina": "\uFDA9", "/sad.medi_meem.medi_meem.fina": "\uFD66", "/sad.medi_reh.fina": "\uFD2B", "/sad.medi_yeh.fina": "\uFD22", "/sad_lam_alefmaksuraabove": "\u06D6", "/sadarabic": "\u0635", "/sadeva": "\u0938", "/sadfinalarabic": 
"\uFEBA", "/sadinitialarabic": "\uFEBB", "/sadmedialarabic": "\uFEBC", "/sadthreedotsabove": "\u069E", "/sadtwodotsbelow": "\u069D", "/sagittarius": "\u2650", "/sagujarati": "\u0AB8", "/sagurmukhi": "\u0A38", "/sahiragana": "\u3055", "/saikurusquare": "\u331F", "/sailboat": "\u26F5", "/sakatakana": "\u30B5", "/sakatakanahalfwidth": "\uFF7B", "/sakeBottleAndCup": "\u1F376", "/sallallahoualayhewasallamarabic": "\uFDFA", "/saltillo": "\uA78C", "/saltire": "\u2613", "/samahaprana": "\uA9B0", "/samekh": "\u05E1", "/samekh:hb": "\u05E1", "/samekhdagesh": "\uFB41", "/samekhdageshhebrew": "\uFB41", "/samekhhebrew": "\u05E1", "/samekhwithdagesh:hb": "\uFB41", "/sampi": "\u03E1", "/sampiarchaic": "\u0373", "/samurda": "\uA9AF", "/samvat": "\u0604", "/san": "\u03FB", "/santiimusquare": "\u3320", "/saraaathai": "\u0E32", "/saraaethai": "\u0E41", "/saraaimaimalaithai": "\u0E44", "/saraaimaimuanthai": "\u0E43", "/saraamthai": "\u0E33", "/saraathai": "\u0E30", "/saraethai": "\u0E40", "/saraiileftthai": "\uF886", "/saraiithai": "\u0E35", "/saraileftthai": "\uF885", "/saraithai": "\u0E34", "/saraothai": "\u0E42", "/saraueeleftthai": "\uF888", "/saraueethai": "\u0E37", "/saraueleftthai": "\uF887", "/sarauethai": "\u0E36", "/sarauthai": "\u0E38", "/sarauuthai": "\u0E39", "/satellite": "\u1F6F0", "/satelliteAntenna": "\u1F4E1", "/saturn": "\u2644", "/saxophone": "\u1F3B7", "/sbopomofo": "\u3119", "/scales": "\u2696", "/scanninehorizontal": "\u23BD", "/scanonehorizontal": "\u23BA", "/scansevenhorizontal": "\u23BC", "/scanthreehorizontal": "\u23BB", "/scaron": "\u0161", "/scarondot": "\u1E67", "/scarondotaccent": "\u1E67", "/scedilla": "\u015F", "/school": "\u1F3EB", "/schoolSatchel": "\u1F392", "/schoolideographiccircled": "\u3246", "/schwa": "\u0259", "/schwa.inferior": "\u2094", "/schwacyr": "\u04D9", "/schwacyrillic": "\u04D9", "/schwadieresiscyr": "\u04DB", "/schwadieresiscyrillic": "\u04DB", "/schwahook": "\u025A", "/scircle": "\u24E2", "/scircumflex": "\u015D", "/scommaaccent": 
"\u0219", "/scooter": "\u1F6F4", "/scorpius": "\u264F", "/screen": "\u1F5B5", "/scroll": "\u1F4DC", "/scruple": "\u2108", "/sdot": "\u1E61", "/sdotaccent": "\u1E61", "/sdotbelow": "\u1E63", "/sdotbelowdotabove": "\u1E69", "/sdotbelowdotaccent": "\u1E69", "/seagullbelowcmb": "\u033C", "/seat": "\u1F4BA", "/secirclekatakana": "\u32DD", "/second": "\u2033", "/secondreversed": "\u2036", "/secondscreensquare": "\u1F19C", "/secondtonechinese": "\u02CA", "/secretideographiccircled": "\u3299", "/section": "\u00A7", "/sectionsignhalftop": "\u2E39", "/sector": "\u2314", "/seeNoEvilMonkey": "\u1F648", "/seedling": "\u1F331", "/seen": "\u0633", "/seen.fina": "\uFEB2", "/seen.init": "\uFEB3", "/seen.init_alefmaksura.fina": "\uFCFB", "/seen.init_hah.fina": "\uFC1D", "/seen.init_hah.medi": "\uFCAE", "/seen.init_hah.medi_jeem.medi": "\uFD5C", "/seen.init_heh.medi": "\uFD31", "/seen.init_jeem.fina": "\uFC1C", "/seen.init_jeem.medi": "\uFCAD", "/seen.init_jeem.medi_hah.medi": "\uFD5D", "/seen.init_khah.fina": "\uFC1E", "/seen.init_khah.medi": "\uFCAF", "/seen.init_meem.fina": "\uFC1F", "/seen.init_meem.medi": "\uFCB0", "/seen.init_meem.medi_hah.medi": "\uFD60", "/seen.init_meem.medi_jeem.medi": "\uFD61", "/seen.init_meem.medi_meem.medi": "\uFD63", "/seen.init_reh.fina": "\uFD0E", "/seen.init_yeh.fina": "\uFCFC", "/seen.isol": "\uFEB1", "/seen.medi": "\uFEB4", "/seen.medi_alefmaksura.fina": "\uFD17", "/seen.medi_hah.medi": "\uFD35", "/seen.medi_heh.medi": "\uFCE8", "/seen.medi_jeem.medi": "\uFD34", "/seen.medi_jeem.medi_alefmaksura.fina": "\uFD5E", "/seen.medi_khah.medi": "\uFD36", "/seen.medi_khah.medi_alefmaksura.fina": "\uFDA8", "/seen.medi_khah.medi_yeh.fina": "\uFDC6", "/seen.medi_meem.medi": "\uFCE7", "/seen.medi_meem.medi_hah.fina": "\uFD5F", "/seen.medi_meem.medi_meem.fina": "\uFD62", "/seen.medi_reh.fina": "\uFD2A", "/seen.medi_yeh.fina": "\uFD18", "/seenDigitFourAbove": "\u077D", "/seenFourDotsAbove": "\u075C", "/seenInvertedV": "\u077E", "/seenSmallTahTwoDots": "\u0770", 
"/seenTwoDotsVerticallyAbove": "\u076D", "/seenabove": "\u06DC", "/seenarabic": "\u0633", "/seendotbelowdotabove": "\u069A", "/seenfinalarabic": "\uFEB2", "/seeninitialarabic": "\uFEB3", "/seenlow": "\u06E3", "/seenmedialarabic": "\uFEB4", "/seenthreedotsbelow": "\u069B", "/seenthreedotsbelowthreedotsabove": "\u069C", "/segment": "\u2313", "/segol": "\u05B6", "/segol13": "\u05B6", "/segol1f": "\u05B6", "/segol2c": "\u05B6", "/segol:hb": "\u05B6", "/segolhebrew": "\u05B6", "/segolnarrowhebrew": "\u05B6", "/segolquarterhebrew": "\u05B6", "/segolta:hb": "\u0592", "/segoltahebrew": "\u0592", "/segolwidehebrew": "\u05B6", "/seharmenian": "\u057D", "/sehiragana": "\u305B", "/sekatakana": "\u30BB", "/sekatakanahalfwidth": "\uFF7E", "/selfideographicparen": "\u3242", "/semicolon": "\u003B", "/semicolonarabic": "\u061B", "/semicolonmonospace": "\uFF1B", "/semicolonreversed": "\u204F", "/semicolonsmall": "\uFE54", "/semicolonunderlinefunc": "\u236E", "/semidirectproductleft": "\u22CB", "/semidirectproductright": "\u22CC", "/semisextile": "\u26BA", "/semisoftcyr": "\u048D", "/semivoicedmarkkana": "\u309C", "/semivoicedmarkkanahalfwidth": "\uFF9F", "/sentisquare": "\u3322", "/sentosquare": "\u3323", "/septembertelegraph": "\u32C8", "/sersetdblup": "\u22D1", "/sersetnotequalup": "\u228B", "/servicemark": "\u2120", "/sesamedot": "\uFE45", "/sesquiquadrate": "\u26BC", "/setminus": "\u2216", "/seven": "\u0037", "/seven.inferior": "\u2087", "/seven.roman": "\u2166", "/seven.romansmall": "\u2176", "/seven.superior": "\u2077", "/sevenarabic": "\u0667", "/sevenbengali": "\u09ED", "/sevencircle": "\u2466", "/sevencircledbl": "\u24FB", "/sevencircleinversesansserif": "\u2790", "/sevencomma": "\u1F108", "/sevendeva": "\u096D", "/seveneighths": "\u215E", "/sevenfar": "\u06F7", "/sevengujarati": "\u0AED", "/sevengurmukhi": "\u0A6D", "/sevenhackarabic": "\u0667", "/sevenhangzhou": "\u3027", "/sevenideographiccircled": "\u3286", "/sevenideographicparen": "\u3226", "/seveninferior": "\u2087", 
"/sevenmonospace": "\uFF17", "/sevenoldstyle": "\uF737", "/sevenparen": "\u247A", "/sevenparenthesized": "\u247A", "/sevenperiod": "\u248E", "/sevenpersian": "\u06F7", "/sevenpointonesquare": "\u1F1A1", "/sevenroman": "\u2176", "/sevensuperior": "\u2077", "/seventeencircle": "\u2470", "/seventeencircleblack": "\u24F1", "/seventeenparen": "\u2484", "/seventeenparenthesized": "\u2484", "/seventeenperiod": "\u2498", "/seventhai": "\u0E57", "/seventycirclesquare": "\u324E", "/sextile": "\u26B9", "/sfthyphen": "\u00AD", "/shaarmenian": "\u0577", "/shabengali": "\u09B6", "/shacyr": "\u0448", "/shacyrillic": "\u0448", "/shaddaAlefIsol": "\uFC63", "/shaddaDammaIsol": "\uFC61", "/shaddaDammaMedi": "\uFCF3", "/shaddaDammatanIsol": "\uFC5E", "/shaddaFathaIsol": "\uFC60", "/shaddaFathaMedi": "\uFCF2", "/shaddaIsol": "\uFE7C", "/shaddaKasraIsol": "\uFC62", "/shaddaKasraMedi": "\uFCF4", "/shaddaKasratanIsol": "\uFC5F", "/shaddaMedi": "\uFE7D", "/shaddaarabic": "\u0651", "/shaddadammaarabic": "\uFC61", "/shaddadammatanarabic": "\uFC5E", "/shaddafathaarabic": "\uFC60", "/shaddafathatanarabic": "\u0651", "/shaddakasraarabic": "\uFC62", "/shaddakasratanarabic": "\uFC5F", "/shade": "\u2592", "/shadedark": "\u2593", "/shadelight": "\u2591", "/shademedium": "\u2592", "/shadeva": "\u0936", "/shagujarati": "\u0AB6", "/shagurmukhi": "\u0A36", "/shalshelet:hb": "\u0593", "/shalshelethebrew": "\u0593", "/shamrock": "\u2618", "/shavedIce": "\u1F367", "/shbopomofo": "\u3115", "/shchacyr": "\u0449", "/shchacyrillic": "\u0449", "/sheen": "\u0634", "/sheen.fina": "\uFEB6", "/sheen.init": "\uFEB7", "/sheen.init_alefmaksura.fina": "\uFCFD", "/sheen.init_hah.fina": "\uFD0A", "/sheen.init_hah.medi": "\uFD2E", "/sheen.init_hah.medi_meem.medi": "\uFD68", "/sheen.init_heh.medi": "\uFD32", "/sheen.init_jeem.fina": "\uFD09", "/sheen.init_jeem.medi": "\uFD2D", "/sheen.init_khah.fina": "\uFD0B", "/sheen.init_khah.medi": "\uFD2F", "/sheen.init_meem.fina": "\uFD0C", "/sheen.init_meem.medi": "\uFD30", 
"/sheen.init_meem.medi_khah.medi": "\uFD6B", "/sheen.init_meem.medi_meem.medi": "\uFD6D", "/sheen.init_reh.fina": "\uFD0D", "/sheen.init_yeh.fina": "\uFCFE", "/sheen.isol": "\uFEB5", "/sheen.medi": "\uFEB8", "/sheen.medi_alefmaksura.fina": "\uFD19", "/sheen.medi_hah.fina": "\uFD26", "/sheen.medi_hah.medi": "\uFD38", "/sheen.medi_hah.medi_meem.fina": "\uFD67", "/sheen.medi_hah.medi_yeh.fina": "\uFDAA", "/sheen.medi_heh.medi": "\uFCEA", "/sheen.medi_jeem.fina": "\uFD25", "/sheen.medi_jeem.medi": "\uFD37", "/sheen.medi_jeem.medi_yeh.fina": "\uFD69", "/sheen.medi_khah.fina": "\uFD27", "/sheen.medi_khah.medi": "\uFD39", "/sheen.medi_meem.fina": "\uFD28", "/sheen.medi_meem.medi": "\uFCE9", "/sheen.medi_meem.medi_khah.fina": "\uFD6A", "/sheen.medi_meem.medi_meem.fina": "\uFD6C", "/sheen.medi_reh.fina": "\uFD29", "/sheen.medi_yeh.fina": "\uFD1A", "/sheenarabic": "\u0634", "/sheendotbelow": "\u06FA", "/sheenfinalarabic": "\uFEB6", "/sheeninitialarabic": "\uFEB7", "/sheenmedialarabic": "\uFEB8", "/sheep": "\u1F411", "/sheicoptic": "\u03E3", "/shelfmod": "\u02FD", "/shelfopenmod": "\u02FE", "/sheqel": "\u20AA", "/sheqelhebrew": "\u20AA", "/sheva": "\u05B0", "/sheva115": "\u05B0", "/sheva15": "\u05B0", "/sheva22": "\u05B0", "/sheva2e": "\u05B0", "/sheva:hb": "\u05B0", "/shevahebrew": "\u05B0", "/shevanarrowhebrew": "\u05B0", "/shevaquarterhebrew": "\u05B0", "/shevawidehebrew": "\u05B0", "/shhacyr": "\u04BB", "/shhacyrillic": "\u04BB", "/shhatailcyr": "\u0527", "/shield": "\u1F6E1", "/shimacoptic": "\u03ED", "/shin": "\u05E9", "/shin:hb": "\u05E9", "/shinDot:hb": "\u05C1", "/shindagesh": "\uFB49", "/shindageshhebrew": "\uFB49", "/shindageshshindot": "\uFB2C", "/shindageshshindothebrew": "\uFB2C", "/shindageshsindot": "\uFB2D", "/shindageshsindothebrew": "\uFB2D", "/shindothebrew": "\u05C1", "/shinhebrew": "\u05E9", "/shinshindot": "\uFB2A", "/shinshindothebrew": "\uFB2A", "/shinsindot": "\uFB2B", "/shinsindothebrew": "\uFB2B", "/shintoshrine": "\u26E9", "/shinwithdagesh:hb": 
"\uFB49", "/shinwithdageshandshinDot:hb": "\uFB2C", "/shinwithdageshandsinDot:hb": "\uFB2D", "/shinwithshinDot:hb": "\uFB2A", "/shinwithsinDot:hb": "\uFB2B", "/ship": "\u1F6A2", "/sho": "\u03F8", "/shoejotupfunc": "\u235D", "/shoestiledownfunc": "\u2366", "/shoestileleftfunc": "\u2367", "/shogipieceblack": "\u2617", "/shogipiecewhite": "\u2616", "/shook": "\u0282", "/shootingStar": "\u1F320", "/shoppingBags": "\u1F6CD", "/shoppingTrolley": "\u1F6D2", "/shortcake": "\u1F370", "/shortequalsmod": "\uA78A", "/shortoverlongmetrical": "\u23D3", "/shoulderedopenbox": "\u237D", "/shower": "\u1F6BF", "/shvsquare": "\u1F1AA", "/sicirclekatakana": "\u32DB", "/sidewaysBlackDownPointingIndex": "\u1F5A1", "/sidewaysBlackLeftPointingIndex": "\u1F59A", "/sidewaysBlackRightPointingIndex": "\u1F59B", "/sidewaysBlackUpPointingIndex": "\u1F5A0", "/sidewaysWhiteDownPointingIndex": "\u1F59F", "/sidewaysWhiteLeftPointingIndex": "\u1F598", "/sidewaysWhiteRightPointingIndex": "\u1F599", "/sidewaysWhiteUpPointingIndex": "\u1F59E", "/sigma": "\u03C3", "/sigma1": "\u03C2", "/sigmafinal": "\u03C2", "/sigmalunatedottedreversedsymbol": "\u037D", "/sigmalunatedottedsymbol": "\u037C", "/sigmalunatereversedsymbol": "\u037B", "/sigmalunatesymbol": "\u03F2", "/sigmalunatesymbolgreek": "\u03F2", "/sihiragana": "\u3057", "/sikatakana": "\u30B7", "/sikatakanahalfwidth": "\uFF7C", "/silhouetteOfJapan": "\u1F5FE", "/siluqhebrew": "\u05BD", "/siluqlefthebrew": "\u05BD", "/similar": "\u223C", "/sinDot:hb": "\u05C2", "/sindothebrew": "\u05C2", "/sinewave": "\u223F", "/sinh:a": "\u0D85", "/sinh:aa": "\u0D86", "/sinh:aae": "\u0D88", "/sinh:aaesign": "\u0DD1", "/sinh:aasign": "\u0DCF", "/sinh:ae": "\u0D87", "/sinh:aesign": "\u0DD0", "/sinh:ai": "\u0D93", "/sinh:aisign": "\u0DDB", "/sinh:anusvara": "\u0D82", "/sinh:au": "\u0D96", "/sinh:ausign": "\u0DDE", "/sinh:ba": "\u0DB6", "/sinh:bha": "\u0DB7", "/sinh:ca": "\u0DA0", "/sinh:cha": "\u0DA1", "/sinh:da": "\u0DAF", "/sinh:dda": "\u0DA9", "/sinh:ddha": "\u0DAA", 
"/sinh:dha": "\u0DB0", "/sinh:e": "\u0D91", "/sinh:ee": "\u0D92", "/sinh:eesign": "\u0DDA", "/sinh:esign": "\u0DD9", "/sinh:fa": "\u0DC6", "/sinh:ga": "\u0D9C", "/sinh:gha": "\u0D9D", "/sinh:ha": "\u0DC4", "/sinh:i": "\u0D89", "/sinh:ii": "\u0D8A", "/sinh:iisign": "\u0DD3", "/sinh:isign": "\u0DD2", "/sinh:ja": "\u0DA2", "/sinh:jha": "\u0DA3", "/sinh:jnya": "\u0DA5", "/sinh:ka": "\u0D9A", "/sinh:kha": "\u0D9B", "/sinh:kunddaliya": "\u0DF4", "/sinh:la": "\u0DBD", "/sinh:litheight": "\u0DEE", "/sinh:lithfive": "\u0DEB", "/sinh:lithfour": "\u0DEA", "/sinh:lithnine": "\u0DEF", "/sinh:lithone": "\u0DE7", "/sinh:lithseven": "\u0DED", "/sinh:lithsix": "\u0DEC", "/sinh:liththree": "\u0DE9", "/sinh:lithtwo": "\u0DE8", "/sinh:lithzero": "\u0DE6", "/sinh:lla": "\u0DC5", "/sinh:llvocal": "\u0D90", "/sinh:llvocalsign": "\u0DF3", "/sinh:lvocal": "\u0D8F", "/sinh:lvocalsign": "\u0DDF", "/sinh:ma": "\u0DB8", "/sinh:mba": "\u0DB9", "/sinh:na": "\u0DB1", "/sinh:nda": "\u0DB3", "/sinh:nga": "\u0D9E", "/sinh:nna": "\u0DAB", "/sinh:nndda": "\u0DAC", "/sinh:nnga": "\u0D9F", "/sinh:nya": "\u0DA4", "/sinh:nyja": "\u0DA6", "/sinh:o": "\u0D94", "/sinh:oo": "\u0D95", "/sinh:oosign": "\u0DDD", "/sinh:osign": "\u0DDC", "/sinh:pa": "\u0DB4", "/sinh:pha": "\u0DB5", "/sinh:ra": "\u0DBB", "/sinh:rrvocal": "\u0D8E", "/sinh:rrvocalsign": "\u0DF2", "/sinh:rvocal": "\u0D8D", "/sinh:rvocalsign": "\u0DD8", "/sinh:sa": "\u0DC3", "/sinh:sha": "\u0DC1", "/sinh:ssa": "\u0DC2", "/sinh:ta": "\u0DAD", "/sinh:tha": "\u0DAE", "/sinh:tta": "\u0DA7", "/sinh:ttha": "\u0DA8", "/sinh:u": "\u0D8B", "/sinh:usign": "\u0DD4", "/sinh:uu": "\u0D8C", "/sinh:uusign": "\u0DD6", "/sinh:va": "\u0DC0", "/sinh:virama": "\u0DCA", "/sinh:visarga": "\u0D83", "/sinh:ya": "\u0DBA", "/sinologicaldot": "\uA78F", "/sinsular": "\uA785", "/siosacirclekorean": "\u3274", "/siosaparenkorean": "\u3214", "/sioscieuckorean": "\u317E", "/sioscirclekorean": "\u3266", "/sioskiyeokkorean": "\u317A", "/sioskorean": "\u3145", "/siosnieunkorean": 
"\u317B", "/siosparenkorean": "\u3206", "/siospieupkorean": "\u317D", "/siostikeutkorean": "\u317C", "/siringusquare": "\u3321", "/six": "\u0036", "/six.inferior": "\u2086", "/six.roman": "\u2165", "/six.romansmall": "\u2175", "/six.superior": "\u2076", "/sixPointedStarMiddleDot": "\u1F52F", "/sixarabic": "\u0666", "/sixbengali": "\u09EC", "/sixcircle": "\u2465", "/sixcircledbl": "\u24FA", "/sixcircleinversesansserif": "\u278F", "/sixcomma": "\u1F107", "/sixdeva": "\u096C", "/sixdotsvertical": "\u2E3D", "/sixfar": "\u06F6", "/sixgujarati": "\u0AEC", "/sixgurmukhi": "\u0A6C", "/sixhackarabic": "\u0666", "/sixhangzhou": "\u3026", "/sixideographiccircled": "\u3285", "/sixideographicparen": "\u3225", "/sixinferior": "\u2086", "/sixlateform.roman": "\u2185", "/sixmonospace": "\uFF16", "/sixoldstyle": "\uF736", "/sixparen": "\u2479", "/sixparenthesized": "\u2479", "/sixperemspace": "\u2006", "/sixperiod": "\u248D", "/sixpersian": "\u06F6", "/sixroman": "\u2175", "/sixsuperior": "\u2076", "/sixteencircle": "\u246F", "/sixteencircleblack": "\u24F0", "/sixteencurrencydenominatorbengali": "\u09F9", "/sixteenparen": "\u2483", "/sixteenparenthesized": "\u2483", "/sixteenperiod": "\u2497", "/sixthai": "\u0E56", "/sixtycirclesquare": "\u324D", "/sixtypsquare": "\u1F1A3", "/sjekomicyr": "\u050D", "/skiAndSkiBoot": "\u1F3BF", "/skier": "\u26F7", "/skull": "\u1F480", "/skullcrossbones": "\u2620", "/slash": "\u002F", "/slashbarfunc": "\u233F", "/slashmonospace": "\uFF0F", "/sled": "\u1F6F7", "/sleeping": "\u1F4A4", "/sleepingAccommodation": "\u1F6CC", "/sleepingFace": "\u1F634", "/sleepyFace": "\u1F62A", "/sleuthOrSpy": "\u1F575", "/sliceOfPizza": "\u1F355", "/slightlyFrowningFace": "\u1F641", "/slightlySmilingFace": "\u1F642", "/slong": "\u017F", "/slongdotaccent": "\u1E9B", "/slope": "\u2333", "/slotMachine": "\u1F3B0", "/smallAirplane": "\u1F6E9", "/smallBlueDiamond": "\u1F539", "/smallOrangeDiamond": "\u1F538", "/smallRedTriangleDOwn": "\u1F53D", "/smallRedTriangleUp": 
"\u1F53C", "/smile": "\u2323", "/smileface": "\u263A", "/smilingCatFaceWithHeartShapedEyes": "\u1F63B", "/smilingCatFaceWithOpenMouth": "\u1F63A", "/smilingFaceWithHalo": "\u1F607", "/smilingFaceWithHeartShapedEyes": "\u1F60D", "/smilingFaceWithHorns": "\u1F608", "/smilingFaceWithOpenMouth": "\u1F603", "/smilingFaceWithOpenMouthAndColdSweat": "\u1F605", "/smilingFaceWithOpenMouthAndSmilingEyes": "\u1F604", "/smilingFaceWithOpenMouthAndTightlyClosedEyes": "\u1F606", "/smilingFaceWithSmilingEyes": "\u1F60A", "/smilingFaceWithSunglasses": "\u1F60E", "/smilingfaceblack": "\u263B", "/smilingfacewhite": "\u263A", "/smirkingFace": "\u1F60F", "/smll:ampersand": "\uFE60", "/smll:asterisk": "\uFE61", "/smll:backslash": "\uFE68", "/smll:braceleft": "\uFE5B", "/smll:braceright": "\uFE5C", "/smll:colon": "\uFE55", "/smll:comma": "\uFE50", "/smll:dollar": "\uFE69", "/smll:emdash": "\uFE58", "/smll:equal": "\uFE66", "/smll:exclam": "\uFE57", "/smll:greater": "\uFE65", "/smll:hyphen": "\uFE63", "/smll:ideographiccomma": "\uFE51", "/smll:less": "\uFE64", "/smll:numbersign": "\uFE5F", "/smll:parenthesisleft": "\uFE59", "/smll:parenthesisright": "\uFE5A", "/smll:percent": "\uFE6A", "/smll:period": "\uFE52", "/smll:plus": "\uFE62", "/smll:question": "\uFE56", "/smll:semicolon": "\uFE54", "/smll:tortoiseshellbracketleft": "\uFE5D", "/smll:tortoiseshellbracketright": "\uFE5E", "/smoking": "\u1F6AC", "/smonospace": "\uFF53", "/snail": "\u1F40C", "/snake": "\u1F40D", "/snowboarder": "\u1F3C2", "/snowcappedMountain": "\u1F3D4", "/snowman": "\u2603", "/snowmanblack": "\u26C7", "/snowmanoutsnow": "\u26C4", "/sobliquestroke": "\uA7A9", "/soccerball": "\u26BD", "/societyideographiccircled": "\u3293", "/societyideographicparen": "\u3233", "/socirclekatakana": "\u32DE", "/sofPasuq:hb": "\u05C3", "/sofpasuqhebrew": "\u05C3", "/softIceCream": "\u1F366", "/softShellFloppyDisk": "\u1F5AC", "/softcyr": "\u044C", "/softhyphen": "\u00AD", "/softsigncyrillic": "\u044C", "/softwarefunction": "\u2394", 
"/sohiragana": "\u305D", "/sokatakana": "\u30BD", "/sokatakanahalfwidth": "\uFF7F", "/soliduslongoverlaycmb": "\u0338", "/solidusshortoverlaycmb": "\u0337", "/solidussubsetreversepreceding": "\u27C8", "/solidussupersetpreceding": "\u27C9", "/soonRightwardsArrowAbove": "\u1F51C", "/sorusithai": "\u0E29", "/sosalathai": "\u0E28", "/sosothai": "\u0E0B", "/sossquare": "\u1F198", "/sosuathai": "\u0E2A", "/soundcopyright": "\u2117", "/space": "\u0020", "/spacehackarabic": "\u0020", "/spade": "\u2660", "/spadeblack": "\u2660", "/spadesuitblack": "\u2660", "/spadesuitwhite": "\u2664", "/spadewhite": "\u2664", "/spaghetti": "\u1F35D", "/sparen": "\u24AE", "/sparenthesized": "\u24AE", "/sparklingHeart": "\u1F496", "/speakNoEvilMonkey": "\u1F64A", "/speaker": "\u1F508", "/speakerCancellationStroke": "\u1F507", "/speakerOneSoundWave": "\u1F509", "/speakerThreeSoundWaves": "\u1F50A", "/speakingHeadInSilhouette": "\u1F5E3", "/specialideographiccircled": "\u3295", "/specialideographicparen": "\u3235", "/speechBalloon": "\u1F4AC", "/speedboat": "\u1F6A4", "/spesmilo": "\u20B7", "/sphericalangle": "\u2222", "/spider": "\u1F577", "/spiderWeb": "\u1F578", "/spiralCalendarPad": "\u1F5D3", "/spiralNotePad": "\u1F5D2", "/spiralShell": "\u1F41A", "/splashingSweat": "\u1F4A6", "/sportsMedal": "\u1F3C5", "/spoutingWhale": "\u1F433", "/sppl:tildevertical": "\u2E2F", "/squarebelowcmb": "\u033B", "/squareblack": "\u25A0", "/squarebracketleftvertical": "\uFE47", "/squarebracketrightvertical": "\uFE48", "/squarecap": "\u2293", "/squarecc": "\u33C4", "/squarecm": "\u339D", "/squarecup": "\u2294", "/squareddotoperator": "\u22A1", "/squarediagonalcrosshatchfill": "\u25A9", "/squaredj": "\u1F190", "/squaredkey": "\u26BF", "/squaredminus": "\u229F", "/squaredplus": "\u229E", "/squaredsaltire": "\u26DD", "/squaredtimes": "\u22A0", "/squarefourcorners": "\u26F6", "/squarehalfleftblack": "\u25E7", "/squarehalfrightblack": "\u25E8", "/squarehorizontalfill": "\u25A4", "/squareimage": "\u228F", 
"/squareimageorequal": "\u2291", "/squareimageornotequal": "\u22E4", "/squarekg": "\u338F", "/squarekm": "\u339E", "/squarekmcapital": "\u33CE", "/squareln": "\u33D1", "/squarelog": "\u33D2", "/squarelowerdiagonalhalfrightblack": "\u25EA", "/squaremediumblack": "\u25FC", "/squaremediumwhite": "\u25FB", "/squaremg": "\u338E", "/squaremil": "\u33D5", "/squaremm": "\u339C", "/squaremsquared": "\u33A1", "/squareoriginal": "\u2290", "/squareoriginalorequal": "\u2292", "/squareoriginalornotequal": "\u22E5", "/squareorthogonalcrosshatchfill": "\u25A6", "/squareraised": "\u2E0B", "/squaresmallblack": "\u25AA", "/squaresmallmediumblack": "\u25FE", "/squaresmallmediumwhite": "\u25FD", "/squaresmallwhite": "\u25AB", "/squareupperdiagonalhalfleftblack": "\u25E9", "/squareupperlefttolowerrightfill": "\u25A7", "/squareupperrighttolowerleftfill": "\u25A8", "/squareverticalfill": "\u25A5", "/squarewhite": "\u25A1", "/squarewhitebisectinglinevertical": "\u25EB", "/squarewhitelowerquadrantleft": "\u25F1", "/squarewhitelowerquadrantright": "\u25F2", "/squarewhiteround": "\u25A2", "/squarewhiteupperquadrantleft": "\u25F0", "/squarewhiteupperquadrantright": "\u25F3", "/squarewhitewithsmallblack": "\u25A3", "/squarewhitewithsquaresmallblack": "\u25A3", "/squishquadfunc": "\u2337", "/srfullwidth": "\u33DB", "/srsquare": "\u33DB", "/ssabengali": "\u09B7", "/ssadeva": "\u0937", "/ssagujarati": "\u0AB7", "/ssangcieuckorean": "\u3149", "/ssanghieuhkorean": "\u3185", "/ssangieungkorean": "\u3180", "/ssangkiyeokkorean": "\u3132", "/ssangnieunkorean": "\u3165", "/ssangpieupkorean": "\u3143", "/ssangsioskorean": "\u3146", "/ssangtikeutkorean": "\u3138", "/ssuperior": "\uF6F2", "/ssupmod": "\u02E2", "/sswashtail": "\u023F", "/stackedcommadbl": "\u2E49", "/stadium": "\u1F3DF", "/staffofaesculapius": "\u2695", "/staffofhermes": "\u269A", "/stampedEnvelope": "\u1F583", "/star": "\u22C6", "/starblack": "\u2605", "/starcrescent": "\u262A", "/stardiaeresisfunc": "\u2363", "/starequals": "\u225B", 
"/staroperator": "\u22C6", "/staroutlinedwhite": "\u269D", "/starwhite": "\u2606", "/station": "\u1F689", "/statueOfLiberty": "\u1F5FD", "/steamLocomotive": "\u1F682", "/steamingBowl": "\u1F35C", "/stenographicfullstop": "\u2E3C", "/sterling": "\u00A3", "/sterlingmonospace": "\uFFE1", "/stigma": "\u03DB", "/stiletildefunc": "\u236D", "/stockChart": "\u1F5E0", "/stockideographiccircled": "\u3291", "/stockideographicparen": "\u3231", "/stopabove": "\u06EB", "/stopbelow": "\u06EA", "/straightRuler": "\u1F4CF", "/straightness": "\u23E4", "/strawberry": "\u1F353", "/stresslowtonemod": "\uA721", "/stresstonemod": "\uA720", "/strictlyequivalent": "\u2263", "/strokelongoverlaycmb": "\u0336", "/strokeshortoverlaycmb": "\u0335", "/studioMicrophone": "\u1F399", "/studyideographiccircled": "\u32AB", "/studyideographicparen": "\u323B", "/stupa": "\u1F6D3", "/subscriptalef": "\u0656", "/subset": "\u2282", "/subsetdbl": "\u22D0", "/subsetnotequal": "\u228A", "/subsetorequal": "\u2286", "/succeeds": "\u227B", "/succeedsbutnotequivalent": "\u22E9", "/succeedsorequal": "\u227D", "/succeedsorequivalent": "\u227F", "/succeedsunderrelation": "\u22B1", "/suchthat": "\u220B", "/sucirclekatakana": "\u32DC", "/suhiragana": "\u3059", "/suitableideographiccircled": "\u329C", "/sukatakana": "\u30B9", "/sukatakanahalfwidth": "\uFF7D", "/sukumendutvowel": "\uA9B9", "/sukunIsol": "\uFE7E", "/sukunMedi": "\uFE7F", "/sukunarabic": "\u0652", "/sukuvowel": "\uA9B8", "/summation": "\u2211", "/summationbottom": "\u23B3", "/summationdblstruck": "\u2140", "/summationtop": "\u23B2", "/sun": "\u263C", "/sunFace": "\u1F31E", "/sunbehindcloud": "\u26C5", "/sunflower": "\u1F33B", "/sunideographiccircled": "\u3290", "/sunideographicparen": "\u3230", "/sunraysblack": "\u2600", "/sunrayswhite": "\u263C", "/sunrise": "\u1F305", "/sunriseOverMountains": "\u1F304", "/sunsetOverBuildings": "\u1F307", "/superset": "\u2283", "/supersetnotequal": "\u228B", "/supersetorequal": "\u2287", "/superviseideographiccircled": 
"\u32AC", "/superviseideographicparen": "\u323C", "/surfer": "\u1F3C4", "/sushi": "\u1F363", "/suspensionRailway": "\u1F69F", "/suspensiondbl": "\u2E44", "/svfullwidth": "\u33DC", "/svsquare": "\u33DC", "/swatchtop": "\u23F1", "/swimmer": "\u1F3CA", "/swungdash": "\u2053", "/symbolabovethreedotsabove": "\uFBB6", "/symbolbelowthreedotsabove": "\uFBB7", "/symboldotabove": "\uFBB2", "/symboldotbelow": "\uFBB3", "/symboldoubleverticalbarbelow": "\uFBBC", "/symbolfourdotsabove": "\uFBBA", "/symbolfourdotsbelow": "\uFBBB", "/symbolpointingabovedownthreedotsabove": "\uFBB8", "/symbolpointingbelowdownthreedotsabove": "\uFBB9", "/symbolring": "\uFBBF", "/symboltahabovesmall": "\uFBC0", "/symboltahbelowsmall": "\uFBC1", "/symboltwodotsabove": "\uFBB4", "/symboltwodotsbelow": "\uFBB5", "/symboltwodotsverticallyabove": "\uFBBD", "/symboltwodotsverticallybelow": "\uFBBE", "/symmetry": "\u232F", "/synagogue": "\u1F54D", "/syouwaerasquare": "\u337C", "/syringe": "\u1F489", "/t": "\u0074", "/t-shirt": "\u1F455", "/t.inferior": "\u209C", "/tabengali": "\u09A4", "/tableTennisPaddleAndBall": "\u1F3D3", "/tacirclekatakana": "\u32DF", "/tackcircleaboveup": "\u27DF", "/tackdiaeresisupfunc": "\u2361", "/tackdown": "\u22A4", "/tackdownmod": "\u02D5", "/tackjotdownfunc": "\u234E", "/tackjotupfunc": "\u2355", "/tackleft": "\u22A3", "/tackleftright": "\u27DB", "/tackoverbarupfunc": "\u2351", "/tackright": "\u22A2", "/tackunderlinedownfunc": "\u234A", "/tackup": "\u22A5", "/tackupmod": "\u02D4", "/taco": "\u1F32E", "/tadeva": "\u0924", "/tagujarati": "\u0AA4", "/tagurmukhi": "\u0A24", "/tah": "\u0637", "/tah.fina": "\uFEC2", "/tah.init": "\uFEC3", "/tah.init_alefmaksura.fina": "\uFCF5", "/tah.init_hah.fina": "\uFC26", "/tah.init_hah.medi": "\uFCB8", "/tah.init_meem.fina": "\uFC27", "/tah.init_meem.medi": "\uFD33", "/tah.init_meem.medi_hah.medi": "\uFD72", "/tah.init_meem.medi_meem.medi": "\uFD73", "/tah.init_yeh.fina": "\uFCF6", "/tah.isol": "\uFEC1", "/tah.medi": "\uFEC4", 
"/tah.medi_alefmaksura.fina": "\uFD11", "/tah.medi_meem.medi": "\uFD3A", "/tah.medi_meem.medi_hah.fina": "\uFD71", "/tah.medi_meem.medi_yeh.fina": "\uFD74", "/tah.medi_yeh.fina": "\uFD12", "/tahabove": "\u0615", "/taharabic": "\u0637", "/tahfinalarabic": "\uFEC2", "/tahinitialarabic": "\uFEC3", "/tahiragana": "\u305F", "/tahmedialarabic": "\uFEC4", "/tahthreedotsabove": "\u069F", "/taisyouerasquare": "\u337D", "/takatakana": "\u30BF", "/takatakanahalfwidth": "\uFF80", "/takhallus": "\u0614", "/talingvowel": "\uA9BA", "/taml:a": "\u0B85", "/taml:aa": "\u0B86", "/taml:aasign": "\u0BBE", "/taml:ai": "\u0B90", "/taml:aisign": "\u0BC8", "/taml:anusvarasign": "\u0B82", "/taml:asabovesign": "\u0BF8", "/taml:au": "\u0B94", "/taml:aulengthmark": "\u0BD7", "/taml:ausign": "\u0BCC", "/taml:ca": "\u0B9A", "/taml:creditsign": "\u0BF7", "/taml:daysign": "\u0BF3", "/taml:debitsign": "\u0BF6", "/taml:e": "\u0B8E", "/taml:ee": "\u0B8F", "/taml:eesign": "\u0BC7", "/taml:eight": "\u0BEE", "/taml:esign": "\u0BC6", "/taml:five": "\u0BEB", "/taml:four": "\u0BEA", "/taml:ha": "\u0BB9", "/taml:i": "\u0B87", "/taml:ii": "\u0B88", "/taml:iisign": "\u0BC0", "/taml:isign": "\u0BBF", "/taml:ja": "\u0B9C", "/taml:ka": "\u0B95", "/taml:la": "\u0BB2", "/taml:lla": "\u0BB3", "/taml:llla": "\u0BB4", "/taml:ma": "\u0BAE", "/taml:monthsign": "\u0BF4", "/taml:na": "\u0BA8", "/taml:nga": "\u0B99", "/taml:nine": "\u0BEF", "/taml:nna": "\u0BA3", "/taml:nnna": "\u0BA9", "/taml:nya": "\u0B9E", "/taml:o": "\u0B92", "/taml:om": "\u0BD0", "/taml:one": "\u0BE7", "/taml:onehundred": "\u0BF1", "/taml:onethousand": "\u0BF2", "/taml:oo": "\u0B93", "/taml:oosign": "\u0BCB", "/taml:osign": "\u0BCA", "/taml:pa": "\u0BAA", "/taml:ra": "\u0BB0", "/taml:rra": "\u0BB1", "/taml:rupeesign": "\u0BF9", "/taml:sa": "\u0BB8", "/taml:seven": "\u0BED", "/taml:sha": "\u0BB6", "/taml:sign": "\u0BFA", "/taml:six": "\u0BEC", "/taml:ssa": "\u0BB7", "/taml:ta": "\u0BA4", "/taml:ten": "\u0BF0", "/taml:three": "\u0BE9", "/taml:tta": 
"\u0B9F", "/taml:two": "\u0BE8", "/taml:u": "\u0B89", "/taml:usign": "\u0BC1", "/taml:uu": "\u0B8A", "/taml:uusign": "\u0BC2", "/taml:va": "\u0BB5", "/taml:viramasign": "\u0BCD", "/taml:visargasign": "\u0B83", "/taml:ya": "\u0BAF", "/taml:yearsign": "\u0BF5", "/taml:zero": "\u0BE6", "/tamurda": "\uA9A1", "/tanabataTree": "\u1F38B", "/tangerine": "\u1F34A", "/tapeCartridge": "\u1F5AD", "/tarungvowel": "\uA9B4", "/tatweelFathatanAbove": "\uFE71", "/tatweelarabic": "\u0640", "/tau": "\u03C4", "/taurus": "\u2649", "/tav": "\u05EA", "/tav:hb": "\u05EA", "/tavdages": "\uFB4A", "/tavdagesh": "\uFB4A", "/tavdageshhebrew": "\uFB4A", "/tavhebrew": "\u05EA", "/tavwide:hb": "\uFB28", "/tavwithdagesh:hb": "\uFB4A", "/taxi": "\u1F695", "/tbar": "\u0167", "/tbopomofo": "\u310A", "/tcaron": "\u0165", "/tccurl": "\u02A8", "/tcedilla": "\u0163", "/tcheh": "\u0686", "/tcheh.fina": "\uFB7B", "/tcheh.init": "\uFB7C", "/tcheh.isol": "\uFB7A", "/tcheh.medi": "\uFB7D", "/tcheharabic": "\u0686", "/tchehdotabove": "\u06BF", "/tcheheh": "\u0687", "/tcheheh.fina": "\uFB7F", "/tcheheh.init": "\uFB80", "/tcheheh.isol": "\uFB7E", "/tcheheh.medi": "\uFB81", "/tchehfinalarabic": "\uFB7B", "/tchehinitialarabic": "\uFB7C", "/tchehmedialarabic": "\uFB7D", "/tchehmeeminitialarabic": "\uFB7C", "/tcircle": "\u24E3", "/tcircumflexbelow": "\u1E71", "/tcommaaccent": "\u0163", "/tcurl": "\u0236", "/tdieresis": "\u1E97", "/tdot": "\u1E6B", "/tdotaccent": "\u1E6B", "/tdotbelow": "\u1E6D", "/teacupOutHandle": "\u1F375", "/tear-offCalendar": "\u1F4C6", "/tecirclekatakana": "\u32E2", "/tecyr": "\u0442", "/tecyrillic": "\u0442", "/tedescendercyrillic": "\u04AD", "/teh": "\u062A", "/teh.fina": "\uFE96", "/teh.init": "\uFE97", "/teh.init_alefmaksura.fina": "\uFC0F", "/teh.init_hah.fina": "\uFC0C", "/teh.init_hah.medi": "\uFCA2", "/teh.init_hah.medi_jeem.medi": "\uFD52", "/teh.init_hah.medi_meem.medi": "\uFD53", "/teh.init_heh.medi": "\uFCA5", "/teh.init_jeem.fina": "\uFC0B", "/teh.init_jeem.medi": "\uFCA1", 
"/teh.init_jeem.medi_meem.medi": "\uFD50", "/teh.init_khah.fina": "\uFC0D", "/teh.init_khah.medi": "\uFCA3", "/teh.init_khah.medi_meem.medi": "\uFD54", "/teh.init_meem.fina": "\uFC0E", "/teh.init_meem.medi": "\uFCA4", "/teh.init_meem.medi_hah.medi": "\uFD56", "/teh.init_meem.medi_jeem.medi": "\uFD55", "/teh.init_meem.medi_khah.medi": "\uFD57", "/teh.init_yeh.fina": "\uFC10", "/teh.isol": "\uFE95", "/teh.medi": "\uFE98", "/teh.medi_alefmaksura.fina": "\uFC74", "/teh.medi_hah.medi_jeem.fina": "\uFD51", "/teh.medi_heh.medi": "\uFCE4", "/teh.medi_jeem.medi_alefmaksura.fina": "\uFDA0", "/teh.medi_jeem.medi_yeh.fina": "\uFD9F", "/teh.medi_khah.medi_alefmaksura.fina": "\uFDA2", "/teh.medi_khah.medi_yeh.fina": "\uFDA1", "/teh.medi_meem.fina": "\uFC72", "/teh.medi_meem.medi": "\uFCE3", "/teh.medi_meem.medi_alefmaksura.fina": "\uFDA4", "/teh.medi_meem.medi_yeh.fina": "\uFDA3", "/teh.medi_noon.fina": "\uFC73", "/teh.medi_reh.fina": "\uFC70", "/teh.medi_yeh.fina": "\uFC75", "/teh.medi_zain.fina": "\uFC71", "/teharabic": "\u062A", "/tehdownthreedotsabove": "\u067D", "/teheh": "\u067F", "/teheh.fina": "\uFB63", "/teheh.init": "\uFB64", "/teheh.isol": "\uFB62", "/teheh.medi": "\uFB65", "/tehfinalarabic": "\uFE96", "/tehhahinitialarabic": "\uFCA2", "/tehhahisolatedarabic": "\uFC0C", "/tehinitialarabic": "\uFE97", "/tehiragana": "\u3066", "/tehjeeminitialarabic": "\uFCA1", "/tehjeemisolatedarabic": "\uFC0B", "/tehmarbuta": "\u0629", "/tehmarbuta.fina": "\uFE94", "/tehmarbuta.isol": "\uFE93", "/tehmarbutaarabic": "\u0629", "/tehmarbutafinalarabic": "\uFE94", "/tehmarbutagoal": "\u06C3", "/tehmedialarabic": "\uFE98", "/tehmeeminitialarabic": "\uFCA4", "/tehmeemisolatedarabic": "\uFC0E", "/tehnoonfinalarabic": "\uFC73", "/tehring": "\u067C", "/tekatakana": "\u30C6", "/tekatakanahalfwidth": "\uFF83", "/telephone": "\u2121", "/telephoneOnTopOfModem": "\u1F580", "/telephoneReceiver": "\u1F4DE", "/telephoneReceiverPage": "\u1F57C", "/telephoneblack": "\u260E", "/telephonerecorder": 
"\u2315", "/telephonewhite": "\u260F", "/telescope": "\u1F52D", "/television": "\u1F4FA", "/telishaGedolah:hb": "\u05A0", "/telishaQetannah:hb": "\u05A9", "/telishagedolahebrew": "\u05A0", "/telishaqetanahebrew": "\u05A9", "/telu:a": "\u0C05", "/telu:aa": "\u0C06", "/telu:aasign": "\u0C3E", "/telu:ai": "\u0C10", "/telu:ailengthmark": "\u0C56", "/telu:aisign": "\u0C48", "/telu:anusvarasign": "\u0C02", "/telu:au": "\u0C14", "/telu:ausign": "\u0C4C", "/telu:avagrahasign": "\u0C3D", "/telu:ba": "\u0C2C", "/telu:bha": "\u0C2D", "/telu:bindusigncandra": "\u0C01", "/telu:ca": "\u0C1A", "/telu:cha": "\u0C1B", "/telu:combiningbinduabovesigncandra": "\u0C00", "/telu:da": "\u0C26", "/telu:dda": "\u0C21", "/telu:ddha": "\u0C22", "/telu:dha": "\u0C27", "/telu:dza": "\u0C59", "/telu:e": "\u0C0E", "/telu:ee": "\u0C0F", "/telu:eesign": "\u0C47", "/telu:eight": "\u0C6E", "/telu:esign": "\u0C46", "/telu:five": "\u0C6B", "/telu:four": "\u0C6A", "/telu:fractiononeforevenpowersoffour": "\u0C7C", "/telu:fractiononeforoddpowersoffour": "\u0C79", "/telu:fractionthreeforevenpowersoffour": "\u0C7E", "/telu:fractionthreeforoddpowersoffour": "\u0C7B", "/telu:fractiontwoforevenpowersoffour": "\u0C7D", "/telu:fractiontwoforoddpowersoffour": "\u0C7A", "/telu:fractionzeroforoddpowersoffour": "\u0C78", "/telu:ga": "\u0C17", "/telu:gha": "\u0C18", "/telu:ha": "\u0C39", "/telu:i": "\u0C07", "/telu:ii": "\u0C08", "/telu:iisign": "\u0C40", "/telu:isign": "\u0C3F", "/telu:ja": "\u0C1C", "/telu:jha": "\u0C1D", "/telu:ka": "\u0C15", "/telu:kha": "\u0C16", "/telu:la": "\u0C32", "/telu:lengthmark": "\u0C55", "/telu:lla": "\u0C33", "/telu:llla": "\u0C34", "/telu:llsignvocal": "\u0C63", "/telu:llvocal": "\u0C61", "/telu:lsignvocal": "\u0C62", "/telu:lvocal": "\u0C0C", "/telu:ma": "\u0C2E", "/telu:na": "\u0C28", "/telu:nga": "\u0C19", "/telu:nine": "\u0C6F", "/telu:nna": "\u0C23", "/telu:nya": "\u0C1E", "/telu:o": "\u0C12", "/telu:one": "\u0C67", "/telu:oo": "\u0C13", "/telu:oosign": "\u0C4B", "/telu:osign": 
"\u0C4A", "/telu:pa": "\u0C2A", "/telu:pha": "\u0C2B", "/telu:ra": "\u0C30", "/telu:rra": "\u0C31", "/telu:rrra": "\u0C5A", "/telu:rrsignvocal": "\u0C44", "/telu:rrvocal": "\u0C60", "/telu:rsignvocal": "\u0C43", "/telu:rvocal": "\u0C0B", "/telu:sa": "\u0C38", "/telu:seven": "\u0C6D", "/telu:sha": "\u0C36", "/telu:six": "\u0C6C", "/telu:ssa": "\u0C37", "/telu:ta": "\u0C24", "/telu:tha": "\u0C25", "/telu:three": "\u0C69", "/telu:tsa": "\u0C58", "/telu:tta": "\u0C1F", "/telu:ttha": "\u0C20", "/telu:tuumusign": "\u0C7F", "/telu:two": "\u0C68", "/telu:u": "\u0C09", "/telu:usign": "\u0C41", "/telu:uu": "\u0C0A", "/telu:uusign": "\u0C42", "/telu:va": "\u0C35", "/telu:viramasign": "\u0C4D", "/telu:visargasign": "\u0C03", "/telu:ya": "\u0C2F", "/telu:zero": "\u0C66", "/ten.roman": "\u2169", "/ten.romansmall": "\u2179", "/tencircle": "\u2469", "/tencircledbl": "\u24FE", "/tencirclesquare": "\u3248", "/tenge": "\u20B8", "/tenhangzhou": "\u3038", "/tenideographiccircled": "\u3289", "/tenideographicparen": "\u3229", "/tennisRacquetAndBall": "\u1F3BE", "/tenparen": "\u247D", "/tenparenthesized": "\u247D", "/tenperiod": "\u2491", "/tenroman": "\u2179", "/tent": "\u26FA", "/tenthousand.roman": "\u2182", "/tesh": "\u02A7", "/tet": "\u05D8", "/tet:hb": "\u05D8", "/tetailcyr": "\u04AD", "/tetdagesh": "\uFB38", "/tetdageshhebrew": "\uFB38", "/tethebrew": "\u05D8", "/tetrasememetrical": "\u23D8", "/tetsecyr": "\u04B5", "/tetsecyrillic": "\u04B5", "/tetwithdagesh:hb": "\uFB38", "/tevir:hb": "\u059B", "/tevirhebrew": "\u059B", "/tevirlefthebrew": "\u059B", "/thabengali": "\u09A5", "/thadeva": "\u0925", "/thagujarati": "\u0AA5", "/thagurmukhi": "\u0A25", "/thai:angkhankhu": "\u0E5A", "/thai:baht": "\u0E3F", "/thai:bobaimai": "\u0E1A", "/thai:chochan": "\u0E08", "/thai:chochang": "\u0E0A", "/thai:choching": "\u0E09", "/thai:chochoe": "\u0E0C", "/thai:dochada": "\u0E0E", "/thai:dodek": "\u0E14", "/thai:eight": "\u0E58", "/thai:five": "\u0E55", "/thai:fofa": "\u0E1D", "/thai:fofan": 
"\u0E1F", "/thai:fongman": "\u0E4F", "/thai:four": "\u0E54", "/thai:hohip": "\u0E2B", "/thai:honokhuk": "\u0E2E", "/thai:khokhai": "\u0E02", "/thai:khokhon": "\u0E05", "/thai:khokhuat": "\u0E03", "/thai:khokhwai": "\u0E04", "/thai:khomut": "\u0E5B", "/thai:khorakhang": "\u0E06", "/thai:kokai": "\u0E01", "/thai:lakkhangyao": "\u0E45", "/thai:lochula": "\u0E2C", "/thai:loling": "\u0E25", "/thai:lu": "\u0E26", "/thai:maichattawa": "\u0E4B", "/thai:maiek": "\u0E48", "/thai:maihan-akat": "\u0E31", "/thai:maitaikhu": "\u0E47", "/thai:maitho": "\u0E49", "/thai:maitri": "\u0E4A", "/thai:maiyamok": "\u0E46", "/thai:moma": "\u0E21", "/thai:ngongu": "\u0E07", "/thai:nikhahit": "\u0E4D", "/thai:nine": "\u0E59", "/thai:nonen": "\u0E13", "/thai:nonu": "\u0E19", "/thai:oang": "\u0E2D", "/thai:one": "\u0E51", "/thai:paiyannoi": "\u0E2F", "/thai:phinthu": "\u0E3A", "/thai:phophan": "\u0E1E", "/thai:phophung": "\u0E1C", "/thai:phosamphao": "\u0E20", "/thai:popla": "\u0E1B", "/thai:rorua": "\u0E23", "/thai:ru": "\u0E24", "/thai:saraa": "\u0E30", "/thai:saraaa": "\u0E32", "/thai:saraae": "\u0E41", "/thai:saraaimaimalai": "\u0E44", "/thai:saraaimaimuan": "\u0E43", "/thai:saraam": "\u0E33", "/thai:sarae": "\u0E40", "/thai:sarai": "\u0E34", "/thai:saraii": "\u0E35", "/thai:sarao": "\u0E42", "/thai:sarau": "\u0E38", "/thai:saraue": "\u0E36", "/thai:sarauee": "\u0E37", "/thai:sarauu": "\u0E39", "/thai:seven": "\u0E57", "/thai:six": "\u0E56", "/thai:sorusi": "\u0E29", "/thai:sosala": "\u0E28", "/thai:soso": "\u0E0B", "/thai:sosua": "\u0E2A", "/thai:thanthakhat": "\u0E4C", "/thai:thonangmontho": "\u0E11", "/thai:thophuthao": "\u0E12", "/thai:thothahan": "\u0E17", "/thai:thothan": "\u0E10", "/thai:thothong": "\u0E18", "/thai:thothung": "\u0E16", "/thai:three": "\u0E53", "/thai:topatak": "\u0E0F", "/thai:totao": "\u0E15", "/thai:two": "\u0E52", "/thai:wowaen": "\u0E27", "/thai:yamakkan": "\u0E4E", "/thai:yoyak": "\u0E22", "/thai:yoying": "\u0E0D", "/thai:zero": "\u0E50", "/thal": "\u0630", 
"/thal.fina": "\uFEAC", "/thal.init_superscriptalef.fina": "\uFC5B", "/thal.isol": "\uFEAB", "/thalarabic": "\u0630", "/thalfinalarabic": "\uFEAC", "/thanthakhatlowleftthai": "\uF898", "/thanthakhatlowrightthai": "\uF897", "/thanthakhatthai": "\u0E4C", "/thanthakhatupperleftthai": "\uF896", "/theh": "\u062B", "/theh.fina": "\uFE9A", "/theh.init": "\uFE9B", "/theh.init_alefmaksura.fina": "\uFC13", "/theh.init_jeem.fina": "\uFC11", "/theh.init_meem.fina": "\uFC12", "/theh.init_meem.medi": "\uFCA6", "/theh.init_yeh.fina": "\uFC14", "/theh.isol": "\uFE99", "/theh.medi": "\uFE9C", "/theh.medi_alefmaksura.fina": "\uFC7A", "/theh.medi_heh.medi": "\uFCE6", "/theh.medi_meem.fina": "\uFC78", "/theh.medi_meem.medi": "\uFCE5", "/theh.medi_noon.fina": "\uFC79", "/theh.medi_reh.fina": "\uFC76", "/theh.medi_yeh.fina": "\uFC7B", "/theh.medi_zain.fina": "\uFC77", "/theharabic": "\u062B", "/thehfinalarabic": "\uFE9A", "/thehinitialarabic": "\uFE9B", "/thehmedialarabic": "\uFE9C", "/thereexists": "\u2203", "/therefore": "\u2234", "/thermometer": "\u1F321", "/theta": "\u03B8", "/theta.math": "\u03D1", "/theta1": "\u03D1", "/thetasymbolgreek": "\u03D1", "/thieuthacirclekorean": "\u3279", "/thieuthaparenkorean": "\u3219", "/thieuthcirclekorean": "\u326B", "/thieuthkorean": "\u314C", "/thieuthparenkorean": "\u320B", "/thinspace": "\u2009", "/thirteencircle": "\u246C", "/thirteencircleblack": "\u24ED", "/thirteenparen": "\u2480", "/thirteenparenthesized": "\u2480", "/thirteenperiod": "\u2494", "/thirtycircle": "\u325A", "/thirtycirclesquare": "\u324A", "/thirtyeightcircle": "\u32B3", "/thirtyfivecircle": "\u325F", "/thirtyfourcircle": "\u325E", "/thirtyhangzhou": "\u303A", "/thirtyninecircle": "\u32B4", "/thirtyonecircle": "\u325B", "/thirtysevencircle": "\u32B2", "/thirtysixcircle": "\u32B1", "/thirtythreecircle": "\u325D", "/thirtytwocircle": "\u325C", "/thonangmonthothai": "\u0E11", "/thook": "\u01AD", "/thophuthaothai": "\u0E12", "/thorn": "\u00FE", "/thornstroke": "\uA765", 
"/thornstrokedescender": "\uA767", "/thothahanthai": "\u0E17", "/thothanthai": "\u0E10", "/thothongthai": "\u0E18", "/thothungthai": "\u0E16", "/thoughtBalloon": "\u1F4AD", "/thousandcyrillic": "\u0482", "/thousandscyr": "\u0482", "/thousandsseparator": "\u066C", "/thousandsseparatorarabic": "\u066C", "/thousandsseparatorpersian": "\u066C", "/three": "\u0033", "/three.inferior": "\u2083", "/three.roman": "\u2162", "/three.romansmall": "\u2172", "/threeButtonMouse": "\u1F5B1", "/threeNetworkedComputers": "\u1F5A7", "/threeRaysAbove": "\u1F5E4", "/threeRaysBelow": "\u1F5E5", "/threeRaysLeft": "\u1F5E6", "/threeRaysRight": "\u1F5E7", "/threeSpeechBubbles": "\u1F5EB", "/threearabic": "\u0663", "/threebengali": "\u09E9", "/threecircle": "\u2462", "/threecircledbl": "\u24F7", "/threecircleinversesansserif": "\u278C", "/threecomma": "\u1F104", "/threedeva": "\u0969", "/threedimensionalangle": "\u27C0", "/threedotpunctuation": "\u2056", "/threedotsaboveabove": "\u06DB", "/threedsquare": "\u1F19B", "/threeeighths": "\u215C", "/threefar": "\u06F3", "/threefifths": "\u2157", "/threegujarati": "\u0AE9", "/threegurmukhi": "\u0A69", "/threehackarabic": "\u0663", "/threehangzhou": "\u3023", "/threeideographiccircled": "\u3282", "/threeideographicparen": "\u3222", "/threeinferior": "\u2083", "/threelinesconvergingleft": "\u269F", "/threelinesconvergingright": "\u269E", "/threemonospace": "\uFF13", "/threenumeratorbengali": "\u09F6", "/threeoldstyle": "\uF733", "/threeparen": "\u2476", "/threeparenthesized": "\u2476", "/threeperemspace": "\u2004", "/threeperiod": "\u248A", "/threepersian": "\u06F3", "/threequarters": "\u00BE", "/threequartersemdash": "\uF6DE", "/threerightarrows": "\u21F6", "/threeroman": "\u2172", "/threesuperior": "\u00B3", "/threethai": "\u0E53", "/thumbsDownSign": "\u1F44E", "/thumbsUpSign": "\u1F44D", "/thundercloudrain": "\u26C8", "/thunderstorm": "\u2608", "/thzfullwidth": "\u3394", "/thzsquare": "\u3394", "/tibt:AA": "\u0F60", "/tibt:a": "\u0F68", 
"/tibt:aavowelsign": "\u0F71", "/tibt:angkhanggyasmark": "\u0F3D", "/tibt:angkhanggyonmark": "\u0F3C", "/tibt:astrologicalkhyudpasign": "\u0F18", "/tibt:astrologicalsdongtshugssign": "\u0F19", "/tibt:astrologicalsgragcancharrtagssign": "\u0F17", "/tibt:asubjoined": "\u0FB8", "/tibt:ba": "\u0F56", "/tibt:basubjoined": "\u0FA6", "/tibt:bha": "\u0F57", "/tibt:bhasubjoined": "\u0FA7", "/tibt:bkashogyigmgomark": "\u0F0A", "/tibt:brdarnyingyigmgomdunmainitialmark": "\u0FD3", "/tibt:brdarnyingyigmgosgabmaclosingmark": "\u0FD4", "/tibt:bsdusrtagsmark": "\u0F34", "/tibt:bskashoggimgorgyanmark": "\u0FD0", "/tibt:bskuryigmgomark": "\u0F09", "/tibt:ca": "\u0F45", "/tibt:cangteucantillationsign": "\u0FC2", "/tibt:caretdzudrtagsbzhimigcanmark": "\u0F36", "/tibt:caretdzudrtagsmelongcanmark": "\u0F13", "/tibt:caretyigmgophurshadmamark": "\u0F06", "/tibt:casubjoined": "\u0F95", "/tibt:cha": "\u0F46", "/tibt:chadrtagslogotypesign": "\u0F15", "/tibt:chasubjoined": "\u0F96", "/tibt:chemgomark": "\u0F38", "/tibt:da": "\u0F51", "/tibt:dasubjoined": "\u0FA1", "/tibt:dda": "\u0F4C", "/tibt:ddasubjoined": "\u0F9C", "/tibt:ddha": "\u0F4D", "/tibt:ddhasubjoined": "\u0F9D", "/tibt:delimitertshegbstarmark": "\u0F0C", "/tibt:dha": "\u0F52", "/tibt:dhasubjoined": "\u0FA2", "/tibt:drilbusymbol": "\u0FC4", "/tibt:dza": "\u0F5B", "/tibt:dzasubjoined": "\u0FAB", "/tibt:dzha": "\u0F5C", "/tibt:dzhasubjoined": "\u0FAC", "/tibt:eevowelsign": "\u0F7B", "/tibt:eight": "\u0F28", "/tibt:evowelsign": "\u0F7A", "/tibt:five": "\u0F25", "/tibt:four": "\u0F24", "/tibt:ga": "\u0F42", "/tibt:gasubjoined": "\u0F92", "/tibt:gha": "\u0F43", "/tibt:ghasubjoined": "\u0F93", "/tibt:grucanrgyingssign": "\u0F8A", "/tibt:grumedrgyingssign": "\u0F8B", "/tibt:gtertshegmark": "\u0F14", "/tibt:gteryigmgotruncatedamark": "\u0F01", "/tibt:gteryigmgoumgtertshegmamark": "\u0F03", "/tibt:gteryigmgoumrnambcadmamark": "\u0F02", "/tibt:gugrtagsgyasmark": "\u0F3B", "/tibt:gugrtagsgyonmark": "\u0F3A", "/tibt:ha": "\u0F67", 
"/tibt:halantamark": "\u0F84", "/tibt:halfeight": "\u0F31", "/tibt:halffive": "\u0F2E", "/tibt:halffour": "\u0F2D", "/tibt:halfnine": "\u0F32", "/tibt:halfone": "\u0F2A", "/tibt:halfseven": "\u0F30", "/tibt:halfsix": "\u0F2F", "/tibt:halfthree": "\u0F2C", "/tibt:halftwo": "\u0F2B", "/tibt:halfzero": "\u0F33", "/tibt:hasubjoined": "\u0FB7", "/tibt:heavybeatcantillationsign": "\u0FC0", "/tibt:iivowelsign": "\u0F73", "/tibt:intersyllabictshegmark": "\u0F0B", "/tibt:invertedmchucansign": "\u0F8C", "/tibt:invertedmchucansubjoinedsign": "\u0F8F", "/tibt:ivowelsign": "\u0F72", "/tibt:ja": "\u0F47", "/tibt:jasubjoined": "\u0F97", "/tibt:ka": "\u0F40", "/tibt:kasubjoined": "\u0F90", "/tibt:kha": "\u0F41", "/tibt:khasubjoined": "\u0F91", "/tibt:kka": "\u0F6B", "/tibt:kssa": "\u0F69", "/tibt:kssasubjoined": "\u0FB9", "/tibt:kurukha": "\u0FBE", "/tibt:kurukhabzhimigcan": "\u0FBF", "/tibt:la": "\u0F63", "/tibt:lasubjoined": "\u0FB3", "/tibt:lcetsacansign": "\u0F88", "/tibt:lcetsacansubjoinedsign": "\u0F8D", "/tibt:lcirtagssign": "\u0F86", "/tibt:leadingmchanrtagsmark": "\u0FD9", "/tibt:lhagrtagslogotypesign": "\u0F16", "/tibt:lightbeatcantillationsign": "\u0FC1", "/tibt:llvocalicvowelsign": "\u0F79", "/tibt:lvocalicvowelsign": "\u0F78", "/tibt:ma": "\u0F58", "/tibt:martshessign": "\u0F3F", "/tibt:masubjoined": "\u0FA8", "/tibt:mchucansign": "\u0F89", "/tibt:mchucansubjoinedsign": "\u0F8E", "/tibt:mnyamyiggimgorgyanmark": "\u0FD1", "/tibt:na": "\u0F53", "/tibt:nasubjoined": "\u0FA3", "/tibt:nga": "\u0F44", "/tibt:ngasbzungnyizlamark": "\u0F35", "/tibt:ngasbzungsgorrtagsmark": "\u0F37", "/tibt:ngasubjoined": "\u0F94", "/tibt:nine": "\u0F29", "/tibt:nna": "\u0F4E", "/tibt:nnasubjoined": "\u0F9E", "/tibt:norbubzhikhyilsymbol": "\u0FCC", "/tibt:norbugsumkhyilsymbol": "\u0FCB", "/tibt:norbunyiskhyilsymbol": "\u0FCA", "/tibt:norbusymbol": "\u0FC9", "/tibt:nya": "\u0F49", "/tibt:nyasubjoined": "\u0F99", "/tibt:nyisshadmark": "\u0F0E", "/tibt:nyistshegmark": "\u0FD2", 
"/tibt:nyistshegshadmark": "\u0F10", "/tibt:nyizlanaadasign": "\u0F82", "/tibt:omsyllable": "\u0F00", "/tibt:one": "\u0F21", "/tibt:oovowelsign": "\u0F7D", "/tibt:ovowelsign": "\u0F7C", "/tibt:pa": "\u0F54", "/tibt:padmagdansymbol": "\u0FC6", "/tibt:palutamark": "\u0F85", "/tibt:pasubjoined": "\u0FA4", "/tibt:pha": "\u0F55", "/tibt:phasubjoined": "\u0FA5", "/tibt:phurpasymbol": "\u0FC8", "/tibt:ra": "\u0F62", "/tibt:rafixed": "\u0F6A", "/tibt:rasubjoined": "\u0FB2", "/tibt:rasubjoinedfixed": "\u0FBC", "/tibt:rdeldkargcigsign": "\u0F1A", "/tibt:rdeldkargnyissign": "\u0F1B", "/tibt:rdeldkargsumsign": "\u0F1C", "/tibt:rdeldkarrdelnagsign": "\u0F1F", "/tibt:rdelnaggcigsign": "\u0F1D", "/tibt:rdelnaggnyissign": "\u0F1E", "/tibt:rdelnaggsumsign": "\u0FCF", "/tibt:rdelnagrdeldkarsign": "\u0FCE", "/tibt:rdorjergyagramsymbol": "\u0FC7", "/tibt:rdorjesymbol": "\u0FC5", "/tibt:reversediivowelsign": "\u0F81", "/tibt:reversedivowelsign": "\u0F80", "/tibt:rgyagramshadmark": "\u0F12", "/tibt:rinchenspungsshadmark": "\u0F11", "/tibt:rjessungarosign": "\u0F7E", "/tibt:rnambcadsign": "\u0F7F", "/tibt:rra": "\u0F6C", "/tibt:rrvocalicvowelsign": "\u0F77", "/tibt:rvocalicvowelsign": "\u0F76", "/tibt:sa": "\u0F66", "/tibt:sasubjoined": "\u0FB6", "/tibt:sbrulshadmark": "\u0F08", "/tibt:sbubchalcantillationsign": "\u0FC3", "/tibt:seven": "\u0F27", "/tibt:sha": "\u0F64", "/tibt:shadmark": "\u0F0D", "/tibt:shasubjoined": "\u0FB4", "/tibt:six": "\u0F26", "/tibt:snaldansign": "\u0F83", "/tibt:ssa": "\u0F65", "/tibt:ssasubjoined": "\u0FB5", "/tibt:subjoinedAA": "\u0FB0", "/tibt:svastileft": "\u0FD6", "/tibt:svastileftdot": "\u0FD8", "/tibt:svastiright": "\u0FD5", "/tibt:svastirightdot": "\u0FD7", "/tibt:ta": "\u0F4F", "/tibt:tasubjoined": "\u0F9F", "/tibt:tha": "\u0F50", "/tibt:thasubjoined": "\u0FA0", "/tibt:three": "\u0F23", "/tibt:trailingmchanrtagsmark": "\u0FDA", "/tibt:tsa": "\u0F59", "/tibt:tsaphrumark": "\u0F39", "/tibt:tsasubjoined": "\u0FA9", "/tibt:tsha": "\u0F5A", 
"/tibt:tshasubjoined": "\u0FAA", "/tibt:tshegshadmark": "\u0F0F", "/tibt:tta": "\u0F4A", "/tibt:ttasubjoined": "\u0F9A", "/tibt:ttha": "\u0F4B", "/tibt:tthasubjoined": "\u0F9B", "/tibt:two": "\u0F22", "/tibt:uuvowelsign": "\u0F75", "/tibt:uvowelsign": "\u0F74", "/tibt:wa": "\u0F5D", "/tibt:wasubjoined": "\u0FAD", "/tibt:wasubjoinedfixed": "\u0FBA", "/tibt:ya": "\u0F61", "/tibt:yangrtagssign": "\u0F87", "/tibt:yartshessign": "\u0F3E", "/tibt:yasubjoined": "\u0FB1", "/tibt:yasubjoinedfixed": "\u0FBB", "/tibt:yigmgomdunmainitialmark": "\u0F04", "/tibt:yigmgosgabmaclosingmark": "\u0F05", "/tibt:yigmgotshegshadmamark": "\u0F07", "/tibt:za": "\u0F5F", "/tibt:zasubjoined": "\u0FAF", "/tibt:zero": "\u0F20", "/tibt:zha": "\u0F5E", "/tibt:zhasubjoined": "\u0FAE", "/ticirclekatakana": "\u32E0", "/tickconvavediamondleftwhite": "\u27E2", "/tickconvavediamondrightwhite": "\u27E3", "/ticket": "\u1F3AB", "/tickleftwhitesquare": "\u27E4", "/tickrightwhitesquare": "\u27E5", "/tifcha:hb": "\u0596", "/tiger": "\u1F405", "/tigerFace": "\u1F42F", "/tihiragana": "\u3061", "/tikatakana": "\u30C1", "/tikatakanahalfwidth": "\uFF81", "/tikeutacirclekorean": "\u3270", "/tikeutaparenkorean": "\u3210", "/tikeutcirclekorean": "\u3262", "/tikeutkorean": "\u3137", "/tikeutparenkorean": "\u3202", "/tilde": "\u02DC", "/tildebelowcmb": "\u0330", "/tildecmb": "\u0303", "/tildecomb": "\u0303", "/tildediaeresisfunc": "\u2368", "/tildedotaccent": "\u2E1E", "/tildedotbelow": "\u2E1F", "/tildedoublecmb": "\u0360", "/tildeequalsreversed": "\u22CD", "/tildelowmod": "\u02F7", "/tildeoperator": "\u223C", "/tildeoverlaycmb": "\u0334", "/tildereversed": "\u223D", "/tildering": "\u2E1B", "/tildetpl": "\u224B", "/tildeverticalcmb": "\u033E", "/timerclock": "\u23F2", "/timescircle": "\u2297", "/tinsular": "\uA787", "/tipehahebrew": "\u0596", "/tipehalefthebrew": "\u0596", "/tippigurmukhi": "\u0A70", "/tiredFace": "\u1F62B", "/tironiansignet": "\u204A", "/tirtatumetespada": "\uA9DE", "/titlocmbcyr": "\u0483", 
"/titlocyrilliccmb": "\u0483", "/tiwnarmenian": "\u057F", "/tjekomicyr": "\u050F", "/tlinebelow": "\u1E6F", "/tmonospace": "\uFF54", "/toarmenian": "\u0569", "/tocirclekatakana": "\u32E3", "/tocornerarrowNW": "\u21F1", "/tocornerarrowSE": "\u21F2", "/tohiragana": "\u3068", "/toilet": "\u1F6BD", "/tokatakana": "\u30C8", "/tokatakanahalfwidth": "\uFF84", "/tokyoTower": "\u1F5FC", "/tolongvowel": "\uA9B5", "/tomato": "\u1F345", "/tonebarextrahighmod": "\u02E5", "/tonebarextralowmod": "\u02E9", "/tonebarhighmod": "\u02E6", "/tonebarlowmod": "\u02E8", "/tonebarmidmod": "\u02E7", "/tonefive": "\u01BD", "/tonehighbeginmod": "\u02F9", "/tonehighendmod": "\u02FA", "/tonelowbeginmod": "\u02FB", "/tonelowendmod": "\u02FC", "/tonesix": "\u0185", "/tonetwo": "\u01A8", "/tongue": "\u1F445", "/tonos": "\u0384", "/tonsquare": "\u3327", "/topHat": "\u1F3A9", "/topUpwardsArrowAbove": "\u1F51D", "/topatakthai": "\u0E0F", "/tortoiseshellbracketleft": "\u3014", "/tortoiseshellbracketleftsmall": "\uFE5D", "/tortoiseshellbracketleftvertical": "\uFE39", "/tortoiseshellbracketright": "\u3015", "/tortoiseshellbracketrightsmall": "\uFE5E", "/tortoiseshellbracketrightvertical": "\uFE3A", "/totalrunout": "\u2330", "/totaothai": "\u0E15", "/tpalatalhook": "\u01AB", "/tparen": "\u24AF", "/tparenthesized": "\u24AF", "/trackball": "\u1F5B2", "/tractor": "\u1F69C", "/trademark": "\u2122", "/trademarksans": "\uF8EA", "/trademarkserif": "\uF6DB", "/train": "\u1F686", "/tram": "\u1F68A", "/tramCar": "\u1F68B", "/trapeziumwhite": "\u23E2", "/tresillo": "\uA72B", "/tretroflex": "\u0288", "/tretroflexhook": "\u0288", "/triagdn": "\u25BC", "/triaglf": "\u25C4", "/triagrt": "\u25BA", "/triagup": "\u25B2", "/triangleWithRoundedCorners": "\u1F6C6", "/triangledotupwhite": "\u25EC", "/triangledownblack": "\u25BC", "/triangledownsmallblack": "\u25BE", "/triangledownsmallwhite": "\u25BF", "/triangledownwhite": "\u25BD", "/trianglehalfupleftblack": "\u25ED", "/trianglehalfuprightblack": "\u25EE", 
"/triangleleftblack": "\u25C0", "/triangleleftsmallblack": "\u25C2", "/triangleleftsmallwhite": "\u25C3", "/triangleleftwhite": "\u25C1", "/triangleright": "\u22BF", "/trianglerightblack": "\u25B6", "/trianglerightsmallblack": "\u25B8", "/trianglerightsmallwhite": "\u25B9", "/trianglerightwhite": "\u25B7", "/triangleupblack": "\u25B2", "/triangleupsmallblack": "\u25B4", "/triangleupsmallwhite": "\u25B5", "/triangleupwhite": "\u25B3", "/triangularFlagOnPost": "\u1F6A9", "/triangularRuler": "\u1F4D0", "/triangularbullet": "\u2023", "/tricolon": "\u205D", "/tricontainingtriwhiteanglesmall": "\u27C1", "/tridentEmblem": "\u1F531", "/trigramearth": "\u2637", "/trigramfire": "\u2632", "/trigramheaven": "\u2630", "/trigramlake": "\u2631", "/trigrammountain": "\u2636", "/trigramthunder": "\u2633", "/trigramwater": "\u2635", "/trigramwind": "\u2634", "/triplearrowleft": "\u21DA", "/triplearrowright": "\u21DB", "/tripledot": "\u061E", "/trisememetrical": "\u23D7", "/trns:baby": "\u1F6BC", "/trolleybus": "\u1F68E", "/trophy": "\u1F3C6", "/tropicalDrink": "\u1F379", "/tropicalFish": "\u1F420", "/truckblack": "\u26DF", "/true": "\u22A8", "/trumpet": "\u1F3BA", "/ts": "\u02A6", "/tsadi": "\u05E6", "/tsadi:hb": "\u05E6", "/tsadidagesh": "\uFB46", "/tsadidageshhebrew": "\uFB46", "/tsadihebrew": "\u05E6", "/tsadiwithdagesh:hb": "\uFB46", "/tsecyr": "\u0446", "/tsecyrillic": "\u0446", "/tsere": "\u05B5", "/tsere12": "\u05B5", "/tsere1e": "\u05B5", "/tsere2b": "\u05B5", "/tsere:hb": "\u05B5", "/tserehebrew": "\u05B5", "/tserenarrowhebrew": "\u05B5", "/tserequarterhebrew": "\u05B5", "/tserewidehebrew": "\u05B5", "/tshecyr": "\u045B", "/tshecyrillic": "\u045B", "/tsinnorit:hb": "\u05AE", "/tstroke": "\u2C66", "/tsuperior": "\uF6F3", "/ttabengali": "\u099F", "/ttadeva": "\u091F", "/ttagujarati": "\u0A9F", "/ttagurmukhi": "\u0A1F", "/ttamahaprana": "\uA99C", "/tteh": "\u0679", "/tteh.fina": "\uFB67", "/tteh.init": "\uFB68", "/tteh.isol": "\uFB66", "/tteh.medi": "\uFB69", "/tteharabic": 
"\u0679", "/tteheh": "\u067A", "/tteheh.fina": "\uFB5F", "/tteheh.init": "\uFB60", "/tteheh.isol": "\uFB5E", "/tteheh.medi": "\uFB61", "/ttehfinalarabic": "\uFB67", "/ttehinitialarabic": "\uFB68", "/ttehmedialarabic": "\uFB69", "/tthabengali": "\u09A0", "/tthadeva": "\u0920", "/tthagujarati": "\u0AA0", "/tthagurmukhi": "\u0A20", "/tturned": "\u0287", "/tucirclekatakana": "\u32E1", "/tugrik": "\u20AE", "/tuhiragana": "\u3064", "/tukatakana": "\u30C4", "/tukatakanahalfwidth": "\uFF82", "/tulip": "\u1F337", "/tum": "\uA777", "/turkishlira": "\u20BA", "/turnedOkHandSign": "\u1F58F", "/turnedcomma": "\u2E32", "/turneddagger": "\u2E38", "/turneddigitthree": "\u218B", "/turneddigittwo": "\u218A", "/turnedpiselehpada": "\uA9CD", "/turnedsemicolon": "\u2E35", "/turnedshogipieceblack": "\u26CA", "/turnedshogipiecewhite": "\u26C9", "/turnstiledblverticalbarright": "\u22AB", "/turnstileleftrightdbl": "\u27DA", "/turnstiletplverticalbarright": "\u22AA", "/turtle": "\u1F422", "/tusmallhiragana": "\u3063", "/tusmallkatakana": "\u30C3", "/tusmallkatakanahalfwidth": "\uFF6F", "/twelve.roman": "\u216B", "/twelve.romansmall": "\u217B", "/twelvecircle": "\u246B", "/twelvecircleblack": "\u24EC", "/twelveparen": "\u247F", "/twelveparenthesized": "\u247F", "/twelveperiod": "\u2493", "/twelveroman": "\u217B", "/twenty-twopointtwosquare": "\u1F1A2", "/twentycircle": "\u2473", "/twentycircleblack": "\u24F4", "/twentycirclesquare": "\u3249", "/twentyeightcircle": "\u3258", "/twentyfivecircle": "\u3255", "/twentyfourcircle": "\u3254", "/twentyhangzhou": "\u5344", "/twentyninecircle": "\u3259", "/twentyonecircle": "\u3251", "/twentyparen": "\u2487", "/twentyparenthesized": "\u2487", "/twentyperiod": "\u249B", "/twentysevencircle": "\u3257", "/twentysixcircle": "\u3256", "/twentythreecircle": "\u3253", "/twentytwocircle": "\u3252", "/twistedRightwardsArrows": "\u1F500", "/two": "\u0032", "/two.inferior": "\u2082", "/two.roman": "\u2161", "/two.romansmall": "\u2171", "/twoButtonMouse": 
"\u1F5B0", "/twoHearts": "\u1F495", "/twoMenHoldingHands": "\u1F46C", "/twoSpeechBubbles": "\u1F5EA", "/twoWomenHoldingHands": "\u1F46D", "/twoarabic": "\u0662", "/twoasterisksalignedvertically": "\u2051", "/twobengali": "\u09E8", "/twocircle": "\u2461", "/twocircledbl": "\u24F6", "/twocircleinversesansserif": "\u278B", "/twocomma": "\u1F103", "/twodeva": "\u0968", "/twodotenleader": "\u2025", "/twodotleader": "\u2025", "/twodotleadervertical": "\uFE30", "/twodotpunctuation": "\u205A", "/twodotsoveronedot": "\u2E2A", "/twofar": "\u06F2", "/twofifths": "\u2156", "/twogujarati": "\u0AE8", "/twogurmukhi": "\u0A68", "/twohackarabic": "\u0662", "/twohangzhou": "\u3022", "/twoideographiccircled": "\u3281", "/twoideographicparen": "\u3221", "/twoinferior": "\u2082", "/twoksquare": "\u1F19D", "/twomonospace": "\uFF12", "/twonumeratorbengali": "\u09F5", "/twooldstyle": "\uF732", "/twoparen": "\u2475", "/twoparenthesized": "\u2475", "/twoperiod": "\u2489", "/twopersian": "\u06F2", "/tworoman": "\u2171", "/twoshortsjoinedmetrical": "\u23D6", "/twoshortsoverlongmetrical": "\u23D5", "/twostroke": "\u01BB", "/twosuperior": "\u00B2", "/twothai": "\u0E52", "/twothirds": "\u2154", "/twowayleftwaytrafficblack": "\u26D6", "/twowayleftwaytrafficwhite": "\u26D7", "/tz": "\uA729", "/u": "\u0075", "/u.fina": "\uFBD8", "/u.isol": "\uFBD7", "/uacute": "\u00FA", "/uacutedblcyr": "\u04F3", "/ubar": "\u0289", "/ubengali": "\u0989", "/ubopomofo": "\u3128", "/ubracketleft": "\u2E26", "/ubracketright": "\u2E27", "/ubreve": "\u016D", "/ucaron": "\u01D4", "/ucircle": "\u24E4", "/ucirclekatakana": "\u32D2", "/ucircumflex": "\u00FB", "/ucircumflexbelow": "\u1E77", "/ucyr": "\u0443", "/ucyrillic": "\u0443", "/udattadeva": "\u0951", "/udblacute": "\u0171", "/udblgrave": "\u0215", "/udeva": "\u0909", "/udieresis": "\u00FC", "/udieresisacute": "\u01D8", "/udieresisbelow": "\u1E73", "/udieresiscaron": "\u01DA", "/udieresiscyr": "\u04F1", "/udieresiscyrillic": "\u04F1", "/udieresisgrave": "\u01DC", 
"/udieresismacron": "\u01D6", "/udotbelow": "\u1EE5", "/ugrave": "\u00F9", "/ugravedbl": "\u0215", "/ugujarati": "\u0A89", "/ugurmukhi": "\u0A09", "/uhamza": "\u0677", "/uhamza.isol": "\uFBDD", "/uhdsquare": "\u1F1AB", "/uhiragana": "\u3046", "/uhoi": "\u1EE7", "/uhookabove": "\u1EE7", "/uhorn": "\u01B0", "/uhornacute": "\u1EE9", "/uhorndotbelow": "\u1EF1", "/uhorngrave": "\u1EEB", "/uhornhoi": "\u1EED", "/uhornhookabove": "\u1EED", "/uhorntilde": "\u1EEF", "/uhungarumlaut": "\u0171", "/uhungarumlautcyrillic": "\u04F3", "/uighurkazakhkirghizalefmaksura.init": "\uFBE8", "/uighurkazakhkirghizalefmaksura.medi": "\uFBE9", "/uighurkirghizyeh.init_hamzaabove.medi_alefmaksura.fina": "\uFBF9", "/uighurkirghizyeh.init_hamzaabove.medi_alefmaksura.medi": "\uFBFB", "/uighurkirghizyeh.medi_hamzaabove.medi_alefmaksura.fina": "\uFBFA", "/uinvertedbreve": "\u0217", "/ukatakana": "\u30A6", "/ukatakanahalfwidth": "\uFF73", "/ukcyr": "\u0479", "/ukcyrillic": "\u0479", "/ukorean": "\u315C", "/um": "\uA778", "/umacron": "\u016B", "/umacroncyr": "\u04EF", "/umacroncyrillic": "\u04EF", "/umacrondieresis": "\u1E7B", "/umatragurmukhi": "\u0A41", "/umbrella": "\u2602", "/umbrellaonground": "\u26F1", "/umbrellaraindrops": "\u2614", "/umonospace": "\uFF55", "/unamusedFace": "\u1F612", "/unaspiratedmod": "\u02ED", "/underscore": "\u005F", "/underscorecenterline": "\uFE4E", "/underscoredashed": "\uFE4D", "/underscoredbl": "\u2017", "/underscoremonospace": "\uFF3F", "/underscorevertical": "\uFE33", "/underscorewavy": "\uFE4F", "/underscorewavyvertical": "\uFE34", "/undertie": "\u203F", "/undo": "\u238C", "/union": "\u222A", "/unionarray": "\u22C3", "/uniondbl": "\u22D3", "/universal": "\u2200", "/unmarriedpartnership": "\u26AF", "/uogonek": "\u0173", "/uonsquare": "\u3306", "/upPointingAirplane": "\u1F6E7", "/upPointingMilitaryAirplane": "\u1F6E6", "/upPointingSmallAirplane": "\u1F6E8", "/uparen": "\u24B0", "/uparenthesized": "\u24B0", "/uparrowleftofdownarrow": "\u21C5", "/upblock": "\u2580", 
"/updblhorzsng": "\u2568", "/updblleftsng": "\u255C", "/updblrightsng": "\u2559", "/upheavydnhorzlight": "\u2540", "/upheavyhorzlight": "\u2538", "/upheavyleftdnlight": "\u2526", "/upheavyleftlight": "\u251A", "/upheavyrightdnlight": "\u251E", "/upheavyrightlight": "\u2516", "/uplightdnhorzheavy": "\u2548", "/uplighthorzheavy": "\u2537", "/uplightleftdnheavy": "\u252A", "/uplightleftheavy": "\u2519", "/uplightrightdnheavy": "\u2522", "/uplightrightheavy": "\u2515", "/upperHalfBlock": "\u2580", "/upperOneEighthBlock": "\u2594", "/upperRightShadowedWhiteCircle": "\u1F53F", "/upperdothebrew": "\u05C4", "/upperhalfcircle": "\u25E0", "/upperhalfcircleinversewhite": "\u25DA", "/upperquadrantcirculararcleft": "\u25DC", "/upperquadrantcirculararcright": "\u25DD", "/uppertriangleleft": "\u25F8", "/uppertriangleleftblack": "\u25E4", "/uppertriangleright": "\u25F9", "/uppertrianglerightblack": "\u25E5", "/upsideDownFace": "\u1F643", "/upsilon": "\u03C5", "/upsilonacute": "\u1F7B", "/upsilonasper": "\u1F51", "/upsilonasperacute": "\u1F55", "/upsilonaspergrave": "\u1F53", "/upsilonaspertilde": "\u1F57", "/upsilonbreve": "\u1FE0", "/upsilondieresis": "\u03CB", "/upsilondieresisacute": "\u1FE3", "/upsilondieresisgrave": "\u1FE2", "/upsilondieresistilde": "\u1FE7", "/upsilondieresistonos": "\u03B0", "/upsilongrave": "\u1F7A", "/upsilonlatin": "\u028A", "/upsilonlenis": "\u1F50", "/upsilonlenisacute": "\u1F54", "/upsilonlenisgrave": "\u1F52", "/upsilonlenistilde": "\u1F56", "/upsilontilde": "\u1FE6", "/upsilontonos": "\u03CD", "/upsilonwithmacron": "\u1FE1", "/upsnghorzdbl": "\u2567", "/upsngleftdbl": "\u255B", "/upsngrightdbl": "\u2558", "/uptackbelowcmb": "\u031D", "/uptackmod": "\u02D4", "/upwithexclamationmarksquare": "\u1F199", "/uragurmukhi": "\u0A73", "/uranus": "\u2645", "/uring": "\u016F", "/ushortcyr": "\u045E", "/ushortcyrillic": "\u045E", "/usmallhiragana": "\u3045", "/usmallkatakana": "\u30A5", "/usmallkatakanahalfwidth": "\uFF69", "/usmod": "\uA770", "/ustraightcyr": 
"\u04AF", "/ustraightcyrillic": "\u04AF", "/ustraightstrokecyr": "\u04B1", "/ustraightstrokecyrillic": "\u04B1", "/utilde": "\u0169", "/utildeacute": "\u1E79", "/utildebelow": "\u1E75", "/uubengali": "\u098A", "/uudeva": "\u090A", "/uugujarati": "\u0A8A", "/uugurmukhi": "\u0A0A", "/uumatragurmukhi": "\u0A42", "/uuvowelsignbengali": "\u09C2", "/uuvowelsigndeva": "\u0942", "/uuvowelsigngujarati": "\u0AC2", "/uvowelsignbengali": "\u09C1", "/uvowelsigndeva": "\u0941", "/uvowelsigngujarati": "\u0AC1", "/v": "\u0076", "/vadeva": "\u0935", "/vagujarati": "\u0AB5", "/vagurmukhi": "\u0A35", "/vakatakana": "\u30F7", "/vanedownfunc": "\u2356", "/vaneleftfunc": "\u2345", "/vanerightfunc": "\u2346", "/vaneupfunc": "\u234F", "/varikajudeospanish:hb": "\uFB1E", "/vav": "\u05D5", "/vav:hb": "\u05D5", "/vav_vav:hb": "\u05F0", "/vav_yod:hb": "\u05F1", "/vavdagesh": "\uFB35", "/vavdagesh65": "\uFB35", "/vavdageshhebrew": "\uFB35", "/vavhebrew": "\u05D5", "/vavholam": "\uFB4B", "/vavholamhebrew": "\uFB4B", "/vavvavhebrew": "\u05F0", "/vavwithdagesh:hb": "\uFB35", "/vavwithholam:hb": "\uFB4B", "/vavyodhebrew": "\u05F1", "/vcircle": "\u24E5", "/vcurl": "\u2C74", "/vdiagonalstroke": "\uA75F", "/vdotbelow": "\u1E7F", "/ve.fina": "\uFBDF", "/ve.isol": "\uFBDE", "/ve:abovetonecandra": "\u1CF4", "/ve:anusvaraantargomukhasign": "\u1CE9", "/ve:anusvarabahirgomukhasign": "\u1CEA", "/ve:anusvarasignlong": "\u1CEF", "/ve:anusvaraubhayatomukhasign": "\u1CF1", "/ve:anusvaravamagomukhasign": "\u1CEB", "/ve:anusvaravamagomukhawithtailsign": "\u1CEC", "/ve:ardhavisargasign": "\u1CF2", "/ve:atharvaindependentsvaritatone": "\u1CE1", "/ve:atikramasign": "\u1CF7", "/ve:belowtonecandra": "\u1CD8", "/ve:dotbelowtone": "\u1CDD", "/ve:hexiformanusvarasignlong": "\u1CEE", "/ve:jihvamuliyasign": "\u1CF5", "/ve:karshanatone": "\u1CD0", "/ve:kathakaanudattatone": "\u1CDC", "/ve:nihshvasasign": "\u1CD3", "/ve:prenkhatone": "\u1CD2", "/ve:rigkashmiriindependentsvaritatone": "\u1CE0", "/ve:ringabovetone": "\u1CF8", 
"/ve:ringabovetonedbl": "\u1CF9", "/ve:rotatedardhavisargasign": "\u1CF3", "/ve:rthanganusvarasignlong": "\u1CF0", "/ve:sharatone": "\u1CD1", "/ve:svaritatonedbl": "\u1CDA", "/ve:svaritatonetpl": "\u1CDB", "/ve:threedotsbelowtone": "\u1CDF", "/ve:tiryaksign": "\u1CED", "/ve:twodotsbelowtone": "\u1CDE", "/ve:upadhmaniyasign": "\u1CF6", "/ve:visargaanudattasign": "\u1CE5", "/ve:visargaanudattasignreversed": "\u1CE6", "/ve:visargaanudattawithtailsign": "\u1CE8", "/ve:visargasvaritasign": "\u1CE2", "/ve:visargaudattasign": "\u1CE3", "/ve:visargaudattasignreversed": "\u1CE4", "/ve:visargaudattawithtailsign": "\u1CE7", "/ve:yajuraggravatedindependentsvaritatone": "\u1CD5", "/ve:yajurindependentsvaritatone": "\u1CD6", "/ve:yajurkathakaindependentsvaritaschroedertone": "\u1CD9", "/ve:yajurkathakaindependentsvaritatone": "\u1CD7", "/ve:yajurmidlinesvaritasign": "\u1CD4", "/vecyr": "\u0432", "/vecyrillic": "\u0432", "/veh": "\u06A4", "/veh.fina": "\uFB6B", "/veh.init": "\uFB6C", "/veh.isol": "\uFB6A", "/veh.medi": "\uFB6D", "/veharabic": "\u06A4", "/vehfinalarabic": "\uFB6B", "/vehinitialarabic": "\uFB6C", "/vehmedialarabic": "\uFB6D", "/vekatakana": "\u30F9", "/vend": "\uA769", "/venus": "\u2640", "/versicle": "\u2123", "/vert:bracketwhiteleft": "\uFE17", "/vert:brakcetwhiteright": "\uFE18", "/vert:colon": "\uFE13", "/vert:comma": "\uFE10", "/vert:ellipsishor": "\uFE19", "/vert:exclam": "\uFE15", "/vert:ideographiccomma": "\uFE11", "/vert:ideographicfullstop": "\uFE12", "/vert:question": "\uFE16", "/vert:semicolon": "\uFE14", "/vertdblhorzsng": "\u256B", "/vertdblleftsng": "\u2562", "/vertdblrightsng": "\u255F", "/vertheavyhorzlight": "\u2542", "/vertheavyleftlight": "\u2528", "/vertheavyrightlight": "\u2520", "/verticalTrafficLight": "\u1F6A6", "/verticalbar": "\u007C", "/verticalbardbl": "\u2016", "/verticalbarhorizontalstroke": "\u27CA", "/verticalbarwhitearrowonpedestalup": "\u21ED", "/verticalfourdots": "\u205E", "/verticalideographiciterationmark": "\u303B", 
"/verticalkanarepeatmark": "\u3031", "/verticalkanarepeatmarklowerhalf": "\u3035", "/verticalkanarepeatmarkupperhalf": "\u3033", "/verticalkanarepeatwithvoicedsoundmark": "\u3032", "/verticalkanarepeatwithvoicedsoundmarkupperhalf": "\u3034", "/verticallineabovecmb": "\u030D", "/verticallinebelowcmb": "\u0329", "/verticallinelowmod": "\u02CC", "/verticallinemod": "\u02C8", "/verticalmalestroke": "\u26A8", "/verticalsdbltrokearrowleft": "\u21FA", "/verticalsdbltrokearrowleftright": "\u21FC", "/verticalsdbltrokearrowright": "\u21FB", "/verticalstrokearrowleft": "\u21F7", "/verticalstrokearrowleftright": "\u21F9", "/verticalstrokearrowright": "\u21F8", "/vertlighthorzheavy": "\u253F", "/vertlightleftheavy": "\u2525", "/vertlightrightheavy": "\u251D", "/vertsnghorzdbl": "\u256A", "/vertsngleftdbl": "\u2561", "/vertsngrightdbl": "\u255E", "/verymuchgreater": "\u22D9", "/verymuchless": "\u22D8", "/vesta": "\u26B6", "/vewarmenian": "\u057E", "/vhook": "\u028B", "/vibrationMode": "\u1F4F3", "/videoCamera": "\u1F4F9", "/videoGame": "\u1F3AE", "/videocassette": "\u1F4FC", "/viewdatasquare": "\u2317", "/vikatakana": "\u30F8", "/violin": "\u1F3BB", "/viramabengali": "\u09CD", "/viramadeva": "\u094D", "/viramagujarati": "\u0ACD", "/virgo": "\u264D", "/visargabengali": "\u0983", "/visargadeva": "\u0903", "/visargagujarati": "\u0A83", "/visigothicz": "\uA763", "/vmonospace": "\uFF56", "/voarmenian": "\u0578", "/vodsquare": "\u1F1AC", "/voicediterationhiragana": "\u309E", "/voicediterationkatakana": "\u30FE", "/voicedmarkkana": "\u309B", "/voicedmarkkanahalfwidth": "\uFF9E", "/voicingmod": "\u02EC", "/vokatakana": "\u30FA", "/volapukae": "\uA79B", "/volapukoe": "\uA79D", "/volapukue": "\uA79F", "/volcano": "\u1F30B", "/volleyball": "\u1F3D0", "/vovermfullwidth": "\u33DE", "/vowelVabove": "\u065A", "/voweldotbelow": "\u065C", "/vowelinvertedVabove": "\u065B", "/vparen": "\u24B1", "/vparenthesized": "\u24B1", "/vrighthook": "\u2C71", "/vssquare": "\u1F19A", "/vtilde": "\u1E7D", 
"/vturned": "\u028C", "/vuhiragana": "\u3094", "/vukatakana": "\u30F4", "/vwelsh": "\u1EFD", "/vy": "\uA761", "/w": "\u0077", "/wacirclekatakana": "\u32FB", "/wacute": "\u1E83", "/waekorean": "\u3159", "/wahiragana": "\u308F", "/wakatakana": "\u30EF", "/wakatakanahalfwidth": "\uFF9C", "/wakorean": "\u3158", "/waningCrescentMoon": "\u1F318", "/waningGibbousMoon": "\u1F316", "/warning": "\u26A0", "/wasmallhiragana": "\u308E", "/wasmallkatakana": "\u30EE", "/wastebasket": "\u1F5D1", "/watch": "\u231A", "/waterBuffalo": "\u1F403", "/waterCloset": "\u1F6BE", "/waterWave": "\u1F30A", "/waterideographiccircled": "\u328C", "/waterideographicparen": "\u322C", "/watermelon": "\u1F349", "/wattosquare": "\u3357", "/wavedash": "\u301C", "/wavingBlackFlag": "\u1F3F4", "/wavingHandSign": "\u1F44B", "/wavingWhiteFlag": "\u1F3F3", "/wavydash": "\u3030", "/wavyhamzabelow": "\u065F", "/wavyline": "\u2307", "/wavyunderscorevertical": "\uFE34", "/waw": "\u0648", "/waw.fina": "\uFEEE", "/waw.isol": "\uFEED", "/wawDigitThreeAbove": "\u0779", "/wawDigitTwoAbove": "\u0778", "/wawarabic": "\u0648", "/wawdotabove": "\u06CF", "/wawfinalarabic": "\uFEEE", "/wawhamza": "\u0624", "/wawhamza.fina": "\uFE86", "/wawhamza.isol": "\uFE85", "/wawhamzaabovearabic": "\u0624", "/wawhamzaabovefinalarabic": "\uFE86", "/wawhighhamza": "\u0676", "/wawring": "\u06C4", "/wawsmall": "\u06E5", "/wawtwodotsabove": "\u06CA", "/waxingCrescentMoon": "\u1F312", "/waxingGibbousMoon": "\u1F314", "/wbfullwidth": "\u33DD", "/wbsquare": "\u33DD", "/wcircle": "\u24E6", "/wcircumflex": "\u0175", "/wcsquare": "\u1F14F", "/wcsquareblack": "\u1F18F", "/wdieresis": "\u1E85", "/wdot": "\u1E87", "/wdotaccent": "\u1E87", "/wdotbelow": "\u1E89", "/wearyCatFace": "\u1F640", "/wearyFace": "\u1F629", "/wecirclekatakana": "\u32FD", "/wecyr": "\u051D", "/wedding": "\u1F492", "/wehiragana": "\u3091", "/weierstrass": "\u2118", "/weightLifter": "\u1F3CB", "/wekatakana": "\u30F1", "/wekorean": "\u315E", "/weokorean": "\u315D", 
"/westsyriaccross": "\u2670", "/wgrave": "\u1E81", "/whale": "\u1F40B", "/wheelchair": "\u267F", "/wheelofdharma": "\u2638", "/whiteDownPointingBackhandIndex": "\u1F447", "/whiteDownPointingLeftHandIndex": "\u1F597", "/whiteFlower": "\u1F4AE", "/whiteHardShellFloppyDisk": "\u1F5AB", "/whiteLatinCross": "\u1F546", "/whiteLeftPointingBackhandIndex": "\u1F448", "/whitePennant": "\u1F3F1", "/whiteRightPointingBackhandIndex": "\u1F449", "/whiteSquareButton": "\u1F533", "/whiteSun": "\u1F323", "/whiteSunBehindCloud": "\u1F325", "/whiteSunBehindCloudRain": "\u1F326", "/whiteSunSmallCloud": "\u1F324", "/whiteTouchtoneTelephone": "\u1F57E", "/whiteUpPointingBackhandIndex": "\u1F446", "/whitearrowdown": "\u21E9", "/whitearrowfromwallright": "\u21F0", "/whitearrowleft": "\u21E6", "/whitearrowonpedestalup": "\u21EB", "/whitearrowright": "\u21E8", "/whitearrowup": "\u21E7", "/whitearrowupdown": "\u21F3", "/whitearrowupfrombar": "\u21EA", "/whitebullet": "\u25E6", "/whitecircle": "\u25CB", "/whitecircleinverse": "\u25D9", "/whitecornerbracketleft": "\u300E", "/whitecornerbracketleftvertical": "\uFE43", "/whitecornerbracketright": "\u300F", "/whitecornerbracketrightvertical": "\uFE44", "/whitedblarrowonpedestalup": "\u21EF", "/whitedblarrowup": "\u21EE", "/whitediamond": "\u25C7", "/whitediamondcontainingblacksmalldiamond": "\u25C8", "/whitedownpointingsmalltriangle": "\u25BF", "/whitedownpointingtriangle": "\u25BD", "/whiteleftpointingsmalltriangle": "\u25C3", "/whiteleftpointingtriangle": "\u25C1", "/whitelenticularbracketleft": "\u3016", "/whitelenticularbracketright": "\u3017", "/whiterightpointingsmalltriangle": "\u25B9", "/whiterightpointingtriangle": "\u25B7", "/whitesesamedot": "\uFE46", "/whitesmallsquare": "\u25AB", "/whitesmilingface": "\u263A", "/whitesquare": "\u25A1", "/whitesquarebracketleft": "\u301A", "/whitesquarebracketright": "\u301B", "/whitestar": "\u2606", "/whitetelephone": "\u260F", "/whitetortoiseshellbracketleft": "\u3018", 
"/whitetortoiseshellbracketright": "\u3019", "/whiteuppointingsmalltriangle": "\u25B5", "/whiteuppointingtriangle": "\u25B3", "/whook": "\u2C73", "/wicirclekatakana": "\u32FC", "/wigglylinevertical": "\u2E3E", "/wignyan": "\uA983", "/wihiragana": "\u3090", "/wikatakana": "\u30F0", "/wikorean": "\u315F", "/windBlowingFace": "\u1F32C", "/windChime": "\u1F390", "/windupada": "\uA9C6", "/wineGlass": "\u1F377", "/winkingFace": "\u1F609", "/wiredKeyboard": "\u1F5AE", "/wmonospace": "\uFF57", "/wocirclekatakana": "\u32FE", "/wohiragana": "\u3092", "/wokatakana": "\u30F2", "/wokatakanahalfwidth": "\uFF66", "/wolfFace": "\u1F43A", "/woman": "\u1F469", "/womanBunnyEars": "\u1F46F", "/womansBoots": "\u1F462", "/womansClothes": "\u1F45A", "/womansHat": "\u1F452", "/womansSandal": "\u1F461", "/womens": "\u1F6BA", "/won": "\u20A9", "/wonmonospace": "\uFFE6", "/woodideographiccircled": "\u328D", "/woodideographicparen": "\u322D", "/wordjoiner": "\u2060", "/wordseparatormiddledot": "\u2E31", "/worldMap": "\u1F5FA", "/worriedFace": "\u1F61F", "/wowaenthai": "\u0E27", "/wparen": "\u24B2", "/wparenthesized": "\u24B2", "/wrappedPresent": "\u1F381", "/wreathproduct": "\u2240", "/wrench": "\u1F527", "/wring": "\u1E98", "/wsuperior": "\u02B7", "/wsupmod": "\u02B7", "/wturned": "\u028D", "/wulumelikvowel": "\uA9B7", "/wuluvowel": "\uA9B6", "/wynn": "\u01BF", "/x": "\u0078", "/x.inferior": "\u2093", "/xabovecmb": "\u033D", "/xatailcyr": "\u04B3", "/xbopomofo": "\u3112", "/xcircle": "\u24E7", "/xdieresis": "\u1E8D", "/xdot": "\u1E8B", "/xdotaccent": "\u1E8B", "/xeharmenian": "\u056D", "/xi": "\u03BE", "/xmonospace": "\uFF58", "/xor": "\u22BB", "/xparen": "\u24B3", "/xparenthesized": "\u24B3", "/xsuperior": "\u02E3", "/xsupmod": "\u02E3", "/y": "\u0079", "/yaadosquare": "\u334E", "/yaarusquare": "\u334F", "/yabengali": "\u09AF", "/yacirclekatakana": "\u32F3", "/yacute": "\u00FD", "/yacyr": "\u044F", "/yadeva": "\u092F", "/yaecyr": "\u0519", "/yaekorean": "\u3152", "/yagujarati": "\u0AAF", 
"/yagurmukhi": "\u0A2F", "/yahiragana": "\u3084", "/yakatakana": "\u30E4", "/yakatakanahalfwidth": "\uFF94", "/yakorean": "\u3151", "/yamakkanthai": "\u0E4E", "/yangtonemod": "\u02EB", "/yasmallhiragana": "\u3083", "/yasmallkatakana": "\u30E3", "/yasmallkatakanahalfwidth": "\uFF6C", "/yatcyr": "\u0463", "/yatcyrillic": "\u0463", "/ycircle": "\u24E8", "/ycircumflex": "\u0177", "/ydieresis": "\u00FF", "/ydot": "\u1E8F", "/ydotaccent": "\u1E8F", "/ydotbelow": "\u1EF5", "/yeh": "\u064A", "/yeh.fina": "\uFEF2", "/yeh.init": "\uFEF3", "/yeh.init_alefmaksura.fina": "\uFC59", "/yeh.init_hah.fina": "\uFC56", "/yeh.init_hah.medi": "\uFCDB", "/yeh.init_hamzaabove.medi_ae.fina": "\uFBEC", "/yeh.init_hamzaabove.medi_alef.fina": "\uFBEA", "/yeh.init_hamzaabove.medi_alefmaksura.fina": "\uFC03", "/yeh.init_hamzaabove.medi_e.fina": "\uFBF6", "/yeh.init_hamzaabove.medi_e.medi": "\uFBF8", "/yeh.init_hamzaabove.medi_hah.fina": "\uFC01", "/yeh.init_hamzaabove.medi_hah.medi": "\uFC98", "/yeh.init_hamzaabove.medi_heh.medi": "\uFC9B", "/yeh.init_hamzaabove.medi_jeem.fina": "\uFC00", "/yeh.init_hamzaabove.medi_jeem.medi": "\uFC97", "/yeh.init_hamzaabove.medi_khah.medi": "\uFC99", "/yeh.init_hamzaabove.medi_meem.fina": "\uFC02", "/yeh.init_hamzaabove.medi_meem.medi": "\uFC9A", "/yeh.init_hamzaabove.medi_oe.fina": "\uFBF2", "/yeh.init_hamzaabove.medi_u.fina": "\uFBF0", "/yeh.init_hamzaabove.medi_waw.fina": "\uFBEE", "/yeh.init_hamzaabove.medi_yeh.fina": "\uFC04", "/yeh.init_hamzaabove.medi_yu.fina": "\uFBF4", "/yeh.init_heh.medi": "\uFCDE", "/yeh.init_jeem.fina": "\uFC55", "/yeh.init_jeem.medi": "\uFCDA", "/yeh.init_khah.fina": "\uFC57", "/yeh.init_khah.medi": "\uFCDC", "/yeh.init_meem.fina": "\uFC58", "/yeh.init_meem.medi": "\uFCDD", "/yeh.init_meem.medi_meem.medi": "\uFD9D", "/yeh.init_yeh.fina": "\uFC5A", "/yeh.isol": "\uFEF1", "/yeh.medi": "\uFEF4", "/yeh.medi_alefmaksura.fina": "\uFC95", "/yeh.medi_hah.medi_yeh.fina": "\uFDAE", "/yeh.medi_hamzaabove.medi_ae.fina": "\uFBED", 
"/yeh.medi_hamzaabove.medi_alef.fina": "\uFBEB", "/yeh.medi_hamzaabove.medi_alefmaksura.fina": "\uFC68", "/yeh.medi_hamzaabove.medi_e.fina": "\uFBF7", "/yeh.medi_hamzaabove.medi_heh.medi": "\uFCE0", "/yeh.medi_hamzaabove.medi_meem.fina": "\uFC66", "/yeh.medi_hamzaabove.medi_meem.medi": "\uFCDF", "/yeh.medi_hamzaabove.medi_noon.fina": "\uFC67", "/yeh.medi_hamzaabove.medi_oe.fina": "\uFBF3", "/yeh.medi_hamzaabove.medi_reh.fina": "\uFC64", "/yeh.medi_hamzaabove.medi_u.fina": "\uFBF1", "/yeh.medi_hamzaabove.medi_waw.fina": "\uFBEF", "/yeh.medi_hamzaabove.medi_yeh.fina": "\uFC69", "/yeh.medi_hamzaabove.medi_yu.fina": "\uFBF5", "/yeh.medi_hamzaabove.medi_zain.fina": "\uFC65", "/yeh.medi_heh.medi": "\uFCF1", "/yeh.medi_jeem.medi_yeh.fina": "\uFDAF", "/yeh.medi_meem.fina": "\uFC93", "/yeh.medi_meem.medi": "\uFCF0", "/yeh.medi_meem.medi_meem.fina": "\uFD9C", "/yeh.medi_meem.medi_yeh.fina": "\uFDB0", "/yeh.medi_noon.fina": "\uFC94", "/yeh.medi_reh.fina": "\uFC91", "/yeh.medi_yeh.fina": "\uFC96", "/yeh.medi_zain.fina": "\uFC92", "/yehBarreeDigitThreeAbove": "\u077B", "/yehBarreeDigitTwoAbove": "\u077A", "/yehVabove": "\u06CE", "/yehabove": "\u06E7", "/yeharabic": "\u064A", "/yehbarree": "\u06D2", "/yehbarree.fina": "\uFBAF", "/yehbarree.isol": "\uFBAE", "/yehbarreearabic": "\u06D2", "/yehbarreefinalarabic": "\uFBAF", "/yehbarreehamza": "\u06D3", "/yehbarreehamza.fina": "\uFBB1", "/yehbarreehamza.isol": "\uFBB0", "/yehfarsi": "\u06CC", "/yehfarsi.fina": "\uFBFD", "/yehfarsi.init": "\uFBFE", "/yehfarsi.isol": "\uFBFC", "/yehfarsi.medi": "\uFBFF", "/yehfarsiinvertedV": "\u063D", "/yehfarsithreedotsabove": "\u063F", "/yehfarsitwodotsabove": "\u063E", "/yehfinalarabic": "\uFEF2", "/yehhamza": "\u0626", "/yehhamza.fina": "\uFE8A", "/yehhamza.init": "\uFE8B", "/yehhamza.isol": "\uFE89", "/yehhamza.medi": "\uFE8C", "/yehhamzaabovearabic": "\u0626", "/yehhamzaabovefinalarabic": "\uFE8A", "/yehhamzaaboveinitialarabic": "\uFE8B", "/yehhamzaabovemedialarabic": "\uFE8C", "/yehhighhamza": 
"\u0678", "/yehinitialarabic": "\uFEF3", "/yehmedialarabic": "\uFEF4", "/yehmeeminitialarabic": "\uFCDD", "/yehmeemisolatedarabic": "\uFC58", "/yehnoonfinalarabic": "\uFC94", "/yehsmall": "\u06E6", "/yehtail": "\u06CD", "/yehthreedotsbelow": "\u06D1", "/yehthreedotsbelowarabic": "\u06D1", "/yekorean": "\u3156", "/yellowHeart": "\u1F49B", "/yen": "\u00A5", "/yenmonospace": "\uFFE5", "/yeokorean": "\u3155", "/yeorinhieuhkorean": "\u3186", "/yerachBenYomo:hb": "\u05AA", "/yerahbenyomohebrew": "\u05AA", "/yerahbenyomolefthebrew": "\u05AA", "/yericyrillic": "\u044B", "/yerudieresiscyrillic": "\u04F9", "/yesieungkorean": "\u3181", "/yesieungpansioskorean": "\u3183", "/yesieungsioskorean": "\u3182", "/yetiv:hb": "\u059A", "/yetivhebrew": "\u059A", "/ygrave": "\u1EF3", "/yhoi": "\u1EF7", "/yhook": "\u01B4", "/yhookabove": "\u1EF7", "/yiarmenian": "\u0575", "/yicyrillic": "\u0457", "/yikorean": "\u3162", "/yintonemod": "\u02EA", "/yinyang": "\u262F", "/yiwnarmenian": "\u0582", "/ylongcyr": "\u044B", "/ylongdieresiscyr": "\u04F9", "/yloop": "\u1EFF", "/ymacron": "\u0233", "/ymonospace": "\uFF59", "/yocirclekatakana": "\u32F5", "/yod": "\u05D9", "/yod:hb": "\u05D9", "/yod_yod:hb": "\u05F2", "/yod_yod_patah:hb": "\uFB1F", "/yoddagesh": "\uFB39", "/yoddageshhebrew": "\uFB39", "/yodhebrew": "\u05D9", "/yodwithdagesh:hb": "\uFB39", "/yodwithhiriq:hb": "\uFB1D", "/yodyodhebrew": "\u05F2", "/yodyodpatahhebrew": "\uFB1F", "/yogh": "\u021D", "/yohiragana": "\u3088", "/yoikorean": "\u3189", "/yokatakana": "\u30E8", "/yokatakanahalfwidth": "\uFF96", "/yokorean": "\u315B", "/yosmallhiragana": "\u3087", "/yosmallkatakana": "\u30E7", "/yosmallkatakanahalfwidth": "\uFF6E", "/yot": "\u03F3", "/yotgreek": "\u03F3", "/yoyaekorean": "\u3188", "/yoyakorean": "\u3187", "/yoyakthai": "\u0E22", "/yoyingthai": "\u0E0D", "/yparen": "\u24B4", "/yparenthesized": "\u24B4", "/ypogegrammeni": "\u037A", "/ypogegrammenigreekcmb": "\u0345", "/yr": "\u01A6", "/yring": "\u1E99", "/ystroke": "\u024F", 
"/ysuperior": "\u02B8", "/ysupmod": "\u02B8", "/ytilde": "\u1EF9", "/yturned": "\u028E", "/yu.fina": "\uFBDC", "/yu.isol": "\uFBDB", "/yuansquare": "\u3350", "/yucirclekatakana": "\u32F4", "/yucyr": "\u044E", "/yuhiragana": "\u3086", "/yuikorean": "\u318C", "/yukatakana": "\u30E6", "/yukatakanahalfwidth": "\uFF95", "/yukirghiz": "\u06C9", "/yukirghiz.fina": "\uFBE3", "/yukirghiz.isol": "\uFBE2", "/yukorean": "\u3160", "/yukrcyr": "\u0457", "/yusbigcyr": "\u046B", "/yusbigcyrillic": "\u046B", "/yusbigiotifiedcyr": "\u046D", "/yusbigiotifiedcyrillic": "\u046D", "/yuslittlecyr": "\u0467", "/yuslittlecyrillic": "\u0467", "/yuslittleiotifiedcyr": "\u0469", "/yuslittleiotifiedcyrillic": "\u0469", "/yusmallhiragana": "\u3085", "/yusmallkatakana": "\u30E5", "/yusmallkatakanahalfwidth": "\uFF6D", "/yuyekorean": "\u318B", "/yuyeokorean": "\u318A", "/yyabengali": "\u09DF", "/yyadeva": "\u095F", "/z": "\u007A", "/zaarmenian": "\u0566", "/zacute": "\u017A", "/zadeva": "\u095B", "/zagurmukhi": "\u0A5B", "/zah": "\u0638", "/zah.fina": "\uFEC6", "/zah.init": "\uFEC7", "/zah.init_meem.fina": "\uFC28", "/zah.init_meem.medi": "\uFCB9", "/zah.isol": "\uFEC5", "/zah.medi": "\uFEC8", "/zah.medi_meem.medi": "\uFD3B", "/zaharabic": "\u0638", "/zahfinalarabic": "\uFEC6", "/zahinitialarabic": "\uFEC7", "/zahiragana": "\u3056", "/zahmedialarabic": "\uFEC8", "/zain": "\u0632", "/zain.fina": "\uFEB0", "/zain.isol": "\uFEAF", "/zainabove": "\u0617", "/zainarabic": "\u0632", "/zainfinalarabic": "\uFEB0", "/zakatakana": "\u30B6", "/zaqefGadol:hb": "\u0595", "/zaqefQatan:hb": "\u0594", "/zaqefgadolhebrew": "\u0595", "/zaqefqatanhebrew": "\u0594", "/zarqa:hb": "\u0598", "/zarqahebrew": "\u0598", "/zayin": "\u05D6", "/zayin:hb": "\u05D6", "/zayindagesh": "\uFB36", "/zayindageshhebrew": "\uFB36", "/zayinhebrew": "\u05D6", "/zayinwithdagesh:hb": "\uFB36", "/zbopomofo": "\u3117", "/zcaron": "\u017E", "/zcircle": "\u24E9", "/zcircumflex": "\u1E91", "/zcurl": "\u0291", "/zdescender": "\u2C6C", "/zdot": 
"\u017C", "/zdotaccent": "\u017C", "/zdotbelow": "\u1E93", "/zecyr": "\u0437", "/zecyrillic": "\u0437", "/zedescendercyrillic": "\u0499", "/zedieresiscyr": "\u04DF", "/zedieresiscyrillic": "\u04DF", "/zehiragana": "\u305C", "/zekatakana": "\u30BC", "/zero": "\u0030", "/zero.inferior": "\u2080", "/zero.superior": "\u2070", "/zeroarabic": "\u0660", "/zerobengali": "\u09E6", "/zerocircle": "\u24EA", "/zerocircleblack": "\u24FF", "/zerocomma": "\u1F101", "/zerodeva": "\u0966", "/zerofar": "\u06F0", "/zerofullstop": "\u1F100", "/zerogujarati": "\u0AE6", "/zerogurmukhi": "\u0A66", "/zerohackarabic": "\u0660", "/zeroinferior": "\u2080", "/zeromonospace": "\uFF10", "/zerooldstyle": "\uF730", "/zeropersian": "\u06F0", "/zerosquareabove": "\u06E0", "/zerosuperior": "\u2070", "/zerothai": "\u0E50", "/zerothirds": "\u2189", "/zerowidthjoiner": "\uFEFF", "/zerowidthnobreakspace": "\uFEFF", "/zerowidthnonjoiner": "\u200C", "/zerowidthspace": "\u200B", "/zeta": "\u03B6", "/zetailcyr": "\u0499", "/zhbopomofo": "\u3113", "/zhearmenian": "\u056A", "/zhebrevecyr": "\u04C2", "/zhebrevecyrillic": "\u04C2", "/zhecyr": "\u0436", "/zhecyrillic": "\u0436", "/zhedescendercyrillic": "\u0497", "/zhedieresiscyr": "\u04DD", "/zhedieresiscyrillic": "\u04DD", "/zhetailcyr": "\u0497", "/zhook": "\u0225", "/zihiragana": "\u3058", "/zikatakana": "\u30B8", "/zildefunc": "\u236C", "/zinorhebrew": "\u05AE", "/zjekomicyr": "\u0505", "/zlinebelow": "\u1E95", "/zmonospace": "\uFF5A", "/znotationbagmembership": "\u22FF", "/zohiragana": "\u305E", "/zokatakana": "\u30BE", "/zparen": "\u24B5", "/zparenthesized": "\u24B5", "/zretroflex": "\u0290", "/zretroflexhook": "\u0290", "/zstroke": "\u01B6", "/zswashtail": "\u0240", "/zuhiragana": "\u305A", "/zukatakana": "\u30BA", "/zwarakay": "\u0659", # manually added from # https://github.com/serviceprototypinglab/latex-pdfa/blob/master/glyphtounicode-cmr.tex "/angbracketleftBig": "\u28E8", "/angbracketleftBigg": "\u27E8", "/angbracketleftbig": "\u27E8", 
"/angbracketleftbigg": "\u27E8", "/angbracketrightBig": "\u27E9", "/angbracketrightBigg": "\u27E9", "/angbracketrightbig": "\u27E9", "/angbracketrightbigg": "\u27E9", "/arrowbt": "\u2193", "/arrowdblbt": "\u21D3", "/arrowdbltp": "\u21D1", "/arrowhookleft": "\u21AA", "/arrowhookright": "\u21A9", "/arrowtp": "\u2191", # diff : "/arrowvertex": "\u23D0", "/arrowvertexdbl": "\uED12", "/backslashBig": "\u005C", "/backslashBigg": "\u005C", "/backslashbig": "\u005C", "/backslashbigg": "\u005C", # diff : "/braceex": "\u23AA", "/bracehtipdownleft": "\uED17", "/bracehtipdownright": "\uED18", "/bracehtipupleft": "\uED19", "/bracehtipupright": "\uED1A", "/braceleftBig": "\u007B", "/braceleftBigg": "\u007B", "/braceleftbig": "\u007B", "/braceleftbigg": "\u007B", # diff : "/braceleftbt": "\u23A9", # diff : "/braceleftmid": "\u23A8", # diff : "/bracelefttp": "\u23A7", "/bracerightBig": "\u007D", "/bracerightBigg": "\u007D", "/bracerightbig": "\u007D", "/bracerightbigg": "\u007D", # diff : "/bracerightbt": "\u23AD", # diff : "/bracerightmid": "\u23AC", # diff : "/bracerighttp": "\u23AB", "/bracketleftBig": "\u005B", "/bracketleftBigg": "\u005B", "/bracketleftbig": "\u005B", "/bracketleftbigg": "\u005B", # diff : "/bracketleftbt": "\u23A3", # diff : "/bracketleftex": "\u23A2", # diff : "/bracketlefttp": "\u23A1", "/bracketrightBig": "\u005D", "/bracketrightBigg": "\u005D", "/bracketrightbig": "\u005D", "/bracketrightbigg": "\u005D", # diff : "/bracketrightbt": "\u23A6", # diff : "/bracketrightex": "\u23A5", # diff : "/bracketrighttp": "\u23A4", "/ceilingleftBig": "\u2308", "/ceilingleftBigg": "\u2308", "/ceilingleftbig": "\u2308", "/ceilingleftbigg": "\u2308", "/ceilingrightBig": "\u2309", "/ceilingrightBigg": "\u2309", "/ceilingrightbig": "\u2309", "/ceilingrightbigg": "\u2309", "/circledotdisplay": "\u2A00", "/circledottext": "\u2A00", "/circlemultiplydisplay": "\u2A02", "/circlemultiplytext": "\u2A02", "/circleplusdisplay": "\u2A01", "/circleplustext": "\u2A01", 
"/contintegraldisplay": "\u222E", "/contintegraltext": "\u222E", "/coproductdisplay": "\u2210", "/coproducttext": "\u2210", "/floorleftBig": "\u230A", "/floorleftBigg": "\u230A", "/floorleftbig": "\u230A", "/floorleftbigg": "\u230A", "/floorrightBig": "\u230B", "/floorrightBigg": "\u230B", "/floorrightbig": "\u230B", "/floorrightbigg": "\u230B", "/hatwide": "\u02C6", "/hatwider": "\u02C6", "/hatwidest": "\u02C6", "/integraldisplay": "\u222B", "/integraltext": "\u222B", "/intersectiondisplay": "\u22C2", "/intersectiontext": "\u22C2", "/logicalanddisplay": "\u22C0", "/logicalandtext": "\u22C0", "/logicalordisplay": "\u22C1", "/logicalortext": "\u22C1", "/mapsto": "\u21A6", "/parenleftBig": "\u0028", "/parenleftBigg": "\u0028", "/parenleftbig": "\u0028", "/parenleftbigg": "\u0028", # diff : "/parenleftbt": "\u239D", # diff : "/parenleftex": "\u239C", # diff : "/parenlefttp": "\u239B", "/parenrightBig": "\u0029", "/parenrightBigg": "\u0029", "/parenrightbig": "\u0029", "/parenrightbigg": "\u0029", # diff : "/parenrightbt": "\u23A0", # diff : "/parenrightex": "\u239F", # diff : "/parenrighttp": "\u239E", "/productdisplay": "\u220F", "/producttext": "\u220F", "/radicalBig": "\u221A", "/radicalBigg": "\u221A", "/radicalbig": "\u221A", "/radicalbigg": "\u221A", "/radicalbt": "\u221A", "/radicaltp": "\uED6A", "/radicalvertex": "\uED6B", "/slashBig": "\u002F", "/slashBigg": "\u002F", "/slashbig": "\u002F", "/slashbigg": "\u002F", "/summationdisplay": "\u2211", "/summationtext": "\u2211", "/tie": "\u2040", "/tildewide": "\u02DC", "/tildewider": "\u02DC", "/tildewidest": "\u02DC", "/uniondisplay": "\u22C3", "/unionmultidisplay": "\u2A04", "/unionmultitext": "\u2A04", "/unionsqdisplay": "\u2A06", "/unionsqtext": "\u2A06", "/uniontext": "\u22C3", "/vextenddouble": "\uED79", "/vextendsingle": "\u23D0", "/a1": "\u25C1", "/a2": "\u22B4", "/a3": "\u25B7", "/a4": "\u22B5", "/a40": "\u02C2", "/a41": "\u02C3", "/a42": "\u2303", "/a43": "\u2304", "/a48": "\u2127", "/a49": "\u22C8", 
"/a50": "\u25A1", "/a51": "\u25C7", "/a58": "\u2053", "/a59": "\u219D", "/a60": "\u228F", "/a61": "\u2290", "/d0": "\u2199", "/d1": "\u2199", "/d2": "\u2199", "/d3": "\u2199", "/d4": "\u2199", "/d5": "\u2199", "/d6": "\u2199", "/d7": "\u2193", "/d8": "\u2193", "/d9": "\u2193", "/d10": "\u2193", "/d11": "\u2193", "/d12": "\u2193", "/d13": "\u2193", "/d14": "\u2193", "/d15": "\u2193", "/d16": "\u2193", "/d17": "\u2193", "/d18": "\u2193", "/d19": "\u2193", "/d20": "\u2193", "/d21": "\u2193", "/d22": "\u2193", "/d23": "\u2193", "/d24": "\u2198", "/d25": "\u2198", "/d26": "\u2198", "/d27": "\u2198", "/d28": "\u2198", "/d29": "\u2198", "/d30": "\u2198", "/d31": "\u2198", "/d32": "\u2198", "/d33": "\u2198", "/d34": "\u2198", "/d35": "\u2198", "/d36": "\u2198", "/d37": "\u2198", "/d38": "\u2198", "/d39": "\u2192", "/d40": "\u2192", "/d41": "\u2192", "/d42": "\u2192", "/d43": "\u2192", "/d44": "\u2192", "/d45": "\u2192", "/d46": "\u2192", "/d47": "\u2192", "/d48": "\u2192", "/d49": "\u2192", "/d50": "\u2192", "/d51": "\u2192", "/d52": "\u2192", "/d53": "\u2192", "/d54": "\u2192", "/d55": "\u2192", "/d56": "\u2197", "/d57": "\u2197", "/d58": "\u2197", "/d59": "\u2197", "/d60": "\u2197", "/d61": "\u2197", "/d62": "\u2197", "/d63": "\u2197", "/d64": "\u2197", "/d65": "\u2197", "/d66": "\u2197", "/d67": "\u2197", "/d68": "\u2197", "/d69": "\u2197", "/d70": "\u2197", "/d71": "\u2191", "/d72": "\u2191", "/d73": "\u2191", "/d74": "\u2191", "/d75": "\u2191", "/d76": "\u2191", "/d77": "\u2191", "/d78": "\u2191", "/d79": "\u2191", "/d80": "\u2191", "/d81": "\u2191", "/d82": "\u2191", "/d83": "\u2191", "/d84": "\u2191", "/d85": "\u2191", "/d86": "\u2191", "/d87": "\u2191", "/d88": "\u2196", "/d89": "\u2196", "/d90": "\u2196", "/d91": "\u2196", "/d92": "\u2196", "/d93": "\u2196", "/d94": "\u2196", "/d95": "\u2196", "/d96": "\u2196", "/d97": "\u2196", "/d98": "\u2196", "/d99": "\u2196", "/d100": "\u2196", "/d101": "\u2196", "/d102": "\u2196", "/d103": "\u2190", "/d104": "\u2190", 
"/d105": "\u2190", "/d106": "\u2190", "/d107": "\u2190", "/d108": "\u2190", "/d109": "\u2190", "/d110": "\u2190", "/d111": "\u2190", "/d112": "\u2190", "/d113": "\u2190", "/d114": "\u2190", "/d115": "\u2190", "/d116": "\u2190", "/d117": "\u2190", "/d118": "\u2190", "/d119": "\u2190", "/d120": "\u2199", "/d121": "\u2199", "/d122": "\u2199", "/d123": "\u2199", "/d124": "\u2199", "/d125": "\u2199", "/d126": "\u2199", "/d127": "\u2199", # manually added from # https://github.com/kohler/lcdf-typetools/blob/master/texglyphlist.txt "/Ifractur": "\u2111", "/FFsmall": "\uF766", "/FFIsmall": "\uF766", "/FFLsmall": "\uF766", "/FIsmall": "\uF766", "/FLsmall": "\uF766", # diff : "/Germandbls": "\u0053", "/Germandblssmall": "\uF773", "/Ng": "\u014A", "/Rfractur": "\u211C", "/SS": "\u0053", "/SSsmall": "\uF773", "/altselector": "\uD802", "/angbracketleft": "\u27E8", "/angbracketright": "\u27E9", "/arrowbothv": "\u2195", "/arrowdblbothv": "\u21D5", "/arrowleftbothalf": "\u21BD", "/arrowlefttophalf": "\u21BC", "/arrownortheast": "\u2197", "/arrownorthwest": "\u2196", "/arrowrightbothalf": "\u21C1", "/arrowrighttophalf": "\u21C0", "/arrowsoutheast": "\u2198", "/arrowsouthwest": "\u2199", "/ascendercompwordmark": "\uD80A", "/asteriskcentered": "\u2217", "/bardbl": "\u2225", "/capitalcompwordmark": "\uD809", "/circlecopyrt": "\u20DD", "/circledivide": "\u2298", "/circleminus": "\u2296", "/coproduct": "\u2A3F", "/ct": "\u0063", "/cwm": "\u200C", "/dblbracketleft": "\u27E6", "/dblbracketright": "\u27E7", # diff : "/diamond": "\u2662", "/diamondmath": "\u22C4", # diff : "/dotlessj": "\u0237", "/emptyslot": "\uD801", "/epsilon1": "\u03F5", "/epsiloninv": "\u03F6", "/equivasymptotic": "\u224D", "/flat": "\u266D", "/follows": "\u227B", "/followsequal": "\u2AB0", "/followsorcurly": "\u227D", "/greatermuch": "\u226B", # diff : "/heart": "\u2661", "/interrobangdown": "\u2E18", "/intersectionsq": "\u2293", "/latticetop": "\u22A4", "/lessmuch": "\u226A", "/longdbls": "\u017F", "/longsh": 
"\u017F", "/longsi": "\u017F", "/longsl": "\u017F", "/longst": "\uFB05", "/lscript": "\u2113", "/natural": "\u266E", "/negationslash": "\u0338", "/ng": "\u014B", "/owner": "\u220B", "/pertenthousand": "\u2031", # diff : "/phi": "\u03D5", # diff : "/phi1": "\u03C6", "/pi1": "\u03D6", "/precedesequal": "\u2AAF", "/precedesorcurly": "\u227C", "/prime": "\u2032", "/rho1": "\u03F1", "/ringfitted": "\uD80D", "/sharp": "\u266F", "/similarequal": "\u2243", "/slurabove": "\u2322", "/slurbelow": "\u2323", "/st": "\uFB06", "/subsetsqequal": "\u2291", "/supersetsqequal": "\u2292", "/triangle": "\u25B3", "/triangleinv": "\u25BD", "/triangleleft": "\u25C1", # diff : "/triangleright": "\u25B7", "/turnstileleft": "\u22A2", "/turnstileright": "\u22A3", "/twelveudash": "\uD80C", "/unionmulti": "\u228E", "/unionsq": "\u2294", "/vector": "\u20D7", "/visualspace": "\u2423", "/Dbar": "\u0110", "/compwordmark": "\u200C", "/dbar": "\u0111", "/rangedash": "\u2013", "/hyphenchar": "\u002D", "/punctdash": "\u2014", "/visiblespace": "\u2423", "/Yen": "\u00A5", "/anticlockwise": "\u27F2", "/arrowparrleftright": "\u21C6", "/arrowparrrightleft": "\u21C4", "/arrowtailleft": "\u21A2", "/arrowtailright": "\u21A3", "/arrowtripleleft": "\u21DA", "/arrowtripleright": "\u21DB", "/check": "\u2713", "/circleR": "\u00AE", "/circleS": "\u24C8", "/circleasterisk": "\u229B", "/circleequal": "\u229C", "/circlering": "\u229A", "/clockwise": "\u27F3", "/curlyleft": "\u21AB", "/curlyright": "\u21AC", "/dblarrowdwn": "\u21CA", "/dblarrowheadleft": "\u219E", "/dblarrowheadright": "\u21A0", # diff : "/dblarrowup": "\u21C8", "/defines": "\u225C", "/diamondsolid": "\u2666", "/difference": "\u224F", "/downfall": "\u22CE", "/equaldotleftright": "\u2252", "/equaldotrightleft": "\u2253", "/equalorfollows": "\u22DF", # diff : "/equalorgreater": "\u2A96", # diff : "/equalorless": "\u2A95", "/equalsdots": "\u2251", "/followsorequal": "\u227F", "/forcesbar": "\u22AA", # diff : "/fork": "\u22D4", "/geomequivalent": "\u224E", 
"/greaterdbleqlless": "\u2A8C", "/greaterdblequal": "\u2267", "/greaterlessequal": "\u22DB", "/greaterorapproxeql": "\u2A86", "/greaterorequalslant": "\u2A7E", "/greaterorsimilar": "\u2273", "/harpoondownleft": "\u21C3", "/harpoondownright": "\u21C2", "/harpoonleftright": "\u21CC", "/harpoonrightleft": "\u21CB", "/harpoonupleft": "\u21BF", "/harpoonupright": "\u21BE", "/intercal": "\u22BA", "/lessdbleqlgreater": "\u2A8B", "/lessdblequal": "\u2266", "/lessequalgreater": "\u22DA", "/lessorapproxeql": "\u2A85", "/lessorequalslant": "\u2A7D", "/lessorsimilar": "\u2272", "/maltesecross": "\u2720", "/multiopenleft": "\u22CB", "/multiopenright": "\u22CC", "/orunderscore": "\u22BB", "/perpcorrespond": "\u2A5E", # diff : "/precedesorequal": "\u227E", "/primereverse": "\u2035", "/revasymptequal": "\u22CD", "/revsimilar": "\u223D", "/rightanglene": "\u231D", "/rightanglenw": "\u231C", "/rightanglese": "\u231F", "/rightanglesw": "\u231E", "/satisfies": "\u22A8", "/shiftleft": "\u21B0", "/shiftright": "\u21B1", "/square": "\u25A1", "/squaredot": "\u22A1", "/squareminus": "\u229F", "/squaremultiply": "\u22A0", "/squareplus": "\u229E", "/squaresolid": "\u25A0", "/squiggleleftright": "\u21AD", "/squiggleright": "\u21DD", "/subsetdblequal": "\u2AC5", "/supersetdbl": "\u22D1", "/supersetdblequal": "\u2AC6", "/triangledownsld": "\u25BC", "/triangleleftequal": "\u22B4", "/triangleleftsld": "\u25C0", "/trianglerightequal": "\u22B5", "/trianglerightsld": "\u25B6", "/trianglesolid": "\u25B2", "/uprise": "\u22CF", # diff : "/Digamma": "\u1D7C", "/Finv": "\u2132", "/Gmir": "\u2141", "/Omegainv": "\u2127", "/approxorequal": "\u224A", "/archleftdown": "\u21B6", "/archrightdown": "\u21B7", "/beth": "\u2136", "/daleth": "\u2138", "/dividemultiply": "\u22C7", "/downslope": "\u29F9", "/equalorsimilar": "\u2242", "/follownotdbleqv": "\u2ABA", "/follownotslnteql": "\u2AB6", "/followornoteqvlnt": "\u22E9", "/greaternotdblequal": "\u2A8A", "/greaternotequal": "\u2A88", "/greaterornotdbleql": 
"\u2269", "/greaterornotequal": "\u2269", "/integerdivide": "\u2216", "/lessnotdblequal": "\u2A89", "/lessnotequal": "\u2A87", "/lessornotdbleql": "\u2268", "/lessornotequal": "\u2268", "/multicloseleft": "\u22C9", "/multicloseright": "\u22CA", "/notapproxequal": "\u2247", "/notarrowboth": "\u21AE", "/notarrowleft": "\u219A", "/notarrowright": "\u219B", "/notbar": "\u2224", "/notdblarrowboth": "\u21CE", "/notdblarrowleft": "\u21CD", "/notdblarrowright": "\u21CF", "/notfollows": "\u2281", "/notfollowsoreql": "\u2AB0", "/notforces": "\u22AE", "/notforcesextra": "\u22AF", "/notgreaterdblequal": "\u2267", "/notgreaterequal": "\u2271", "/notgreaterorslnteql": "\u2A7E", "/notlessdblequal": "\u2266", "/notlessequal": "\u2270", "/notlessorslnteql": "\u2A7D", "/notprecedesoreql": "\u2AAF", "/notsatisfies": "\u22AD", "/notsimilar": "\u2241", "/notsubseteql": "\u2288", "/notsubsetordbleql": "\u2AC5", "/notsubsetoreql": "\u228A", "/notsuperseteql": "\u2289", "/notsupersetordbleql": "\u2AC6", "/notsupersetoreql": "\u228B", "/nottriangeqlleft": "\u22EC", "/nottriangeqlright": "\u22ED", "/nottriangleleft": "\u22EA", "/nottriangleright": "\u22EB", "/notturnstile": "\u22AC", "/planckover2pi": "\u210F", "/planckover2pi1": "\u210F", "/precedenotdbleqv": "\u2AB9", "/precedenotslnteql": "\u2AB5", "/precedeornoteqvlnt": "\u22E8", "/subsetnoteql": "\u228A", "/subsetornotdbleql": "\u2ACB", "/supersetnoteql": "\u228B", "/supersetornotdbleql": "\u2ACC", "/upslope": "\u29F8", } def _complete() -> None: for i in range(256): adobe_glyphs[f"/a{i}"] = chr(i) adobe_glyphs["/.notdef"] = "□" _complete() ================================================ FILE: pypdf/_codecs/core_font_metrics.py ================================================ # This file is based upon the 14 core AFM files provided by Adobe/Macromedia at # https://download.macromedia.com/pub/developer/opentype/tech-notes/Core14_AFMs.zip # The original copyright follows: # # 
----------------------------------------------------------------------------------------------- # Core 14 AFM Files - ReadMe # # This file and the 14 PostScript(R) AFM files it accompanies may be used, copied, and # distributed for any purpose and without charge, with or without modification, provided that all # copyright notices are retained; that the AFM files are not distributed without this file; that # all modifications to this file or any of the AFM files are prominently noted in the modified # file(s); and that this paragraph is not modified. Adobe Systems has no responsibility or # obligation to support the use of the AFM files. # ----------------------------------------------------------------------------------------------- from pypdf._font import CoreFontMetrics, FontDescriptor CORE_FONT_METRICS: dict[str, CoreFontMetrics] = { # Generated from Courier.afm # Copyright (c) 1989, 1990, 1991, 1992, 1993, 1997 Adobe Systems Incorporated. All Rights # Reserved. "Courier": CoreFontMetrics( font_descriptor=FontDescriptor( name="Courier", family="Courier", weight="Medium", ascent=629, descent=-157, cap_height=562, x_height=426, italic_angle=0, flags=33, bbox=(-23.0, -250.0, 715.0, 805.0), ), character_widths={ " ": 600, "default": 600, "!": 600, '"': 600, "#": 600, "$": 600, "%": 600, "&": 600, "\u2019": 600, "(": 600, ")": 600, "*": 600, "+": 600, ",": 600, "-": 600, ".": 600, "/": 600, "0": 600, "1": 600, "2": 600, "3": 600, "4": 600, "5": 600, "6": 600, "7": 600, "8": 600, "9": 600, ":": 600, ";": 600, "<": 600, "=": 600, ">": 600, "?": 600, "@": 600, "A": 600, "B": 600, "C": 600, "D": 600, "E": 600, "F": 600, "G": 600, "H": 600, "I": 600, "J": 600, "K": 600, "L": 600, "M": 600, "N": 600, "O": 600, "P": 600, "Q": 600, "R": 600, "S": 600, "T": 600, "U": 600, "V": 600, "W": 600, "X": 600, "Y": 600, "Z": 600, "[": 600, "\\": 600, "]": 600, "^": 600, "_": 600, "\u2018": 600, "a": 600, "b": 600, "c": 600, "d": 600, "e": 600, "f": 600, "g": 600, "h": 600, "i": 600, 
"j": 600, "k": 600, "l": 600, "m": 600, "n": 600, "o": 600, "p": 600, "q": 600, "r": 600, "s": 600, "t": 600, "u": 600, "v": 600, "w": 600, "x": 600, "y": 600, "z": 600, "{": 600, "|": 600, "}": 600, "~": 600, "\xa1": 600, "\xa2": 600, "\xa3": 600, "\u2044": 600, "\xa5": 600, "\u0192": 600, "\xa7": 600, "\xa4": 600, "'": 600, "\u201c": 600, "\xab": 600, "\u2039": 600, "\u203a": 600, "\ufb01": 600, "\ufb02": 600, "\u2013": 600, "\u2020": 600, "\u2021": 600, "\xb7": 600, "\xb6": 600, "\u2022": 600, "\u201a": 600, "\u201e": 600, "\u201d": 600, "\xbb": 600, "\u2026": 600, "\u2030": 600, "\xbf": 600, "`": 600, "\xb4": 600, "\u02c6": 600, "\u02dc": 600, "\xaf": 600, "\u02d8": 600, "\u02d9": 600, "\xa8": 600, "\u02da": 600, "\xb8": 600, "\u02dd": 600, "\u02db": 600, "\u02c7": 600, "\u2014": 600, "\xc6": 600, "\xaa": 600, "\u0141": 600, "\xd8": 600, "\u0152": 600, "\xba": 600, "\xe6": 600, "\u0131": 600, "\u0142": 600, "\xf8": 600, "\u0153": 600, "\xdf": 600, "\xcf": 600, "\xe9": 600, "\u0103": 600, "\u0171": 600, "\u011b": 600, "\u0178": 600, "\xf7": 600, "\xdd": 600, "\xc2": 600, "\xe1": 600, "\xdb": 600, "\xfd": 600, "\u0219": 600, "\xea": 600, "\u016e": 600, "\xdc": 600, "\u0105": 600, "\xda": 600, "\u0173": 600, "\xcb": 600, "\u0110": 600, "\uf6c3": 600, "\xa9": 600, "\u0112": 600, "\u010d": 600, "\xe5": 600, "\u0145": 600, "\u013a": 600, "\xe0": 600, "\u0162": 600, "\u0106": 600, "\xe3": 600, "\u0116": 600, "\u0161": 600, "\u015f": 600, "\xed": 600, "\u25ca": 600, "\u0158": 600, "\u0122": 600, "\xfb": 600, "\xe2": 600, "\u0100": 600, "\u0159": 600, "\xe7": 600, "\u017b": 600, "\xde": 600, "\u014c": 600, "\u0154": 600, "\u015a": 600, "\u010f": 600, "\u016a": 600, "\u016f": 600, "\xb3": 600, "\xd2": 600, "\xc0": 600, "\u0102": 600, "\xd7": 600, "\xfa": 600, "\u0164": 600, "\u2202": 600, "\xff": 600, "\u0143": 600, "\xee": 600, "\xca": 600, "\xe4": 600, "\xeb": 600, "\u0107": 600, "\u0144": 600, "\u016b": 600, "\u0147": 600, "\xcd": 600, "\xb1": 600, "\xa6": 600, 
"\xae": 600, "\u011e": 600, "\u0130": 600, "\u2211": 600, "\xc8": 600, "\u0155": 600, "\u014d": 600, "\u0179": 600, "\u017d": 600, "\u2265": 600, "\xd0": 600, "\xc7": 600, "\u013c": 600, "\u0165": 600, "\u0119": 600, "\u0172": 600, "\xc1": 600, "\xc4": 600, "\xe8": 600, "\u017a": 600, "\u012f": 600, "\xd3": 600, "\xf3": 600, "\u0101": 600, "\u015b": 600, "\xef": 600, "\xd4": 600, "\xd9": 600, "\u2206": 600, "\xfe": 600, "\xb2": 600, "\xd6": 600, "\xb5": 600, "\xec": 600, "\u0151": 600, "\u0118": 600, "\u0111": 600, "\xbe": 600, "\u015e": 600, "\u013e": 600, "\u0136": 600, "\u0139": 600, "\u2122": 600, "\u0117": 600, "\xcc": 600, "\u012a": 600, "\u013d": 600, "\xbd": 600, "\u2264": 600, "\xf4": 600, "\xf1": 600, "\u0170": 600, "\xc9": 600, "\u0113": 600, "\u011f": 600, "\xbc": 600, "\u0160": 600, "\u0218": 600, "\u0150": 600, "\xb0": 600, "\xf2": 600, "\u010c": 600, "\xf9": 600, "\u221a": 600, "\u010e": 600, "\u0157": 600, "\xd1": 600, "\xf5": 600, "\u0156": 600, "\u013b": 600, "\xc3": 600, "\u0104": 600, "\xc5": 600, "\xd5": 600, "\u017c": 600, "\u011a": 600, "\u012e": 600, "\u0137": 600, "\u2212": 600, "\xce": 600, "\u0148": 600, "\u0163": 600, "\xac": 600, "\xf6": 600, "\xfc": 600, "\u2260": 600, "\u0123": 600, "\xf0": 600, "\u017e": 600, "\u0146": 600, "\xb9": 600, "\u012b": 600, "\u20ac": 600, }, ), # Generated from Courier-Bold.afm # Copyright (c) 1989, 1990, 1991, 1993, 1997 Adobe Systems Incorporated. All Rights Reserved. 
"Courier-Bold": CoreFontMetrics( font_descriptor=FontDescriptor( name="Courier-Bold", family="Courier", weight="Bold", ascent=629, descent=-157, cap_height=562, x_height=439, italic_angle=0, flags=33, bbox=(-113.0, -250.0, 749.0, 801.0), ), character_widths={ " ": 600, "default": 600, "!": 600, '"': 600, "#": 600, "$": 600, "%": 600, "&": 600, "\u2019": 600, "(": 600, ")": 600, "*": 600, "+": 600, ",": 600, "-": 600, ".": 600, "/": 600, "0": 600, "1": 600, "2": 600, "3": 600, "4": 600, "5": 600, "6": 600, "7": 600, "8": 600, "9": 600, ":": 600, ";": 600, "<": 600, "=": 600, ">": 600, "?": 600, "@": 600, "A": 600, "B": 600, "C": 600, "D": 600, "E": 600, "F": 600, "G": 600, "H": 600, "I": 600, "J": 600, "K": 600, "L": 600, "M": 600, "N": 600, "O": 600, "P": 600, "Q": 600, "R": 600, "S": 600, "T": 600, "U": 600, "V": 600, "W": 600, "X": 600, "Y": 600, "Z": 600, "[": 600, "\\": 600, "]": 600, "^": 600, "_": 600, "\u2018": 600, "a": 600, "b": 600, "c": 600, "d": 600, "e": 600, "f": 600, "g": 600, "h": 600, "i": 600, "j": 600, "k": 600, "l": 600, "m": 600, "n": 600, "o": 600, "p": 600, "q": 600, "r": 600, "s": 600, "t": 600, "u": 600, "v": 600, "w": 600, "x": 600, "y": 600, "z": 600, "{": 600, "|": 600, "}": 600, "~": 600, "\xa1": 600, "\xa2": 600, "\xa3": 600, "\u2044": 600, "\xa5": 600, "\u0192": 600, "\xa7": 600, "\xa4": 600, "'": 600, "\u201c": 600, "\xab": 600, "\u2039": 600, "\u203a": 600, "\ufb01": 600, "\ufb02": 600, "\u2013": 600, "\u2020": 600, "\u2021": 600, "\xb7": 600, "\xb6": 600, "\u2022": 600, "\u201a": 600, "\u201e": 600, "\u201d": 600, "\xbb": 600, "\u2026": 600, "\u2030": 600, "\xbf": 600, "`": 600, "\xb4": 600, "\u02c6": 600, "\u02dc": 600, "\xaf": 600, "\u02d8": 600, "\u02d9": 600, "\xa8": 600, "\u02da": 600, "\xb8": 600, "\u02dd": 600, "\u02db": 600, "\u02c7": 600, "\u2014": 600, "\xc6": 600, "\xaa": 600, "\u0141": 600, "\xd8": 600, "\u0152": 600, "\xba": 600, "\xe6": 600, "\u0131": 600, "\u0142": 600, "\xf8": 600, "\u0153": 600, "\xdf": 600, 
"\xcf": 600, "\xe9": 600, "\u0103": 600, "\u0171": 600, "\u011b": 600, "\u0178": 600, "\xf7": 600, "\xdd": 600, "\xc2": 600, "\xe1": 600, "\xdb": 600, "\xfd": 600, "\u0219": 600, "\xea": 600, "\u016e": 600, "\xdc": 600, "\u0105": 600, "\xda": 600, "\u0173": 600, "\xcb": 600, "\u0110": 600, "\uf6c3": 600, "\xa9": 600, "\u0112": 600, "\u010d": 600, "\xe5": 600, "\u0145": 600, "\u013a": 600, "\xe0": 600, "\u0162": 600, "\u0106": 600, "\xe3": 600, "\u0116": 600, "\u0161": 600, "\u015f": 600, "\xed": 600, "\u25ca": 600, "\u0158": 600, "\u0122": 600, "\xfb": 600, "\xe2": 600, "\u0100": 600, "\u0159": 600, "\xe7": 600, "\u017b": 600, "\xde": 600, "\u014c": 600, "\u0154": 600, "\u015a": 600, "\u010f": 600, "\u016a": 600, "\u016f": 600, "\xb3": 600, "\xd2": 600, "\xc0": 600, "\u0102": 600, "\xd7": 600, "\xfa": 600, "\u0164": 600, "\u2202": 600, "\xff": 600, "\u0143": 600, "\xee": 600, "\xca": 600, "\xe4": 600, "\xeb": 600, "\u0107": 600, "\u0144": 600, "\u016b": 600, "\u0147": 600, "\xcd": 600, "\xb1": 600, "\xa6": 600, "\xae": 600, "\u011e": 600, "\u0130": 600, "\u2211": 600, "\xc8": 600, "\u0155": 600, "\u014d": 600, "\u0179": 600, "\u017d": 600, "\u2265": 600, "\xd0": 600, "\xc7": 600, "\u013c": 600, "\u0165": 600, "\u0119": 600, "\u0172": 600, "\xc1": 600, "\xc4": 600, "\xe8": 600, "\u017a": 600, "\u012f": 600, "\xd3": 600, "\xf3": 600, "\u0101": 600, "\u015b": 600, "\xef": 600, "\xd4": 600, "\xd9": 600, "\u2206": 600, "\xfe": 600, "\xb2": 600, "\xd6": 600, "\xb5": 600, "\xec": 600, "\u0151": 600, "\u0118": 600, "\u0111": 600, "\xbe": 600, "\u015e": 600, "\u013e": 600, "\u0136": 600, "\u0139": 600, "\u2122": 600, "\u0117": 600, "\xcc": 600, "\u012a": 600, "\u013d": 600, "\xbd": 600, "\u2264": 600, "\xf4": 600, "\xf1": 600, "\u0170": 600, "\xc9": 600, "\u0113": 600, "\u011f": 600, "\xbc": 600, "\u0160": 600, "\u0218": 600, "\u0150": 600, "\xb0": 600, "\xf2": 600, "\u010c": 600, "\xf9": 600, "\u221a": 600, "\u010e": 600, "\u0157": 600, "\xd1": 600, "\xf5": 600, "\u0156": 
600, "\u013b": 600, "\xc3": 600, "\u0104": 600, "\xc5": 600, "\xd5": 600, "\u017c": 600, "\u011a": 600, "\u012e": 600, "\u0137": 600, "\u2212": 600, "\xce": 600, "\u0148": 600, "\u0163": 600, "\xac": 600, "\xf6": 600, "\xfc": 600, "\u2260": 600, "\u0123": 600, "\xf0": 600, "\u017e": 600, "\u0146": 600, "\xb9": 600, "\u012b": 600, "\u20ac": 600, }, ), # Generated from Courier-BoldOblique.afm # Copyright (c) 1989, 1990, 1991, 1993, 1997 Adobe Systems Incorporated. All Rights Reserved. "Courier-BoldOblique": CoreFontMetrics( font_descriptor=FontDescriptor( name="Courier-BoldOblique", family="Courier", weight="Bold", ascent=629, descent=-157, cap_height=562, x_height=439, italic_angle=-12, flags=97, bbox=(-57.0, -250.0, 869.0, 801.0), ), character_widths={ " ": 600, "default": 600, "!": 600, '"': 600, "#": 600, "$": 600, "%": 600, "&": 600, "\u2019": 600, "(": 600, ")": 600, "*": 600, "+": 600, ",": 600, "-": 600, ".": 600, "/": 600, "0": 600, "1": 600, "2": 600, "3": 600, "4": 600, "5": 600, "6": 600, "7": 600, "8": 600, "9": 600, ":": 600, ";": 600, "<": 600, "=": 600, ">": 600, "?": 600, "@": 600, "A": 600, "B": 600, "C": 600, "D": 600, "E": 600, "F": 600, "G": 600, "H": 600, "I": 600, "J": 600, "K": 600, "L": 600, "M": 600, "N": 600, "O": 600, "P": 600, "Q": 600, "R": 600, "S": 600, "T": 600, "U": 600, "V": 600, "W": 600, "X": 600, "Y": 600, "Z": 600, "[": 600, "\\": 600, "]": 600, "^": 600, "_": 600, "\u2018": 600, "a": 600, "b": 600, "c": 600, "d": 600, "e": 600, "f": 600, "g": 600, "h": 600, "i": 600, "j": 600, "k": 600, "l": 600, "m": 600, "n": 600, "o": 600, "p": 600, "q": 600, "r": 600, "s": 600, "t": 600, "u": 600, "v": 600, "w": 600, "x": 600, "y": 600, "z": 600, "{": 600, "|": 600, "}": 600, "~": 600, "\xa1": 600, "\xa2": 600, "\xa3": 600, "\u2044": 600, "\xa5": 600, "\u0192": 600, "\xa7": 600, "\xa4": 600, "'": 600, "\u201c": 600, "\xab": 600, "\u2039": 600, "\u203a": 600, "\ufb01": 600, "\ufb02": 600, "\u2013": 600, "\u2020": 600, "\u2021": 600, "\xb7": 
600, "\xb6": 600, "\u2022": 600, "\u201a": 600, "\u201e": 600, "\u201d": 600, "\xbb": 600, "\u2026": 600, "\u2030": 600, "\xbf": 600, "`": 600, "\xb4": 600, "\u02c6": 600, "\u02dc": 600, "\xaf": 600, "\u02d8": 600, "\u02d9": 600, "\xa8": 600, "\u02da": 600, "\xb8": 600, "\u02dd": 600, "\u02db": 600, "\u02c7": 600, "\u2014": 600, "\xc6": 600, "\xaa": 600, "\u0141": 600, "\xd8": 600, "\u0152": 600, "\xba": 600, "\xe6": 600, "\u0131": 600, "\u0142": 600, "\xf8": 600, "\u0153": 600, "\xdf": 600, "\xcf": 600, "\xe9": 600, "\u0103": 600, "\u0171": 600, "\u011b": 600, "\u0178": 600, "\xf7": 600, "\xdd": 600, "\xc2": 600, "\xe1": 600, "\xdb": 600, "\xfd": 600, "\u0219": 600, "\xea": 600, "\u016e": 600, "\xdc": 600, "\u0105": 600, "\xda": 600, "\u0173": 600, "\xcb": 600, "\u0110": 600, "\uf6c3": 600, "\xa9": 600, "\u0112": 600, "\u010d": 600, "\xe5": 600, "\u0145": 600, "\u013a": 600, "\xe0": 600, "\u0162": 600, "\u0106": 600, "\xe3": 600, "\u0116": 600, "\u0161": 600, "\u015f": 600, "\xed": 600, "\u25ca": 600, "\u0158": 600, "\u0122": 600, "\xfb": 600, "\xe2": 600, "\u0100": 600, "\u0159": 600, "\xe7": 600, "\u017b": 600, "\xde": 600, "\u014c": 600, "\u0154": 600, "\u015a": 600, "\u010f": 600, "\u016a": 600, "\u016f": 600, "\xb3": 600, "\xd2": 600, "\xc0": 600, "\u0102": 600, "\xd7": 600, "\xfa": 600, "\u0164": 600, "\u2202": 600, "\xff": 600, "\u0143": 600, "\xee": 600, "\xca": 600, "\xe4": 600, "\xeb": 600, "\u0107": 600, "\u0144": 600, "\u016b": 600, "\u0147": 600, "\xcd": 600, "\xb1": 600, "\xa6": 600, "\xae": 600, "\u011e": 600, "\u0130": 600, "\u2211": 600, "\xc8": 600, "\u0155": 600, "\u014d": 600, "\u0179": 600, "\u017d": 600, "\u2265": 600, "\xd0": 600, "\xc7": 600, "\u013c": 600, "\u0165": 600, "\u0119": 600, "\u0172": 600, "\xc1": 600, "\xc4": 600, "\xe8": 600, "\u017a": 600, "\u012f": 600, "\xd3": 600, "\xf3": 600, "\u0101": 600, "\u015b": 600, "\xef": 600, "\xd4": 600, "\xd9": 600, "\u2206": 600, "\xfe": 600, "\xb2": 600, "\xd6": 600, "\xb5": 600, "\xec": 600, 
"\u0151": 600, "\u0118": 600, "\u0111": 600, "\xbe": 600, "\u015e": 600, "\u013e": 600, "\u0136": 600, "\u0139": 600, "\u2122": 600, "\u0117": 600, "\xcc": 600, "\u012a": 600, "\u013d": 600, "\xbd": 600, "\u2264": 600, "\xf4": 600, "\xf1": 600, "\u0170": 600, "\xc9": 600, "\u0113": 600, "\u011f": 600, "\xbc": 600, "\u0160": 600, "\u0218": 600, "\u0150": 600, "\xb0": 600, "\xf2": 600, "\u010c": 600, "\xf9": 600, "\u221a": 600, "\u010e": 600, "\u0157": 600, "\xd1": 600, "\xf5": 600, "\u0156": 600, "\u013b": 600, "\xc3": 600, "\u0104": 600, "\xc5": 600, "\xd5": 600, "\u017c": 600, "\u011a": 600, "\u012e": 600, "\u0137": 600, "\u2212": 600, "\xce": 600, "\u0148": 600, "\u0163": 600, "\xac": 600, "\xf6": 600, "\xfc": 600, "\u2260": 600, "\u0123": 600, "\xf0": 600, "\u017e": 600, "\u0146": 600, "\xb9": 600, "\u012b": 600, "\u20ac": 600, }, ), # Generated from Courier-Oblique.afm # Copyright (c) 1989, 1990, 1991, 1992, 1993, 1997 Adobe Systems Incorporated. All Rights # Reserved. "Courier-Oblique": CoreFontMetrics( font_descriptor=FontDescriptor( name="Courier-Oblique", family="Courier", weight="Medium", ascent=629, descent=-157, cap_height=562, x_height=426, italic_angle=-12, flags=97, bbox=(-27.0, -250.0, 849.0, 805.0), ), character_widths={ " ": 600, "default": 600, "!": 600, '"': 600, "#": 600, "$": 600, "%": 600, "&": 600, "\u2019": 600, "(": 600, ")": 600, "*": 600, "+": 600, ",": 600, "-": 600, ".": 600, "/": 600, "0": 600, "1": 600, "2": 600, "3": 600, "4": 600, "5": 600, "6": 600, "7": 600, "8": 600, "9": 600, ":": 600, ";": 600, "<": 600, "=": 600, ">": 600, "?": 600, "@": 600, "A": 600, "B": 600, "C": 600, "D": 600, "E": 600, "F": 600, "G": 600, "H": 600, "I": 600, "J": 600, "K": 600, "L": 600, "M": 600, "N": 600, "O": 600, "P": 600, "Q": 600, "R": 600, "S": 600, "T": 600, "U": 600, "V": 600, "W": 600, "X": 600, "Y": 600, "Z": 600, "[": 600, "\\": 600, "]": 600, "^": 600, "_": 600, "\u2018": 600, "a": 600, "b": 600, "c": 600, "d": 600, "e": 600, "f": 600, "g": 
600, "h": 600, "i": 600, "j": 600, "k": 600, "l": 600, "m": 600, "n": 600, "o": 600, "p": 600, "q": 600, "r": 600, "s": 600, "t": 600, "u": 600, "v": 600, "w": 600, "x": 600, "y": 600, "z": 600, "{": 600, "|": 600, "}": 600, "~": 600, "\xa1": 600, "\xa2": 600, "\xa3": 600, "\u2044": 600, "\xa5": 600, "\u0192": 600, "\xa7": 600, "\xa4": 600, "'": 600, "\u201c": 600, "\xab": 600, "\u2039": 600, "\u203a": 600, "\ufb01": 600, "\ufb02": 600, "\u2013": 600, "\u2020": 600, "\u2021": 600, "\xb7": 600, "\xb6": 600, "\u2022": 600, "\u201a": 600, "\u201e": 600, "\u201d": 600, "\xbb": 600, "\u2026": 600, "\u2030": 600, "\xbf": 600, "`": 600, "\xb4": 600, "\u02c6": 600, "\u02dc": 600, "\xaf": 600, "\u02d8": 600, "\u02d9": 600, "\xa8": 600, "\u02da": 600, "\xb8": 600, "\u02dd": 600, "\u02db": 600, "\u02c7": 600, "\u2014": 600, "\xc6": 600, "\xaa": 600, "\u0141": 600, "\xd8": 600, "\u0152": 600, "\xba": 600, "\xe6": 600, "\u0131": 600, "\u0142": 600, "\xf8": 600, "\u0153": 600, "\xdf": 600, "\xcf": 600, "\xe9": 600, "\u0103": 600, "\u0171": 600, "\u011b": 600, "\u0178": 600, "\xf7": 600, "\xdd": 600, "\xc2": 600, "\xe1": 600, "\xdb": 600, "\xfd": 600, "\u0219": 600, "\xea": 600, "\u016e": 600, "\xdc": 600, "\u0105": 600, "\xda": 600, "\u0173": 600, "\xcb": 600, "\u0110": 600, "\uf6c3": 600, "\xa9": 600, "\u0112": 600, "\u010d": 600, "\xe5": 600, "\u0145": 600, "\u013a": 600, "\xe0": 600, "\u0162": 600, "\u0106": 600, "\xe3": 600, "\u0116": 600, "\u0161": 600, "\u015f": 600, "\xed": 600, "\u25ca": 600, "\u0158": 600, "\u0122": 600, "\xfb": 600, "\xe2": 600, "\u0100": 600, "\u0159": 600, "\xe7": 600, "\u017b": 600, "\xde": 600, "\u014c": 600, "\u0154": 600, "\u015a": 600, "\u010f": 600, "\u016a": 600, "\u016f": 600, "\xb3": 600, "\xd2": 600, "\xc0": 600, "\u0102": 600, "\xd7": 600, "\xfa": 600, "\u0164": 600, "\u2202": 600, "\xff": 600, "\u0143": 600, "\xee": 600, "\xca": 600, "\xe4": 600, "\xeb": 600, "\u0107": 600, "\u0144": 600, "\u016b": 600, "\u0147": 600, "\xcd": 600, "\xb1": 
600, "\xa6": 600, "\xae": 600, "\u011e": 600, "\u0130": 600, "\u2211": 600, "\xc8": 600, "\u0155": 600, "\u014d": 600, "\u0179": 600, "\u017d": 600, "\u2265": 600, "\xd0": 600, "\xc7": 600, "\u013c": 600, "\u0165": 600, "\u0119": 600, "\u0172": 600, "\xc1": 600, "\xc4": 600, "\xe8": 600, "\u017a": 600, "\u012f": 600, "\xd3": 600, "\xf3": 600, "\u0101": 600, "\u015b": 600, "\xef": 600, "\xd4": 600, "\xd9": 600, "\u2206": 600, "\xfe": 600, "\xb2": 600, "\xd6": 600, "\xb5": 600, "\xec": 600, "\u0151": 600, "\u0118": 600, "\u0111": 600, "\xbe": 600, "\u015e": 600, "\u013e": 600, "\u0136": 600, "\u0139": 600, "\u2122": 600, "\u0117": 600, "\xcc": 600, "\u012a": 600, "\u013d": 600, "\xbd": 600, "\u2264": 600, "\xf4": 600, "\xf1": 600, "\u0170": 600, "\xc9": 600, "\u0113": 600, "\u011f": 600, "\xbc": 600, "\u0160": 600, "\u0218": 600, "\u0150": 600, "\xb0": 600, "\xf2": 600, "\u010c": 600, "\xf9": 600, "\u221a": 600, "\u010e": 600, "\u0157": 600, "\xd1": 600, "\xf5": 600, "\u0156": 600, "\u013b": 600, "\xc3": 600, "\u0104": 600, "\xc5": 600, "\xd5": 600, "\u017c": 600, "\u011a": 600, "\u012e": 600, "\u0137": 600, "\u2212": 600, "\xce": 600, "\u0148": 600, "\u0163": 600, "\xac": 600, "\xf6": 600, "\xfc": 600, "\u2260": 600, "\u0123": 600, "\xf0": 600, "\u017e": 600, "\u0146": 600, "\xb9": 600, "\u012b": 600, "\u20ac": 600, }, ), # Generated from Helvetica.afm # Copyright (c) 1985, 1987, 1989, 1990, 1997 Adobe Systems Incorporated. All Rights Reserved. # Helvetica is a trademark of Linotype-Hell AG and/or its subsidiaries. 
"Helvetica": CoreFontMetrics( font_descriptor=FontDescriptor( name="Helvetica", family="Helvetica", weight="Medium", ascent=718, descent=-207, cap_height=718, x_height=523, italic_angle=0, flags=32, bbox=(-166.0, -225.0, 1000.0, 931.0), ), character_widths={ " ": 278, "default": 556, "!": 278, '"': 355, "#": 556, "$": 556, "%": 889, "&": 667, "\u2019": 222, "(": 333, ")": 333, "*": 389, "+": 584, ",": 278, "-": 333, ".": 278, "/": 278, "0": 556, "1": 556, "2": 556, "3": 556, "4": 556, "5": 556, "6": 556, "7": 556, "8": 556, "9": 556, ":": 278, ";": 278, "<": 584, "=": 584, ">": 584, "?": 556, "@": 1015, "A": 667, "B": 667, "C": 722, "D": 722, "E": 667, "F": 611, "G": 778, "H": 722, "I": 278, "J": 500, "K": 667, "L": 556, "M": 833, "N": 722, "O": 778, "P": 667, "Q": 778, "R": 722, "S": 667, "T": 611, "U": 722, "V": 667, "W": 944, "X": 667, "Y": 667, "Z": 611, "[": 278, "\\": 278, "]": 278, "^": 469, "_": 556, "\u2018": 222, "a": 556, "b": 556, "c": 500, "d": 556, "e": 556, "f": 278, "g": 556, "h": 556, "i": 222, "j": 222, "k": 500, "l": 222, "m": 833, "n": 556, "o": 556, "p": 556, "q": 556, "r": 333, "s": 500, "t": 278, "u": 556, "v": 500, "w": 722, "x": 500, "y": 500, "z": 500, "{": 334, "|": 260, "}": 334, "~": 584, "\xa1": 333, "\xa2": 556, "\xa3": 556, "\u2044": 167, "\xa5": 556, "\u0192": 556, "\xa7": 556, "\xa4": 556, "'": 191, "\u201c": 333, "\xab": 556, "\u2039": 333, "\u203a": 333, "\ufb01": 500, "\ufb02": 500, "\u2013": 556, "\u2020": 556, "\u2021": 556, "\xb7": 278, "\xb6": 537, "\u2022": 350, "\u201a": 222, "\u201e": 333, "\u201d": 333, "\xbb": 556, "\u2026": 1000, "\u2030": 1000, "\xbf": 611, "`": 333, "\xb4": 333, "\u02c6": 333, "\u02dc": 333, "\xaf": 333, "\u02d8": 333, "\u02d9": 333, "\xa8": 333, "\u02da": 333, "\xb8": 333, "\u02dd": 333, "\u02db": 333, "\u02c7": 333, "\u2014": 1000, "\xc6": 1000, "\xaa": 370, "\u0141": 556, "\xd8": 778, "\u0152": 1000, "\xba": 365, "\xe6": 889, "\u0131": 278, "\u0142": 222, "\xf8": 611, "\u0153": 944, "\xdf": 611, 
"\xcf": 278, "\xe9": 556, "\u0103": 556, "\u0171": 556, "\u011b": 556, "\u0178": 667, "\xf7": 584, "\xdd": 667, "\xc2": 667, "\xe1": 556, "\xdb": 722, "\xfd": 500, "\u0219": 500, "\xea": 556, "\u016e": 722, "\xdc": 722, "\u0105": 556, "\xda": 722, "\u0173": 556, "\xcb": 667, "\u0110": 722, "\uf6c3": 250, "\xa9": 737, "\u0112": 667, "\u010d": 500, "\xe5": 556, "\u0145": 722, "\u013a": 222, "\xe0": 556, "\u0162": 611, "\u0106": 722, "\xe3": 556, "\u0116": 667, "\u0161": 500, "\u015f": 500, "\xed": 278, "\u25ca": 471, "\u0158": 722, "\u0122": 778, "\xfb": 556, "\xe2": 556, "\u0100": 667, "\u0159": 333, "\xe7": 500, "\u017b": 611, "\xde": 667, "\u014c": 778, "\u0154": 722, "\u015a": 667, "\u010f": 643, "\u016a": 722, "\u016f": 556, "\xb3": 333, "\xd2": 778, "\xc0": 667, "\u0102": 667, "\xd7": 584, "\xfa": 556, "\u0164": 611, "\u2202": 476, "\xff": 500, "\u0143": 722, "\xee": 278, "\xca": 667, "\xe4": 556, "\xeb": 556, "\u0107": 500, "\u0144": 556, "\u016b": 556, "\u0147": 722, "\xcd": 278, "\xb1": 584, "\xa6": 260, "\xae": 737, "\u011e": 778, "\u0130": 278, "\u2211": 600, "\xc8": 667, "\u0155": 333, "\u014d": 556, "\u0179": 611, "\u017d": 611, "\u2265": 549, "\xd0": 722, "\xc7": 722, "\u013c": 222, "\u0165": 317, "\u0119": 556, "\u0172": 722, "\xc1": 667, "\xc4": 667, "\xe8": 556, "\u017a": 500, "\u012f": 222, "\xd3": 778, "\xf3": 556, "\u0101": 556, "\u015b": 500, "\xef": 278, "\xd4": 778, "\xd9": 722, "\u2206": 612, "\xfe": 556, "\xb2": 333, "\xd6": 778, "\xb5": 556, "\xec": 278, "\u0151": 556, "\u0118": 667, "\u0111": 556, "\xbe": 834, "\u015e": 667, "\u013e": 299, "\u0136": 667, "\u0139": 556, "\u2122": 1000, "\u0117": 556, "\xcc": 278, "\u012a": 278, "\u013d": 556, "\xbd": 834, "\u2264": 549, "\xf4": 556, "\xf1": 556, "\u0170": 722, "\xc9": 667, "\u0113": 556, "\u011f": 556, "\xbc": 834, "\u0160": 667, "\u0218": 667, "\u0150": 778, "\xb0": 400, "\xf2": 556, "\u010c": 722, "\xf9": 556, "\u221a": 453, "\u010e": 722, "\u0157": 333, "\xd1": 722, "\xf5": 556, "\u0156": 
722, "\u013b": 556, "\xc3": 667, "\u0104": 667, "\xc5": 667, "\xd5": 778, "\u017c": 500, "\u011a": 667, "\u012e": 278, "\u0137": 500, "\u2212": 584, "\xce": 278, "\u0148": 556, "\u0163": 278, "\xac": 584, "\xf6": 556, "\xfc": 556, "\u2260": 549, "\u0123": 556, "\xf0": 556, "\u017e": 500, "\u0146": 556, "\xb9": 333, "\u012b": 278, "\u20ac": 556, }, ), # Generated from Helvetica-Bold.afm # Copyright (c) 1985, 1987, 1989, 1990, 1997 Adobe Systems Incorporated. All Rights Reserved. # Helvetica is a trademark of Linotype-Hell AG and/or its subsidiaries. "Helvetica-Bold": CoreFontMetrics( font_descriptor=FontDescriptor( name="Helvetica-Bold", family="Helvetica", weight="Bold", ascent=718, descent=-207, cap_height=718, x_height=532, italic_angle=0, flags=32, bbox=(-170.0, -228.0, 1003.0, 962.0), ), character_widths={ " ": 278, "default": 556, "!": 333, '"': 474, "#": 556, "$": 556, "%": 889, "&": 722, "\u2019": 278, "(": 333, ")": 333, "*": 389, "+": 584, ",": 278, "-": 333, ".": 278, "/": 278, "0": 556, "1": 556, "2": 556, "3": 556, "4": 556, "5": 556, "6": 556, "7": 556, "8": 556, "9": 556, ":": 333, ";": 333, "<": 584, "=": 584, ">": 584, "?": 611, "@": 975, "A": 722, "B": 722, "C": 722, "D": 722, "E": 667, "F": 611, "G": 778, "H": 722, "I": 278, "J": 556, "K": 722, "L": 611, "M": 833, "N": 722, "O": 778, "P": 667, "Q": 778, "R": 722, "S": 667, "T": 611, "U": 722, "V": 667, "W": 944, "X": 667, "Y": 667, "Z": 611, "[": 333, "\\": 278, "]": 333, "^": 584, "_": 556, "\u2018": 278, "a": 556, "b": 611, "c": 556, "d": 611, "e": 556, "f": 333, "g": 611, "h": 611, "i": 278, "j": 278, "k": 556, "l": 278, "m": 889, "n": 611, "o": 611, "p": 611, "q": 611, "r": 389, "s": 556, "t": 333, "u": 611, "v": 556, "w": 778, "x": 556, "y": 556, "z": 500, "{": 389, "|": 280, "}": 389, "~": 584, "\xa1": 333, "\xa2": 556, "\xa3": 556, "\u2044": 167, "\xa5": 556, "\u0192": 556, "\xa7": 556, "\xa4": 556, "'": 238, "\u201c": 500, "\xab": 556, "\u2039": 333, "\u203a": 333, "\ufb01": 611, "\ufb02": 
611, "\u2013": 556, "\u2020": 556, "\u2021": 556, "\xb7": 278, "\xb6": 556, "\u2022": 350, "\u201a": 278, "\u201e": 500, "\u201d": 500, "\xbb": 556, "\u2026": 1000, "\u2030": 1000, "\xbf": 611, "`": 333, "\xb4": 333, "\u02c6": 333, "\u02dc": 333, "\xaf": 333, "\u02d8": 333, "\u02d9": 333, "\xa8": 333, "\u02da": 333, "\xb8": 333, "\u02dd": 333, "\u02db": 333, "\u02c7": 333, "\u2014": 1000, "\xc6": 1000, "\xaa": 370, "\u0141": 611, "\xd8": 778, "\u0152": 1000, "\xba": 365, "\xe6": 889, "\u0131": 278, "\u0142": 278, "\xf8": 611, "\u0153": 944, "\xdf": 611, "\xcf": 278, "\xe9": 556, "\u0103": 556, "\u0171": 611, "\u011b": 556, "\u0178": 667, "\xf7": 584, "\xdd": 667, "\xc2": 722, "\xe1": 556, "\xdb": 722, "\xfd": 556, "\u0219": 556, "\xea": 556, "\u016e": 722, "\xdc": 722, "\u0105": 556, "\xda": 722, "\u0173": 611, "\xcb": 667, "\u0110": 722, "\uf6c3": 250, "\xa9": 737, "\u0112": 667, "\u010d": 556, "\xe5": 556, "\u0145": 722, "\u013a": 278, "\xe0": 556, "\u0162": 611, "\u0106": 722, "\xe3": 556, "\u0116": 667, "\u0161": 556, "\u015f": 556, "\xed": 278, "\u25ca": 494, "\u0158": 722, "\u0122": 778, "\xfb": 611, "\xe2": 556, "\u0100": 722, "\u0159": 389, "\xe7": 556, "\u017b": 611, "\xde": 667, "\u014c": 778, "\u0154": 722, "\u015a": 667, "\u010f": 743, "\u016a": 722, "\u016f": 611, "\xb3": 333, "\xd2": 778, "\xc0": 722, "\u0102": 722, "\xd7": 584, "\xfa": 611, "\u0164": 611, "\u2202": 494, "\xff": 556, "\u0143": 722, "\xee": 278, "\xca": 667, "\xe4": 556, "\xeb": 556, "\u0107": 556, "\u0144": 611, "\u016b": 611, "\u0147": 722, "\xcd": 278, "\xb1": 584, "\xa6": 280, "\xae": 737, "\u011e": 778, "\u0130": 278, "\u2211": 600, "\xc8": 667, "\u0155": 389, "\u014d": 611, "\u0179": 611, "\u017d": 611, "\u2265": 549, "\xd0": 722, "\xc7": 722, "\u013c": 278, "\u0165": 389, "\u0119": 556, "\u0172": 722, "\xc1": 722, "\xc4": 722, "\xe8": 556, "\u017a": 500, "\u012f": 278, "\xd3": 778, "\xf3": 611, "\u0101": 556, "\u015b": 556, "\xef": 278, "\xd4": 778, "\xd9": 722, "\u2206": 612, 
"\xfe": 611, "\xb2": 333, "\xd6": 778, "\xb5": 611, "\xec": 278, "\u0151": 611, "\u0118": 667, "\u0111": 611, "\xbe": 834, "\u015e": 667, "\u013e": 400, "\u0136": 722, "\u0139": 611, "\u2122": 1000, "\u0117": 556, "\xcc": 278, "\u012a": 278, "\u013d": 611, "\xbd": 834, "\u2264": 549, "\xf4": 611, "\xf1": 611, "\u0170": 722, "\xc9": 667, "\u0113": 556, "\u011f": 611, "\xbc": 834, "\u0160": 667, "\u0218": 667, "\u0150": 778, "\xb0": 400, "\xf2": 611, "\u010c": 722, "\xf9": 611, "\u221a": 549, "\u010e": 722, "\u0157": 389, "\xd1": 722, "\xf5": 611, "\u0156": 722, "\u013b": 611, "\xc3": 722, "\u0104": 722, "\xc5": 722, "\xd5": 778, "\u017c": 500, "\u011a": 667, "\u012e": 278, "\u0137": 556, "\u2212": 584, "\xce": 278, "\u0148": 611, "\u0163": 333, "\xac": 584, "\xf6": 611, "\xfc": 611, "\u2260": 549, "\u0123": 611, "\xf0": 611, "\u017e": 500, "\u0146": 611, "\xb9": 333, "\u012b": 278, "\u20ac": 556, }, ), # Generated from Helvetica-BoldOblique.afm # Copyright (c) 1985, 1987, 1989, 1990, 1997 Adobe Systems Incorporated. All Rights Reserved. # Helvetica is a trademark of Linotype-Hell AG and/or its subsidiaries. 
"Helvetica-BoldOblique": CoreFontMetrics( font_descriptor=FontDescriptor( name="Helvetica-BoldOblique", family="Helvetica", weight="Bold", ascent=718, descent=-207, cap_height=718, x_height=532, italic_angle=-12, flags=96, bbox=(-174.0, -228.0, 1114.0, 962.0), ), character_widths={ " ": 278, "default": 556, "!": 333, '"': 474, "#": 556, "$": 556, "%": 889, "&": 722, "\u2019": 278, "(": 333, ")": 333, "*": 389, "+": 584, ",": 278, "-": 333, ".": 278, "/": 278, "0": 556, "1": 556, "2": 556, "3": 556, "4": 556, "5": 556, "6": 556, "7": 556, "8": 556, "9": 556, ":": 333, ";": 333, "<": 584, "=": 584, ">": 584, "?": 611, "@": 975, "A": 722, "B": 722, "C": 722, "D": 722, "E": 667, "F": 611, "G": 778, "H": 722, "I": 278, "J": 556, "K": 722, "L": 611, "M": 833, "N": 722, "O": 778, "P": 667, "Q": 778, "R": 722, "S": 667, "T": 611, "U": 722, "V": 667, "W": 944, "X": 667, "Y": 667, "Z": 611, "[": 333, "\\": 278, "]": 333, "^": 584, "_": 556, "\u2018": 278, "a": 556, "b": 611, "c": 556, "d": 611, "e": 556, "f": 333, "g": 611, "h": 611, "i": 278, "j": 278, "k": 556, "l": 278, "m": 889, "n": 611, "o": 611, "p": 611, "q": 611, "r": 389, "s": 556, "t": 333, "u": 611, "v": 556, "w": 778, "x": 556, "y": 556, "z": 500, "{": 389, "|": 280, "}": 389, "~": 584, "\xa1": 333, "\xa2": 556, "\xa3": 556, "\u2044": 167, "\xa5": 556, "\u0192": 556, "\xa7": 556, "\xa4": 556, "'": 238, "\u201c": 500, "\xab": 556, "\u2039": 333, "\u203a": 333, "\ufb01": 611, "\ufb02": 611, "\u2013": 556, "\u2020": 556, "\u2021": 556, "\xb7": 278, "\xb6": 556, "\u2022": 350, "\u201a": 278, "\u201e": 500, "\u201d": 500, "\xbb": 556, "\u2026": 1000, "\u2030": 1000, "\xbf": 611, "`": 333, "\xb4": 333, "\u02c6": 333, "\u02dc": 333, "\xaf": 333, "\u02d8": 333, "\u02d9": 333, "\xa8": 333, "\u02da": 333, "\xb8": 333, "\u02dd": 333, "\u02db": 333, "\u02c7": 333, "\u2014": 1000, "\xc6": 1000, "\xaa": 370, "\u0141": 611, "\xd8": 778, "\u0152": 1000, "\xba": 365, "\xe6": 889, "\u0131": 278, "\u0142": 278, "\xf8": 611, 
"\u0153": 944, "\xdf": 611, "\xcf": 278, "\xe9": 556, "\u0103": 556, "\u0171": 611, "\u011b": 556, "\u0178": 667, "\xf7": 584, "\xdd": 667, "\xc2": 722, "\xe1": 556, "\xdb": 722, "\xfd": 556, "\u0219": 556, "\xea": 556, "\u016e": 722, "\xdc": 722, "\u0105": 556, "\xda": 722, "\u0173": 611, "\xcb": 667, "\u0110": 722, "\uf6c3": 250, "\xa9": 737, "\u0112": 667, "\u010d": 556, "\xe5": 556, "\u0145": 722, "\u013a": 278, "\xe0": 556, "\u0162": 611, "\u0106": 722, "\xe3": 556, "\u0116": 667, "\u0161": 556, "\u015f": 556, "\xed": 278, "\u25ca": 494, "\u0158": 722, "\u0122": 778, "\xfb": 611, "\xe2": 556, "\u0100": 722, "\u0159": 389, "\xe7": 556, "\u017b": 611, "\xde": 667, "\u014c": 778, "\u0154": 722, "\u015a": 667, "\u010f": 743, "\u016a": 722, "\u016f": 611, "\xb3": 333, "\xd2": 778, "\xc0": 722, "\u0102": 722, "\xd7": 584, "\xfa": 611, "\u0164": 611, "\u2202": 494, "\xff": 556, "\u0143": 722, "\xee": 278, "\xca": 667, "\xe4": 556, "\xeb": 556, "\u0107": 556, "\u0144": 611, "\u016b": 611, "\u0147": 722, "\xcd": 278, "\xb1": 584, "\xa6": 280, "\xae": 737, "\u011e": 778, "\u0130": 278, "\u2211": 600, "\xc8": 667, "\u0155": 389, "\u014d": 611, "\u0179": 611, "\u017d": 611, "\u2265": 549, "\xd0": 722, "\xc7": 722, "\u013c": 278, "\u0165": 389, "\u0119": 556, "\u0172": 722, "\xc1": 722, "\xc4": 722, "\xe8": 556, "\u017a": 500, "\u012f": 278, "\xd3": 778, "\xf3": 611, "\u0101": 556, "\u015b": 556, "\xef": 278, "\xd4": 778, "\xd9": 722, "\u2206": 612, "\xfe": 611, "\xb2": 333, "\xd6": 778, "\xb5": 611, "\xec": 278, "\u0151": 611, "\u0118": 667, "\u0111": 611, "\xbe": 834, "\u015e": 667, "\u013e": 400, "\u0136": 722, "\u0139": 611, "\u2122": 1000, "\u0117": 556, "\xcc": 278, "\u012a": 278, "\u013d": 611, "\xbd": 834, "\u2264": 549, "\xf4": 611, "\xf1": 611, "\u0170": 722, "\xc9": 667, "\u0113": 556, "\u011f": 611, "\xbc": 834, "\u0160": 667, "\u0218": 667, "\u0150": 778, "\xb0": 400, "\xf2": 611, "\u010c": 722, "\xf9": 611, "\u221a": 549, "\u010e": 722, "\u0157": 389, "\xd1": 
722, "\xf5": 611, "\u0156": 722, "\u013b": 611, "\xc3": 722, "\u0104": 722, "\xc5": 722, "\xd5": 778, "\u017c": 500, "\u011a": 667, "\u012e": 278, "\u0137": 556, "\u2212": 584, "\xce": 278, "\u0148": 611, "\u0163": 333, "\xac": 584, "\xf6": 611, "\xfc": 611, "\u2260": 549, "\u0123": 611, "\xf0": 611, "\u017e": 500, "\u0146": 611, "\xb9": 333, "\u012b": 278, "\u20ac": 556, }, ), # Generated from Helvetica-Oblique.afm # Copyright (c) 1985, 1987, 1989, 1990, 1997 Adobe Systems Incorporated. All Rights Reserved. # Helvetica is a trademark of Linotype-Hell AG and/or its subsidiaries. "Helvetica-Oblique": CoreFontMetrics( font_descriptor=FontDescriptor( name="Helvetica-Oblique", family="Helvetica", weight="Medium", ascent=718, descent=-207, cap_height=718, x_height=523, italic_angle=-12, flags=96, bbox=(-170.0, -225.0, 1116.0, 931.0), ), character_widths={ " ": 278, "default": 556, "!": 278, '"': 355, "#": 556, "$": 556, "%": 889, "&": 667, "\u2019": 222, "(": 333, ")": 333, "*": 389, "+": 584, ",": 278, "-": 333, ".": 278, "/": 278, "0": 556, "1": 556, "2": 556, "3": 556, "4": 556, "5": 556, "6": 556, "7": 556, "8": 556, "9": 556, ":": 278, ";": 278, "<": 584, "=": 584, ">": 584, "?": 556, "@": 1015, "A": 667, "B": 667, "C": 722, "D": 722, "E": 667, "F": 611, "G": 778, "H": 722, "I": 278, "J": 500, "K": 667, "L": 556, "M": 833, "N": 722, "O": 778, "P": 667, "Q": 778, "R": 722, "S": 667, "T": 611, "U": 722, "V": 667, "W": 944, "X": 667, "Y": 667, "Z": 611, "[": 278, "\\": 278, "]": 278, "^": 469, "_": 556, "\u2018": 222, "a": 556, "b": 556, "c": 500, "d": 556, "e": 556, "f": 278, "g": 556, "h": 556, "i": 222, "j": 222, "k": 500, "l": 222, "m": 833, "n": 556, "o": 556, "p": 556, "q": 556, "r": 333, "s": 500, "t": 278, "u": 556, "v": 500, "w": 722, "x": 500, "y": 500, "z": 500, "{": 334, "|": 260, "}": 334, "~": 584, "\xa1": 333, "\xa2": 556, "\xa3": 556, "\u2044": 167, "\xa5": 556, "\u0192": 556, "\xa7": 556, "\xa4": 556, "'": 191, "\u201c": 333, "\xab": 556, "\u2039": 
333, "\u203a": 333, "\ufb01": 500, "\ufb02": 500, "\u2013": 556, "\u2020": 556, "\u2021": 556, "\xb7": 278, "\xb6": 537, "\u2022": 350, "\u201a": 222, "\u201e": 333, "\u201d": 333, "\xbb": 556, "\u2026": 1000, "\u2030": 1000, "\xbf": 611, "`": 333, "\xb4": 333, "\u02c6": 333, "\u02dc": 333, "\xaf": 333, "\u02d8": 333, "\u02d9": 333, "\xa8": 333, "\u02da": 333, "\xb8": 333, "\u02dd": 333, "\u02db": 333, "\u02c7": 333, "\u2014": 1000, "\xc6": 1000, "\xaa": 370, "\u0141": 556, "\xd8": 778, "\u0152": 1000, "\xba": 365, "\xe6": 889, "\u0131": 278, "\u0142": 222, "\xf8": 611, "\u0153": 944, "\xdf": 611, "\xcf": 278, "\xe9": 556, "\u0103": 556, "\u0171": 556, "\u011b": 556, "\u0178": 667, "\xf7": 584, "\xdd": 667, "\xc2": 667, "\xe1": 556, "\xdb": 722, "\xfd": 500, "\u0219": 500, "\xea": 556, "\u016e": 722, "\xdc": 722, "\u0105": 556, "\xda": 722, "\u0173": 556, "\xcb": 667, "\u0110": 722, "\uf6c3": 250, "\xa9": 737, "\u0112": 667, "\u010d": 500, "\xe5": 556, "\u0145": 722, "\u013a": 222, "\xe0": 556, "\u0162": 611, "\u0106": 722, "\xe3": 556, "\u0116": 667, "\u0161": 500, "\u015f": 500, "\xed": 278, "\u25ca": 471, "\u0158": 722, "\u0122": 778, "\xfb": 556, "\xe2": 556, "\u0100": 667, "\u0159": 333, "\xe7": 500, "\u017b": 611, "\xde": 667, "\u014c": 778, "\u0154": 722, "\u015a": 667, "\u010f": 643, "\u016a": 722, "\u016f": 556, "\xb3": 333, "\xd2": 778, "\xc0": 667, "\u0102": 667, "\xd7": 584, "\xfa": 556, "\u0164": 611, "\u2202": 476, "\xff": 500, "\u0143": 722, "\xee": 278, "\xca": 667, "\xe4": 556, "\xeb": 556, "\u0107": 500, "\u0144": 556, "\u016b": 556, "\u0147": 722, "\xcd": 278, "\xb1": 584, "\xa6": 260, "\xae": 737, "\u011e": 778, "\u0130": 278, "\u2211": 600, "\xc8": 667, "\u0155": 333, "\u014d": 556, "\u0179": 611, "\u017d": 611, "\u2265": 549, "\xd0": 722, "\xc7": 722, "\u013c": 222, "\u0165": 317, "\u0119": 556, "\u0172": 722, "\xc1": 667, "\xc4": 667, "\xe8": 556, "\u017a": 500, "\u012f": 222, "\xd3": 778, "\xf3": 556, "\u0101": 556, "\u015b": 500, "\xef": 
278, "\xd4": 778, "\xd9": 722, "\u2206": 612, "\xfe": 556, "\xb2": 333, "\xd6": 778, "\xb5": 556, "\xec": 278, "\u0151": 556, "\u0118": 667, "\u0111": 556, "\xbe": 834, "\u015e": 667, "\u013e": 299, "\u0136": 667, "\u0139": 556, "\u2122": 1000, "\u0117": 556, "\xcc": 278, "\u012a": 278, "\u013d": 556, "\xbd": 834, "\u2264": 549, "\xf4": 556, "\xf1": 556, "\u0170": 722, "\xc9": 667, "\u0113": 556, "\u011f": 556, "\xbc": 834, "\u0160": 667, "\u0218": 667, "\u0150": 778, "\xb0": 400, "\xf2": 556, "\u010c": 722, "\xf9": 556, "\u221a": 453, "\u010e": 722, "\u0157": 333, "\xd1": 722, "\xf5": 556, "\u0156": 722, "\u013b": 556, "\xc3": 667, "\u0104": 667, "\xc5": 667, "\xd5": 778, "\u017c": 500, "\u011a": 667, "\u012e": 278, "\u0137": 500, "\u2212": 584, "\xce": 278, "\u0148": 556, "\u0163": 278, "\xac": 584, "\xf6": 556, "\xfc": 556, "\u2260": 549, "\u0123": 556, "\xf0": 556, "\u017e": 500, "\u0146": 556, "\xb9": 333, "\u012b": 278, "\u20ac": 556, }, ), # Generated from Symbol.afm # Copyright (c) 1985, 1987, 1989, 1990, 1997 Adobe Systems Incorporated. All rights reserved. 
"Symbol": CoreFontMetrics( font_descriptor=FontDescriptor( name="Symbol", family="Symbol", weight="Medium", ascent=0.0, descent=0.0, cap_height=0.0, x_height=0.0, italic_angle=0, flags=4, bbox=(-180.0, -293.0, 1090.0, 1010.0), ), character_widths={ " ": 250, "default": 500, "!": 333, "\u2200": 713, "#": 500, "\u2203": 549, "%": 833, "&": 778, "\u220b": 439, "(": 333, ")": 333, "\u2217": 500, "+": 549, ",": 250, "\u2212": 549, ".": 250, "/": 278, "0": 500, "1": 500, "2": 500, "3": 500, "4": 500, "5": 500, "6": 500, "7": 500, "8": 500, "9": 500, ":": 278, ";": 278, "<": 549, "=": 549, ">": 549, "?": 444, "\u2245": 549, "\u0391": 722, "\u0392": 667, "\u03a7": 722, "\u2206": 612, "\u0395": 611, "\u03a6": 763, "\u0393": 603, "\u0397": 722, "\u0399": 333, "\u03d1": 631, "\u039a": 722, "\u039b": 686, "\u039c": 889, "\u039d": 722, "\u039f": 722, "\u03a0": 768, "\u0398": 741, "\u03a1": 556, "\u03a3": 592, "\u03a4": 611, "\u03a5": 690, "\u03c2": 439, "\u2126": 768, "\u039e": 645, "\u03a8": 795, "\u0396": 611, "[": 333, "\u2234": 863, "]": 333, "\u22a5": 658, "_": 500, "\uf8e5": 500, "\u03b1": 631, "\u03b2": 549, "\u03c7": 549, "\u03b4": 494, "\u03b5": 439, "\u03c6": 521, "\u03b3": 411, "\u03b7": 603, "\u03b9": 329, "\u03d5": 603, "\u03ba": 549, "\u03bb": 549, "\xb5": 576, "\u03bd": 521, "\u03bf": 549, "\u03c0": 549, "\u03b8": 521, "\u03c1": 549, "\u03c3": 603, "\u03c4": 439, "\u03c5": 576, "\u03d6": 713, "\u03c9": 686, "\u03be": 493, "\u03c8": 686, "\u03b6": 494, "{": 480, "|": 200, "}": 480, "\u223c": 549, "\u20ac": 750, "\u03d2": 620, "\u2032": 247, "\u2264": 549, "\u2044": 167, "\u221e": 713, "\u0192": 500, "\u2663": 753, "\u2666": 753, "\u2665": 753, "\u2660": 753, "\u2194": 1042, "\u2190": 987, "\u2191": 603, "\u2192": 987, "\u2193": 603, "\xb0": 400, "\xb1": 549, "\u2033": 411, "\u2265": 549, "\xd7": 549, "\u221d": 713, "\u2202": 494, "\u2022": 460, "\xf7": 549, "\u2260": 549, "\u2261": 549, "\u2248": 549, "\u2026": 1000, "\uf8e6": 603, "\uf8e7": 1000, "\u21b5": 658, 
"\u2135": 823, "\u2111": 686, "\u211c": 795, "\u2118": 987, "\u2297": 768, "\u2295": 768, "\u2205": 823, "\u2229": 768, "\u222a": 768, "\u2283": 713, "\u2287": 713, "\u2284": 713, "\u2282": 713, "\u2286": 713, "\u2208": 713, "\u2209": 713, "\u2220": 768, "\u2207": 713, "\uf6da": 790, "\uf6d9": 790, "\uf6db": 890, "\u220f": 823, "\u221a": 549, "\u22c5": 250, "\xac": 713, "\u2227": 603, "\u2228": 603, "\u21d4": 1042, "\u21d0": 987, "\u21d1": 603, "\u21d2": 987, "\u21d3": 603, "\u25ca": 494, "\u2329": 329, "\uf8e8": 790, "\uf8e9": 790, "\uf8ea": 786, "\u2211": 713, "\uf8eb": 384, "\uf8ec": 384, "\uf8ed": 384, "\uf8ee": 384, "\uf8ef": 384, "\uf8f0": 384, "\uf8f1": 494, "\uf8f2": 494, "\uf8f3": 494, "\uf8f4": 494, "\u232a": 329, "\u222b": 274, "\u2320": 686, "\uf8f5": 686, "\u2321": 686, "\uf8f6": 384, "\uf8f7": 384, "\uf8f8": 384, "\uf8f9": 384, "\uf8fa": 384, "\uf8fb": 384, "\uf8fc": 494, "\uf8fd": 494, "\uf8fe": 494, "\uf8ff": 790, }, ), # Generated from Times-Bold.afm # Copyright (c) 1985, 1987, 1989, 1990, 1993, 1997 Adobe Systems Incorporated. All Rights # Reserved. Times is a trademark of Linotype-Hell AG and/or its subsidiaries. 
"Times-Bold": CoreFontMetrics( font_descriptor=FontDescriptor( name="Times-Bold", family="Times", weight="Bold", ascent=683, descent=-217, cap_height=676, x_height=461, italic_angle=0, flags=34, bbox=(-168.0, -218.0, 1000.0, 935.0), ), character_widths={ " ": 250, "default": 500, "!": 333, '"': 555, "#": 500, "$": 500, "%": 1000, "&": 833, "\u2019": 333, "(": 333, ")": 333, "*": 500, "+": 570, ",": 250, "-": 333, ".": 250, "/": 278, "0": 500, "1": 500, "2": 500, "3": 500, "4": 500, "5": 500, "6": 500, "7": 500, "8": 500, "9": 500, ":": 333, ";": 333, "<": 570, "=": 570, ">": 570, "?": 500, "@": 930, "A": 722, "B": 667, "C": 722, "D": 722, "E": 667, "F": 611, "G": 778, "H": 778, "I": 389, "J": 500, "K": 778, "L": 667, "M": 944, "N": 722, "O": 778, "P": 611, "Q": 778, "R": 722, "S": 556, "T": 667, "U": 722, "V": 722, "W": 1000, "X": 722, "Y": 722, "Z": 667, "[": 333, "\\": 278, "]": 333, "^": 581, "_": 500, "\u2018": 333, "a": 500, "b": 556, "c": 444, "d": 556, "e": 444, "f": 333, "g": 500, "h": 556, "i": 278, "j": 333, "k": 556, "l": 278, "m": 833, "n": 556, "o": 500, "p": 556, "q": 556, "r": 444, "s": 389, "t": 333, "u": 556, "v": 500, "w": 722, "x": 500, "y": 500, "z": 444, "{": 394, "|": 220, "}": 394, "~": 520, "\xa1": 333, "\xa2": 500, "\xa3": 500, "\u2044": 167, "\xa5": 500, "\u0192": 500, "\xa7": 500, "\xa4": 500, "'": 278, "\u201c": 500, "\xab": 500, "\u2039": 333, "\u203a": 333, "\ufb01": 556, "\ufb02": 556, "\u2013": 500, "\u2020": 500, "\u2021": 500, "\xb7": 250, "\xb6": 540, "\u2022": 350, "\u201a": 333, "\u201e": 500, "\u201d": 500, "\xbb": 500, "\u2026": 1000, "\u2030": 1000, "\xbf": 500, "`": 333, "\xb4": 333, "\u02c6": 333, "\u02dc": 333, "\xaf": 333, "\u02d8": 333, "\u02d9": 333, "\xa8": 333, "\u02da": 333, "\xb8": 333, "\u02dd": 333, "\u02db": 333, "\u02c7": 333, "\u2014": 1000, "\xc6": 1000, "\xaa": 300, "\u0141": 667, "\xd8": 778, "\u0152": 1000, "\xba": 330, "\xe6": 722, "\u0131": 278, "\u0142": 278, "\xf8": 500, "\u0153": 722, "\xdf": 556, 
"\xcf": 389, "\xe9": 444, "\u0103": 500, "\u0171": 556, "\u011b": 444, "\u0178": 722, "\xf7": 570, "\xdd": 722, "\xc2": 722, "\xe1": 500, "\xdb": 722, "\xfd": 500, "\u0219": 389, "\xea": 444, "\u016e": 722, "\xdc": 722, "\u0105": 500, "\xda": 722, "\u0173": 556, "\xcb": 667, "\u0110": 722, "\uf6c3": 250, "\xa9": 747, "\u0112": 667, "\u010d": 444, "\xe5": 500, "\u0145": 722, "\u013a": 278, "\xe0": 500, "\u0162": 667, "\u0106": 722, "\xe3": 500, "\u0116": 667, "\u0161": 389, "\u015f": 389, "\xed": 278, "\u25ca": 494, "\u0158": 722, "\u0122": 778, "\xfb": 556, "\xe2": 500, "\u0100": 722, "\u0159": 444, "\xe7": 444, "\u017b": 667, "\xde": 611, "\u014c": 778, "\u0154": 722, "\u015a": 556, "\u010f": 672, "\u016a": 722, "\u016f": 556, "\xb3": 300, "\xd2": 778, "\xc0": 722, "\u0102": 722, "\xd7": 570, "\xfa": 556, "\u0164": 667, "\u2202": 494, "\xff": 500, "\u0143": 722, "\xee": 278, "\xca": 667, "\xe4": 500, "\xeb": 444, "\u0107": 444, "\u0144": 556, "\u016b": 556, "\u0147": 722, "\xcd": 389, "\xb1": 570, "\xa6": 220, "\xae": 747, "\u011e": 778, "\u0130": 389, "\u2211": 600, "\xc8": 667, "\u0155": 444, "\u014d": 500, "\u0179": 667, "\u017d": 667, "\u2265": 549, "\xd0": 722, "\xc7": 722, "\u013c": 278, "\u0165": 416, "\u0119": 444, "\u0172": 722, "\xc1": 722, "\xc4": 722, "\xe8": 444, "\u017a": 444, "\u012f": 278, "\xd3": 778, "\xf3": 500, "\u0101": 500, "\u015b": 389, "\xef": 278, "\xd4": 778, "\xd9": 722, "\u2206": 612, "\xfe": 556, "\xb2": 300, "\xd6": 778, "\xb5": 556, "\xec": 278, "\u0151": 500, "\u0118": 667, "\u0111": 556, "\xbe": 750, "\u015e": 556, "\u013e": 394, "\u0136": 778, "\u0139": 667, "\u2122": 1000, "\u0117": 444, "\xcc": 389, "\u012a": 389, "\u013d": 667, "\xbd": 750, "\u2264": 549, "\xf4": 500, "\xf1": 556, "\u0170": 722, "\xc9": 667, "\u0113": 444, "\u011f": 500, "\xbc": 750, "\u0160": 556, "\u0218": 556, "\u0150": 778, "\xb0": 400, "\xf2": 500, "\u010c": 722, "\xf9": 556, "\u221a": 549, "\u010e": 722, "\u0157": 444, "\xd1": 722, "\xf5": 500, "\u0156": 
722, "\u013b": 667, "\xc3": 722, "\u0104": 722, "\xc5": 722, "\xd5": 778, "\u017c": 444, "\u011a": 667, "\u012e": 389, "\u0137": 556, "\u2212": 570, "\xce": 389, "\u0148": 556, "\u0163": 333, "\xac": 570, "\xf6": 500, "\xfc": 556, "\u2260": 549, "\u0123": 500, "\xf0": 500, "\u017e": 444, "\u0146": 556, "\xb9": 300, "\u012b": 278, "\u20ac": 500, }, ), # Generated from Times-BoldItalic.afm # Copyright (c) 1985, 1987, 1989, 1990, 1993, 1997 Adobe Systems Incorporated. All Rights # Reserved. Times is a trademark of Linotype-Hell AG and/or its subsidiaries. "Times-BoldItalic": CoreFontMetrics( font_descriptor=FontDescriptor( name="Times-BoldItalic", family="Times", weight="Bold", ascent=683, descent=-217, cap_height=669, x_height=462, italic_angle=-15, flags=98, bbox=(-200.0, -218.0, 996.0, 921.0), ), character_widths={ " ": 250, "default": 500, "!": 389, '"': 555, "#": 500, "$": 500, "%": 833, "&": 778, "\u2019": 333, "(": 333, ")": 333, "*": 500, "+": 570, ",": 250, "-": 333, ".": 250, "/": 278, "0": 500, "1": 500, "2": 500, "3": 500, "4": 500, "5": 500, "6": 500, "7": 500, "8": 500, "9": 500, ":": 333, ";": 333, "<": 570, "=": 570, ">": 570, "?": 500, "@": 832, "A": 667, "B": 667, "C": 667, "D": 722, "E": 667, "F": 667, "G": 722, "H": 778, "I": 389, "J": 500, "K": 667, "L": 611, "M": 889, "N": 722, "O": 722, "P": 611, "Q": 722, "R": 667, "S": 556, "T": 611, "U": 722, "V": 667, "W": 889, "X": 667, "Y": 611, "Z": 611, "[": 333, "\\": 278, "]": 333, "^": 570, "_": 500, "\u2018": 333, "a": 500, "b": 500, "c": 444, "d": 500, "e": 444, "f": 333, "g": 500, "h": 556, "i": 278, "j": 278, "k": 500, "l": 278, "m": 778, "n": 556, "o": 500, "p": 500, "q": 500, "r": 389, "s": 389, "t": 278, "u": 556, "v": 444, "w": 667, "x": 500, "y": 444, "z": 389, "{": 348, "|": 220, "}": 348, "~": 570, "\xa1": 389, "\xa2": 500, "\xa3": 500, "\u2044": 167, "\xa5": 500, "\u0192": 500, "\xa7": 500, "\xa4": 500, "'": 278, "\u201c": 500, "\xab": 500, "\u2039": 333, "\u203a": 333, "\ufb01": 556, 
"\ufb02": 556, "\u2013": 500, "\u2020": 500, "\u2021": 500, "\xb7": 250, "\xb6": 500, "\u2022": 350, "\u201a": 333, "\u201e": 500, "\u201d": 500, "\xbb": 500, "\u2026": 1000, "\u2030": 1000, "\xbf": 500, "`": 333, "\xb4": 333, "\u02c6": 333, "\u02dc": 333, "\xaf": 333, "\u02d8": 333, "\u02d9": 333, "\xa8": 333, "\u02da": 333, "\xb8": 333, "\u02dd": 333, "\u02db": 333, "\u02c7": 333, "\u2014": 1000, "\xc6": 944, "\xaa": 266, "\u0141": 611, "\xd8": 722, "\u0152": 944, "\xba": 300, "\xe6": 722, "\u0131": 278, "\u0142": 278, "\xf8": 500, "\u0153": 722, "\xdf": 500, "\xcf": 389, "\xe9": 444, "\u0103": 500, "\u0171": 556, "\u011b": 444, "\u0178": 611, "\xf7": 570, "\xdd": 611, "\xc2": 667, "\xe1": 500, "\xdb": 722, "\xfd": 444, "\u0219": 389, "\xea": 444, "\u016e": 722, "\xdc": 722, "\u0105": 500, "\xda": 722, "\u0173": 556, "\xcb": 667, "\u0110": 722, "\uf6c3": 250, "\xa9": 747, "\u0112": 667, "\u010d": 444, "\xe5": 500, "\u0145": 722, "\u013a": 278, "\xe0": 500, "\u0162": 611, "\u0106": 667, "\xe3": 500, "\u0116": 667, "\u0161": 389, "\u015f": 389, "\xed": 278, "\u25ca": 494, "\u0158": 667, "\u0122": 722, "\xfb": 556, "\xe2": 500, "\u0100": 667, "\u0159": 389, "\xe7": 444, "\u017b": 611, "\xde": 611, "\u014c": 722, "\u0154": 667, "\u015a": 556, "\u010f": 608, "\u016a": 722, "\u016f": 556, "\xb3": 300, "\xd2": 722, "\xc0": 667, "\u0102": 667, "\xd7": 570, "\xfa": 556, "\u0164": 611, "\u2202": 494, "\xff": 444, "\u0143": 722, "\xee": 278, "\xca": 667, "\xe4": 500, "\xeb": 444, "\u0107": 444, "\u0144": 556, "\u016b": 556, "\u0147": 722, "\xcd": 389, "\xb1": 570, "\xa6": 220, "\xae": 747, "\u011e": 722, "\u0130": 389, "\u2211": 600, "\xc8": 667, "\u0155": 389, "\u014d": 500, "\u0179": 611, "\u017d": 611, "\u2265": 549, "\xd0": 722, "\xc7": 667, "\u013c": 278, "\u0165": 366, "\u0119": 444, "\u0172": 722, "\xc1": 667, "\xc4": 667, "\xe8": 444, "\u017a": 389, "\u012f": 278, "\xd3": 722, "\xf3": 500, "\u0101": 500, "\u015b": 389, "\xef": 278, "\xd4": 722, "\xd9": 722, 
"\u2206": 612, "\xfe": 500, "\xb2": 300, "\xd6": 722, "\xb5": 576, "\xec": 278, "\u0151": 500, "\u0118": 667, "\u0111": 500, "\xbe": 750, "\u015e": 556, "\u013e": 382, "\u0136": 667, "\u0139": 611, "\u2122": 1000, "\u0117": 444, "\xcc": 389, "\u012a": 389, "\u013d": 611, "\xbd": 750, "\u2264": 549, "\xf4": 500, "\xf1": 556, "\u0170": 722, "\xc9": 667, "\u0113": 444, "\u011f": 500, "\xbc": 750, "\u0160": 556, "\u0218": 556, "\u0150": 722, "\xb0": 400, "\xf2": 500, "\u010c": 667, "\xf9": 556, "\u221a": 549, "\u010e": 722, "\u0157": 389, "\xd1": 722, "\xf5": 500, "\u0156": 667, "\u013b": 611, "\xc3": 667, "\u0104": 667, "\xc5": 667, "\xd5": 722, "\u017c": 389, "\u011a": 667, "\u012e": 389, "\u0137": 500, "\u2212": 606, "\xce": 389, "\u0148": 556, "\u0163": 278, "\xac": 606, "\xf6": 500, "\xfc": 556, "\u2260": 549, "\u0123": 500, "\xf0": 500, "\u017e": 389, "\u0146": 556, "\xb9": 300, "\u012b": 278, "\u20ac": 500, }, ), # Generated from Times-Italic.afm # Copyright (c) 1985, 1987, 1989, 1990, 1993, 1997 Adobe Systems Incorporated. All Rights # Reserved. Times is a trademark of Linotype-Hell AG and/or its subsidiaries. 
"Times-Italic": CoreFontMetrics( font_descriptor=FontDescriptor( name="Times-Italic", family="Times", weight="Medium", ascent=683, descent=-217, cap_height=653, x_height=441, italic_angle=-15.5, flags=98, bbox=(-169.0, -217.0, 1010.0, 883.0), ), character_widths={ " ": 250, "default": 500, "!": 333, '"': 420, "#": 500, "$": 500, "%": 833, "&": 778, "\u2019": 333, "(": 333, ")": 333, "*": 500, "+": 675, ",": 250, "-": 333, ".": 250, "/": 278, "0": 500, "1": 500, "2": 500, "3": 500, "4": 500, "5": 500, "6": 500, "7": 500, "8": 500, "9": 500, ":": 333, ";": 333, "<": 675, "=": 675, ">": 675, "?": 500, "@": 920, "A": 611, "B": 611, "C": 667, "D": 722, "E": 611, "F": 611, "G": 722, "H": 722, "I": 333, "J": 444, "K": 667, "L": 556, "M": 833, "N": 667, "O": 722, "P": 611, "Q": 722, "R": 611, "S": 500, "T": 556, "U": 722, "V": 611, "W": 833, "X": 611, "Y": 556, "Z": 556, "[": 389, "\\": 278, "]": 389, "^": 422, "_": 500, "\u2018": 333, "a": 500, "b": 500, "c": 444, "d": 500, "e": 444, "f": 278, "g": 500, "h": 500, "i": 278, "j": 278, "k": 444, "l": 278, "m": 722, "n": 500, "o": 500, "p": 500, "q": 500, "r": 389, "s": 389, "t": 278, "u": 500, "v": 444, "w": 667, "x": 444, "y": 444, "z": 389, "{": 400, "|": 275, "}": 400, "~": 541, "\xa1": 389, "\xa2": 500, "\xa3": 500, "\u2044": 167, "\xa5": 500, "\u0192": 500, "\xa7": 500, "\xa4": 500, "'": 214, "\u201c": 556, "\xab": 500, "\u2039": 333, "\u203a": 333, "\ufb01": 500, "\ufb02": 500, "\u2013": 500, "\u2020": 500, "\u2021": 500, "\xb7": 250, "\xb6": 523, "\u2022": 350, "\u201a": 333, "\u201e": 556, "\u201d": 556, "\xbb": 500, "\u2026": 889, "\u2030": 1000, "\xbf": 500, "`": 333, "\xb4": 333, "\u02c6": 333, "\u02dc": 333, "\xaf": 333, "\u02d8": 333, "\u02d9": 333, "\xa8": 333, "\u02da": 333, "\xb8": 333, "\u02dd": 333, "\u02db": 333, "\u02c7": 333, "\u2014": 889, "\xc6": 889, "\xaa": 276, "\u0141": 556, "\xd8": 722, "\u0152": 944, "\xba": 310, "\xe6": 667, "\u0131": 278, "\u0142": 278, "\xf8": 500, "\u0153": 667, "\xdf": 500, 
"\xcf": 333, "\xe9": 444, "\u0103": 500, "\u0171": 500, "\u011b": 444, "\u0178": 556, "\xf7": 675, "\xdd": 556, "\xc2": 611, "\xe1": 500, "\xdb": 722, "\xfd": 444, "\u0219": 389, "\xea": 444, "\u016e": 722, "\xdc": 722, "\u0105": 500, "\xda": 722, "\u0173": 500, "\xcb": 611, "\u0110": 722, "\uf6c3": 250, "\xa9": 760, "\u0112": 611, "\u010d": 444, "\xe5": 500, "\u0145": 667, "\u013a": 278, "\xe0": 500, "\u0162": 556, "\u0106": 667, "\xe3": 500, "\u0116": 611, "\u0161": 389, "\u015f": 389, "\xed": 278, "\u25ca": 471, "\u0158": 611, "\u0122": 722, "\xfb": 500, "\xe2": 500, "\u0100": 611, "\u0159": 389, "\xe7": 444, "\u017b": 556, "\xde": 611, "\u014c": 722, "\u0154": 611, "\u015a": 500, "\u010f": 544, "\u016a": 722, "\u016f": 500, "\xb3": 300, "\xd2": 722, "\xc0": 611, "\u0102": 611, "\xd7": 675, "\xfa": 500, "\u0164": 556, "\u2202": 476, "\xff": 444, "\u0143": 667, "\xee": 278, "\xca": 611, "\xe4": 500, "\xeb": 444, "\u0107": 444, "\u0144": 500, "\u016b": 500, "\u0147": 667, "\xcd": 333, "\xb1": 675, "\xa6": 275, "\xae": 760, "\u011e": 722, "\u0130": 333, "\u2211": 600, "\xc8": 611, "\u0155": 389, "\u014d": 500, "\u0179": 556, "\u017d": 556, "\u2265": 549, "\xd0": 722, "\xc7": 667, "\u013c": 278, "\u0165": 300, "\u0119": 444, "\u0172": 722, "\xc1": 611, "\xc4": 611, "\xe8": 444, "\u017a": 389, "\u012f": 278, "\xd3": 722, "\xf3": 500, "\u0101": 500, "\u015b": 389, "\xef": 278, "\xd4": 722, "\xd9": 722, "\u2206": 612, "\xfe": 500, "\xb2": 300, "\xd6": 722, "\xb5": 500, "\xec": 278, "\u0151": 500, "\u0118": 611, "\u0111": 500, "\xbe": 750, "\u015e": 500, "\u013e": 300, "\u0136": 667, "\u0139": 556, "\u2122": 980, "\u0117": 444, "\xcc": 333, "\u012a": 333, "\u013d": 611, "\xbd": 750, "\u2264": 549, "\xf4": 500, "\xf1": 500, "\u0170": 722, "\xc9": 611, "\u0113": 444, "\u011f": 500, "\xbc": 750, "\u0160": 500, "\u0218": 500, "\u0150": 722, "\xb0": 400, "\xf2": 500, "\u010c": 667, "\xf9": 500, "\u221a": 453, "\u010e": 722, "\u0157": 389, "\xd1": 667, "\xf5": 500, "\u0156": 
611, "\u013b": 556, "\xc3": 611, "\u0104": 611, "\xc5": 611, "\xd5": 722, "\u017c": 389, "\u011a": 611, "\u012e": 333, "\u0137": 444, "\u2212": 675, "\xce": 333, "\u0148": 500, "\u0163": 278, "\xac": 675, "\xf6": 500, "\xfc": 500, "\u2260": 549, "\u0123": 500, "\xf0": 500, "\u017e": 389, "\u0146": 500, "\xb9": 300, "\u012b": 278, "\u20ac": 500, }, ), # Generated from Times-Roman.afm # Copyright (c) 1985, 1987, 1989, 1990, 1993, 1997 Adobe Systems Incorporated. All Rights # Reserved. Times is a trademark of Linotype-Hell AG and/or its subsidiaries. "Times-Roman": CoreFontMetrics( font_descriptor=FontDescriptor( name="Times-Roman", family="Times", weight="Roman", ascent=683, descent=-217, cap_height=662, x_height=450, italic_angle=0, flags=34, bbox=(-168.0, -218.0, 1000.0, 898.0), ), character_widths={ " ": 250, "default": 500, "!": 333, '"': 408, "#": 500, "$": 500, "%": 833, "&": 778, "\u2019": 333, "(": 333, ")": 333, "*": 500, "+": 564, ",": 250, "-": 333, ".": 250, "/": 278, "0": 500, "1": 500, "2": 500, "3": 500, "4": 500, "5": 500, "6": 500, "7": 500, "8": 500, "9": 500, ":": 278, ";": 278, "<": 564, "=": 564, ">": 564, "?": 444, "@": 921, "A": 722, "B": 667, "C": 667, "D": 722, "E": 611, "F": 556, "G": 722, "H": 722, "I": 333, "J": 389, "K": 722, "L": 611, "M": 889, "N": 722, "O": 722, "P": 556, "Q": 722, "R": 667, "S": 556, "T": 611, "U": 722, "V": 722, "W": 944, "X": 722, "Y": 722, "Z": 611, "[": 333, "\\": 278, "]": 333, "^": 469, "_": 500, "\u2018": 333, "a": 444, "b": 500, "c": 444, "d": 500, "e": 444, "f": 333, "g": 500, "h": 500, "i": 278, "j": 278, "k": 500, "l": 278, "m": 778, "n": 500, "o": 500, "p": 500, "q": 500, "r": 333, "s": 389, "t": 278, "u": 500, "v": 500, "w": 722, "x": 500, "y": 500, "z": 444, "{": 480, "|": 200, "}": 480, "~": 541, "\xa1": 333, "\xa2": 500, "\xa3": 500, "\u2044": 167, "\xa5": 500, "\u0192": 500, "\xa7": 500, "\xa4": 500, "'": 180, "\u201c": 444, "\xab": 500, "\u2039": 333, "\u203a": 333, "\ufb01": 556, "\ufb02": 556, 
"\u2013": 500, "\u2020": 500, "\u2021": 500, "\xb7": 250, "\xb6": 453, "\u2022": 350, "\u201a": 333, "\u201e": 444, "\u201d": 444, "\xbb": 500, "\u2026": 1000, "\u2030": 1000, "\xbf": 444, "`": 333, "\xb4": 333, "\u02c6": 333, "\u02dc": 333, "\xaf": 333, "\u02d8": 333, "\u02d9": 333, "\xa8": 333, "\u02da": 333, "\xb8": 333, "\u02dd": 333, "\u02db": 333, "\u02c7": 333, "\u2014": 1000, "\xc6": 889, "\xaa": 276, "\u0141": 611, "\xd8": 722, "\u0152": 889, "\xba": 310, "\xe6": 667, "\u0131": 278, "\u0142": 278, "\xf8": 500, "\u0153": 722, "\xdf": 500, "\xcf": 333, "\xe9": 444, "\u0103": 444, "\u0171": 500, "\u011b": 444, "\u0178": 722, "\xf7": 564, "\xdd": 722, "\xc2": 722, "\xe1": 444, "\xdb": 722, "\xfd": 500, "\u0219": 389, "\xea": 444, "\u016e": 722, "\xdc": 722, "\u0105": 444, "\xda": 722, "\u0173": 500, "\xcb": 611, "\u0110": 722, "\uf6c3": 250, "\xa9": 760, "\u0112": 611, "\u010d": 444, "\xe5": 444, "\u0145": 722, "\u013a": 278, "\xe0": 444, "\u0162": 611, "\u0106": 667, "\xe3": 444, "\u0116": 611, "\u0161": 389, "\u015f": 389, "\xed": 278, "\u25ca": 471, "\u0158": 667, "\u0122": 722, "\xfb": 500, "\xe2": 444, "\u0100": 722, "\u0159": 333, "\xe7": 444, "\u017b": 611, "\xde": 556, "\u014c": 722, "\u0154": 667, "\u015a": 556, "\u010f": 588, "\u016a": 722, "\u016f": 500, "\xb3": 300, "\xd2": 722, "\xc0": 722, "\u0102": 722, "\xd7": 564, "\xfa": 500, "\u0164": 611, "\u2202": 476, "\xff": 500, "\u0143": 722, "\xee": 278, "\xca": 611, "\xe4": 444, "\xeb": 444, "\u0107": 444, "\u0144": 500, "\u016b": 500, "\u0147": 722, "\xcd": 333, "\xb1": 564, "\xa6": 200, "\xae": 760, "\u011e": 722, "\u0130": 333, "\u2211": 600, "\xc8": 611, "\u0155": 333, "\u014d": 500, "\u0179": 611, "\u017d": 611, "\u2265": 549, "\xd0": 722, "\xc7": 667, "\u013c": 278, "\u0165": 326, "\u0119": 444, "\u0172": 722, "\xc1": 722, "\xc4": 722, "\xe8": 444, "\u017a": 444, "\u012f": 278, "\xd3": 722, "\xf3": 500, "\u0101": 444, "\u015b": 389, "\xef": 278, "\xd4": 722, "\xd9": 722, "\u2206": 612, "\xfe": 
500, "\xb2": 300, "\xd6": 722, "\xb5": 500, "\xec": 278, "\u0151": 500, "\u0118": 611, "\u0111": 500, "\xbe": 750, "\u015e": 556, "\u013e": 344, "\u0136": 722, "\u0139": 611, "\u2122": 980, "\u0117": 444, "\xcc": 333, "\u012a": 333, "\u013d": 611, "\xbd": 750, "\u2264": 549, "\xf4": 500, "\xf1": 500, "\u0170": 722, "\xc9": 611, "\u0113": 444, "\u011f": 500, "\xbc": 750, "\u0160": 556, "\u0218": 556, "\u0150": 722, "\xb0": 400, "\xf2": 500, "\u010c": 667, "\xf9": 500, "\u221a": 453, "\u010e": 722, "\u0157": 333, "\xd1": 722, "\xf5": 500, "\u0156": 667, "\u013b": 611, "\xc3": 722, "\u0104": 722, "\xc5": 722, "\xd5": 722, "\u017c": 444, "\u011a": 611, "\u012e": 333, "\u0137": 500, "\u2212": 564, "\xce": 333, "\u0148": 500, "\u0163": 278, "\xac": 564, "\xf6": 500, "\xfc": 500, "\u2260": 549, "\u0123": 500, "\xf0": 500, "\u017e": 444, "\u0146": 500, "\xb9": 300, "\u012b": 278, "\u20ac": 500, }, ), # Generated from ZapfDingbats.afm # Copyright (c) 1985, 1987, 1988, 1989, 1997 Adobe Systems Incorporated. All Rights Reserved. # ITC Zapf Dingbats is a registered trademark of International Typeface Corporation. 
"ZapfDingbats": CoreFontMetrics( font_descriptor=FontDescriptor( name="ZapfDingbats", family="ZapfDingbats", weight="Medium", ascent=0.0, descent=0.0, cap_height=0.0, x_height=0.0, italic_angle=0, flags=4, bbox=(-1.0, -143.0, 981.0, 820.0), ), character_widths={ " ": 790, "default": 1580, "\x01": 974, "\x02": 961, "\xca": 974, "\x03": 980, "\x04": 719, "\x05": 789, "w": 790, "v": 791, "u": 690, "\x0b": 960, "\x0c": 939, "\r": 549, "\x0e": 855, "\x0f": 911, "\x10": 933, "i": 911, "\x11": 945, "\x12": 974, "\x13": 755, "\x14": 846, "\x15": 762, "\x16": 761, "\x17": 571, "\x18": 677, "\x19": 763, "\x1a": 760, "\x1b": 759, "\x1c": 754, "\x06": 494, "\x07": 552, "\x08": 537, "\t": 577, "\n": 692, "\x1d": 786, "\x1e": 788, "\x1f": 788, "!": 793, '"': 794, "#": 816, "$": 823, "%": 789, "&": 841, "'": 823, "(": 833, ")": 816, "*": 831, "+": 923, ",": 744, "-": 723, ".": 749, "/": 790, "0": 792, "1": 695, "2": 776, "3": 768, "4": 792, "5": 759, "6": 707, "7": 708, "8": 682, "9": 701, ":": 826, ";": 815, "<": 789, "=": 789, ">": 707, "?": 687, "@": 696, "A": 689, "B": 786, "C": 787, "D": 713, "E": 791, "F": 785, "G": 791, "H": 873, "I": 761, "J": 762, "\xcb": 762, "K": 759, "\xcc": 759, "L": 892, "M": 892, "N": 788, "O": 784, "Q": 438, "R": 138, "S": 277, "T": 415, "a": 392, "b": 392, "c": 668, "d": 668, "Y": 390, "Z": 390, "]": 317, "^": 317, "[": 276, "\\": 276, "\xcd": 509, "U": 509, "\xce": 410, "V": 410, "W": 234, "X": 234, "_": 334, "`": 334, "e": 732, "f": 544, "g": 544, "h": 910, "j": 667, "k": 760, "l": 760, "p": 776, "o": 595, "n": 694, "m": 626, "x": 788, "y": 788, "z": 788, "{": 788, "|": 788, "}": 788, "~": 788, "\x7f": 788, "\x80": 788, "\x81": 788, "\x82": 788, "\x83": 788, "\x84": 788, "\x85": 788, "\x86": 788, "\x87": 788, "\x88": 788, "\x89": 788, "\x8a": 788, "\x8b": 788, "\x8c": 788, "\x8d": 788, "\x8e": 788, "\x8f": 788, "\x90": 788, "\x91": 788, "\x92": 788, "\x93": 788, "\x94": 788, "\x95": 788, "\x96": 788, "\x97": 788, "\x98": 788, "\x99": 788, 
# Add aliases per table H.3 on pp. 1109-1110 of the PDF 1.7 reference.
# Every alias points at the very same CoreFontMetrics object as its canonical
# entry; no alias refers to another alias, so the new entries can be built in
# one pass and merged in afterwards.
CORE_FONT_METRICS.update(
    {
        alias: CORE_FONT_METRICS[canonical]
        for alias, canonical in (
            ("Arial", "Helvetica"),
            ("Arial,Italic", "Helvetica-Oblique"),
            ("Arial,Bold", "Helvetica-Bold"),
            ("Arial,BoldItalic", "Helvetica-BoldOblique"),
            ("CourierNew", "Courier"),
            ("CourierNew,Italic", "Courier-Oblique"),
            ("CourierNew,Bold", "Courier-Bold"),
            ("CourierNew,BoldItalic", "Courier-BoldOblique"),
            ("TimesNewRoman", "Times-Roman"),
            ("TimesNewRoman,Italic", "Times-Italic"),
            ("TimesNewRoman,Bold", "Times-Bold"),
            ("TimesNewRoman,BoldItalic", "Times-BoldItalic"),
        )
    }
)
"\u0001", "\u0002", "\u0003", "\u0004", "\u0005", "\u0006", "\u0007", # 0 - 7 "\u0008", "\u0009", "\u000a", "\u000b", "\u000c", "\u000d", "\u000e", "\u000f", # 8 - 15 "\u0010", "\u0011", "\u0012", "\u0013", "\u0014", "\u0015", "\u0000", "\u0017", # 16 - 23 "\u02d8", "\u02c7", "\u02c6", "\u02d9", "\u02dd", "\u02db", "\u02da", "\u02dc", # 24 - 31 "\u0020", "\u0021", "\u0022", "\u0023", "\u0024", "\u0025", "\u0026", "\u0027", # 32 - 39 "\u0028", "\u0029", "\u002a", "\u002b", "\u002c", "\u002d", "\u002e", "\u002f", # 40 - 47 "\u0030", "\u0031", "\u0032", "\u0033", "\u0034", "\u0035", "\u0036", "\u0037", # 48 - 55 "\u0038", "\u0039", "\u003a", "\u003b", "\u003c", "\u003d", "\u003e", "\u003f", # 56 - 63 "\u0040", "\u0041", "\u0042", "\u0043", "\u0044", "\u0045", "\u0046", "\u0047", # 64 - 71 "\u0048", "\u0049", "\u004a", "\u004b", "\u004c", "\u004d", "\u004e", "\u004f", # 72 - 79 "\u0050", "\u0051", "\u0052", "\u0053", "\u0054", "\u0055", "\u0056", "\u0057", # 80 - 87 "\u0058", "\u0059", "\u005a", "\u005b", "\u005c", "\u005d", "\u005e", "\u005f", # 88 - 95 "\u0060", "\u0061", "\u0062", "\u0063", "\u0064", "\u0065", "\u0066", "\u0067", # 96 - 103 "\u0068", "\u0069", "\u006a", "\u006b", "\u006c", "\u006d", "\u006e", "\u006f", # 104 - 111 "\u0070", "\u0071", "\u0072", "\u0073", "\u0074", "\u0075", "\u0076", "\u0077", # 112 - 119 "\u0078", "\u0079", "\u007a", "\u007b", "\u007c", "\u007d", "\u007e", "\u0000", # 120 - 127 "\u2022", "\u2020", "\u2021", "\u2026", "\u2014", "\u2013", "\u0192", "\u2044", # 128 - 135 "\u2039", "\u203a", "\u2212", "\u2030", "\u201e", "\u201c", "\u201d", "\u2018", # 136 - 143 "\u2019", "\u201a", "\u2122", "\ufb01", "\ufb02", "\u0141", "\u0152", "\u0160", # 144 - 151 "\u0178", "\u017d", "\u0131", "\u0142", "\u0153", "\u0161", "\u017e", "\u0000", # 152 - 159 "\u20ac", "\u00a1", "\u00a2", "\u00a3", "\u00a4", "\u00a5", "\u00a6", "\u00a7", # 160 - 167 "\u00a8", "\u00a9", "\u00aa", "\u00ab", "\u00ac", "\u0000", "\u00ae", "\u00af", # 168 - 175 "\u00b0", 
"\u00b1", "\u00b2", "\u00b3", "\u00b4", "\u00b5", "\u00b6", "\u00b7", # 176 - 183 "\u00b8", "\u00b9", "\u00ba", "\u00bb", "\u00bc", "\u00bd", "\u00be", "\u00bf", # 184 - 191 "\u00c0", "\u00c1", "\u00c2", "\u00c3", "\u00c4", "\u00c5", "\u00c6", "\u00c7", # 192 - 199 "\u00c8", "\u00c9", "\u00ca", "\u00cb", "\u00cc", "\u00cd", "\u00ce", "\u00cf", # 200 - 207 "\u00d0", "\u00d1", "\u00d2", "\u00d3", "\u00d4", "\u00d5", "\u00d6", "\u00d7", # 208 - 215 "\u00d8", "\u00d9", "\u00da", "\u00db", "\u00dc", "\u00dd", "\u00de", "\u00df", # 216 - 223 "\u00e0", "\u00e1", "\u00e2", "\u00e3", "\u00e4", "\u00e5", "\u00e6", "\u00e7", # 224 - 231 "\u00e8", "\u00e9", "\u00ea", "\u00eb", "\u00ec", "\u00ed", "\u00ee", "\u00ef", # 232 - 239 "\u00f0", "\u00f1", "\u00f2", "\u00f3", "\u00f4", "\u00f5", "\u00f6", "\u00f7", # 240 - 247 "\u00f8", "\u00f9", "\u00fa", "\u00fb", "\u00fc", "\u00fd", "\u00fe", "\u00ff", # 248 - 255 ] assert len(_pdfdoc_encoding) == 256 ================================================ FILE: pypdf/_codecs/std.py ================================================ _std_encoding = [ "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\t", "\n", "\x0b", "\x0c", "\r", "\x0e", "\x0f", "\x10", "\x11", "\x12", "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1a", "\x1b", "\x1c", "\x1d", "\x1e", "\x1f", " ", "!", '"', "#", "$", "%", "&", "’", "(", ")", "*", "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "<", "=", ">", "?", "@", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "\\", "]", "^", "_", "‘", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "{", "|", "}", "~", "\x7f", "\x80", "\x81", "\x82", "\x83", "\x84", "\x85", "\x86", "\x87", "\x88", "\x89", "\x8a", "\x8b", "\x8c", "\x8d", "\x8e", "\x8f", "\x90", "\x91", "\x92", "\x93", "\x94", 
"\x95", "\x96", "\x97", "\x98", "\x99", "\x9a", "\x9b", "\x9c", "\x9d", "\x9e", "\x9f", "\xa0", "¡", "¢", "£", "⁄", "¥", "ƒ", "§", "¤", "'", "“", "«", "‹", "›", "fi", "fl", "°", "–", "†", "‡", "·", "µ", "¶", "•", "‚", "„", "”", "»", "…", "‰", "¾", "¿", "À", "`", "´", "ˆ", "˜", "¯", "˘", "˙", "¨", "É", "˚", "¸", "Ì", "˝", "˛", "ˇ", "—", "Ñ", "Ò", "Ó", "Ô", "Õ", "Ö", "×", "Ø", "Ù", "Ú", "Û", "Ü", "Ý", "Þ", "ß", "à", "Æ", "â", "ª", "ä", "å", "æ", "ç", "Ł", "Ø", "Œ", "º", "ì", "í", "î", "ï", "ð", "æ", "ò", "ó", "ô", "ı", "ö", "÷", "ł", "ø", "œ", "ß", "ü", "ý", "þ", "ÿ", ] ================================================ FILE: pypdf/_codecs/symbol.py ================================================ # manually generated from https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/symbol.txt _symbol_encoding = [ "\u0000", "\u0001", "\u0002", "\u0003", "\u0004", "\u0005", "\u0006", "\u0007", "\u0008", "\u0009", "\u000A", "\u000B", "\u000C", "\u000D", "\u000E", "\u000F", "\u0010", "\u0011", "\u0012", "\u0013", "\u0014", "\u0015", "\u0016", "\u0017", "\u0018", "\u0019", "\u001A", "\u001B", "\u001C", "\u001D", "\u001E", "\u001F", "\u0020", "\u0021", "\u2200", "\u0023", "\u2203", "\u0025", "\u0026", "\u220B", "\u0028", "\u0029", "\u2217", "\u002B", "\u002C", "\u2212", "\u002E", "\u002F", "\u0030", "\u0031", "\u0032", "\u0033", "\u0034", "\u0035", "\u0036", "\u0037", "\u0038", "\u0039", "\u003A", "\u003B", "\u003C", "\u003D", "\u003E", "\u003F", "\u2245", "\u0391", "\u0392", "\u03A7", "\u0394", "\u0395", "\u03A6", "\u0393", "\u0397", "\u0399", "\u03D1", "\u039A", "\u039B", "\u039C", "\u039D", "\u039F", "\u03A0", "\u0398", "\u03A1", "\u03A3", "\u03A4", "\u03A5", "\u03C2", "\u03A9", "\u039E", "\u03A8", "\u0396", "\u005B", "\u2234", "\u005D", "\u22A5", "\u005F", "\uF8E5", "\u03B1", "\u03B2", "\u03C7", "\u03B4", "\u03B5", "\u03C6", "\u03B3", "\u03B7", "\u03B9", "\u03D5", "\u03BA", "\u03BB", "\u00B5", "\u03BD", "\u03BF", "\u03C0", "\u03B8", "\u03C1", "\u03C3", "\u03C4", "\u03C5", 
# manually generated from https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/zdingbat.txt
#
# Lookup table indexed by character code: entry N is the Unicode character used
# for code N. The trailing comment on each row gives the code range it covers.
# Codes 0x00-0x20, 0x7F, 0x8E-0xA0, 0xF0 and 0xFF hold the character whose code
# point equals the index; codes 0x80-0x8D hold U+F8D7-U+F8E4.
_zapfding_encoding = [
    "\u0000", "\u0001", "\u0002", "\u0003", "\u0004", "\u0005", "\u0006", "\u0007",  # 0x00 - 0x07
    "\u0008", "\u0009", "\u000A", "\u000B", "\u000C", "\u000D", "\u000E", "\u000F",  # 0x08 - 0x0F
    "\u0010", "\u0011", "\u0012", "\u0013", "\u0014", "\u0015", "\u0016", "\u0017",  # 0x10 - 0x17
    "\u0018", "\u0019", "\u001A", "\u001B", "\u001C", "\u001D", "\u001E", "\u001F",  # 0x18 - 0x1F
    "\u0020", "\u2701", "\u2702", "\u2703", "\u2704", "\u260E", "\u2706", "\u2707",  # 0x20 - 0x27
    "\u2708", "\u2709", "\u261B", "\u261E", "\u270C", "\u270D", "\u270E", "\u270F",  # 0x28 - 0x2F
    "\u2710", "\u2711", "\u2712", "\u2713", "\u2714", "\u2715", "\u2716", "\u2717",  # 0x30 - 0x37
    "\u2718", "\u2719", "\u271A", "\u271B", "\u271C", "\u271D", "\u271E", "\u271F",  # 0x38 - 0x3F
    "\u2720", "\u2721", "\u2722", "\u2723", "\u2724", "\u2725", "\u2726", "\u2727",  # 0x40 - 0x47
    "\u2605", "\u2729", "\u272A", "\u272B", "\u272C", "\u272D", "\u272E", "\u272F",  # 0x48 - 0x4F
    "\u2730", "\u2731", "\u2732", "\u2733", "\u2734", "\u2735", "\u2736", "\u2737",  # 0x50 - 0x57
    "\u2738", "\u2739", "\u273A", "\u273B", "\u273C", "\u273D", "\u273E", "\u273F",  # 0x58 - 0x5F
    "\u2740", "\u2741", "\u2742", "\u2743", "\u2744", "\u2745", "\u2746", "\u2747",  # 0x60 - 0x67
    "\u2748", "\u2749", "\u274A", "\u274B", "\u25CF", "\u274D", "\u25A0", "\u274F",  # 0x68 - 0x6F
    "\u2750", "\u2751", "\u2752", "\u25B2", "\u25BC", "\u25C6", "\u2756", "\u25D7",  # 0x70 - 0x77
    "\u2758", "\u2759", "\u275A", "\u275B", "\u275C", "\u275D", "\u275E", "\u007F",  # 0x78 - 0x7F
    "\uF8D7", "\uF8D8", "\uF8D9", "\uF8DA", "\uF8DB", "\uF8DC", "\uF8DD", "\uF8DE",  # 0x80 - 0x87
    "\uF8DF", "\uF8E0", "\uF8E1", "\uF8E2", "\uF8E3", "\uF8E4", "\u008E", "\u008F",  # 0x88 - 0x8F
    "\u0090", "\u0091", "\u0092", "\u0093", "\u0094", "\u0095", "\u0096", "\u0097",  # 0x90 - 0x97
    "\u0098", "\u0099", "\u009A", "\u009B", "\u009C", "\u009D", "\u009E", "\u009F",  # 0x98 - 0x9F
    "\u00A0", "\u2761", "\u2762", "\u2763", "\u2764", "\u2765", "\u2766", "\u2767",  # 0xA0 - 0xA7
    "\u2663", "\u2666", "\u2665", "\u2660", "\u2460", "\u2461", "\u2462", "\u2463",  # 0xA8 - 0xAF
    "\u2464", "\u2465", "\u2466", "\u2467", "\u2468", "\u2469", "\u2776", "\u2777",  # 0xB0 - 0xB7
    "\u2778", "\u2779", "\u277A", "\u277B", "\u277C", "\u277D", "\u277E", "\u277F",  # 0xB8 - 0xBF
    "\u2780", "\u2781", "\u2782", "\u2783", "\u2784", "\u2785", "\u2786", "\u2787",  # 0xC0 - 0xC7
    "\u2788", "\u2789", "\u278A", "\u278B", "\u278C", "\u278D", "\u278E", "\u278F",  # 0xC8 - 0xCF
    "\u2790", "\u2791", "\u2792", "\u2793", "\u2794", "\u2192", "\u2194", "\u2195",  # 0xD0 - 0xD7
    "\u2798", "\u2799", "\u279A", "\u279B", "\u279C", "\u279D", "\u279E", "\u279F",  # 0xD8 - 0xDF
    "\u27A0", "\u27A1", "\u27A2", "\u27A3", "\u27A4", "\u27A5", "\u27A6", "\u27A7",  # 0xE0 - 0xE7
    "\u27A8", "\u27A9", "\u27AA", "\u27AB", "\u27AC", "\u27AD", "\u27AE", "\u27AF",  # 0xE8 - 0xEF
    "\u00F0", "\u27B1", "\u27B2", "\u27B3", "\u27B4", "\u27B5", "\u27B6", "\u27B7",  # 0xF0 - 0xF7
    "\u27B8", "\u27B9", "\u27BA", "\u27BB", "\u27BC", "\u27BD", "\u27BE", "\u00FF",  # 0xF8 - 0xFF
]

# One entry per possible byte value.
assert len(_zapfding_encoding) == 256
from pypdf._crypt_providers._base import CryptBase, CryptIdentity

# Pick the crypto backend once, at import time. The provider modules are tried
# in this order: ``_cryptography``, then ``_pycryptodome``, then ``_fallback``.
# The same set of names is imported from whichever module succeeds, so the rest
# of the package can import them from here without knowing the backend.
try:
    from pypdf._crypt_providers._cryptography import (
        CryptAES,
        CryptRC4,
        aes_cbc_decrypt,
        aes_cbc_encrypt,
        aes_ecb_decrypt,
        aes_ecb_encrypt,
        crypt_provider,
        rc4_decrypt,
        rc4_encrypt,
    )
    from pypdf._utils import Version

    # crypt_provider is a (name, version) tuple; index 1 is the version string.
    # Raising ImportError here reuses the handler below, so a too-old
    # cryptography is treated exactly like a missing one.
    if Version(crypt_provider[1]) <= Version("3.0"):
        # This is due to the backend parameter being required back then:
        # https://cryptography.io/en/latest/changelog/#v3-1
        raise ImportError("cryptography<=3.0 is not supported")  # pragma: no cover
except ImportError:
    # First choice unavailable: try the pycryptodome-based provider.
    try:
        from pypdf._crypt_providers._pycryptodome import (  # type: ignore
            CryptAES,
            CryptRC4,
            aes_cbc_decrypt,
            aes_cbc_encrypt,
            aes_ecb_decrypt,
            aes_ecb_encrypt,
            crypt_provider,
            rc4_decrypt,
            rc4_encrypt,
        )
    except ImportError:
        # Last resort: the provider shipped inside pypdf itself.
        from pypdf._crypt_providers._fallback import (  # type: ignore
            CryptAES,
            CryptRC4,
            aes_cbc_decrypt,
            aes_cbc_encrypt,
            aes_ecb_decrypt,
            aes_ecb_encrypt,
            crypt_provider,
            rc4_decrypt,
            rc4_encrypt,
        )

# Public names of this package, independent of the backend selected above.
__all__ = [
    "CryptAES",
    "CryptBase",
    "CryptIdentity",
    "CryptRC4",
    "aes_cbc_decrypt",
    "aes_cbc_encrypt",
    "aes_ecb_decrypt",
    "aes_ecb_encrypt",
    "crypt_provider",
    "rc4_decrypt",
    "rc4_encrypt",
]
class CryptBase:
    """Common interface of the cipher classes; by itself it changes nothing."""

    def encrypt(self, data: bytes) -> bytes:  # pragma: no cover
        """Return *data* as given (the same object, not a copy)."""
        return data

    def decrypt(self, data: bytes) -> bytes:  # pragma: no cover
        """Return *data* as given (the same object, not a copy)."""
        return data


class CryptIdentity(CryptBase):
    """Cipher that inherits the pass-through methods of CryptBase unchanged."""
class CryptRC4(CryptBase):
    """RC4 cipher implemented with the ``cryptography`` package."""

    def __init__(self, key: bytes) -> None:
        # ARC4 is used without a mode object; a new context is derived from
        # this Cipher on every encrypt/decrypt call.
        self.cipher = Cipher(ARC4(key), mode=None)

    def encrypt(self, data: bytes) -> bytes:
        """Return *data* encrypted with the key given to the constructor."""
        ctx = self.cipher.encryptor()
        processed = ctx.update(data)
        return processed + ctx.finalize()

    def decrypt(self, data: bytes) -> bytes:
        """Return *data* decrypted with the key given to the constructor."""
        ctx = self.cipher.decryptor()
        processed = ctx.update(data)
        return processed + ctx.finalize()


class CryptAES(CryptBase):
    """AES in CBC mode; the 16-byte IV travels in front of the ciphertext."""

    def __init__(self, key: bytes) -> None:
        self.alg = AES(key)

    def encrypt(self, data: bytes) -> bytes:
        """Pad *data* (PKCS7), encrypt it and return ``iv + ciphertext``."""
        iv = secrets.token_bytes(16)  # fresh random IV for every call
        padder = padding.PKCS7(128).padder()
        padded = padder.update(data) + padder.finalize()
        ctx = Cipher(self.alg, CBC(iv)).encryptor()
        return iv + ctx.update(padded) + ctx.finalize()

    def decrypt(self, data: bytes) -> bytes:
        """
        Decrypt ``iv + ciphertext`` and strip the trailing padding.

        The first 16 bytes of *data* are taken as the IV. The number of bytes
        removed from the end equals the value of the last decrypted byte.
        """
        iv, payload = data[:16], data[16:]
        if not payload:
            # for empty encrypted data
            return payload

        if len(payload) % 16 != 0:
            # just for robustness, it does not happen under normal
            # circumstances: fill the input up to a whole number of blocks
            padder = padding.PKCS7(128).padder()
            payload = padder.update(payload) + padder.finalize()

        ctx = Cipher(self.alg, CBC(iv)).decryptor()
        plain = ctx.update(payload) + ctx.finalize()
        pad_length = plain[-1]
        return plain[:-pad_length]


def rc4_encrypt(key: bytes, data: bytes) -> bytes:
    """Encrypt *data* with RC4 under *key*."""
    ctx = Cipher(ARC4(key), mode=None).encryptor()
    processed = ctx.update(data)
    return processed + ctx.finalize()


def rc4_decrypt(key: bytes, data: bytes) -> bytes:
    """Decrypt *data* with RC4 under *key*."""
    ctx = Cipher(ARC4(key), mode=None).decryptor()
    processed = ctx.update(data)
    return processed + ctx.finalize()


def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
    """Encrypt *data* with AES-ECB under *key*; *data* is used as given."""
    ctx = Cipher(AES(key), mode=ECB()).encryptor()
    processed = ctx.update(data)
    return processed + ctx.finalize()


def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
    """Decrypt *data* with AES-ECB under *key*; nothing is stripped."""
    ctx = Cipher(AES(key), mode=ECB()).decryptor()
    processed = ctx.update(data)
    return processed + ctx.finalize()


def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """Encrypt *data* with AES-CBC under *key* and *iv*; *data* is used as given."""
    ctx = Cipher(AES(key), mode=CBC(iv)).encryptor()
    processed = ctx.update(data)
    return processed + ctx.finalize()


def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """Decrypt *data* with AES-CBC under *key* and *iv*; nothing is stripped."""
    ctx = Cipher(AES(key), mode=CBC(iv)).decryptor()
    processed = ctx.update(data)
    return processed + ctx.finalize()
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
from pypdf._crypt_providers._base import CryptBase from pypdf.errors import DependencyError _DEPENDENCY_ERROR_STR = "cryptography>=3.1 is required for AES algorithm" crypt_provider = ("local_crypt_fallback", "0.0.0") class CryptRC4(CryptBase): def __init__(self, key: bytes) -> None: self.s = bytearray(range(256)) j = 0 for i in range(256): j = (j + self.s[i] + key[i % len(key)]) % 256 self.s[i], self.s[j] = self.s[j], self.s[i] def encrypt(self, data: bytes) -> bytes: s = bytearray(self.s) out = [0 for _ in range(len(data))] i, j = 0, 0 for k in range(len(data)): i = (i + 1) % 256 j = (j + s[i]) % 256 s[i], s[j] = s[j], s[i] x = s[(s[i] + s[j]) % 256] out[k] = data[k] ^ x return bytes(out) def decrypt(self, data: bytes) -> bytes: return self.encrypt(data) class CryptAES(CryptBase): def __init__(self, key: bytes) -> None: pass def encrypt(self, data: bytes) -> bytes: raise DependencyError(_DEPENDENCY_ERROR_STR) def decrypt(self, data: bytes) -> bytes: raise DependencyError(_DEPENDENCY_ERROR_STR) def rc4_encrypt(key: bytes, data: bytes) -> bytes: return CryptRC4(key).encrypt(data) def rc4_decrypt(key: bytes, data: bytes) -> bytes: return CryptRC4(key).decrypt(data) def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes: raise DependencyError(_DEPENDENCY_ERROR_STR) def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes: raise DependencyError(_DEPENDENCY_ERROR_STR) def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes: raise DependencyError(_DEPENDENCY_ERROR_STR) def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes: raise DependencyError(_DEPENDENCY_ERROR_STR) ================================================ FILE: pypdf/_crypt_providers/_pycryptodome.py ================================================ # Copyright (c) 2023, exiledkingcc # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
import secrets from Crypto import __version__ from Crypto.Cipher import AES, ARC4 from Crypto.Util.Padding import pad from pypdf._crypt_providers._base import CryptBase crypt_provider = ("pycryptodome", __version__) class CryptRC4(CryptBase): def __init__(self, key: bytes) -> None: self.key = key def encrypt(self, data: bytes) -> bytes: return ARC4.ARC4Cipher(self.key).encrypt(data) def decrypt(self, data: bytes) -> bytes: return ARC4.ARC4Cipher(self.key).decrypt(data) class CryptAES(CryptBase): def __init__(self, key: bytes) -> None: self.key = key def encrypt(self, data: bytes) -> bytes: iv = secrets.token_bytes(16) data = pad(data, 16) aes = AES.new(self.key, AES.MODE_CBC, iv) return iv + aes.encrypt(data) def decrypt(self, data: bytes) -> bytes: iv = data[:16] data = data[16:] # for empty encrypted data if not data: return data # just for robustness, it does not happen under normal circumstances if len(data) % 16 != 0: data = pad(data, 16) aes = AES.new(self.key, AES.MODE_CBC, iv) d = aes.decrypt(data) return d[: -d[-1]] def rc4_encrypt(key: bytes, data: bytes) -> bytes: return ARC4.ARC4Cipher(key).encrypt(data) def rc4_decrypt(key: bytes, data: bytes) -> bytes: return ARC4.ARC4Cipher(key).decrypt(data) def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes: return AES.new(key, AES.MODE_ECB).encrypt(data) def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes: return AES.new(key, AES.MODE_ECB).decrypt(data) def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes: return AES.new(key, AES.MODE_CBC, iv).encrypt(data) def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes: return AES.new(key, AES.MODE_CBC, iv).decrypt(data) ================================================ FILE: pypdf/_doc_common.py ================================================ # Copyright (c) 2006, Mathieu Fenniak # Copyright (c) 2007, Ashish Kulkarni # Copyright (c) 2024, Pubpub-ZZ # # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
import struct
from abc import abstractmethod
from collections.abc import Generator, Iterable, Iterator, Mapping
from datetime import datetime
from typing import (
    Any,
    Optional,
    Union,
    cast,
)

from ._encryption import Encryption
from ._page import PageObject, _VirtualList
from ._page_labels import index2label as page_index2page_label
from ._utils import (
    deprecation_with_replacement,
    logger_warning,
    parse_iso8824_date,
)
from .constants import CatalogAttributes as CA
from .constants import CatalogDictionary as CD
from .constants import (
    CheckboxRadioButtonAttributes,
    GoToActionArguments,
    PagesAttributes,
    UserAccessPermissions,
)
from .constants import Core as CO
from .constants import DocumentInformationAttributes as DI
from .constants import FieldDictionaryAttributes as FA
from .constants import PageAttributes as PG
from .errors import PdfReadError, PyPdfError
from .filters import _decompress_with_limit
from .generic import (
    ArrayObject,
    BooleanObject,
    ByteStringObject,
    Destination,
    DictionaryObject,
    EncodedStreamObject,
    Field,
    Fit,
    FloatObject,
    IndirectObject,
    NameObject,
    NullObject,
    NumberObject,
    PdfObject,
    TextStringObject,
    TreeObject,
    ViewerPreferences,
    create_string_object,
    is_null_or_none,
)
from .generic._files import EmbeddedFile
from .types import OutlineType, PagemodeType
from .xmp import XmpInformation


def convert_to_int(d: bytes, size: int) -> Union[int, tuple[Any, ...]]:
    """
    Interpret ``d`` as a big-endian signed 64-bit integer.

    The input is left-padded with zero bytes and only its last eight bytes
    are decoded.

    Args:
        d: The raw bytes.
        size: The declared width in bytes; must not exceed 8.

    Raises:
        PdfReadError: If ``size`` is greater than 8.
    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    padded = (b"\x00\x00\x00\x00\x00\x00\x00\x00" + d)[-8:]
    (value,) = struct.unpack(">q", padded)
    return value


class DocumentInformation(DictionaryObject):
    """
    Basic document metadata taken from a PDF's document information dictionary.

    Instances are reachable through :py:class:`PdfReader.metadata`.

    Every textual entry is exposed twice, e.g. ``author`` and ``author_raw``.
    The plain property yields text (or ``None`` when the entry is missing or
    is not a string object), which is convenient for display. The ``*_raw``
    property hands back the stored object unchanged; it can be a
    ``ByteStringObject`` if pypdf was unable to decode the string's text
    encoding, so callers of the raw variants need additional care.
    """

    def __init__(self) -> None:
        DictionaryObject.__init__(self)

    def _get_text(self, key: str) -> Optional[str]:
        """Return the entry ``key`` as text, or ``None`` if it is not a string object."""
        value = self.get(key, None)
        if isinstance(value, TextStringObject):
            return value
        return str(value) if isinstance(value, ByteStringObject) else None

    @property
    def title(self) -> Optional[str]:
        """
        The document's title (read-only).

        Returns a ``TextStringObject`` or ``None`` if the title is not
        specified.
        """
        if not self.get(DI.TITLE):
            return None
        # Fall back to the resolved object when the entry is not a string object.
        return self._get_text(DI.TITLE) or self.get(DI.TITLE).get_object()  # type: ignore

    @property
    def title_raw(self) -> Optional[str]:
        """Unprocessed title entry; can return a ``ByteStringObject``."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """
        The document's author (read-only).

        Returns a ``TextStringObject`` or ``None`` if the author is not
        specified.
        """
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """Unprocessed author entry; can return a ``ByteStringObject``."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """
        The document's subject (read-only).

        Returns a ``TextStringObject`` or ``None`` if the subject is not
        specified.
        """
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """Unprocessed subject entry; can return a ``ByteStringObject``."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        The document's creator (read-only).

        For a document converted to PDF from another format, this names the
        application (e.g. OpenOffice) that created the original document.
        Returns a ``TextStringObject`` or ``None`` if the creator is not
        specified.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """Unprocessed creator entry; can return a ``ByteStringObject``."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        The document's producer (read-only).

        For a document converted to PDF from another format, this names the
        application (for example, macOS Quartz) that performed the
        conversion. Returns a ``TextStringObject`` or ``None`` if the
        producer is not specified.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """Unprocessed producer entry; can return a ``ByteStringObject``."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """The document's creation date, parsed from its text form (read-only)."""
        date_text = self._get_text(DI.CREATION_DATE)
        return parse_iso8824_date(date_text)

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        Unprocessed creation date entry; can return a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        The document's modification date (read-only).

        This is the date and time the document was most recently modified.
        """
        date_text = self._get_text(DI.MOD_DATE)
        return parse_iso8824_date(date_text)

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        Unprocessed modification date entry; can return a
        ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """
        The document's keywords (read-only).

        Returns a ``TextStringObject`` or ``None`` if keywords are not
        specified.
        """
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """Unprocessed keywords entry; can return a ``ByteStringObject``."""
        return self.get(DI.KEYWORDS)


class PdfDocCommon:
    """
    Common functions from PdfWriter and PdfReader objects.

    This root class is strongly abstracted.
    """

    strict: bool = False  # default

    flattened_pages: Optional[list[PageObject]] = None

    _encryption: Optional[Encryption] = None

    _readonly: bool = False

    @property
    @abstractmethod
    def root_object(self) -> DictionaryObject:
        ...  # pragma: no cover

    @property
    @abstractmethod
    def pdf_header(self) -> str:
        ...  # pragma: no cover

    @abstractmethod
    def get_object(
        self, indirect_reference: Union[int, IndirectObject]
    ) -> Optional[PdfObject]:
        ...  # pragma: no cover

    @abstractmethod
    def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
        ...  # pragma: no cover

    @property
    @abstractmethod
    def _info(self) -> Optional[DictionaryObject]:
        ...  # pragma: no cover

    @property
    def metadata(self) -> Optional[DocumentInformation]:
        """
        The PDF file's document information dictionary, or ``None`` if absent.

        Note that some PDF files use metadata streams instead of document
        information dictionaries, and these metadata streams will not be
        accessed by this function.
        """
        if self._info is None:
            return None
        # Hand out a copy wrapped in the convenience class.
        document_information = DocumentInformation()
        document_information.update(self._info)
        return document_information

    @property
    def xmp_metadata(self) -> Optional[XmpInformation]:
        ...
        # pragma: no cover

    @property
    def viewer_preferences(self) -> Optional[ViewerPreferences]:
        """
        Returns the existing ViewerPreferences as an overloaded dictionary.

        ``None`` is returned if the catalog has no viewer preferences entry.
        If the stored object is not yet a ``ViewerPreferences`` instance, it
        is wrapped and the wrapper is written back (either over the indirect
        object or directly into the catalog).
        """
        o = self.root_object.get(CD.VIEWER_PREFERENCES, None)
        if o is None:
            return None
        o = o.get_object()
        if not isinstance(o, ViewerPreferences):
            o = ViewerPreferences(o)
            if hasattr(o, "indirect_reference") and o.indirect_reference is not None:
                self._replace_object(o.indirect_reference, o)
            else:
                self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = o
        return o

    def get_num_pages(self) -> int:
        """
        Calculate the number of pages in this PDF file.

        Returns:
            The number of pages of the parsed PDF file.

        Raises:
            PdfReadError: If restrictions prevent this action.

        """
        # Flattened pages will not work on an encrypted PDF;
        # the PDF file's page count is used in this case. Otherwise,
        # the original method (flattened page count) is used.
        if self.is_encrypted:
            return self.root_object["/Pages"]["/Count"]  # type: ignore
        if self.flattened_pages is None:
            self._flatten(self._readonly)
        assert self.flattened_pages is not None
        return len(self.flattened_pages)

    def get_page(self, page_number: int) -> PageObject:
        """
        Retrieve a page by number from this PDF file.
        Most of the time ``.pages[page_number]`` is preferred.

        Args:
            page_number: The page number to retrieve
                (pages begin at zero)

        Returns:
            A :class:`PageObject` instance.

        """
        # The flattened page list is built lazily on first access.
        if self.flattened_pages is None:
            self._flatten(self._readonly)
        assert self.flattened_pages is not None, "hint for mypy"
        return self.flattened_pages[page_number]

    def _get_page_in_node(
        self,
        page_number: int,
    ) -> tuple[DictionaryObject, int]:
        """
        Retrieve the node and position within the /Kids containing the page.

        If page_number is greater than the number of pages, it returns the top node, -1.
        """
        top = cast(DictionaryObject, self.root_object["/Pages"])

        # Returns (node, index) once the page is located, otherwise
        # (None, number of the first page after the inspected subtree).
        def recursive_call(
            node: DictionaryObject, mi: int
        ) -> tuple[Optional[PdfObject], int]:
            ma = cast(int, node.get("/Count", 1))  # default 1 for /Page types
            if node["/Type"] == "/Page":
                if page_number == mi:
                    return node, -1
                return None, mi + 1
            if (page_number - mi) >= ma:  # not in nodes below
                if node == top:
                    return top, -1
                return None, mi + ma
            for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])):
                kid = cast(DictionaryObject, kid.get_object())
                n, i = recursive_call(kid, mi)
                if n is not None:  # page has just been found ...
                    if i < 0:  # ... just below!
                        return node, idx
                    # ... at lower levels
                    return n, i
                # Not found in this kid: continue counting after it.
                mi = i
            raise PyPdfError("Unexpectedly cannot find the node.")

        node, idx = recursive_call(top, 0)
        assert isinstance(node, DictionaryObject), "mypy"
        return node, idx

    @property
    def named_destinations(self) -> dict[str, Destination]:
        """A read-only dictionary which maps names to destinations."""
        return self._get_named_destinations()

    def get_named_dest_root(self) -> ArrayObject:
        """
        Return the ``/Names`` array of the catalog's named-destination tree.

        Missing intermediate dictionaries (and the array itself) are created
        and attached when possible; creating new dictionaries requires the
        object to provide ``_add_object``. Otherwise a new, unattached
        ``ArrayObject`` is returned.
        """
        named_dest = ArrayObject()
        if CA.NAMES in self.root_object and isinstance(
            self.root_object[CA.NAMES], DictionaryObject
        ):
            names = cast(DictionaryObject, self.root_object[CA.NAMES])
            if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject):
                # §3.6.3 Name Dictionary (PDF spec 1.7)
                dests = cast(DictionaryObject, names[CA.DESTS])
                dests_ref = dests.indirect_reference
                if CA.NAMES in dests:
                    # §7.9.6, entries in a name tree node dictionary
                    named_dest = cast(ArrayObject, dests[CA.NAMES])
                else:
                    named_dest = ArrayObject()
                    dests[NameObject(CA.NAMES)] = named_dest
            elif hasattr(self, "_add_object"):
                # /Names exists but has no usable /Dests: create it.
                dests = DictionaryObject()
                dests_ref = self._add_object(dests)
                names[NameObject(CA.DESTS)] = dests_ref
                dests[NameObject(CA.NAMES)] = named_dest

        elif hasattr(self, "_add_object"):
            # No usable /Names in the catalog: create the whole chain.
            names = DictionaryObject()
            names_ref = self._add_object(names)
            self.root_object[NameObject(CA.NAMES)] = names_ref
            dests = DictionaryObject()
            dests_ref = self._add_object(dests)
            names[NameObject(CA.DESTS)] = dests_ref
            dests[NameObject(CA.NAMES)] = named_dest

        return named_dest

    ## common
    def _get_named_destinations(
        self,
        tree: Union[TreeObject, None] = None,
        retval: Optional[dict[str, Destination]] = None,
    ) -> dict[str, Destination]:
        """
        Retrieve the named destinations present in the document.

        Args:
            tree: The current tree.
            retval: The previously retrieved destinations for nested calls.

        Returns:
            A dictionary which maps names to destinations.

        """
        # Top-level call: locate the tree in the catalog, preferring a
        # /Dests entry in the catalog itself over /Names -> /Dests.
        if retval is None:
            retval = {}
            catalog = self.root_object

            # get the name tree
            if CA.DESTS in catalog:
                tree = cast(TreeObject, catalog[CA.DESTS])
            elif CA.NAMES in catalog:
                names = cast(DictionaryObject, catalog[CA.NAMES])
                if CA.DESTS in names:
                    tree = cast(TreeObject, names[CA.DESTS])

        if is_null_or_none(tree):
            return retval
        assert tree is not None, "mypy"

        if PagesAttributes.KIDS in tree:
            # recurse down the tree
            for kid in cast(ArrayObject, tree[PagesAttributes.KIDS]):
                self._get_named_destinations(kid.get_object(), retval)
        # §7.9.6, entries in a name tree node dictionary
        elif CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
            names = cast(DictionaryObject, tree[CA.NAMES])
            # The entries are consumed pairwise: a key followed by its value.
            i = 0
            while i < len(names):
                key = names[i].get_object()
                i += 1
                if not isinstance(key, (bytes, str)):
                    # Not a usable key: skip this entry only.
                    continue
                try:
                    value = names[i].get_object()
                except IndexError:
                    # Trailing key without a value.
                    break
                i += 1
                if isinstance(value, DictionaryObject):
                    if "/D" in value:
                        value = value["/D"]
                    else:
                        continue
                dest = self._build_destination(key, value)
                if dest is not None:
                    retval[cast(str, dest["/Title"])] = dest
                    # Remain backwards-compatible.
                    retval[str(key)] = dest
        else:  # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
            for k__, v__ in tree.items():
                val = v__.get_object()
                if isinstance(val, DictionaryObject):
                    if "/D" in val:
                        val = val["/D"].get_object()
                    else:
                        continue
                dest = self._build_destination(k__, val)
                if dest is not None:
                    retval[k__] = dest
        return retval

    # A select group of relevant field attributes.
For the complete list, # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification. def get_fields( self, tree: Optional[TreeObject] = None, retval: Optional[dict[Any, Any]] = None, fileobj: Optional[Any] = None, stack: Optional[list[PdfObject]] = None, ) -> Optional[dict[str, Any]]: """ Extract field data if this PDF contains interactive form fields. The *tree*, *retval*, *stack* parameters are for recursive use. Args: tree: Current object to parse. retval: In-progress list of fields. fileobj: A file object (usually a text file) to write a report to on all interactive form fields found. stack: List of already parsed objects. Returns: A dictionary where each key is a field name, and each value is a :class:`Field` object. By default, the mapping name is used for keys. ``None`` if form data could not be located. """ field_attributes = FA.attributes_dict() field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict()) if retval is None: retval = {} catalog = self.root_object stack = [] # get the AcroForm tree if CD.ACRO_FORM in catalog: tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM]) else: return None if tree is None: return retval assert stack is not None if "/Fields" in tree: fields = cast(ArrayObject, tree["/Fields"]) for f in fields: field = f.get_object() self._build_field(field, retval, fileobj, field_attributes, stack) elif any(attr in tree for attr in field_attributes): # Tree is a field self._build_field(tree, retval, fileobj, field_attributes, stack) return retval def _get_qualified_field_name(self, parent: DictionaryObject) -> str: if "/TM" in parent: return cast(str, parent["/TM"]) if "/Parent" in parent: return ( self._get_qualified_field_name( cast(DictionaryObject, parent["/Parent"]) ) + "." 
+ cast(str, parent.get("/T", "")) ) return cast(str, parent.get("/T", "")) def _build_field( self, field: Union[TreeObject, DictionaryObject], retval: dict[Any, Any], fileobj: Any, field_attributes: Any, stack: list[PdfObject], ) -> None: if all(attr not in field for attr in ("/T", "/TM")): return key = self._get_qualified_field_name(field) if fileobj: self._write_field(fileobj, field, field_attributes) fileobj.write("\n") retval[key] = Field(field) obj = retval[key].indirect_reference.get_object() # to get the full object if obj.get(FA.FT, "") == "/Ch" and obj.get(NameObject(FA.Opt)): retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)] if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj: # Checkbox retval[key][NameObject("/_States_")] = ArrayObject( list(obj["/AP"]["/N"].keys()) ) if "/Off" not in retval[key]["/_States_"]: retval[key][NameObject("/_States_")].append(NameObject("/Off")) elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0: states: list[str] = [] retval[key][NameObject("/_States_")] = ArrayObject(states) for k in obj.get(FA.Kids, {}): k = k.get_object() for s in list(k["/AP"]["/N"].keys()): if s not in states: states.append(s) retval[key][NameObject("/_States_")] = ArrayObject(states) if ( obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0 and "/Off" in retval[key]["/_States_"] ): del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")] # at last for order self._check_kids(field, retval, fileobj, stack) def _check_kids( self, tree: Union[TreeObject, DictionaryObject], retval: Any, fileobj: Any, stack: list[PdfObject], ) -> None: if tree in stack: logger_warning( f"{self._get_qualified_field_name(tree)} already parsed", __name__ ) return stack.append(tree) if PagesAttributes.KIDS in tree: # recurse down the tree for kid in tree[PagesAttributes.KIDS]: # type: ignore kid = kid.get_object() self.get_fields(kid, retval, fileobj, stack) def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> 
None: field_attributes_tuple = FA.attributes() field_attributes_tuple = ( field_attributes_tuple + CheckboxRadioButtonAttributes.attributes() ) for attr in field_attributes_tuple: if attr in ( FA.Kids, FA.AA, ): continue attr_name = field_attributes[attr] try: if attr == FA.FT: # Make the field type value clearer types = { "/Btn": "Button", "/Tx": "Text", "/Ch": "Choice", "/Sig": "Signature", } if field[attr] in types: fileobj.write(f"{attr_name}: {types[field[attr]]}\n") elif attr == FA.Parent: # Let's just write the name of the parent try: name = field[attr][FA.TM] except KeyError: name = field[attr][FA.T] fileobj.write(f"{attr_name}: {name}\n") else: fileobj.write(f"{attr_name}: {field[attr]}\n") except KeyError: # Field attribute is N/A or unknown, so don't write anything pass def get_form_text_fields(self, full_qualified_name: bool = False) -> dict[str, Any]: """ Retrieve form fields from the document with textual data. Args: full_qualified_name: to get full name Returns: A dictionary. The key is the name of the form field, the value is the content of the field. If the document contains multiple form fields with the same name, the second and following will get the suffix .2, .3, ... """ def indexed_key(k: str, fields: dict[Any, Any]) -> str: if k not in fields: return k return ( k + "." + str(sum(1 for kk in fields if kk.startswith(k + ".")) + 2) ) # Retrieve document form fields formfields = self.get_fields() if formfields is None: return {} ff = {} for field, value in formfields.items(): if value.get("/FT") == "/Tx": if full_qualified_name: ff[field] = value.get("/V") else: ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V") return ff def get_pages_showing_field( self, field: Union[Field, PdfObject, IndirectObject] ) -> list[PageObject]: """ Provides list of pages where the field is called. 
Args: field: Field Object, PdfObject or IndirectObject referencing a Field Returns: List of pages: - Empty list: The field has no widgets attached (either hidden field or ancestor field). - Single page list: Page where the widget is present (most common). - Multi-page list: Field with multiple kids widgets (example: radio buttons, field repeated on multiple pages). """ def _get_inherited(obj: DictionaryObject, key: str) -> Any: if key in obj: return obj[key] if "/Parent" in obj: return _get_inherited( cast(DictionaryObject, obj["/Parent"].get_object()), key ) return None try: # to cope with all types field = cast(DictionaryObject, field.indirect_reference.get_object()) # type: ignore except Exception as exc: raise ValueError("Field type is invalid") from exc if is_null_or_none(_get_inherited(field, "/FT")): raise ValueError("Field is not valid") ret = [] if field.get("/Subtype", "") == "/Widget": if "/P" in field: ret = [field["/P"].get_object()] else: ret = [ p for p in self.pages if field.indirect_reference in p.get("/Annots", "") ] else: kids = field.get("/Kids", ()) for k in kids: k = k.get_object() if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k): # Kid that is just a widget, not a field: if "/P" in k: ret += [k["/P"].get_object()] else: ret += [ p for p in self.pages if k.indirect_reference in p.get("/Annots", "") ] return [ x if isinstance(x, PageObject) else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)]) # type: ignore for x in ret ] @property def open_destination( self, ) -> Union[None, Destination, TextStringObject, ByteStringObject]: """ Property to access the opening destination (``/OpenAction`` entry in the PDF catalog). It returns ``None`` if the entry does not exist or is not set. Raises: Exception: If a destination is invalid. 
""" if "/OpenAction" not in self.root_object: return None oa: Any = self.root_object["/OpenAction"] if isinstance(oa, bytes): # pragma: no cover oa = oa.decode() if isinstance(oa, str): return create_string_object(oa) if isinstance(oa, ArrayObject): try: page, typ, *array = oa fit = Fit(typ, tuple(array)) return Destination("OpenAction", page, fit) except Exception as exc: raise Exception(f"Invalid Destination {oa}: {exc}") else: return None @open_destination.setter def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None: raise NotImplementedError("No setter for open_destination") @property def outline(self) -> OutlineType: """ Read-only property for the outline present in the document (i.e., a collection of 'outline items' which are also known as 'bookmarks'). """ return self._get_outline() def _get_outline( self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None, visited: Optional[set[int]] = None, ) -> OutlineType: if outline is None: outline = [] catalog = self.root_object # get the outline dictionary and named destinations if CO.OUTLINES in catalog: lines = cast(DictionaryObject, catalog[CO.OUTLINES]) if isinstance(lines, NullObject): return outline # §12.3.3 Document outline, entries in the outline dictionary if not is_null_or_none(lines) and "/First" in lines: node = cast(DictionaryObject, lines["/First"]) self._named_destinations = self._get_named_destinations() if node is None: return outline # see if there are any more outline items if visited is None: visited = set() while True: node_id = id(node) if node_id in visited: logger_warning(f"Detected cycle in outline structure for {node}", __name__) break visited.add(node_id) outline_obj = self._build_outline_item(node) if outline_obj: outline.append(outline_obj) # check for sub-outline if "/First" in node: sub_outline: list[Any] = [] # Pass a copy to allow multiple outer entries to reference the same inner one. 
inner_visited = visited.copy() self._get_outline( node=cast(DictionaryObject, node["/First"]), outline=sub_outline, visited=inner_visited, ) if sub_outline: outline.append(sub_outline) if "/Next" not in node: break node = cast(DictionaryObject, node["/Next"]) return outline @property def threads(self) -> Optional[ArrayObject]: """ Read-only property for the list of threads. See §12.4.3 from the PDF 1.7 or 2.0 specification. It is an array of dictionaries with "/F" (the first bead in the thread) and "/I" (a thread information dictionary containing information about the thread, such as its title, author, and creation date) properties or None if there are no articles. Since PDF 2.0 it can also contain an indirect reference to a metadata stream containing information about the thread, such as its title, author, and creation date. """ catalog = self.root_object if CO.THREADS in catalog: return cast("ArrayObject", catalog[CO.THREADS]) return None @abstractmethod def _get_page_number_by_indirect( self, indirect_reference: Union[None, int, NullObject, IndirectObject] ) -> Optional[int]: ... # pragma: no cover def get_page_number(self, page: PageObject) -> Optional[int]: """ Retrieve page number of a given PageObject. Args: page: The page to get page number. Should be an instance of :class:`PageObject` Returns: The page number or None if page is not found """ return self._get_page_number_by_indirect(page.indirect_reference) def get_destination_page_number(self, destination: Destination) -> Optional[int]: """ Retrieve page number of a given Destination object. Args: destination: The destination to get page number. 
Returns: The page number or None if page is not found """ return self._get_page_number_by_indirect(destination.page) def _build_destination( self, title: Union[str, bytes], array: Optional[ list[ Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject] ] ], ) -> Destination: page, typ = None, None # handle outline items with missing or invalid destination if ( isinstance(array, (NullObject, str)) or (isinstance(array, ArrayObject) and len(array) == 0) or array is None ): page = NullObject() return Destination(title, page, Fit.fit()) page, typ, *array = array # type: ignore try: return Destination(title, page, Fit(fit_type=typ, fit_args=array)) # type: ignore except PdfReadError: logger_warning(f"Unknown destination: {title!r} {array}", __name__) if self.strict: raise # create a link to first Page tmp = self.pages[0].indirect_reference indirect_reference = NullObject() if tmp is None else tmp return Destination(title, indirect_reference, Fit.fit()) def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: dest, title, outline_item = None, None, None # title required for valid outline # §12.3.3, entries in an outline item dictionary try: title = cast("str", node["/Title"]) except KeyError: if self.strict: raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}") title = "" if "/A" in node: # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported) action = cast(DictionaryObject, node["/A"]) action_type = cast(NameObject, action[GoToActionArguments.S]) if action_type == "/GoTo": if GoToActionArguments.D in action: dest = action[GoToActionArguments.D] elif self.strict: raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}") elif "/Dest" in node: # Destination, PDF 1.7 and PDF 2.0 §12.3.2 dest = node["/Dest"] # if array was referenced in another object, will be a dict w/ key "/D" if isinstance(dest, DictionaryObject) and "/D" in dest: dest = dest["/D"] if isinstance(dest, ArrayObject): outline_item = 
self._build_destination(title, dest) elif isinstance(dest, str): # named destination, addresses NameObject Issue #193 # TODO: Keep named destination instead of replacing it? try: outline_item = self._build_destination( title, self._named_destinations[dest].dest_array ) except KeyError: # named destination not found in Name Dict outline_item = self._build_destination(title, None) elif dest is None: # outline item not required to have destination or action # PDFv1.7 Table 153 outline_item = self._build_destination(title, dest) else: if self.strict: raise PdfReadError(f"Unexpected destination {dest!r}") logger_warning( f"Removed unexpected destination {dest!r} from destination", __name__, ) outline_item = self._build_destination(title, None) # if outline item created, add color, format, and child count if present if outline_item: if "/C" in node: # Color of outline item font in (R, G, B) with values ranging 0.0-1.0 outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"]) # type: ignore if "/F" in node: # specifies style characteristics bold and/or italic # with 1=italic, 2=bold, 3=both outline_item[NameObject("/F")] = node["/F"] if "/Count" in node: # absolute value = num. visible children # with positive = open/unfolded, negative = closed/folded outline_item[NameObject("/Count")] = node["/Count"] # if count is 0 we will consider it as open (to have available is_open) outline_item[NameObject("/%is_open%")] = BooleanObject( node.get("/Count", 0) >= 0 ) outline_item.node = node try: outline_item.indirect_reference = node.indirect_reference except AttributeError: pass return outline_item @property def pages(self) -> list[PageObject]: """ Property that emulates a list of :class:`PageObject`. This property allows to get a page or a range of pages. Note: For PdfWriter only: Provides the capability to remove a page/range of page from the list (using the del operator). 
@property
def page_layout(self) -> Optional[str]:
    """
    Page layout stored in the document catalog, if any.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    Returns:
        The layout name, or None when the lookup raises KeyError.
    """
    try:
        layout = self.root_object[CD.PAGE_LAYOUT]
    except KeyError:
        return None
    return cast(NameObject, layout)
def _flatten(
    self,
    list_only: bool = False,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Process the document pages to ease searching.

    Attributes of a page may inherit from ancestor nodes
    in the page tree. Flattening means moving
    any inheritance data into descendant nodes,
    effectively removing the inheritance dependency.

    Note: It is distinct from another use of "flattening" applied to PDFs.
    Flattening a PDF also means combining all the contents into one single layer
    and making the file less editable.

    Args:
        list_only: When true, the created page objects are only listed in
            ``flattened_pages`` and are not filled with the page dictionary.
        pages: Page tree node to process. When None, processing starts at the
            catalog's /Pages entry and ``flattened_pages`` is reset.
        inherit: Inheritable attributes collected from the ancestors of
            ``pages``. The mapping passed in is never modified.
        indirect_reference: Used recursively to flatten the /Pages object;
            handed to the :class:`PageObject` created for a leaf.

    Raises:
        PdfReadError: If /Pages is not a dictionary, a node lists itself as
            its own kid, or the recursion limit is hit.
    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if pages is None:
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.root_object
        pages = catalog.get("/Pages").get_object()  # type: ignore
        if not isinstance(pages, DictionaryObject):
            raise PdfReadError("Invalid object in /Pages")
        self.flattened_pages = []

    if PagesAttributes.TYPE in pages:
        t = cast(str, pages[PagesAttributes.TYPE])
    # if the page tree node has no /Type, consider as a page if /Kids is also missing
    elif PagesAttributes.KIDS not in pages:
        t = "/Page"
    else:
        t = "/Pages"

    if t == "/Pages":
        # Work on a private copy: attributes defined by this node must only
        # reach its own descendants. Mutating the shared mapping would leak
        # them into sibling subtrees processed later by the parent's loop.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        pages_reference = getattr(pages, "indirect_reference", object())
        for page in cast(ArrayObject, pages[PagesAttributes.KIDS]):
            # A fresh object() never compares equal, so nodes without an
            # indirect reference cannot trigger a false cycle detection.
            if getattr(page, "indirect_reference", object()) == pages_reference:
                raise PdfReadError("Detected cyclic page references.")
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                try:
                    self._flatten(list_only, obj, inherit, **addt)
                except RecursionError:
                    raise PdfReadError(
                        "Maximum recursion depth reached during page flattening."
                    )
    elif t == "/Page":
        for attr_in, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        if not list_only:
            page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore
def decode_permissions(
    self, permissions_code: int
) -> dict[str, bool]:  # pragma: no cover
    """
    Translate an integer permission code into named boolean permissions.

    Deprecated: a deprecation notice pointing to ``user_access_permissions``
    is issued on every call.

    Args:
        permissions_code: The permission flags as an integer.

    Returns:
        A dictionary mapping each permission name to whether its flag is
        set in ``permissions_code``.
    """
    deprecation_with_replacement(
        old_name="decode_permissions",
        new_name="user_access_permissions",
        removed_in="5.0.0",
    )

    named_flags = (
        ("print", UserAccessPermissions.PRINT),
        ("modify", UserAccessPermissions.MODIFY),
        ("copy", UserAccessPermissions.EXTRACT),
        ("annotations", UserAccessPermissions.ADD_OR_MODIFY),
        ("forms", UserAccessPermissions.FILL_FORM_FIELDS),
        # Do not fix typo, as part of official, but deprecated API.
        ("accessability", UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS),
        ("assemble", UserAccessPermissions.ASSEMBLE_DOC),
        ("print_high_quality", UserAccessPermissions.PRINT_TO_REPRESENTATION),
    )

    allowed: dict[str, bool] = {}
    for label, flag in named_flags:
        allowed[label] = (permissions_code & flag) != 0
    return allowed
@property
def xfa(self) -> Optional[dict[str, Any]]:
    """
    XFA entries of the document's AcroForm.

    The /XFA array is read as alternating (tag, packet) entries. Only
    packets stored as indirect objects that resolve to a truthy object are
    included; each value is the result of ``_decompress_with_limit`` applied
    to the resolved object's ``_data``.

    Returns:
        None if the catalog has no (or an empty) /AcroForm, otherwise a
        dictionary keyed by tag; it is empty when /XFA is absent.
    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    retval: dict[str, Any] = {}
    tree = cast(TreeObject, catalog["/AcroForm"])

    if "/XFA" in tree:
        fields = cast(ArrayObject, tree["/XFA"])
        entries = iter(fields)
        # Zipping the iterator with itself yields (tag, packet) pairs and
        # simply stops on a dangling trailing tag. Calling next() manually
        # inside the loop let a bare StopIteration escape for arrays with
        # an odd number of entries.
        for tag, packet in zip(entries, entries):
            if isinstance(packet, IndirectObject):
                field = cast(Optional[EncodedStreamObject], packet.get_object())
                if field:
                    retval[tag] = _decompress_with_limit(field._data)
    return retval
class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping that computes its values on access.

    Each stored value is a ``(callable, argument)`` pair; looking up a key
    calls ``callable(argument)`` and returns the result. Nothing is cached,
    so every lookup performs the call again.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # Accepts exactly what the ``dict`` constructor accepts.
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        loader, argument = self._raw_dict[key]
        return loader(argument)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._raw_dict.keys())

    def __len__(self) -> int:
        return len(self._raw_dict)

    def __str__(self) -> str:
        # Only the keys are shown so that printing never triggers a loader.
        key_list = list(self.keys())
        return f"LazyDict(keys={key_list})"
# * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
def encrypt_object(self, obj: PdfObject) -> PdfObject:
    """
    Return an encrypted counterpart of ``obj``.

    Strings are encrypted with ``str_crypt`` and returned as byte strings,
    stream data is encrypted with ``stm_crypt``, and dictionaries, stream
    dictionaries and arrays are rebuilt with every value encrypted
    recursively. Any other object is returned unchanged.

    Args:
        obj: The object to encrypt.

    Returns:
        A new object for strings, streams, dictionaries and arrays;
        otherwise ``obj`` itself.
    """
    if isinstance(obj, ByteStringObject):
        return ByteStringObject(self.str_crypt.encrypt(obj.original_bytes))

    if isinstance(obj, TextStringObject):
        return ByteStringObject(self.str_crypt.encrypt(obj.get_encoded_bytes()))

    # Streams are tested before plain dictionaries, as in the original chain.
    if isinstance(obj, StreamObject):
        encrypted_stream = StreamObject()
        encrypted_stream.update(obj)
        encrypted_stream.set_data(self.stm_crypt.encrypt(obj._data))
        # Dont forget the Stream dict.
        for key, value in obj.items():
            encrypted_stream[key] = self.encrypt_object(value)
        return encrypted_stream

    if isinstance(obj, DictionaryObject):
        encrypted_dict = DictionaryObject()  # type: ignore
        for key, value in obj.items():
            encrypted_dict[key] = self.encrypt_object(value)
        return encrypted_dict

    if isinstance(obj, ArrayObject):
        return ArrayObject(self.encrypt_object(item) for item in obj)

    return obj
_PADDING = (
    b"\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56\xff\xfa\x01\x08"
    b"\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c\xa9\xfe\x64\x53\x69\x7a"
)


def _padding(data: bytes) -> bytes:
    """Pad ``data`` with the standard padding string and cut it to 32 bytes."""
    return (data + _PADDING)[:32]


class AlgV4:
    """Password algorithms of the standard security handler (revisions 2 to 4)."""

    @staticmethod
    def compute_key(
        password: bytes,
        rev: int,
        key_size: int,
        o_entry: bytes,
        P: int,
        id1_entry: bytes,
        metadata_encrypted: bool,
    ) -> bytes:
        """
        Algorithm 2: Computing an encryption key.

        a) Pad or truncate the password string to exactly 32 bytes, filling
           up with bytes from the beginning of the padding string
           < 28 BF 4E 5E 4E 75 8A 41 64 00 4E 56 FF FA 01 08
           2E 2E 00 B6 D0 68 3E 80 2F 0C A9 FE 64 53 69 7A >.
           An empty password is replaced by the entire padding string.
        b) Initialize the MD5 hash function with the result of step (a).
        c) Pass the value of the encryption dictionary's O entry to the hash.
        d) Convert the integer value of the P entry to a 32-bit unsigned
           binary number and pass these bytes to the hash, low-order byte
           first.
        e) Pass the first element of the file's file identifier array to the
           hash.
        f) (Revision 4 or greater) If document metadata is not being
           encrypted, pass 4 bytes with the value 0xFFFFFFFF to the hash.
        g) Finish the hash.
        h) (Revision 3 or greater) Do the following 50 times: take the
           first n bytes of the previous MD5 output and hash them again,
           where n is the key length in bytes.
        i) The encryption key is the first n bytes of the final MD5 output.

        Args:
            password: The encryption secret as a bytes-string
            rev: The encryption revision (see PDF standard)
            key_size: The key length in bits; ``key_size // 8`` bytes of the
                digest are used.
            o_entry: The owner entry
            P: A set of flags specifying which operations shall be permitted
                when the document is opened with user access. Negative
                (signed 32-bit) values are accepted and interpreted as their
                unsigned 32-bit equivalent.
            id1_entry: First element of the file identifier array.
            metadata_encrypted: A boolean indicating if the metadata is encrypted.

        Returns:
            The u_hash digest truncated to ``key_size // 8`` bytes

        """
        a = _padding(password)
        u_hash = hashlib.md5(a)
        u_hash.update(o_entry)
        # "<I" only accepts 0..2**32-1. Masking keeps unsigned input unchanged
        # and maps a signed 32-bit /P value to the same four bytes.
        u_hash.update(struct.pack("<I", P & 0xFFFFFFFF))
        u_hash.update(id1_entry)
        if rev >= 4 and not metadata_encrypted:
            u_hash.update(b"\xff\xff\xff\xff")
        u_hash_digest = u_hash.digest()
        length = key_size // 8
        if rev >= 3:
            for _ in range(50):
                u_hash_digest = hashlib.md5(u_hash_digest[:length]).digest()
        return u_hash_digest[:length]

    @staticmethod
    def compute_O_value_key(owner_password: bytes, rev: int, key_size: int) -> bytes:
        """
        Algorithm 3: Computing the encryption dictionary's O (owner password) value.

        This method covers steps (a) to (d), i.e. the RC4 key derivation:

        a) Pad or truncate the owner password string as described in step
           (a) of Algorithm 2. If there is no owner password, use the user
           password instead.
        b) Initialize the MD5 hash function with the result of step (a).
        c) (Revision 3 or greater) Do the following 50 times: pass the
           previous MD5 output as input into a new MD5 hash.
        d) The RC4 key is the first n bytes of the final MD5 output, where n
           is the key length in bytes.

        Steps (e) to (h) are implemented by :func:`compute_O_value`.

        Args:
            owner_password: The owner password as bytes.
            rev: The encryption revision (see PDF standard)
            key_size: The key length in bits; ``key_size // 8`` bytes of the
                digest are used.

        Returns:
            The RC4 key

        """
        a = _padding(owner_password)
        o_hash_digest = hashlib.md5(a).digest()

        if rev >= 3:
            for _ in range(50):
                o_hash_digest = hashlib.md5(o_hash_digest).digest()

        return o_hash_digest[: key_size // 8]

    @staticmethod
    def compute_O_value(rc4_key: bytes, user_password: bytes, rev: int) -> bytes:
        """
        Steps (e) to (h) of Algorithm 3, see :func:`compute_O_value_key`.

        e) Pad or truncate the user password string as described in step (a)
           of Algorithm 2.
        f) Encrypt the result with RC4 using ``rc4_key``.
        g) (Revision 3 or greater) Do the following 19 times: encrypt the
           previous output again with a key obtained by XOR-ing every byte
           of ``rc4_key`` with the iteration counter (from 1 to 19).
        h) The final output is the value of the O entry.

        Args:
            rc4_key: The key returned by :func:`compute_O_value_key`.
            user_password: The user password as bytes.
            rev: The encryption revision (see PDF standard)

        Returns:
            The RC4 encrypted

        """
        a = _padding(user_password)
        rc4_enc = rc4_encrypt(rc4_key, a)
        if rev >= 3:
            for i in range(1, 20):
                key = bytes(x ^ i for x in rc4_key)
                rc4_enc = rc4_encrypt(key, rc4_enc)
        return rc4_enc

    @staticmethod
    def compute_U_value(key: bytes, rev: int, id1_entry: bytes) -> bytes:
        """
        Compute the encryption dictionary's U (user password) value.

        Algorithm 4 (security handlers of revision 2): encrypt the 32-byte
        padding string with RC4 using the encryption key from Algorithm 2.

        Algorithm 5 (security handlers of revision 3 or greater):

        a) Initialize the MD5 hash function with the 32-byte padding string.
        b) Pass the first element of the file identifier array to the hash
           and finish the hash.
        c) Encrypt the 16-byte result with RC4 using the encryption key.
        d) Do the following 19 times: encrypt the previous output again with
           a key obtained by XOR-ing every byte of the original key with the
           iteration counter (from 1 to 19).
        e) Append 16 bytes of padding to obtain the 32-byte U value; here the
           bytes are taken from the beginning of the padding string.

        Args:
            key: The encryption key from :func:`compute_key`.
            rev: The encryption revision (see PDF standard)
            id1_entry: First element of the file identifier array.

        Returns:
            The value

        """
        if rev <= 2:
            return rc4_encrypt(key, _PADDING)

        # Algorithm 5 from here on (revision 3 or greater).
        u_hash = hashlib.md5(_PADDING)
        u_hash.update(id1_entry)
        rc4_enc = rc4_encrypt(key, u_hash.digest())
        for i in range(1, 20):
            rc4_key = bytes(x ^ i for x in key)
            rc4_enc = rc4_encrypt(rc4_key, rc4_enc)
        return _padding(rc4_enc)

    @staticmethod
    def verify_user_password(
        user_password: bytes,
        rev: int,
        key_size: int,
        o_entry: bytes,
        u_entry: bytes,
        P: int,
        id1_entry: bytes,
        metadata_encrypted: bool,
    ) -> bytes:
        """
        Algorithm 6: Authenticating the user password.

        a) Compute the U value for the supplied password as described in
           Algorithm 4 (revision 2) or Algorithm 5 (revision 3 or greater).
        b) If the result equals the encryption dictionary's U entry
           (comparing only the first 16 bytes for revision 3 or greater),
           the supplied password is the correct user password and the key
           computed along the way is the one to decrypt the document with.

        Args:
            user_password: The user password as a bytes stream
            rev: The encryption revision (see PDF standard)
            key_size: The key length in bits, see :func:`compute_key`.
            o_entry: The owner entry
            u_entry: The user entry
            P: A set of flags specifying which operations shall be permitted
                when the document is opened with user access.
            id1_entry: First element of the file identifier array.
            metadata_encrypted: A boolean indicating if the metadata is encrypted.

        Returns:
            The key, or ``b""`` if the password does not match.

        """
        key = AlgV4.compute_key(
            user_password, rev, key_size, o_entry, P, id1_entry, metadata_encrypted
        )
        u_value = AlgV4.compute_U_value(key, rev, id1_entry)
        if rev >= 3:
            # Only the first 16 bytes are significant; the rest is padding.
            u_value = u_value[:16]
            u_entry = u_entry[:16]
        if u_value != u_entry:
            key = b""
        return key

    @staticmethod
    def verify_owner_password(
        owner_password: bytes,
        rev: int,
        key_size: int,
        o_entry: bytes,
        u_entry: bytes,
        P: int,
        id1_entry: bytes,
        metadata_encrypted: bool,
    ) -> bytes:
        """
        Algorithm 7: Authenticating the owner password.

        a) Compute an RC4 key from the supplied password string, as
           described in steps (a) to (d) of Algorithm 3.
        b) (Revision 2 only) Decrypt the O entry with RC4 using that key.
           (Revision 3 or greater) Do the following 20 times: decrypt the O
           entry (first iteration) or the previous output (subsequent
           iterations) with a key obtained by XOR-ing every byte of the
           original key with the iteration counter (from 19 to 0).
        c) The result purports to be the user password. Authenticate it
           using Algorithm 6; if it is correct, the supplied password is the
           correct owner password.

        Args:
            owner_password: The owner password as bytes.
            rev: The encryption revision (see PDF standard)
            key_size: The key length in bits, see :func:`compute_key`.
            o_entry: The owner entry
            u_entry: The user entry
            P: A set of flags specifying which operations shall be permitted
                when the document is opened with user access.
            id1_entry: First element of the file identifier array.
            metadata_encrypted: A boolean indicating if the metadata is encrypted.

        Returns:
            The key, or ``b""`` if the password does not match.

        """
        rc4_key = AlgV4.compute_O_value_key(owner_password, rev, key_size)

        if rev <= 2:
            user_password = rc4_decrypt(rc4_key, o_entry)
        else:
            user_password = o_entry
            for i in range(19, -1, -1):
                key = bytes(x ^ i for x in rc4_key)
                user_password = rc4_decrypt(key, user_password)
        return AlgV4.verify_user_password(
            user_password,
            rev,
            key_size,
            o_entry,
            u_entry,
            P,
            id1_entry,
            metadata_encrypted,
        )
Test the password against the owner key by computing the SHA-256 hash of the UTF-8 password concatenated with the 8 bytes of owner Validation Salt, concatenated with the 48-byte U string. If the 32-byte result matches the first 32 bytes of the O string, this is the owner password. Compute an intermediate owner key by computing the SHA-256 hash of the UTF-8 password concatenated with the 8 bytes of owner Key Salt, concatenated with the 48-byte U string. The 32-byte result is the key used to decrypt the 32-byte OE string using AES-256 in CBC mode with no padding and an initialization vector of zero. The 32-byte result is the file encryption key. 4. Test the password against the user key by computing the SHA-256 hash of the UTF-8 password concatenated with the 8 bytes of user Validation Salt. If the 32 byte result matches the first 32 bytes of the U string, this is the user password. Compute an intermediate user key by computing the SHA-256 hash of the UTF-8 password concatenated with the 8 bytes of user Key Salt. The 32-byte result is the key used to decrypt the 32-byte UE string using AES-256 in CBC mode with no padding and an initialization vector of zero. The 32-byte result is the file encryption key. 5. Decrypt the 16-byte Perms string using AES-256 in ECB mode with an initialization vector of zero and the file encryption key as the key. Verify that bytes 9-11 of the result are the characters ‘a’, ‘d’, ‘b’. Bytes 0-3 of the decrypted Perms entry, treated as a little-endian integer, are the user permissions. They should match the value in the P key. 
@staticmethod
def verify_user_password(
    R: int, password: bytes, u_value: bytes, ue_value: bytes
) -> bytes:
    """
    Check ``password`` against the user entries of the encryption dictionary.

    See :func:`verify_owner_password` for the algorithm description.

    Args:
        R: A number specifying which revision of the standard security
            handler shall be used to interpret this dictionary.
        password: The candidate user password. Only its first 127 bytes
            are used.
        u_value: The U string. Bytes 0-31 are read as the validation hash,
            bytes 32-39 as the validation salt and bytes 40-47 as the key
            salt.
        ue_value: The UE string that is decrypted with the intermediate key.

    Returns:
        The decrypted UE value if the password matches, otherwise ``b""``.

    """
    candidate = password[:127]
    expected_hash = u_value[:32]
    validation_salt = u_value[32:40]
    key_salt = u_value[40:48]
    if AlgV5.calculate_hash(R, candidate, validation_salt, b"") != expected_hash:
        return b""
    intermediate_key = AlgV5.calculate_hash(R, candidate, key_salt, b"")
    zero_iv = bytes(16)  # 16 zero bytes
    return aes_cbc_decrypt(intermediate_key, zero_iv, ue_value)


@staticmethod
def calculate_hash(R: int, password: bytes, salt: bytes, udata: bytes) -> bytes:
    """
    Hash ``password + salt + udata`` for the V5 security handler.

    For ``R < 6`` the plain SHA-256 digest is returned. Otherwise the digest
    is fed through repeated rounds of AES-CBC encryption followed by
    SHA-256/384/512, for at least 64 rounds, and the first 32 bytes of the
    final digest are returned.

    Reference: https://github.com/qpdf/qpdf/blob/main/libqpdf/QPDF_encryption.cc
    """
    digest = hashlib.sha256(password + salt + udata).digest()
    if R < 6:
        return digest

    hash_functions = (hashlib.sha256, hashlib.sha384, hashlib.sha512)
    round_number = 0
    finished = False
    while not finished:
        round_number += 1
        repeated_input = (password + digest + udata) * 64
        encrypted = aes_cbc_encrypt(digest[:16], digest[16:32], repeated_input)
        # The first 16 encrypted bytes select the hash function for this round.
        selector = sum(encrypted[:16]) % 3
        digest = hash_functions[selector](encrypted).digest()
        finished = round_number >= 64 and encrypted[-1] <= round_number - 32
    return digest[:32]
@staticmethod
def compute_U_value(R: int, password: bytes, key: bytes) -> tuple[bytes, bytes]:
    """
    Algorithm 3.8 Computing the encryption dictionary's U (user password)
    and UE (user encryption key) values.

    1. Generate 16 random bytes of data using a strong random number generator.
       The first 8 bytes are the User Validation Salt. The second 8 bytes
       are the User Key Salt. Compute the 32-byte SHA-256 hash of the password
       concatenated with the User Validation Salt. The 48-byte string consisting
       of the 32-byte hash followed by the User Validation Salt followed by the
       User Key Salt is stored as the U key.
    2. Compute the 32-byte SHA-256 hash of the password concatenated with the
       User Key Salt. Using this hash as the key, encrypt the file encryption
       key using AES-256 in CBC mode with no padding and an initialization
       vector of zero. The resulting 32-byte string is stored as the UE key.

    Args:
        R: Revision of the security handler; forwarded to :func:`calculate_hash`.
        password: The user password as bytes.
        key: The file encryption key that is encrypted into the UE value.

    Returns:
        A tuple (u-value, ue value)

    """
    random_bytes = secrets.token_bytes(16)
    val_salt = random_bytes[:8]
    key_salt = random_bytes[8:]
    u_value = AlgV5.calculate_hash(R, password, val_salt, b"") + val_salt + key_salt

    tmp_key = AlgV5.calculate_hash(R, password, key_salt, b"")
    iv = bytes(16)  # all-zero initialization vector
    ue_value = aes_cbc_encrypt(tmp_key, iv, key)
    return u_value, ue_value


@staticmethod
def compute_O_value(
    R: int, password: bytes, key: bytes, u_value: bytes
) -> tuple[bytes, bytes]:
    """
    Algorithm 3.9 Computing the encryption dictionary's O (owner password)
    and OE (owner encryption key) values.

    1. Generate 16 random bytes of data using a strong random number generator.
       The first 8 bytes are the Owner Validation Salt. The second 8 bytes
       are the Owner Key Salt. Compute the 32-byte SHA-256 hash of the password
       concatenated with the Owner Validation Salt and then concatenated with
       the 48-byte U string as generated in Algorithm 3.8. The 48-byte string
       consisting of the 32-byte hash followed by the Owner Validation Salt
       followed by the Owner Key Salt is stored as the O key.
    2. Compute the 32-byte SHA-256 hash of the password concatenated with the
       Owner Key Salt and then concatenated with the 48-byte U string as
       generated in Algorithm 3.8. Using this hash as the key, encrypt the
       file encryption key using AES-256 in CBC mode with no padding and an
       initialization vector of zero. The resulting 32-byte string is stored
       as the OE key.

    Args:
        R: Revision of the security handler; forwarded to :func:`calculate_hash`.
        password: The owner password as bytes.
        key: The file encryption key that is encrypted into the OE value.
        u_value: The U string as produced by :func:`compute_U_value`. Only
            its first 48 bytes take part in the hashes.

    Returns:
        A tuple (O value, OE value)

    """
    random_bytes = secrets.token_bytes(16)
    val_salt = random_bytes[:8]
    key_salt = random_bytes[8:]

    # Both hashes must see exactly the 48-byte U string. Slicing once keeps
    # the validation hash and the key hash consistent with each other.
    u_string = u_value[:48]
    o_value = (
        AlgV5.calculate_hash(R, password, val_salt, u_string) + val_salt + key_salt
    )
    tmp_key = AlgV5.calculate_hash(R, password, key_salt, u_string)
    iv = bytes(16)  # all-zero initialization vector
    oe_value = aes_cbc_encrypt(tmp_key, iv, key)
    return o_value, oe_value
def is_decrypted(self) -> bool:
    """Return True once the stored password type is no longer NOT_DECRYPTED."""
    current_state = self._password_type
    return current_state != PasswordType.NOT_DECRYPTED


def encrypt_object(self, obj: PdfObject, idnum: int, generation: int) -> PdfObject:
    """
    Encrypt ``obj`` with the crypt filter for the given object identifier.

    Objects of types that are not subject to encryption are returned
    unchanged, without building a crypt filter (skip calculate key).
    """
    if self._is_encryption_object(obj):
        crypt_filter = self._make_crypt_filter(idnum, generation)
        return crypt_filter.encrypt_object(obj)
    return obj


def decrypt_object(self, obj: PdfObject, idnum: int, generation: int) -> PdfObject:
    """
    Decrypt ``obj`` with the crypt filter for the given object identifier.

    Objects of types that are not subject to encryption are returned
    unchanged, without building a crypt filter (skip calculate key).
    """
    if self._is_encryption_object(obj):
        crypt_filter = self._make_crypt_filter(idnum, generation)
        return crypt_filter.decrypt_object(obj)
    return obj


@staticmethod
def _is_encryption_object(obj: PdfObject) -> bool:
    """Tell whether ``obj`` is a string, stream, array or dictionary object."""
    encryptable_types = (
        ByteStringObject,
        TextStringObject,
        StreamObject,
        ArrayObject,
        DictionaryObject,
    )
    return isinstance(obj, encryptable_types)
b) For all strings and streams without crypt filter specifier; treating the object number and generation number as binary integers, extend the original n-byte encryption key to n + 5 bytes by appending the low-order 3 bytes of the object number and the low-order 2 bytes of the generation number in that order, low-order byte first. (n is 5 unless the value of V in the encryption dictionary is greater than 1, in which case n is the value of Length divided by 8.) If using the AES algorithm, extend the encryption key an additional 4 bytes by adding the value “sAlT”, which corresponds to the hexadecimal values 0x73, 0x41, 0x6C, 0x54. (This addition is done for backward compatibility and is not intended to provide additional security.) c) Initialize the MD5 hash function and pass the result of step (b) as input to this function. d) Use the first (n + 5) bytes, up to a maximum of 16, of the output from the MD5 hash as the key for the RC4 or AES symmetric key algorithms, along with the string or stream data to be encrypted. If using the AES algorithm, the Cipher Block Chaining (CBC) mode, which requires an initialization vector, is used. The block size parameter is set to 16 bytes, and the initialization vector is a 16-byte random number that is stored as the first 16 bytes of the encrypted stream or string. Algorithm 3.1a Encryption of data using the AES algorithm 1. Use the 32-byte file encryption key for the AES-256 symmetric key algorithm, along with the string or stream data to be encrypted. Use the AES algorithm in Cipher Block Chaining (CBC) mode, which requires an initialization vector. The block size parameter is set to 16 bytes, and the initialization vector is a 16-byte random number that is stored as the first 16 bytes of the encrypted stream or string. The output is the encrypted data to be stored in the PDF file. 
@staticmethod
def _encode_password(password: Union[bytes, str]) -> bytes:
    """
    Convert a password to bytes.

    ``bytes`` input is returned unchanged. ``str`` input is encoded as
    Latin-1 when every character fits, and as UTF-8 otherwise.
    """
    if not isinstance(password, str):
        return password
    try:
        return password.encode("latin-1")
    except UnicodeEncodeError:
        # At least one character is outside Latin-1; fall back to UTF-8.
        return password.encode("utf-8")


def verify(self, password: Union[bytes, str]) -> PasswordType:
    """
    Verify ``password`` and remember the key on success.

    Dispatches to :meth:`verify_v4` for ``V <= 4`` and to :meth:`verify_v5`
    otherwise. The stored password type and key are only updated when the
    result is not ``PasswordType.NOT_DECRYPTED``.

    Returns:
        The kind of password that matched, or ``PasswordType.NOT_DECRYPTED``.

    """
    pwd = self._encode_password(password)
    key, rc = self.verify_v4(pwd) if self.V <= 4 else self.verify_v5(pwd)
    if rc != PasswordType.NOT_DECRYPTED:
        self._password_type = rc
        self._key = key
    return rc


def verify_v4(self, password: bytes) -> tuple[bytes, PasswordType]:
    """
    Try ``password`` as owner password first, then as user password.

    Returns:
        ``(key, password type)``; ``(b"", PasswordType.NOT_DECRYPTED)`` when
        neither check yields a key.

    """
    # verify owner password first
    key = AlgV4.verify_owner_password(
        password,
        self.R,
        self.Length,
        self.values.O,
        self.values.U,
        self.P,
        self.id1_entry,
        self.EncryptMetadata,
    )
    if key:
        return key, PasswordType.OWNER_PASSWORD
    key = AlgV4.verify_user_password(
        password,
        self.R,
        self.Length,
        self.values.O,
        self.values.U,
        self.P,
        self.id1_entry,
        self.EncryptMetadata,
    )
    if key:
        return key, PasswordType.USER_PASSWORD
    return b"", PasswordType.NOT_DECRYPTED


def verify_v5(self, password: bytes) -> tuple[bytes, PasswordType]:
    """
    Try ``password`` as owner password first, then as user password.

    After a match the ``/Perms`` entry is checked as well. A failed check is
    recorded in ``self._are_permissions_valid`` and logged as a warning, but
    it does not reject the password.

    Returns:
        ``(key, password type)``; ``(b"", PasswordType.NOT_DECRYPTED)`` when
        neither check yields a key.

    """
    # TODO: use SASLprep process
    # verify owner password first
    key = AlgV5.verify_owner_password(
        self.R, password, self.values.O, self.values.OE, self.values.U
    )
    rc = PasswordType.OWNER_PASSWORD
    if not key:
        key = AlgV5.verify_user_password(
            self.R, password, self.values.U, self.values.UE
        )
        rc = PasswordType.USER_PASSWORD
    if not key:
        return b"", PasswordType.NOT_DECRYPTED

    # verify Perms
    self._are_permissions_valid = AlgV5.verify_perms(
        key, self.values.Perms, self.P, self.EncryptMetadata
    )
    if not self._are_permissions_valid:
        logger_warning("ignore '/Perms' verify failed", __name__)
    return key, rc


def write_entry(
    self, user_password: str, owner_password: Optional[str]
) -> DictionaryObject:
    """
    Compute the password-dependent values and build the encryption dictionary.

    Args:
        user_password: The user password.
        owner_password: The owner password. When it is ``None`` or empty,
            the user password is used for the owner as well.

    Returns:
        A dictionary with ``/V``, ``/R``, ``/Length``, ``/P``, ``/Filter``,
        ``/O`` and ``/U``; the crypt filter entries for ``V >= 4``; and
        ``/OE``, ``/UE`` and ``/Perms`` for ``V >= 5``.

    """
    user_pwd = self._encode_password(user_password)
    owner_pwd = self._encode_password(owner_password) if owner_password else user_pwd

    if self.V <= 4:
        self.compute_values_v4(user_pwd, owner_pwd)
    else:
        self._key = secrets.token_bytes(self.Length // 8)
        values = AlgV5.generate_values(
            self.R, user_pwd, owner_pwd, self._key, self.P, self.EncryptMetadata
        )
        self.values.O = values["/O"]
        self.values.U = values["/U"]
        self.values.OE = values["/OE"]
        self.values.UE = values["/UE"]
        self.values.Perms = values["/Perms"]

    dict_obj = DictionaryObject()
    dict_obj[NameObject("/V")] = NumberObject(self.V)
    dict_obj[NameObject("/R")] = NumberObject(self.R)
    dict_obj[NameObject("/Length")] = NumberObject(self.Length)
    dict_obj[NameObject("/P")] = NumberObject(self.P)
    dict_obj[NameObject("/Filter")] = NameObject("/Standard")
    # ignore /EncryptMetadata

    dict_obj[NameObject("/O")] = ByteStringObject(self.values.O)
    dict_obj[NameObject("/U")] = ByteStringObject(self.values.U)

    if self.V >= 4:
        # TODO: allow different method
        std_cf = DictionaryObject()
        std_cf[NameObject("/AuthEvent")] = NameObject("/DocOpen")
        std_cf[NameObject("/CFM")] = NameObject(self.StmF)
        std_cf[NameObject("/Length")] = NumberObject(self.Length // 8)
        cf = DictionaryObject()
        cf[NameObject("/StdCF")] = std_cf
        dict_obj[NameObject("/CF")] = cf
        dict_obj[NameObject("/StmF")] = NameObject("/StdCF")
        dict_obj[NameObject("/StrF")] = NameObject("/StdCF")
        # ignore EFF
        # dict_obj[NameObject("/EFF")] = NameObject("/StdCF")

    if self.V >= 5:
        dict_obj[NameObject("/OE")] = ByteStringObject(self.values.OE)
        dict_obj[NameObject("/UE")] = ByteStringObject(self.values.UE)
        dict_obj[NameObject("/Perms")] = ByteStringObject(self.values.Perms)
    return dict_obj


def compute_values_v4(self, user_password: bytes, owner_password: bytes) -> None:
    """Compute the O value, the file key and the U value for ``V <= 4`` and store them."""
    rc4_key = AlgV4.compute_O_value_key(owner_password, self.R, self.Length)
    o_value = AlgV4.compute_O_value(rc4_key, user_password, self.R)

    key = AlgV4.compute_key(
        user_password,
        self.R,
        self.Length,
        o_value,
        self.P,
        self.id1_entry,
        self.EncryptMetadata,
    )
    u_value = AlgV4.compute_U_value(key, self.R, self.id1_entry)

    self._key = key
    self.values.O = o_value
    self.values.U = u_value


@staticmethod
def read(encryption_entry: DictionaryObject, first_id_entry: bytes) -> "Encryption":
    """
    Build an ``Encryption`` instance from an encryption dictionary.

    Args:
        encryption_entry: The encryption dictionary to read.
        first_id_entry: Passed through to the ``Encryption`` constructor.

    Raises:
        NotImplementedError: If ``/Filter`` is not ``/Standard``, a
            ``/SubFilter`` is present, ``/V`` is not one of 1-5, or a crypt
            filter method is not one of ``/Identity``, ``/V2``, ``/AESV2``,
            ``/AESV3``.

    """
    if encryption_entry.get("/Filter") != "/Standard":
        raise NotImplementedError(
            "only Standard PDF encryption handler is available"
        )
    if "/SubFilter" in encryption_entry:
        raise NotImplementedError("/SubFilter NOT supported")

    stm_filter = "/V2"
    str_filter = "/V2"
    ef_filter = "/V2"

    alg_ver = encryption_entry.get("/V", 0)
    if alg_ver not in (1, 2, 3, 4, 5):
        raise NotImplementedError(f"Encryption V={alg_ver} NOT supported")
    if alg_ver >= 4:
        filters = encryption_entry["/CF"]

        stm_filter = encryption_entry.get("/StmF", "/Identity")
        str_filter = encryption_entry.get("/StrF", "/Identity")
        ef_filter = encryption_entry.get("/EFF", stm_filter)

        # Replace each crypt filter name by the method (/CFM) it refers to.
        if stm_filter != "/Identity":
            stm_filter = filters[stm_filter]["/CFM"]  # type: ignore
        if str_filter != "/Identity":
            str_filter = filters[str_filter]["/CFM"]  # type: ignore
        if ef_filter != "/Identity":
            ef_filter = filters[ef_filter]["/CFM"]  # type: ignore

        allowed_methods = ("/Identity", "/V2", "/AESV2", "/AESV3")
        for label, method in (
            ("StmF", stm_filter),
            ("StrF", str_filter),
            ("EFF", ef_filter),
        ):
            if method not in allowed_methods:
                raise NotImplementedError(f"{label} Method {method} NOT supported!")

    alg_rev = cast(int, encryption_entry["/R"])
    perm_flags = cast(int, encryption_entry["/P"])
    key_bits = encryption_entry.get("/Length", 40)
    if alg_ver == 4 and stm_filter == "/AESV2":
        cf_dict = cast(DictionaryObject, filters[encryption_entry["/StmF"]])  # type: ignore[index]
        # CF /Length is in bytes (default 16 for AES-128), convert to bits
        key_bits = cast(int, cf_dict.get("/Length", 16)) * 8
    encrypt_metadata = encryption_entry.get("/EncryptMetadata")
    encrypt_metadata = (
        encrypt_metadata.value if encrypt_metadata is not None else True
    )
    values = EncryptionValues()
    values.O = cast(ByteStringObject, encryption_entry["/O"]).original_bytes
    values.U = cast(ByteStringObject, encryption_entry["/U"]).original_bytes
    values.OE = encryption_entry.get("/OE", ByteStringObject()).original_bytes
    values.UE = encryption_entry.get("/UE", ByteStringObject()).original_bytes
    values.Perms = encryption_entry.get("/Perms", ByteStringObject()).original_bytes
    return Encryption(
        V=alg_ver,
        R=alg_rev,
        Length=key_bits,
        P=perm_flags,
        EncryptMetadata=encrypt_metadata,
        first_id_entry=first_id_entry,
        values=values,
        StrF=str_filter,
        StmF=stm_filter,
        EFF=ef_filter,
        entry=encryption_entry,  # Dummy entry for the moment; will get removed
    )


@staticmethod
def make(
    alg: EncryptAlgorithm, permissions: int, first_id_entry: bytes
) -> "Encryption":
    """
    Create an ``Encryption`` instance for writing with the given algorithm.

    ``alg`` is unpacked into ``(V, R, key length in bits)``. The crypt filter
    method is ``/AESV2`` for AES-128, ``/AESV3`` for both AES-256 variants
    and ``/V2`` otherwise. Metadata encryption is always enabled.
    """
    alg_ver, alg_rev, key_bits = alg

    stm_filter, str_filter, ef_filter = "/V2", "/V2", "/V2"

    if alg == EncryptAlgorithm.AES_128:
        stm_filter, str_filter, ef_filter = "/AESV2", "/AESV2", "/AESV2"
    elif alg in (EncryptAlgorithm.AES_256_R5, EncryptAlgorithm.AES_256):
        stm_filter, str_filter, ef_filter = "/AESV3", "/AESV3", "/AESV3"

    return Encryption(
        V=alg_ver,
        R=alg_rev,
        Length=key_bits,
        P=permissions,
        EncryptMetadata=True,
        first_id_entry=first_id_entry,
        values=None,
        StrF=str_filter,
        StmF=stm_filter,
        EFF=ef_filter,
        entry=DictionaryObject(),  # Dummy entry for the moment; will get removed
    )
Represents the FontDescriptor dictionary as defined in the PDF specification. This contains both descriptive and metric information. The defaults are derived from the mean values of the 14 core fonts, rounded to 100. """ name: str = "Unknown" family: str = "Unknown" weight: str = "Unknown" ascent: float = 700.0 descent: float = -200.0 cap_height: float = 600.0 x_height: float = 500.0 italic_angle: float = 0.0 # Non-italic flags: int = 32 # Non-serif, non-symbolic, not fixed width bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0)) @dataclass(frozen=True) class CoreFontMetrics: font_descriptor: FontDescriptor character_widths: dict[str, int] @dataclass class Font: """ A font object for use during text extraction and for producing text appearance streams. Attributes: name: Font name, derived from font["/BaseFont"] character_map: The font's character map encoding: Font encoding sub_type: The font type, such as Type1, TrueType, or Type3. font_descriptor: Font metrics, including a mapping of characters to widths character_widths: A mapping of characters to widths space_width: The width of a space, or an approximation interpretable: Default True. If False, the font glyphs cannot be translated to characters, e.g. Type3 fonts that do not define a '/ToUnicode' mapping. 
@staticmethod
def _collect_tt_t1_character_widths(
    pdf_font_dict: DictionaryObject,
    char_map: dict[Any, Any],
    encoding: Union[str, dict[int, str]],
    current_widths: dict[str, int]
) -> None:
    """
    Parse a TrueType or Type1 font's /Widths array and update character widths.

    Args:
        pdf_font_dict: Font dictionary; must contain ``/Widths``.
            ``/FirstChar`` defaults to 0 when absent.
        char_map: Maps raw characters to the characters used as width keys.
        encoding: Either a codec name or a mapping of character code to
            character.
        current_widths: Updated in place.

    """
    widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
    first_char = pdf_font_dict.get("/FirstChar", 0)
    if not isinstance(encoding, str):
        # This means that encoding is a dict
        current_widths.update({
            encoding.get(idx + first_char, chr(idx + first_char)): width
            for idx, width in enumerate(widths_array)
        })
        return

    # We map the character code directly to the character
    # using the string encoding
    for idx, width in enumerate(widths_array):
        # Often "idx == 0" will denote the .notdef character, but we add it anyway
        char_code = idx + first_char  # This is a raw code
        # Get the "raw" character or byte representation
        raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
        # Translate raw_char to the REAL Unicode character using the char_map
        unicode_char = char_map.get(raw_char)
        if unicode_char:
            current_widths[unicode_char] = int(width)
        else:
            current_widths[raw_char] = int(width)


@staticmethod
def _collect_cid_character_widths(
    d_font: DictionaryObject,
    char_map: dict[Any, Any],
    current_widths: dict[str, int]
) -> None:
    """
    Parse the /W array from a DescendantFont dictionary and update character widths.

    A /W array that ends in the middle of a width definition is reported
    with a warning instead of raising ``IndexError``.

    Args:
        d_font: Descendant font dictionary; a missing ``/W`` is treated as
            an empty array.
        char_map: Only its ``str`` keys are used; ``ord(key)`` is matched
            against the character indices found in ``/W``.
        current_widths: Updated in place, keyed by the ``char_map`` values.

    """
    ord_map = {
        ord(_target): _surrogate
        for _target, _surrogate in char_map.items()
        if isinstance(_target, str)
    }
    # /W width definitions have two valid formats which can be mixed and matched:
    #   (1) A character start index followed by a list of widths, e.g.
    #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
    #   (2) A character start index, a character stop index, and a width, e.g.
    #       `45 65 500` applies width 500 to characters 45-65.
    skip_count = 0
    _w = d_font.get("/W", [])
    entry_count = len(_w)
    for idx, w_entry in enumerate(_w):
        w_entry = w_entry.get_object()
        if skip_count:
            skip_count -= 1
            continue
        if not isinstance(w_entry, (int, float)):
            # We should never get here due to skip_count above. But
            # sometimes we do.
            logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
            continue

        # Look ahead only as far as the array actually goes.
        w_next_entry = _w[idx + 1].get_object() if idx + 1 < entry_count else None

        # check for format (1): `int [int int int int ...]`
        if isinstance(w_next_entry, Sequence):
            start_idx = int(w_entry)
            current_widths.update(
                {
                    ord_map[_cidx]: _width
                    for _cidx, _width in zip(
                        range(start_idx, start_idx + len(w_next_entry)),
                        w_next_entry,
                    )
                    if _cidx in ord_map
                }
            )
            skip_count = 1
            continue

        # check for format (2): `int int int`
        w_third_entry = (
            _w[idx + 2].get_object()
            if isinstance(w_next_entry, (int, float)) and idx + 2 < entry_count
            else None
        )
        if isinstance(w_third_entry, (int, float)):
            # int() because range() rejects real numbers.
            start_idx, stop_idx = int(w_entry), int(w_next_entry)
            current_widths.update(
                {
                    ord_map[_cidx]: w_third_entry
                    for _cidx in range(start_idx, stop_idx + 1)
                    if _cidx in ord_map
                }
            )
            skip_count = 2
        else:
            # This handles the case of out of bounds (reaching the end of the width definitions
            # while expecting more elements).
            logger_warning(
                f"Invalid font width definition. Last element: {w_entry}.",
                __name__
            )


@staticmethod
def _add_default_width(current_widths: dict[str, int], flags: int) -> None:
    """
    Set ``current_widths["default"]`` in place.

    The value is 500 when no widths are known. With a non-zero space width
    it is once (fixed pitch) or twice the space width. Otherwise it is the
    average of the positive widths, or 500 when there are none.
    """
    if not current_widths:
        current_widths["default"] = 500
        return

    if " " in current_widths and current_widths[" "] != 0:
        # Setting default to once or twice the space width, depending on fixed pitch
        if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
            current_widths["default"] = current_widths[" "]
            return

        current_widths["default"] = int(2 * current_widths[" "])
        return

    # Use the average width of existing glyph widths
    valid_widths = [w for w in current_widths.values() if w > 0]
    current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500


@staticmethod
def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
    """
    Translate font descriptor entries into ``FontDescriptor`` keyword arguments.

    Only keys present in ``font_descriptor_obj`` are copied. ``/FontBBox`` is
    converted to a tuple of four floats.
    """
    font_descriptor_kwargs: dict[Any, Any] = {}
    for source_key, target_key in [
        ("/FontName", "name"),
        ("/FontFamily", "family"),
        ("/FontWeight", "weight"),
        ("/Ascent", "ascent"),
        ("/Descent", "descent"),
        ("/CapHeight", "cap_height"),
        ("/XHeight", "x_height"),
        ("/ItalicAngle", "italic_angle"),
        ("/Flags", "flags"),
        ("/FontBBox", "bbox")
    ]:
        if source_key in font_descriptor_obj:
            font_descriptor_kwargs[target_key] = font_descriptor_obj[source_key]
    # Handle missing bbox gracefully - PDFs may have fonts without valid bounding boxes
    if "bbox" in font_descriptor_kwargs:
        bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))
        assert len(bbox_tuple) == 4, bbox_tuple
        font_descriptor_kwargs["bbox"] = bbox_tuple
    return font_descriptor_kwargs
character_widths: dict[str, int] = {} interpretable = True # Deal with fonts by type; Type1, TrueType and certain Type3 if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"): # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be # reliably converted into character codes unless all named chars # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the # PDF 1.7 standard. if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict: interpretable = all( cname in adobe_glyphs for cname in pdf_font_dict.get("/CharProcs") or [] ) if interpretable: # Save some overhead if font is not interpretable if "/Widths" in pdf_font_dict: cls._collect_tt_t1_character_widths( pdf_font_dict, character_map, encoding, character_widths ) elif name in CORE_FONT_METRICS: font_descriptor = CORE_FONT_METRICS[name].font_descriptor character_widths = CORE_FONT_METRICS[name].character_widths if "/FontDescriptor" in pdf_font_dict: font_descriptor_obj = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object() if "/MissingWidth" in font_descriptor_obj: character_widths["default"] = cast(int, font_descriptor_obj["/MissingWidth"].get_object()) font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj)) elif "/FontBBox" in pdf_font_dict: # For Type3 without Font Descriptor but with FontBBox, see Table 110 in the PDF specification 2.0 bbox_tuple = tuple(map(float, cast(ArrayObject, pdf_font_dict["/FontBBox"]))) assert len(bbox_tuple) == 4, bbox_tuple font_descriptor = FontDescriptor(name=name, bbox=bbox_tuple) else: # Composite font or CID font - CID fonts have a /W array mapping character codes # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though, # because all other fonts have already been dealt with. 
d_font: DictionaryObject for d_font_idx, d_font in enumerate( cast(ArrayObject, pdf_font_dict["/DescendantFonts"]) ): d_font = cast(DictionaryObject, d_font.get_object()) cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font cls._collect_cid_character_widths( d_font, character_map, character_widths ) if "/DW" in d_font: character_widths["default"] = cast(int, d_font["/DW"].get_object()) font_descriptor_obj = d_font.get("/FontDescriptor", DictionaryObject()).get_object() font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj)) if not font_descriptor: font_descriptor = FontDescriptor(name=name) if character_widths.get("default", 0) == 0: cls._add_default_width(character_widths, font_descriptor.flags) space_width = character_widths.get(" ", 0) if space_width == 0: if (font_descriptor.flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH: space_width = character_widths["default"] else: space_width = character_widths["default"] // 2 return cls( name=name, sub_type=sub_type, encoding=encoding, font_descriptor=font_descriptor, character_map=character_map, character_widths=character_widths, space_width=space_width, interpretable=interpretable ) def as_font_resource(self) -> DictionaryObject: # For now, this returns a font resource that only works with the 14 Adobe Core fonts. 
def as_font_resource(self) -> DictionaryObject:
    """
    Build a font resource dictionary named after this font.

    For now, the returned resource only works with the 14 Adobe Core fonts:
    it is always a Type1 font with WinAnsiEncoding.
    """
    entries = {
        NameObject("/Subtype"): NameObject("/Type1"),
        NameObject("/Name"): NameObject(f"/{self.name}"),
        NameObject("/Type"): NameObject("/Font"),
        NameObject("/BaseFont"): NameObject(f"/{self.name}"),
        NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
    }
    return DictionaryObject(entries)


def text_width(self, text: str = "") -> float:
    """
    Add up the widths of all characters in ``text``.

    Characters without an entry in ``character_widths`` count with the
    ``"default"`` width. The result is a float; empty text gives 0.0.
    """
    total_width = 0.0
    for char in text:
        total_width += self.character_widths.get(char, self.character_widths["default"])
    return total_width
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. import math from collections.abc import Iterable, Iterator, Sequence from copy import deepcopy from dataclasses import asdict, dataclass from decimal import Decimal from io import BytesIO from pathlib import Path from typing import ( Any, Callable, Literal, Optional, Union, cast, overload, ) from ._font import Font from ._protocols import PdfCommonDocProtocol from ._text_extraction import ( _layout_mode, ) from ._text_extraction._text_extractor import TextExtraction from ._utils import ( CompressedTransformationMatrix, TransformationMatrixType, _human_readable_bytes, deprecate, logger_warning, matrix_multiply, ) from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING from .constants import AnnotationDictionaryAttributes as ADA from .constants import ImageAttributes as IA from .constants import PageAttributes as PG from .constants import Resources as RES from .errors import PageSizeNotDefinedError, PdfReadError from .generic import ( ArrayObject, ContentStream, DictionaryObject, EncodedStreamObject, FloatObject, IndirectObject, NameObject, NullObject, NumberObject, PdfObject, RectangleObject, StreamObject, is_null_or_none, ) try: from PIL.Image import Image pil_not_imported = False except ImportError: Image = object # type: ignore[assignment,misc,unused-ignore] # TODO: Remove unused-ignore on Python 3.10 pil_not_imported = True # error will be raised only when using images MERGE_CROP_BOX = "cropbox" # pypdf <= 3.4.0 used 
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, falling back to ``defaults``.

    When the entry is missing or null, the keys in ``defaults`` are tried in
    order and the first one that is neither missing nor null is used. The
    result is converted to a ``RectangleObject`` (using only the first four
    values if more are present, with a warning) and stored back under
    ``name`` before it is returned.

    Args:
        self: The dictionary-like object holding the boxes. It must provide
            ``get`` and, for indirect references, ``pdf.get_object``.
        name: Key of the requested rectangle.
        defaults: Fallback keys, tried in order.

    """
    retval: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if is_null_or_none(retval):
        for d in defaults:
            retval = self.get(d)
            # A null fallback counts as absent, the same way the requested
            # entry itself is treated above.
            if not is_null_or_none(retval):
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.get_object(retval)
    if isinstance(retval, ArrayObject) and (length := len(retval)) > 4:
        logger_warning(f"Expected four values, got {length}: {retval}", __name__)
        retval = RectangleObject(tuple(retval[:4]))
    else:
        retval = RectangleObject(retval)  # type: ignore
    _set_rectangle(self, name, retval)
    return retval


def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` under the key ``NameObject(name)``."""
    self[NameObject(name)] = value


def _delete_rectangle(self: Any, name: str) -> None:
    """Remove the entry ``name``."""
    del self[name]


def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:
    """Create a property that gets, sets and deletes the rectangle ``name``."""
    return property(
        lambda self: _get_rectangle(self, name, fallback),
        lambda self, value: _set_rectangle(self, name, value),
        lambda self: _delete_rectangle(self, name),
    )
class Transformation:
    """
    A 2D affine transformation.

    A change of coordinate system is described by a 3-by-3 matrix of the
    form::

        a b 0
        c d 0
        e f 1

    Only six entries vary, so PDF writes it as the array
    [ a b c d e f ], which is what :attr:`ctm` holds.

    Points are transformed by multiplying a row vector with the matrix::

                                 a b 0
     [ x′ y′ 1 ] = [ x y 1 ] ×   c d 0
                                 e f 1

    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        The transformation expanded to a tuple of three rows:
        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        values = self.ctm
        first_row = (values[0], values[1], 0)
        second_row = (values[2], values[3], 0)
        third_row = (values[4], values[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 matrix to its six variable entries.

        Args:
            matrix: The transformation matrix as a tuple of rows.

        Returns:
            The entries (a, b, c, d, e, f), i.e. the first two columns of
            each of the three rows.

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        return (top[0], top[1], middle[0], middle[1], bottom[0], bottom[1])

    def _to_cm(self) -> str:
        # Content-stream form: six numbers with four decimals, then the cm operator
        numbers = " ".join(f"{self.ctm[index]:.4f}" for index in range(6))
        return numbers + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: The transformation multiplied onto this one.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = op.transform(Transformation((-1, 0, 0, 1, width, 0)))  # then horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the contents of a page.

        Args:
            tx: Offset along the x-axis.
            ty: Offset along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        a, b, c, d = self.ctm[0], self.ctm[1], self.ctm[2], self.ctm[3]
        shifted = (a, b, c, d, self.ctm[4] + tx, self.ctm[5] + ty)
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page relative to the coordinate origin.

        The origin is typically the lower-left corner of the page; translate
        the contents or the page boxes to scale around another point.

        Args:
            sx: Factor along the x-axis; defaults to ``sy`` when omitted.
            sy: Factor along the y-axis; defaults to ``sx`` when omitted.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither factor is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, scaling))
        )

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The rotation angle, in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, turning))
        )

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Transform a single point.

        Args:
            pt: The point (x, y), as a tuple or a list.
            as_object: When True the coordinates are returned as
                FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'), in the same container type
            (list or tuple) as ``pt``.

        """
        wrap = FloatObject if as_object else float
        x = float(pt[0])
        y = float(pt[1])
        new_x = wrap(x * self.ctm[0] + y * self.ctm[2] + self.ctm[4])
        new_y = wrap(x * self.ctm[1] + y * self.ctm[3] + self.ctm[5])
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)
@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        # ``_id_translated`` is used as the marker of a PdfWriter
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")

        # Let pillow write a one-page PDF and take the image object out of it
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Swap the extension only when the name has one: with no "." in the
        # name, ``rfind`` returns -1 and slicing would drop the last character.
        stem, dot, _ = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Reuse __str__ without its closing parenthesis and append the data hash
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"
class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The identifiers are obtained from ``ids_function`` on every access and
    each image is built on demand by ``get_function``.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the image identifiers, as produced by ``ids_function``."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for every image."""
        return [(x, self[x]) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Look up images by position, slice or identifier.

        Raises:
            TypeError: If ``index`` is of an unsupported type.
            IndexError: If an integer index is out of range.
        """
        lst = self.ids_function()
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            lst = [lst[x] for x in indices]
            cls = type(self)
            # A slice is another virtual list over the selected identifiers
            return cls((lambda: lst), self.get_function)
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Resolve the identifiers once. Going through ``len(self)`` and
        # ``self[i]`` would call ``ids_function`` again for every element.
        for key in self.ids_function():
            yield self.get_function(key)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"
def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Set up the page and, when a usable reference is given, copy the
    entries of the referenced dictionary into it.
    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    self.indirect_reference = indirect_reference
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}
    if is_null_or_none(indirect_reference):
        return
    assert indirect_reference is not None, "mypy"
    source = cast(DictionaryObject, indirect_reference.get_object())
    self.update(source)

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is built with ``DictionaryObject`` as the type marker, so
    a page hashes like a plain DictionaryObject with the same entries.

    Returns:
        Hash considering type and value.

    """
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))

def hash_value_data(self) -> bytes:
    """Return the inherited hash data followed by this object's ``id()``."""
    identity = f"{id(self)}".encode()
    return super().hash_value_data() + identity

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The unit is expressed in multiples of 1/72 inch: 1 means one user space
    unit is 1/72 inch, 3 means it is 3/72 inch. Defaults to 1 when the page
    has no such entry.
    """
    return self.get(PG.USER_UNIT, 1)

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both dimensions are taken from
    the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) == 0:
            raise PageSizeNotDefinedError
        last_box = pdf.pages[len(pdf.pages) - 1].mediabox
        width = last_box.width
        height = last_box.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box
    return page
def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Image XObjects are reported by name. An image nested in other XObjects
    is reported as the list of names leading to it. The page's inline images
    are appended once, at page level.

    Args:
        obj: Object whose resources are scanned; the page itself when None.
        ancest: Names of the enclosing XObjects (empty at page level).
        call_stack: References already visited, to stop on cycles.

    Returns:
        The identifiers, XObject images first, inline images last.
    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # already being visited: a form referencing itself, directly or not
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []
    # Inline images come from the page's own content stream. Nested calls
    # always receive a non-empty ``ancest``, so restricting them to the page
    # level avoids listing them again for every nested XObject.
    inline_ids: list[Union[str, list[str]]] = (
        list(self.inline_images.keys()) if not ancest else []
    )
    lst: list[Union[str, list[str]]] = []
    if (
        PG.RESOURCES not in obj
        or is_null_or_none(resources := obj[PG.RESOURCES])
        or RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst
def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` for one image identifier.

    Args:
        id: A name (XObject image, or ``~n~`` for an inline image), or a
            list/tuple of names leading through nested XObjects.
        obj: Object whose ``/Resources`` are searched; the page when None.

    Returns:
        The matching image.

    Raises:
        KeyError: If the XObject resources are missing for a non-inline id,
            or the name is not found.
    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    # Normalise: tuples become lists, one-element lists become a plain name
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        id = id[0]
    xobjs: Optional[DictionaryObject] = None
    try:
        xobjs = cast(
            DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
        )
    except KeyError as exc:
        # Missing XObject resources only matter when the id is not of the inline form ~n~
        if not (id[0] == "~" and id[-1] == "~"):
            raise KeyError(
                f"Cannot access image object {id} without XObject resources"
            ) from exc
    if isinstance(id, str):
        if id[0] == "~" and id[-1] == "~":
            # inline image: served from the (lazily built) inline image table
            if self.inline_images is None:
                self.inline_images = self._get_inline_images()
            if self.inline_images is None:
                raise KeyError("No inline image can be found")
            return self.inline_images[id]

        assert xobjs is not None
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415
        imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
        extension, byte_stream = imgd[:2]
        return ImageFile(
            name=f"{id[1:]}{extension}",  # id without its first character, plus the extension
            data=byte_stream,
            image=imgd[2],
            indirect_reference=xobjs[id].indirect_reference,
        )
    # in a subobject
    # descend into the XObject named by the first element, with the rest of the path
    assert xobjs is not None
    ids = id[1:]
    return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    Get a list of all images on the page. The key can be:

    - A string (for the top object)
    - A tuple (for images within XObject forms)
    - An integer

    Examples:
        * `reader.pages[0].images[0]`        # return first image
        * `reader.pages[0].images['/I0']`    # return image '/I0'
        * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
        * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

    * `.name` : name of the object
    * `.data` : bytes of the object
    * `.image`  : PIL Image Object
    * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    # The list is virtual: identifiers and images are computed on access
    return VirtualListImages(self._get_ids_image, self._get_image)
def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate values used in inline image

    Args:
        k: Key the value belongs to; only used in the error message.
        v: Value to translate.

    Returns:
        The mapped name when ``v`` is a known abbreviation, the page's
        ``/ColorSpace`` resource entry when ``v`` is another name, else ``v``
        unchanged.

    Raises:
        PdfReadError: If ``v`` is a name with no matching resource entry.
    """
    try:
        v = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        # not a known abbreviation (KeyError) or not usable as a key (TypeError)
        if isinstance(v, NameObject):
            # It is a custom name, thus we have to look in resources.
            # The only applicable case is for ColorSpace.
            try:
                res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
                v = cast(DictionaryObject, res)[v]
            except KeyError:  # for res and v
                raise PdfReadError(f"Cannot find resource entry {v} for {k}")
    return v

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load inline images. Entries will be identified as `~1~`.

    Returns:
        A mapping from ``~n~`` (n being the position of the inline image in
        the content stream, starting at 0) to its ``ImageFile``; empty when
        the page has no contents.

    Raises:
        PdfReadError: If a bare ``BI``/``ID``/``EI`` operator is met.
    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    imgs_data = []
    assert content is not None, "mypy"
    # First pass: collect settings and raw data of each INLINE IMAGE operation
    for param, ope in content.operations:
        if ope == b"INLINE IMAGE":
            imgs_data.append(
                {"settings": param["settings"], "__streamdata__": param["data"]}
            )
        elif ope in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{ope!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )
    files = {}
    # Second pass: turn each entry into a stream object and decode it
    for num, ii in enumerate(imgs_data):
        init = {
            "__streamdata__": ii["__streamdata__"],
            "/Length": len(ii["__streamdata__"]),
        }
        for k, v in ii["settings"].items():
            if k in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(v, list):
                v = ArrayObject(
                    [self._translate_value_inline_image(k, x) for x in v]
                )
            else:
                v = self._translate_value_inline_image(k, v)
            # map the key through the inline image key table (KeyError for an unknown key)
            k = NameObject(_INLINE_IMAGE_KEY_MAPPING[k])
            if k not in init:
                # first occurrence wins; /Length computed above is never overridden
                init[k] = v
        ii["object"] = EncodedStreamObject.initialize_from_dictionary(init)
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415
        extension, byte_stream, img = _xobj_to_image(ii["object"])
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files
@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    Defaults to 0 when the page has no rotation entry.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    # ``get`` returns the raw entry: anything that is not an int is resolved first
    return rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # Truncate to int, snap to a multiple of 90 (adding 45 before the floor
    # division), then reduce modulo 360
    self[NameObject(PG.ROTATE)] = NumberObject((((int(r) + 45) // 90) * 90) % 360)

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    It is recommended to apply this function before page merging.

    The page rotation is reset to 0 afterwards.
    """
    r = -self.rotation  # rotation to apply is in the other way
    self.rotation = 0
    mb = RectangleObject(self.mediabox)
    # Move the centre of the mediabox to the origin, then rotate around it
    trsf = (
        Transformation()
        .translate(
            -float(mb.left + mb.width / 2), -float(mb.bottom + mb.height / 2)
        )
        .rotate(r)
    )
    pt1 = trsf.apply_on(mb.lower_left)
    pt2 = trsf.apply_on(mb.upper_right)
    # Shift so that the smallest transformed corner coordinates land on (0, 0)
    trsf = trsf.translate(-min(pt1[0], pt2[0]), -min(pt1[1], pt2[1]))
    self.add_transformation(trsf, False)
    # Transform every box present on the page and re-normalise its corners
    for b in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:
        if b in self:
            rr = RectangleObject(self[b])  # type: ignore
            pt1 = trsf.apply_on(rr.lower_left)
            pt2 = trsf.apply_on(rr.upper_right)
            self[NameObject(b)] = RectangleObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90 != 0:
        raise ValueError("Rotation angle must be a multiple of 90")
    # The sum is stored as is (it is not reduced modulo 360 here)
    self[NameObject(PG.ROTATE)] = NumberObject(self.rotation + angle)
    return self

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge one resource category of ``res2`` into the one of ``res1``.

    Args:
        res1: Resources of the receiving page.
        res2: Resources of the page being merged in.
        resource: Key of the category to merge (fonts, XObjects, ...).
        new_res1: When True the merge is done in a fresh dictionary seeded
            from ``res1[resource]``; when False ``res1[resource]`` itself is
            updated (and must exist).

    Returns:
        A tuple ``(merged, rename)``: the merged category, with entries in
        sorted order, and a mapping from names of ``res2`` to the new names
        they received to avoid clashes.
    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    # NOTE: this closure reads ``page2res`` and ``new_res``, which are bound
    # further down, before the first call.
    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        value = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        computed_key = base_key
        idx = 0
        while computed_key in new_res:
            if new_res.raw_get(computed_key) == value:
                # there's already a resource of this name, with the exact
                # same value
                return computed_key, True
            computed_key = f"{base_key}-{idx}"
            idx += 1
        return computed_key, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )
    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if not same_value:
            if is_pdf_writer:
                # clone into the destination document and, when the clone has
                # a reference, store the reference instead of the object
                new_res[newname] = page2res.raw_get(key).clone(pdf)
                try:
                    new_res[newname] = new_res[newname].indirect_reference
                except AttributeError:
                    pass
            else:
                new_res[newname] = page2res.raw_get(key)
    # Rebuild the dictionary in sorted order
    lst = sorted(new_res.items())
    new_res.clear()
    for el in lst:
        new_res[el[0]] = el[1]
    return new_res, rename_res
@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Replace resource names used as operands in a content stream.

    Args:
        stream: The content stream to process.
        rename: Mapping from old names to new names.
        pdf: Document passed to the ``ContentStream`` copy.

    Returns:
        ``stream`` itself when ``rename`` is empty, otherwise a new
        ``ContentStream`` with the names replaced.

    Raises:
        KeyError: If an operation carries operands that are neither a list
            nor a dict.
    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, _operator in stream.operations:
        if isinstance(operands, list):
            for i, op in enumerate(operands):
                if isinstance(op, NameObject):
                    operands[i] = rename.get(op, op)
        elif isinstance(operands, dict):
            for i, op in operands.items():
                if isinstance(op, NameObject):
                    operands[i] = rename.get(op, op)
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
    return stream

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """Add transformation matrix at the beginning of the given contents stream."""
    contents = ContentStream(contents, pdf)
    contents.operations.insert(
        0,
        [
            [FloatObject(x) for x in ctm],
            b"cm",
        ],
    )
    return contents

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist
        or is null.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS]
    # Same rule as ``get_contents``: a null entry means "no contents". A null
    # object has no ``get_data`` and would otherwise raise AttributeError.
    if is_null_or_none(obj):
        return None
    obj = obj.get_object()
    if isinstance(obj, list):
        # an array of streams is read as the concatenation of their data
        return b"".join(x.get_object().get_data() for x in obj)
    return cast(EncodedStreamObject, obj).get_data()

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object, or ``None`` if it does not exist or is
        null. ``/Contents`` is optional, as described in §7.7.3.3 of the
        PDF Reference.

    """
    if PG.CONTENTS in self:
        try:
            pdf = cast(IndirectObject, self.indirect_reference).pdf
        except AttributeError:
            # no usable reference: the stream is built without a document
            pdf = None
        obj = self[PG.CONTENTS]
        if is_null_or_none(obj):
            return None
        resolved_object = obj.get_object()
        return ContentStream(resolved_object, pdf)
    return None
def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects

    Args:
        content: new content; if None delete the content field.

    Afterwards the cached inline images are dropped, except on the two
    early-return paths (unattached page; nothing to delete).
    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    # NOTE: named ``writer`` but it may still be a non-writer document here
    # (see the deprecation above); the except clauses below cover that case.
    writer = self.indirect_reference.pdf
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        # current contents stored directly as an array: null out each referenced object
        content_array = cast(ArrayObject, self[PG.CONTENTS])
        for reference in content_array:
            try:
                writer._replace_object(indirect_reference=reference.indirect_reference, obj=NullObject())
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    if isinstance(content, ArrayObject):
        # register every element and keep what ``_add_object`` returns
        content = ArrayObject(writer._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # deletion: null out the existing object and drop the entry
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # no existing entry whose slot could be reused: add the content as a new object
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # reuse the reference of the existing contents for the new content
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None
def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) are maintained from both pages.
    The mediabox, cropbox, etc of this page are not altered (unless
    ``expand`` is set). The parameter page's content stream will be added
    to the end of this page's content stream, meaning that it will be drawn
    after, or "on top" of this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    self._merge_page(page2, over=over, expand=expand)

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, optionally transforming its content.

    Pages attached to a writer are handed to ``_merge_page_writer``; the rest
    of this method is the generic path.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Callable applied to the content stream of
            ``page2`` before it is merged.
        ctm: Matrix used to compute the expanded mediabox.
        over: Put the page2 content over this page's content if True,
            else under.
        expand: Grow the mediabox to also cover ``page2``.
    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        # NOTE(review): this also catches errors raised inside
        # ``_merge_page_writer`` and then falls through to the generic path.
        pass

    new_resources = DictionaryObject()
    rename = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Clip the merged content to the box named by MERGE_CROP_BOX:
        # "re" defines the rectangle, "W" makes it the clip path, "n" ends the path.
        rect = getattr(page2, MERGE_CROP_BOX)
        page2content.operations.insert(
            0,
            (
                # A real list, not ``map``: the operands stay in the
                # operations list and a one-shot iterator would be empty the
                # second time the stream is serialised.
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None
def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page when this page belongs to a writer.

    Resources are merged into this page's own resource dictionaries.
    Annotations of ``page2`` are cloned into the document with their
    geometry transformed by ``ctm``. The content streams are then combined.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Callable applied to the content stream of
            ``page2`` before it is merged.
        ctm: Matrix applied to annotation geometry and used to compute the
            expanded mediabox.
        over: Put the page2 content over this page's content if True,
            else under.
        expand: Grow the mediabox to also cover ``page2``.
    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            # keep the rectangle normalised whatever the transformation does
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                # /QuadPoints holds 8 x n numbers, one quadrilateral per
                # marked region: transform every (x, y) pair, not only the
                # four vertices of the first quadrilateral.
                transformed = []
                for i in range(0, len(q) - 1, 2):
                    transformed.extend(trsf.apply_on((q[i], q[i + 1]), True))
                aa[NameObject("/QuadPoints")] = ArrayObject(transformed)
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                # no popup attached to this annotation
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Clip the merged content to the box named by MERGE_CROP_BOX:
        # "re" defines the rectangle, "W" makes it the clip path, "n" ends the path.
        rect = getattr(page2, MERGE_CROP_BOX)
        page2content.operations.insert(
            0,
            (
                # A real list, not ``map``: the operands stay in the
                # operations list and a one-shot iterator would be empty the
                # second time the stream is serialised.
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)
self.mediabox.bottom.as_numeric(), self.mediabox.right.as_numeric(), self.mediabox.top.as_numeric(), ) corners2 = ( page2.mediabox.left.as_numeric(), page2.mediabox.bottom.as_numeric(), page2.mediabox.left.as_numeric(), page2.mediabox.top.as_numeric(), page2.mediabox.right.as_numeric(), page2.mediabox.top.as_numeric(), page2.mediabox.right.as_numeric(), page2.mediabox.bottom.as_numeric(), ) if ctm is not None: ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] new_x = tuple( ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] for i in range(0, 8, 2) ) new_y = tuple( ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] for i in range(0, 8, 2) ) else: new_x = corners2[0:8:2] new_y = corners2[1:8:2] lowerleft = (min(new_x), min(new_y)) upperright = (max(new_x), max(new_y)) lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) upperright = ( max(corners1[2], upperright[0]), max(corners1[3], upperright[1]), ) self.mediabox.lower_left = lowerleft self.mediabox.upper_right = upperright def merge_transformed_page( self, page2: "PageObject", ctm: Union[CompressedTransformationMatrix, Transformation], over: bool = True, expand: bool = False, ) -> None: """ Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation matrix is applied to the merged stream. Args: page2: The page to be merged into this one. ctm: a 6-element tuple containing the operands of the transformation matrix over: set the page2 content over page1 if True (default) else under expand: Whether the page should be expanded to fit the dimensions of the page to be merged. 
def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge ``page2`` into this page after scaling its content stream.

    Works like :meth:`~pypdf._page.PageObject.merge_page`; the same factor is
    used for both axes.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    uniform_scaling = Transformation().scale(scale, scale)
    self.merge_transformed_page(page2, uniform_scaling, over, expand)


def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after rotating its content stream.

    Works like :meth:`~pypdf._page.PageObject.merge_page`; the rotation is
    expressed as a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    rotation_matrix = Transformation().rotate(rotation)
    self.merge_transformed_page(page2, rotation_matrix, over, expand)
def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation`
            object can be passed.
        expand: If True, the media box is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    stream = self.get_contents()
    if stream is not None:
        stream = PageObject._add_transformation_matrix(stream, self.pdf, ctm)
        stream.isolate_graphics_state()
        self.replace_contents(stream)

    if not expand:
        return

    # Transform the four media box corners and take their bounding box.
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    matrix = tuple(float(value) for value in ctm)
    xs = [matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corner_points]
    ys = [matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corner_points]

    box.lower_left = (min(xs), min(ys))
    box.upper_right = (max(xs), max(ys))
def scale_by(self, factor: float) -> None:
    """
    Scale the page uniformly.

    Delegates to :meth:`scale` with the same factor for both axes, which
    transforms the content and updates the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    self.scale(sx=factor, sy=factor)


def scale_to(self, width: float, height: float) -> None:
    """
    Scale the page so that its media box gets the given dimensions.

    The horizontal and vertical factors are derived from the current media
    box size and handed to :meth:`scale`.

    Args:
        width: The new width.
        height: The new height.

    """
    current_box = self.mediabox
    horizontal_factor = width / float(current_box.width)
    vertical_factor = height / float(current_box.height)
    self.scale(sx=horizontal_factor, sy=vertical_factor)
@property
def page_number(self) -> Optional[int]:
    """
    Read-only property giving the index of this page in the owning document.

    Returns:
        The position of the page in ``pdf.pages``; None if the page has no
        indirect reference or is not found in that list.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        # The page is not (or no longer) part of the document's page list.
        return None
def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by replaying its content stream operations.

    See extract_text for most arguments.

    Args:
        obj: The object to extract from (a page, or an XObject when
            ``content_key`` is None).
        pdf: Passed to :class:`ContentStream` when the content has to be parsed.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = "/Contents"

    Returns:
        The extracted text; an empty string if no /Resources or no content
        can be found.

    """
    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    # Step 1: locate the resources dictionary, walking up the /Parent chain.
    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    # Step 2: build a Font for every entry of /Resources/Font.
    if (
        not is_null_or_none(resources_dict)
        and "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        for font_resource in font_resources_dict:
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(" ", 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                # Best effort: a font entry that cannot be processed is skipped.
                pass

    # Step 3: obtain the content as a parsed ContentStream.
    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    # Step 4: replay every operation; compound operators are decomposed into
    # the simpler operators understood by the extractor.
    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # ' is processed as: T* followed by Tj
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # " is processed as: Tw, Tc, T* and finally Tj with the remaining operands
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    # String entries are shown as text ...
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # ... numeric entries that are large enough become a single
                    # space, unless the text is empty or already ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # TD is processed as: TL with the negated second operand, then Td
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before the XObject is handled.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            try:
                # Make sure the output ends with a newline before the XObject text.
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                # Output is still empty; nothing to terminate.
                pass
            try:
                xobj = resources_dict["/XObject"]
                # Images carry no text; anything else is extracted recursively.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                # Best effort: a broken XObject only produces a warning.
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Reset the pending text and remember the current matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output
def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Collect the fonts used for "layout" mode text extraction.

    The page and then each object on its /Parent chain are inspected; a font
    found further up the chain replaces a same-named font found earlier.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    collected: dict[str, Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            font_dict = resources["/Font"]
            for name in font_dict:
                collected[name] = Font.from_font_resource(font_dict[name])
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            # Top of the chain reached.
            node = None
    return collected


def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Extract the page text while keeping it close to the rendered layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
            closely adheres to the rendered layout in the source pdf.

    """
    layout_fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        fonts_dump = json.dumps(layout_fonts, indent=2, default=asdict)
        debug_path.joinpath("fonts.json").write_text(fonts_dump, "utf-8")

    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    text_groups = _layout_mode.text_show_operations(
        iter(stream.operations), layout_fonts, strip_rotated, debug_path
    )
    if not text_groups:
        return ""

    lines_by_y = _layout_mode.y_coordinate_groups(text_groups, debug_path)
    width_per_char = _layout_mode.fixed_char_width(text_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        lines_by_y, width_per_char, space_vertically, font_height_weight
    )
drawing commands, in the order they are provided in the content stream, and extract the text. This works well for some PDF files, but poorly for others, depending on the generator used. This will be refined in the future. Do not rely on the order of text coming out of this function, as it will change if this function is made more sophisticated. Arabic and Hebrew are extracted in the correct order. If required a custom RTL range of characters can be defined; see function set_custom_rtl. Additionally you can provide visitor methods to get informed on all operations and all text objects. For example in some PDF files this can be useful to parse tables. Args: orientations: list of orientations extract_text will look for default = (0, 90, 180, 270) note: currently only 0 (up),90 (turned left), 180 (upside down), 270 (turned right) Silently ignored in "layout" mode. space_width: force default space width if not extracted from font (default: 200) Silently ignored in "layout" mode. visitor_operand_before: function to be called before processing an operation. It has four arguments: operator, operand-arguments, current transformation matrix and text matrix. Ignored with a warning in "layout" mode. visitor_operand_after: function to be called after processing an operation. It has four arguments: operator, operand-arguments, current transformation matrix and text matrix. Ignored with a warning in "layout" mode. visitor_text: function to be called when extracting some text at some position. It has five arguments: text, current transformation matrix, text matrix, font-dictionary and font-size. The font-dictionary may be None in case of unknown fonts. If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". Ignored with a warning in "layout" mode. extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality, "layout" for experimental layout mode functionality. 
NOTE: orientations, space_width, and visitor_* parameters are NOT respected in "layout" mode. kwargs: layout_mode_space_vertically (bool): include blank lines inferred from y distance + font height. Defaults to True. layout_mode_scale_weight (float): multiplier for string length when calculating weighted average character width. Defaults to 1.25. layout_mode_strip_rotated (bool): layout mode does not support rotated text. Set to False to include rotated text anyway. If rotated text is discovered, layout will be degraded and a warning will result. Defaults to True. layout_mode_debug_path (Path | None): if supplied, must target a directory. creates the following files with debug information for layout mode functions if supplied: - fonts.json: output of self._layout_mode_fonts - tjs.json: individual text render ops with corresponding transform matrices - bts.json: text render ops left justified and grouped by BT/ET operators - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines) layout_mode_font_height_weight (float): multiplier for font height when calculating blank lines. Defaults to 1. 
def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    Args:
        xform: The XObject whose own content is scanned for text.
        orientations: Orientations to look for, with the same meaning as in
            :meth:`extract_text`.
            default = (0, 90, 180, 270)
        space_width: force default space width
            (if not extracted from font (default 200)
        visitor_operand_before: function to be called before processing an
            operation; see :meth:`extract_text`.
        visitor_operand_after: function to be called after processing an
            operation; see :meth:`extract_text`.
        visitor_text: function to be called when extracting some text;
            see :meth:`extract_text`.

    Returns:
        The extracted text

    """
    # The default used to be (0, 90, 270, 360): it left out 180 and listed
    # 360, which is not among the orientations documented by extract_text.
    # It now matches the default of extract_text and _extract_text.
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,  # content_key=None: the XObject itself holds the content
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )
@property
def annotations(self) -> Optional[ArrayObject]:
    """The ``/Annots`` array of the page, or None if the page has none."""
    if "/Annots" in self:
        return cast(ArrayObject, self["/Annots"])
    return None


@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Passing None removes an existing ``/Annots`` entry.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]
class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages backed by two callables instead of a stored list.

    The length and the items are obtained on demand from ``length_function``
    and ``get_function``.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Store the callables used to answer ``len()`` and item access.

        Args:
            length_function: Returns the current number of pages.
            get_function: Returns the page at a given non-negative index.

        """
        self.length_function = length_function
        self.get_function = get_function
        # NOTE(review): not read anywhere in this class - confirm whether it is still needed.
        self.current = -1

    def __len__(self) -> int:
        """Return the number of pages reported by ``length_function``."""
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a new virtual list for a slice.

        Raises:
            TypeError: If ``index`` is neither an int nor a slice.
            IndexError: If the (normalized) index is out of range.

        """
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            cls = type(self)
            # The slice is itself virtual: it maps its own positions onto the
            # selected positions of this list and fetches pages on access.
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or a slice of pages, from the page tree.

        Raises:
            TypeError: If ``index`` is neither an int nor a slice.
            IndexError: If the (normalized) index is out of range.
            PdfReadError: If the page is not listed in the /Kids of its /Parent.

        """
        if isinstance(index, slice):
            r = list(range(*index.indices(len(self))))
            # pages have to be deleted from last to first
            r.sort()
            r.reverse()
            for p in r:
                del self[p]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        # `first` stays True until the reference has been removed from at least
        # one /Kids array; it decides between raising and silently stopping below.
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                i = cast(ArrayObject, parent["/Kids"]).index(ind)
                del cast(ArrayObject, parent["/Kids"])[i]
                first = False
                try:
                    assert ind is not None
                    # NOTE(review): this runs on every tree level where `ind` is
                    # found, with the same `index` - confirm that repeating it
                    # after an emptied node was pruned is intended.
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                    # No more objects in this part of this subtree
                    # Continue one level up and remove the now empty node itself.
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
                # NOTE(review): when /Kids is not empty, `parent` is unchanged, so
                # the next iteration fails the index() lookup and leaves through
                # `break`; ancestors further up are not visited - confirm their
                # /Count does not need to be decremented as well.
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        """Yield the pages in index order, fetching each one on demand."""
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        """Return a list-like string of ``PageObject(i)`` placeholders."""
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"
range(self.length_function())] return f"[{', '.join(p)}]" def _get_fonts_walk( obj: DictionaryObject, fnt: set[str], emb: set[str], ) -> tuple[set[str], set[str]]: """ Get the set of all fonts and all embedded fonts. Args: obj: Page resources dictionary fnt: font emb: embedded fonts Returns: A tuple (fnt, emb) If there is a key called 'BaseFont', that is a font that is used in the document. If there is a key called 'FontName' and another key in the same dictionary object that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is embedded. We create and add to two sets, fnt = fonts used and emb = fonts embedded. """ fontkeys = ("/FontFile", "/FontFile2", "/FontFile3") def process_font(f: DictionaryObject) -> None: nonlocal fnt, emb f = cast(DictionaryObject, f.get_object()) # to be sure if "/BaseFont" in f: fnt.add(cast(str, f["/BaseFont"])) if ( ("/CharProcs" in f) or ( "/FontDescriptor" in f and any( x in cast(DictionaryObject, f["/FontDescriptor"]) for x in fontkeys ) ) or ( "/DescendantFonts" in f and "/FontDescriptor" in cast( DictionaryObject, cast(ArrayObject, f["/DescendantFonts"])[0].get_object(), ) and any( x in cast( DictionaryObject, cast( DictionaryObject, cast(ArrayObject, f["/DescendantFonts"])[0].get_object(), )["/FontDescriptor"], ) for x in fontkeys ) ) ): # the list comprehension ensures there is FontFile try: emb.add(cast(str, f["/BaseFont"])) except KeyError: emb.add("(" + cast(str, f["/Subtype"]) + ")") if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]): for f in cast(DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]): process_font(f) if "/Resources" in obj: if "/Font" in cast(DictionaryObject, obj["/Resources"]): for f in cast( DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/Font"] ).values(): process_font(f) if "/XObject" in cast(DictionaryObject, obj["/Resources"]): for x in cast( DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/XObject"] ).values(): 
_get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb) if "/Annots" in obj: for a in cast(ArrayObject, obj["/Annots"]): _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb) if "/AP" in obj: if ( cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]).get( "/Type" ) == "/XObject" ): _get_fonts_walk( cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]), fnt, emb, ) else: for a in cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]): _get_fonts_walk(cast(DictionaryObject, a), fnt, emb) return fnt, emb # return the sets for each page ================================================ FILE: pypdf/_page_labels.py ================================================ """ Page labels are shown by PDF viewers as "the page number". A page has a numeric index, starting at 0. Additionally, the page has a label. In the most simple case: label = index + 1 However, the title page and the table of contents might have Roman numerals as page labels. This makes things more complicated. Example 1 --------- >>> reader.root_object["/PageLabels"]["/Nums"] [0, IndirectObject(18, 0, 139929798197504), 8, IndirectObject(19, 0, 139929798197504)] >>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1]) {'/S': '/r'} >>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3]) {'/S': '/D'} Example 2 --------- The following is a document with pages labeled i, ii, iii, iv, 1, 2, 3, A-8, A-9, ... 1 0 obj << /Type /Catalog /PageLabels << /Nums [ 0 << /S /r >> 4 << /S /D >> 7 << /S /D /P ( A- ) /St 8 >> % A number tree containing % three page label dictionaries ] >> ... 
def number2uppercase_roman_numeral(num: int) -> str:
    """
    Convert a number to an uppercase Roman numeral.

    Args:
        num: The number to convert.

    Returns:
        The Roman numeral, e.g. "MCMXCIV" for 1994; an empty string for 0.

    """
    numerals = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )
    pieces = []
    for value, symbol in numerals:
        count = num // value
        pieces.append(symbol * count)
        num -= value * count
        if num <= 0:
            break
    return "".join(pieces)


def number2lowercase_roman_numeral(number: int) -> str:
    """Convert a number to a lowercase Roman numeral, e.g. "iv" for 4."""
    uppercase = number2uppercase_roman_numeral(number)
    return uppercase.lower()


def number2uppercase_letter(number: int) -> str:
    """
    Convert a positive number to an uppercase letter label.

    The result counts like spreadsheet columns: 1 -> "A", 26 -> "Z",
    27 -> "AA", 28 -> "AB".

    NOTE(review): the module docstring quotes the specification as
    "AA to ZZ for the next 26" pages, which reads like repeated letters
    ("AA", "BB", ...) - confirm which scheme is intended.

    Args:
        number: The number to convert; has to be positive.

    Returns:
        The letter label.

    Raises:
        ValueError: If ``number`` is zero or negative.

    """
    if number <= 0:
        raise ValueError("Expecting a positive number")

    letters = []
    while number > 0:
        # There is no zero digit: a remainder of 0 stands for "Z" (26).
        position = number % 26 or 26
        letters.append(chr(ord("A") + position - 1))
        number = (number - position) // 26
    return "".join(reversed(letters))


def number2lowercase_letter(number: int) -> str:
    """Convert a positive number to a lowercase letter label, e.g. "ab" for 28."""
    uppercase = number2uppercase_letter(number)
    return uppercase.lower()
# The keys shall be sorted in numerical order, # analogously to the arrangement of keys in a name tree # as described in 7.9.6, "Name Trees." nums = cast(ArrayObject, dictionary_object["/Nums"]) i = 0 value = None start_index = 0 while i < len(nums): start_index = nums[i] value = nums[i + 1].get_object() if i + 2 == len(nums): break if nums[i + 2] > index: break i += 2 m: dict[Optional[str], Callable[[int], str]] = { None: lambda _: "", "/D": str, "/R": number2uppercase_roman_numeral, "/r": number2lowercase_roman_numeral, "/A": number2uppercase_letter, "/a": number2lowercase_letter, } # if /Nums array is not following the specification or if /Nums is empty if not isinstance(value, dict): return str(index + 1) # Fallback start = value.get("/St", 1) prefix = value.get("/P", "") mapping_function = m[value.get("/S")] return prefix + mapping_function(index - start_index + start) def index2label(reader: PdfCommonDocProtocol, index: int) -> str: """ See 7.9.7 "Number Trees". Args: reader: The PdfReader index: The index of the page Returns: The label of the page, e.g. "iv" or "4". """ root = cast(DictionaryObject, reader.root_object) if "/PageLabels" not in root: return str(index + 1) # Fallback number_tree = cast(DictionaryObject, root["/PageLabels"].get_object()) if "/Nums" in number_tree: return get_label_from_nums(number_tree, index) if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject): # number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]} # Limit maximum depth. level = 0 while level < 100: kids = cast(list[DictionaryObject], number_tree["/Kids"]) for kid in kids: # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]} limits = cast(list[int], kid["/Limits"]) if limits[0] <= index <= limits[1]: if not is_null_or_none(kid.get("/Kids", None)): # Recursive definition. level += 1 if level == 100: # pragma: no cover raise NotImplementedError( "Too deep nesting is not supported." 
) number_tree = kid # Exit the inner `for` loop and continue at the next level with the # next iteration of the `while` loop. break return get_label_from_nums(kid, index) else: # When there are no kids, make sure to exit the `while` loop directly # and continue with the fallback. break logger_warning(f"Could not reliably determine page label for {index}.", __name__) return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree def nums_insert( key: NumberObject, value: DictionaryObject, nums: ArrayObject, ) -> None: """ Insert a key, value pair in a Nums array. See 7.9.7 "Number Trees". Args: key: number key of the entry value: value of the entry nums: Nums array to modify """ if len(nums) % 2 != 0: raise ValueError("A nums like array must have an even number of elements") i = len(nums) while i != 0 and key <= nums[i - 2]: i = i - 2 if i < len(nums) and key == nums[i]: nums[i + 1] = value else: nums.insert(i, key) nums.insert(i + 1, value) def nums_clear_range( key: NumberObject, page_index_to: int, nums: ArrayObject, ) -> None: """ Remove all entries in a number tree in a range after an entry. See 7.9.7 "Number Trees". Args: key: number key of the entry before the range page_index_to: The page index of the upper limit of the range nums: Nums array to modify """ if len(nums) % 2 != 0: raise ValueError("A nums like array must have an even number of elements") if page_index_to < key: raise ValueError("page_index_to must be greater or equal than key") i = nums.index(key) + 2 while i < len(nums) and nums[i] <= page_index_to: nums.pop(i) nums.pop(i) def nums_next( key: NumberObject, nums: ArrayObject, ) -> tuple[Optional[NumberObject], Optional[DictionaryObject]]: """ Return the (key, value) pair of the entry after the given one. See 7.9.7 "Number Trees". 
Args: key: number key of the entry nums: Nums array """ if len(nums) % 2 != 0: raise ValueError("A nums like array must have an even number of elements") i = nums.index(key) + 2 if i < len(nums): return (nums[i], nums[i + 1]) return (None, None) ================================================ FILE: pypdf/_protocols.py ================================================ """Helpers for working with PDF types.""" from abc import abstractmethod from pathlib import Path from typing import IO, Any, Optional, Protocol, Union from ._utils import StrByteType, StreamType class PdfObjectProtocol(Protocol): indirect_reference: Any def clone( self, pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Union[tuple[str, ...], list[str], None] = (), ) -> Any: ... # pragma: no cover def _reference_clone(self, clone: Any, pdf_dest: Any) -> Any: ... # pragma: no cover def get_object(self) -> Optional["PdfObjectProtocol"]: ... # pragma: no cover def hash_value(self) -> bytes: ... # pragma: no cover def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: ... # pragma: no cover class XmpInformationProtocol(PdfObjectProtocol): pass class PdfCommonDocProtocol(Protocol): @property def pdf_header(self) -> str: ... # pragma: no cover @property def pages(self) -> list[Any]: ... # pragma: no cover @property def root_object(self) -> PdfObjectProtocol: ... # pragma: no cover def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]: ... # pragma: no cover @property def strict(self) -> bool: ... # pragma: no cover class PdfReaderProtocol(PdfCommonDocProtocol, Protocol): @property @abstractmethod def xref(self) -> dict[int, dict[int, Any]]: ... # pragma: no cover @property @abstractmethod def trailer(self) -> dict[str, Any]: ... 
# pragma: no cover class PdfWriterProtocol(PdfCommonDocProtocol, Protocol): _objects: list[Any] _id_translated: dict[int, dict[int, int]] incremental: bool _reader: Any # PdfReader @abstractmethod def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]: ... # pragma: no cover @abstractmethod def _add_object(self, obj: Any) -> Any: ... # pragma: no cover ================================================ FILE: pypdf/_reader.py ================================================ # Copyright (c) 2006, Mathieu Fenniak # Copyright (c) 2007, Ashish Kulkarni # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. import os import re import sys from collections.abc import Iterable from io import BytesIO, UnsupportedOperation from pathlib import Path from types import TracebackType from typing import ( TYPE_CHECKING, Any, Callable, Optional, Union, cast, ) if sys.version_info >= (3, 11): from typing import Self else: from typing_extensions import Self from ._doc_common import PdfDocCommon, convert_to_int from ._encryption import Encryption, PasswordType from ._utils import ( WHITESPACES_AS_BYTES, StrByteType, StreamType, logger_warning, read_non_whitespace, read_previous_line, read_until_whitespace, skip_over_comment, skip_over_whitespace, ) from .constants import TrailerKeys as TK from .errors import ( EmptyFileError, FileNotDecryptedError, LimitReachedError, PdfReadError, PdfStreamError, WrongPasswordError, ) from .generic import ( ArrayObject, ContentStream, DecodedStreamObject, DictionaryObject, EncodedStreamObject, IndirectObject, NameObject, NullObject, NumberObject, PdfObject, StreamObject, TextStringObject, is_null_or_none, read_object, ) from .xmp import XmpInformation if TYPE_CHECKING: from ._page import PageObject class PdfReader(PdfDocCommon): """ Initialize a PdfReader object. This operation can take some time, as the PDF stream's cross-reference tables are read into memory. Args: stream: A File object or an object that supports the standard read and seek methods similar to a File object. Could also be a string representing a path to a PDF file. 
strict: Determines whether user should be warned of all problems and also causes some correctable problems to be fatal. Defaults to ``False``. password: Decrypt PDF file at initialization. If the password is None, the file will not be decrypted. Defaults to ``None``. root_object_recovery_limit: The maximum number of objects to query for recovering the Root object in non-strict mode. To disable this security measure, pass ``None``. """ def __init__( self, stream: Union[StrByteType, Path], strict: bool = False, password: Union[None, str, bytes] = None, *, root_object_recovery_limit: Optional[int] = 10_000, ) -> None: self.strict = strict self.flattened_pages: Optional[list[PageObject]] = None #: Storage of parsed PDF objects. self.resolved_objects: dict[tuple[Any, Any], Optional[PdfObject]] = {} self._startxref: int = 0 self.xref_index = 0 self.xref: dict[int, dict[Any, Any]] = {} self.xref_free_entry: dict[int, dict[Any, Any]] = {} self.xref_objStm: dict[int, tuple[Any, Any]] = {} self.trailer = DictionaryObject() # Security parameters. self._root_object_recovery_limit = ( root_object_recovery_limit if isinstance(root_object_recovery_limit, int) else sys.maxsize ) # Map page indirect_reference number to page number self._page_id2num: Optional[dict[Any, Any]] = None self._validated_root: Optional[DictionaryObject] = None self._initialize_stream(stream) self._known_objects: set[tuple[int, int]] = set() self._override_encryption = False self._encryption: Optional[Encryption] = None if self.is_encrypted: self._handle_encryption(password) elif password is not None: raise PdfReadError("Not an encrypted file") def _initialize_stream(self, stream: Union[StrByteType, Path]) -> None: if hasattr(stream, "mode") and "b" not in stream.mode: logger_warning( "PdfReader stream/file object is not in binary mode. 
" "It may not be read correctly.", __name__, ) self._stream_opened = False if isinstance(stream, (str, Path)): with open(stream, "rb") as fh: stream = BytesIO(fh.read()) self._stream_opened = True self.read(stream) self.stream = stream def _handle_encryption(self, password: Optional[Union[str, bytes]]) -> None: self._override_encryption = True # Some documents may not have a /ID, use two empty # byte strings instead. Solves # https://github.com/py-pdf/pypdf/issues/608 id_entry = self.trailer.get(TK.ID) id1_entry = id_entry[0].get_object().original_bytes if id_entry else b"" encrypt_entry = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object()) self._encryption = Encryption.read(encrypt_entry, id1_entry) # try empty password if no password provided pwd = password if password is not None else b"" if ( self._encryption.verify(pwd) == PasswordType.NOT_DECRYPTED and password is not None ): # raise if password provided raise WrongPasswordError("Wrong password") self._override_encryption = False def __enter__(self) -> Self: return self def __exit__( self, exc_type: Optional[type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: self.close() def close(self) -> None: """Close the stream if opened in __init__ and clear memory.""" if self._stream_opened: self.stream.close() self.flattened_pages = [] self.resolved_objects = {} self.trailer = DictionaryObject() self.xref = {} self.xref_free_entry = {} self.xref_objStm = {} @property def root_object(self) -> DictionaryObject: """Provide access to "/Root". 
Standardized with PdfWriter.""" if self._validated_root: return self._validated_root root = self.trailer.get(TK.ROOT) if is_null_or_none(root): logger_warning('Cannot find "/Root" key in trailer', __name__) elif ( cast(DictionaryObject, cast(PdfObject, root).get_object()).get("/Type") == "/Catalog" ): self._validated_root = cast( DictionaryObject, cast(PdfObject, root).get_object() ) else: logger_warning("Invalid Root object in trailer", __name__) if self._validated_root is None: logger_warning('Searching object with "/Catalog" key', __name__) number_of_objects = cast(int, self.trailer.get("/Size", 0)) for i in range(number_of_objects): if i >= self._root_object_recovery_limit: raise LimitReachedError("Maximum Root object recovery limit reached.") try: obj = self.get_object(i + 1) except Exception: # to be sure to capture all errors obj = None if isinstance(obj, DictionaryObject) and obj.get("/Type") == "/Catalog": self._validated_root = obj logger_warning(f"Root found at {obj.indirect_reference!r}", __name__) break if self._validated_root is None: if not is_null_or_none(root) and "/Pages" in cast(DictionaryObject, cast(PdfObject, root).get_object()): logger_warning( f"Possible root found at {cast(PdfObject, root).indirect_reference!r}, but missing /Catalog key", __name__ ) self._validated_root = cast( DictionaryObject, cast(PdfObject, root).get_object() ) else: raise PdfReadError("Cannot find Root object in pdf") return self._validated_root @property def _info(self) -> Optional[DictionaryObject]: """ Provide access to "/Info". Standardized with PdfWriter. 
Returns: /Info Dictionary; None if the entry does not exist """ info = self.trailer.get(TK.INFO, None) if is_null_or_none(info): return None assert info is not None, "mypy" info = info.get_object() if not isinstance(info, DictionaryObject): raise PdfReadError( "Trailer not found or does not point to a document information dictionary" ) return info @property def _ID(self) -> Optional[ArrayObject]: """ Provide access to "/ID". Standardized with PdfWriter. Returns: /ID array; None if the entry does not exist """ id = self.trailer.get(TK.ID, None) if is_null_or_none(id): return None assert id is not None, "mypy" return cast(ArrayObject, id.get_object()) @property def pdf_header(self) -> str: """ The first 8 bytes of the file. This is typically something like ``'%PDF-1.6'`` and can be used to detect if the file is actually a PDF file and which version it is. """ # TODO: Make this return a bytes object for consistency # but that needs a deprecation loc = self.stream.tell() self.stream.seek(0, 0) pdf_file_version = self.stream.read(8).decode("utf-8", "backslashreplace") self.stream.seek(loc, 0) # return to where it was return pdf_file_version @property def xmp_metadata(self) -> Optional[XmpInformation]: """XMP (Extensible Metadata Platform) data.""" try: self._override_encryption = True return cast(XmpInformation, self.root_object.xmp_metadata) finally: self._override_encryption = False def _get_page_number_by_indirect( self, indirect_reference: Union[None, int, NullObject, IndirectObject] ) -> Optional[int]: """ Retrieve the page number from an indirect reference. Args: indirect_reference: The indirect reference to locate. Returns: Page number or None. 
""" if self._page_id2num is None: self._page_id2num = { x.indirect_reference.idnum: i for i, x in enumerate(self.pages) # type: ignore } if is_null_or_none(indirect_reference): return None assert isinstance(indirect_reference, (int, IndirectObject)), "mypy" if isinstance(indirect_reference, int): idnum = indirect_reference else: idnum = indirect_reference.idnum assert self._page_id2num is not None, "hint for mypy" return self._page_id2num.get(idnum, None) def _get_object_from_stream( self, indirect_reference: IndirectObject ) -> Union[int, PdfObject, str]: # indirect reference to object in object stream # read the entire object stream into memory stmnum, _idx = self.xref_objStm[indirect_reference.idnum] obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore # This is an xref to a stream, so its type better be a stream assert cast(str, obj_stm["/Type"]) == "/ObjStm" # Parse ALL objects in this stream in one pass and cache them. # This avoids O(N²) behavior when many objects from the same stream # are resolved individually (each call would re-parse the header). stream_data = BytesIO(obj_stm.get_data()) n = int(obj_stm["/N"]) # type: ignore[call-overload] first_offset = int(obj_stm["/First"]) # type: ignore[call-overload] # Phase 1: Read the index (objnum, offset) pairs from the header. obj_index: list[tuple[int, int]] = [] for _i in range(n): read_non_whitespace(stream_data) stream_data.seek(-1, 1) objnum = NumberObject.read_from_stream(stream_data) read_non_whitespace(stream_data) stream_data.seek(-1, 1) offset = NumberObject.read_from_stream(stream_data) read_non_whitespace(stream_data) stream_data.seek(-1, 1) obj_index.append((int(objnum), int(offset))) # Phase 2: Parse each object and cache it. target_obj: Union[int, PdfObject, str] = NullObject() found = False for i, (obj_num, obj_offset) in enumerate(obj_index): # Skip objects already in the cache. 
cached = self.cache_get_indirect_object(0, obj_num) if cached is not None: if obj_num == indirect_reference.idnum: target_obj = cached found = True continue stream_data.seek(first_offset + obj_offset, 0) # To cope with case where the 'pointer' is on a white space read_non_whitespace(stream_data) stream_data.seek(-1, 1) try: obj = read_object(stream_data, self) except PdfStreamError as exc: # Stream object cannot be read. Normally, a critical error, but # Adobe Reader doesn't complain, so continue (in strict mode?) logger_warning( f"Invalid stream (index {i}) within object " f"{obj_num} 0: {exc}", __name__, ) if self.strict: # pragma: no cover raise PdfReadError( f"Cannot read object stream: {exc}" ) # pragma: no cover obj = NullObject() # pragma: no cover # Only cache if this object is still registered in xref_objStm. # Incremental updates may override objects originally in the stream; # caching those stale versions would shadow the newer xref entry. if obj_num in self.xref_objStm: self.cache_indirect_object(0, obj_num, obj) # type: ignore[arg-type] if obj_num == indirect_reference.idnum: target_obj = obj found = True if not found and self.strict: # pragma: no cover raise PdfReadError( "This is a fatal error in strict mode." 
) # pragma: no cover return target_obj def get_object( self, indirect_reference: Union[int, IndirectObject] ) -> Optional[PdfObject]: if isinstance(indirect_reference, int): indirect_reference = IndirectObject(indirect_reference, 0, self) retval = self.cache_get_indirect_object( indirect_reference.generation, indirect_reference.idnum ) if retval is not None: return retval if ( indirect_reference.generation == 0 and indirect_reference.idnum in self.xref_objStm ): retval = self._get_object_from_stream(indirect_reference) # type: ignore elif ( indirect_reference.generation in self.xref and indirect_reference.idnum in self.xref[indirect_reference.generation] ): if self.xref_free_entry.get(indirect_reference.generation, {}).get( indirect_reference.idnum, False ): return NullObject() start = self.xref[indirect_reference.generation][indirect_reference.idnum] self.stream.seek(start, 0) try: idnum, generation = self.read_object_header(self.stream) if ( idnum != indirect_reference.idnum or generation != indirect_reference.generation ): raise PdfReadError("Not matching, we parse the file for it") except Exception: if hasattr(self.stream, "getbuffer"): buf = bytes(self.stream.getbuffer()) else: p = self.stream.tell() self.stream.seek(0, 0) buf = self.stream.read(-1) self.stream.seek(p, 0) m = re.search( rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), buf, ) if m is not None: logger_warning( f"Object ID {indirect_reference.idnum},{indirect_reference.generation} ref repaired", __name__, ) self.xref[indirect_reference.generation][ indirect_reference.idnum ] = (m.start(0) + 1) self.stream.seek(m.start(0) + 1) idnum, generation = self.read_object_header(self.stream) else: idnum = -1 generation = -1 # exception will be raised below if idnum != indirect_reference.idnum and self.xref_index: # xref table probably had bad indexes due to not being zero-indexed if self.strict: raise PdfReadError( f"Expected object ID ({indirect_reference.idnum} 
{indirect_reference.generation}) " f"does not match actual ({idnum} {generation}); " "xref table not zero-indexed." ) # xref table is corrected in non-strict mode elif idnum != indirect_reference.idnum and self.strict: # some other problem raise PdfReadError( f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) " f"does not match actual ({idnum} {generation})." ) if self.strict: assert generation == indirect_reference.generation current_object = (indirect_reference.idnum, indirect_reference.generation) if current_object in self._known_objects: raise PdfReadError(f"Detected loop with self reference for {indirect_reference!r}.") self._known_objects.add(current_object) retval = read_object(self.stream, self) # type: ignore self._known_objects.remove(current_object) # override encryption is used for the /Encrypt dictionary if not self._override_encryption and self._encryption is not None: # if we don't have the encryption key: if not self._encryption.is_decrypted(): raise FileNotDecryptedError("File has not been decrypted") # otherwise, decrypt here... 
retval = cast(PdfObject, retval) retval = self._encryption.decrypt_object( retval, indirect_reference.idnum, indirect_reference.generation ) else: if hasattr(self.stream, "getbuffer"): buf = bytes(self.stream.getbuffer()) else: p = self.stream.tell() self.stream.seek(0, 0) buf = self.stream.read(-1) self.stream.seek(p, 0) m = re.search( rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), buf, ) if m is not None: logger_warning( f"Object {indirect_reference.idnum} {indirect_reference.generation} found", __name__, ) if indirect_reference.generation not in self.xref: self.xref[indirect_reference.generation] = {} self.xref[indirect_reference.generation][indirect_reference.idnum] = ( m.start(0) + 1 ) self.stream.seek(m.end(0) + 1) skip_over_whitespace(self.stream) self.stream.seek(-1, 1) retval = read_object(self.stream, self) # type: ignore # override encryption is used for the /Encrypt dictionary if not self._override_encryption and self._encryption is not None: # if we don't have the encryption key: if not self._encryption.is_decrypted(): raise FileNotDecryptedError("File has not been decrypted") # otherwise, decrypt here... retval = cast(PdfObject, retval) retval = self._encryption.decrypt_object( retval, indirect_reference.idnum, indirect_reference.generation ) else: logger_warning( f"Object {indirect_reference.idnum} {indirect_reference.generation} not defined.", __name__, ) if self.strict: raise PdfReadError("Could not find object.") # For ObjStm objects, _get_object_from_stream already cached # the result during batch parsing; skip the redundant cache write # to avoid "Overwriting cache" warnings. For non-ObjStm objects # (including encrypted ones that need decrypted values cached), # always write. 
if not ( indirect_reference.generation == 0 and indirect_reference.idnum in self.xref_objStm ): self.cache_indirect_object( indirect_reference.generation, indirect_reference.idnum, retval ) return retval def read_object_header(self, stream: StreamType) -> tuple[int, int]: # Should never be necessary to read out whitespace, since the # cross-reference table should put us in the right spot to read the # object header. In reality some files have stupid cross-reference # tables that are off by whitespace bytes. skip_over_comment(stream) extra = skip_over_whitespace(stream) stream.seek(-1, 1) idnum = read_until_whitespace(stream) extra |= skip_over_whitespace(stream) stream.seek(-1, 1) generation = read_until_whitespace(stream) extra |= skip_over_whitespace(stream) stream.seek(-1, 1) # although it's not used, it might still be necessary to read _obj = stream.read(3) read_non_whitespace(stream) stream.seek(-1, 1) if extra and self.strict: logger_warning( f"Superfluous whitespace found in object header {idnum} {generation}", # type: ignore __name__, ) return int(idnum), int(generation) def cache_get_indirect_object( self, generation: int, idnum: int ) -> Optional[PdfObject]: try: return self.resolved_objects.get((generation, idnum)) except RecursionError: raise PdfReadError("Maximum recursion depth reached.") def cache_indirect_object( self, generation: int, idnum: int, obj: Optional[PdfObject] ) -> Optional[PdfObject]: if (generation, idnum) in self.resolved_objects: msg = f"Overwriting cache for {generation} {idnum}" if self.strict: raise PdfReadError(msg) logger_warning(msg, __name__) self.resolved_objects[(generation, idnum)] = obj if obj is not None: obj.indirect_reference = IndirectObject(idnum, generation, self) return obj def _replace_object(self, indirect_reference: IndirectObject, obj: PdfObject) -> PdfObject: # function reserved for future development if indirect_reference.pdf != self: raise ValueError("Cannot update PdfReader with external object") if 
(indirect_reference.generation, indirect_reference.idnum) not in self.resolved_objects: raise ValueError("Cannot find referenced object") self.resolved_objects[(indirect_reference.generation, indirect_reference.idnum)] = obj obj.indirect_reference = indirect_reference return obj def read(self, stream: StreamType) -> None: """ Read and process the PDF stream, extracting necessary data. Args: stream: The PDF file stream. """ self._basic_validation(stream) self._find_eof_marker(stream) startxref = self._find_startxref_pos(stream) self._startxref = startxref # check and eventually correct the startxref only if not strict xref_issue_nr = self._get_xref_issues(stream, startxref) if xref_issue_nr != 0: if self.strict and xref_issue_nr: raise PdfReadError("Broken xref table") logger_warning(f"incorrect startxref pointer({xref_issue_nr})", __name__) # read all cross-reference tables and their trailers self._read_xref_tables_and_trailers(stream, startxref, xref_issue_nr) # if not zero-indexed, verify that the table is correct; change it if necessary if self.xref_index and not self.strict: loc = stream.tell() for gen, xref_entry in self.xref.items(): if gen == 65535: continue xref_k = sorted( xref_entry.keys() ) # ensure ascending to prevent damage for id in xref_k: stream.seek(xref_entry[id], 0) try: pid, _pgen = self.read_object_header(stream) except ValueError: self._rebuild_xref_table(stream) break if pid == id - self.xref_index: # fixing index item per item is required for revised PDF. 
self.xref[gen][pid] = self.xref[gen][id] del self.xref[gen][id] # if not, then either it's just plain wrong, or the # non-zero-index is actually correct stream.seek(loc, 0) # return to where it was # remove wrong objects (not pointing to correct structures) - cf #2326 if not self.strict: loc = stream.tell() for gen, xref_entry in self.xref.items(): if gen == 65535: continue ids = list(xref_entry.keys()) for id in ids: stream.seek(xref_entry[id], 0) try: self.read_object_header(stream) except ValueError: logger_warning( f"Ignoring wrong pointing object {id} {gen} (offset {xref_entry[id]})", __name__, ) del xref_entry[id] # we can delete the id, we are parsing ids stream.seek(loc, 0) # return to where it was def _basic_validation(self, stream: StreamType) -> None: """Ensure the stream is valid and not empty.""" stream.seek(0, os.SEEK_SET) try: header_byte = stream.read(5) except UnicodeDecodeError: raise UnsupportedOperation("cannot read header") if header_byte == b"": raise EmptyFileError("Cannot read an empty file") if header_byte != b"%PDF-": if self.strict: raise PdfReadError( f"PDF starts with '{header_byte.decode('utf8')}', " "but '%PDF-' expected" ) logger_warning(f"invalid pdf header: {header_byte}", __name__) stream.seek(0, os.SEEK_END) def _find_eof_marker(self, stream: StreamType) -> None: """ Jump to the %%EOF marker. According to the specs, the %%EOF marker should be at the very end of the file. Hence for standard-compliant PDF documents this function will read only the last part (DEFAULT_BUFFER_SIZE). """ HEADER_SIZE = 8 # to parse whole file, Header is e.g. '%PDF-1.6' line = b"" first = True while not line.startswith(b"%%EOF"): if line != b"" and first: if any( line.strip().endswith(tr) for tr in (b"%%EO", b"%%E", b"%%", b"%") ): # Consider the file as truncated while # having enough confidence to carry on. 
logger_warning("EOF marker seems truncated", __name__) break first = False if b"startxref" in line: logger_warning( "CAUTION: startxref found while searching for %%EOF. " "The file might be truncated and some data might not be read.", __name__, ) if stream.tell() < HEADER_SIZE: if self.strict: raise PdfReadError("EOF marker not found") logger_warning("EOF marker not found", __name__) line = read_previous_line(stream) def _find_startxref_pos(self, stream: StreamType) -> int: """ Find startxref entry - the location of the xref table. Args: stream: Returns: The bytes offset """ line = read_previous_line(stream) try: startxref = int(line) except ValueError: # 'startxref' may be on the same line as the location if not line.startswith(b"startxref"): raise PdfReadError("startxref not found") startxref = int(line[9:].strip()) logger_warning("startxref on same line as offset", __name__) else: line = read_previous_line(stream) if not line.startswith(b"startxref"): raise PdfReadError("startxref not found") return startxref def _read_standard_xref_table(self, stream: StreamType) -> None: # standard cross-reference table ref = stream.read(3) if ref != b"ref": raise PdfReadError("xref table read error") read_non_whitespace(stream) stream.seek(-1, 1) first_time = True # check if the first time looking at the xref table while True: num = cast(int, read_object(stream, self)) if first_time and num != 0: self.xref_index = num if self.strict: logger_warning( "Xref table not zero-indexed. ID numbers for objects will be corrected.", __name__, ) # if table not zero indexed, could be due to error from when PDF was created # which will lead to mismatched indices later on, only warned and corrected if self.strict==True first_time = False read_non_whitespace(stream) stream.seek(-1, 1) size = cast(int, read_object(stream, self)) if not isinstance(size, int): logger_warning( "Invalid/Truncated xref table. 
Rebuilding it.", __name__, ) self._rebuild_xref_table(stream) stream.read() return read_non_whitespace(stream) stream.seek(-1, 1) cnt = 0 while cnt < size: line = stream.read(20) if not line: raise PdfReadError("Unexpected empty line in Xref table.") # It's very clear in section 3.4.3 of the PDF spec # that all cross-reference table lines are a fixed # 20 bytes (as of PDF 1.7). However, some files have # 21-byte entries (or more) due to the use of \r\n # (CRLF) EOL's. Detect that case, and adjust the line # until it does not begin with a \r (CR) or \n (LF). while line[0] in b"\x0D\x0A": stream.seek(-20 + 1, 1) line = stream.read(20) # On the other hand, some malformed PDF files # use a single character EOL without a preceding # space. Detect that case, and seek the stream # back one character (0-9 means we've bled into # the next xref entry, t means we've bled into the # text "trailer"): if line[-1] in b"0123456789t": stream.seek(-1, 1) try: offset_b, generation_b = line[:16].split(b" ") entry_type_b = line[17:18] offset, generation = int(offset_b), int(generation_b) except Exception: if hasattr(stream, "getbuffer"): buf = bytes(stream.getbuffer()) else: p = stream.tell() stream.seek(0, 0) buf = stream.read(-1) stream.seek(p) f = re.search(rf"{num}\s+(\d+)\s+obj".encode(), buf) if f is None: logger_warning( f"entry {num} in Xref table invalid; object not found", __name__, ) generation = 65535 offset = -1 entry_type_b = b"f" else: logger_warning( f"entry {num} in Xref table invalid but object found", __name__, ) generation = int(f.group(1)) offset = f.start() if generation not in self.xref: self.xref[generation] = {} self.xref_free_entry[generation] = {} if num in self.xref[generation]: # It really seems like we should allow the last # xref table in the file to override previous # ones. Since we read the file backwards, assume # any existing key is already set correctly. 
def _read_xref_tables_and_trailers(
    self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int
) -> None:
    """
    Read the cross-reference tables and trailers in the PDF stream.

    ``self.xref``, ``self.xref_free_entry``, ``self.xref_objStm`` and
    ``self.trailer`` are reset, then xref sections are read one after the
    other, starting at ``startxref``, until no further offset is available.

    Args:
        stream: The PDF stream to read from.
        startxref: Offset of the first xref section; ``None`` reads nothing.
        xref_issue_nr: Non-zero if an issue was detected for ``startxref``.
            In that case a section that does not start with ``x`` triggers
            a rebuild of the xref table instead of being parsed.

    Raises:
        PdfReadError: If an xref stream cannot be read and no ``/Root`` has
            been collected in the trailer so far.

    """
    self.xref = {}
    self.xref_free_entry = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    visited_xref_offsets: set[int] = set()
    while startxref is not None:
        # Detect circular /Prev references in the xref chain
        if startxref in visited_xref_offsets:
            logger_warning(
                f"Circular xref chain detected at offset {startxref}, stopping",
                __name__,
            )
            break
        visited_xref_offsets.add(startxref)
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x in b"\r\n":
            x = stream.read(1)
        if x == b"x":
            startxref = self._read_xref(stream)
        elif xref_issue_nr:
            try:
                self._rebuild_xref_table(stream)
            except Exception:
                # Rebuilding failed: retry the same offset with the regular
                # parsers. The offset has to be forgotten first, otherwise
                # the cycle check above would take the retry for a circular
                # chain and stop before anything was parsed.
                xref_issue_nr = 0
                visited_xref_offsets.discard(startxref)
            else:
                break
        elif x.isdigit():
            try:
                xrefstream = self._read_pdf15_xref_stream(stream)
            except Exception as e:
                if TK.ROOT in self.trailer:
                    # A usable trailer is already known; give up on the
                    # older section only.
                    logger_warning(
                        f"Previous trailer cannot be read: {e.args}", __name__
                    )
                    break
                raise PdfReadError(f"Trailer cannot be read: {e!s}") from e
            self._process_xref_stream(xrefstream)
            if "/Prev" in xrefstream:
                startxref = cast(int, xrefstream["/Prev"])
            else:
                break
        else:
            startxref = self._read_xref_other_error(stream, startxref)
def _read_xref(self, stream: StreamType) -> Optional[int]:
    """
    Read a standard xref table and the trailer object that follows it.

    Trailer entries whose key is not yet present in ``self.trailer`` are
    copied into it. If the trailer has an ``/XRefStm`` entry, the xref
    stream at that offset is read as well; a failure there is only logged.

    Args:
        stream: The stream positioned for ``_read_standard_xref_table``.

    Returns:
        The ``/Prev`` value of the trailer. ``None`` if the trailer has no
        ``/Prev`` entry or if the stream ends right after the table.

    """
    self._read_standard_xref_table(stream)
    reached_end = stream.read(1) == b""
    if reached_end:
        return None
    stream.seek(-1, 1)
    read_non_whitespace(stream)
    stream.seek(-1, 1)
    trailer = cast(dict[str, Any], read_object(stream, self))
    for name, entry in trailer.items():
        if name in self.trailer:
            # Entries that are already known are kept as they are.
            continue
        self.trailer[name] = entry
    if "/XRefStm" in trailer:
        resume_at = stream.tell()
        # One byte past the offset: _read_pdf15_xref_stream steps back one
        # byte before it starts reading.
        stream.seek(cast(int, trailer["/XRefStm"]) + 1, 0)
        try:
            self._read_pdf15_xref_stream(stream)
        except Exception:
            logger_warning(
                f"XRef object at {trailer['/XRefStm']} can not be read, some object may be missing",
                __name__,
            )
        stream.seek(resume_at, 0)
    return trailer["/Prev"] if "/Prev" in trailer else None
stream.seek(startxref, 0) for look in range(25): # value extended to cope with more linearized files if stream.read(1).isdigit(): # This is not a standard PDF, consider adding a warning startxref += look return startxref # no xref table found at specified location if "/Root" in self.trailer and not self.strict: # if Root has been already found, just raise warning logger_warning("Invalid parent xref., rebuild xref", __name__) try: self._rebuild_xref_table(stream) return None except Exception: raise PdfReadError("Cannot rebuild xref") raise PdfReadError("Could not find xref table at specified location") def _read_pdf15_xref_stream( self, stream: StreamType ) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]: """Read the cross-reference stream for PDF 1.5+.""" stream.seek(-1, 1) idnum, generation = self.read_object_header(stream) xrefstream = cast(ContentStream, read_object(stream, self)) if cast(str, xrefstream["/Type"]) != "/XRef": raise PdfReadError(f"Unexpected type {xrefstream['/Type']!r}") self.cache_indirect_object(generation, idnum, xrefstream) # Index pairs specify the subsections in the dictionary. # If none, create one subsection that spans everything. if "/Size" not in xrefstream: # According to table 17 of the PDF 2.0 specification, this key is required. raise PdfReadError(f"Size missing from XRef stream {xrefstream!r}!") idx_pairs = xrefstream.get("/Index", [0, xrefstream["/Size"]]) entry_sizes = cast(dict[Any, Any], xrefstream.get("/W")) assert len(entry_sizes) >= 3 if self.strict and len(entry_sizes) > 3: raise PdfReadError(f"Too many entry sizes: {entry_sizes}") stream_data = BytesIO(xrefstream.get_data()) def get_entry(i: int) -> Union[int, tuple[int, ...]]: # Reads the correct number of bytes for each entry. See the # discussion of the W parameter in PDF spec table 17. 
@staticmethod
def _get_xref_issues(stream: StreamType, startxref: int) -> int:
    """
    Return an int which indicates an issue with ``startxref``.

    Args:
        stream: The stream to inspect; its position is changed.
        startxref: The offset at which an xref section is expected.

    Returns:
        0 if no issue was found, otherwise one of:

        * 1: the byte checked in front of the offset is not one of
          ``\\r``, ``\\n``, space or tab.
        * 2: the stream ended while skipping digits, spaces and tabs.
        * 3: the token found after the digits and blanks is not ``obj``
          (compared case-insensitively).
        * 4: ``startxref`` is 0.

    """
    if startxref == 0:
        return 4
    # Start one byte early to look at the character in front of the offset.
    stream.seek(startxref - 1, 0)
    preceding = stream.read(1)
    if preceding == b"j":
        # A "j" is tolerated here; the following byte is checked instead.
        preceding = stream.read(1)
    if preceding not in b"\r\n \t":
        return 1
    if stream.read(4) == b"xref":
        return 0
    # Not an xref table, so check whether it is an XREF object.
    token = b""
    while token in b"0123456789 \t":
        token = stream.read(1)
        if token == b"":
            return 2
    token += stream.read(2)  # 1 char already read, +2 to check "obj"
    return 0 if token.lower() == b"obj" else 3
@classmethod
def _find_pdf_trailers(cls, data: bytes) -> Iterable[int]:
    """
    Yield the offsets of trailer dictionaries found in ``data``.

    Every occurrence of ``trailer`` is considered. It only counts if,
    after optional whitespace, it is directly followed by ``<<``.

    Args:
        data: The raw bytes to search.

    Returns:
        The offsets of the ``<<`` following each matching ``trailer``.

    """
    keyword = b"trailer"
    keyword_length = len(keyword)
    total = len(data)
    search_from = 0
    while (found := data.find(keyword, search_from)) != -1:
        cursor = found + keyword_length
        # Step over whitespace between the keyword and what follows it.
        while cursor < total and data[cursor] in WHITESPACES_AS_BYTES:
            cursor += 1
        # Must be dictionary start
        if data.startswith(b"<<", cursor):
            yield cursor  # offset of '<<'
        search_from = found + keyword_length
skip_over_whitespace(object_stream) object_stream.seek(-1, 1) current = read_until_whitespace(object_stream) if not current.isdigit(): # pragma: no cover break # pragma: no cover inner_generation_number = int(current) self.xref_objStm[inner_object_number] = (object_number, inner_generation_number) actual_count += 1 if actual_count != obj.get("/N"): # pragma: no cover logger_warning( # pragma: no cover f"found {actual_count} objects within Object({object_number},{generation_number})" f" whereas {obj.get('/N')} expected", __name__, ) except Exception: # could be multiple causes pass stream.seek(0, 0) for position in self._find_pdf_trailers(stream_data): stream.seek(position, 0) new_trailer = cast(dict[Any, Any], read_object(stream, self)) # Here, we are parsing the file from start to end, the new data have to erase the existing. for key, value in new_trailer.items(): self.trailer[key] = value def _read_xref_subsections( self, idx_pairs: list[int], get_entry: Callable[[int], Union[int, tuple[int, ...]]], used_before: Callable[[int, Union[int, tuple[int, ...]]], bool], ) -> None: """Read and process the subsections of the xref.""" for start, size in self._pairs(idx_pairs): # The subsections must increase for num in range(start, start + size): # The first entry is the type xref_type = get_entry(0) # The rest of the elements depend on the xref_type if xref_type == 0: # linked list of free objects next_free_object = get_entry(1) # noqa: F841 next_generation = get_entry(2) # noqa: F841 elif xref_type == 1: # objects that are in use but are not compressed byte_offset = get_entry(1) generation = get_entry(2) if generation not in self.xref: self.xref[generation] = {} # type: ignore if not used_before(num, generation): self.xref[generation][num] = byte_offset # type: ignore elif xref_type == 2: # compressed objects objstr_num = get_entry(1) obstr_idx = get_entry(2) generation = 0 # PDF spec table 18, generation is 0 if not used_before(num, generation): self.xref_objStm[num] = 
def _pairs(self, array: list[int]) -> Iterable[tuple[int, int]]:
    """
    Iterate over consecutive, non-overlapping pairs in the array.

    A last element without a partner is not yielded.
    """
    for first in range(0, len(array) - 1, 2):
        yield array[first], array[first + 1]

def decrypt(self, password: Union[str, bytes]) -> PasswordType:
    """
    Decrypt a file secured with the PDF Standard encryption handler.

    The password is checked against the user password and the owner
    password of the document. If either of them matches, the resulting
    decryption key is stored. Which of the two matched is irrelevant for
    using the document with this library, as both provide the correct
    decryption key.

    Args:
        password: The password to match.

    Returns:
        An indicator if the document was decrypted and whether it was the
        owner password or the user password.

    Raises:
        PdfReadError: If the file is not encrypted.

    """
    encryption = self._encryption
    if encryption:
        # TODO: raise Exception for wrong password
        return encryption.verify(password)
    raise PdfReadError("Not encrypted file")

@property
def is_encrypted(self) -> bool:
    """
    Read-only boolean property telling whether this PDF file is encrypted.

    The value is derived from the trailer only. Therefore, a true value
    stays true even after :meth:`decrypt()` has been called.
    """
    has_encrypt_entry = TK.ENCRYPT in self.trailer
    return has_encrypt_entry
def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
    """
    Rename the top level form field that groups all form fields below it.

    The first entry of the AcroForm ``/Fields`` array is used as the top
    level field and its ``/T`` entry is replaced.

    Args:
        name: text string of the "/T" field of the modified object

    Returns:
        The modified object. ``None`` means no object was modified, which
        is the case if there is no ``/AcroForm`` dictionary, no ``/Fields``
        entry, or if ``/Fields`` is empty.

    """
    catalog = self.root_object
    if "/AcroForm" not in catalog or not isinstance(
        catalog["/AcroForm"], DictionaryObject
    ):
        return None
    acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")])
    if "/Fields" not in acroform:
        return None
    fields = cast(ArrayObject, acroform[NameObject("/Fields")])
    if not fields:
        # Without any field there is nothing to rename; indexing the empty
        # array would raise an IndexError.
        return None
    interim = cast(DictionaryObject, fields[0].get_object())
    interim[NameObject("/T")] = TextStringObject(name)
    return interim
seealso:: https://ipython.readthedocs.io/en/stable/config/integrating.html """ self.stream.seek(0) pdf_data = self.stream.read() data = { "application/pdf": pdf_data, } if include is not None: # Filter representations based on include list data = {k: v for k, v in data.items() if k in include} if exclude is not None: # Remove representations based on exclude list data = {k: v for k, v in data.items() if k not in exclude} return data ================================================ FILE: pypdf/_text_extraction/__init__.py ================================================ """ Code related to text extraction. Some parts are still in _page.py. In doubt, they will stay there. """ import math from typing import Any, Callable, Optional, Union from .._font import Font from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding CUSTOM_RTL_MIN: int = -1 CUSTOM_RTL_MAX: int = -1 CUSTOM_RTL_SPECIAL_CHARS: list[int] = [] LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5 class OrientationNotFoundError(Exception): pass def set_custom_rtl( _min: Union[str, int, None] = None, _max: Union[str, int, None] = None, specials: Union[str, list[int], None] = None, ) -> tuple[int, int, list[int]]: """ Change the Right-To-Left and special characters custom parameters. Args: _min: The new minimum value for the range of custom characters that will be written right to left. If set to ``None``, the value will not be changed. If set to an integer or string, it will be converted to its ASCII code. The default value is -1, which sets no additional range to be converted. _max: The new maximum value for the range of custom characters that will be written right to left. If set to ``None``, the value will not be changed. If set to an integer or string, it will be converted to its ASCII code. The default value is -1, which sets no additional range to be converted. specials: The new list of special characters to be inserted in the current insertion order. 
def mult(m: list[float], n: list[float]) -> list[float]:
    """
    Multiply two matrices given as six-element lists.

    Only the indices 0 to 5 of both arguments are read. The last two
    elements of the result additionally include ``n[4]`` and ``n[5]``.

    Args:
        m: The left operand.
        n: The right operand.

    Returns:
        A new six-element list with the product.

    """
    m0, m1, m2, m3, m4, m5 = m[0], m[1], m[2], m[3], m[4], m[5]
    n0, n1, n2, n3, n4, n5 = n[0], n[1], n[2], n[3], n[4], n[5]
    first_row = [m0 * n0 + m1 * n2, m0 * n1 + m1 * n3]
    second_row = [m2 * n0 + m3 * n2, m2 * n1 + m3 * n3]
    last_row = [m4 * n0 + m5 * n2 + n4, m4 * n1 + m5 * n3 + n5]
    return first_row + second_row + last_row


def orient(m: list[float]) -> int:
    """
    Derive an orientation in degrees from a matrix.

    Args:
        m: The matrix as a list; only ``m[3]`` and ``m[1]`` are read.

    Returns:
        0 if ``m[3]`` is greater than 1e-6, 180 if it is less than -1e-6.
        Otherwise 90 if ``m[1]`` is positive, else 270.

    """
    threshold = 1e-6
    d_component = m[3]
    if d_component > threshold:
        return 0
    if d_component < -threshold:
        return 180
    return 90 if m[1] > 0 else 270
def get_text_operands(
    operands: list[Union[str, TextStringObject]],
    cm_matrix: list[float],
    tm_matrix: list[float],
    font: Font,
    orientations: tuple[int, ...]
) -> tuple[str, bool]:
    """
    Get the text of the first operand of a text showing operation.

    Args:
        operands: The operands of the operation; only the first is used.
        cm_matrix: The current transformation matrix.
        tm_matrix: The current text matrix.
        font: The font whose ``encoding`` is used to decode non-str
            operands. The encoding is either a codec name or a mapping
            from byte values to text.
        orientations: The orientations for which text is returned.

    Returns:
        A tuple of the text and a flag telling whether the operand already
        was a ``str``. The text is empty and the flag is ``False`` if there
        are no operands or if the orientation derived from the matrices is
        not in ``orientations``.

    """
    t: str = ""
    is_str_operands = False
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation in orientations and len(operands) > 0:
        if isinstance(operands[0], str):
            t = operands[0]
            is_str_operands = True
        else:
            t = ""
            # NOTE(review): operands[0] is never a str in this branch, so
            # the encode_pdfdocencoding() alternative is not taken here.
            tt: bytes = (
                encode_pdfdocencoding(operands[0])
                if isinstance(operands[0], str)
                else operands[0]
            )
            if isinstance(font.encoding, str):
                try:
                    t = tt.decode(font.encoding, "surrogatepass")  # apply str encoding
                except Exception:
                    # the data does not match the expectation,
                    # we use the alternative ;
                    # text extraction may not be good
                    t = tt.decode(
                        "utf-16-be" if font.encoding == "charmap" else "charmap",
                        "surrogatepass",
                    )  # apply str encoding
            else:  # apply dict encoding
                # chr() equals the previous single byte decoding for values
                # below 0x80, but does not raise a UnicodeDecodeError for
                # larger byte values which are missing from the encoding.
                t = "".join(
                    [font.encoding[x] if x in font.encoding else chr(x) for x in tt]
                )
    return (t, is_str_operands)
font_size: float, rtl_dir: bool, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] ) -> tuple[str, bool, float]: # "\u0590 - \u08FF \uFB50 - \uFDFF" widths: float = 0.0 for x in [font.character_map.get(x, x) for x in text_operands]: # x can be a sequence of bytes ; ex: habibi.pdf if len(x) == 1: xx = ord(x) else: xx = 1 # fmt: off if ( # cases where the current inserting order is kept (xx <= 0x2F) # punctuations but... or 0x3A <= xx <= 0x40 # numbers (x30-39) or 0x2000 <= xx <= 0x206F # upper punctuations.. or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... ): text = x + text if rtl_dir else text + x elif ( # right-to-left characters set 0x0590 <= xx <= 0x08FF or 0xFB1D <= xx <= 0xFDFF or 0xFE70 <= xx <= 0xFEFF or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX ): if not rtl_dir: rtl_dir = True if visitor_text is not None: visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size) text = "" text = x + text else: # left-to-right if rtl_dir: rtl_dir = False if visitor_text is not None: visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size) text = "" text = text + x widths += font.space_width if x == " " else font.text_width(x) # fmt: on return text, rtl_dir, widths ================================================ FILE: pypdf/_text_extraction/_layout_mode/__init__.py ================================================ """Layout mode text extraction extension for pypdf""" from ..._font import Font from ._fixed_width_page import ( fixed_char_width, fixed_width_page, text_show_operations, y_coordinate_groups, ) __all__ = [ "Font", "fixed_char_width", "fixed_width_page", "text_show_operations", "y_coordinate_groups", ] ================================================ FILE: pypdf/_text_extraction/_layout_mode/_fixed_width_page.py ================================================ """Extract PDF text preserving the layout of the source PDF""" from collections.abc import Iterator from itertools 
class BTGroup(TypedDict):
    """
    One line of text rendered within a BT/ET operator pair.

    Text from several text show operations that render on the same line is
    combined into a single BTGroup dict.

    Keys:
        tx: x coordinate of first character in BTGroup
        ty: y coordinate of first character in BTGroup
        font_size: nominal font size
        font_height: effective font height
        text: rendered text
        displaced_tx: x coordinate of last character in BTGroup
        flip_sort: -1 if page is upside down, else 1

    """

    tx: float
    ty: float
    font_size: float
    font_height: float
    text: str
    displaced_tx: float
    flip_sort: Literal[-1, 1]


def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup:
    """
    Build a BTGroup from a TextStateParams instance, the rendered text and
    the displaced tx value.

    Args:
        tj_op (TextStateParams): supplies tx, ty, font_size, font_height
            and flip_vertical
        rendered_text (str): rendered text
        dispaced_tx (float): x coordinate of last character in BTGroup

    """
    flip_sort: Literal[-1, 1] = 1
    if tj_op.flip_vertical:
        flip_sort = -1
    group: BTGroup = {
        "tx": tj_op.tx,
        "ty": tj_op.ty,
        "font_size": tj_op.font_size,
        "font_height": tj_op.font_height,
        "text": rendered_text,
        "displaced_tx": dispaced_tx,
        "flip_sort": flip_sort,
    }
    return group
Args: ops: iterator of operators in content stream text_state_mgr: a TextStateManager instance end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op) fonts: font dictionary as returned by PageObject._layout_mode_fonts() Returns: tuple: list of BTGroup dicts + list of TextStateParams dataclass instances. """ # 1 entry per line of text rendered within each BT/ET operation. bt_groups: list[BTGroup] = [] # 1 entry per text show operator (Tj/TJ/'/") tj_ops: list[TextStateParams] = [] if end_target == b"Q": # add new q level. cm's added at this level will be popped at next b'Q' text_state_mgr.add_q() for operands, op in ops: # The loop is broken by the end target, or exits normally when there are no more ops. if op == end_target: if op == b"Q": text_state_mgr.remove_q() if op == b"ET": if not tj_ops: return bt_groups, tj_ops _text = "" bt_idx = 0 # idx of first tj in this bt group last_displaced_tx = tj_ops[bt_idx].displaced_tx last_ty = tj_ops[bt_idx].ty for _idx, _tj in enumerate( tj_ops ): # ... build text from new Tj operators if strip_rotated and _tj.rotated: continue if not _tj.font.interpretable: # generates warning continue # if the y position of the text is greater than the font height, assume # the text is on a new line and start a new group if abs(_tj.ty - last_ty) > _tj.font_height: if _text.strip(): bt_groups.append( bt_group(tj_ops[bt_idx], _text, last_displaced_tx) ) bt_idx = _idx _text = "" # if the x position of the text is less than the last x position by # more than 5 spaces widths, assume the text order should be flipped # and start a new group if ( last_displaced_tx - _tj.tx > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS ): if _text.strip(): bt_groups.append( bt_group(tj_ops[bt_idx], _text, last_displaced_tx) ) bt_idx = _idx last_displaced_tx = _tj.displaced_tx _text = "" # calculate excess x translation based on ending tx of previous Tj. 
# multiply by bool (_idx != bt_idx) to ensure spaces aren't double # applied to the first tj of a BTGroup in fixed_width_page(). excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx) # space_tx could be 0 if either Tz or font_size was 0 for this _tj. spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0 new_text = f'{" " * spaces}{_tj.txt}' last_ty = _tj.ty _text = f"{_text}{new_text}" last_displaced_tx = _tj.displaced_tx if _text: bt_groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx)) text_state_mgr.reset_tm() break if op == b"q": bts, tjs = recurs_to_target_op( ops, text_state_mgr, b"Q", fonts, strip_rotated ) bt_groups.extend(bts) tj_ops.extend(tjs) elif op == b"cm": text_state_mgr.add_cm(*operands) elif op == b"BT": bts, tjs = recurs_to_target_op( ops, text_state_mgr, b"ET", fonts, strip_rotated ) bt_groups.extend(bts) tj_ops.extend(tjs) elif op == b"Tj": tj_ops.append(text_state_mgr.text_state_params(operands[0])) elif op == b"TJ": _tj = text_state_mgr.text_state_params() for tj_op in operands[0]: if isinstance(tj_op, bytes): _tj = text_state_mgr.text_state_params(tj_op) tj_ops.append(_tj) else: text_state_mgr.add_trm(_tj.displacement_matrix(td_offset=tj_op)) elif op == b"'": text_state_mgr.reset_trm() text_state_mgr.add_tm([0, -text_state_mgr.TL]) tj_ops.append(text_state_mgr.text_state_params(operands[0])) elif op == b'"': text_state_mgr.reset_trm() text_state_mgr.set_state_param(b"Tw", operands[0]) text_state_mgr.set_state_param(b"Tc", operands[1]) text_state_mgr.add_tm([0, -text_state_mgr.TL]) tj_ops.append(text_state_mgr.text_state_params(operands[2])) elif op in (b"Td", b"Tm", b"TD", b"T*"): text_state_mgr.reset_trm() if op == b"Tm": text_state_mgr.reset_tm() elif op == b"TD": text_state_mgr.set_state_param(b"TL", -operands[1]) elif op == b"T*": operands = [0, -text_state_mgr.TL] text_state_mgr.add_tm(operands) elif op == b"Tf": text_state_mgr.set_font(fonts[operands[0]], operands[1]) else: # handle Tc, Tw, Tz, TL, 
def y_coordinate_groups(
    bt_groups: list[BTGroup], debug_path: Optional[Path] = None
) -> dict[int, list[BTGroup]]:
    """
    Group text operations by rendered y coordinate, i.e. the line number.

    The key of a group is the integer part of ``ty * flip_sort``. Groups
    whose keys differ by less than the effective font height are combined,
    unless both render text at the same integer x position.

    Args:
        bt_groups: list of dicts as returned by text_show_operations()
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT
        operator keyed by y coordinate. Empty if ``bt_groups`` is empty.

    """
    # Runs are accumulated per key instead of being assigned directly, so
    # that a later run with an already seen key extends the existing group
    # rather than replacing it.
    ty_groups: dict[int, list[BTGroup]] = {}
    for ty, grp in groupby(
        bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
    ):
        ty_groups.setdefault(ty, []).extend(grp)
    for grp_list in ty_groups.values():
        grp_list.sort(key=lambda x: x["tx"])

    if ty_groups:  # nothing to combine for empty input
        # combine groups whose y coordinates differ by less than the effective font height
        # (accounts for mixed fonts and other minor oddities)
        last_ty = next(iter(ty_groups))
        last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
        for ty in list(ty_groups)[1:]:
            fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
            txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
            # prevent merge if both groups are rendering in the same x position.
            no_text_overlap = not (txs & last_txs)
            offset_less_than_font_height = abs(ty - last_ty) < fsz
            if no_text_overlap and offset_less_than_font_height:
                ty_groups[last_ty] = sorted(
                    ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
                )
                last_txs |= txs
            else:
                last_ty = ty
                last_txs = txs

    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("bt_groups.json").write_text(
            json.dumps(ty_groups, indent=2, default=str), "utf-8"
        )
    return ty_groups
Args: ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream fonts (Dict[str, Font]): font dictionary strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True. debug_path (Path, optional): Path to a directory for saving debug output. Returns: List[BTGroup]: list of dicts of text rendered by each BT operator """ state_mgr = TextStateManager() # transformation stack manager bt_groups: list[BTGroup] = [] # BT operator dict tj_ops: list[TextStateParams] = [] # Tj/TJ operator data for operands, op in ops: if op in (b"BT", b"q"): bts, tjs = recurs_to_target_op( ops, state_mgr, b"ET" if op == b"BT" else b"Q", fonts, strip_rotated ) bt_groups.extend(bts) tj_ops.extend(tjs) elif op == b"Tf": state_mgr.set_font(fonts[operands[0]], operands[1]) else: # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops state_mgr.set_state_param(op, operands) if any(tj.rotated for tj in tj_ops): if strip_rotated: logger_warning( "Rotated text discovered. Output will be incomplete.", __name__ ) else: logger_warning( "Rotated text discovered. Layout will be degraded.", __name__ ) if not all(tj.font.interpretable for tj in tj_ops): logger_warning( "PDF contains an uninterpretable font. Output will be incomplete.", __name__ ) # left align the data, i.e. 
def fixed_char_width(bt_groups: list[BTGroup], scale_weight: float = 1.25) -> float:
    """
    Calculate average character width weighted by the length of the rendered
    text in each sample for conversion to fixed-width layout.

    Args:
        bt_groups (List[BTGroup]): List of dicts of text rendered by each BT operator
        scale_weight (float): factor applied to the text length of each
            sample. A larger value results in a smaller character width.

    Returns:
        float: fixed character width. 0.0 if there is no sample with a
        non-zero weighted length, e.g. for an empty list.

    """
    char_widths = []
    for _bt in bt_groups:
        _len = len(_bt["text"]) * scale_weight
        if _len == 0:
            # Nothing to average over; dividing would raise ZeroDivisionError.
            continue
        char_widths.append(((_bt["displaced_tx"] - _bt["tx"]) / _len, _len))
    total_weight = sum(_l for _, _l in char_widths)
    if total_weight == 0:
        return 0.0
    return sum(_w * _l for _w, _l in char_widths) / total_weight
""" lines: list[str] = [] last_y_coord = 0 table = str.maketrans(dict.fromkeys(range(14, 32), " ")) for y_coord, line_data in ty_groups.items(): if space_vertically and lines: fh = line_data[0]["font_height"] blank_lines = 0 if fh == 0 else ( int(abs(y_coord - last_y_coord) / (fh * font_height_weight)) - 1 ) lines.extend([""] * blank_lines) line_parts = [] # It uses a list to construct the line, avoiding string concatenation. current_len = 0 # Track the size with int instead of len(str) overhead. last_disp = 0.0 for bt_op in line_data: tx = bt_op["tx"] offset = int(tx // char_width) needed_spaces = offset - current_len if needed_spaces > 0 and ceil(last_disp) < int(tx): padding = " " * needed_spaces line_parts.append(padding) current_len += needed_spaces raw_text = bt_op["text"] text = raw_text.translate(table) line_parts.append(text) current_len += len(text) last_disp = bt_op["displaced_tx"] full_line = "".join(line_parts).rstrip() if full_line.strip() or (space_vertically and lines): lines.append(full_line) last_y_coord = y_coord return "\n".join(lines) ================================================ FILE: pypdf/_text_extraction/_layout_mode/_text_state_manager.py ================================================ """manage the PDF transform stack during "layout" mode text extraction""" from collections import ChainMap, Counter from collections import ChainMap as ChainMapType from collections import Counter as CounterType from collections.abc import MutableMapping from typing import Any, Union from ..._font import Font from ...errors import PdfReadError from .. import mult from ._text_state_params import TextStateParams TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]] TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]] class TextStateManager: """ Tracks the current text state including cm/tm/trm transformation matrices. 
Attributes: transform_stack (ChainMap): ChainMap of cm/tm transformation matrices q_queue (Counter[int]): Counter of q operators q_depth (List[int]): list of q operator nesting levels Tc (float): character spacing Tw (float): word spacing Tz (int): horizontal scaling TL (float): leading Ts (float): text rise font (Font): font object font_size (int | float): font size """ def __init__(self) -> None: self.transform_stack: TextStateManagerChainMapType = ChainMap( self.new_transform() ) self.q_queue: CounterType[int] = Counter() self.q_depth = [0] self.Tc: float = 0.0 self.Tw: float = 0.0 self.Tz: float = 100.0 self.TL: float = 0.0 self.Ts: float = 0.0 self.font_stack: list[tuple[Union[Font, None], Union[int, float]]] = [] self.font: Union[Font, None] = None self.font_size: Union[int, float] = 0 def set_state_param(self, op: bytes, value: Union[float, list[Any]]) -> None: """ Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators. Args: op: operator read from PDF stream as bytes. No action is taken for unsupported operators (see supported operators above). value (float | List[Any]): new parameter value. If a list, value[0] is used. """ if op not in [b"Tc", b"Tz", b"Tw", b"TL", b"Ts"]: return self.__setattr__(op.decode(), value[0] if isinstance(value, list) else value) def set_font(self, font: Font, size: float) -> None: """ Set the current font and font_size. Args: font (Font): a layout mode Font size (float): font size """ self.font = font self.font_size = size def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams: """ Create a TextStateParams instance to display a text string. Type[bytes] values will be decoded implicitly. Args: value (str | bytes): text to associate with the captured state. 
Raises: PdfReadError: if font not set (no Tf operator in incoming pdf content stream) Returns: TextStateParams: current text state parameters """ if not isinstance(self.font, Font): raise PdfReadError( "font not set: is PDF missing a Tf operator?" ) # pragma: no cover if isinstance(value, bytes): try: if isinstance(self.font.encoding, str): txt = value.decode(self.font.encoding, "surrogatepass") else: txt = "".join( self.font.encoding[x] if x in self.font.encoding else bytes((x,)).decode() for x in value ) except (UnicodeEncodeError, UnicodeDecodeError): txt = value.decode("utf-8", "replace") txt = "".join( self.font.character_map.get(x, x) for x in txt ) else: txt = value return TextStateParams( txt, self.font, self.font_size, self.Tc, self.Tw, self.Tz, self.TL, self.Ts, self.effective_transform, ) @staticmethod def raw_transform( _a: float = 1.0, _b: float = 0.0, _c: float = 0.0, _d: float = 1.0, _e: float = 0.0, _f: float = 0.0, ) -> dict[int, float]: """Only a/b/c/d/e/f matrix params""" return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f)))) @staticmethod def new_transform( _a: float = 1.0, _b: float = 0.0, _c: float = 0.0, _d: float = 1.0, _e: float = 0.0, _f: float = 0.0, is_text: bool = False, is_render: bool = False, ) -> TextStateManagerDictType: """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys""" result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f) result.update({"is_text": is_text, "is_render": is_render}) return result def reset_tm(self) -> TextStateManagerChainMapType: """Clear all transforms from chainmap having is_text==True or is_render==True""" while ( self.transform_stack.maps[0]["is_text"] or self.transform_stack.maps[0]["is_render"] ): self.transform_stack = self.transform_stack.parents return self.transform_stack def reset_trm(self) -> TextStateManagerChainMapType: """Clear all transforms from chainmap having is_render==True""" while self.transform_stack.maps[0]["is_render"]: self.transform_stack = 
self.transform_stack.parents return self.transform_stack def remove_q(self) -> TextStateManagerChainMapType: """Rewind to stack prior state after closing a 'q' with internal 'cm' ops""" self.font, self.font_size = self.font_stack.pop(-1) self.transform_stack = self.reset_tm() self.transform_stack.maps = self.transform_stack.maps[ self.q_queue.pop(self.q_depth.pop(), 0) : ] return self.transform_stack def add_q(self) -> None: """Add another level to q_queue""" self.font_stack.append((self.font, self.font_size)) self.q_depth.append(len(self.q_depth)) def add_cm(self, *args: Any) -> TextStateManagerChainMapType: """Concatenate an additional transform matrix""" self.transform_stack = self.reset_tm() self.q_queue.update(self.q_depth[-1:]) self.transform_stack = self.transform_stack.new_child(self.new_transform(*args)) return self.transform_stack def _complete_matrix(self, operands: list[float]) -> list[float]: """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)""" if len(operands) == 2: # this is a Td operator or equivalent operands = [1.0, 0.0, 0.0, 1.0, *operands] return operands def add_tm(self, operands: list[float]) -> TextStateManagerChainMapType: """Append a text transform matrix""" self.transform_stack = self.transform_stack.new_child( self.new_transform( # type: ignore[misc] *self._complete_matrix(operands), is_text=True # type: ignore[arg-type] ) ) return self.transform_stack def add_trm(self, operands: list[float]) -> TextStateManagerChainMapType: """Append a text rendering transform matrix""" self.transform_stack = self.transform_stack.new_child( self.new_transform( # type: ignore[misc] *self._complete_matrix(operands), is_text=True, is_render=True # type: ignore[arg-type] ) ) return self.transform_stack @property def effective_transform(self) -> list[float]: """Current effective transform accounting for cm, tm, and trm transforms""" eff_transform = [*self.transform_stack.maps[0].values()] for transform in self.transform_stack.maps[1:]: 
eff_transform = mult(eff_transform, transform) # type: ignore[arg-type] # dict has int keys 0-5 return eff_transform ================================================ FILE: pypdf/_text_extraction/_layout_mode/_text_state_params.py ================================================ """A dataclass that captures the CTM and Text State for a tj operation""" import math from dataclasses import dataclass, field from typing import Any, Union from ..._font import Font from .. import mult, orient @dataclass class TextStateParams: """ Text state parameters and operator values for a single text value in a TJ or Tj PDF operation. Attributes: txt (str): the text to be rendered. font (Font): font object font_size (int | float): font size Tc (float): character spacing. Defaults to 0.0. Tw (float): word spacing. Defaults to 0.0. Tz (float): horizontal scaling. Defaults to 100.0. TL (float): leading, vertical displacement between text lines. Defaults to 0.0. Ts (float): text rise. Used for super/subscripts. Defaults to 0.0. transform (List[float]): effective transformation matrix. tx (float): x cood of rendered text, i.e. self.transform[4] ty (float): y cood of rendered text. May differ from self.transform[5] per self.Ts. displaced_tx (float): x coord immediately following rendered text space_tx (float): tx for a space character font_height (float): effective font height accounting for CTM flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.) rotated (bool): True if the text orientation is rotated with respect to the page. 
""" txt: str font: Font font_size: Union[int, float] Tc: float = 0.0 Tw: float = 0.0 Tz: float = 100.0 TL: float = 0.0 Ts: float = 0.0 transform: list[float] = field( default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] ) tx: float = field(default=0.0, init=False) ty: float = field(default=0.0, init=False) displaced_tx: float = field(default=0.0, init=False) space_tx: float = field(default=0.0, init=False) font_height: float = field(default=0.0, init=False) flip_vertical: bool = field(default=False, init=False) rotated: bool = field(default=False, init=False) def __post_init__(self) -> None: if orient(self.transform) in (90, 270): self.transform = mult( [1.0, -self.transform[1], -self.transform[2], 1.0, 0.0, 0.0], self.transform, ) self.rotated = True # self.transform[0] AND self.transform[3] < 0 indicates true rotation. # If only self.transform[3] < 0, the y coords are simply inverted. if orient(self.transform) == 180 and self.transform[0] < -1e-6: self.transform = mult([-1.0, 0.0, 0.0, -1.0, 0.0, 0.0], self.transform) self.rotated = True self.displaced_tx = self.displaced_transform()[4] self.tx = self.transform[4] self.ty = self.render_transform()[5] self.space_tx = round(self.word_tx(" "), 3) if self.space_tx < 1e-6: # if the " " char is assigned 0 width (e.g. for fine tuned spacing # with TJ int operators a la crazyones.pdf), calculate space_tx as # a td_offset of -1 * font.space_width where font.space_width is # the space_width calculated in _font.py. self.space_tx = round(self.word_tx("", -self.font.space_width), 3) self.font_height = self.font_size * math.sqrt( self.transform[1] ** 2 + self.transform[3] ** 2 ) # flip_vertical handles PDFs generated by Microsoft Word's "publish" command. 
self.flip_vertical = self.transform[3] < -1e-6 # inverts y axis def font_size_matrix(self) -> list[float]: """Font size matrix""" return [ self.font_size * (self.Tz / 100.0), 0.0, 0.0, self.font_size, 0.0, self.Ts, ] def displaced_transform(self) -> list[float]: """Effective transform matrix after text has been rendered.""" return mult(self.displacement_matrix(), self.transform) def render_transform(self) -> list[float]: """Effective transform matrix accounting for font size, Tz, and Ts.""" return mult(self.font_size_matrix(), self.transform) def displacement_matrix( self, word: Union[str, None] = None, td_offset: float = 0.0 ) -> list[float]: """ Text displacement matrix Args: word (str, optional): Defaults to None in which case self.txt displacement is returned. td_offset (float, optional): translation applied by TD operator. Defaults to 0.0. """ word = word if word is not None else self.txt return [1.0, 0.0, 0.0, 1.0, self.word_tx(word, td_offset), 0.0] def word_tx(self, word: str, td_offset: float = 0.0) -> float: """Horizontal text displacement for any word according this text state""" width: float = 0.0 for char in word: if char == " ": width += self.font.space_width else: width += self.font.text_width(char) return ( (self.font_size * ((width - td_offset) / 1000.0)) + self.Tc + word.count(" ") * self.Tw ) * (self.Tz / 100.0) @staticmethod def to_dict(inst: "TextStateParams") -> dict[str, Any]: """Dataclass to dict for json.dumps serialization""" return {k: getattr(inst, k) for k in inst.__dataclass_fields__ if k != "font"} ================================================ FILE: pypdf/_text_extraction/_text_extractor.py ================================================ # Copyright (c) 2006, Mathieu Fenniak # Copyright (c) 2007, Ashish Kulkarni # # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. import math from typing import Any, Callable, Optional, Union from .._font import Font, FontDescriptor from ..generic import DictionaryObject, TextStringObject from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult class TextExtraction: """ A class to handle PDF text extraction operations. This class encapsulates all the state and operations needed for extracting text from PDF content streams, replacing the nested functions and nonlocal variables in the original implementation. 
""" def __init__(self) -> None: self._font_width_maps: dict[str, tuple[dict[Any, float], str, float]] = {} # Text extraction state variables self.cm_matrix: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.tm_matrix: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.cm_stack: list[ tuple[ list[float], Optional[DictionaryObject], Font, float, float, float, float, ] ] = [] # Store the last modified matrices; can be an intermediate position self.cm_prev: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.tm_prev: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] # Store the position at the beginning of building the text self.memo_cm: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.memo_tm: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.char_scale = 1.0 self.space_scale = 1.0 self._space_width: float = 500.0 # will be set correctly at first Tf self._actual_str_size: dict[str, float] = { "str_widths": 0.0, "str_height": 0.0, } # will be set to string length calculation result self.TL = 0.0 self.font_size = 12.0 # init just in case of # Text extraction variables self.text: str = "" self.output: str = "" self.rtl_dir: bool = False # right-to-left self.font_resource: Optional[DictionaryObject] = None self.font = Font( name = "NotInitialized", sub_type="Unknown", encoding="charmap", font_descriptor=FontDescriptor(), ) self.orientations: tuple[int, ...] 
= (0, 90, 180, 270) self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None self.font_resources: dict[str, DictionaryObject] = {} self.fonts: dict[str, Font] = {} self.operation_handlers = { b"BT": self._handle_bt, b"ET": self._handle_et, b"q": self._handle_save_graphics_state, b"Q": self._handle_restore_graphics_state, b"cm": self._handle_cm, b"Tz": self._handle_tz, b"Tw": self._handle_tw, b"TL": self._handle_tl, b"Tf": self._handle_tf, b"Td": self._handle_td, b"Tm": self._handle_tm, b"T*": self._handle_t_star, b"Tj": self._handle_tj_operation, } def initialize_extraction( self, orientations: tuple[int, ...] = (0, 90, 180, 270), visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, font_resources: Optional[dict[str, DictionaryObject]] = None, fonts: Optional[dict[str, Font]] = None ) -> None: """Initialize the extractor with extraction parameters.""" self.orientations = orientations self.visitor_text = visitor_text self.font_resources = font_resources or {} self.fonts = fonts or {} # Reset state self.text = "" self.output = "" self.rtl_dir = False def compute_str_widths(self, str_widths: float) -> float: return str_widths / 1000 def process_operation(self, operator: bytes, operands: list[Any]) -> None: if operator in self.operation_handlers: handler = self.operation_handlers[operator] str_widths = handler(operands) # Post-process operations that affect text positioning if operator in {b"Td", b"Tm", b"T*", b"Tj"}: self._post_process_text_operation(str_widths or 0.0) def _post_process_text_operation(self, str_widths: float) -> None: """Handle common post-processing for text positioning operations.""" try: self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check( self.text, (self.cm_prev, self.tm_prev), (self.cm_matrix, self.tm_matrix), (self.memo_cm, self.memo_tm), self.font_resource, self.orientations, self.output, self.font_size, self.visitor_text, str_widths, self.compute_str_widths(self.font_size * 
self._space_width), self._actual_str_size["str_height"], ) if self.text == "": self.memo_cm = self.cm_matrix.copy() self.memo_tm = self.tm_matrix.copy() except OrientationNotFoundError: pass def _handle_tj( self, text: str, operands: list[Union[str, TextStringObject]], cm_matrix: list[float], tm_matrix: list[float], font_resource: Optional[DictionaryObject], font: Font, orientations: tuple[int, ...], font_size: float, rtl_dir: bool, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], actual_str_size: dict[str, float], ) -> tuple[str, bool, dict[str, float]]: text_operands, is_str_operands = get_text_operands( operands, cm_matrix, tm_matrix, font, orientations ) if is_str_operands: text += text_operands font_widths = sum([font.space_width if x == " " else font.text_width(x) for x in text_operands]) else: text, rtl_dir, font_widths = get_display_str( text, cm_matrix, tm_matrix, # text matrix font_resource, font, text_operands, font_size, rtl_dir, visitor_text, ) actual_str_size["str_widths"] += font_widths * font_size actual_str_size["str_height"] = font_size return text, rtl_dir, actual_str_size def _flush_text(self) -> None: """Flush accumulated text to output and call visitor if present.""" self.output += self.text if self.visitor_text is not None: self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size) self.text = "" self.memo_cm = self.cm_matrix.copy() self.memo_tm = self.tm_matrix.copy() # Operation handlers def _handle_bt(self, operands: list[Any]) -> None: """Handle BT (Begin Text) operation - Table 5.4 page 405.""" self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self._flush_text() def _handle_et(self, operands: list[Any]) -> None: """Handle ET (End Text) operation - Table 5.4 page 405.""" self._flush_text() def _handle_save_graphics_state(self, operands: list[Any]) -> None: """Handle q (Save graphics state) operation - Table 4.7 page 219.""" self.cm_stack.append( ( self.cm_matrix, self.font_resource, 
self.font, self.font_size, self.char_scale, self.space_scale, self.TL, ) ) def _handle_restore_graphics_state(self, operands: list[Any]) -> None: """Handle Q (Restore graphics state) operation - Table 4.7 page 219.""" try: ( self.cm_matrix, self.font_resource, self.font, self.font_size, self.char_scale, self.space_scale, self.TL, ) = self.cm_stack.pop() except Exception: self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] def _handle_cm(self, operands: list[Any]) -> None: """Handle cm (Modify current matrix) operation - Table 4.7 page 219.""" self.output += self.text if self.visitor_text is not None: self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size) self.text = "" try: self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix) except Exception: self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.memo_cm = self.cm_matrix.copy() self.memo_tm = self.tm_matrix.copy() def _handle_tz(self, operands: list[Any]) -> None: """Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398.""" self.char_scale = float(operands[0]) / 100 if operands else 1.0 def _handle_tw(self, operands: list[Any]) -> None: """Handle Tw (Set word spacing) operation - Table 5.2 page 398.""" self.space_scale = 1.0 + float(operands[0] if operands else 0.0) def _handle_tl(self, operands: list[Any]) -> None: """Handle TL (Set Text Leading) operation - Table 5.2 page 398.""" scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2) self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x def _handle_tf(self, operands: list[Any]) -> None: """Handle Tf (Set font size) operation - Table 5.2 page 398.""" if self.text != "": self.output += self.text # .translate(cmap) if self.visitor_text is not None: self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size) self.text = "" self.memo_cm = self.cm_matrix.copy() self.memo_tm = self.tm_matrix.copy() try: self.font_resource 
= self.font_resources[operands[0]] self.font = self.fonts[operands[0]] except KeyError: # font not found self.font_resource = None font_descriptor = FontDescriptor() self.font = Font( "Unknown", space_width=250, encoding=dict.fromkeys(range(256), "�"), font_descriptor=font_descriptor, character_map={}, ) self._space_width = self.font.space_width / 2 # Actually the width of _half_ a space... try: self.font_size = float(operands[1]) except Exception: pass # keep previous size def _handle_td(self, operands: list[Any]) -> float: """Handle Td (Move text position) operation - Table 5.5 page 406.""" # A special case is a translating only tm: # tm = [1, 0, 0, 1, e, f] # i.e. tm[4] += tx, tm[5] += ty. tx, ty = float(operands[0]), float(operands[1]) self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2] self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3] str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) self._actual_str_size["str_widths"] = 0.0 return str_widths def _handle_tm(self, operands: list[Any]) -> float: """Handle Tm (Set text matrix) operation - Table 5.5 page 406.""" self.tm_matrix = [float(operand) for operand in operands[:6]] str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) self._actual_str_size["str_widths"] = 0.0 return str_widths def _handle_t_star(self, operands: list[Any]) -> float: """Handle T* (Move to next line) operation - Table 5.5 page 406.""" self.tm_matrix[4] -= self.TL * self.tm_matrix[2] self.tm_matrix[5] -= self.TL * self.tm_matrix[3] str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) self._actual_str_size["str_widths"] = 0.0 return str_widths def _handle_tj_operation(self, operands: list[Any]) -> float: """Handle Tj (Show text) operation - Table 5.5 page 406.""" self.text, self.rtl_dir, self._actual_str_size = self._handle_tj( self.text, operands, self.cm_matrix, self.tm_matrix, self.font_resource, self.font, self.orientations, self.font_size, 
self.rtl_dir, self.visitor_text, self._actual_str_size, ) return 0.0 # str_widths will be handled in post-processing ================================================ FILE: pypdf/_utils.py ================================================ # Copyright (c) 2006, Mathieu Fenniak # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
"""Utility functions for PDF library.""" __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" import functools import logging import re import sys import warnings from dataclasses import dataclass from datetime import datetime, timezone from io import DEFAULT_BUFFER_SIZE from os import SEEK_CUR from re import Pattern from typing import ( IO, Any, Optional, Union, overload, ) if sys.version_info[:2] >= (3, 10): # Python 3.10+: https://www.python.org/dev/peps/pep-0484/ from typing import TypeAlias else: from typing_extensions import TypeAlias if sys.version_info >= (3, 11): from typing import Self else: from typing_extensions import Self from .errors import ( STREAM_TRUNCATED_PREMATURELY, DeprecationError, PdfStreamError, ) TransformationMatrixType: TypeAlias = tuple[ tuple[float, float, float], tuple[float, float, float], tuple[float, float, float] ] CompressedTransformationMatrix: TypeAlias = tuple[ float, float, float, float, float, float ] StreamType = IO[Any] StrByteType = Union[str, StreamType] def parse_iso8824_date(text: Optional[str]) -> Optional[datetime]: orgtext = text if not text: return None if text[0].isdigit(): text = "D:" + text if text.endswith(("Z", "z")): text += "0000" text = text.replace("z", "+").replace("Z", "+").replace("'", "") i = max(text.find("+"), text.find("-")) if i > 0 and i != len(text) - 5: text += "00" for f in ( "D:%Y", "D:%Y%m", "D:%Y%m%d", "D:%Y%m%d%H", "D:%Y%m%d%H%M", "D:%Y%m%d%H%M%S", "D:%Y%m%d%H%M%S%z", ): try: d = datetime.strptime(text, f) # noqa: DTZ007 except ValueError: continue else: if text.endswith("+0000"): d = d.replace(tzinfo=timezone.utc) return d raise ValueError(f"Can not convert date: {orgtext}") def format_iso8824_date(dt: datetime) -> str: """ Convert a datetime object to PDF date string format. Converts datetime to the PDF date format D:YYYYMMDDHHmmSSOHH'mm as specified in the PDF Reference. Args: dt: A datetime object to convert. Returns: A date string in PDF format. 
""" date_str = dt.strftime("D:%Y%m%d%H%M%S") if dt.tzinfo is not None: offset = dt.utcoffset() assert offset is not None total_seconds = int(offset.total_seconds()) hours, remainder = divmod(abs(total_seconds), 3600) minutes = remainder // 60 sign = "+" if total_seconds >= 0 else "-" date_str += f"{sign}{hours:02d}'{minutes:02d}'" return date_str def _get_max_pdf_version_header(header1: str, header2: str) -> str: versions = ( "%PDF-1.3", "%PDF-1.4", "%PDF-1.5", "%PDF-1.6", "%PDF-1.7", "%PDF-2.0", ) pdf_header_indices = [] if header1 in versions: pdf_header_indices.append(versions.index(header1)) if header2 in versions: pdf_header_indices.append(versions.index(header2)) if len(pdf_header_indices) == 0: raise ValueError(f"Neither {header1!r} nor {header2!r} are proper headers") return versions[max(pdf_header_indices)] WHITESPACES = (b"\x00", b"\t", b"\n", b"\f", b"\r", b" ") WHITESPACES_AS_BYTES = b"".join(WHITESPACES) WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]" def read_until_whitespace(stream: StreamType, maxchars: Optional[int] = None) -> bytes: """ Read non-whitespace characters and return them. Stops upon encountering whitespace or when maxchars is reached. Args: stream: The data stream from which was read. maxchars: The maximum number of bytes returned; by default unlimited. Returns: The data which was read. """ txt = b"" while True: tok = stream.read(1) if tok.isspace() or not tok: break txt += tok if len(txt) == maxchars: break return txt def read_non_whitespace(stream: StreamType) -> bytes: """ Find and read the next non-whitespace character (ignores whitespace). Args: stream: The data stream from which was read. Returns: The data which was read. """ tok = stream.read(1) while tok in WHITESPACES: tok = stream.read(1) return tok def skip_over_whitespace(stream: StreamType) -> bool: """ Similar to read_non_whitespace, but return a boolean if at least one whitespace character was read. Args: stream: The data stream from which was read. 
def skip_over_comment(stream: StreamType) -> None:
    """
    Skip a comment that starts at the current stream position.

    If the next byte is "%", the stream is advanced past the first line
    ending byte (LF or CR) that follows. Otherwise the position is left
    unchanged, including when the stream is already at its end.

    Args:
        stream: The data stream to read from.

    Raises:
        PdfStreamError: if the stream ends before the comment is terminated.

    """
    tok = stream.read(1)
    if not tok:
        # End of stream: nothing was consumed, so there is nothing to rewind.
        return
    stream.seek(-1, 1)
    if tok == b"%":
        while tok not in (b"\n", b"\r"):
            tok = stream.read(1)
            if tok == b"":
                raise PdfStreamError("File ended unexpectedly.")
def read_previous_line(stream: StreamType) -> bytes:
    """
    Given a byte stream with current position X, return the previous line.

    The result consists of all bytes between the first CR/LF byte found
    before X (or the start of the stream, if no such byte is found) and
    position X.

    After this call, the stream will be positioned one byte after the
    first non-CRLF character found beyond the first CR/LF byte before X,
    or, if no such byte is found, at the beginning of the stream.

    Args:
        stream: Seekable binary stream; it is scanned backwards from its
            current position in blocks of at most DEFAULT_BUFFER_SIZE bytes.

    Returns:
        The data which was read.

    Raises:
        PdfStreamError: If the stream is already positioned at offset 0.

    """
    # Blocks are collected while walking backwards, i.e. in reverse order.
    line_content: list[bytes] = []
    found_crlf = False
    if stream.tell() == 0:
        raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
    while True:
        # Never try to read past the beginning of the stream.
        to_read = min(DEFAULT_BUFFER_SIZE, stream.tell())
        if to_read == 0:
            # Reached the start of the stream without finding a non-CRLF
            # character to stop at.
            break
        # Read the block. read_block_backwards leaves the stream positioned
        # at the start of the block which was just read, so idx below is
        # also an offset relative to the current stream position.
        block = read_block_backwards(stream, to_read)
        idx = len(block) - 1
        if not found_crlf:
            # We haven't found our first CR/LF yet.
            # Read off characters until we hit one.
            while idx >= 0 and block[idx] not in b"\r\n":
                idx -= 1
            if idx >= 0:
                found_crlf = True
        if found_crlf:
            # We found our first CR/LF already (on this block or
            # a previous one).
            # Our combined line is the remainder of the block
            # plus any previously read blocks.
            line_content.append(block[idx + 1 :])
            # Continue to read off any more CRLF characters.
            while idx >= 0 and block[idx] in b"\r\n":
                idx -= 1
        else:
            # Didn't find CR/LF yet - add this block to our
            # previously read blocks and continue.
            line_content.append(block)
        if idx >= 0:
            # We found the next non-CRLF character.
            # Set the stream position correctly, then break
            stream.seek(idx + 1, SEEK_CUR)
            break
    # Join all the blocks in the line (which are in reverse order)
    return b"".join(line_content[::-1])
def logger_error(message: str, *, source: str, **values: Any) -> None:
    """
    Use this instead of logger.error directly.

    That allows people to overwrite it more easily.

    See the docs on when to use which:
    https://pypdf.readthedocs.io/en/latest/user/suppress-warnings.html

    Args:
        message: The message to log. It may contain ``%(name)s`` style
            placeholders which are filled from ``values``.
        source: Name of the logger to emit the message on.
        **values: Values for the named placeholders of ``message``. They are
            handed to the logging module as a mapping, thus the actual
            formatting only happens if the record is emitted.

    """
    logger = logging.getLogger(source)
    if values:
        logger.error(message, values)
    else:
        # The logging module only unpacks a *non-empty* mapping argument.
        # An empty dict would stay a lone positional argument and make
        # ``message % ({},)`` fail while formatting the record, so log the
        # message without arguments instead.
        logger.error(message)
def _human_readable_bytes(bytes: int) -> str:
    """
    Format a byte count for display using decimal (power of 1000) units.

    Args:
        bytes: The number of bytes.

    Returns:
        The plain count followed by "Byte" below 1000, otherwise the value
        scaled to kB, MB or GB with one decimal place.

    """
    if bytes < 10**3:
        return f"{bytes} Byte"
    # (exponent of the divisor, unit label); a unit applies while the value
    # is below the next power of 1000.
    for exponent, unit in ((3, "kB"), (6, "MB")):
        if bytes < 10 ** (exponent + 3):
            return f"{bytes / 10**exponent:.1f} {unit}"
    return f"{bytes / 10**9:.1f} GB"
class classproperty:  # noqa: N801
    """
    Descriptor turning a method that takes only ``cls`` into a read-only
    attribute which can be looked up on the class itself as well as on
    its instances.
    """

    def __init__(self, method=None) -> None:  # type: ignore  # noqa: ANN001
        # The wrapped callable; may be supplied later through ``getter``.
        self.fget = method

    def __get__(self, instance, cls=None) -> Any:  # type: ignore  # noqa: ANN001
        # The instance is ignored on purpose: the value is always computed
        # from the owning class.
        compute = self.fget
        return compute(cls)

    def getter(self, method) -> Self:  # type: ignore  # noqa: ANN001
        """Replace the wrapped callable and return this descriptor."""
        self.fget = method
        return self
@functools.total_ordering
class Version:
    """
    Dotted version string with support for equality and ordering.

    The string is split on ``.``; every component is stored as a tuple of
    its leading integer and the remaining suffix, e.g. ``"2.10a"`` becomes
    ``[(2, ""), (10, "a")]``. Components without a leading integer are
    stored as ``(0, component)``.
    """

    COMPONENT_PATTERN = re.compile(r"^(\d+)(.*)$")

    def __init__(self, version_str: str) -> None:
        self.version_str = version_str
        self.components = self._parse_version(version_str)

    def _parse_version(self, version_str: str) -> list[tuple[int, str]]:
        """
        Split a version string into ``(integer prefix, suffix)`` tuples.

        Args:
            version_str: The version string, components separated by dots.

        Returns:
            One tuple per dot-separated component.

        """
        parsed_components = []
        for component in version_str.split("."):
            match = Version.COMPONENT_PATTERN.match(component)
            if not match:
                parsed_components.append((0, component))
                continue
            # Both groups are mandatory parts of the pattern, thus they are
            # always set when the component matched.
            parsed_components.append((int(match.group(1)), match.group(2)))
        return parsed_components

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Version):
            return False
        return self.components == other.components

    def __hash__(self) -> int:
        # Convert to tuple as lists cannot be hashed.
        return hash((self.__class__, tuple(self.components)))

    def __lt__(self, other: Any) -> bool:
        """
        Compare component by component, integer prefix first, then suffix.

        Raises:
            ValueError: If ``other`` is not a :class:`Version`.

        """
        if not isinstance(other, Version):
            raise ValueError(f"Version cannot be compared against {type(other)}")

        for self_component, other_component in zip(self.components, other.components):
            # Tuples order by the integer first and by the suffix second.
            if self_component != other_component:
                return self_component < other_component

        # All shared components are equal: the shorter version is smaller.
        return len(self.components) < len(other.components)
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. import decimal import enum import hashlib import re import struct import sys import uuid from collections.abc import Iterable, Mapping from io import BytesIO, FileIO, IOBase from itertools import compress from pathlib import Path from re import Pattern from types import TracebackType from typing import ( IO, Any, Callable, Optional, Union, cast, ) if sys.version_info >= (3, 11): from typing import Self else: from typing_extensions import Self from ._doc_common import DocumentInformation, PdfDocCommon from ._encryption import EncryptAlgorithm, Encryption from ._page import PageObject, Transformation from ._page_labels import nums_clear_range, nums_insert, nums_next from ._reader import PdfReader from ._utils import ( StrByteType, StreamType, _get_max_pdf_version_header, deprecation_no_replacement, logger_warning, ) from .constants import AnnotationDictionaryAttributes as AA from .constants import CatalogAttributes as CA from .constants import ( CatalogDictionary, GoToActionArguments, ImageType, InteractiveFormDictEntries, OutlineFontFlag, PageLabelStyle, PagesAttributes, TypFitArguments, UserAccessPermissions, ) from .constants import Core as CO from .constants import FieldDictionaryAttributes as FA from .constants import PageAttributes as PG from .constants import TrailerKeys as TK from .errors import PdfReadError, PyPdfError from .generic import ( PAGE_FIT, ArrayObject, BooleanObject, ByteStringObject, ContentStream, Destination, 
def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
    """
    Compute the MD5 hex digest of the remaining stream content.

    The stream is consumed from its current position in pieces of at most
    ``blocksize`` bytes, so the content is never held in memory as a whole.
    The digest is requested with ``usedforsecurity=False``.

    Args:
        stream: The stream to read from.
        blocksize: Maximum number of bytes requested per read.

    Returns:
        The hexadecimal digest.

    """
    digest = hashlib.md5(usedforsecurity=False)
    while True:
        chunk = stream.read(blocksize)
        if chunk == b"":
            break
        digest.update(chunk)
    return digest.hexdigest()
def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
    strict: bool = False,
) -> None:
    # Set up an empty writer, or load/clone an existing document.
    #
    # Rough flow:
    #   1. Initialise the bookkeeping attributes.
    #   2. If ``incremental`` or ``full`` is set, wrap ``fileobj`` into a
    #      PdfReader and keep it as ``self._reader``; otherwise start with
    #      a "%PDF-1.3" header and a fresh /Info dictionary.
    #   3. Decide whether there is a document to clone from.
    #   4. Either clone that document or create an empty page tree and
    #      catalog.
    self.strict = strict
    """
    If true, pypdf will raise an exception if a PDF does not follow the specification.
    If false, pypdf will try to be forgiving and do something reasonable, but it will log
    a warning message. It is a best-effort approach.
    """

    # ``full`` alone also takes the loading path below; the flag is reset
    # at the end of this method when ``incremental`` was not requested.
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: list[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: list[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: dict[bytes, tuple[IndirectObject, list[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: dict[int, dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    # Only declared here; it is assigned in the non-incremental branch
    # below. NOTE(review): for the incremental case it is presumably set
    # while cloning from the reader - confirm.
    self._info_obj: Optional[PdfObject]
    """The PDF files's document information dictionary,
    defined by Info in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
    "Tracks links in pages added to the writer for resolving later."

    self._merged_in_pages: dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
    "Tracks pages added to the writer and what page they turned into."

    if self.incremental:
        # Normalise the input step by step: path -> in-memory stream ->
        # PdfReader. Anything which is not a PdfReader afterwards is
        # rejected.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        self._header = b"%PDF-1.3"
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        # Determine the source to clone from, if any.
        # An explicit ``clone_from`` wins for path/stream inputs, and the
        # default ``fileobj=""`` means there is nothing to clone from
        # ``fileobj``.
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        cloning = True
        # A path which does not exist or points to an empty file is
        # not cloned.
        if isinstance(fileobj, (str, Path)) and (
            not Path(str(fileobj)).exists()
            or Path(str(fileobj)).stat().st_size == 0
        ):
            cloning = False
        # An empty stream is not cloned either. The size is probed by
        # seeking to the end; the previous position is restored afterwards.
        if isinstance(fileobj, (IOBase, BytesIO)):
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Pages"),
            NameObject(PagesAttributes.COUNT): NumberObject(0),
            NameObject(PagesAttributes.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        # Paths and streams are wrapped into a reader before cloning.
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        # Nothing to clone: register the empty page tree and a catalog
        # pointing to it.
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PagesAttributes.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    # ``full`` without ``incremental`` only used the loading path above;
    # the writer itself is not in incremental mode.
    if full and not incremental:
        self.incremental = False
    # Store both parts of the file identifier as byte strings.
    if isinstance(self._ID, list):
        if isinstance(self._ID[0], TextStringObject):
            self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
        if isinstance(self._ID[1], TextStringObject):
            self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())
def __exit__(
    self,
    exc_type: Optional[type[BaseException]],
    exc: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None:
    """
    Write data to the fileobj.

    Nothing is written if no target is set or if the writer was
    created by cloning a document.
    """
    if not self.fileobj or self._cloned:
        return
    self.write(self.fileobj)
def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Return the object stored for the given object number or reference.

    Args:
        indirect_reference: Either the 1-based object number or an
            indirect reference belonging to this writer.

    Returns:
        The stored object.

    Raises:
        ValueError: If the reference belongs to another document.

    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum
    # Object numbers start at 1, the list index at 0.
    obj = self._objects[idnum - 1]
    assert obj is not None, "mypy"
    return obj
isinstance(page, PageObject) or page.get(PagesAttributes.TYPE, None) != CO.PAGE: raise ValueError("Invalid page object") assert self.flattened_pages is not None, "for mypy" page_org = page excluded_keys = list(excluded_keys) excluded_keys += [PagesAttributes.PARENT, "/StructParents"] # Acrobat does not accept two indirect references pointing on the same # page; therefore in order to add multiple copies of the same # page, we need to create a new dictionary for the page, however the # objects below (including content) are not duplicated: try: # delete an already existing page del self._id_translated[id(page_org.indirect_reference.pdf)][ # type: ignore page_org.indirect_reference.idnum # type: ignore ] except Exception: pass page = cast( "PageObject", page_org.clone(self, False, excluded_keys).get_object() ) if page_org.pdf is not None: other = page_org.pdf.pdf_header self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other) node, idx = self._get_page_in_node(index) page[NameObject(PagesAttributes.PARENT)] = node.indirect_reference if idx >= 0: cast(ArrayObject, node[PagesAttributes.KIDS]).insert(idx, page.indirect_reference) self.flattened_pages.insert(index, page) else: cast(ArrayObject, node[PagesAttributes.KIDS]).append(page.indirect_reference) self.flattened_pages.append(page) recurse = 0 while not is_null_or_none(node): node = cast(DictionaryObject, node.get_object()) node[NameObject(PagesAttributes.COUNT)] = NumberObject(cast(int, node[PagesAttributes.COUNT]) + 1) node = node.get(PagesAttributes.PARENT, None) # type: ignore[assignment] # TODO: Fix. recurse += 1 if recurse > 1000: raise PyPdfError("Too many recursive calls!") if page_org.pdf is not None: # the page may contain links to other pages, and those other # pages may or may not already be added. we store the # information we need, so that we can resolve the references # later. 
def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Sets the "NeedAppearances" flag in the PDF writer.

    The "NeedAppearances" flag indicates whether the appearance dictionary
    for form fields should be automatically generated by the PDF viewer or
    if the embedded appearance should be used.

    The interactive form dictionary is created in the catalog if it does
    not exist yet. Any failure is logged as a warning instead of raised.

    Args:
        state: The actual value of the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        catalog = self._root_object
        form_key = CatalogDictionary.ACRO_FORM
        # Make sure the AcroForm tree exists
        if form_key not in catalog:
            catalog[NameObject(form_key)] = self._add_object(DictionaryObject())

        flag_name = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form = cast(DictionaryObject, catalog[form_key])
        acro_form[flag_name] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )
def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. Negative
            values count from the end; positions at or beyond the number
            of pages append the page.
        excluded_keys:

    Returns:
        The added PageObject.

    Raises:
        ValueError: If a negative index reaches before the first page.

    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    position = index + page_count if index < 0 else index
    if position < 0:
        raise ValueError("Invalid index value")
    if position < page_count:
        return self._add_page(page, position, excluded_keys)
    return self.add_page(page, excluded_keys)
def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    If no page size is specified for a dimension, use the size of the
    page currently located at the insertion position (or of the last page
    if the position is the end of the document).

    Args:
        width: The width of the new page in default user space units.
            ``None`` or a non-positive value falls back to the existing page.
        height: The height of the new page in default user space units.
            ``None`` or a non-positive value falls back to the existing page.
        index: Position to add the page.

    Returns:
        The newly inserted page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.
        IndexError: Index is outside of [-self.get_num_pages(), self.get_num_pages()]

    """
    num_pages = self.get_num_pages()
    if abs(index) > num_pages:
        raise IndexError(f"Index should be in range [-{num_pages}, {num_pages}]")

    if num_pages > 0:
        # Use the chosen index, but do not exceed the available pages
        fixed_index = min(index, num_pages - 1)
        mediabox = self.pages[fixed_index].mediabox
        if width is None or width <= 0:
            width = mediabox.width
        if height is None or height <= 0:
            height = mediabox.height
    # Without existing pages there is no page to borrow a size from, so
    # the given dimensions are passed on unchanged. A missing size is then
    # expected to be reported by create_blank_page (presumably as
    # PageSizeNotDefinedError, as documented for add_blank_page - the
    # implementation is not visible here).
    page = PageObject.create_blank_page(self, width, height)
    self.insert_page(page, index)
    return page
def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    The script is registered in the catalog's /Names -> /JavaScript name
    list under a random name, so several scripts can be added.

    Args:
        javascript: Your JavaScript.

    Example:
        This will launch the print window when the PDF is opened.

        >>> from pypdf import PdfWriter
        >>> output = PdfWriter()
        >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

    """
    catalog = self._root_object
    # Names / JavaScript preferred to be able to add multiple scripts
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    name_dict = cast(DictionaryObject, catalog[CA.NAMES])

    if "/JavaScript" not in name_dict:
        name_dict[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    script_dict = cast(DictionaryObject, name_dict["/JavaScript"])
    script_names = cast(ArrayObject, script_dict["/Names"])

    # The list holds name/action pairs. The name of an entry can be
    # anything, so a random UUID is used.
    entry_name = create_string_object(str(uuid.uuid4()))
    script_names.append(entry_name)

    action = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    script_names.append(self._add_object(action))
def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Combines existing content stream(s) with new content (as bytes).

    * ``/Contents`` is an array: a new stream holding ``new_content_data``
      is appended to that array.
    * ``/Contents`` is a single stream: it is replaced by a new stream made
      of the old data, a newline and ``new_content_data``.
    * ``/Contents`` is missing, or resolves to anything else (for instance
      a null object): a new stream holding only ``new_content_data`` is
      installed, so the new content is never silently discarded.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.

    """
    contents_key = NameObject("/Contents")
    existing_content = (
        page[contents_key].get_object() if contents_key in page else None
    )

    if isinstance(existing_content, ArrayObject):
        # Keep the existing streams untouched and add one more at the end.
        appended_stream = StreamObject()
        appended_stream.set_data(new_content_data)
        existing_content.append(self._add_object(appended_stream))
        page[contents_key] = self._add_object(existing_content)
        return

    if isinstance(existing_content, StreamObject):
        # Merge new content to existing StreamObject
        merged_data = existing_content.get_data() + b"\n" + new_content_data
    else:
        # No usable existing content: the page is treated as empty.
        merged_data = new_content_data

    new_stream = StreamObject()
    new_stream.set_data(merged_data)
    page[contents_key] = self._add_object(new_stream)
def update_page_form_field_values(
    self,
    page: Union[PageObject, list[PageObject], None],
    fields: Mapping[str, Union[str, list[str], tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `List[Pageobject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: if the document has no /AcroForm dictionary or that
            dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in acro_form:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                # auto_regenerate was already applied above, hence None here
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return
    appearance_stream_obj: Optional[StreamObject] = None

    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            # Start from a clean slate for every matched field. Otherwise a
            # field that produces no appearance stream (e.g. a signature)
            # would be flattened with the stream of a previously handled one.
            appearance_stream_obj = None
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            # Set the field value
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            # Get or create the field's appearance stream object
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets);
                # We can find the associated appearance stream object
                # within the annotation.
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # Other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # Textbox; we need to generate the appearance stream object
                if isinstance(value, tuple):
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        acro_form, parent_annotation, annotation, value[1], value[2]
                    )
                else:
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        acro_form, parent_annotation, annotation
                    )
                # Add the appearance stream object
                if AA.AP not in annotation:
                    annotation[NameObject(AA.AP)] = DictionaryObject(
                        {NameObject("/N"): self._add_object(appearance_stream_obj)}
                    )
                elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):
                    cast(DictionaryObject, annotation[NameObject(AA.AP)])[
                        NameObject("/N")
                    ] = self._add_object(appearance_stream_obj)
                else:  # [/AP][/N] exists
                    # Reuse the object number of the existing normal
                    # appearance so that references to it stay valid.
                    n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
                    self._objects[n - 1] = appearance_stream_obj
                    appearance_stream_obj.indirect_reference = IndirectObject(n, 0, self)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)
            if flatten and appearance_stream_obj is not None:
                # The widget rectangle is only needed for flattening, so it
                # is looked up here rather than for every field name.
                rectangle = cast(RectangleObject, annotation[AA.Rect])
                self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1])
""" self._info_obj = None if self.incremental: self._objects = [None] * (cast(int, reader.trailer["/Size"]) - 1) for i in range(len(self._objects)): o = reader.get_object(i + 1) if o is not None: self._objects[i] = o.replicate(self) else: self._objects.clear() self._root_object = reader.root_object.clone(self) self._pages = self._root_object.raw_get("/Pages") if len(self._objects) > cast(int, reader.trailer["/Size"]): if self.strict: raise PdfReadError( f"Object count {len(self._objects)} exceeds defined trailer size {reader.trailer['/Size']}" ) logger_warning( f"Object count {len(self._objects)} exceeds defined trailer size {reader.trailer['/Size']}", __name__ ) # must be done here before rewriting if self.incremental: self._original_hash = [ (obj.hash_bin() if obj is not None else 0) for obj in self._objects ] try: self._flatten() except IndexError: raise PdfReadError("Got index error while flattening.") assert self.flattened_pages is not None for p in self.flattened_pages: self._replace_object(cast(IndirectObject, p.indirect_reference).idnum, p) if not self.incremental: p[NameObject("/Parent")] = self._pages if not self.incremental: cast(DictionaryObject, self._pages.get_object())[ NameObject("/Kids") ] = ArrayObject([p.indirect_reference for p in self.flattened_pages]) def clone_document_from_reader( self, reader: PdfReader, after_page_append: Optional[Callable[[PageObject], None]] = None, ) -> None: """ Create a copy (clone) of a document from a PDF file reader cloning section '/Root' and '/Info' and '/ID' of the pdf. Args: reader: PDF file reader instance from which the clone should be created. after_page_append: Callback function that is invoked after each page is appended to the writer. Signature includes a reference to the appended page (delegates to append_pages_from_reader). The single parameter of the callback is a reference to the page just appended to the document. 
def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Set up encryption of this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: Password for opening and reading the PDF file
            under the restrictions given by ``permissions_flag``.
        owner_password: Password for opening the PDF file without any
            restrictions. Defaults to the user password.
        use_128bit: Selects between 128bit (True, the default) and 40bit
            (False) RC4 encryption. Only consulted when ``algorithm`` is
            not given.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". When given, it takes
            precedence over ``use_128bit``.

    Raises:
        ValueError: if ``algorithm`` does not name an attribute of
            ``EncryptAlgorithm``.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is None:
        alg = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        # "AES-256-R5" style names map onto AES_256_R5 style attributes.
        attribute_name = algorithm.replace("-", "_")
        try:
            alg = getattr(EncryptAlgorithm, attribute_name)
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    # The first file identifier is an input of the encryption setup.
    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    entry = self._encryption.write_entry(user_password, owner_password)

    previous_entry = self._encrypt_entry
    if not previous_entry:
        self._add_object(entry)
    else:
        # `encrypt` was called before: put the new entry into the slot of
        # the old one instead of registering an additional object.
        assert previous_entry.indirect_reference is not None
        entry.indirect_reference = previous_entry.indirect_reference
        self._objects[entry.indirect_reference.idnum - 1] = entry
    self._encrypt_entry = entry
def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether this method opened the file itself, and
        the stream that was written to. A stream opened by this method is
        already closed when it is returned; a caller-provided stream is
        flushed and left open.

    Raises:
        ValueError: if ``stream`` is an empty string.

    """
    my_file = False

    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if isinstance(stream, (str, Path)):
        stream = FileIO(stream, "wb")
        my_file = True

    try:
        self.write_stream(stream)
    finally:
        # A file opened here must not leak its handle when writing fails.
        if my_file:
            stream.close()

    if not my_file:
        stream.flush()

    return my_file, stream
Returns: List of new or modified IndirectObjects """ original_hash_count = len(self._original_hash) return [ cast(IndirectObject, obj).indirect_reference for i, obj in enumerate(self._objects) if ( obj is not None and ( i >= original_hash_count or obj.hash_bin() != self._original_hash[i] ) ) ] def _write_increment(self, stream: StreamType) -> None: object_positions = {} object_blocks = [] current_start = -1 current_stop = -2 original_hash_count = len(self._original_hash) for i, obj in enumerate(self._objects): if obj is not None and ( i >= original_hash_count or obj.hash_bin() != self._original_hash[i] ): idnum = i + 1 assert isinstance(obj, PdfObject), "mypy" # first write new/modified object object_positions[idnum] = stream.tell() stream.write(f"{idnum} 0 obj\n".encode()) """ encryption is not operational if self._encryption and obj != self._encrypt_entry: obj = self._encryption.encrypt_object(obj, idnum, 0) """ obj.write_to_stream(stream) stream.write(b"\nendobj\n") # prepare xref if idnum != current_stop: if current_start > 0: object_blocks.append( [current_start, current_stop - current_start] ) current_start = idnum current_stop = idnum + 1 assert current_start > 0, "for pytest only" object_blocks.append([current_start, current_stop - current_start]) # write incremented xref xref_location = stream.tell() xr_id = len(self._objects) + 1 stream.write(f"{xr_id} 0 obj".encode()) init_data = { NameObject("/Type"): NameObject("/XRef"), NameObject("/Size"): NumberObject(xr_id + 1), NameObject("/Root"): self.root_object.indirect_reference, NameObject("/Filter"): NameObject("/FlateDecode"), NameObject("/Index"): ArrayObject( [NumberObject(_it) for _su in object_blocks for _it in _su] ), NameObject("/W"): ArrayObject( [NumberObject(1), NumberObject(4), NumberObject(1)] ), "__streamdata__": b"", } if self._info is not None and ( self._info.indirect_reference.idnum - 1 # type: ignore >= len(self._original_hash) or cast(IndirectObject, self._info).hash_bin() # kept for 
future != self._original_hash[ self._info.indirect_reference.idnum - 1 # type: ignore ] ): init_data[NameObject(TK.INFO)] = self._info.indirect_reference init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref) if self._ID: init_data[NameObject(TK.ID)] = self._ID xr = StreamObject.initialize_from_dictionary(init_data) xr.set_data( b"".join( [struct.pack(b">BIB", 1, _pos, 0) for _pos in object_positions.values()] ) ) xr.write_to_stream(stream) stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof def _write_pdf_structure(self, stream: StreamType) -> tuple[list[int], list[int]]: object_positions = [] free_objects = [] stream.write(self.pdf_header.encode() + b"\n") stream.write(b"%\xE2\xE3\xCF\xD3\n") for idnum, obj in enumerate(self._objects, start=1): if obj is not None: object_positions.append(stream.tell()) stream.write(f"{idnum} 0 obj\n".encode()) if self._encryption and obj != self._encrypt_entry: obj = self._encryption.encrypt_object(obj, idnum, 0) obj.write_to_stream(stream) stream.write(b"\nendobj\n") else: object_positions.append(-1) free_objects.append(idnum) free_objects.append(0) # add 0 to loop in accordance with specification return object_positions, free_objects def _write_xref_table( self, stream: StreamType, object_positions: list[int], free_objects: list[int] ) -> int: xref_location = stream.tell() stream.write(b"xref\n") stream.write(f"0 {len(self._objects) + 1}\n".encode()) stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode()) free_idx = 1 for offset in object_positions: if offset > 0: stream.write(f"{offset:0>10} {0:0>5} n \n".encode()) else: stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode()) free_idx += 1 return xref_location def _write_trailer(self, stream: StreamType, xref_location: int) -> None: """ Write the PDF trailer to the stream. 
@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    The PDF file's document information dictionary, if it exists.

    Reading delegates to the parent class implementation.

    Assigning a mapping empties the current information dictionary (when
    there is one) and then stores the given entries through
    :meth:`add_metadata`. Assigning ``None`` removes the /Info entry from
    the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.
    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, dict[Any, Any]]],
) -> None:
    if value is None:
        self._info = None
        return
    # Replace rather than merge: previous entries are dropped first.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)
def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: dict[IndirectObject, IndirectObject]
    ) -> None:
        """Redirect merged references inside ``obj`` and record what is referenced."""
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                orphans[v.idnum - 1] = False
                if v in crossref:
                    replacement = crossref[v]
                    obj[k] = replacement
                    # The surviving object is referenced from now on, even
                    # if nothing pointed at it before the merge.
                    orphans[replacement.idnum - 1] = False
            else:
                """the filtering on DictionaryObject and ArrayObject only
                will be performed within replace_in_obj"""
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
    cnv_rev: dict[IndirectObject, IndirectObject] = {}
    for k, v in cnv.items():
        cnv_rev.update(zip(v, (k,) * len(v)))

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    if not remove_orphans:
        # The caller asked to keep unreferenced objects.
        return

    # Objects that are only reachable from the trailer are never orphans.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore
    if not is_null_or_none(self._info):
        orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore
    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        pass
    # The encryption dictionary is referenced from the trailer as well.
    encrypt_entry = getattr(self, "_encrypt_entry", None)
    encrypt_ref = getattr(encrypt_entry, "indirect_reference", None)
    if encrypt_ref is not None:
        orphans[encrypt_ref.idnum - 1] = False

    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None
def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item into the outline tree.

    Args:
        page_destination: The item to insert. A page is first wrapped into
            a ``Destination`` titled ``page #<page number>`` using
            ``Fit.fit()``.
        parent: Item under which the new item is inserted; the outline
            root when omitted.
        before: Handed to ``parent.insert_child`` (as its indirect
            reference) to position the new item.
        is_open: Stored on the item as ``/%is_open%``; when False, the
            parents' counters are not incremented on insertion.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward the placement arguments as well; they must not be lost
        # just because the destination was given as a page.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref
def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page this outline item will point to, given as a page
            index, a page object or an indirect reference. With ``None``,
            the item is created without a GoTo action.
        parent: A reference to a parent outline item to create nested
            outline items. The outline root is used when omitted.
        before: Forwarded unchanged to
            :meth:`add_outline_item_destination` together with ``parent``
            (presumably the sibling in front of which the item is
            inserted; confirm against ``TreeObject.insert_child``).
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded unchanged to
            :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        # A Fit in the `italic` slot is taken as a sign of a call using the
        # previous positional layout (without `before`): the arguments are
        # shifted by one position and the call is replayed.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        action_ref = None
    else:
        # NOTE(review): a page_number that is none of the three types below
        # leaves page_ref unassigned, so the `is None` test further down
        # fails on an unbound local; confirm whether such input can occur.
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # No such page (yet): keep the raw number as the target.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        # The outline item points at its destination through a GoTo action.
        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)
def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a named destination into the writer's named-destination array.

    The array returned by ``get_named_dest_root`` is treated as a flat
    sequence of ``title, destination`` pairs. The new pair is placed in front
    of the first existing title that compares greater than ``title``; if
    there is none, it is appended at the end.

    Args:
        title: Name of the destination.
        destination: Destination array or a reference to it.

    """
    pairs = self.get_named_dest_root()
    # Titles sit at the even positions, their destinations right after them.
    for position in range(0, len(pairs), 2):
        if title < pairs[position]:
            # Insert the destination first, then the title in front of it,
            # so the pair ends up ordered as (title, destination).
            pairs.insert(position, destination)
            pairs.insert(position, TextStringObject(title))
            return
    pairs.extend([TextStringObject(title), destination])
""" for page in self.pages: self._remove_annots_from_page(page, subtypes) def _remove_annots_from_page( self, page: Union[IndirectObject, PageObject, DictionaryObject], subtypes: Optional[Iterable[str]], ) -> None: page = cast(DictionaryObject, page.get_object()) if PG.ANNOTS in page: i = 0 while i < len(cast(ArrayObject, page[PG.ANNOTS])): an = cast(ArrayObject, page[PG.ANNOTS])[i] obj = cast(DictionaryObject, an.get_object()) if subtypes is None or cast(str, obj["/Subtype"]) in subtypes: if isinstance(an, IndirectObject): self._objects[an.idnum - 1] = NullObject() # to reduce PDF size del page[PG.ANNOTS][i] # type:ignore else: i += 1 def remove_objects_from_page( self, page: Union[PageObject, DictionaryObject], to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]], text_filters: Optional[dict[str, Any]] = None ) -> None: """ Remove objects specified by ``to_delete`` from the given page. Args: page: Page object to clean up. to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag`` or a list of ObjectDeletionFlag text_filters: Properties of text to be deleted, if applicable. Optional. This is a Python dictionary with the following properties: * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted. 
""" if isinstance(to_delete, (list, tuple)): for to_d in to_delete: self.remove_objects_from_page(page, to_d) return None assert isinstance(to_delete, ObjectDeletionFlag) if to_delete & ObjectDeletionFlag.LINKS: return self._remove_annots_from_page(page, ("/Link",)) if to_delete & ObjectDeletionFlag.ATTACHMENTS: return self._remove_annots_from_page( page, ("/FileAttachment", "/Sound", "/Movie", "/Screen") ) if to_delete & ObjectDeletionFlag.OBJECTS_3D: return self._remove_annots_from_page(page, ("/3D",)) if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS: return self._remove_annots_from_page(page, None) jump_operators = [] if to_delete & ObjectDeletionFlag.DRAWING_IMAGES: jump_operators = [ b"w", b"J", b"j", b"M", b"d", b"i", b"W", b"W*", b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n", b"m", b"l", b"c", b"v", b"y", b"h", b"re", b"sh" ] if to_delete & ObjectDeletionFlag.TEXT: jump_operators = [b"Tj", b"TJ", b"'", b'"'] if not isinstance(page, PageObject): page = PageObject(self, page.indirect_reference) # pragma: no cover if "/Contents" in page: content = cast(ContentStream, page.get_contents()) images, forms = self._remove_objects_from_page__clean_forms( elt=page, stack=[], jump_operators=jump_operators, to_delete=to_delete, text_filters=text_filters, ) self._remove_objects_from_page__clean( content=content, images=images, forms=forms, jump_operators=jump_operators, to_delete=to_delete, text_filters=text_filters ) page.replace_contents(content) return [], [] # type: ignore[return-value] def _remove_objects_from_page__clean( self, content: ContentStream, images: list[str], forms: list[str], jump_operators: list[bytes], to_delete: ObjectDeletionFlag, text_filters: Optional[dict[str, Any]] = None, ) -> None: font_id = None font_ids_to_delete = [] if text_filters and to_delete & ObjectDeletionFlag.TEXT: font_ids_to_delete = text_filters.get("font_ids", []) i = 0 while i < len(content.operations): operands, operator = content.operations[i] if operator == 
b"Tf": font_id = operands[0] if ( ( operator == b"INLINE IMAGE" and (to_delete & ObjectDeletionFlag.INLINE_IMAGES) ) or (operator in jump_operators) or ( operator == b"Do" and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES) and (operands[0] in images) ) ): if ( not to_delete & ObjectDeletionFlag.TEXT or (to_delete & ObjectDeletionFlag.TEXT and not text_filters) or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete) ): del content.operations[i] else: i += 1 else: i += 1 content.get_data() # this ensures ._data is rebuilt from the .operations def _remove_objects_from_page__clean_forms( self, elt: DictionaryObject, stack: list[DictionaryObject], jump_operators: list[bytes], to_delete: ObjectDeletionFlag, text_filters: Optional[dict[str, Any]] = None, ) -> tuple[list[str], list[str]]: # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference if (elt in stack) or ( hasattr(elt, "indirect_reference") and any( elt.indirect_reference == getattr(x, "indirect_reference", -1) for x in stack ) ): # to prevent infinite looping return [], [] # pragma: no cover try: d = cast( dict[Any, Any], cast(DictionaryObject, elt["/Resources"])["/XObject"], ) except KeyError: d = {} images = [] forms = [] for k, v in d.items(): o = v.get_object() try: content: Any = None if ( to_delete & ObjectDeletionFlag.XOBJECT_IMAGES and o["/Subtype"] == "/Image" ): content = NullObject() # to delete the image keeping the entry images.append(k) if o["/Subtype"] == "/Form": forms.append(k) if isinstance(o, ContentStream): content = o else: content = ContentStream(o, self) content.update( { k1: v1 for k1, v1 in o.items() if k1 not in ["/Length", "/Filter", "/DecodeParms"] } ) try: content.indirect_reference = o.indirect_reference except AttributeError: # pragma: no cover pass stack.append(elt) # clean subforms self._remove_objects_from_page__clean_forms( elt=content, stack=stack, jump_operators=jump_operators, to_delete=to_delete, 
def remove_text(self, font_names: Optional[list[str]] = None) -> None:
    """
    Remove text from the PDF.

    Content streams reference fonts through resource names such as "/F1" or
    "/T1_0". When ``font_names`` is given, each page's ``/Resources`` tree is
    searched for font dictionaries, their ``/BaseFont`` names are normalized
    (leading "/" and any subset prefix up to "+" removed), and the resource
    names of the matching fonts are handed to ``remove_objects_from_page``
    as the ``font_ids`` text filter.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.

    """
    wanted_fonts = font_names or []

    def collect_font_ids(
        obj: Any,
        found: dict[str, list[Any]],
        key: Optional[str],
        active: set[int],
    ) -> None:
        # Walk dictionaries and arrays recursively, recording for every font
        # dictionary the key under which it was reached.
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        is_mapping = isinstance(obj, dict)
        if not is_mapping and not isinstance(obj, (list, ArrayObject)):
            return
        marker = id(obj)
        if marker in active:
            # The container is already being visited further up the call
            # chain: a cyclic reference. Stop here instead of recursing
            # until RecursionError.
            return
        # Only containers on the *current* path are tracked, so a font
        # shared under several resource names is still reported for each.
        active.add(marker)
        try:
            if is_mapping:
                if obj.get("/Type") == "/Font":
                    base_font = obj.get("/BaseFont", "")
                    # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                    normalized = base_font.lstrip("/").split("+")[-1]
                    resource_ids = found.setdefault(normalized, [])
                    if key not in resource_ids:
                        resource_ids.append(key)
                for child_key in obj:
                    collect_font_ids(obj[child_key], found, child_key, active)
            else:
                for child in obj:
                    collect_font_ids(child, found, None, active)
        finally:
            active.discard(marker)

    for page in self.pages:
        text_filters: dict[str, Any] = {}
        if wanted_fonts:
            found: dict[str, list[Any]] = {}
            collect_font_ids(page.get("/Resources"), found, None, set())
            # Add relevant resource names for removal
            resource_ids_to_remove: list[Any] = []
            for font_name in wanted_fonts:
                resource_ids_to_remove.extend(found.get(font_name, []))
            text_filters["font_ids"] = resource_ids_to_remove
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)
def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A plain string that is not one of the known layouts only triggers a
    warning; it is still converted to a ``NameObject`` and stored. A value
    that already is a ``NameObject`` is stored without any check.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # Join with ", " so the message lists the layouts readably
            # (previously a tuple repr was interpolated by mistake).
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})
@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property.

    Reading returns what ``_get_page_mode`` reports. Assigning a plain string
    that is not a known mode logs a warning but still stores the value;
    a ``NameObject`` is stored as given.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel

    """
    return self._get_page_mode()

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})
def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.

    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list). The ``/Dest`` entry of each annotation
    is examined first; only when it is not a name is the ``/D`` entry of the
    annotation's ``/A`` action considered.

    Args:
        page: The page, or a reference to it, whose annotations are cleaned.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        direct_dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(direct_dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(direct_dest)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page
def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, tuple[int, int], tuple[int, int, int], list[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        tuple[int, int],
        tuple[int, int, int],
        list[int],
        list[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file. A tuple, list or :class:`PageRange` given here is
            taken as the ``pages`` argument instead.
        pages: Can be a :class:`PageRange`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()
    if isinstance(outline_item, (tuple, list, PageRange)):
        # A page selection arrived in the ``outline_item`` slot, so the
        # following positional arguments are shifted by one as well.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_item = None
    self.merge(
        None,
        fileobj,
        outline_item,
        pages,
        import_outline,
        excluded_fields,
    )
excluded_fields: provide the list of fields/keys to be ignored if ``/Annots`` is part of the list, the annotation will be ignored if ``/B`` is part of the list, the articles will be ignored Raises: TypeError: The pages attribute is not configured properly """ if isinstance(fileobj, PdfDocCommon): reader = fileobj else: stream, _encryption_obj = self._create_stream(fileobj) # Create a new PdfReader instance using the stream # (either file or BytesIO or StringIO) created above reader = PdfReader(stream, strict=False) # type: ignore[arg-type] if excluded_fields is None: excluded_fields = () # Find the range of pages to merge. if pages is None: pages = list(range(len(reader.pages))) elif isinstance(pages, PageRange): pages = list(range(*pages.indices(len(reader.pages)))) elif isinstance(pages, list): pass # keep unchanged elif isinstance(pages, tuple) and len(pages) <= 3: pages = list(range(*pages)) elif not isinstance(pages, tuple): raise TypeError( '"pages" must be a tuple of (start, stop[, step]) or a list' ) srcpages = {} for page in pages: if isinstance(page, PageObject): pg = page else: pg = reader.pages[page] assert pg.indirect_reference is not None if position is None: # numbers in the exclude list identifies that the exclusion is # only applicable to 1st level of cloning srcpages[pg.indirect_reference.idnum] = self.add_page( pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore ) else: srcpages[pg.indirect_reference.idnum] = self.insert_page( pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore ) position += 1 srcpages[pg.indirect_reference.idnum].original_page = pg reader._named_destinations = ( reader.named_destinations ) # need for the outline processing below arr: Any for dest in reader._named_destinations.values(): self._merge__process_named_dests(dest=dest, reader=reader, srcpages=srcpages) outline_item_typ: TreeObject if outline_item is not None: outline_item_typ = cast( "TreeObject", self.add_outline_item( 
def _merge__process_named_dests(self, dest: Any, reader: PdfDocCommon, srcpages: dict[int, PageObject]) -> None:
    """
    Copy one named destination of ``reader`` into this writer when possible.

    Nothing is added when the title is already present in this writer's
    ``/Names`` → ``/Dests`` → ``/Names`` list, when the destination has no
    page, or when its page is not among ``srcpages``.

    Args:
        dest: Destination read from ``reader``; it must offer ``dest_array``
            and item access to ``"/Title"`` and ``"/Page"``.
        reader: Document the destination comes from.
        srcpages: Pages added to this writer, keyed by the ``idnum`` of the
            corresponding page reference in the source document.

    """
    # The first element of this array is rewritten below to point at the
    # page inside this writer.
    arr: Any = dest.dest_array
    if "/Names" in self._root_object and dest["/Title"] in cast(
        list[Any],
        cast(
            DictionaryObject,
            cast(DictionaryObject, self._root_object["/Names"]).get("/Dests", DictionaryObject()),
        ).get("/Names", DictionaryObject()),
    ):
        # already exists: should not duplicate it
        pass
    elif dest["/Page"] is None or isinstance(dest["/Page"], NullObject):
        # destination without a target page: nothing to register
        pass
    elif isinstance(dest["/Page"], int):
        # the page reference is a page number normally not a PDF Reference
        # page numbers as int are normally accepted only in external goto
        try:
            p = reader.pages[dest["/Page"]]
        except IndexError:
            # page number outside of the source document
            return
        assert p.indirect_reference is not None
        try:
            arr[NumberObject(0)] = NumberObject(
                srcpages[p.indirect_reference.idnum].page_number
            )
            self.add_named_destination_array(dest["/Title"], arr)
        except KeyError:
            # the referenced page is not part of the pages being merged
            pass
    elif dest["/Page"].indirect_reference.idnum in srcpages:
        # redirect the destination to the page object held by this writer
        arr[NumberObject(0)] = srcpages[
            dest["/Page"].indirect_reference.idnum
        ].indirect_reference
        self.add_named_destination_array(dest["/Title"], arr)
def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Look up the reference of the page in this writer that matches ``page``.

    Args:
        page: Source page, either as a page dictionary (``/Type`` equal to
            ``/Page``) or as an indirect reference.
        pages: Pages of this writer keyed by the ``idnum`` of the source
            page reference.
        reader: Not used by the lookup itself.

    Returns:
        The indirect reference of the matching page, or ``None`` when
        ``page`` is of an unsupported kind or no match can be resolved.

    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # Nothing that can identify a page.
        return None
    try:
        return pages[source_ref.idnum].indirect_reference  # type: ignore
    except Exception:
        # Any failure while resolving the page is reported as "not found".
        return None
def _get_filtered_outline(
    self,
    node: Any,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    An outline item is kept when its page resolves through
    ``_get_cloned_page`` or when at least one of its descendants is kept.
    The kept descendants of each item are stored on it as
    ``_filtered_children``.

    Args:
        node: Outline root or outline item (or a reference to one) to start
            from; ``None`` and null objects yield an empty result.
        pages: Pages of this writer, passed on to ``_get_cloned_page``.
        reader: Source document, used to build the outline items.

    Returns:
        A list of destination objects.

    """
    new_outline = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()
    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # Container without a title: continue with its first child.
        node = node.get("/First", None)
        if node is not None:
            node = node.get_object()
            new_outline += self._get_filtered_outline(node, pages, reader)
    else:
        v: Union[None, IndirectObject, NullObject]
        # Walk the sibling chain through the "/Next" entries.
        while node is not None:
            node = node.get_object()
            o = cast("Destination", reader._build_outline_item(node))
            v = self._get_cloned_page(cast("PageObject", o["/Page"]), pages, reader)
            if v is None:
                # Page not available in this writer: mark it with a null object.
                v = NullObject()
            o[NameObject("/Page")] = v
            if "/First" in node:
                o._filtered_children = self._get_filtered_outline(
                    node["/First"], pages, reader
                )
            else:
                o._filtered_children = []
            if (
                not isinstance(o["/Page"], NullObject)
                or len(o._filtered_children) > 0
            ):
                new_outline.append(o)
            node = node.get("/Next", None)
    return new_outline
def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            If set to None or omitted, all tables will be reset.

    Raises:
        Exception: If ``reader`` is neither None, a PdfReader nor an
            IndirectObject.
    """
    if reader is None:
        self._id_translated = {}
    elif isinstance(reader, PdfReader):
        # A reader without a table is simply ignored.
        self._id_translated.pop(id(reader), None)
    elif isinstance(reader, IndirectObject):
        # The table is keyed by the id of the document owning the reference.
        self._id_translated.pop(id(reader.pdf), None)
    else:
        # f-string so the offending value actually shows up in the message.
        raise Exception(f"invalid parameter {reader}")
def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    This method performs no validation of its arguments.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting
    from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               If ``0`` (the default) or ``None``, no ``/St`` entry is
               written, so numbering starts at the PDF default of 1.
    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # `start` is Optional: None must not be passed on to NumberObject.
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # No labels yet: create the tree with a default label at page 0.
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    # presumably the position of the next label entry after the range start
    # -- confirm against nums_next
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        # Pages following the range fall back to the default label unless
        # another label already begins right after the range.
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels
class AnnotationDictionary(DictionaryObject, ABC):
    """
    Base class of the annotation types defined in this package.

    The constructor only sets ``/Type`` to ``/Annot``; everything else is
    added by the subclasses.
    """

    def __init__(self) -> None:
        super().__init__()

        # NameObject is already imported at module level, so no local
        # import is needed here.
        # /Rect should not be added here as Polygon and PolyLine can automatically set it
        self[NameObject("/Type")] = NameObject("/Annot")
        # The flags were NOT added to the constructor on purpose:
        # We expect that most users don't want to change the default.
        # If they do, they can use the property. The default is 0.

    @property
    def flags(self) -> AnnotationFlag:
        """The value stored under ``/F``, or ``AnnotationFlag(0)`` if absent."""
        return self.get(NameObject("/F"), AnnotationFlag(0))

    @flags.setter
    def flags(self, value: AnnotationFlag) -> None:
        """Store ``value`` under ``/F`` as a number object."""
        self[NameObject("/F")] = NumberObject(value)
class Text(MarkupAnnotation):
    """
    A text annotation.

    Args:
        rect: array of four integers ``[xLL, yLL, xUR, yUR]``
            specifying the clickable rectangular area
        text: The text that is added to the document
        open: Value written to the ``/Open`` entry.
        flags: Annotation flags. They are written to ``/F``, the key also
            used by the ``flags`` property of the base class.
        **kwargs: Forwarded to :class:`MarkupAnnotation`.
    """

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        text: str,
        open: bool = False,
        flags: int = NO_FLAGS,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self[NameObject("/Subtype")] = NameObject("/Text")
        self[NameObject("/Rect")] = RectangleObject(rect)
        self[NameObject("/Contents")] = TextStringObject(text)
        self[NameObject("/Open")] = BooleanObject(open)
        # Historical key; kept so that code reading "/Flags" keeps working.
        self[NameObject("/Flags")] = NumberObject(flags)
        # "/F" is the annotation flags entry of the PDF specification and
        # the key read by AnnotationDictionary.flags.
        self[NameObject("/F")] = NumberObject(flags)
class PolyLine(MarkupAnnotation):
    """
    A polyline annotation through the given vertices.

    ``/Rect`` is set to the bounding rectangle of the vertices.

    Args:
        vertices: The ``(x, y)`` points of the polyline.
        **kwargs: Forwarded to :class:`MarkupAnnotation`.

    Raises:
        ValueError: If ``vertices`` is empty.
    """

    def __init__(
        self,
        vertices: list[Vertex],
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if len(vertices) == 0:
            raise ValueError("A polyline needs at least 1 vertex with two coordinates")
        # Vertices are typed as floats: store them as FloatObject, like
        # Line does for its end points, so fractional coordinates are kept.
        # Unpacking into (x, y) still rejects malformed vertices.
        coord_list = [
            FloatObject(coordinate)
            for x, y in vertices
            for coordinate in (x, y)
        ]
        self.update(
            {
                NameObject("/Subtype"): NameObject("/PolyLine"),
                NameObject("/Vertices"): ArrayObject(coord_list),
                NameObject("/Rect"): RectangleObject(_get_bounding_rectangle(vertices)),
            }
        )
class Polygon(MarkupAnnotation):
    """
    A polygon annotation with the given vertices.

    ``/Rect`` is set to the bounding rectangle of the vertices and the
    intent (``/IT``) is set to ``/PolygonCloud``.

    Args:
        vertices: The ``(x, y)`` corner points of the polygon.
        **kwargs: Forwarded to :class:`MarkupAnnotation`.

    Raises:
        ValueError: If ``vertices`` is empty.
    """

    def __init__(
        self,
        vertices: list[tuple[float, float]],
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if len(vertices) == 0:
            raise ValueError("A polygon needs at least 1 vertex with two coordinates")
        # Vertices are typed as floats: store them as FloatObject so that
        # fractional coordinates are kept.
        # Unpacking into (x, y) still rejects malformed vertices.
        coord_list = [
            FloatObject(coordinate)
            for x, y in vertices
            for coordinate in (x, y)
        ]
        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Polygon"),
                NameObject("/Vertices"): ArrayObject(coord_list),
                NameObject("/IT"): NameObject("/PolygonCloud"),
                NameObject("/Rect"): RectangleObject(_get_bounding_rectangle(vertices)),
            }
        )
class Link(AnnotationDictionary):
    """
    A link annotation targeting either a URL or a page of the document.

    Exactly one of ``url`` and ``target_page_index`` has to be given.

    Args:
        rect: Rectangle written to ``/Rect``.
        border: Optional border description. The first three entries are
            stored as numbers; a fourth entry, if present, is stored as an
            array of numbers. Without it the border is ``[0, 0, 0]``.
        url: Target of an external link (stored as a ``/URI`` action).
        target_page_index: Index of the page an internal link points to.
        fit: Fit of the target page, used for internal links only.
        **kwargs: Forwarded to :class:`AnnotationDictionary`.

    Raises:
        ValueError: If none or both of ``url`` and ``target_page_index``
            are given.
    """

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        border: Optional[ArrayObject] = None,
        url: Optional[str] = None,
        target_page_index: Optional[int] = None,
        fit: Fit = DEFAULT_FIT,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if TYPE_CHECKING:
            from ..types import BorderArrayType  # noqa: PLC0415

        has_url = url is not None
        has_page = target_page_index is not None
        if not (has_url or has_page):
            raise ValueError(
                "Either 'url' or 'target_page_index' have to be provided. Both were None."
            )
        if has_url and has_page:
            raise ValueError(
                "Either 'url' or 'target_page_index' have to be provided. "
                f"{url=}, {target_page_index=}"
            )

        border_arr: BorderArrayType
        if border is None:
            border_arr = [NumberObject(0)] * 3
        else:
            border_arr = [NumberObject(n) for n in border[:3]]
            if len(border) == 4:
                # The optional fourth entry is a nested array.
                border_arr.append(ArrayObject([NumberObject(n) for n in border[3]]))

        common_entries = {
            NameObject("/Type"): NameObject("/Annot"),
            NameObject("/Subtype"): NameObject("/Link"),
            NameObject("/Rect"): RectangleObject(rect),
            NameObject("/Border"): ArrayObject(border_arr),
        }
        self.update(common_entries)

        # Exactly one of the two targets is set at this point.
        if has_url:
            self[NameObject("/A")] = DictionaryObject(
                {
                    NameObject("/S"): NameObject("/URI"),
                    NameObject("/Type"): NameObject("/Action"),
                    NameObject("/URI"): TextStringObject(url),
                }
            )
        else:
            # This needs to be updated later!
            self[NameObject("/Dest")] = DictionaryObject(
                {
                    "target_page_index": NumberObject(target_page_index),
                    "fit": NameObject(fit.fit_type),
                    "fit_args": fit.fit_args,
                }
            )
class UserAccessPermissions(IntFlag):
    """
    User access permission bits.

    Table 3.20 User access permissions; Table 22 of the 2.0 manual.
    Members named ``R<number>`` are the reserved entries.
    """

    R1 = 1 << 0
    R2 = 1 << 1
    PRINT = 1 << 2
    MODIFY = 1 << 3
    EXTRACT = 1 << 4
    ADD_OR_MODIFY = 1 << 5
    R7 = 1 << 6
    R8 = 1 << 7
    FILL_FORM_FIELDS = 1 << 8
    EXTRACT_TEXT_AND_GRAPHICS = 1 << 9
    ASSEMBLE_DOC = 1 << 10
    PRINT_TO_REPRESENTATION = 1 << 11
    R13 = 1 << 12
    R14 = 1 << 13
    R15 = 1 << 14
    R16 = 1 << 15
    R17 = 1 << 16
    R18 = 1 << 17
    R19 = 1 << 18
    R20 = 1 << 19
    R21 = 1 << 20
    R22 = 1 << 21
    R23 = 1 << 22
    R24 = 1 << 23
    R25 = 1 << 24
    R26 = 1 << 25
    R27 = 1 << 26
    R28 = 1 << 27
    R29 = 1 << 28
    R30 = 1 << 29
    R31 = 1 << 30
    R32 = 1 << 31

    @classmethod
    def _is_reserved(cls, name: str) -> bool:
        """Tell whether ``name`` is a reserved entry, i.e. ``R`` plus digits."""
        if not name.startswith("R"):
            return False
        return name[1:].isdigit()

    @classmethod
    def _is_active(cls, name: str) -> bool:
        """Tell whether the reserved ``name`` defaults to 1 (all but R1/R2)."""
        return name not in ("R1", "R2")

    def to_dict(self) -> dict[str, bool]:
        """Map the lower-cased name of each non-reserved flag to its state."""
        return {
            name.lower(): (self & flag) == flag
            for name, flag in UserAccessPermissions.__members__.items()
            if not UserAccessPermissions._is_reserved(name)
        }

    @classmethod
    def from_dict(cls, value: dict[str, bool]) -> "UserAccessPermissions":
        """
        Build the flag value from a verbose name mapping.

        Reserved entries always get their required value; missing names
        count as inactive.

        Raises:
            ValueError: If ``value`` contains keys that match no flag.
        """
        remaining = value.copy()
        permissions = cls(0)
        for name, flag in cls.__members__.items():
            if cls._is_reserved(name):
                enabled = cls._is_active(name)
            else:
                enabled = remaining.pop(name.lower(), False)
            if enabled:
                permissions |= flag
        if remaining:
            raise ValueError(f"Unknown dictionary keys: {remaining!r}")
        return permissions

    @classmethod
    def all(cls) -> "UserAccessPermissions":
        """Return the value with every bit set except R1 and R2."""
        every_bit = 2**32 - 1
        return cls(every_bit - cls.R1 - cls.R2)
class FileSpecificationDictionaryEntries:
    """Keys of a file specification dictionary (Table 3.41)."""

    Type = "/Type"

    # File system name used to interpret this file specification.
    FS = "/FS"

    # File specification string, in the form described in §3.10.1.
    F = "/F"
    # Unicode string of the file, as described in §3.10.1.
    UF = "/UF"

    DOS = "/DOS"
    Mac = "/Mac"
    Unix = "/Unix"

    ID = "/ID"
    V = "/V"

    # Dictionary holding a subset of the keys F, UF, DOS, Mac and Unix.
    EF = "/EF"
    # Dictionary holding arrays of /EmbeddedFile.
    RF = "/RF"

    # Description of the file.
    DESC = "/Desc"

    # NOTE(review): the specification seems to spell the collection item
    # entry "/CI" (capital i) -- confirm whether "/Cl" is intended.
    Cl = "/Cl"
class FilterTypeAbbreviations:
    """
    Abbreviated filter names.

    §8.9.7 of the 1.7 and 2.0 references.
    """

    AHx = "/AHx"  # short for /ASCIIHexDecode
    A85 = "/A85"  # short for /ASCII85Decode
    LZW = "/LZW"  # short for /LZWDecode
    FL = "/Fl"  # short for /FlateDecode
    RL = "/RL"  # short for /RunLengthDecode
    CCF = "/CCF"  # short for /CCITTFaxDecode
    DCT = "/DCT"  # short for /DCTDecode
class ImageAttributes:
    """Keys of an image dictionary (§11.6.5 of the 1.7 and 2.0 references)."""

    # Required names
    TYPE = "/Type"  # must be /XObject
    SUBTYPE = "/Subtype"  # must be /Image
    NAME = "/Name"
    COLOR_SPACE = "/ColorSpace"

    # Required integers
    WIDTH = "/Width"
    HEIGHT = "/Height"
    BITS_PER_COMPONENT = "/BitsPerComponent"

    # Optional entries
    DECODE = "/Decode"  # array
    INTENT = "/Intent"  # string
    INTERPOLATE = "/Interpolate"  # boolean
    IMAGE_MASK = "/ImageMask"  # boolean
    S_MASK = "/SMask"  # dictionary or name

    # 1-bit image mask stream
    MASK = "/Mask"
class FieldDictionaryAttributes:
    """
    Entries common to all field dictionaries (Table 8.69 PDF 1.7 reference)
    (*very partially documented here*).

    FFBits provides the constants used for `/Ff` from Table 8.70/8.75/8.77/8.79
    """

    FT = "/FT"  # name, required for terminal fields
    Parent = "/Parent"  # dictionary, required for children
    Kids = "/Kids"  # array, sometimes required
    T = "/T"  # text string, optional
    TU = "/TU"  # text string, optional
    TM = "/TM"  # text string, optional
    Ff = "/Ff"  # integer, optional
    V = "/V"  # text string or array, optional
    DV = "/DV"  # text string, optional
    AA = "/AA"  # dictionary, optional
    Opt = "/Opt"  # array, optional

    class FfBits(IntFlag):
        """
        Ease building /Ff flags
        Some entries may be specific to:

        * Text (Tx) (Table 8.75 PDF 1.7 reference)
        * Buttons (Btn) (Table 8.77 PDF 1.7 reference)
        * Choice (Ch) (Table 8.79 PDF 1.7 reference)
        """

        ReadOnly = 1 << 0
        """common to Tx/Btn/Ch in Table 8.70"""
        Required = 1 << 1
        """common to Tx/Btn/Ch in Table 8.70"""
        NoExport = 1 << 2
        """common to Tx/Btn/Ch in Table 8.70"""

        Multiline = 1 << 12
        """Tx"""
        Password = 1 << 13
        """Tx"""

        NoToggleToOff = 1 << 14
        """Btn"""
        Radio = 1 << 15
        """Btn"""
        Pushbutton = 1 << 16
        """Btn"""

        Combo = 1 << 17
        """Ch"""
        Edit = 1 << 18
        """Ch"""
        Sort = 1 << 19
        """Ch"""

        FileSelect = 1 << 20
        """Tx"""

        MultiSelect = 1 << 21
        """Ch"""

        DoNotSpellCheck = 1 << 22
        """Tx/Ch"""
        DoNotScroll = 1 << 23
        """Tx"""
        Comb = 1 << 24
        """Tx"""

        # RadiosInUnison and RichText deliberately share one bit: the bit
        # is interpreted per field type, so RichText is an alias here.
        RadiosInUnison = 1 << 25
        """Btn"""

        RichText = 1 << 25
        """Tx"""

        CommitOnSelChange = 1 << 26
        """Ch"""

    @classmethod
    def attributes(cls) -> tuple[str, ...]:
        """
        Get a tuple of all the attributes present in a Field Dictionary.

        This method returns a tuple of attribute constants defined in the
        FieldDictionaryAttributes class (``Opt`` is not part of it).
        These attributes correspond to the entries that are common to all
        field dictionaries as specified in the PDF 1.7 reference.

        Returns:
            A tuple containing the attribute constants.
        """
        return (
            cls.TM,
            cls.T,
            cls.FT,
            cls.Parent,
            cls.TU,
            cls.Ff,
            cls.V,
            cls.DV,
            cls.Kids,
            cls.AA,
        )

    @classmethod
    def attributes_dict(cls) -> dict[str, str]:
        """
        Get a dictionary of attribute keys and their human-readable names.

        This method returns a dictionary where the keys are attribute
        constants defined in the FieldDictionaryAttributes class and the
        values are their corresponding human-readable names. Only the
        entries listed below are covered.

        Returns:
            A dictionary containing attribute keys and their names.
        """
        return {
            cls.FT: "Field Type",
            cls.Parent: "Parent",
            cls.T: "Field Name",
            cls.TU: "Alternate Field Name",
            cls.TM: "Mapping Name",
            cls.Ff: "Field Flags",
            cls.V: "Value",
            cls.DV: "Default Value",
        }
class OutlineFontFlag(IntFlag):
    """Combinable flag values used for formatting an outline font."""

    italic = 1 << 0
    bold = 1 << 1
""" DECIMAL = "/D" # Decimal Arabic numerals UPPERCASE_ROMAN = "/R" # Uppercase Roman numerals LOWERCASE_ROMAN = "/r" # Lowercase Roman numerals UPPERCASE_LETTER = "/A" # Uppercase letters LOWERCASE_LETTER = "/a" # Lowercase letters class AnnotationFlag(IntFlag): """See §12.5.3 "Annotation Flags".""" INVISIBLE = 1 HIDDEN = 2 PRINT = 4 NO_ZOOM = 8 NO_ROTATE = 16 NO_VIEW = 32 READ_ONLY = 64 LOCKED = 128 TOGGLE_NO_VIEW = 256 LOCKED_CONTENTS = 512 PDF_KEYS = ( AnnotationDictionaryAttributes, CatalogAttributes, CatalogDictionary, CcittFaxDecodeParameters, CheckboxRadioButtonAttributes, ColorSpaces, Core, DocumentInformationAttributes, EncryptionDictAttributes, FieldDictionaryAttributes, FileSpecificationDictionaryEntries, FilterTypeAbbreviations, FilterTypes, GoToActionArguments, GraphicsStateParameters, ImageAttributes, InteractiveFormDictEntries, LzwFilterParameters, PageAttributes, PageLayouts, PagesAttributes, Resources, StreamAttributes, TrailerKeys, TypArguments, TypFitArguments, ) class ImageType(IntFlag): NONE = 0 XOBJECT_IMAGES = auto() INLINE_IMAGES = auto() DRAWING_IMAGES = auto() ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES IMAGES = ALL # for consistency with ObjectDeletionFlag _INLINE_IMAGE_VALUE_MAPPING = { "/G": "/DeviceGray", "/RGB": "/DeviceRGB", "/CMYK": "/DeviceCMYK", "/I": "/Indexed", "/AHx": "/ASCIIHexDecode", "/A85": "/ASCII85Decode", "/LZW": "/LZWDecode", "/Fl": "/FlateDecode", "/RL": "/RunLengthDecode", "/CCF": "/CCITTFaxDecode", "/DCT": "/DCTDecode", "/DeviceGray": "/DeviceGray", "/DeviceRGB": "/DeviceRGB", "/DeviceCMYK": "/DeviceCMYK", "/Indexed": "/Indexed", "/ASCIIHexDecode": "/ASCIIHexDecode", "/ASCII85Decode": "/ASCII85Decode", "/LZWDecode": "/LZWDecode", "/FlateDecode": "/FlateDecode", "/RunLengthDecode": "/RunLengthDecode", "/CCITTFaxDecode": "/CCITTFaxDecode", "/DCTDecode": "/DCTDecode", "/RelativeColorimetric": "/RelativeColorimetric", } _INLINE_IMAGE_KEY_MAPPING = { "/BPC": "/BitsPerComponent", "/CS": "/ColorSpace", "/D": 
"/Decode", "/DP": "/DecodeParms", "/F": "/Filter", "/H": "/Height", "/W": "/Width", "/I": "/Interpolate", "/Intent": "/Intent", "/IM": "/ImageMask", "/BitsPerComponent": "/BitsPerComponent", "/ColorSpace": "/ColorSpace", "/Decode": "/Decode", "/DecodeParms": "/DecodeParms", "/Filter": "/Filter", "/Height": "/Height", "/Width": "/Width", "/Interpolate": "/Interpolate", "/ImageMask": "/ImageMask", } class AFRelationship: """ Associated file relationship types, defining the relationship between the PDF component and the associated file. Defined in table 43 of the PDF 2.0 reference. """ SOURCE = "/Source" # Original content source DATA = "/Data" # Base data for visual presentation ALTERNATIVE = "/Alternative" # Alternative content representation SUPPLEMENT = "/Supplement" # Supplemental representation of original source/data ENCRYPTED_PAYLOAD = "/EncryptedPayload" # Encrypted payload document FORM_DATA = "/FormData" # Data associated with AcroForm of this PDF SCHEMA = "/Schema" # Schema definition for associated object UNSPECIFIED = "/Unspecified" # Not known or cannot be described with values class BorderStyles: """ A class defining border styles used in PDF documents. Defined in table 168 of the PDF 2.0 reference. """ BEVELED = "/B" DASHED = "/D" INSET = "/I" SOLID = "/S" UNDERLINED = "/U" class FontFlags(IntFlag): """ A class defining font flags in PDF document font descriptor resources. Defined in table 121 of the PDF 2.0 reference. """ FIXED_PITCH = 1 << 0 SERIF = 1 << 1 SYMBOLIC = 1 << 2 SCRIPT = 1 << 3 NONSYMBOLIC = 1 << 5 ITALIC = 1 << 6 ALL_CAP = 1 << 16 SMALL_CAP = 1 << 17 FORCE_BOLD = 1 << 18 ================================================ FILE: pypdf/errors.py ================================================ """ All errors/exceptions pypdf raises and all of the warnings it uses. Please note that broken PDF files might cause other Exceptions. 
""" class DeprecationError(Exception): """Raised when a deprecated feature is used.""" class DependencyError(Exception): """ Raised when a required dependency (a library or module that pypdf depends on) is not available or cannot be imported. """ class PyPdfError(Exception): """Base class for all exceptions raised by pypdf.""" class PdfReadError(PyPdfError): """Raised when there is an issue reading a PDF file.""" class PageSizeNotDefinedError(PyPdfError): """Raised when the page size of a PDF document is not defined.""" class PdfReadWarning(UserWarning): """Issued when there is a potential issue reading a PDF file, but it can still be read.""" class PdfStreamError(PdfReadError): """Raised when there is an issue reading the stream of data in a PDF file.""" class ParseError(PyPdfError): """ Raised when there is an issue parsing (analyzing and understanding the structure and meaning of) a PDF file. """ class FileNotDecryptedError(PdfReadError): """ Raised when a PDF file that has been encrypted (meaning it requires a password to be accessed) has not been successfully decrypted. """ class WrongPasswordError(FileNotDecryptedError): """Raised when the wrong password is used to try to decrypt an encrypted PDF file.""" class EmptyFileError(PdfReadError): """Raised when a PDF file is empty or has no content.""" class EmptyImageDataError(PyPdfError): """Raised when trying to process an image that has no data.""" STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly" class LimitReachedError(PyPdfError): """Raised when a limit is reached.""" class XmpDocumentError(PyPdfError, RuntimeError): """Raised when the XMP XML document context is invalid or missing.""" ================================================ FILE: pypdf/filters.py ================================================ # Copyright (c) 2006, Mathieu Fenniak # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. """ Implementation of stream filters; §7.4 Filters of the PDF 2.0 specification. §8.9.7 Inline images of the PDF 2.0 specification has abbreviations that can be used for the names of filters in an inline image object. 
""" __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" import binascii import math import os import shutil import struct import subprocess import zlib from base64 import a85decode from dataclasses import dataclass from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Optional, Union, cast from ._codecs._codecs import LzwCodec as _LzwCodec from ._utils import ( WHITESPACES_AS_BYTES, deprecate, deprecation_with_replacement, logger_warning, ) from .constants import CcittFaxDecodeParameters as CCITT from .constants import FilterTypeAbbreviations as FTA from .constants import FilterTypes as FT from .constants import ImageAttributes as IA from .constants import LzwFilterParameters as LZW from .constants import StreamAttributes as SA from .errors import DependencyError, LimitReachedError, PdfReadError, PdfStreamError from .generic import ( ArrayObject, DictionaryObject, IndirectObject, NullObject, NumberObject, StreamObject, is_null_or_none, ) MAX_DECLARED_STREAM_LENGTH = 75_000_000 MAX_ARRAY_BASED_STREAM_OUTPUT_LENGTH = 75_000_000 JBIG2_MAX_OUTPUT_LENGTH = 75_000_000 LZW_MAX_OUTPUT_LENGTH = 75_000_000 RUN_LENGTH_MAX_OUTPUT_LENGTH = 75_000_000 ZLIB_MAX_OUTPUT_LENGTH = 75_000_000 ZLIB_MAX_RECOVERY_INPUT_LENGTH = 5_000_000 # Reuse cached 1-byte values in the fallback loop to avoid per-byte allocations. _SINGLE_BYTES = tuple(bytes((i,)) for i in range(256)) def _decompress_with_limit(data: bytes) -> bytes: decompressor = zlib.decompressobj() result = decompressor.decompress(data, max_length=ZLIB_MAX_OUTPUT_LENGTH) if decompressor.unconsumed_tail: raise LimitReachedError( f"Limit reached while decompressing. {len(decompressor.unconsumed_tail)} bytes remaining." ) return result def decompress(data: bytes) -> bytes: """ Decompress the given data using zlib. Attempts to decompress the input data using zlib. 
If the decompression fails due to a zlib error, it falls back to using a decompression object with a larger window size. Please note that the output length is limited to avoid memory issues. If you need to process larger content streams, consider adapting ``pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH``. In case you are only dealing with trusted inputs and/or want to disable these limits, set the value to `0`. Args: data: The input data to be decompressed. Returns: The decompressed data. """ try: return _decompress_with_limit(data) except zlib.error: # First quick approach: There are known issues with faulty added bytes to the # tail of the encoded stream from early Adobe Distiller or Pitstop versions # with CR char as the default line separator (assumed by reverse engineering) # that breaks the decoding process in the end. # # Try first to cut off some of the tail byte by byte, but limited to not # iterate through too many loops and kill the performance for large streams, # to then allow the final fallback to run. Added this intermediate attempt, # because starting from the head of the stream byte by byte kills completely # the performance for large streams (e.g., 6 MB) with the tail-byte-issue # and takes ages. This solution is really fast: max_tail_cut_off_bytes: int = 8 for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))): try: return _decompress_with_limit(data[:-i]) except zlib.error: pass # If still failing, then try with increased window size. decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32) result_str = b"" remaining_limit = ZLIB_MAX_OUTPUT_LENGTH data_length = len(data) known_errors = set() for index in range(data_length): chunk = _SINGLE_BYTES[data[index]] try: decompressed = decompressor.decompress(chunk, max_length=remaining_limit) result_str += decompressed remaining_limit -= len(decompressed) if remaining_limit <= 0: raise LimitReachedError( f"Limit reached while decompressing. {data_length - index} bytes remaining." 
) except zlib.error as error: if index > ZLIB_MAX_RECOVERY_INPUT_LENGTH: raise LimitReachedError( f"Recovery limit reached while decompressing. {data_length - index} bytes remaining." ) error_str = str(error) if error_str in known_errors: continue logger_warning(error_str, __name__) known_errors.add(error_str) return result_str class FlateDecode: @staticmethod def decode( data: bytes, decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: """ Decode data which is flate-encoded. Args: data: Flate-encoded data. decode_parms: Additional decoding parameters. Returns: The flate-decoded data. Raises: PdfReadError: Unsupported parameters have been found. """ str_data = decompress(data) if isinstance(decode_parms, DictionaryObject): parameters = decode_parms else: parameters = DictionaryObject() predictor = parameters.get("/Predictor", 1) # predictor 1 == no predictor if predictor != 1: columns, colors, bits_per_component = FlateDecode._get_parameters(parameters) # PNG predictor can vary by row and so is the lead byte on each row rowlength = ( math.ceil(columns * colors * bits_per_component / 8) + 1 ) # number of bytes # TIFF prediction: if predictor == 2: rowlength -= 1 # remove the predictor byte bpp = rowlength // columns str_data = bytearray(str_data) for i in range(len(str_data)): if i % rowlength >= bpp: str_data[i] = (str_data[i] + str_data[i - bpp]) % 256 str_data = bytes(str_data) # PNG prediction: elif 10 <= predictor <= 15: str_data = FlateDecode._decode_png_prediction( str_data, columns, rowlength ) else: raise PdfReadError(f"Unsupported flatedecode predictor {predictor!r}") return str_data @staticmethod def _get_parameters(parameters: DictionaryObject) -> tuple[int, int, int]: # For details, see table 8 of ISO 32000-2:2020. 
def get(key: str, default: int) -> int: _value = parameters.get(key, NumberObject(default)).get_object() if not isinstance(_value, int) or _value < 1: raise PdfReadError(f"Expected positive number for {key}, got {_value}!") return _value columns = get(key=LZW.COLUMNS, default=1) colors = get(key=LZW.COLORS, default=1) bits_per_component = get(key=LZW.BITS_PER_COMPONENT, default=8) return columns, colors, bits_per_component @staticmethod def _decode_png_prediction(data: bytes, columns: int, rowlength: int) -> bytes: # PNG prediction can vary from row to row if (remainder := len(data) % rowlength) != 0: logger_warning("Image data is not rectangular. Adding padding.", __name__) data += b"\x00" * (rowlength - remainder) assert len(data) % rowlength == 0 output = [] prev_rowdata = (0,) * rowlength bpp = (rowlength - 1) // columns # recomputed locally to not change params for row in range(0, len(data), rowlength): rowdata: list[int] = list(data[row : row + rowlength]) filter_byte = rowdata[0] if filter_byte == 0: # PNG None Predictor pass elif filter_byte == 1: # PNG Sub Predictor for i in range(bpp + 1, rowlength): rowdata[i] = (rowdata[i] + rowdata[i - bpp]) % 256 elif filter_byte == 2: # PNG Up Predictor for i in range(1, rowlength): rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256 elif filter_byte == 3: # PNG Average Predictor for i in range(1, bpp + 1): floor = prev_rowdata[i] // 2 rowdata[i] = (rowdata[i] + floor) % 256 for i in range(bpp + 1, rowlength): left = rowdata[i - bpp] floor = (left + prev_rowdata[i]) // 2 rowdata[i] = (rowdata[i] + floor) % 256 elif filter_byte == 4: # PNG Paeth Predictor for i in range(1, bpp + 1): rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256 for i in range(bpp + 1, rowlength): left = rowdata[i - bpp] up = prev_rowdata[i] up_left = prev_rowdata[i - bpp] p = left + up - up_left dist_left = abs(p - left) dist_up = abs(p - up) dist_up_left = abs(p - up_left) if dist_left <= dist_up and dist_left <= dist_up_left: paeth = left elif 
class RunLengthDecode:
    """
    The RunLengthDecode filter decodes data that has been encoded in a
    simple byte-oriented format based on run length.
    The encoded data is a sequence of runs, where each run consists of
    a length byte followed by 1 to 128 bytes of data. If the length byte is
    in the range 0 to 127,
    the following length + 1 (1 to 128) bytes are copied literally during
    decompression.
    If length is in the range 129 to 255, the following single byte is to be
    copied 257 − length (2 to 128) times during decompression. A length value
    of 128 denotes EOD.
    """

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode a run length encoded data stream.

        Malformed input (missing EOD, data after the EOD marker, a run which
        is cut off at the end of the input) does not raise; a warning is
        logged and the data decoded so far is returned.

        Args:
          data: a bytes sequence of length/data
          decode_parms: this filter does not use parameters.

        Returns:
          A bytes decompressed sequence.

        Raises:
          LimitReachedError: The decoded output would exceed
            ``RUN_LENGTH_MAX_OUTPUT_LENGTH``.

        """
        lst = []
        index = 0
        data_length = len(data)
        total_length = 0
        while True:
            if index >= data_length:
                logger_warning(
                    "missing EOD in RunLengthDecode, check if output is OK", __name__
                )
                break  # Reached end of string without an EOD
            length = data[index]
            index += 1
            if length == 128:
                if index < data_length:
                    # We should first check, if we have an inner stream from a multi-encoded
                    # stream with a faulty trailing newline that we can decode properly.
                    # We will just ignore the last byte and raise a warning ...
                    if (index == data_length - 1) and (data[index : index + 1] == b"\n"):
                        logger_warning(
                            "Found trailing newline in stream data, check if output is OK", __name__
                        )
                        break
                    # Raising an exception here breaks all image extraction for this file, which might
                    # not be desirable. For this reason, indicate that the output is most likely wrong,
                    # as processing stopped after the first EOD marker. See issue #3517.
                    logger_warning(
                        "Early EOD in RunLengthDecode, check if output is OK", __name__
                    )
                break
            if length < 128:
                length += 1
                lst.append(data[index : (index + length)])
                index += length
            else:  # >128
                length = 257 - length
                if index >= data_length:
                    # The length byte was the last byte of the input, thus there
                    # is no byte to repeat. Indexing would raise an IndexError;
                    # handle it like the other malformed-input cases instead.
                    logger_warning(
                        "Truncated run in RunLengthDecode, check if output is OK", __name__
                    )
                    break
                lst.append(bytes((data[index],)) * length)
                index += 1
            total_length += length
            if total_length > RUN_LENGTH_MAX_OUTPUT_LENGTH:
                raise LimitReachedError("Limit reached while decompressing.")
        return b"".join(lst)
""" if isinstance(data, str): data = data.encode() data = data.strip(WHITESPACES_AS_BYTES) if len(data) > 2 and data.endswith(b">"): data = data[:-1].rstrip(WHITESPACES_AS_BYTES) + data[-1:] try: return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES) except ValueError as error: if error.args[0] == "Ascii85 encoded byte sequences must end with b'~>'": logger_warning("Ignoring missing Ascii85 end marker.", __name__) return a85decode(data, adobe=False, ignorechars=WHITESPACES_AS_BYTES) raise class DCTDecode: @staticmethod def decode( data: bytes, decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: """ Decompresses data encoded using a DCT (discrete cosine transform) technique based on the JPEG standard (IS0/IEC 10918), reproducing image sample data that approximates the original data. Args: data: text to decode. decode_parms: this filter does not use parameters. Returns: decoded data. """ return data class JPXDecode: @staticmethod def decode( data: bytes, decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: """ Decompresses data encoded using the wavelet-based JPEG 2000 standard, reproducing the original image data. Args: data: text to decode. decode_parms: this filter does not use parameters. Returns: decoded data. 
""" return data @dataclass class CCITTParameters: """§7.4.6, optional parameters for the CCITTFaxDecode filter.""" K: int = 0 columns: int = 1728 rows: int = 0 EndOfLine: Union[bool, None] = False EncodedByteAlign: Union[bool, None] = False EndOfBlock: Union[bool, None] = True BlackIs1: bool = False DamagedRowsBeforeError: Union[int, None] = 0 @property def group(self) -> int: if self.K < 0: # Pure two-dimensional encoding (Group 4) CCITTgroup = 4 else: # K == 0: Pure one-dimensional encoding (Group 3, 1-D) # K > 0: Mixed one- and two-dimensional encoding (Group 3, 2-D) CCITTgroup = 3 return CCITTgroup def __create_old_class_instance( K: int = 0, columns: int = 0, rows: int = 0 ) -> CCITTParameters: deprecation_with_replacement("CCITParameters", "CCITTParameters", "6.0.0") return CCITTParameters(K, columns, rows) # Create an alias for the old class name CCITParameters = __create_old_class_instance class CCITTFaxDecode: """ §7.4.6, CCITTFaxDecode filter (ISO 32000). Either Group 3 or Group 4 CCITT facsimile (fax) encoding. CCITT encoding is bit-oriented, not byte-oriented. §7.4.6, optional parameters for the CCITTFaxDecode filter. 
""" @staticmethod def _get_parameters( parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject], rows: Union[int, IndirectObject], ) -> CCITTParameters: ccitt_parameters = CCITTParameters(rows=int(rows)) if parameters: parameters_unwrapped = cast( Union[ArrayObject, DictionaryObject], parameters.get_object() ) if isinstance(parameters_unwrapped, ArrayObject): for decode_parm in parameters_unwrapped: if CCITT.K in decode_parm: ccitt_parameters.K = decode_parm[CCITT.K].get_object() if CCITT.COLUMNS in decode_parm: ccitt_parameters.columns = decode_parm[CCITT.COLUMNS].get_object() if CCITT.BLACK_IS_1 in decode_parm: ccitt_parameters.BlackIs1 = decode_parm[CCITT.BLACK_IS_1].get_object().value else: if CCITT.K in parameters_unwrapped: ccitt_parameters.K = parameters_unwrapped[CCITT.K].get_object() # type: ignore if CCITT.COLUMNS in parameters_unwrapped: ccitt_parameters.columns = parameters_unwrapped[CCITT.COLUMNS].get_object() # type: ignore if CCITT.BLACK_IS_1 in parameters_unwrapped: ccitt_parameters.BlackIs1 = parameters_unwrapped[CCITT.BLACK_IS_1].get_object().value # type: ignore return ccitt_parameters @staticmethod def decode( data: bytes, decode_parms: Optional[DictionaryObject] = None, height: int = 0, **kwargs: Any, ) -> bytes: params = CCITTFaxDecode._get_parameters(decode_parms, height) img_size = len(data) tiff_header_struct = "<2shlh" + "hhll" * 8 + "h" tiff_header = struct.pack( tiff_header_struct, b"II", # Byte order indication: Little endian 42, # Version number (always 42) 8, # Offset to the first image file directory (IFD) 8, # Number of tags in IFD 256, # ImageWidth, LONG, 1, width 4, 1, params.columns, 257, # ImageLength, LONG, 1, length 4, 1, params.rows, 258, # BitsPerSample, SHORT, 1, 1 3, 1, 1, 259, # Compression, SHORT, 1, compression Type 3, 1, params.group, 262, # Thresholding, SHORT, 1, 0 = BlackIs1 3, 1, int(params.BlackIs1), 273, # StripOffsets, LONG, 1, length of header 4, 1, struct.calcsize( tiff_header_struct ), 278, # 
RowsPerStrip, LONG, 1, length 4, 1, params.rows, 279, # StripByteCounts, LONG, 1, size of image 4, 1, img_size, 0, # last IFD ) return tiff_header + data JBIG2DEC_BINARY = shutil.which("jbig2dec") class JBIG2Decode: @staticmethod def decode( data: bytes, decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: if JBIG2DEC_BINARY is None: raise DependencyError("jbig2dec binary is not available.") with TemporaryDirectory() as tempdir: directory = Path(tempdir) paths: list[Path] = [] if decode_parms and "/JBIG2Globals" in decode_parms: jbig2_globals = decode_parms["/JBIG2Globals"] if not is_null_or_none(jbig2_globals) and not is_null_or_none(pointer := jbig2_globals.get_object()): assert pointer is not None, "mypy" if isinstance(pointer, StreamObject): path = directory.joinpath("globals.jbig2") path.write_bytes(pointer.get_data()) paths.append(path) path = directory.joinpath("image.jbig2") path.write_bytes(data) paths.append(path) environment = os.environ.copy() environment["LC_ALL"] = "C" result = subprocess.run( # noqa: S603 [ JBIG2DEC_BINARY, "--embedded", "--format", "png", "--output", "-", "-M", str(JBIG2_MAX_OUTPUT_LENGTH), *paths ], capture_output=True, env=environment, ) if b"unrecognized option '--embedded'" in result.stderr or b"unrecognized option '-M'" in result.stderr: raise DependencyError("jbig2dec>=0.19 is required.") if b"FATAL ERROR failed to allocate image data buffer" in result.stderr: raise LimitReachedError( f"Memory limit reached while reading JBIG2 data:\n{result.stderr.decode('utf-8')}" ) if result.stderr: for line in result.stderr.decode("utf-8").splitlines(): logger_warning(line, __name__) if result.returncode != 0: raise PdfStreamError(f"Unable to decode JBIG2 data. 
def decode_stream_data(stream: StreamObject) -> bytes:
    """
    Decode the stream data based on the specified filters.

    The filters of the stream are applied in the order they are listed,
    each one together with the decode parameters at the same position.
    Filters without corresponding decode parameters are run with empty
    parameters. A stream without filters or without data is returned
    unchanged.

    Args:
        stream: The input stream object containing the data and filters.

    Returns:
        The decoded stream data.

    Raises:
        NotImplementedError: If an unsupported filter type is encountered.

    """
    filters = stream.get(SA.FILTER, ())
    if isinstance(filters, IndirectObject):
        filters = cast(ArrayObject, filters.get_object())
    if not isinstance(filters, (ArrayObject, tuple)):
        # We have a single filter instance. The empty tuple used for a missing
        # entry is kept as is, as it would be treated as a filter name otherwise.
        filters = (filters,)

    decode_parms = stream.get(SA.DECODE_PARMS, ())
    if isinstance(decode_parms, IndirectObject):
        decode_parms = decode_parms.get_object()
    if not isinstance(decode_parms, (list, tuple)):
        decode_parms = (decode_parms,)
    # ``zip`` stops at the shorter input. Pad the parameters to ensure that
    # every filter is applied, even if fewer parameters have been provided.
    decode_parms = list(decode_parms)
    decode_parms.extend({} for _ in range(len(filters) - len(decode_parms)))

    data: bytes = stream._data
    # If there is no data to decode, we should not try to decode it.
    if not data:
        return data
    for filter_name, params in zip(filters, decode_parms):
        if isinstance(filter_name, IndirectObject):
            filter_name = filter_name.get_object()
        if isinstance(params, IndirectObject):
            params = params.get_object()
        if params is None or isinstance(params, NullObject):
            params = {}
        # The deprecation helper is called directly from this function on
        # purpose: it reports the warning with a fixed stack level.
        if filter_name in (FT.ASCII_HEX_DECODE, FTA.AHx):
            _deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.AHx, new_name=FT.ASCII_HEX_DECODE)
            data = ASCIIHexDecode.decode(data)
        elif filter_name in (FT.ASCII_85_DECODE, FTA.A85):
            _deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.A85, new_name=FT.ASCII_85_DECODE)
            data = ASCII85Decode.decode(data)
        elif filter_name in (FT.LZW_DECODE, FTA.LZW):
            _deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.LZW, new_name=FT.LZW_DECODE)
            data = LZWDecode.decode(data, params)
        elif filter_name in (FT.FLATE_DECODE, FTA.FL):
            _deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.FL, new_name=FT.FLATE_DECODE)
            data = FlateDecode.decode(data, params)
        elif filter_name in (FT.RUN_LENGTH_DECODE, FTA.RL):
            _deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.RL, new_name=FT.RUN_LENGTH_DECODE)
            data = RunLengthDecode.decode(data)
        elif filter_name in (FT.CCITT_FAX_DECODE, FTA.CCF):
            _deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.CCF, new_name=FT.CCITT_FAX_DECODE)
            height = stream.get(IA.HEIGHT, ())
            data = CCITTFaxDecode.decode(data, params, height)
        elif filter_name in (FT.DCT_DECODE, FTA.DCT):
            _deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.DCT, new_name=FT.DCT_DECODE)
            data = DCTDecode.decode(data)
        elif filter_name == FT.JPX_DECODE:
            data = JPXDecode.decode(data)
        elif filter_name == FT.JBIG2_DECODE:
            data = JBIG2Decode.decode(data, params)
        elif filter_name == "/Crypt":
            # Without /Name and /Type, the data is passed through unchanged.
            if "/Name" in params or "/Type" in params:
                raise NotImplementedError(
                    "/Crypt filter with /Name or /Type not supported yet"
                )
        else:
            raise NotImplementedError(f"Unsupported filter {filter_name}")
    return data
Copyright (c) 2006, Mathieu Fenniak # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
"""Implementation of generic PDF objects (dictionary, number, string, ...).""" __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" from ..constants import OutlineFontFlag from ._base import ( BooleanObject, ByteStringObject, FloatObject, IndirectObject, NameObject, NullObject, NumberObject, PdfObject, TextStringObject, encode_pdfdocencoding, is_null_or_none, ) from ._data_structures import ( ArrayObject, ContentStream, DecodedStreamObject, Destination, DictionaryObject, EncodedStreamObject, Field, StreamObject, TreeObject, read_object, ) from ._files import EmbeddedFile from ._fit import Fit from ._link import DirectReferenceLink, NamedReferenceLink, ReferenceLink, extract_links from ._outline import OutlineItem from ._rectangle import RectangleObject from ._utils import ( create_string_object, decode_pdfdocencoding, hex_to_rgb, read_hex_string_from_stream, read_string_from_stream, ) from ._viewerpref import ViewerPreferences PAGE_FIT = Fit.fit() __all__ = [ "PAGE_FIT", "ArrayObject", "BooleanObject", "ByteStringObject", "ContentStream", "DecodedStreamObject", "Destination", "DictionaryObject", "DirectReferenceLink", "EmbeddedFile", "EncodedStreamObject", "Field", "Fit", "FloatObject", "IndirectObject", "NameObject", "NamedReferenceLink", "NullObject", "NumberObject", "OutlineFontFlag", "OutlineItem", "PdfObject", "RectangleObject", "ReferenceLink", "StreamObject", "TextStringObject", "TreeObject", "ViewerPreferences", # Utility functions "create_string_object", "decode_pdfdocencoding", "encode_pdfdocencoding", "extract_links", "hex_to_rgb", "is_null_or_none", "read_hex_string_from_stream", # Data structures core functions "read_object", "read_string_from_stream", ] ================================================ FILE: pypdf/generic/_appearance_stream.py ================================================ import re from dataclasses import dataclass from enum import IntEnum from typing import Any, Optional, Union, cast from .._codecs import 
fill_from_encoding from .._codecs.core_font_metrics import CORE_FONT_METRICS from .._font import Font from .._utils import logger_warning from ..constants import AnnotationDictionaryAttributes, BorderStyles, FieldDictionaryAttributes from ..generic import ( DecodedStreamObject, DictionaryObject, NameObject, NumberObject, RectangleObject, ) from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none DEFAULT_FONT_SIZE_IN_MULTILINE = 12 @dataclass class BaseStreamConfig: """A container representing the basic layout of an appearance stream.""" rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0) border_width: int = 1 # The width of the border in points border_style: str = BorderStyles.SOLID class BaseStreamAppearance(DecodedStreamObject): """A class representing the very base of an appearance stream, that is, a rectangle and a border.""" def __init__(self, layout: Optional[BaseStreamConfig] = None) -> None: """ Takes the appearance stream layout as an argument. Args: layout: The basic layout parameters. """ super().__init__() self._layout = layout or BaseStreamConfig() self[NameObject("/Type")] = NameObject("/XObject") self[NameObject("/Subtype")] = NameObject("/Form") self[NameObject("/BBox")] = RectangleObject(self._layout.rectangle) class TextAlignment(IntEnum): """Defines the alignment options for text within a form field's appearance stream.""" LEFT = 0 CENTER = 1 RIGHT = 2 class TextStreamAppearance(BaseStreamAppearance): """ A class representing the appearance stream for a text-based form field. This class generates the content stream (the `ap_stream_data`) that dictates how text is rendered within a form field's bounding box. It handles properties like font, font size, color, multiline text, and text selection highlighting. 
""" def _scale_text( self, font: Font, font_size: float, leading_factor: float, field_width: float, field_height: float, text: str, min_font_size: float, font_size_step: float = 0.2 ) -> tuple[list[tuple[float, str]], float]: """ Takes a piece of text and scales it to field_width or field_height, given font_name and font_size. Wraps text where necessary. Args: font: The font to be used. font_size: The font size in points. leading_factor: The line distance. field_width: The width of the field in which to fit the text. field_height: The height of the field in which to fit the text. text: The text to fit with the field. min_font_size: The minimum font size at which to scale the text. font_size_step: The amount by which to decrement font size per step while scaling. Returns: The text in the form of list of tuples, each tuple containing the length of a line and its contents, and the font_size for these lines and lengths. """ orig_text = text paragraphs = text.replace("\n", "\r").split("\r") wrapped_lines = [] current_line_words: list[str] = [] current_line_width: float = 0 space_width = font.space_width * font_size / 1000 for paragraph in paragraphs: if not paragraph.strip(): wrapped_lines.append((0.0, "")) continue words = paragraph.split(" ") for i, word in enumerate(words): word_width = font.text_width(word) * font_size / 1000 test_width = current_line_width + word_width + (space_width if i else 0) if test_width > field_width and current_line_words: wrapped_lines.append((current_line_width, " ".join(current_line_words))) current_line_words = [word] current_line_width = word_width elif not current_line_words and word_width > field_width: wrapped_lines.append((word_width, word)) current_line_words = [] current_line_width = 0 else: if current_line_words: current_line_width += space_width current_line_words.append(word) current_line_width += word_width if current_line_words: wrapped_lines.append((current_line_width, " ".join(current_line_words))) current_line_words = [] 
current_line_width = 0 # Estimate total height. estimated_total_height = font_size + (len(wrapped_lines) - 1) * leading_factor * font_size if estimated_total_height > field_height: # Text overflows height; Retry with smaller font size. new_font_size = font_size - font_size_step if new_font_size >= min_font_size: return self._scale_text( font, new_font_size, leading_factor, field_width, field_height, orig_text, min_font_size, font_size_step ) return wrapped_lines, round(font_size, 1) def _generate_appearance_stream_data( self, text: str, selection: Union[list[str], None], font: Font, font_glyph_byte_map: Optional[dict[str, bytes]] = None, font_name: str = "/Helv", font_size: float = 0.0, font_color: str = "0 g", is_multiline: bool = False, alignment: TextAlignment = TextAlignment.LEFT, is_comb: bool = False, max_length: Optional[int] = None ) -> bytes: """ Generates the raw bytes of the PDF appearance stream for a text field. This private method assembles the PDF content stream operators to draw the provided text within the specified rectangle. It handles text positioning, font application, color, and special formatting like selected text. Args: text: The text to be rendered in the form field. selection: An optional list of strings that should be highlighted as selected. font: The font to use. font_glyph_byte_map: An optional dictionary mapping characters to their byte representation for glyph encoding. font_name: The name of the font resource to use (e.g., "/Helv"). font_size: The font size. If 0, it is automatically calculated based on whether the field is multiline or not. font_color: The color to apply to the font, represented as a PDF graphics state string (e.g., "0 g" for black). is_multiline: A boolean indicating if the text field is multiline. alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER. is_comb: Boolean that designates fixed-length fields, where every character fills one "cell", such as in a postcode. 
max_length: Used if is_comb is set. The maximum number of characters for a fixed- length field. Returns: A byte string containing the PDF content stream data. """ rectangle = self._layout.rectangle font_glyph_byte_map = font_glyph_byte_map or {} if isinstance(rectangle, tuple): rectangle = RectangleObject(rectangle) leading_factor = (font.font_descriptor.bbox[3] - font.font_descriptor.bbox[1]) / 1000.0 # Set margins based on border width and style, but never less than 1 point factor = 2 if self._layout.border_style in {"/B", "/I"} else 1 margin = max(self._layout.border_width * factor, 1) field_height = rectangle.height - 2 * margin field_width = rectangle.width - 4 * margin # If font_size is 0, apply the logic for multiline or large-as-possible font if font_size == 0: min_font_size = 4.0 # The mininum font size if selection: # Don't wrap text when dealing with a /Ch field, in order to prevent problems is_multiline = False # with matching "selection" with "line" later on. if is_multiline: font_size = DEFAULT_FONT_SIZE_IN_MULTILINE lines, font_size = self._scale_text( font, font_size, leading_factor, field_width, field_height, text, min_font_size ) else: max_vertical_size = field_height / leading_factor text_width_unscaled = font.text_width(text) / 1000 max_horizontal_size = field_width / (text_width_unscaled or 1) font_size = round(max(min(max_vertical_size, max_horizontal_size), min_font_size), 1) lines = [(text_width_unscaled * font_size, text)] elif is_comb: if max_length and len(text) > max_length: logger_warning ( f"Length of text {text} exceeds maximum length ({max_length}) of field, input truncated.", __name__ ) # We act as if each character is one line, because we draw it separately later on lines = [( font.text_width(char) * font_size / 1000, char ) for index, char in enumerate(text) if index < (max_length or len(text))] else: lines = [( font.text_width(line) * font_size / 1000, line ) for line in text.replace("\n", "\r").split("\r")] # Set the vertical 
offset if is_multiline: y_offset = rectangle.height + margin - font.font_descriptor.bbox[3] * font_size / 1000.0 else: y_offset = margin + ((field_height - font.font_descriptor.ascent * font_size / 1000) / 2) default_appearance = f"{font_name} {font_size} Tf {font_color}" ap_stream = ( f"q\n/Tx BMC \nq\n{2 * margin} {margin} {field_width} {field_height} " f"re\nW\nBT\n{default_appearance}\n" ).encode() current_x_pos: float = 0 # Initial virtual position within the text object. for line_number, (line_width, line) in enumerate(lines): if selection and line in selection: # Might be improved, but cannot find how to get fill working => replaced with lined box ap_stream += ( f"1 {y_offset - (line_number * font_size * leading_factor) - 1} " f"{rectangle.width - 2} {font_size + 2} re\n" f"0.5 0.5 0.5 rg s\n{default_appearance}\n" ).encode() # Calculate the desired absolute starting X for the current line desired_abs_x_start: float = 0 if is_comb and max_length: # Calculate the width of a cell for one character cell_width = rectangle.width / max_length # Space from the left edge of the cell to the character's baseline start # line_width here is the *actual* character width in points for the single character 'line' centering_offset_in_cell = (cell_width - line_width) / 2 # Absolute start X = (Cell Index, i.e., line_number * Cell Width) + Centering Offset desired_abs_x_start = (line_number * cell_width) + centering_offset_in_cell elif alignment == TextAlignment.RIGHT: desired_abs_x_start = rectangle.width - margin * 2 - line_width elif alignment == TextAlignment.CENTER: desired_abs_x_start = (rectangle.width - line_width) / 2 else: # Left aligned; default desired_abs_x_start = margin * 2 # Calculate x_rel_offset: how much to move from the current_x_pos # to reach the desired_abs_x_start. 
x_rel_offset = desired_abs_x_start - current_x_pos # Y-offset: y_rel_offset: float = 0 if line_number == 0: y_rel_offset = y_offset # Initial vertical position elif is_comb: y_rel_offset = 0.0 # DO NOT move vertically for subsequent characters else: y_rel_offset = - font_size * leading_factor # Move down by line height # Td is a relative translation (Tx and Ty). # It updates the current text position. ap_stream += f"{x_rel_offset} {y_rel_offset} Td\n".encode() # Update current_x_pos based on the Td operation for the next iteration. # This is the X position where the *current line* will start. current_x_pos = desired_abs_x_start encoded_line: list[bytes] = [ font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line ] if any(len(c) >= 2 for c in encoded_line): ap_stream += b"<" + (b"".join(encoded_line)).hex().encode() + b"> Tj\n" else: ap_stream += b"(" + b"".join(encoded_line) + b") Tj\n" ap_stream += b"ET\nQ\nEMC\nQ\n" return ap_stream def __init__( self, layout: Optional[BaseStreamConfig] = None, text: str = "", selection: Optional[list[str]] = None, font_resource: Optional[DictionaryObject] = None, font_name: str = "/Helv", font_size: float = 0.0, font_color: str = "0 g", is_multiline: bool = False, alignment: TextAlignment = TextAlignment.LEFT, is_comb: bool = False, max_length: Optional[int] = None ) -> None: """ Initializes a TextStreamAppearance object. This constructor creates a new PDF stream object configured as an XObject of subtype Form. It uses the `_appearance_stream_data` method to generate the content for the stream. Args: layout: The basic layout parameters. text: The text to be rendered in the form field. selection: An optional list of strings that should be highlighted as selected. font_resource: An optional variable that represents a PDF font dictionary. font_name: The name of the font resource, e.g., "/Helv". font_size: The font size. If 0, it's auto-calculated. font_color: The font color string. 
is_multiline: A boolean indicating if the text field is multiline. alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER. is_comb: Boolean that designates fixed-length fields, where every character fills one "cell", such as in a postcode. max_length: Used if is_comb is set. The maximum number of characters for a fixed- length field. """ super().__init__(layout) # If a font resource was added, get the font character map if font_resource: font = Font.from_font_resource(font_resource) else: logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__) font_name = "/Helv" core_font_metrics = CORE_FONT_METRICS["Helvetica"] font = Font( name="Helvetica", character_map={}, encoding=dict(zip(range(256), fill_from_encoding("cp1252"))), # WinAnsiEncoding sub_type="Type1", font_descriptor=core_font_metrics.font_descriptor, character_widths=core_font_metrics.character_widths ) font_resource = font.as_font_resource() # Check whether the font resource is able to encode the text value. encodable = True try: if isinstance(font.encoding, str): text.encode(font.encoding, "surrogatepass") else: supported_chars = set(font.encoding.values()) if any(char not in supported_chars for char in text): encodable = False # We should add a final check against the character_map (CMap) of the font, # but we don't appear to have PDF forms with such fonts, so we skip this for # now. except UnicodeEncodeError: encodable = False if not encodable: logger_warning( f"Text string '{text}' contains characters not supported by font encoding. " "This may result in text corruption. 
" "Consider calling writer.update_page_form_field_values with auto_regenerate=True.", __name__ ) font_glyph_byte_map: dict[str, bytes] if isinstance(font.encoding, str): font_glyph_byte_map = { v: k.encode(font.encoding) for k, v in font.character_map.items() } else: font_glyph_byte_map = {v: bytes((k,)) for k, v in font.encoding.items()} font_encoding_rev = {v: bytes((k,)) for k, v in font.encoding.items()} for key, value in font.character_map.items(): font_glyph_byte_map[value] = font_encoding_rev.get(key, key) ap_stream_data = self._generate_appearance_stream_data( text, selection, font, font_glyph_byte_map, font_name=font_name, font_size=font_size, font_color=font_color, is_multiline=is_multiline, alignment=alignment, is_comb=is_comb, max_length=max_length ) self.set_data(ByteStringObject(ap_stream_data)) self[NameObject("/Length")] = NumberObject(len(ap_stream_data)) # Update Resources with font information self[NameObject("/Resources")] = DictionaryObject({ NameObject("/Font"): DictionaryObject({ NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource) }) }) @staticmethod def _find_annotation_font_resource( font_name: str, annotation: DictionaryObject, acro_form: DictionaryObject ) -> tuple[str, DictionaryObject]: # Try to find a resource dictionary for the font by examining the annotation and, if that fails, # the AcroForm resources dictionary acro_form_resources: Any = cast( DictionaryObject, annotation.get_inherited( "/DR", acro_form.get("/DR", DictionaryObject()), ), ) acro_form_font_resources = acro_form_resources.get("/Font", DictionaryObject()) font_resource = acro_form_font_resources.get(font_name, None) # Normally, we should have found a font resource by now. However, when a user has provided a specific # font name, we may not have found the associated font resource among the AcroForm resources. Also, in # case of the 14 Adobe Core fonts, we may be expected to construct a font resource ourselves. 
if is_null_or_none(font_resource): if font_name.removeprefix("/") not in CORE_FONT_METRICS: # Default to Helvetica if we haven't found a font resource and cannot construct one ourselves. logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__) font_name = "/Helvetica" core_font_metrics = CORE_FONT_METRICS[font_name.removeprefix("/")] font_resource = Font( name=font_name.removeprefix("/"), character_map={}, encoding=dict(zip(range(256), fill_from_encoding("cp1252"))), # WinAnsiEncoding sub_type="Type1", font_descriptor=core_font_metrics.font_descriptor, character_widths=core_font_metrics.character_widths ).as_font_resource() return font_name, font_resource @classmethod def from_text_annotation( cls, acro_form: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM]) field: DictionaryObject, annotation: DictionaryObject, user_font_name: str = "", user_font_size: float = -1, ) -> "TextStreamAppearance": """ Creates a TextStreamAppearance object from a text field annotation. This class method is a factory for creating a `TextStreamAppearance` instance by extracting all necessary information (bounding box, font, text content, etc.) from the PDF field and annotation dictionaries. It respects inheritance for properties like default appearance (`/DA`). Args: acro_form: The root AcroForm dictionary from the PDF catalog. field: The field dictionary object. annotation: The widget annotation dictionary object associated with the field. user_font_name: An optional user-provided font name to override the default. Defaults to an empty string. user_font_size: An optional user-provided font size to override the default. A value of -1 indicates no override. Returns: A new `TextStreamAppearance` instance configured for the given field. 
""" # Calculate rectangle dimensions _rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect]) rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1]))) # Get default appearance dictionary from annotation default_appearance = annotation.get_inherited( AnnotationDictionaryAttributes.DA, acro_form.get(AnnotationDictionaryAttributes.DA, None), ) if not default_appearance: # Create a default appearance if none was found in the annotation default_appearance = TextStringObject("/Helv 0 Tf 0 g") else: default_appearance = default_appearance.get_object() # Retrieve field text and selected values field_flags = field.get(FieldDictionaryAttributes.Ff, 0) if ( field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and field_flags & FieldDictionaryAttributes.FfBits.Combo == 0 ): text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, [])) selection = field.get("/V", []) if not isinstance(selection, list): selection = [selection] else: # /Tx text = field.get("/V", "") selection = [] # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings) text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)") # Derive font name, size and color from the default appearance. Also set # user-provided font name and font size in the default appearance, if given. # For a font name, this presumes that we can find an associated font resource # dictionary. Uses the variable font_properties as an intermediate. # As per the PDF spec: # "At a minimum, the string [that is, default_appearance] shall include a Tf (text # font) operator along with its two operands, font and size" (Section 12.7.4.3 # "Variable text" of the PDF 2.0 specification). 
font_properties = [prop for prop in re.split(r"\s", default_appearance) if prop] font_name = font_properties.pop(font_properties.index("Tf") - 2) font_size = float(font_properties.pop(font_properties.index("Tf") - 1)) font_properties.remove("Tf") font_color = " ".join(font_properties) # Determine the font name to use, prioritizing the user's input if user_font_name: font_name = user_font_name # Determine the font size to use, prioritizing the user's input if user_font_size > 0: font_size = user_font_size font_name, font_resource = cls._find_annotation_font_resource(font_name, annotation, acro_form) # Retrieve formatting information is_comb = False max_length = None if field_flags & FieldDictionaryAttributes.FfBits.Comb: is_comb = True max_length = annotation.get("/MaxLen") is_multiline = False if field_flags & FieldDictionaryAttributes.FfBits.Multiline: is_multiline = True alignment = field.get("/Q", TextAlignment.LEFT) border_width = 1 border_style = BorderStyles.SOLID if "/BS" in field: border_width = cast(DictionaryObject, field["/BS"]).get("/W", border_width) border_style = cast(DictionaryObject, field["/BS"]).get("/S", border_style) # Create the TextStreamAppearance instance layout = BaseStreamConfig(rectangle=rectangle, border_width=border_width, border_style=border_style) new_appearance_stream = cls( layout, text, selection, font_resource, font_name=font_name, font_size=font_size, font_color=font_color, is_multiline=is_multiline, alignment=alignment, is_comb=is_comb, max_length=max_length ) if AnnotationDictionaryAttributes.AP in annotation: for key, value in ( cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items() ): if key in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: continue # Don't overwrite font resources added by TextAppearanceStream.__init__ if key == "/Resources": if "/Font" not in value: value.get_object()[NameObject("/Font")] = DictionaryObject() value["/Font"].get_object()[NameObject(font_name)] = 
getattr( font_resource, "indirect_reference", font_resource ) else: new_appearance_stream[key] = value return new_appearance_stream ================================================ FILE: pypdf/generic/_base.py ================================================ # Copyright (c) 2006, Mathieu Fenniak # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
class PdfObject(PdfObjectProtocol):
    """
    Base class of the PDF object types defined in this module.

    It provides the hashing helpers and the bookkeeping used to register a
    clone inside a destination writer; ``hash_bin``, ``clone`` and
    ``write_to_stream`` raise ``NotImplementedError`` here and are meant to
    be overridden.
    """

    # function for calculating a hash value
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    # Reference under which the object is registered, if any. Only declared
    # here: an instance may lack the attribute until something assigns it.
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the encoded string form of the object, used as hash input."""
        return f"{self}".encode()

    def hash_value(self) -> bytes:
        """
        Return ``b"<ClassName>:<hex digest>"``.

        The digest is ``hash_func`` applied to ``hash_value_data()``.
        """
        return (
            f"{self.__class__.__name__}:"
            f"{self.hash_func(self.hash_value_data()).hexdigest()}"
        ).encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        The base implementation simply delegates to ``clone(pdf_dest)``.

        Args:
          pdf_dest: Target to clone to.

        Returns:
          The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).


        Args:
          pdf_dest: Target to clone to.
          force_duplicate: By default, if the object has already been cloned and referenced,
            the copy will be returned; when ``True``, a new copy will be created.
            (Default value = ``False``)
          ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
            during cloning (applies to children duplication as well). If fields are to be
            considered for a limited number of levels, you have to add it as integer, for
            example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
            level only but ``"/TOTO"`` on all levels.

        Returns:
          The cloned PdfObject

        Raises:
          NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .clone so far"
        )

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Reference the object within the _objects of pdf_dest only if
        indirect_reference attribute exists (which means the objects was
        already identified in xref/xobjstm) if object has been already
        referenced do nothing.

        Args:
          clone: The freshly created copy of ``self`` to register.
          pdf_dest: The writer whose ``_objects`` list and ``_id_translated``
            table are updated.
          force_duplicate: When ``True``, an existing translation of the
            source object is not reused and a new slot is assigned.

        Returns:
          ``clone`` itself, or the object already stored in ``pdf_dest`` for
          this source reference when one exists and ``force_duplicate`` is
          ``False``.

        """
        # Shortcut: the clone is already registered in the destination.
        # Any failure here (missing attribute, ``None`` reference, ...) simply
        # means the shortcut does not apply.
        try:
            if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
                return clone
        except Exception:
            pass
        # if hasattr(clone, "indirect_reference"):
        # The source carries no ``indirect_reference`` attribute at all:
        # return the clone without registering it.
        try:
            ind = self.indirect_reference
        except AttributeError:
            return clone
        # Choose the 1-based object number: keep the source number when
        # writing incrementally on top of the very reader the source belongs
        # to (and that number fits in the current list), otherwise take the
        # next free number.
        if (
            pdf_dest.incremental
            and ind is not None
            and ind.pdf == pdf_dest._reader
            and ind.idnum <= len(pdf_dest._objects)
        ):
            i = ind.idnum
        else:
            i = len(pdf_dest._objects) + 1
        if ind is not None:
            # One translation table per source document, keyed by ``id()``.
            if id(ind.pdf) not in pdf_dest._id_translated:
                pdf_dest._id_translated[id(ind.pdf)] = {}
                # Stores the source document itself in the table; presumably
                # this keeps it alive so its ``id()`` is not reused - TODO confirm.
                pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf  # type: ignore[index]
            if (
                not force_duplicate
                and ind.idnum in pdf_dest._id_translated[id(ind.pdf)]
            ):
                # Already translated: hand back the existing destination object.
                obj = pdf_dest.get_object(
                    pdf_dest._id_translated[id(ind.pdf)][ind.idnum]
                )
                assert obj is not None
                return obj
            pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i
        try:
            pdf_dest._objects[i - 1] = clone
        except IndexError:
            # Slot ``i - 1`` does not exist yet: append instead and use the
            # resulting position as the object number.
            pdf_dest._objects.append(clone)
            i = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Serialize the object to ``stream``.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError
class BooleanObject(PdfObject):
    """PDF boolean object wrapping an arbitrary truthy/falsy ``value``."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = BooleanObject(self.value)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("BooleanObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        signature = (self.__class__, self.value)
        return hash(signature)

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(o, BooleanObject):
            other_value = o.value
        elif isinstance(o, bool):
            other_value = o
        else:
            return False
        return self.value == other_value

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the PDF keyword ``true`` or ``false`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read a boolean keyword from ``stream``.

        Four bytes are read first; for ``fals`` one further byte is consumed
        (without being inspected) to complete the keyword.

        Raises:
            PdfReadError: If the four bytes are neither ``true`` nor ``fals``.

        """
        token = stream.read(4)
        if token == b"fals":
            stream.read(1)
            return BooleanObject(False)
        if token != b"true":
            raise PdfReadError("Could not read Boolean object")
        return BooleanObject(True)
def clone(
    self,
    pdf_dest: PdfWriterProtocol,
    force_duplicate: bool = False,
    ignore_fields: Optional[Sequence[Union[str, int]]] = (),
) -> "IndirectObject":
    """
    Clone object into pdf_dest.

    Args:
        pdf_dest: Writer that receives the copy; its ``_id_translated``
            table is consulted and extended.
        force_duplicate: When ``False`` a reference that already belongs to
            ``pdf_dest`` is returned unchanged.
        ignore_fields: Passed through to the ``clone`` of the referenced
            object.

    Returns:
        An ``IndirectObject`` that points into ``pdf_dest``.

    """
    if self.pdf == pdf_dest and not force_duplicate:
        # Already duplicated and no extra duplication required
        return self
    # One translation table per source document, keyed by ``id()``.
    if id(self.pdf) not in pdf_dest._id_translated:
        pdf_dest._id_translated[id(self.pdf)] = {}
        # Stores the source document itself in the table; presumably this
        # keeps it alive so its ``id()`` is not reused - TODO confirm.
        pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

    if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
        # The referenced object was copied before: look up that copy.
        dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
        if force_duplicate:
            # Return a new reference object carrying the same numbers as the
            # reference of the existing copy.
            assert dup is not None
            assert dup.indirect_reference is not None
            idref = dup.indirect_reference
            return IndirectObject(idref.idnum, idref.generation, idref.pdf)
    else:
        obj = self.get_object()
        # case observed : a pointed object can not be found
        if obj is None:
            # Substitute a NullObject that still remembers this reference.
            obj = NullObject()
            assert isinstance(self, (IndirectObject,))
            obj.indirect_reference = self
        dup = pdf_dest._add_object(
            obj.clone(pdf_dest, force_duplicate, ignore_fields)
        )
    assert dup is not None, "mypy"
    assert dup.indirect_reference is not None, "mypy"
    return dup.indirect_reference
def __float__(self) -> float:
    """Return the result of ``__float__()`` of the referenced object."""
    # in this case we are looking for the pointed data
    return self.get_object().__float__()  # type: ignore
def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Returns:
        Hash considering type and value.

    """
    # ``as_numeric`` has to be *called*: hashing the bound method object
    # itself mixes in the identity of ``self``, so two equal floats would
    # produce different hashes and always look "modified".
    return hash((self.__class__, self.as_numeric()))
class ByteStringObject(bytes, PdfObject):
    """
    A string object whose text encoding could not be determined.

    The PDF format offers no separate representation for binary strings, so
    values that are clearly not text (for example the encryption data stored
    under /O) still arrive as "String" objects and end up here.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("ByteStringObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        signature = (self.__class__, bytes(self))
        return hash(signature)

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes to ``stream`` as a hexadecimal string ``<...>``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        for chunk in (b"<", binascii.hexlify(self), b">"):
            stream.write(chunk)

    def __str__(self) -> str:
        # Try UTF-16 first, then every charset configured on NameObject;
        # the first one that decodes without error wins.
        for encoding in ("utf-16", *NameObject.CHARSETS):
            try:
                decoded = self.decode(encoding)
            except UnicodeDecodeError:
                continue
            return decoded
        raise PdfReadError("Cannot decode ByteStringObject.")
def __new__(cls, value: Any) -> Self:
    """
    Create the text string and record how it would be encoded again.

    Args:
        value: Either text, or ``bytes``. Bytes starting with a UTF-16
            byte order mark are decoded as UTF-16; other bytes are decoded
            with the ``charmap`` codec.

    Returns:
        The new instance with ``_original_bytes``, ``autodetect_utf16``,
        ``autodetect_pdfdocencoding`` and ``utf16_bom`` always set.

    """
    original_bytes = None
    if isinstance(value, bytes):
        original_bytes = value
        value = value.decode("charmap")
    text_string_object = str.__new__(cls, value)
    text_string_object._original_bytes = original_bytes
    text_string_object.autodetect_utf16 = False
    text_string_object.autodetect_pdfdocencoding = False
    text_string_object.utf16_bom = b""
    if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
        # The value of `original_bytes` is only set for inputs being `bytes`.
        # If this is UTF-16 data according to the BOM (first two characters),
        # perform special handling. All other cases should not need any special conversion
        # due to already being a string.
        try:
            text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
        except UnicodeDecodeError as exception:
            logger_warning(
                f"{exception!s}\ninitial string:{exception.object!r}",
                __name__,
            )
            # Keep whatever could be decoded before the faulty position.
            text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
        # A brand-new instance was created above, so every flag has to be
        # assigned again. ``autodetect_pdfdocencoding`` has no class-level
        # default; leaving it out made later reads (e.g. in ``clone``) raise
        # AttributeError.
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = True
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = original_bytes[:2]
    else:
        try:
            # Only used as a probe: does the text fit into PDFDocEncoding?
            encode_pdfdocencoding(text_string_object)
            text_string_object.autodetect_pdfdocencoding = True
        except UnicodeEncodeError:
            text_string_object.autodetect_utf16 = True
            text_string_object.utf16_bom = codecs.BOM_UTF16_BE
    return text_string_object
def get_original_bytes(self) -> bytes:
    """
    Rebuild the bytes this text would have been created from.

    The library sometimes needs the raw bytes of a string that was
    auto-detected as text (the detection can be wrong), so the encoding is
    reversed according to the recorded autodetect flags.
    """
    if not self.autodetect_utf16:
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    # UTF-16: re-emit the byte order mark that was recorded, if any.
    bom = self.utf16_bom
    if bom == codecs.BOM_UTF16_LE:
        return codecs.BOM_UTF16_LE + self.encode("utf-16le")
    big_endian = self.encode("utf-16be")
    if bom == codecs.BOM_UTF16_BE:
        return codecs.BOM_UTF16_BE + big_endian
    return big_endian
def write_to_stream(
    self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
) -> None:
    """
    Write the string to ``stream`` as a PDF literal string ``(...)``.

    Every byte of ``get_encoded_bytes()`` that is not an ASCII letter, an
    ASCII digit or a space is written as a three-digit octal escape.

    Args:
        stream: Destination; receives a single ``write`` call.
        encryption_key: Deprecated, only triggers the deprecation helper.

    """
    if encryption_key is not None:  # deprecated
        deprecation_no_replacement(
            "the encryption_key parameter of write_to_stream", "5.0.0"
        )
    encoded = self.get_encoded_bytes()
    # Assemble the whole literal first and write it once instead of issuing
    # one ``stream.write`` call per byte.
    pieces = [b"("]
    for code in encoded:
        char = bytes((code,))
        if char.isalnum() or char == b" ":
            pieces.append(char)
        else:
            # %-formatting is used on purpose; an f-string variant tripped
            # https://github.com/davidhalter/parso/issues/207
            pieces.append(b"\\%03o" % code)
    pieces.append(b")")
    stream.write(b"".join(pieces))
def renumber(self) -> bytes:
    """
    Return the name as bytes with special characters written as ``#XX``.

    The first character is emitted as is (the deprecation helper is invoked
    when it is not ``/``). Of the remaining characters, those above ``~``
    are escaped byte by byte from their UTF-8 form, those listed in
    ``renumber_table`` use the table entry, and all others are copied.
    """
    leading = self[0].encode("utf-8")
    if leading != b"/":
        deprecation_no_replacement(
            f"Incorrect first char in NameObject, should start with '/': ({self})",
            "5.0.0",
        )
    table = self.renumber_table
    encoded = [leading]
    for char in self[1:]:
        if char > "~":
            encoded += [f"#{byte:02X}".encode() for byte in char.encode("utf-8")]
        elif char in table:
            encoded.append(table[char])
        else:
            encoded.append(char.encode("utf-8"))
    return b"".join(encoded)
""" name = str(self).removeprefix("/") name = re.sub(r"\ ", "_", name) name = re.sub(r"[^a-zA-Z0-9_-]", "_", name) return NameObject("/" + name) @classproperty def surfix(cls) -> bytes: # noqa: N805 deprecation_with_replacement("surfix", "prefix", "5.0.0") return b"/" @staticmethod def unnumber(sin: bytes) -> bytes: result = bytearray() i = 0 while i < len(sin): if sin[i:i + 1] == b"#": try: result.append(int(sin[i + 1 : i + 3], 16)) i += 3 continue except (ValueError, IndexError): # if the 2 characters after # can not be converted to hex # we change nothing and carry on pass result.append(sin[i]) i += 1 return bytes(result) CHARSETS = ("utf-8", "gbk", "latin1") @staticmethod def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader name = stream.read(1) if name != NameObject.prefix: raise PdfReadError("Name read error") name += read_until_regex(stream, NameObject.delimiter_pattern) try: # Name objects should represent irregular characters # with a '#' followed by the symbol's hex number name = NameObject.unnumber(name) for enc in NameObject.CHARSETS: try: ret = name.decode(enc) return NameObject(ret) except Exception: pass raise UnicodeDecodeError("", name, 0, 0, "Code Not Found") except (UnicodeEncodeError, UnicodeDecodeError) as e: if not pdf.strict: logger_warning( f"Illegal character in NameObject ({name!r}), " "you may need to adjust NameObject.CHARSETS", __name__, ) return NameObject(name.decode("charmap")) raise PdfReadError( f"Illegal character in NameObject ({name!r}). " "You may need to adjust NameObject.CHARSETS.", ) from e def encode_pdfdocencoding(unicode_string: str) -> bytes: try: return bytes([_pdfdoc_encoding_rev[k] for k in unicode_string]) except KeyError: raise UnicodeEncodeError( "pdfdocencoding", unicode_string, -1, -1, "does not exist in translation table", ) def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]: """ Returns: True if x is None or NullObject. 
""" return x is None or ( isinstance(x, PdfObject) and (x.get_object() is None or isinstance(x.get_object(), NullObject)) ) ================================================ FILE: pypdf/generic/_data_structures.py ================================================ # Copyright (c) 2006, Mathieu Fenniak # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
def replicate(
    self,
    pdf_dest: PdfWriterProtocol,
) -> "ArrayObject":
    """
    Build the counterpart of this array inside ``pdf_dest``.

    A new, empty array is registered through ``_reference_clone``; every
    element offering a ``replicate`` method is replicated into it, all other
    elements are appended unchanged.
    """
    target = cast(
        "ArrayObject",
        self._reference_clone(ArrayObject(), pdf_dest, False),
    )
    target.extend(
        element.replicate(pdf_dest) if hasattr(element, "replicate") else element
        for element in self
    )
    return target
def _to_lst(self, lst: Any) -> list[Any]:
    """
    Normalise an operand of ``+``, ``+=`` or ``-=`` into an iterable of items.

    Args:
        lst: Lists, tuples and sets are returned unchanged. A ``PdfObject``
            is wrapped as is. A ``str`` becomes a ``NameObject`` when it
            starts with ``/`` and a ``TextStringObject`` otherwise
            (including the empty string). ``bytes`` become a
            ``ByteStringObject``. Anything else (numbers, ...) is wrapped
            unchanged.

    Returns:
        The collection itself, or a one-element list.

    """
    # Convert to list, internal
    if isinstance(lst, (list, tuple, set)):
        return lst
    if isinstance(lst, PdfObject):
        return [lst]
    if isinstance(lst, str):
        # ``startswith`` instead of ``lst[0]``: an empty string must end up
        # as an empty text string rather than raising IndexError.
        if lst.startswith("/"):
            return [NameObject(lst)]
        return [TextStringObject(lst)]
    if isinstance(lst, bytes):
        return [ByteStringObject(lst)]
    # for numbers,...
    return [lst]
def write_to_stream(
    self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
) -> None:
    """
    Write the array to ``stream`` as ``[ item item ... ]``.

    Each element is preceded by a single space and serialized through its
    own ``write_to_stream``; the closing bracket is preceded by a space too.
    """
    if encryption_key is not None:  # deprecated
        deprecation_no_replacement(
            "the encryption_key parameter of write_to_stream", "5.0.0"
        )
    emit = stream.write
    emit(b"[")
    for element in self:
        emit(b" ")
        element.write_to_stream(stream)
    emit(b" ]")
def clone(
    self,
    pdf_dest: PdfWriterProtocol,
    force_duplicate: bool = False,
    ignore_fields: Optional[Sequence[Union[str, int]]] = (),
) -> "DictionaryObject":
    """
    Clone object into pdf_dest.

    ``self`` is returned untouched when it is already referenced by
    ``pdf_dest`` and no duplicate is forced. Otherwise a new instance of the
    same class is registered through ``_reference_clone`` and, if that
    instance is still empty, filled from ``self`` by ``_clone``.
    """
    try:
        reuse_self = bool(
            self.indirect_reference.pdf == pdf_dest and not force_duplicate  # type: ignore
        )
    except Exception:
        # No usable reference on this object: it cannot be reused as is.
        reuse_self = False
    if reuse_self:
        return self

    target = cast(
        "DictionaryObject",
        self._reference_clone(self.__class__(), pdf_dest, force_duplicate),
    )
    if len(target.keys()) != 0:
        # An already populated object came back: nothing left to copy.
        return target

    seen: set[tuple[int, int]] = set()  # (idnum, generation)
    fields_to_skip = [] if ignore_fields is None else ignore_fields
    target._clone(self, pdf_dest, force_duplicate, fields_to_skip, seen)
    return target
Args: src: "DictionaryObject": pdf_dest: force_duplicate: ignore_fields: """ # First we remove the ignore_fields # that are for a limited number of levels assert ignore_fields is not None ignore_fields = list(ignore_fields) x = 0 while x < len(ignore_fields): if isinstance(ignore_fields[x], int): if cast(int, ignore_fields[x]) <= 0: del ignore_fields[x] del ignore_fields[x] continue ignore_fields[x] -= 1 # type:ignore x += 1 # Check if this is a chain list, we need to loop to prevent recur if any( field not in ignore_fields and field in src and isinstance(src.raw_get(field), IndirectObject) and isinstance(src[field], DictionaryObject) and ( src.get("/Type", None) is None or cast(DictionaryObject, src[field]).get("/Type", None) is None or src.get("/Type", None) == cast(DictionaryObject, src[field]).get("/Type", None) ) for field in ["/Next", "/Prev", "/N", "/V"] ): ignore_fields = list(ignore_fields) for lst in (("/Next", "/Prev"), ("/N", "/V")): for k in lst: objs = [] if ( k in src and k not in self and isinstance(src.raw_get(k), IndirectObject) and isinstance(src[k], DictionaryObject) # If need to go further the idea is to check # that the types are the same and ( src.get("/Type", None) is None or cast(DictionaryObject, src[k]).get("/Type", None) is None or src.get("/Type", None) == cast(DictionaryObject, src[k]).get("/Type", None) ) ): cur_obj: Optional[DictionaryObject] = cast( "DictionaryObject", src[k] ) prev_obj: Optional[DictionaryObject] = self while cur_obj is not None: clon = cast( "DictionaryObject", cur_obj._reference_clone( cur_obj.__class__(), pdf_dest, force_duplicate ), ) # Check to see if we've previously processed our item if clon.indirect_reference is not None: idnum = clon.indirect_reference.idnum generation = clon.indirect_reference.generation if (idnum, generation) in visited: cur_obj = None break visited.add((idnum, generation)) objs.append((cur_obj, clon)) assert prev_obj is not None prev_obj[NameObject(k)] = clon.indirect_reference 
def get_inherited(self, key: str, default: Any = None) -> Any:
    """
    Return the value of ``key``, searching the ``/Parent`` chain if needed.

    The lookup starts at this dictionary and follows ``/Parent`` entries
    upwards until one of the dictionaries contains ``key``.

    Args:
        key: string identifying the field to return
        default: default value to return

    Returns:
        Current key or inherited one, otherwise default value. The default
        is also returned when the ``/Parent`` chain loops back on itself or
        leads to something that is not a dictionary.
    """
    node: Any = self
    seen: set[int] = set()
    # Walk iteratively: a damaged file may contain a cyclic /Parent chain,
    # which would otherwise recurse until the interpreter gives up.
    while isinstance(node, dict) and id(node) not in seen:
        if key in node:
            return node[key]
        if "/Parent" not in node:
            return default
        seen.add(id(node))
        node = node["/Parent"].get_object()
    return default
""" from ..xmp import XmpInformation # noqa: PLC0415 metadata = self.get("/Metadata", None) if is_null_or_none(metadata): return None assert metadata is not None, "mypy" metadata = metadata.get_object() return XmpInformation(metadata) def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: if encryption_key is not None: # deprecated deprecation_no_replacement( "the encryption_key parameter of write_to_stream", "5.0.0" ) stream.write(b"<<\n") for key, value in self.items(): if len(key) > 2 and key[1] == "%" and key[-1] == "%": continue key.write_to_stream(stream, encryption_key) stream.write(b" ") value.write_to_stream(stream) stream.write(b"\n") stream.write(b">>") @classmethod def _get_next_object_position( cls, position_before: int, position_end: int, generations: list[int], pdf: PdfReaderProtocol ) -> int: out = position_end for generation in generations: location = pdf.xref[generation] values = [x for x in location.values() if position_before < x <= position_end] if values: out = min(out, *values) return out @classmethod def _read_unsized_from_stream( cls, stream: StreamType, pdf: PdfReaderProtocol ) -> bytes: object_position = cls._get_next_object_position( position_before=stream.tell(), position_end=2 ** 32, generations=list(pdf.xref), pdf=pdf ) - 1 current_position = stream.tell() # Read until the next object position. read_value = stream.read(object_position - stream.tell()) endstream_position = read_value.find(b"endstream") if endstream_position < 0: raise PdfReadError( f"Unable to find 'endstream' marker for obj starting at {current_position}." 
) # 9 = len(b"endstream") stream.seek(current_position + endstream_position + 9) return read_value[: endstream_position - 1] @staticmethod def read_from_stream( stream: StreamType, pdf: Optional[PdfReaderProtocol], forced_encoding: Union[None, str, list[str], dict[int, str]] = None, ) -> "DictionaryObject": tmp = stream.read(2) if tmp != b"<<": raise PdfReadError( f"Dictionary read error at byte {hex(stream.tell())}: " "stream must begin with '<<'" ) data: dict[Any, Any] = {} while True: tok = read_non_whitespace(stream) if tok == b"\x00": continue if tok == b"%": stream.seek(-1, 1) skip_over_comment(stream) continue if not tok: raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) if tok == b">": stream.read(1) break stream.seek(-1, 1) try: try: key = read_object(stream, pdf) if isinstance(key, NullObject): break if not isinstance(key, NameObject): raise PdfReadError( f"Expecting a NameObject for key but found {key!r}" ) except PdfReadError as exc: if pdf is not None and pdf.strict: raise logger_warning(exc.__repr__(), __name__) continue tok = read_non_whitespace(stream) stream.seek(-1, 1) value = read_object(stream, pdf, forced_encoding) except Exception as exc: if pdf is not None and pdf.strict: raise PdfReadError(exc.__repr__()) logger_warning(exc.__repr__(), __name__) retval = DictionaryObject() retval.update(data) return retval # return partial data if not data.get(key): data[key] = value else: # multiple definitions of key not permitted msg = ( f"Multiple definitions in dictionary at byte " f"{hex(stream.tell())} for key {key}" ) if pdf is not None and pdf.strict: raise PdfReadError(msg) logger_warning(msg, __name__) pos = stream.tell() s = read_non_whitespace(stream) if s == b"s" and stream.read(5) == b"tream": eol = stream.read(1) # Occasional PDF file output has spaces after 'stream' keyword but before EOL. 
# patch provided by Danial Sandler while eol == b" ": eol = stream.read(1) if eol not in (b"\n", b"\r"): raise PdfStreamError("Stream data must be followed by a newline") if eol == b"\r" and stream.read(1) != b"\n": stream.seek(-1, 1) # this is a stream object, not a dictionary if SA.LENGTH not in data: if pdf is not None and pdf.strict: raise PdfStreamError("Stream length not defined") logger_warning( f"Stream length not defined @pos={stream.tell()}", __name__ ) data[NameObject(SA.LENGTH)] = NumberObject(-1) length = data[SA.LENGTH] if isinstance(length, IndirectObject): t = stream.tell() assert pdf is not None, "mypy" length = pdf.get_object(length) stream.seek(t, 0) if length is None: # if the PDF is damaged length = -1 pstart = stream.tell() if length >= 0: from ..filters import MAX_DECLARED_STREAM_LENGTH # noqa: PLC0415 if length > MAX_DECLARED_STREAM_LENGTH: raise LimitReachedError(f"Declared stream length of {length} exceeds maximum allowed length.") data["__streamdata__"] = stream.read(length) else: data["__streamdata__"] = read_until_regex( stream, re.compile(b"endstream") ) e = read_non_whitespace(stream) ndstream = stream.read(8) if (e + ndstream) != b"endstream": # the odd PDF file has a length that is too long, so # we need to read backwards to find the "endstream" ending. # ReportLab (unknown version) generates files with this bug, # and Python users into PDF files tend to be our audience. # we need to do this to correct the streamdata and chop off # an extra character. pos = stream.tell() stream.seek(-10, 1) end = stream.read(9) if end == b"endstream": # we found it by looking back one character further. 
def children(self) -> Iterable[Any]:
    """
    Lazily yield the child nodes of this tree node.

    Iteration starts at ``/First`` and follows each node's ``/Next`` entry.
    It ends after the node equal to ``/Last``, when a node has no usable
    ``/Next``, or, with a warning, when a node is reached a second time.
    """
    if not self.has_children():
        return

    first_ref = self[NameObject("/First")]
    final_node = self[NameObject("/Last")]
    node = first_ref.get_object()
    seen_ids: set[int] = set()

    while id(node) not in seen_ids:
        seen_ids.add(id(node))
        yield node
        if node == final_node:
            return
        following = node.get(NameObject("/Next"))  # type: ignore
        if is_null_or_none(following):
            return
        node = following.get_object()

    # Only reached when the loop condition found an already visited node.
    logger_warning(f"Detected cycle in outline structure for {node}", __name__)
def insert_child(
    self,
    child: Any,
    before: Any,
    pdf: PdfWriterProtocol,
    inc_parent_counter: Optional[Callable[..., Any]] = None,
) -> IndirectObject:
    """
    Link ``child`` into this node's list of children.

    Args:
        child: Node to insert; its ``get_object()`` result is modified and
            its ``indirect_reference`` is what gets stored in the tree.
        before: Indirect reference the walk below is compared against;
            when no visited node matches, the child is appended at the end.
        pdf: Not used inside this method.
        inc_parent_counter: Callable invoked as ``(self, count)`` once the
            child is linked; defaults to ``inc_parent_counter_default``.

    Returns:
        The indirect reference of the inserted child.
    """
    if inc_parent_counter is None:
        inc_parent_counter = self.inc_parent_counter_default
    child_obj = child.get_object()
    child = child.indirect_reference  # get_reference(child_obj)

    prev: Optional[DictionaryObject]
    if "/First" not in self:  # no child yet
        # The new child is both first and last; /Count starts at 0 and is
        # raised through inc_parent_counter below.
        self[NameObject("/First")] = child
        self[NameObject("/Count")] = NumberObject(0)
        self[NameObject("/Last")] = child
        child_obj[NameObject("/Parent")] = self.indirect_reference
        inc_parent_counter(self, child_obj.get("/Count", 1))
        # A single child has no siblings: drop stale links.
        if "/Next" in child_obj:
            del child_obj["/Next"]
        if "/Prev" in child_obj:
            del child_obj["/Prev"]
        return child
    # NOTE(review): the walk starts at /Last and follows /Next, so it looks
    # like only the last node can ever match `before` - confirm intent.
    prev = cast("DictionaryObject", self["/Last"])

    while prev.indirect_reference != before:
        if "/Next" in prev:
            prev = cast("TreeObject", prev["/Next"])
        else:  # append at the end
            prev[NameObject("/Next")] = cast("TreeObject", child)
            child_obj[NameObject("/Prev")] = prev.indirect_reference
            child_obj[NameObject("/Parent")] = self.indirect_reference
            if "/Next" in child_obj:
                del child_obj["/Next"]
            self[NameObject("/Last")] = child
            inc_parent_counter(self, child_obj.get("/Count", 1))
            return child
    # Here `prev` is the node whose reference equals `before`.
    try:  # insert as first or in the middle
        assert isinstance(prev["/Prev"], DictionaryObject)
        prev["/Prev"][NameObject("/Next")] = child
        child_obj[NameObject("/Prev")] = prev["/Prev"]
    except Exception:  # it means we are inserting in first position
        # NOTE(review): this delete is unguarded, so a child without /Next
        # raises KeyError here; self["/First"] is not reassigned on this
        # path either - verify against callers.
        del child_obj["/Next"]
    child_obj[NameObject("/Next")] = prev
    prev[NameObject("/Prev")] = child
    child_obj[NameObject("/Parent")] = self.indirect_reference
    inc_parent_counter(self, child_obj.get("/Count", 1))
    return child
def empty_tree(self) -> None:
    """
    Detach every child from this node and drop the node's tree entries.

    Each child has its ``/Parent``, ``/Next`` and ``/Prev`` entries removed
    through ``_reset_node_tree_relationship``; afterwards ``/Count``,
    ``/First`` and ``/Last`` are removed from this node when present.
    """
    # Take a snapshot of the children before touching them: iterating this
    # object walks the /Next links lazily, and resetting a child deletes
    # its /Next entry, which would end the walk after the first child.
    for child in list(self):
        child_obj = child.get_object()
        _reset_node_tree_relationship(child_obj)

    for entry in ("/Count", "/First", "/Last"):
        if NameObject(entry) in self:
            del self[NameObject(entry)]
def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
    """
    Build a Flate-compressed copy of this stream.

    ``/FlateDecode`` is placed in front of any filter already present, and
    a ``NullObject`` is placed in front of the existing decode parameters.

    Args:
        level: Compression level handed to ``FlateDecode.encode``.

    Returns:
        A new EncodedStreamObject carrying this stream's entries, the
        updated filter entries and the compressed data.
    """
    from ..filters import FlateDecode  # noqa: PLC0415

    if SA.FILTER not in self:
        # Nothing applied so far: a single filter name, no parameters.
        filters = NameObject(FT.FLATE_DECODE)
        decode_parms = None
    else:
        current = self[SA.FILTER]
        if isinstance(current, ArrayObject):
            filters = ArrayObject([NameObject(FT.FLATE_DECODE), *current])
            try:
                decode_parms = ArrayObject(
                    [NullObject(), *self.get(SA.DECODE_PARMS, ArrayObject())]
                )
            except TypeError:
                # The existing parameters could not be unpacked (not an
                # array): keep them as one single entry.
                decode_parms = ArrayObject(
                    [NullObject(), self.get(SA.DECODE_PARMS, ArrayObject())]
                )
        else:
            filters = ArrayObject([NameObject(FT.FLATE_DECODE), current])
            decode_parms = ArrayObject(
                [NullObject(), self.get(SA.DECODE_PARMS, NullObject())]
            )

    encoded = EncodedStreamObject()
    encoded.update(self)
    encoded[NameObject(SA.FILTER)] = filters
    if decode_parms is not None:
        encoded[NameObject(SA.DECODE_PARMS)] = decode_parms
    encoded._data = FlateDecode.encode(self._data, level)
    return encoded
""" from ._image_xobject import _xobj_to_image # noqa: PLC0415 if self.get("/Subtype", "") != "/Image": try: msg = f"{self.indirect_reference} does not seem to be an Image" # pragma: no cover except AttributeError: msg = f"{self.__repr__()} object does not seem to be an Image" # pragma: no cover logger_warning(msg, __name__) extension, _, img = _xobj_to_image(self, pillow_parameters) if extension is None: return None # pragma: no cover return img class DecodedStreamObject(StreamObject): pass class EncodedStreamObject(StreamObject): def __init__(self) -> None: self.decoded_self: Optional[DecodedStreamObject] = None # This overrides the parent method def get_data(self) -> bytes: from ..filters import decode_stream_data # noqa: PLC0415 if self.decoded_self is not None: # Cached version of decoded object return self.decoded_self.get_data() # Create decoded object decoded = DecodedStreamObject() decoded.set_data(decode_stream_data(self)) for key, value in self.items(): if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS): decoded[key] = value self.decoded_self = decoded return decoded.get_data() # This overrides the parent method: def set_data(self, data: bytes) -> None: from ..filters import FlateDecode # noqa: PLC0415 if self.get(SA.FILTER, "") in (FT.FLATE_DECODE, [FT.FLATE_DECODE]): if not isinstance(data, bytes): raise TypeError("Data must be bytes") if self.decoded_self is None: self.get_data() # to create self.decoded_self assert self.decoded_self is not None, "mypy" self.decoded_self.set_data(data) super().set_data(FlateDecode.encode(data)) else: raise PdfReadError( "Streams encoded with a filter different from FlateDecode are not supported" ) CONTENT_STREAM_ARRAY_MAX_LENGTH = 10_000 class ContentStream(DecodedStreamObject): """ In order to be fast, this data structure can contain either: * raw data in ._data * parsed stream operations in ._operations. 
At any time, ContentStream object can either have both of those fields defined, or one field defined and the other set to None. These fields are "rebuilt" lazily, when accessed: * when .get_data() is called, if ._data is None, it is rebuilt from ._operations. * when .operations is called, if ._operations is None, it is rebuilt from ._data. Conversely, these fields can be invalidated: * when .set_data() is called, ._operations is set to None. * when .operations is set, ._data is set to None. """ def __init__( self, stream: Any, pdf: Any, forced_encoding: Union[None, str, list[str], dict[int, str]] = None, ) -> None: self.pdf = pdf self._operations: list[tuple[Any, bytes]] = [] # stream may be a StreamObject or an ArrayObject containing # StreamObjects to be concatenated together. if stream is None: super().set_data(b"") else: stream = stream.get_object() if isinstance(stream, ArrayObject): from pypdf.filters import MAX_ARRAY_BASED_STREAM_OUTPUT_LENGTH # noqa: PLC0415 if (stream_length := len(stream)) > CONTENT_STREAM_ARRAY_MAX_LENGTH: raise LimitReachedError( f"Array-based stream has {stream_length} > {CONTENT_STREAM_ARRAY_MAX_LENGTH} elements." ) data = bytearray() length = 0 for s in stream: s_resolved = s.get_object() if isinstance(s_resolved, NullObject): continue if not isinstance(s_resolved, StreamObject): # No need to emit an exception here for now - the PDF structure # seems to already be broken beforehand in these cases. logger_warning( f"Expected StreamObject, got {type(s_resolved).__name__} instead. Data might be wrong.", __name__ ) else: new_data = s_resolved.get_data() length += len(new_data) if length > MAX_ARRAY_BASED_STREAM_OUTPUT_LENGTH: raise LimitReachedError( f"Array-based stream has at least {length} > " f"{MAX_ARRAY_BASED_STREAM_OUTPUT_LENGTH} output bytes." ) data += new_data if len(data) == 0 or data[-1] != b"\n": # There should be no direct need to check for a change of one byte. 
length += 1 data += b"\n" super().set_data(bytes(data)) else: stream_data = stream.get_data() assert stream_data is not None super().set_data(stream_data) self.forced_encoding = forced_encoding def replicate( self, pdf_dest: PdfWriterProtocol, ) -> "ContentStream": d__ = cast( "ContentStream", self._reference_clone(self.__class__(None, None), pdf_dest, False), ) d__._data = self._data try: decoded_self = self.decoded_self if decoded_self is None: self.decoded_self = None else: self.decoded_self = cast( "DecodedStreamObject", decoded_self.replicate(pdf_dest) ) except Exception: pass for k, v in self.items(): d__[k.replicate(pdf_dest)] = ( v.replicate(pdf_dest) if hasattr(v, "replicate") else v ) return d__ d__.set_data(self._data) d__.pdf = pdf_dest d__._operations = list(self._operations) d__.forced_encoding = self.forced_encoding return d__ def clone( self, pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = (), ) -> "ContentStream": """ Clone object into pdf_dest. Args: pdf_dest: force_duplicate: ignore_fields: Returns: The cloned ContentStream """ try: if self.indirect_reference.pdf == pdf_dest and not force_duplicate: # type: ignore return self except Exception: pass visited: set[tuple[int, int]] = set() d__ = cast( "ContentStream", self._reference_clone( self.__class__(None, None), pdf_dest, force_duplicate ), ) if ignore_fields is None: ignore_fields = [] d__._clone(self, pdf_dest, force_duplicate, ignore_fields, visited) return d__ def _clone( self, src: DictionaryObject, pdf_dest: PdfWriterProtocol, force_duplicate: bool, ignore_fields: Optional[Sequence[Union[str, int]]], visited: set[tuple[int, int]], ) -> None: """ Update the object from src. 
Args: src: pdf_dest: force_duplicate: ignore_fields: """ src_cs = cast("ContentStream", src) super().set_data(src_cs._data) self.pdf = pdf_dest self._operations = list(src_cs._operations) self.forced_encoding = src_cs.forced_encoding # no need to call DictionaryObjection or anything # like super(DictionaryObject,self)._clone(src, pdf_dest, force_duplicate, ignore_fields, visited) def _parse_content_stream(self, stream: StreamType) -> None: # 7.8.2 Content Streams stream.seek(0, 0) operands: list[Union[int, str, PdfObject]] = [] while True: peek = read_non_whitespace(stream) if peek in (b"", 0): break stream.seek(-1, 1) if peek.isalpha() or peek in (b"'", b'"'): operator = read_until_regex(stream, NameObject.delimiter_pattern) if operator == b"BI": # begin inline image - a completely different parsing # mechanism is required, of course... thanks buddy... assert operands == [] ii = self._read_inline_image(stream) self._operations.append((ii, b"INLINE IMAGE")) else: self._operations.append((operands, operator)) operands = [] elif peek == b"%": # If we encounter a comment in the content stream, we have to # handle it here. Typically, read_object will handle # encountering a comment -- but read_object assumes that # following the comment must be the object we're trying to # read. In this case, it could be an operator instead. while peek not in (b"\r", b"\n", b""): peek = stream.read(1) else: operands.append(read_object(stream, None, self.forced_encoding)) def _read_inline_image(self, stream: StreamType) -> dict[str, Any]: # begin reading just after the "BI" - begin image # first read the dictionary of settings. 
settings = DictionaryObject() while True: tok = read_non_whitespace(stream) stream.seek(-1, 1) if tok == b"I": # "ID" - begin of image data break key = read_object(stream, self.pdf) tok = read_non_whitespace(stream) stream.seek(-1, 1) value = read_object(stream, self.pdf) settings[key] = value # left at beginning of ID tmp = stream.read(3) assert tmp[:2] == b"ID" filtr = settings.get("/F", settings.get("/Filter", "not set")) savpos = stream.tell() if isinstance(filtr, list): filtr = filtr[0] # used forencoding if "AHx" in filtr or "ASCIIHexDecode" in filtr: data = extract_inline__ascii_hex_decode(stream) elif "A85" in filtr or "ASCII85Decode" in filtr: data = extract_inline__ascii85_decode(stream) elif "RL" in filtr or "RunLengthDecode" in filtr: data = extract_inline__run_length_decode(stream) elif "DCT" in filtr or "DCTDecode" in filtr: data = extract_inline__dct_decode(stream) elif filtr == "not set": cs = settings.get("/CS", "") if isinstance(cs, list): cs = cs[0] if "RGB" in cs: lcs = 3 elif "CMYK" in cs: lcs = 4 else: bits = settings.get( "/BPC", 8 if cs in {"/I", "/G", "/Indexed", "/DeviceGray"} else -1, ) if bits > 0: lcs = bits / 8.0 else: data = extract_inline_default(stream) lcs = -1 if lcs > 0: data = stream.read( ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"]) ) # Move to the `EI` if possible. ei = read_non_whitespace(stream) stream.seek(-1, 1) else: data = extract_inline_default(stream) ei = stream.read(3) stream.seek(-1, 1) if ei[:2] != b"EI" or ei[2:3] not in WHITESPACES: # Deal with wrong/missing `EI` tags. Example: Wrong dimensions specified above. stream.seek(savpos, 0) data = extract_inline_default(stream) ei = stream.read(3) stream.seek(-1, 1) if ei[:2] != b"EI" or ei[2:3] not in WHITESPACES: # pragma: no cover # Check the same condition again. This should never fail as # edge cases are covered by `extract_inline_default` above, # but check this ot make sure that we are behind the `EI` afterwards. 
raise PdfStreamError( f"Could not extract inline image, even using fallback. Expected 'EI', got {ei!r}" ) return {"settings": settings, "data": data} # This overrides the parent method def get_data(self) -> bytes: if not self._data: new_data = BytesIO() for operands, operator in self._operations: if operator == b"INLINE IMAGE": new_data.write(b"BI") dict_text = BytesIO() operands["settings"].write_to_stream(dict_text) new_data.write(dict_text.getvalue()[2:-2]) new_data.write(b"ID ") new_data.write(operands["data"]) new_data.write(b"EI") else: for op in operands: op.write_to_stream(new_data) new_data.write(b" ") new_data.write(operator) new_data.write(b"\n") self._data = new_data.getvalue() return self._data # This overrides the parent method def set_data(self, data: bytes) -> None: super().set_data(data) self._operations = [] @property def operations(self) -> list[tuple[Any, bytes]]: if not self._operations and self._data: self._parse_content_stream(BytesIO(self._data)) self._data = b"" return self._operations @operations.setter def operations(self, operations: list[tuple[Any, bytes]]) -> None: self._operations = operations self._data = b"" def isolate_graphics_state(self) -> None: if self._operations: self._operations.insert(0, ([], b"q")) self._operations.append(([], b"Q")) elif self._data: self._data = b"q\n" + self._data + b"\nQ\n" # This overrides the parent method def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: if not self._data and self._operations: self.get_data() # this ensures ._data is rebuilt super().write_to_stream(stream, encryption_key) def read_object( stream: StreamType, pdf: Optional[PdfReaderProtocol], forced_encoding: Union[None, str, list[str], dict[int, str]] = None, ) -> Union[PdfObject, int, str, ContentStream]: tok = stream.read(1) stream.seek(-1, 1) # reset to start if tok == b"/": return NameObject.read_from_stream(stream, pdf) if tok == b"<": # hexadecimal string OR dictionary peek 
= stream.read(2) stream.seek(-2, 1) # reset to start if peek == b"<<": return DictionaryObject.read_from_stream(stream, pdf, forced_encoding) return read_hex_string_from_stream(stream, forced_encoding) if tok == b"[": return ArrayObject.read_from_stream(stream, pdf, forced_encoding) if tok in (b"t", b"f"): return BooleanObject.read_from_stream(stream) if tok == b"(": return read_string_from_stream(stream, forced_encoding) if tok == b"e" and stream.read(6) == b"endobj": return NullObject() if tok == b"n": return NullObject.read_from_stream(stream) if tok == b"%": # comment skip_over_comment(stream) tok = read_non_whitespace(stream) stream.seek(-1, 1) return read_object(stream, pdf, forced_encoding) if tok in b"0123456789+-.": # number object OR indirect reference peek = stream.read(20) stream.seek(-len(peek), 1) # reset to start if IndirectPattern.match(peek) is not None: assert pdf is not None, "mypy" return IndirectObject.read_from_stream(stream, pdf) return NumberObject.read_from_stream(stream) pos = stream.tell() stream.seek(-20, 1) stream_extract = stream.read(80) stream.seek(pos) read_until_whitespace(stream) raise PdfReadError( f"Invalid Elementary Object starting with {tok!r} @{pos}: {stream_extract!r}" ) class Field(TreeObject): """ A class representing a field dictionary. 
def __init__(self, data: DictionaryObject) -> None:
    """
    Copy the known field entries of ``data`` into this field.

    Args:
        data: The field dictionary to read the entries from. Its indirect
            reference is taken over as well.
    """
    DictionaryObject.__init__(self)
    self.indirect_reference = data.indirect_reference

    known_attributes = (
        *FieldDictionaryAttributes.attributes(),
        *CheckboxRadioButtonAttributes.attributes(),
    )
    for attribute in known_attributes:
        try:
            self[NameObject(attribute)] = data[attribute]
        except KeyError:
            # Entries missing from the source dictionary are simply left out.
            pass

    if not isinstance(self.get("/V"), EncodedStreamObject):
        return

    # A value stored as an encoded stream is replaced by its decoded text.
    raw_value = cast(EncodedStreamObject, self[NameObject("/V")]).get_data()
    if raw_value is None:
        text = ""
    elif isinstance(raw_value, bytes):
        text = raw_value.decode()
    else:
        raise Exception("Should never happen")
    self[NameObject("/V")] = TextStringObject(text)
""" return self.get(FieldDictionaryAttributes.Ff) @property def value(self) -> Optional[Any]: """ Read-only property accessing the value of this field. Format varies based on field type. """ return self.get(FieldDictionaryAttributes.V) @property def default_value(self) -> Optional[Any]: """Read-only property accessing the default value of this field.""" return self.get(FieldDictionaryAttributes.DV) @property def additional_actions(self) -> Optional[DictionaryObject]: """ Read-only property accessing the additional actions dictionary. This dictionary defines the field's behavior in response to trigger events. See Section 8.5.2 of the PDF 1.7 reference. """ return self.get(FieldDictionaryAttributes.AA) class Destination(TreeObject): """ A class representing a destination within a PDF file. See section 12.3.2 of the PDF 2.0 reference. Args: title: Title of this destination. page: Reference to the page of this destination. Should be an instance of :class:`IndirectObject`. fit: How the destination is displayed. Raises: PdfReadError: If destination type is invalid. """ node: Optional[ DictionaryObject ] = None # node provide access to the original Object def __init__( self, title: Union[str, bytes], page: Union[NumberObject, IndirectObject, NullObject, DictionaryObject], fit: Fit, ) -> None: self._filtered_children: list[Any] = [] # used in PdfWriter typ = fit.fit_type args = fit.fit_args DictionaryObject.__init__(self) self[NameObject("/Title")] = TextStringObject(title) self[NameObject("/Page")] = page self[NameObject("/Type")] = typ # from table 8.2 of the PDF 1.7 reference. 
if typ == "/XYZ": if len(args) < 1: # left is missing : should never occur args.append(NumberObject(0.0)) if len(args) < 2: # top is missing args.append(NumberObject(0.0)) if len(args) < 3: # zoom is missing args.append(NumberObject(0.0)) ( self[NameObject(TA.LEFT)], self[NameObject(TA.TOP)], self[NameObject("/Zoom")], ) = args elif len(args) == 0: pass elif typ == TF.FIT_R: ( self[NameObject(TA.LEFT)], self[NameObject(TA.BOTTOM)], self[NameObject(TA.RIGHT)], self[NameObject(TA.TOP)], ) = args elif typ in [TF.FIT_H, TF.FIT_BH]: try: # Prefer to be more robust not only to null parameters (self[NameObject(TA.TOP)],) = args except Exception: (self[NameObject(TA.TOP)],) = (NullObject(),) elif typ in [TF.FIT_V, TF.FIT_BV]: try: # Prefer to be more robust not only to null parameters (self[NameObject(TA.LEFT)],) = args except Exception: (self[NameObject(TA.LEFT)],) = (NullObject(),) elif typ in [TF.FIT, TF.FIT_B]: pass else: raise PdfReadError(f"Unknown Destination Type: {typ!r}") @property def dest_array(self) -> "ArrayObject": return ArrayObject( [self.raw_get("/Page"), self["/Type"]] + [ self[x] for x in ["/Left", "/Bottom", "/Right", "/Top", "/Zoom"] if x in self ] ) def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: if encryption_key is not None: # deprecated deprecation_no_replacement( "the encryption_key parameter of write_to_stream", "5.0.0" ) stream.write(b"<<\n") key = NameObject("/D") key.write_to_stream(stream) stream.write(b" ") value = self.dest_array value.write_to_stream(stream) key = NameObject("/S") key.write_to_stream(stream) stream.write(b" ") value_s = NameObject("/GoTo") value_s.write_to_stream(stream) stream.write(b"\n") stream.write(b">>") @property def title(self) -> Optional[str]: """Read-only property accessing the destination title.""" return self.get("/Title") @property def page(self) -> Optional[IndirectObject]: """Read-only property accessing the IndirectObject of the destination page.""" 
@property
def color(self) -> Optional["ArrayObject"]:
    """
    Read-only property accessing the color in (R, G, B) with values 0.0-1.0.

    Falls back to (0, 0, 0) when no ``/C`` entry is present.
    """
    fallback = ArrayObject([FloatObject(0) for _ in range(3)])
    return self.get("/C", fallback)
@classmethod
def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> EmbeddedFile:
    """
    Create a new embedded file and register it with the PdfWriter.

    Args:
        writer: The PdfWriter instance receiving the new objects.
        name: The filename to display.
        content: The data in the file. Text is stored latin-1 encoded.

    Returns:
        The EmbeddedFile wrapping the newly created file specification.
    """
    payload = content.encode("latin-1") if isinstance(content, str) else content

    # The stream holding the actual file bytes.
    stream_object = DecodedStreamObject()
    stream_object.set_data(payload)
    stream_object.update({NameObject(PG.TYPE): NameObject("/EmbeddedFile")})

    # The /EF dictionary pointing at that stream.
    ef_dictionary = DictionaryObject()
    ef_dictionary.update({NameObject("/F"): writer._add_object(stream_object)})

    # The file specification; it is registered first and filled afterwards.
    from pypdf.generic import create_string_object  # noqa: PLC0415

    file_spec = DictionaryObject()
    file_spec_reference = writer._add_object(file_spec)
    display_name = cast(TextStringObject, create_string_object(name))
    file_spec.update(
        {
            NameObject(PG.TYPE): NameObject("/Filespec"),
            NameObject(FileSpecificationDictionaryEntries.F): display_name,
            NameObject(FileSpecificationDictionaryEntries.EF): ef_dictionary,
        }
    )

    # Both inserts use the same position: the reference goes in first and the
    # name is then placed in front of it, giving the [name, reference] order.
    names = cls._get_names_array(writer)
    position = cls._get_insertion_index(names, display_name)
    names.insert(position, file_spec_reference)
    names.insert(position, display_name)

    return cls(name=name, pdf_object=file_spec, parent=names)
names = ArrayObject() embedded_files_names_dictionary = DictionaryObject( {NameObject(CA.NAMES): names} ) names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary) return names # We have an existing embedded files entry. embedded_files_names_tree = cast(DictionaryObject, names_dict["/EmbeddedFiles"]) if "/Names" in embedded_files_names_tree: # Simple case: We already have a flat list. return cast(ArrayObject, embedded_files_names_tree[NameObject(CA.NAMES)]) if "/Kids" not in embedded_files_names_tree: # Invalid case: This is no name tree. raise PdfReadError("Got neither Names nor Kids in embedded files tree.") # Complex case: Convert a /Kids-based name tree to a /Names-based one. # /Name-based ones are much easier to handle and allow us to simplify the # actual insertion logic by only having to consider one case. names = ArrayObject() kids = cast(ArrayObject, embedded_files_names_tree["/Kids"].get_object()) embedded_files_names_dictionary = DictionaryObject( {NameObject(CA.NAMES): names} ) names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary) for kid in kids: # Write the flattened file entries. As we do not change the actual files, # this should not have any impact on references to them. # There might be further (nested) kids here. # Wait for an example before evaluating an implementation. 
for name in kid.get_object().get("/Names", []): names.append(name) return names @classmethod def _get_insertion_index(cls, names_array: ArrayObject, name: str) -> int: keys = [names_array[i].encode("utf-8") for i in range(0, len(names_array), 2)] name_bytes = name.encode("utf-8") start = bisect.bisect_left(keys, name_bytes) end = bisect.bisect_right(keys, name_bytes) if start != end: return end * 2 if start == 0: return 0 if start == (key_count := len(keys)): return key_count * 2 return end * 2 @property def alternative_name(self) -> str | None: """Retrieve the alternative name (file specification).""" for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]: # PDF 2.0 reference, table 43: # > A PDF reader shall use the value of the UF key, when present, instead of the F key. if key in self.pdf_object: value = self.pdf_object[key].get_object() if not is_null_or_none(value): return cast(str, value) return None @alternative_name.setter def alternative_name(self, value: TextStringObject | None) -> None: """Set the alternative name (file specification).""" if value is None: if FileSpecificationDictionaryEntries.UF in self.pdf_object: self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = NullObject() if FileSpecificationDictionaryEntries.F in self.pdf_object: self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = NullObject() else: self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = value self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = value @property def description(self) -> str | None: """Retrieve the description.""" value = self.pdf_object.get(FileSpecificationDictionaryEntries.DESC) if is_null_or_none(value): return None return value @description.setter def description(self, value: TextStringObject | None) -> None: """Set the description.""" if value is None: self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = NullObject() else: 
@property
def _embedded_file(self) -> StreamObject:
    """
    Retrieve the actual embedded file stream.

    The ``/UF`` entry of the ``/EF`` dictionary is preferred over ``/F``.

    Raises:
        PdfReadError: If ``/EF`` is missing or holds neither key.
    """
    file_spec = self.pdf_object
    if "/EF" not in file_spec:
        raise PdfReadError(f"/EF entry not found: {file_spec}")

    ef = cast(DictionaryObject, file_spec["/EF"])
    candidates = (FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F)
    present_key = next((candidate for candidate in candidates if candidate in ef), None)
    if present_key is None:
        raise PdfReadError(f"No /(U)F key found in file dictionary: {ef}")
    return cast(StreamObject, ef[present_key].get_object())
@property
def creation_date(self) -> datetime.datetime | None:
    """Retrieve the file creation datetime, parsed from ``/CreationDate``."""
    raw_date = self._params.get("/CreationDate")
    return parse_iso8824_date(raw_date)

@creation_date.setter
def creation_date(self, value: datetime.datetime | None) -> None:
    """Set the file creation datetime; ``None`` stores a null object."""
    params = self._ensure_params
    key = NameObject("/CreationDate")
    if value is not None:
        params[key] = TextStringObject(format_iso8824_date(value))
        return
    params[key] = NullObject()
def delete(self) -> None:
    """
    Delete the file from the document.

    Removes the file reference and the name stored directly in front of it
    from the parent array, then replaces ``pdf_object`` by an empty
    dictionary to invalidate this instance.

    Raises:
        PyPdfError: If there is no parent, the file is not part of the
            parent, or no name entry precedes the file in the parent.
    """
    if not self._parent:
        raise PyPdfError("Parent required to delete file from document.")
    if self.pdf_object in self._parent:
        index = self._parent.index(self.pdf_object)
    elif (
        (indirect_reference := getattr(self.pdf_object, "indirect_reference", None)) is not None
        and indirect_reference in self._parent
    ):
        index = self._parent.index(indirect_reference)
    else:
        raise PyPdfError("File not found in parent object.")
    if index == 0:
        # Without this guard, ``pop(index - 1)`` would be ``pop(-1)`` and
        # drop the unrelated last entry of the array.
        raise PyPdfError("File has no preceding name entry in parent object.")
    self._parent.pop(index)  # Reference.
    self._parent.pop(index - 1)  # Name.
    self.pdf_object = DictionaryObject()  # Invalidate.
class Fit:
    """Fit type and fit arguments describing how a destination is displayed."""

    def __init__(
        self, fit_type: str, fit_args: tuple[Union[None, float, Any], ...] = ()
    ) -> None:
        """
        Args:
            fit_type: Name of the fit type, for example ``/XYZ``.
            fit_args: Arguments of the fit type. Null-like values are stored
                as null objects, everything else as float objects.
        """
        from ._base import FloatObject, NameObject, NullObject, NumberObject  # noqa: PLC0415

        self.fit_type = NameObject(fit_type)
        converted: list[Union[NullObject, FloatObject, NumberObject]] = []
        for argument in fit_args:
            if is_null_or_none(argument):
                converted.append(NullObject())
            else:
                converted.append(FloatObject(argument))
        self.fit_args = converted

    @classmethod
    def xyz(
        cls,
        left: Optional[float] = None,
        top: Optional[float] = None,
        zoom: Optional[float] = None,
    ) -> "Fit":
        """
        Display the page with the coordinates (left, top) positioned at the
        upper-left corner of the window and the contents magnified by the
        factor zoom.

        A null value for any of the parameters specifies that the current
        value of that parameter is to be retained unchanged. A zoom value of
        0 has the same meaning as a null value.

        Args:
            left:
            top:
            zoom:

        Returns:
            The created fit object.
        """
        return Fit("/XYZ", (left, top, zoom))

    @classmethod
    def fit(cls) -> "Fit":
        """
        Display the page with its contents magnified just enough to fit the
        entire page within the window both horizontally and vertically.

        If the required horizontal and vertical magnification factors are
        different, use the smaller of the two, centering the page within the
        window in the other dimension.
        """
        return Fit("/Fit")

    @classmethod
    def fit_horizontally(cls, top: Optional[float] = None) -> "Fit":
        """
        Display the page with the vertical coordinate top positioned at the
        top edge of the window and the contents magnified just enough to fit
        the entire width of the page within the window.

        A null value for ``top`` specifies that the current value of that
        parameter is to be retained unchanged.

        Args:
            top:

        Returns:
            The created fit object.
        """
        return Fit("/FitH", (top,))

    @classmethod
    def fit_vertically(cls, left: Optional[float] = None) -> "Fit":
        """
        Create a ``/FitV`` fit with ``left`` as its only argument.

        Args:
            left:

        Returns:
            The created fit object.
        """
        return Fit("/FitV", (left,))

    @classmethod
    def fit_rectangle(
        cls,
        left: Optional[float] = None,
        bottom: Optional[float] = None,
        right: Optional[float] = None,
        top: Optional[float] = None,
    ) -> "Fit":
        """
        Display the page with its contents magnified just enough to fit the
        rectangle given by left, bottom, right, and top entirely within the
        window both horizontally and vertically.

        If the required horizontal and vertical magnification factors are
        different, use the smaller of the two, centering the rectangle within
        the window in the other dimension.

        A null value for any of the parameters may result in unpredictable
        behavior.

        Args:
            left:
            bottom:
            right:
            top:

        Returns:
            The created fit object.
        """
        return Fit("/FitR", (left, bottom, right, top))

    @classmethod
    def fit_box(cls) -> "Fit":
        """
        Display the page with its contents magnified just enough to fit its
        bounding box entirely within the window both horizontally and
        vertically.

        If the required horizontal and vertical magnification factors are
        different, use the smaller of the two, centering the bounding box
        within the window in the other dimension.
        """
        return Fit("/FitB")

    @classmethod
    def fit_box_horizontally(cls, top: Optional[float] = None) -> "Fit":
        """
        Display the page with the vertical coordinate top positioned at the
        top edge of the window and the contents magnified just enough to fit
        the entire width of its bounding box within the window.

        A null value for top specifies that the current value of that
        parameter is to be retained unchanged.

        Args:
            top:

        Returns:
            The created fit object.
        """
        return Fit("/FitBH", (top,))

    @classmethod
    def fit_box_vertically(cls, left: Optional[float] = None) -> "Fit":
        """
        Display the page with the horizontal coordinate left positioned at
        the left edge of the window and the contents magnified just enough to
        fit the entire height of its bounding box within the window.

        A null value for left specifies that the current value of that
        parameter is to be retained unchanged.

        Args:
            left:

        Returns:
            The created fit object.
        """
        return Fit("/FitBV", (left,))

    def __str__(self) -> str:
        """Return ``Fit(<type>)``, followed by the arguments when there are any."""
        arguments = f", {self.fit_args}" if self.fit_args else ""
        return f"Fit({self.fit_type}{arguments})"
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. import logging from io import BytesIO from typing import IO from .._utils import ( WHITESPACES, WHITESPACES_AS_BYTES, StreamType, logger_warning, read_non_whitespace, ) from ..errors import PdfReadError logger = logging.getLogger(__name__) # An inline image should be used only for small images (4096 bytes or less), # but allow twice this for cases where this has been exceeded. 
def _check_end_image_marker(stream: StreamType) -> bool:
    """
    Check whether the next non-whitespace token is the ``EI`` operator.

    The stream is left on the first non-whitespace byte that was read, so a
    successful check leaves it positioned on the ``EI``.

    Args:
        stream: Seekable stream positioned after the image data.

    Returns:
        True if ``EI`` follows and is itself followed by whitespace or by the
        end of the stream.
    """
    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    # Rewind by what was actually read. Fewer than three bytes come back near
    # the end of the stream, and a fixed rewind of three would then land in
    # front of the token (or before the start of the stream).
    stream.seek(-len(ei_tok), 1)
    return ei_tok[:2] == b"EI" and (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES)
data_out += data_buffered[: pos_tok + 2] stream.seek(-len(data_buffered) + pos_tok + 2, 1) break if len(data_buffered) == 2: # end of buffer data_out += data_buffered raise PdfReadError("Unexpected end of stream") data_out += data_buffered[ :-2 ] # back by one char in case of in the middle of ~> stream.seek(-2, 1) if not _check_end_image_marker(stream): raise PdfReadError("EI stream not found") return data_out def extract_inline__run_length_decode(stream: StreamType) -> bytes: """ Extract RL (RunLengthDecode) stream from inline image. The stream will be moved onto the EI. """ data_out: bytes = b"" # Read data until delimiter 128 while True: data_buffered = stream.read(BUFFER_SIZE) if not data_buffered: raise PdfReadError("Unexpected end of stream") pos_tok = data_buffered.find(b"\x80") if pos_tok >= 0: # found # Ideally, we could just use plain run-length decoding here, where 80_16 = 128_10 # marks the EOD. But there apparently are cases like in issue #3517, where we have # an inline image with up to 51 EOD markers. In these cases, be resilient here and # use the default `EI` marker detection instead. Please note that this fallback # still omits special `EI` handling within the stream, but for now assume that having # both of these cases occur at the same time is very unlikely (and the image stream # is broken anyway). # For now, do not skip over more than one whitespace character. 
def extract_inline__dct_decode(stream: StreamType) -> bytes:
    """
    Extract DCT (JPEG) stream from inline image.
    The stream will be moved onto the EI.

    The data is walked marker by marker up to and including the end marker
    ``FF D9``. Bytes in front of the first ``FF`` are dropped.

    Args:
        stream: Seekable stream positioned at the start of the image data.

    Returns:
        The image data, ending with the ``FF D9`` marker.

    Raises:
        PdfReadError: If the stream ends early, a segment declares a length
            smaller than its own length field, or no ``EI`` follows the data.
    """

    def read(length: int) -> bytes:
        # If 0 bytes are returned, and *size* was not 0, this indicates end of file.
        # If the object is in non-blocking mode and no bytes are available, `None` is returned.
        _result = stream.read(length)
        if _result is None or len(_result) != length:
            raise PdfReadError("Unexpected end of stream")
        return _result

    # Collected in a bytearray: appending byte by byte to an immutable bytes
    # object copies everything gathered so far on each step.
    data_out = bytearray()
    # Read Blocks of data (ID/Size/data) up to ID=FF/D9
    # https://www.digicamsoft.com/itu/itu-t81-36.html
    not_first = False
    while True:
        c = read(1)
        if not_first or (c == b"\xff"):
            data_out += c
        if c != b"\xff":
            continue
        not_first = True
        c = read(1)
        if c == b"\xff":
            # Another FF directly after an FF: step back and handle it as
            # the start of the next marker. It is not stored here because
            # re-reading it stores it, so each input byte is kept once.
            stream.seek(-1, 1)  # pragma: no cover
            continue
        data_out += c
        if c == b"\x00":  # stuffing
            pass
        elif c == b"\xd9":  # end
            break
        elif c in (
            b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
            b"\xda\xdb\xdc\xdd\xde\xdf"
            b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
        ):
            # These markers carry a two-byte big-endian length that includes
            # the length field itself.
            c = read(2)
            data_out += c
            sz = c[0] * 256 + c[1]
            if sz < 2:
                # A negative read size would consume the rest of the stream.
                raise PdfReadError("Invalid segment length in inline DCT image")
            data_out += read(sz - 2)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return bytes(data_out)
def is_followed_by_binary_data(stream: IO[bytes], length: int = 10) -> bool:
    """
    Check if the next bytes of the stream look like binary image data or regular page content.

    This is just some heuristics due to the PDF specification being too imprecise about
    inline images containing the `EI` marker which would end an image. Starting with PDF 2.0,
    we finally get a mandatory length field, but with (proper) PDF 2.0 support being very limited
    everywhere, we should not expect to be able to remove such hacks in the near future - especially
    considering legacy documents as well.

    The actual implementation draws some inspiration from
    https://github.com/itext/itext-java/blob/9.1.0/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java

    Args:
        stream: Stream to peek into. Its position is restored before returning.
        length: Maximum number of bytes to inspect.

    Returns:
        True if the upcoming bytes look like binary data, False if they look
        like regular content stream data or if nothing could be read.
    """
    position = stream.tell()
    data = stream.read(length)
    stream.seek(position)
    if not data:
        return False

    operator_start = None
    operator_end = None

    for index, byte in enumerate(data):
        if byte < 32 and byte not in WHITESPACES_AS_BYTES:
            # This covers all characters not being displayable directly, although omitting whitespace
            # to allow for operator detection.
            return True
        is_whitespace = byte in WHITESPACES_AS_BYTES
        if operator_start is None and not is_whitespace:
            # Interpret all other non-whitespace characters as the start of an operation.
            operator_start = index
        if operator_start is not None and is_whitespace:
            # A whitespace stops an operation.
            # Assume that having an inline image with tons of whitespace is rather unlikely.
            operator_end = index
            break

    if operator_start is None:
        # Inline images should not have tons of whitespaces, which would lead to no operator start.
        return False
    if operator_end is None:
        # We probably are inside an operation.
        # Use the number of bytes actually read: close to the end of the stream less than
        # `length` bytes are available, and measuring against `length` would report a short
        # trailing operator like `q` as being longer than three characters.
        operator_end = len(data)
    operator_length = operator_end - operator_start
    operator = data[operator_start:operator_end]
    if operator.startswith(b"/") and operator_length > 1:
        # Name object.
        return False
    if operator.replace(b".", b"").isdigit():
        # Graphics operator, for example a move. A number (integer or float).
        return False
    if operator_length > 3:  # noqa: SIM103
        # Usually, the operators inside a content stream should not have more than three characters,
        # especially after an inline image.
        return True
    return False
) if is_null_or_none(color_space): return "", False color_space_str: str = "" if isinstance(color_space, str): color_space_str = color_space elif not isinstance(color_space, list): raise PdfReadError( "Cannot interpret color space", color_space ) # pragma: no cover elif not color_space: return "", False elif color_space[0].startswith("/Cal"): # /CalRGB or /CalGray color_space_str = "/Device" + color_space[0][4:] elif color_space[0] == "/ICCBased": icc_profile = color_space[1].get_object() color_components = cast(int, icc_profile["/N"]) color_space_str = icc_profile.get("/Alternate", "") elif color_space[0] == "/Indexed": color_space_str = color_space[1].get_object() mode, invert_color = _get_image_mode( color_space_str, color_components, prev_mode, depth + 1 ) if mode in ("RGB", "CMYK"): mode = "P" return mode, invert_color elif color_space[0] == "/Separation": color_space_str = color_space[2].get_object() mode, invert_color = _get_image_mode( color_space_str, color_components, prev_mode, depth + 1 ) return mode, True elif color_space[0] == "/DeviceN": original_color_space = color_space color_components = len(color_space[1]) color_space_str = color_space[2].get_object() if color_space_str == "/DeviceCMYK" and color_components == 1: if original_color_space[1][0] != "/Black": logger_warning( f"Color {original_color_space[1][0]} converted to Gray. 
def bits2byte(data: bytes, size: tuple[int, int], bits: int) -> bytes:
    """
    Unpack tightly packed samples into one byte per sample.

    Samples are read from the most significant bits of each input byte
    downwards. Every image row starts on a fresh input byte, so unused bits
    at the end of a row are skipped.

    Args:
        data: Packed sample data.
        size: Image size as (width, height).
        bits: Number of bits per sample.

    Returns:
        ``width * height`` bytes, one per sample, in row-major order.
    """
    width = size[0]
    height = size[1]
    first_shift = 8 - bits
    sample_mask = (1 << bits) - 1

    unpacked = bytearray(width * height)
    target = 0
    source = 0
    shift = first_shift
    for _row in range(height):
        if shift != first_shift:
            # The previous row stopped in the middle of a byte: skip the padding bits.
            source += 1
            shift = first_shift
        for _column in range(width):
            unpacked[target] = (data[source] >> shift) & sample_mask
            target += 1
            shift -= bits
            if shift < 0:
                # Current input byte is used up.
                source += 1
                shift = first_shift
    return bytes(unpacked)
def __handle_flate__indexed(color_space: ArrayObject) -> tuple[Any, Any, Any, Any]:
    """
    Split an ``/Indexed`` color space array into its components.

    Args:
        color_space: The color space array, usually consisting of four
            entries: name, base, hival and lookup.

    Returns:
        Tuple of (color space name, base, hival, lookup), each resolved with
        ``get_object()``.

    Raises:
        PdfReadError: If the array has neither the regular four entries nor
            the three-entry form handled below.
    """
    count = len(color_space)
    if count == 4:
        color_space, base, hival, lookup = (value.get_object() for value in color_space)
        return color_space, base, hival, lookup

    if count == 3:
        # Deal with strange AutoDesk files where `base` and `hival` look like this:
        # /DeviceRGB\x00255
        element1 = color_space[1]
        element1 = element1 if isinstance(element1, str) else element1.get_object()
        # Only a string can carry the merged `base\x00hival` form; anything else is
        # reported through the regular error below instead of failing on `in`.
        if isinstance(element1, str) and "\x00" in element1:
            color_space, lookup = color_space[0].get_object(), color_space[2].get_object()
            base, hival = element1.split("\x00")
            hival = int(hival)
            return color_space, base, hival, lookup

    # Also reached for arrays with less than two entries, which must not be indexed.
    raise PdfReadError(f"Expected color space with 4 values, got {count}: {color_space}")
(4, "P", "CMYK"), }[_get_image_mode(base, 0, "")[0]] except KeyError: # pragma: no cover logger_warning( f"Base {base} not coded please share the pdf file with pypdf dev team", __name__, ) lookup = None else: if img.mode == "1": # Two values ("high" and "low"). expected_count = 2 * nb actual_count = len(lookup) if actual_count != expected_count: if actual_count < expected_count: logger_warning( f"Not enough lookup values: Expected {expected_count}, got {actual_count}.", __name__ ) lookup += bytes([0] * (expected_count - actual_count)) elif not check_if_whitespace_only(lookup[expected_count:]): logger_warning( f"Too many lookup values: Expected {expected_count}, got {actual_count}.", __name__ ) lookup = lookup[:expected_count] colors_arr = [lookup[:nb], lookup[nb:]] arr = b"".join( b"".join( colors_arr[1 if img.getpixel((x, y)) > 127 else 0] # type: ignore[operator,unused-ignore] # TODO: Remove unused-ignore on Python 3.10 for x in range(img.size[0]) ) for y in range(img.size[1]) ) img = Image.frombytes(mode, img.size, arr) else: img = img.convert(conv) if len(lookup) != (hival + 1) * nb: logger_warning(f"Invalid Lookup Table in {obj_as_text}", __name__) lookup = None elif mode == "L": # gray lookup does not work: it is converted to a similar RGB lookup lookup = b"".join([bytes([b, b, b]) for b in lookup]) mode = "RGB" # TODO: https://github.com/py-pdf/pypdf/pull/2039 # this is a work around until PIL is able to process CMYK images elif mode == "CMYK": _rgb = [] for _c, _m, _y, _k in ( lookup[n : n + 4] for n in range(0, 4 * (len(lookup) // 4), 4) ): _r = int(255 * (1 - _c / 255) * (1 - _k / 255)) _g = int(255 * (1 - _m / 255) * (1 - _k / 255)) _b = int(255 * (1 - _y / 255) * (1 - _k / 255)) _rgb.append(bytes((_r, _g, _b))) lookup = b"".join(_rgb) mode = "RGB" if lookup is not None: img.putpalette(lookup, rawmode=mode) img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB") elif not is_null_or_none(color_space) and color_space[0] == "/ICCBased": # 
Exclude pure black-and-white images. # TODO: The remaining code still does not look correct. Shouldn't the proper way be # to use the original image and apply the ICC transformation on it? # For now, this just loads the original image with a different color space. if mode != "1": # Table 65 - Additional Entries Specific to an ICC Profile Stream Dictionary mode2 = _get_image_mode(color_space, colors, mode)[0] if mode != mode2: img = Image.frombytes(mode, size, data) # reloaded as mode may have changed if mode == "CMYK": extension = ".tif" image_format = "TIFF" return img, image_format, extension, False def _handle_jpx( size: tuple[int, int], data: bytes, mode: mode_str_type, color_space: str, colors: int, ) -> tuple[Image.Image, str, str, bool]: """ Process image encoded as JPX/JPEG2000 Returns img, image_format, extension, inversion """ extension = ".jp2" # mime_type: "image/x-jp2" img1: Image.Image = Image.open(BytesIO(data), formats=("JPEG2000",)) mode, invert_color = _get_image_mode(color_space, colors, mode) if mode == "": mode = cast(mode_str_type, img1.mode) invert_color = mode == "CMYK" if img1.mode == "RGBA" and mode == "RGB": mode = "RGBA" # we need to convert to the good mode if img1.mode == mode or {img1.mode, mode} == {"L", "P"}: # compare (unordered) sets # L and P are indexed modes which should not be changed. 
def _apply_decode(
    img: Image.Image,
    x_object_obj: dict[str, Any],
    lfilters: FT,
    color_space: Union[str, list[Any], Any],
    invert_color: bool,
) -> Image.Image:
    """
    Apply the decode array of an image to its pixel values.

    The decode array is taken from the image dictionary if present. Otherwise
    it is derived from the image mode, the last filter and ``invert_color``.
    ``/Indexed`` color spaces never get a decode array applied, while
    ``/Separation`` color spaces always get the inverting one.

    Args:
        img: The image to process.
        x_object_obj: The image dictionary.
        lfilters: The last filter of the image stream.
        color_space: The color space of the image.
        invert_color: Whether the colors are to be inverted.

    Returns:
        ``img`` itself if there is no decode array or it is the identity
        mapping, otherwise the result of ``img.point()`` with the lookup table.
    """

    def per_band(pair: list[float]) -> list[float]:
        # Repeat one (min, max) pair for each band of the image.
        return pair * len(img.getbands())

    # CMYK image and other color spaces without decode
    # requires reverting scale (cf p243,2§ last sentence)
    if IA.DECODE in x_object_obj:
        decode = x_object_obj[IA.DECODE]
    elif img.mode == "CMYK" and lfilters == FT.JPX_DECODE:
        decode = per_band([0.0, 1.0] if invert_color else [1.0, 0.0])
    elif (img.mode == "CMYK" and lfilters == FT.DCT_DECODE) or (invert_color and img.mode == "L"):
        decode = per_band([1.0, 0.0])
    else:
        decode = None

    if isinstance(color_space, ArrayObject):
        family = color_space[0].get_object()
        if family == "/Indexed":
            decode = None  # decode is meaningless if Indexed
        if family == "/Separation":
            decode = per_band([1.0, 0.0])

    if decode is None:
        return img
    if all(decode[i] == i % 2 for i in range(len(decode))):
        # [0 1 0 1 ...] maps every value onto itself.
        return img

    lut: list[int] = []
    for start in range(0, len(decode), 2):
        dmin = decode[start]
        dmax = decode[start + 1]
        for level in range(256):
            lut.append(round(255.0 * (level / 255.0 * (dmax - dmin) + dmin)))
    return img.point(lut)
def _xobj_to_image(
    x_object: dict[str, Any],
    pillow_parameters: Union[dict[str, Any], None] = None
) -> tuple[Optional[str], bytes, Any]:
    """
    Users need to have the pillow package installed.

    It's unclear if pypdf will keep this function here, hence it's private.
    It might get removed at any point.

    Args:
        x_object: The image XObject to convert.
        pillow_parameters: parameters provided to Pillow Image.save() method.
            The passed dictionary is not modified.

    Returns:
        Tuple[file extension, bytes, PIL.Image.Image]

    Raises:
        PdfReadError: If no filter specific handling applies and no image
            mode could be determined.
    """

    def _apply_alpha(
        img: Image.Image,
        x_object: dict[str, Any],
        obj_as_text: str,
        image_format: str,
        extension: str,
    ) -> tuple[Image.Image, str, str]:
        # Returns (image, extension, image format) - mind the order.
        alpha = None
        if IA.S_MASK in x_object:  # add alpha channel
            alpha = _xobj_to_image(x_object[IA.S_MASK])[2]
            if img.size != alpha.size:
                logger_warning(
                    f"image and mask size not matching: {obj_as_text}", __name__
                )
            else:
                # TODO: implement mask
                if alpha.mode != "L":
                    alpha = alpha.convert("L")
                if img.mode == "P":
                    img = img.convert("RGB")
                elif img.mode == "1":
                    img = img.convert("L")
                img.putalpha(alpha)
            if "JPEG" in image_format:
                image_format = "JPEG2000"
                extension = ".jp2"
            else:
                image_format = "PNG"
                extension = ".png"
        return img, extension, image_format

    # For error reporting
    # NOTE(review): the `x_object is None` branch cannot succeed as it accesses an
    # attribute of None - presumably a different condition was intended; left as is.
    obj_as_text = (
        x_object.indirect_reference.__repr__()
        if x_object is None  # pragma: no cover
        else x_object.__repr__()
    )

    # Get size and data
    size = (cast(int, x_object[IA.WIDTH]), cast(int, x_object[IA.HEIGHT]))
    data = x_object.get_data()  # type: ignore
    if isinstance(data, str):  # pragma: no cover
        data = data.encode()
    if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A:  # ie. '\n'
        data = data[:-1]

    # Get color properties
    colors = x_object.get("/Colors", 1)
    color_space: Any = x_object.get("/ColorSpace", NullObject()).get_object()
    if isinstance(color_space, list) and len(color_space) == 1:
        color_space = color_space[0].get_object()

    mode, invert_color = _get_mode_and_invert_color(x_object, colors, color_space)

    # Get filters
    filters = x_object.get(StreamAttributes.FILTER, NullObject()).get_object()
    lfilters = filters[-1] if isinstance(filters, list) else filters

    extension = None
    if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
        img, image_format, extension, _ = _handle_flate(
            size,
            data,
            mode,
            color_space,
            colors,
            obj_as_text,
        )
    elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE):
        # I'm not sure if the following logic is correct.
        # There might not be any relationship between the filters and the
        # extension
        if lfilters == FT.LZW_DECODE:
            image_format = "TIFF"
            extension = ".tiff"  # mime_type = "image/tiff"
        else:
            image_format = "PNG"
            extension = ".png"  # mime_type = "image/png"
        try:
            img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
        except UnidentifiedImageError:
            img = _extended_image_from_bytes(mode, size, data)
    elif lfilters == FT.DCT_DECODE:
        img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
        # invert_color kept unchanged
    elif lfilters == FT.JPX_DECODE:
        img, image_format, extension, invert_color = _handle_jpx(
            size, data, mode, color_space, colors
        )
    elif lfilters == FT.CCITT_FAX_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("TIFF",)),
            "TIFF",
            ".tiff",
            False,
        )
    elif lfilters == FT.JBIG2_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("PNG", "PPM")),
            "PNG",
            ".png",
            False,
        )
    elif mode == "CMYK":
        img, image_format, extension, invert_color = (
            _extended_image_from_bytes(mode, size, data),
            "TIFF",
            ".tif",
            False,
        )
    elif mode == "":
        raise PdfReadError(f"ColorSpace field not found in {x_object}")
    else:
        img, image_format, extension, invert_color = (
            _extended_image_from_bytes(mode, size, data),
            "PNG",
            ".png",
            False,
        )

    img = _apply_decode(img, x_object, lfilters, color_space, invert_color)
    img, extension, image_format = _apply_alpha(
        img, x_object, obj_as_text, image_format, extension
    )

    # Work on a copy: the "quality" default below must not leak into the
    # dictionary owned by the caller, which might be reused for other images.
    save_parameters: dict[str, Any] = dict(pillow_parameters) if pillow_parameters else {}
    # Preserve JPEG image quality - see issue #3515.
    if image_format == "JPEG":
        # This prevents: Cannot use 'keep' when original image is not a JPEG:
        # "JPEG" is the value of PIL.JpegImagePlugin.JpegImageFile.format
        img.format = "JPEG"
        if "quality" not in save_parameters:
            save_parameters["quality"] = "keep"

    # Save image to bytes
    img_byte_arr = BytesIO()
    try:
        img.save(img_byte_arr, format=image_format, **save_parameters)
    except OSError:  # pragma: no cover  # covered with pillow 10.3
        # in case of we convert to RGBA and then to PNG
        img1 = img.convert("RGBA")
        image_format = "PNG"
        extension = ".png"
        img_byte_arr = BytesIO()
        img1.save(img_byte_arr, format=image_format)
    data = img_byte_arr.getvalue()

    try:  # temporary try/except until other fixes of images
        img = Image.open(BytesIO(data))
    except Exception as exception:
        logger_warning(f"Failed loading image: {exception}", __name__)
        img = None  # type: ignore[assignment,unused-ignore]  # TODO: Remove unused-ignore on Python 3.10
    return extension, data, img
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # This module contains code used by _writer.py to track links in pages # being added to the writer until the links can be resolved. from typing import TYPE_CHECKING, Optional, Union, cast from .._utils import logger_warning from . 
class NamedReferenceLink:
    """Named reference link being preserved until we can resolve it correctly."""

    def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None:
        """
        Args:
            reference: TextStringObject with named reference
            source_pdf: The reader whose named destinations are looked up.
        """
        self._source_pdf = source_pdf
        self._reference = reference

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        """Return the page of the matching named destination of the source PDF, if any."""
        name = str(self._reference)
        destination = self._source_pdf.named_destinations.get(name)
        if not destination:
            return None
        return destination.page

    def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
        """
        Args:
            target_pdf: PdfWriter which the new link went into
            new_page: The page the named destination should point to.
        """
        name = str(self._reference)
        if name in target_pdf.named_destinations:
            # The target already knows this name, keep it.
            return
        # point named destination in new PDF to the new page
        target_pdf.add_named_destination(name, new_page.page_number)
def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]:
    """
    Create the reference link for one annotation of a page.

    Args:
        indirect_object: The annotation to inspect.
        page: The page the annotation belongs to; its ``pdf`` is handed on
            as the source PDF of the link.

    Returns:
        The link for ``/Link`` annotations having either a ``/GoTo`` action
        with a destination or a ``/Dest`` entry, otherwise None.
    """
    src = cast("PdfReader", page.pdf)
    link = cast(DictionaryObject, indirect_object.get_object())
    if (not isinstance(link, DictionaryObject)) or link.get("/Subtype") != "/Link":
        return None

    action = link["/A"] if "/A" in link else None
    # An `/A` entry which is no dictionary (like a null object) cannot describe an
    # action. Ignore it instead of failing on it, which still allows using `/Dest`.
    if isinstance(action, DictionaryObject):
        if action.get("/S") != "/GoTo":
            return None

        if "/D" not in action:
            return None
        return _create_link(action["/D"], src)

    if "/Dest" in link:
        return _create_link(link["/Dest"], src)

    return None  # Nothing to do here
class OutlineItem(Destination):
    """Destination which writes itself as an outline item dictionary."""

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the outline item to the stream.

        The entries ``/Title``, ``/Parent``, ``/First``, ``/Last``, ``/Next``
        and ``/Prev`` are written if present, always followed by ``/Dest``.

        Args:
            stream: The stream to write to.
            encryption_key: Deprecated. Passing anything but None triggers
                the deprecation handling.
        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )

        optional_entries = ("/Title", "/Parent", "/First", "/Last", "/Next", "/Prev")
        present = [NameObject(name) for name in optional_entries if name in self]

        stream.write(b"<<\n")
        for entry in present:
            entry.write_to_stream(stream)
            stream.write(b" ")
            self.raw_get(entry).write_to_stream(stream)
            stream.write(b"\n")

        # The destination is always written.
        NameObject("/Dest").write_to_stream(stream)
        stream.write(b" ")
        self.dest_array.write_to_stream(stream)
        stream.write(b"\n")
        stream.write(b">>")
These boxes include: * :attr:`artbox ` * :attr:`bleedbox ` * :attr:`cropbox ` * :attr:`mediabox ` * :attr:`trimbox ` """ def __init__( self, arr: Union["RectangleObject", tuple[float, float, float, float]] ) -> None: # must have four points assert len(arr) == 4 # automatically convert arr[x] into NumberObject(arr[x]) if necessary ArrayObject.__init__(self, [self._ensure_is_number(x) for x in arr]) def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]: if not isinstance(value, (FloatObject, NumberObject)): value = FloatObject(value) return value def scale(self, sx: float, sy: float) -> "RectangleObject": return RectangleObject( ( float(self.left) * sx, float(self.bottom) * sy, float(self.right) * sx, float(self.top) * sy, ) ) def __repr__(self) -> str: return f"RectangleObject({list(self)!r})" @property def left(self) -> FloatObject: return self[0] @left.setter def left(self, f: float) -> None: self[0] = FloatObject(f) @property def bottom(self) -> FloatObject: return self[1] @bottom.setter def bottom(self, f: float) -> None: self[1] = FloatObject(f) @property def right(self) -> FloatObject: return self[2] @right.setter def right(self, f: float) -> None: self[2] = FloatObject(f) @property def top(self) -> FloatObject: return self[3] @top.setter def top(self, f: float) -> None: self[3] = FloatObject(f) @property def lower_left(self) -> tuple[float, float]: """ Property to read and modify the lower left coordinate of this box in (x,y) form. """ return self.left, self.bottom @lower_left.setter def lower_left(self, value: tuple[float, float]) -> None: self[0], self[1] = (self._ensure_is_number(x) for x in value) @property def lower_right(self) -> tuple[float, float]: """ Property to read and modify the lower right coordinate of this box in (x,y) form. 
def hex_to_rgb(value: str) -> tuple[float, float, float]:
    """
    Convert a hexadecimal color to its RGB components.

    Args:
        value: The color as ``rrggbb``; leading ``#`` characters are ignored.

    Returns:
        Tuple (red, green, blue) with each component scaled to 0.0 - 1.0.

    Raises:
        ValueError: If one of the three digit pairs is no hexadecimal number.
    """
    digits = value.lstrip("#")
    red, green, blue = (
        int(digits[start : start + 2], 16) / 255.0 for start in range(0, 6, 2)
    )
    return red, green, blue
def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a parenthesised (literal) string from ``stream``.

    The first byte is consumed without being inspected; the nesting counter
    starts at 1, so the stream is presumably positioned on the opening
    ``(`` -- verify against callers. Reading stops at the ``)`` that brings
    the nesting counter back to zero.

    Args:
        stream: Seekable binary stream to read from.
        forced_encoding: Passed through unchanged to
            ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the closing parenthesis.
    """
    tok = stream.read(1)
    parens = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            # Nested parentheses stay part of the string content.
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                break
        elif tok == b"\\":
            tok = stream.read(1)
            try:
                # Known single-character escape: store its byte value and
                # skip the generic append at the bottom of the loop.
                txt.append(__ESCAPE_DICT__[tok])
                continue
            except KeyError:
                if b"0" <= tok <= b"7":
                    # "The number ddd may consist of one, two, or three
                    # octal digits; high-order overflow shall be ignored.
                    # Three octal digits shall be used, with leading zeros
                    # as needed, if the next character of the string is also
                    # a digit." (PDF reference 7.3.4.2, p 16)
                    # Remember where the first octal digit sits so that it
                    # can be re-read below.
                    sav = stream.tell() - 1
                    for _ in range(2):
                        ntok = stream.read(1)
                        if b"0" <= ntok <= b"7":
                            tok += ntok
                        else:
                            stream.seek(-1, 1)  # ntok has to be analyzed
                            break
                    i = int(tok, base=8)
                    if i > 255:
                        # Value does not fit into one byte: keep the
                        # backslash and rewind so the digits are read again
                        # as ordinary characters.
                        # NOTE(review): the quoted specification text says
                        # overflow "shall be ignored"; confirm that keeping
                        # the sequence literally is intended.
                        txt.append(__BACKSLASH_CODE__)
                        stream.seek(sav)
                    else:
                        txt.append(i)
                    continue
                if tok in b"\n\r":
                    # This case is hit when a backslash followed by a line
                    # break occurs. If it's a multi-char EOL, consume the
                    # second character:
                    tok = stream.read(1)
                    if tok not in b"\n\r":
                        stream.seek(-1, 1)
                    # Then don't add anything to the actual string, since this
                    # line break was escaped:
                    continue
                # Unknown escape: warn, then keep both the backslash and the
                # escaped character (appended below).
                msg = f"Unexpected escaped string: {tok.decode('utf-8', 'ignore')}"
                logger_warning(msg, __name__)
                txt.append(__BACKSLASH_CODE__)
        txt.append(ord(tok))
    return create_string_object(bytes(txt), forced_encoding)
def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode bytes through the ``_pdfdoc_encoding`` translation table.

    Args:
        byte_array: The raw bytes to decode.

    Returns:
        The decoded text, one character per input byte.

    Raises:
        UnicodeDecodeError: If the table entry for a byte is ``"\\u0000"``,
            which this function treats as "no translation available". The
            exception carries the input data and the position of the
            offending byte.
    """
    chars = []
    for position, byte in enumerate(byte_array):
        char = _pdfdoc_encoding[byte]
        if char == "\u0000":
            # Report the real input and offset. ``bytearray(<int>)`` would
            # allocate that many zero bytes instead of wrapping the byte.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                position,
                position + 1,
                "does not exist in translation table",
            )
        chars.append(char)
    # Join once instead of growing a string character by character.
    return "".join(chars)
class ViewerPreferences(DictionaryObject):
    """
    Dictionary of viewer preferences with attribute-style accessors.

    Every public accessor (``hide_toolbar``, ``direction``, ``num_copies``,
    ...) is a ``property`` created in ``__new__`` that reads or writes one
    key of this dictionary through the ``_get_*`` / ``_set_*`` helpers.
    """

    def __init__(self, obj: Optional[DictionaryObject] = None) -> None:
        """
        Initialize the preferences, optionally copying an existing dictionary.

        Args:
            obj: Dictionary whose items are copied into this object. Skipped
                when ``is_null_or_none(obj)`` is true.
        """
        # NOTE(review): the instance is passed to the parent initializer
        # while it is still empty -- presumably a no-op; confirm.
        super().__init__(self)
        if not is_null_or_none(obj):
            self.update(obj.items())  # type: ignore
        # Take over the indirect reference when ``obj`` exposes one; objects
        # without that attribute (for example ``None``) are silently accepted.
        try:
            self.indirect_reference = obj.indirect_reference  # type: ignore
        except AttributeError:
            pass

    def _get_bool(self, key: str, default: Optional[BooleanObject]) -> Optional[BooleanObject]:
        """Return the entry stored under ``key``, or ``default`` if absent."""
        return self.get(key, default)

    def _set_bool(self, key: str, v: bool) -> None:
        """Store a boolean under ``key``."""
        # Only the literal ``True`` is stored as true; every other value,
        # including truthy non-bool values, is stored as false.
        self[NameObject(key)] = BooleanObject(v is True)

    def _get_name(self, key: str, default: Optional[NameObject]) -> Optional[NameObject]:
        """Return the entry stored under ``key``, or ``default`` if absent."""
        return self.get(key, default)

    def _set_name(self, key: str, lst: list[str], v: NameObject) -> None:
        """
        Store a name under ``key`` after validating it.

        Args:
            key: Dictionary key to write.
            lst: Acceptable values; an empty list disables this check.
            v: The name to store; must start with ``/``.

        Raises:
            ValueError: If ``v`` does not start with ``/`` or is not in a
                non-empty ``lst``.
        """
        if v[0] != "/":
            raise ValueError(f"{v} does not start with '/'")
        if lst != [] and v not in lst:
            raise ValueError(f"{v} is an unacceptable value")
        self[NameObject(key)] = NameObject(v)

    def _get_arr(self, key: str, default: Optional[list[Any]]) -> Optional[ArrayObject]:
        """Return the entry under ``key``; a non-None ``default`` is wrapped in a new ``ArrayObject``."""
        return self.get(key, None if default is None else ArrayObject(default))

    def _set_arr(self, key: str, v: Optional[ArrayObject]) -> None:
        """
        Store an array under ``key``, or remove the key when ``v`` is None.

        Raises:
            ValueError: If ``v`` is neither None nor an ``ArrayObject``.
        """
        if v is None:
            # Removing a key that is not present is not an error.
            try:
                del self[NameObject(key)]
            except KeyError:
                pass
            return
        if not isinstance(v, ArrayObject):
            raise ValueError("ArrayObject is expected")
        self[NameObject(key)] = v

    def _get_int(self, key: str, default: Optional[NumberObject]) -> Optional[NumberObject]:
        """Return the entry stored under ``key``, or ``default`` if absent."""
        return self.get(key, default)

    def _set_int(self, key: str, v: int) -> None:
        """Store ``v`` under ``key`` as a ``NumberObject``."""
        self[NameObject(key)] = NumberObject(v)

    @property
    def PRINT_SCALING(self) -> NameObject:
        """The ``/PrintScaling`` key as a ``NameObject``."""
        return NameObject("/PrintScaling")

    def __new__(cls: Any, value: Any = None) -> "ViewerPreferences":  # noqa: PYI034
        """
        Create the instance and (re)install the accessor properties on ``cls``.

        The properties are assigned to the class on every instantiation;
        ``value`` is not used in this method.
        """

        # Each factory closes over ``key``/``default`` (and ``lst``) so one
        # helper pair can serve many dictionary keys.
        def _add_prop_bool(key: str, default: Optional[BooleanObject]) -> property:
            return property(
                lambda self: self._get_bool(key, default),
                lambda self, v: self._set_bool(key, v),
                None,
                f""" Returns/Modify the status of {key}, Returns {default} if not defined """,
            )

        def _add_prop_name(
            key: str, lst: list[str], default: Optional[NameObject]
        ) -> property:
            return property(
                lambda self: self._get_name(key, default),
                lambda self, v: self._set_name(key, lst, v),
                None,
                f""" Returns/Modify the status of {key}, Returns {default} if not defined. Acceptable values: {lst} """,
            )

        def _add_prop_arr(key: str, default: Optional[ArrayObject]) -> property:
            return property(
                lambda self: self._get_arr(key, default),
                lambda self, v: self._set_arr(key, v),
                None,
                f""" Returns/Modify the status of {key}, Returns {default} if not defined """,
            )

        def _add_prop_int(key: str, default: Optional[int]) -> property:
            return property(
                lambda self: self._get_int(key, default),
                lambda self, v: self._set_int(key, v),
                None,
                f""" Returns/Modify the status of {key}, Returns {default} if not defined """,
            )

        # Boolean preferences; all default to the shared false object.
        cls.hide_toolbar = _add_prop_bool("/HideToolbar", f_obj)
        cls.hide_menubar = _add_prop_bool("/HideMenubar", f_obj)
        cls.hide_windowui = _add_prop_bool("/HideWindowUI", f_obj)
        cls.fit_window = _add_prop_bool("/FitWindow", f_obj)
        cls.center_window = _add_prop_bool("/CenterWindow", f_obj)
        cls.display_doctitle = _add_prop_bool("/DisplayDocTitle", f_obj)

        # Name preferences; an empty list means any name starting with "/"
        # is accepted by ``_set_name``.
        cls.non_fullscreen_pagemode = _add_prop_name(
            "/NonFullScreenPageMode",
            ["/UseNone", "/UseOutlines", "/UseThumbs", "/UseOC"],
            NameObject("/UseNone"),
        )
        cls.direction = _add_prop_name(
            "/Direction", ["/L2R", "/R2L"], NameObject("/L2R")
        )
        cls.view_area = _add_prop_name("/ViewArea", [], None)
        cls.view_clip = _add_prop_name("/ViewClip", [], None)
        cls.print_area = _add_prop_name("/PrintArea", [], None)
        cls.print_clip = _add_prop_name("/PrintClip", [], None)
        cls.print_scaling = _add_prop_name("/PrintScaling", [], None)
        cls.duplex = _add_prop_name(
            "/Duplex", ["/Simplex", "/DuplexFlipShortEdge", "/DuplexFlipLongEdge"], None
        )
        cls.pick_tray_by_pdfsize = _add_prop_bool("/PickTrayByPDFSize", None)
        cls.print_pagerange = _add_prop_arr("/PrintPageRange", None)
        cls.num_copies = _add_prop_int("/NumCopies", None)

        cls.enforce = _add_prop_arr("/Enforce", ArrayObject())

        return DictionaryObject.__new__(cls)
""" def __init__(self, arg: Union[slice, "PageRange", str]) -> None: """ Initialize with either a slice -- giving the equivalent page range, or a PageRange object -- making a copy, or a string like "int", "[int]:[int]" or "[int]:[int]:[int]", where the brackets indicate optional ints. Remember, page indices start with zero. Page range expression examples: : all pages. -1 last page. 22 just the 23rd page. :-1 all but the last page. 0:3 the first three pages. -2 second-to-last page. :3 the first three pages. -2: last two pages. 5: from the sixth page onward. -3:-1 third & second to last. The third, "stride" or "step" number is also recognized. ::2 0 2 4 ... to the end. 3:0:-1 3 2 1 but not 0. 1:10:2 1 3 5 7 9 2::-1 2 1 0. ::-1 all pages in reverse order. Note the difference between this notation and arguments to slice(): slice(3) means the first three pages; PageRange("3") means the range of only the fourth page. However PageRange(slice(3)) means the first three pages. """ if isinstance(arg, slice): self._slice = arg return if isinstance(arg, PageRange): self._slice = arg.to_slice() return m = isinstance(arg, str) and re.match(PAGE_RANGE_RE, arg) if not m: raise ParseError(arg) if m.group(2): # Special case: just an int means a range of one page. start = int(m.group(2)) stop = start + 1 if start != -1 else None self._slice = slice(start, stop) else: self._slice = slice(*[int(g) if g else None for g in m.group(4, 6, 8)]) @staticmethod def valid(input: Any) -> bool: """ True if input is a valid initializer for a PageRange. Args: input: A possible PageRange string or a PageRange object. Returns: True, if the ``input`` is a valid PageRange. 
""" return isinstance(input, (slice, PageRange)) or ( isinstance(input, str) and bool(re.match(PAGE_RANGE_RE, input)) ) def to_slice(self) -> slice: """Return the slice equivalent of this page range.""" return self._slice def __str__(self) -> str: """A string like "1:2:3".""" s = self._slice indices: Union[tuple[int, int], tuple[int, int, int]] if s.step is None: if s.start is not None and s.stop == s.start + 1: return str(s.start) indices = s.start, s.stop else: indices = s.start, s.stop, s.step return ":".join("" if i is None else str(i) for i in indices) def __repr__(self) -> str: """A string like "PageRange('1:2:3')".""" return "PageRange(" + repr(str(self)) + ")" def indices(self, n: int) -> tuple[int, int, int]: """ Assuming a sequence of length n, calculate the start and stop indices, and the stride length of the PageRange. See help(slice.indices). Args: n: the length of the list of pages to choose from. Returns: Arguments for range(). """ return self._slice.indices(n) def __eq__(self, other: object) -> bool: if not isinstance(other, PageRange): return False return self._slice == other._slice def __hash__(self) -> int: return hash((self.__class__, (self._slice.start, self._slice.stop, self._slice.step))) def __add__(self, other: "PageRange") -> "PageRange": if not isinstance(other, PageRange): raise TypeError(f"Can't add PageRange and {type(other)}") if self._slice.step is not None or other._slice.step is not None: raise ValueError("Can't add PageRange with stride") a = self._slice.start, self._slice.stop b = other._slice.start, other._slice.stop if a[0] > b[0]: a, b = b, a # Now a[0] is the smallest if b[0] > a[1]: # There is a gap between a and b. raise ValueError("Can't add PageRanges with gap") return PageRange(slice(a[0], max(a[1], b[1]))) PAGE_RANGE_ALL = PageRange(":") # The range of all pages. 
class PaperSize:
    """(width, height) of the paper in portrait mode in pixels at 72 ppi."""

    # Notes of how to calculate it:
    # 1. Get the size of the paper in millimeters
    # 2. Convert it to inches (25.4 millimeters is equal to 1 inch)
    # 3. Convert it to pixels at 72dpi (1 inch is equal to 72 pixels)

    # All Din-A paper sizes follow this pattern:
    # 2 x A(n) = A(n - 1)
    # So the width of the next bigger one is the height of the smaller one
    # (for example A3 is 842 wide and A4 is 842 high).
    # The ratio is always approximately 1:2**0.5
    # Additionally, A0 is defined to have an area of 1 m**2
    # https://en.wikipedia.org/wiki/ISO_216
    # Be aware of rounding issues!
    A0 = Dimensions(2384, 3370)  # 841mm x 1189mm
    A1 = Dimensions(1684, 2384)
    A2 = Dimensions(1191, 1684)
    A3 = Dimensions(842, 1191)
    A4 = Dimensions(
        595, 842
    )  # Printer paper, documents - this is by far the most common
    A5 = Dimensions(420, 595)  # Paperback books
    A6 = Dimensions(298, 420)  # Postcards
    A7 = Dimensions(210, 298)
    A8 = Dimensions(147, 210)

    # Envelopes
    C4 = Dimensions(649, 918)
following are not yet supported by Sphinx: # OutlineType = List[Union[Destination, "OutlineType"]] # Hence use this for the moment: OutlineType = list[Union[Destination, list[Union[Destination, list[Destination]]]]] LayoutType: TypeAlias = Literal[ "/NoLayout", "/SinglePage", "/OneColumn", "/TwoColumnLeft", "/TwoColumnRight", "/TwoPageLeft", "/TwoPageRight", ] PagemodeType: TypeAlias = Literal[ "/UseNone", "/UseOutlines", "/UseThumbs", "/FullScreen", "/UseOC", "/UseAttachments", ] AnnotationSubtype: TypeAlias = Literal[ "/Text", "/Link", "/FreeText", "/Line", "/Square", "/Circle", "/Polygon", "/PolyLine", "/Highlight", "/Underline", "/Squiggly", "/StrikeOut", "/Caret", "/Stamp", "/Ink", "/Popup", "/FileAttachment", "/Sound", "/Movie", "/Screen", "/Widget", "/PrinterMark", "/TrapNet", "/Watermark", "/3D", "/Redact", "/Projection", "/RichMedia", ] ================================================ FILE: pypdf/xmp.py ================================================ """ Anything related to Extensible Metadata Platform (XMP) metadata. https://en.wikipedia.org/wiki/Extensible_Metadata_Platform """ import datetime import decimal import re from collections.abc import Iterator from typing import ( Any, Callable, Optional, TypeVar, Union, ) from xml.dom.minidom import Document, parseString from xml.dom.minidom import Element as XmlElement from xml.parsers.expat import ExpatError from ._protocols import XmpInformationProtocol from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement from .errors import PdfReadError, XmpDocumentError from .generic import ContentStream, PdfObject RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" DC_NAMESPACE = "http://purl.org/dc/elements/1.1/" XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/" PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/" XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/" # What is the PDFX namespace, you might ask? 
# Date pattern used by ``_converter_date``: a year, optionally followed by
# month, day and a time of day that must end in a time zone designator
# ("Z" or "+hh:mm" / "-hh:mm"). The group names are the ones the converter
# reads.
iso8601 = re.compile(
    """
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """,
    re.VERBOSE,
)


def _converter_date(value: str) -> datetime.datetime:
    """
    Parse a date string matched by ``iso8601`` into a naive UTC datetime.

    Missing month and day default to 1; missing time fields default to 0.
    A numeric time zone offset is removed so that the result is in UTC.

    Args:
        value: The date text, for example ``"2020-01-02T03:04:05+01:00"``.

    Returns:
        A ``datetime.datetime`` without tzinfo, expressed in UTC.

    Raises:
        ValueError: If ``value`` does not start with a date in the expected
            format.
    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")
    # Decimal keeps the fractional seconds exact while they are split into
    # whole seconds and microseconds.
    second = decimal.Decimal(matches.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)
    dt = datetime.datetime(
        year, month, day, hour, minute, int(whole_seconds), microseconds
    )
    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Take the direction from the sign character itself: the hour field
        # alone cannot tell "+00:30" from "-00:30".
        offset_hours, offset_minutes = (int(part) for part in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=offset_hours, minutes=offset_minutes)
        # Local time ahead of UTC ("+") must be moved back, and vice versa.
        dt = dt - offset if tzd[0] == "+" else dt + offset
    return dt
def __init__(self, stream: ContentStream) -> None:
    """
    Parse the XMP packet held by ``stream``.

    Args:
        stream: Object whose ``get_data()`` returns the XML to parse.

    Raises:
        PdfReadError: If the data cannot be read from ``stream`` or is not
            well-formed XML; the original error is attached as the cause.
    """
    self.stream = stream
    try:
        data = self.stream.get_data()
        doc_root: Document = parseString(data)  # noqa: S318
    except (AttributeError, ExpatError) as e:
        # Chain explicitly so the parser error stays visible as the cause.
        raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
    # NOTE(review): a document without an rdf:RDF element makes this lookup
    # raise IndexError -- confirm whether that should be a PdfReadError too.
    self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS(
        RDF_NAMESPACE, "RDF"
    )[0]
    # Parsed values, stored per namespace and then per property name.
    self.cache: dict[Any, Any] = {}
def _get_single_value(
    self,
    namespace: str,
    name: str,
    converter: Callable[[str], Any] = _identity,
) -> Optional[Any]:
    """
    Read a simple property, using the cache when it holds a truthy value.

    Only the first node yielded by ``get_element`` is considered. Attribute
    nodes contribute their node value, other nodes their text content.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Applied to the raw text when a value was found.

    Returns:
        The converted value, or None if the property is absent.
    """
    remembered = self.cache.get(namespace, {}).get(name)
    if remembered:
        return remembered

    node = next(iter(self.get_element("", namespace, name)), None)
    if node is None:
        result = None
    elif node.nodeType == node.ATTRIBUTE_NODE:
        result = node.nodeValue
    else:
        result = self._get_text(node)
    if result is not None:
        result = converter(result)

    self.cache.setdefault(namespace, {})[name] = result
    return result
def _get_langalt_values(self, namespace: str, name: str) -> Optional[dict[Any, Any]]:
    """
    Read a language-alternative property as a mapping of language to text.

    Entries of every ``rdf:Alt`` container are keyed by their ``xml:lang``
    attribute. An element without such a container contributes its own
    text under the key ``"x-default"``. A truthy cached result is reused.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.

    Returns:
        The mapping collected from all matching elements (possibly empty).
    """
    remembered = self.cache.get(namespace, {}).get(name)
    if remembered:
        return remembered

    by_language: dict[Any, Any] = {}
    for node in self.get_element("", namespace, name):
        alternatives = node.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
        if len(alternatives) == 0:
            by_language["x-default"] = self._get_text(node)
            continue
        for alternative in alternatives:
            for entry in alternative.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                by_language[entry.getAttribute("xml:lang")] = self._get_text(entry)

    self.cache.setdefault(namespace, {})[name] = by_language
    return by_language
@dc_date.setter
def dc_date(self, values: Optional[list[Union[str, datetime.datetime]]]) -> None:
    # None is handed on unchanged. Otherwise datetimes are formatted as UTC
    # text and every other entry is converted with str().
    serialised = None
    if values is not None:
        serialised = [
            _format_datetime_utc(entry)
            if isinstance(entry, datetime.datetime)
            else str(entry)
            for entry in values
        ]
    self._set_seq_values(DC_NAMESPACE, "date", serialised)
@property
def dc_relation(self) -> Optional[list[str]]:
    """An unordered array of text descriptions of relationships to other documents."""
    # Read through the shared bag getter for the Dublin Core "relation" entry.
    return self._getter_bag(DC_NAMESPACE, "relation")

@dc_relation.setter
def dc_relation(self, values: Optional[list[str]]) -> None:
    # Writes the same "relation" entry the getter reads; ``values`` is
    # forwarded unchanged to the bag writer.
    # NOTE(review): presumably None removes the entry -- confirm in
    # ``_set_bag_values``.
    self._set_bag_values(DC_NAMESPACE, "relation", values)
unformatted text string representing document keywords.""" return self._get_single_value(PDF_NAMESPACE, "Keywords") @pdf_keywords.setter def pdf_keywords(self, value: Optional[str]) -> None: self._set_single_value(PDF_NAMESPACE, "Keywords", value) @property def pdf_pdfversion(self) -> Optional[str]: """The PDF file version, for example 1.0 or 1.3.""" return self._get_single_value(PDF_NAMESPACE, "PDFVersion") @pdf_pdfversion.setter def pdf_pdfversion(self, value: Optional[str]) -> None: self._set_single_value(PDF_NAMESPACE, "PDFVersion", value) @property def pdf_producer(self) -> Optional[str]: """The name of the tool that saved the document as a PDF.""" return self._get_single_value(PDF_NAMESPACE, "Producer") @pdf_producer.setter def pdf_producer(self, value: Optional[str]) -> None: self._set_single_value(PDF_NAMESPACE, "Producer", value) @property def xmp_create_date(self) -> Optional[datetime.datetime]: """The date and time the resource was originally created. Returned as a UTC datetime object.""" return self._get_single_value(XMP_NAMESPACE, "CreateDate", _converter_date) @xmp_create_date.setter def xmp_create_date(self, value: Optional[datetime.datetime]) -> None: if value: date_str = _format_datetime_utc(value) self._set_single_value(XMP_NAMESPACE, "CreateDate", date_str) else: self._set_single_value(XMP_NAMESPACE, "CreateDate", None) @property def xmp_modify_date(self) -> Optional[datetime.datetime]: """The date and time the resource was last modified. 
Returned as a UTC datetime object.""" return self._get_single_value(XMP_NAMESPACE, "ModifyDate", _converter_date) @xmp_modify_date.setter def xmp_modify_date(self, value: Optional[datetime.datetime]) -> None: if value: date_str = _format_datetime_utc(value) self._set_single_value(XMP_NAMESPACE, "ModifyDate", date_str) else: self._set_single_value(XMP_NAMESPACE, "ModifyDate", None) @property def xmp_metadata_date(self) -> Optional[datetime.datetime]: """The date and time that any metadata for this resource was last changed. Returned as a UTC datetime object.""" return self._get_single_value(XMP_NAMESPACE, "MetadataDate", _converter_date) @xmp_metadata_date.setter def xmp_metadata_date(self, value: Optional[datetime.datetime]) -> None: if value: date_str = _format_datetime_utc(value) self._set_single_value(XMP_NAMESPACE, "MetadataDate", date_str) else: self._set_single_value(XMP_NAMESPACE, "MetadataDate", None) @property def xmp_creator_tool(self) -> Optional[str]: """The name of the first known tool used to create the resource.""" return self._get_single_value(XMP_NAMESPACE, "CreatorTool") @xmp_creator_tool.setter def xmp_creator_tool(self, value: Optional[str]) -> None: self._set_single_value(XMP_NAMESPACE, "CreatorTool", value) @property def xmpmm_document_id(self) -> Optional[str]: """The common identifier for all versions and renditions of this resource.""" return self._get_single_value(XMPMM_NAMESPACE, "DocumentID") @xmpmm_document_id.setter def xmpmm_document_id(self, value: Optional[str]) -> None: self._set_single_value(XMPMM_NAMESPACE, "DocumentID", value) @property def xmpmm_instance_id(self) -> Optional[str]: """An identifier for a specific incarnation of a document, updated each time a file is saved.""" return self._get_single_value(XMPMM_NAMESPACE, "InstanceID") @xmpmm_instance_id.setter def xmpmm_instance_id(self, value: Optional[str]) -> None: self._set_single_value(XMPMM_NAMESPACE, "InstanceID", value) @property def pdfaid_part(self) -> Optional[str]: 
"""The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3).""" return self._get_single_value(PDFAID_NAMESPACE, "part") @pdfaid_part.setter def pdfaid_part(self, value: Optional[str]) -> None: self._set_single_value(PDFAID_NAMESPACE, "part", value) @property def pdfaid_conformance(self) -> Optional[str]: """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U').""" return self._get_single_value(PDFAID_NAMESPACE, "conformance") @pdfaid_conformance.setter def pdfaid_conformance(self, value: Optional[str]) -> None: self._set_single_value(PDFAID_NAMESPACE, "conformance", value) @property def custom_properties(self) -> dict[Any, Any]: """ Retrieve custom metadata properties defined in the undocumented pdfx metadata schema. Returns: A dictionary of key/value items for custom metadata properties. """ if not hasattr(self, "_custom_properties"): self._custom_properties = {} for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE): key = node.localName while True: # see documentation about PDFX_NAMESPACE earlier in file idx = key.find("\u2182") if idx == -1: break key = ( key[:idx] + chr(int(key[idx + 1 : idx + 5], base=16)) + key[idx + 5 :] ) if node.nodeType == node.ATTRIBUTE_NODE: value = node.nodeValue else: value = self._get_text(node) self._custom_properties[key] = value return self._custom_properties def _get_or_create_description(self, about_uri: str = "") -> XmlElement: """Get or create an rdf:Description element with the given about URI.""" for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri: return desc doc = self.rdf_root.ownerDocument if doc is None: raise XmpDocumentError("XMP Document is None") desc = doc.createElementNS(RDF_NAMESPACE, "rdf:Description") desc.setAttributeNS(RDF_NAMESPACE, "rdf:about", about_uri) self.rdf_root.appendChild(desc) return desc def _clear_cache_entry(self, namespace: str, name: str) -> None: """Remove a 
cached value for a given namespace/name if present.""" ns_cache = self.cache.get(namespace) if ns_cache and name in ns_cache: del ns_cache[name] def _set_single_value(self, namespace: str, name: str, value: Optional[str]) -> None: """Set or remove a single metadata value.""" self._clear_cache_entry(namespace, name) desc = self._get_or_create_description() existing_elements = list(desc.getElementsByTagNameNS(namespace, name)) for elem in existing_elements: desc.removeChild(elem) if existing_attr := desc.getAttributeNodeNS(namespace, name): desc.removeAttributeNode(existing_attr) if value is not None: doc = self.rdf_root.ownerDocument if doc is None: raise XmpDocumentError("XMP Document is None") prefix = self._get_namespace_prefix(namespace) elem = doc.createElementNS(namespace, f"{prefix}:{name}") text_node = doc.createTextNode(str(value)) elem.appendChild(text_node) desc.appendChild(elem) self._update_stream() def _set_bag_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None: """Set or remove bag values (unordered array).""" self._clear_cache_entry(namespace, name) desc = self._get_or_create_description() existing_elements = list(desc.getElementsByTagNameNS(namespace, name)) for elem in existing_elements: desc.removeChild(elem) if values: doc = self.rdf_root.ownerDocument if doc is None: raise XmpDocumentError("XMP Document is None") prefix = self._get_namespace_prefix(namespace) elem = doc.createElementNS(namespace, f"{prefix}:{name}") bag = doc.createElementNS(RDF_NAMESPACE, "rdf:Bag") for value in values: li = doc.createElementNS(RDF_NAMESPACE, "rdf:li") text_node = doc.createTextNode(str(value)) li.appendChild(text_node) bag.appendChild(li) elem.appendChild(bag) desc.appendChild(elem) self._update_stream() def _set_seq_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None: """Set or remove sequence values (ordered array).""" self._clear_cache_entry(namespace, name) desc = self._get_or_create_description() 
existing_elements = list(desc.getElementsByTagNameNS(namespace, name)) for elem in existing_elements: desc.removeChild(elem) if values: doc = self.rdf_root.ownerDocument if doc is None: raise XmpDocumentError("XMP Document is None") prefix = self._get_namespace_prefix(namespace) elem = doc.createElementNS(namespace, f"{prefix}:{name}") seq = doc.createElementNS(RDF_NAMESPACE, "rdf:Seq") for value in values: li = doc.createElementNS(RDF_NAMESPACE, "rdf:li") text_node = doc.createTextNode(str(value)) li.appendChild(text_node) seq.appendChild(li) elem.appendChild(seq) desc.appendChild(elem) self._update_stream() def _set_langalt_values(self, namespace: str, name: str, values: Optional[dict[str, str]]) -> None: """Set or remove language alternative values.""" self._clear_cache_entry(namespace, name) desc = self._get_or_create_description() existing_elements = list(desc.getElementsByTagNameNS(namespace, name)) for elem in existing_elements: desc.removeChild(elem) if values: doc = self.rdf_root.ownerDocument if doc is None: raise XmpDocumentError("XMP Document is None") prefix = self._get_namespace_prefix(namespace) elem = doc.createElementNS(namespace, f"{prefix}:{name}") alt = doc.createElementNS(RDF_NAMESPACE, "rdf:Alt") for lang, value in values.items(): li = doc.createElementNS(RDF_NAMESPACE, "rdf:li") li.setAttribute("xml:lang", lang) text_node = doc.createTextNode(str(value)) li.appendChild(text_node) alt.appendChild(li) elem.appendChild(alt) desc.appendChild(elem) self._update_stream() def _get_namespace_prefix(self, namespace: str) -> str: """Get the appropriate namespace prefix for a given namespace URI.""" return _NAMESPACE_PREFIX_MAP.get(namespace, "unknown") def _update_stream(self) -> None: """Update the stream with the current XML content.""" doc = self.rdf_root.ownerDocument if doc is None: raise XmpDocumentError("XMP Document is None") xml_data = doc.toxml(encoding="utf-8") self.stream.set_data(xml_data) ================================================ 
FILE: pyproject.toml ================================================ [build-system] requires = ["flit_core >=3.11,<4"] build-backend = "flit_core.buildapi" [project] name = "pypdf" authors = [{ name = "Mathieu Fenniak", email = "biziqe@mathieu.fenniak.net" }] maintainers = [{ name = "stefan6419846" }, { name = "Martin Thoma", email = "info@martin-thoma.de" }] description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" readme = "README.md" dynamic = ["version"] license = "BSD-3-Clause" license-files = ["LICENSE"] requires-python = ">=3.9" classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", "Operating System :: OS Independent", "Topic :: Software Development :: Libraries :: Python Modules", "Typing :: Typed", ] dependencies = [ "typing_extensions >= 4.0; python_version < '3.11'", ] [project.urls] Changelog = "https://pypdf.readthedocs.io/en/latest/meta/CHANGELOG.html" Documentation = "https://pypdf.readthedocs.io/en/latest/" Source = "https://github.com/py-pdf/pypdf" "Bug Reports" = "https://github.com/py-pdf/pypdf/issues" [project.optional-dependencies] crypto = ["cryptography"] cryptodome = ["PyCryptodome"] image = ["Pillow>=8.0.0"] full = [ "cryptography", "Pillow>=8.0.0" ] dev = [ "flit", "pip-tools", "pre-commit", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel" ] docs = [ "myst_parser", "sphinx", "sphinx_rtd_theme" ] [tool.check-wheel-contents] package = "./pypdf" [tool.flit.sdist] exclude = [ ".git-blame-ignore-revs", ".github/*", ".gitignore", ".gitmodules", ".pre-commit-config.yaml", "docs/*", 
"make_release.py", "Makefile", "requirements/*", "sample-files/.github/*", "sample-files/.gitignore", "sample-files/.pre-commit-config.yaml", "tests/pdf_cache/*", ] include = ["resources/", "tests/", "CHANGELOG.md"] [tool.pytest.ini_options] addopts = "--disable-socket" filterwarnings = ["error"] markers = [ "slow: Test which require more than a second", "samples: Tests which use files from https://github.com/py-pdf/sample-files", "enable_socket: Tests which need to download files" ] testpaths = ["tests"] norecursedirs = ["tests/pdf_cache"] [tool.isort] line_length = 79 indent = ' ' multi_line_output = 3 include_trailing_comma = true known_third_party = ["pytest"] [tool.coverage.run] source = ["pypdf"] branch = true patch = [ "subprocess", ] parallel = true [tool.coverage.report] # Regexes for lines to exclude from consideration exclude_lines = [ # Have to re-enable the standard pragma "pragma: no cover", "@overload", "deprecated", # Don't complain about type-checking code not being hit by unit tests "if TYPE_CHECKING", # Don't complain about missing debug-only code: "def __repr__", "def __str__", "if self\\.debug", # Don't complain if tests don't hit defensive assertion code: "raise AssertionError", "raise NotImplementedError", # Don't complain if non-runnable code isn't run: "if __name__ == .__main__.:", ] [tool.ruff] line-length = 120 exclude = [ "sample-files/", ] [tool.ruff.lint] select = ["ALL"] ignore = [ "A001", # Variable is shadowing a Python builtin "A002", # Function argument is shadowing a Python builtin "ANN401", # Dynamically typed expressions (typing.Any) are disallowed "ARG001", # Unused function argument "ARG002", # Unused method argument "ARG004", # Unused static method argument "B904", # Within an `except` clause, raise exceptions with "B905", # `zip()` without an explicit `strict=` parameter "BLE001", # Do not catch blind exception: `Exception` "COM812", # Yes, they make the diff smaller "D101", # Missing docstring in public class "D102", # 
Missing docstring in public method "D105", # Missing docstring in magic method "D106", # Missing docstring in public nested class "D107", # Missing docstring in `__init__` "D205", # One blank line required between summary line and description "D212", # I want multiline-docstrings to start at the second line "D401", # First line of docstring should be in imperative mood - false positives "D415", # First line should end with a period "D417", # Missing argument descriptions in the docstring "DTZ001", # The use of `datetime.datetime()` without `tzinfo` is necessary "EM101", # Exception must not use a string literal, assign to variable first "EM102", # Exception must not use an f-string literal, assign to variable first "ERA001", # Found commented-out code "FA100", # Missing `from __future__ import annotations`, but uses `typing.Dict` "FA102", # Missing `from __future__ import annotations`, but uses PEP 604 union "FBT001", # Boolean positional arg in function definition "FBT002", # Boolean default value in function definition "FBT003", # Boolean positional value in function call "FIX002", # TODOs should typically not be in the code, but sometimes are ok "G004", # f-string in logging statement "N806", # non-lowercase-variable-in-function "N814", # Camelcase `PageAttributes` imported as constant `PG` "N817", # CamelCase `PagesAttributes` imported as acronym `PA` "PERF203", # `try`-`except` within a loop incurs performance overhead "PGH003", # Use specific rule codes when ignoring type issues "PLW1510", # `subprocess.run` without explicit `check` argument "PLW2901", # `with` statement variable `img` overwritten by assignment target "PT011", # `pytest.raises(ValueError)` is too broad, set the `match` "PT012", # `pytest.raises()` block should contain a single simple statement "PT014", # Ruff bug: Duplicate of test case at index 1 in `@pytest_mark.parametrize` "PTH123", # `open()` should be replaced by `Path.open()` "PYI042", # Type alias `mode_str_type` should be CamelCase 
"RUF001", # Detect confusable Unicode-to-Unicode units. Introduces bugs "RUF002", # Detect confusable Unicode-to-Unicode units. Introduces bugs "S101", # Use of `assert` detected "S110", # `try`-`except`-`pass` detected, consider logging the exception "SIM105", # contextlib.suppress "SIM108", # Don't enforce ternary operators "SLF001", # Private member accessed "TC006", # To discuss: Add quotes to type expression in `typing.cast()` "TD002", # Authors of TODOs can be found via git "TD003", # For the moment, fix it later: Missing issue link on the line following this TODO "TID252", # We want relative imports "TRY002", # Create your own exception "TRY003", # Avoid specifying long messages outside the exception class "TRY004", # Prefer `TypeError` exception for invalid type "TRY201", # Use `raise` without specifying exception name "TRY300", # Consider moving this statement to an `else` block "TRY301", # Abstract `raise` to an inner function "UP006", # Non-PEP 585 annotation. As long as we are not on Python 3.11+ "UP007", # Non-PEP 604 annotation. 
As long as we are not on Python 3.11+ ] [tool.ruff.lint.mccabe] max-complexity = 30 # Recommended: 10 [tool.ruff.lint.per-file-ignores] "_cryptography.py" = ["S304", "S305"] # Use of insecure cipher / modes, aka RC4 and AES-ECB "_encryption.py" = ["S324"] "_writer.py" = ["S324"] "pypdf/_codecs/symbol.py" = ["A005"] # Module shadows a Python standard-library module "types.py" = ["A005"] # Module shadows a Python standard-library module "pypdf/_text_extraction/__init__.py" = ["PLW0603"] # Using the global statement to update is discouraged "docs/conf.py" = ["INP001", "PTH100"] "json_consistency.py" = ["T201"] "make_release.py" = ["S603", "S607", "T201"] "pypdf/*" = ["N802", "N803"] # We first need to deprecate old stuff "tests/*" = ["ANN001", "ANN201", "B017", "B018", "D103", "D104", "S105", "S106"] "tests/test_workflows.py" = ["T201"] [tool.ruff.lint.pydocstyle] convention = "google" [tool.ruff.lint.pylint] allow-magic-value-types = ["bytes", "float", "int", "str"] max-args = 12 # Recommended: 5 max-branches = 36 # Recommended: 12 max-returns = 11 # Recommended: 6 max-statements = 176 # Recommended: 50 [tool.docformatter] pre-summary-newline = true wrap-summaries = 0 wrap-descriptions = 0 [tool.mypy] show_error_codes = true ignore_missing_imports = true check_untyped_defs = true disallow_any_generics = true disallow_untyped_defs = true disallow_incomplete_defs = true warn_redundant_casts = true warn_unused_ignores = true warn_unused_configs = true exclude = ['venv', '.venv', 'tests', 'make_release.py'] ================================================ FILE: requirements/ci-3.11.txt ================================================ # # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --output-file=requirements/ci-3.11.txt requirements/ci.in # cffi==2.0.0 # via cryptography coverage[toml]==7.13.0 # via # -r requirements/ci.in # pytest-cov cryptography==46.0.5 # via -r requirements/ci.in defusedxml==0.7.1 # via fpdf2 
exceptiongroup==1.2.2 # via pytest execnet==2.1.1 # via pytest-xdist fonttools==4.61.0 # via fpdf2 fpdf2==2.8.1 # via -r requirements/ci.in iniconfig==2.0.0 # via pytest mypy==1.17.0 # via -r requirements/ci.in mypy-extensions==1.0.0 # via mypy packaging==24.1 # via pytest pillow==12.1.1 # via # -r requirements/ci.in # fpdf2 pluggy==1.5.0 # via pytest py-cpuinfo==9.0.0 # via pytest-benchmark pycparser==2.22 # via cffi pytest==8.3.3 # via # -r requirements/ci.in # pytest-benchmark # pytest-cov # pytest-socket # pytest-timeout # pytest-xdist pytest-benchmark==4.0.0 # via -r requirements/ci.in pytest-cov==5.0.0 # via -r requirements/ci.in pytest-socket==0.7.0 # via -r requirements/ci.in pytest-timeout==2.3.1 # via -r requirements/ci.in pytest-xdist==3.6.1 # via -r requirements/ci.in pyyaml==6.0.2 # via -r requirements/ci.in ruff==0.15.0 # via -r requirements/ci.in tomli==2.0.2 # via # coverage # mypy # pytest typeguard==4.3.0 # via -r requirements/ci.in typing-extensions==4.12.2 # via # mypy # typeguard ================================================ FILE: requirements/ci.in ================================================ coverage fpdf2 mypy pillow cryptography pytest pytest-benchmark pytest-socket pytest-timeout pytest-xdist pytest-cov # ruff # only take this for 3.11 typeguard pyyaml ================================================ FILE: requirements/ci.txt ================================================ # # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile requirements/ci.in # cffi==2.0.0 # via cryptography coverage[toml]==7.10.7 # via # -r requirements/ci.in # pytest-cov cryptography==46.0.5 # via -r requirements/ci.in exceptiongroup==1.2.2 # via pytest execnet==2.1.1 # via pytest-xdist importlib-metadata==8.5.0 # via typeguard iniconfig==2.0.0 # via pytest mypy==1.13.0 # via -r requirements/ci.in mypy-extensions==1.0.0 # via mypy packaging==24.1 # via pytest pillow==10.4.0 # via # -r requirements/ci.in # 
fpdf2 pluggy==1.5.0 # via pytest py-cpuinfo==9.0.0 # via pytest-benchmark pycparser==2.22 # via cffi pytest==8.3.3 # via # -r requirements/ci.in # pytest-benchmark # pytest-cov # pytest-socket # pytest-timeout # pytest-xdist pytest-benchmark==4.0.0 # via -r requirements/ci.in pytest-cov==5.0.0 # via -r requirements/ci.in pytest-socket==0.7.0 # via -r requirements/ci.in pytest-timeout==2.3.1 # via -r requirements/ci.in pytest-xdist==3.6.1 # via -r requirements/ci.in pyyaml==6.0.2 # via -r requirements/ci.in tomli==2.0.2 # via # coverage # mypy # pytest typeguard==4.3.0 # via -r requirements/ci.in typing-extensions==4.13.2 # via # mypy # typeguard zipp==3.20.2 # via importlib-metadata ================================================ FILE: requirements/dev.in ================================================ pillow pip-tools pre-commit pytest-cov flit wheel ================================================ FILE: requirements/dev.txt ================================================ # # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile requirements/dev.in # build==1.2.2.post1 # via pip-tools certifi==2024.8.30 # via requests cfgv==3.4.0 # via pre-commit charset-normalizer==3.4.0 # via requests click==8.1.7 # via pip-tools coverage[toml]==7.6.1 # via pytest-cov distlib==0.3.9 # via virtualenv docutils==0.20.1 # via flit exceptiongroup==1.2.2 # via pytest filelock==3.20.3 # via virtualenv flit==3.11.0 # via -r dev.in flit-core==3.11.0 # via flit identify==2.6.1 # via pre-commit idna==3.10 # via requests importlib-metadata==8.5.0 # via build iniconfig==2.0.0 # via pytest nodeenv==1.9.1 # via pre-commit packaging==24.1 # via # build # pytest # wheel pillow==12.1.1 # via -r dev.in pip-tools==7.4.1 # via -r dev.in platformdirs==4.3.6 # via virtualenv pluggy==1.5.0 # via pytest pre-commit==3.5.0 # via -r dev.in pyproject-hooks==1.2.0 # via # build # pip-tools pytest==8.3.3 # via pytest-cov pytest-cov==5.0.0 # via -r dev.in 
pyyaml==6.0.2 # via pre-commit requests==2.32.4 # via flit tomli==2.0.2 # via # build # coverage # pip-tools # pytest tomli-w==1.0.0 # via flit typing-extensions==4.15.0 # via virtualenv urllib3==2.6.3 # via requests virtualenv==20.36.1 # via pre-commit wheel==0.46.2 # via # -r dev.in # pip-tools zipp==3.20.2 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # pip # setuptools ================================================ FILE: requirements/docs.in ================================================ sphinx sphinx_rtd_theme myst_parser ================================================ FILE: requirements/docs.txt ================================================ # # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/docs.in # alabaster==1.0.0 # via sphinx babel==2.16.0 # via sphinx certifi==2024.8.30 # via requests charset-normalizer==3.4.0 # via requests docutils==0.21.2 # via # myst-parser # sphinx # sphinx-rtd-theme idna==3.10 # via requests imagesize==1.4.1 # via sphinx jinja2==3.1.6 # via # myst-parser # sphinx markdown-it-py==3.0.0 # via # mdit-py-plugins # myst-parser markupsafe==3.0.1 # via jinja2 mdit-py-plugins==0.4.2 # via myst-parser mdurl==0.1.2 # via markdown-it-py myst-parser==4.0.0 # via -r requirements/docs.in packaging==24.1 # via sphinx pygments==2.18.0 # via sphinx pyyaml==6.0.2 # via myst-parser requests==2.32.4 # via sphinx snowballstemmer==2.2.0 # via sphinx sphinx==8.1.3 # via # -r requirements/docs.in # myst-parser # sphinx-rtd-theme # sphinxcontrib-jquery sphinx-rtd-theme==3.0.1 # via -r requirements/docs.in sphinxcontrib-applehelp==2.0.0 # via sphinx sphinxcontrib-devhelp==2.0.0 # via sphinx sphinxcontrib-htmlhelp==2.1.0 # via sphinx sphinxcontrib-jquery==4.1 # via sphinx-rtd-theme sphinxcontrib-jsmath==1.0.1 # via sphinx sphinxcontrib-qthelp==2.0.0 # via sphinx sphinxcontrib-serializinghtml==2.0.0 # via sphinx tomli==2.0.2 # 
via sphinx urllib3==2.6.3 # via requests ================================================ FILE: resources/010-pdflatex-forms.txt ================================================ Name Check Submit 1 ================================================ FILE: resources/AEO.1172.layout.rot180.txt ================================================ 9 1of Page 2022 AEO Management Co. All Rights Reserved. Proprietary and Confidential AEO Business Information. Subject to Legal Action if Disclosed Without Authorization from AEO.Date Printed: 17/Nov/2022 PRODUCT SUMMARY Fit / Other: 1172 KNIT SHORTIE Style Desc: SUMMER-B 2023 Season: 50 / 170 Division / Dept: AMERICAN EAGLE OUTFITTERSCompany: SUMMER-B 2023 1172 KNIT SHORTIE STYLE: 1172 STATUS: FNL ================================================ FILE: resources/AEO.1172.layout.txt ================================================ STATUS: FNL STYLE: 1172 1172 KNIT SHORTIE SUMMER-B 2023 Company: AMERICAN EAGLE OUTFITTERS Division / Dept: 50 / 170 Season: SUMMER-B 2023 Style Desc: 1172 KNIT SHORTIE Fit / Other: PRODUCT SUMMARY Date Printed: 17/Nov/2022 2022 AEO Management Co. All Rights Reserved. Proprietary and Confidential AEO Business Information. Subject to Legal Action if Disclosed Without Authorization from AEO.Page 1of 9 ================================================ FILE: resources/Claim Maker Alerts Guide_pg2.layout.txt ================================================ Updated System Responses for Common Scenarios Scenario Before Change After Why? An On Hold / Missing New doc info was Leave state as On Batches can be released early Documents case receives its logged but no Hold and update state and coders can code all they can first documentation set after further automated reason to Ready To and then leave the batch in In coding operations have action was taken. Code. Progress. When docs come in, already begun for the batch the case is picked up by the (batch state = In Progress). 
normal On Hold process due to the assignment of the Ready to Code state reason. An “incomplete” case (not All documents All manually attached Ensures that ALL info that has Code Completed or Ignored) were “overwritten” PDFs are preserved arrived for the case remains in an “in flight” batch (state = with data from the in place and all visible to users. Specifically Reconciled, Assigned, or In new documents. “extracted” addresses split labor / C-section Progress) receives new documents are cases, allowing a coder to refer documents. aggregated under a back to the “Superseded” SUPERSEDED ON documents to make sure a newly [DATE] text doc with extracted “C-section only” type Complete document wasn’t really a Labor Record. to C-section case. New documents are received New doc info was Existing documents Prompts the coder to review the for a Code Completed or logged but no are “superseded” new documentation set while Ignored case in an “in flight” further automated (see previous) and retaining all previously applied batch. action was taken. the case is set back codes. If no significant change is to On Hold / Ready to noted, the case can simply be set Code. back to Code Completed. Documentation for an New case info The case is added to Ensures proper review of any “uncoded” (aka not Code was logged but a new batch with the additional documentation Completed) case or a new no further same date of service. received for a previously patient is received for a automated action Set state to Ignored completed batch as well as Complete or Charges Entered was taken. on the original case (if documentation for brand new batch. it exists) and add cases after a batch has already notes to both the been Completed. Notes on the original and new original and duplicate case cases indicating the ensure that users are aware of link between the two. actions taken by the system. 
Documentation for a Code New doc info was Existing case The status of the new document Completed case in a logged but no documents are left in is clearly indicated as arriving Complete or Charges Entered further automated place and the new AFTER the associated case was batch is received. action was taken. documentation is coded avoiding potential added as a PDF confusion regarding which attachment with type documentation was utilized at the “complete record” and time of coding while also title POSTED LATE - providing access to the new info [DATE]. and allowing the end user to determine the correct course of action. ================================================ FILE: resources/Epic.Page.layout.txt ================================================ All Postprocedure Notes Last edited 10/11/23 0919 by Danny Chaung, DO Date of Service 10/11/23 0918 Status: Signed Anesthesia Post Evaluation Procedure Summary Date: 10/11/23 Room / Location: EHMC ENDOSCOPY Anesthesia Start: 0852 Anesthesia Stop: 0918 Procedure: COLONOSCOPY Diagnosis: Cancer screening Scheduled Providers: Walter A Klein, MD; Danny Chaung, Responsible Provider: Danny Chaung, DO DO Anesthesia Type: general ASA Status: 2 Patient location during evaluation: PACU Post op Vital Signs: stable Level of consciousness: awake and alert Pain management: adequate analgesia Airway patency: patent Anesthetic complications: no Respiratory status: unassisted Hydration status: continuing Post-op Complications: No Assessment: Nausea and Vomiting: absent MIPS Measure #404 - Smoking Abstinence Is the patient a current smoker? No (XX404) ================================================ FILE: resources/afm_to_dataclass.py ================================================ # ruff: noqa: T201, INP001, D100 # Use this file to generate Font dataclasses for the 14 Adobe Core fonts. 
import re import textwrap import urllib.request from io import BytesIO from typing import cast from zipfile import ZipFile from pypdf._codecs.adobe_glyphs import adobe_glyphs from pypdf.constants import FontFlags # FONT_LOC = "web.archive.org/web/20110531171921if_/http://www.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/Core14_AFMs.zip" FONT_LOC = "download.macromedia.com/pub/developer/opentype/tech-notes/Core14_AFMs.zip" PROTOCOL = "https://" FONT_URL = PROTOCOL + FONT_LOC class Parser: def __init__(self) -> None: self.license_information = "" self.files: dict[str, str] = {} def get_fonts(self) -> None: with urllib.request.urlopen( f"https://{FONT_LOC}" ) as connection, ZipFile(BytesIO( connection.read()) ) as font_zip: for filename in font_zip.namelist(): if filename.lower().endswith(".afm"): with font_zip.open(filename, mode="r") as afm_font_file: self.files[filename] = afm_font_file.read().decode("utf-8") else: with font_zip.open(filename, mode="r") as afm_font_file: self.license_information = afm_font_file.read().decode("utf-8") def get_disclaimer(self, width: int = 95) -> str: pre = ( "# This file is based upon the 14 core AFM files provided by Adobe/Macromedia at\n# " + FONT_URL + "\n# The original copyright follows:\n#\n# " + "-" * width + "\n" ) title = "# " + self.license_information.split("")[1].split("")[0] text = self.license_information.split('')[1].split(" list[str]: # noqa: C901 # AFM specification: https://adobe-type-tools.github.io/font-tech-notes/pdfs/5004.AFM_Spec.pdf copyrights: list[str] = [] name: str = "" family: str = "" weight: str = "" ascent: float = 0.0 descent: float = 0.0 cap_height: float = 0.0 x_height: float = 0.0 italic_angle: float = 0.0 flags: int = 0 bbox: tuple[float, float, float, float] = (0, 0, 0, 0) character_widths: dict[str, int] = {} for line in font_data.splitlines(keepends=False): if not line.strip(): continue if " " not in line: continue key, value = line.split(" ", maxsplit=1) if not key: continue if key == 
"FontName": name = value if "Times" in value: flags |= FontFlags.SERIF if key == "Weight": weight = value if key == "FamilyName": family = value if key == "Ascender": ascent = cast(float, value) if key == "Descender": descent = cast(float, value) if key == "CapHeight": cap_height = cast(float, value) if key == "XHeight": x_height = cast(float, value) if key == "ItalicAngle": italic_angle = cast(float, value) if value != "0": flags |= FontFlags.ITALIC if key == "IsFixedPitch" and value.lower() == "true": flags |= FontFlags.FIXED_PITCH if key == "FontBBox": bbox = tuple(map(float, value.split(" ")[:4])) # type: ignore if key == "EncodingScheme": if value == "FontSpecific": flags |= FontFlags.SYMBOLIC else: flags |= FontFlags.NONSYMBOLIC # Add copyright information. This is available in two fields: "Comment" and "Notice". # However, all information available in "Comment" is also available in "Notice", and # the information under "Notice" is more complete. Ignore "Comment" and only copy # information from "Notice", to avoid adding the same information twice. if key == "Notice" and value.startswith("Copyright"): copyrights.append(re.sub(r"\.([A-Z])", r". \1", value)) # Take care of missing space after period. if key == "C": # C integer ; WX number ; N name; We're ignoring C. 
key_value_pairs = line.split(";") character_width_x = -1 character_name = "dummy" for pair in key_value_pairs: if not pair.strip(): continue key_of_pair, value_of_pair = pair.strip().split(" ", maxsplit=1) if key_of_pair == "WX": character_width_x = int(value_of_pair) if key_of_pair == "N": character_name = value_of_pair glyph = adobe_glyphs[f"/{character_name}"] character_widths[glyph.encode("unicode_escape").decode("utf-8")] = character_width_x if key == "CH": raise NotImplementedError(name, line) # Add default width try: if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH: character_widths["default"] = character_widths[" "] else: character_widths["default"] = 2 * character_widths[" "] except KeyError: pass result = [ f" # Generated from {file_name}" ] for copyright_entry in sorted(set(copyrights)): result.extend(f" # {line}" for line in textwrap.wrap(text=copyright_entry, width=95)) result.append(f' "{name}": CoreFontMetrics(') result.append(" font_descriptor=FontDescriptor(") result.append(f' name="{name}",') result.append(f' family="{family}",') result.append(f' weight="{weight}",') result.append(f" ascent={ascent},") result.append(f" descent={descent},") result.append(f" cap_height={cap_height},") result.append(f" x_height={x_height},") result.append(f" italic_angle={italic_angle},") result.append(f" flags={flags},") result.append(f" bbox=({', '.join(map(str, bbox))}),") result.append(" ),") result.append(" character_widths={") for character, width in character_widths.items(): d = '"' try: if ord(character) == 34: # Double quotation mark d = "'" except TypeError: pass result.append(f" {d}{character}{d}: {width},") result.append(" },") result.append(" ),") return result def get_font_data(self) -> str: data = [ "from pypdf._font import CoreFontMetrics, FontDescriptor\n\n" "CORE_FONT_METRICS: dict[str, CoreFontMetrics] = {", ] for name, font_data in self.files.items(): data.extend(self._handle_font(name, font_data)) data.append("}\n") return 
"\n".join(data) parser = Parser() parser.get_fonts() print(parser.get_disclaimer()) print(parser.get_font_data()) ================================================ FILE: resources/crazyones.txt ================================================ The Crazy Ones October 14, 1998 Heres to the crazy ones. The misfits. The rebels. The troublemakers. The round pegs in the square holes. The ones who see things differently. Theyre not fond of rules. And they have no respect for the status quo. You can quote them, disagree with them, glorify or vilify them. About the only thing you cant do is ignore them. Because they change things. They invent. They imagine. They heal. They explore. They create. They inspire. They push the human race forward. Maybe they have to be crazy. How else can you stare at an empty canvas and see a work of art? Or sit in silence and hear a song thats never been written? Or gaze at a red planet and see a laboratory on wheels? We make tools for these kinds of people. While some see them as the crazy ones, we see genius. Because the people who are crazy enough to think they can change the world, are the ones who do. ================================================ FILE: resources/crazyones_layout_vertical_space.txt ================================================ The Crazy Ones October 14, 1998 Heres to the crazy ones. The misfits. The rebels. The troublemakers. The round pegs in the square holes. The ones who see things differently. Theyre not fond of rules. And they have no respect for the status quo. You can quote them, disagree with them, glorify or vilify them. About the only thing you cant do is ignore them. Because they change things. They invent. They imagine. They heal. They explore. They create. They inspire. They push the human race forward. Maybe they have to be crazy. How else can you stare at an empty canvas and see a work of art? Or sit in silence and hear a song thats never been written? 
Or gaze at a red planet and see a laboratory on wheels? We make tools for these kinds of people. While some see them as the crazy ones, we see genius. Because the people who are crazy enough to think they can change the world, are the ones who do. ================================================ FILE: resources/crazyones_layout_vertical_space_font_height_weight.txt ================================================ The Crazy Ones October 14, 1998 Heres to the crazy ones. The misfits. The rebels. The troublemakers. The round pegs in the square holes. The ones who see things differently. Theyre not fond of rules. And they have no respect for the status quo. You can quote them, disagree with them, glorify or vilify them. About the only thing you cant do is ignore them. Because they change things. They invent. They imagine. They heal. They explore. They create. They inspire. They push the human race forward. Maybe they have to be crazy. How else can you stare at an empty canvas and see a work of art? Or sit in silence and hear a song thats never been written? Or gaze at a red planet and see a laboratory on wheels? We make tools for these kinds of people. While some see them as the crazy ones, we see genius. Because the people who are crazy enough to think they can change the world, are the ones who do. 
================================================ FILE: resources/jpeg.txt ================================================ ffd8ffe000104a46494600010100000100010000ffdb0043000302020302020303030304030304050805050404050a070706080c0a0c0c0b0a0b0b0d0e12100d0e110e0b0b1016101113141515150c0f171816141812141514ffdb00430103040405040509050509140d0b0d1414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414ffc20011080258032003012200021101031101ffc4001d000100010501010100000000000000000000050203040607010809ffc4001c0101000203010101000000000000000000000304010205060708ffda000c03010002100310000001f950000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000cab1ae2a6365f550e977ba07be8a1d12eefb9672fc7e978d1e79ad3d121b8926a8918ef236020c800000000000000000007b7f5d71d9f9fa4702cfc2df7a46db800000000000000000000000000000000002fef8b12d3131efaae14eef3d87af1fceb81db78f48c5f7cc6f4f064db8ff63cdea3cf60cf96f3b1e1cc3416d719c99355a64b03e736e81c990000000000000000af6e86bead3bb7e673793aec84fe651e6c24d64e4d7abe5491a3cdd3795fd33edab9f27bb472bf4bebe2c5de88000000000000000000000000000000d8afe91fb35d7d528decfc2f2675acce296a0cecdab62474b897cad5f2a0cecd7b5fccecc720b17a1ce458bf6a7c58f2f79be21a1b708be349a8f92f11f2ebc1ccdc0000000000013135bff2f8d19277ee70fcddbab325659e0aacbc1860cac98ec9df79cdbb4bedd2cdcc63ba7f27a1cdaade1c9435f89689f5670aef7a6d207a0f50000000000000000000000000000b956e1de8ac491f54a2f3df6c63cf2f55066cd19d174b68c8cccc3e5ef6fcabde4c9e5da6425c5ebd876ba5a4f65c24cfa18ae48741eb9c097e54c7ebbc9bb91476b3b9e0f377d295d1f21e806000000000cbc6b6fa4484cf9ff314dff2aa5cef7da33f5d3a06e393b1fa1f57c3746e95cc39dc9b1662e2ea50df379e419b1c5f44f32d52c4d676891d467b95c4ca8c9eb55aa70bd27e92e11ec3de420ebf740000000000000000000000017acedbd68f2336fd3f57a365ef9775ab271aaad9b96ed4b53da3b23bd6d1e6a6f9161bade87576b193294f4b4c08fcf839b164317368d4e6ae63ec3ea5c87b07ceadf2ef95fe91f9c7d9d78db52b67d8d7d320b7bd23e6376d8f2938000000a8bfd63036cf3fe5e8aaba79fcab757a8e25cb5e1
d5a7f86e374fb3b073eb51d1c35506db65e4d32ba690fe64531c3eeeda6f528a1bb87b46a7479d662a564659be5db3d67937aef7416ae8000000000000000000000197bf68d39eeaaec78b8befad82ea8f7a1a54f3130af7ae57ef979feb9e53ccad71a4af371f63edc76b5494d7f47b47be67018cf976d3674ae8df3ecc7563d9e323b2ed624ac5cc78d15acecd85536d29553f21e80600000369d5bb4f3f97396b3e57cf794d66ade3579ed46e3fb83a47974c561997116ed4f67d79eef2012fb2e97b0e359684dde220adadf5ce45b061f4e729da22a7b3cf33706df278539f307d15cfaff004f960f59ee00000000000000000000000d96fcdc9fd739faee4ccd57b11989b659ab9d371b68898f346545c6e1278f8793d2d26a5a0aef1a489c7f69800cbcf7c05c2995c05bc4fdcc7bde8e0c9b567c8f2b77a8dda945ecdacfc8ef870650000243b8f27ebde6fc94bf60e49dbb48f1f8c76ff9ff00b7e8b49c3c2b9c5f3d3dbdec1d5fa7d8f99745fa1386691c47be5da5428f7a0c9cf63956565e0435fb359d8b65af57e6cbf2bad69a741cdd6a66ad1aad64d31c58fe5ff34d3e7bc1ea9cafd9fd002dde000000000000000000000eb92dad6c7f53a3730ba4e6d2db915dced9bb11e931bd679ed6da063f7394e4efc7bcd9a07bb1d6a6c51da8f0c6078cb268da77c45f6687fa83cfcdc6f81fdf7f2af3f7e3927ab4a7d1e9e7fb1be753491f7065e5c60e8bd3b9c7ce6de38f0b680000dc7abf1eeabe6bc849ecda4e250e6743e5f2917358d6e468ae7b1f4574ef94773ebf7a4b834fe9f5eafbba693d3e283bfe06efabf67d0fcf5a2eeba8f1f81f42efbccb69e7f2b48e4bdaf9e56abafed5a7ec38d6664e224638b6dd5fb046f4bafc5be7cfa5392edb73a1e9bd80000000000000000000006dfb972be9ff0047a7b46ebc9e7e9ed3f85660ad6bd6357c291f3f345f41e41b3e16b9cf66e6f2e35988db627d2c3abaaa707819bb6e95b0eee97f55fc37d578527d43f1eed5f3ed5da1b2f173bd3439b8bb1c7fa386d4b62e576e3b5a0740e79f3cb7647cd6e0000199daf8475fe3f0767b176be1f9ca31a4bcd34d5312720ed5c88a6aa2d5db7e57e6fbf931135631df6238ff009358cacc8798af53a4ecbcff0067a5cfb5096e361af6bdc5b3b6db75d8592ad5376dcb8f665bbd91a4ec7621aff38afd8f6ff460ce40000000000000000000742e7b2bd98fa45ca6bfa8d1b8f2b9176f635c851f76ac1e1c9b3c3625ee748c191a2deba2e2cc43cf81e63345cb5e54ce5dbb55d8c7b4d77a7c62ca64dbea4797663323a5a4de458bdd4d3179ded3ab7c7fa02ef94b169b149d7ec694deae457b4175dcd87a5c576ede71e3a73d5dabbc2f97557ac55af9fb11bb86f77fa5f3c4776
be4b73a11a20ac07a78c5d908992db7d92a838c2e46dacd924f26eec954a5e6558ab48f33222bcaf5253c84c2d34e5560f6bf430ce4000000000000000000003a34f72ceabf51a2afcbdde8a8c8b556b990819c8fe249177ed5ca5b5dae9bbae60751e85a34d8c3f3df375bf2e4c57cc35eef51f1b92cc6557eb20f632431fa1a43e565e5d0de8c096caf3fe879a4eee177e3bef62b67c2eb7cff00458bb6cf697d1f19571f9484a5e9727222a5ea7a185b925996391834ec5090f46d7985469ccecbd67e2ceffdaf95e5f00ed7c1f1e4e1c50e61ee56566e4c24975bb12319bc95f95cde91e3cedebd4b9cabca99bd4556b18a54d51c546a7b8f25bdd1851e8bd580000000000000000000000e87cf32ba91f61928597faa5258db65b97bf39a65ec74f487f36782e349897eddee7ef569bb8c7498e7abf66d63cef7c57e97833dba137cd73c1d9f9135cdab5efac57c5af3673cbfb48495c97cd7e83e53579c2edc75fbdb1cb435ca7e879dbbe6be4a77de0315ff0021acebcc4b6f7cba567e5fdb7b7fc67876b85f4a70fd57cabdebd5e05fa5e9756c4db75c9f97b5476b9b1c7f2188f73b0f3e4ecdec3b534fb061c564e76a6e65ed1a471b3db566c9368f66522a973bdf482b79e55eb1e548dce62f98e4e37a5f5e166e0000000000000000000000006d5d2f85746f7b57a76e3cd6627c4fc3c9c0e71bfeaf6e6bcccfa56f3aace5fd2175aea9aa7435e3d81bce8bda8a7b77d1f23d243f54e95c9257e31f43899eccfa13c97a7d02466b9d6fc8c0d2e8b153bf5dea7c8af57d4795e6ef0f57d3755d5731c9ea7731f31625bcfc99b930cd8b2b38d6762ae312cde0c759c36b908dcd8ef5d55e55ef69b6372d52d79cdba3a89ea5f02d3f0f758a9a3d7e46ff00a5fe91cf67b48faa6b1ac614f62e6394f9e1a4635dde592e578d63d07a80bbd1000000000000000000000000015523a6ed9c23a1fd169efd463dff005f06549e0e4f1e4c4c8b345bd76343e5f9e961393761e3d7b5a247569fe2763a0e6e853bf1ff00b5cf7b4d50def69c5c1ce92b5c1646d1cc75ae25d1ed79feeb6fe74e71d1f11d8b8b624d55f418f973187ad9ca8bd729da1ccc579893df016ae799d7629185ea91dbd1abebda8ef5b4dc6cab5174759df75d95adf2f93879bc3a5f2f87f2559db0a46e578d71ac67624f656f22d16ad61739b36e6f493bfea0269c00000000000000000000000000000365e89c59e8a1fa2b2be7fd9fd145d8a3f4fcce9c7b3d5af61f435bfa0664351daeef105b0fc8fe9f5554fbc1f4397133146f535a92be8ba397abf45e77a4d6f13dc2b3c4c8cbc49292a48cfc646e24c981a81e1917738b4eb19f3f2b89fb251715ddfbea8e1bf4fdef2da945eff19678bf24e0ccc379afb
7798b9789a72e7fdb1721fcd97acd56e1ad72e47e4e9a576a9d427b3b9e8da645767bd7ec1d6ee8672000000000000000000000000000000000000001d1e6a226783f57f445d0b8a19d5ae6c91bbd58bf306f494f13cb5724a7952713218df363bda7597df2437d92af326e3a76b23ba70afb467e5e5e3eef8bd1f1dc27e6dfb2fe34a3ea37efb7bf3a3adc57bec0e7dcbb9ab6b1e3ce57bf5357873ebbee99daf8b744f79d31e7b728a824b3dfb059b619c8000000000000000000000000000000000000000007469ed4f6ce17d51ed1ec5d0abd7b8dd6ee666d06a517f46f15ea783d42abd62bf66fe462646b2e4deb3d271276a9ddbfdebfcef9ff00cd9f617ce9075b8d7d63f2748d2f4dfa11e7ce90f7bcac9fce3936a87ab4ec16743d3da7cf7ca7e94053550c6bfccfb0f20ebfcee91d1f1a00000000000000000000000000000000000000000000006c9d0792f5ae57bfa55297a4c8b59746f5f1ef5a6963a041e548f7be4dc37065b0287adb39f1f271ddbbd63956e7a4ff006e55f3b74aeaf83dd7e27dcf8a55ee9e2a77803c0f7ca32de987934bd2d6f3dd65f28aace71739375be7577cceae3b5f3200000000000000000000000000000000000000000000002aebdc7fa7d3f4930ae9e47d0aed3458b3c5c8f7371abf5eecf6b5aa5bf3f459abd92945ca60674777dd835ec7c49b8e1ebd676af72edbaf5b1ef8f7593c7a3c78614d546712f3da5ef115e558f7f4b2b17ad696ebd2f75d627e4f3a1ddf9500000000000000000000000000000000000000000000000dd34b938ee75db176df07eb1e5bbb4e63df698f96ec7cdf43d7b6cd7b97efa13df2ab5e7f12b918bd2cdec7cbaa3b3837b2aac2cfb79a4d6aaad8cd36efda6b47977dcb1ed65e1ef1d33faf48e77d96ff004d9e9b9dc568db354e7fadb509390d678fcc476fe5e000000000000000000000000000000000000000000000001d733b51dbf83f587af62bd21b66893b7fc953ab6c507075b56b945e9f8fb5697baeb1acd45db9454ec514fb7718b74e5dac6d8deafed1def24ac2dc65a96e973f178d6276be3d3d68deefc2feb49a86fd6f6fb577ca726f987ebff8f391f42c883dab49afd9d4c77fe4800000000000000000000000000000000000000000000000137d4b89760e67b7929bb1b6c76f9dfb231f57bfe535dbc49aa5db155ef253b67132239efe1db57ede3dec762be559a33d88eb9d035d9a847d8c445d1fa47a24bcbf77e65a3718fa578beb6be67fa0b8ee6f0fea5f7579f1f43dff0025b9f35a25f97eeed721ddf9cf4bc5d23a9e0c000000000000000000000000000000000000000000000001bbe91950f43b96d7a7ec1cdf6d7a0368d5731d319916a8fa981c09b8ae8f8ebd971d9da66
1733072a2bb53c476333ea3e35f5774bc8e9d19d1636f79df8d20fac6a1c2fab7d51d0fe27eb577cbf79f983dd360eb6af81b2c573bd8dec1ae8ce9277e9d0a4ad1513ef9e83e3a1bc000000000000000000000000000000000000000000000000006f7d1780f62e47d0f6c8fb7e53f455d5896a3b98713b05ab5c3d7a9dce2e7e569f564c3631354dfcd8ba5b6fda7f01f71b9c2fa2e334fe47bf3f5b8c381f5cc4a323037af2be5aab4b34e267626d144466369fd1f1b37add0eaf8209290000000000000000000000000000000000000000000000000003aaf2ae97cc9b74f362c3f3ff4a8faae48dcf431945fb6cd7911d5cdcdd5346de345b7c09b93d5645bccca51374bd555579e56e955e53eb2c0ceb5b53a6fe7741e6f97d73649fb7caf23f206b3be687f45e105cd400000000000000000000000000000000000000000000000000006ff00a06ebcfdbaa6d5cd76cf136a623e62333dec5c3b15f5bd7d56aab96ee691a96d5aff0043ca6064534e61ca9bd733e29363af176fa18d7765de6e712088dd79d6c973ce336760f97891b98b97c197e63e71d3b98fd5a8874b400000000000000000000000000000000000000000000000000001b66a7b055cf4f979792f0f661ed4be1c3985b37a1ec6332d62513ef9317969e5c2bb7d98ee487bb54f06b8dda2b7c456cfcf25b8d24b4eeb937067abe9b2b816f58f90c0a3cecdc079a6f9a1fd3a9074350000000000000000000000000000000000000000000000000000176d30fadf64e7fd37e796f578dcfb442d9dd6bf4f0738d77adf3ea9b412f51476b7e55e65b0740e47b05dd7a36bf15812e35fc79fbfc8923b6a8c92a5b6c7b668bbe4b8c7d5f73d360dbe65d6367d63e81502ce000000000000000000000000000000000000000000000000000000379fa7fe27fa5bcc4db1dccbf7c9cf231f6a2af6b8b0d97667c62db97918d019fb6667a9874bc7df203832c4d797e7224c6b977d628a2e51b2a9dd6aee5b2c4e1c5ecf9cb18fa4530c8000000000000000000000000000000000000000000000000000001b2eb48df64d3c7bb17cd6e62634a62d2da2e2e6706de2c4879976f590cad7d3624e328b55f376e63e74b8f32b3f2fd0c50d19b2eb3c692d635fa39fbdbe43b9f12f470da1e9e1000000000000000000000000000000000000000000000000000000000c9fa8be54d839127d558f1f21f3cb545fa657a9a6164e5627a3861e3b3b0fcbcf6bdb88f34d7e7994bdc845cd72e36eda62ceb507cafafa5dc03d4c01b0000000000000000000000000000000000000000000000000000000000093eb9c41cfdbec3c8f91f7af2d3fd0185ccf2a8edb9dbd4b0776f7e685adcd8ec1e70d81bbafd1daaf04a2fe9d479f4
7bada05cd4000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003ffc40032100001040201030303030402020300000002000103040511120610131420212231600730501523324016332434357090ffda0008010100010502ff00ee4f4d2ec68ce69b1765d7f47b5a2c6d9067ad2b310b8fe36358c9474db941000bad2015f66374e8c06469b1d1934b4658bfde66db8d790d0519e45fd12e6ca859017171fe52be3ca5186a08a870f24a9f0251b156f1268d3b6979348a5da77edada36d2936a58f922171ff6a184a7283162a3a62286a6d47534a18f8b0ab38e83230dae8aad2bdcc55aa127f1f0c2539d6a410b047ccb0984f3bc1838abc798920846d4cc47e564669dddd37716532e3b5342a40443c5ff00d611737ab86d20a4c2c359990c08604d132105ad287685d70172ca74641236471b3e2ec7f1956af99e3008d0b285f81e2f30359aef546c6f654a7792c7c8d9f9697685f7df926747f2b49c7934d029634eda7ff528634efbd4a215a318d78d30f6674ce85d548bcae18afa26aef1391697954f0437a2cf74bc98d7fe2aae3db8ebb33a725e42446ee8cf48cd73d21990d84361348ce874b69fb9372696153c1f1fe9e3b024682261661ed142f2a6c6bea5afe35b4c4849519bc7362a509a3cbd06656be8367f98d11f16ea1c37a59bf8708ca42ab486bfbb86d18e9a5745df92127dc65a5ea1467cbb0839bb63a6d4911469db68e356e1f117fa10c2760e9e12085985337611e4588c5f35363580325598548da223e282cfc84cb1398784ace5c658af4ec5246ec85d9949f28e0e4f9fc03e3ff8686379a4af5c6b87766dae09856f4d348a474ebeeb83a18dc9c61e2ce5da1241f52c0613ce87a6c7c7d4183f4e89b8bbb6d4b173121702fdead564b5250a034c75db7da9ff00df8311e378599b304ccf6256679a75e57dc563e06d38a6ca9e8af72515bda80f9260da31d2b910d9832f8d2c65afe0e288a63ab59abc7dd932725c9712913616795ade2ce04103bbc5513d443598558f8ee0fa7c67d763a56a3387859756d6660bc3ab09d964a0d3feec15cecc98fa234e161f681f03c565fc637338c4194c97379ec6dddf7da174edf1b75cdd4523f2a24eb969a4916f6b27870ca559622825fe0b1f51e21
6044ddd9d725b546815b930fd26dc2ce32bd38ba86787c916b62c8e46152d9f890f93f7c7cde39ba43241c1e6061eafca03a99fcb23448e3d29e3630901e23fdb66727c2637d242c3ef1b0f129b266ed3d9737fbf7ae09e1db144e2fc5461fdcc5d2f235aa0e0130b838a89759e18f97f035a3f2cc2ec9df489fda72715d39918e1793ac638a0cc7561d9534a531d689dd48fe209a6722dfb40b8be1b365511f596e2bb962b842e994aa459187f73a7e9958b7c112d3a76d76224c48e6d296c239797b6b3a89b93495b6d2c5c5c7e1fa76d8a92b858af94a9e32dfcc66a4e13c36ab9d4b1fc0566d20b1a4d3ed7919725bed24ba5211128865140331227667841423c46f4ff00b109a17db28dd6d484894c1ce376717fdac355f494c476a0c73ca8b17c06d47e3452239114c8e42744efee8a4e2aad8faa21e63720d22fa5f1b6bc478ccc7f6b2b65a646df2c5a4d22eaea62507f015e2d42d5ddd0d7d26817a6f8281d91ed937d4a38c53943134d6f9a050a924e31cc7ccfde2fa78a45b4cfa5e44e7bececafc7c65fd9a15ded5b8db4d483c92e2e8338deacc23986e0f2cfa22976a8d37b45174ffd190c67894a1e33efaed58b5362a2f20652ab82b1f495693e6b592063b447d9d93329231963bb54a95aff007ebd5fa1abe9b5a4c81db460cea5aec8ab2900c53b3b3a075112b127d3ef18c8d156905a22d3c7f2daf612bf1f38ff0067a74395f1658f2e1362ac0f1ca58060ce5a67233d9c40e65d378a1d4948423cdc0dac80ea4400f2163f02f20d9c1f8c6e55f13c1ff6f4f45c87318f630ca527028cb815693b332715c7b75653f257ff007e87cd5d6dbd01c8a5ac712675a72442fa315c76a6aa8e3e2e2e80d4a5bf7042e4a0c73cc784e95f20643a4b50e671ef4a48255cbe392dae4beea683c913b69ff63a69ff00f20097978153cbf05772de41bf2bc84c1f3037197a72c070b328bc5d4168595d979cbb5d3b4bd44d8cc58857c9556e39a8599e01dcfd35f036b44198aac4ad41c0ea48a22d8b3aa80d213633616e9bc0f2c632c76ea9d2b1fef626467a5117d58e6026bb8f131b757c278da63235dc43305baaf13d3c6f9dece09e30b94dc1fc2fc8e3e0cefed887915781b581a8324d88a22104d558a3eb3aa20f19f17693e1cd79503ed4428b5c676d4ffb1d3e5aba048d73767e5b69c130e9cdbe3159a7aea4ea6dc593caf9dddf6eba4a5112a320955cbc82019db0ce548794f8697c4076f935d3636bb5f6b878e4ad276a5378e5c648134797c7b6ad45e33eadada2ff007b0528b0b16953b9c541739b5f8d9d52b5e278ed34c393acceb1d2b42432c73c797c7327afc4a4876a68b5edadf787fc7a7a768ec6225692ad995a283ae320252289f6bc7b670f984137c313ab3
ff7fec54b0f56787e5b8edbc49a353448e3d29117df93fb31b71e9cf47a9f88653a81e456ac3ce741b4f4e7d3797e89e7f994f929436a1f840fb658ac9bc4f6724d2c5902e679aa9eaf1dfef5395a0b480f8bd7b7a53dbe4c32e8eb5ad2b365881e6e12d4b8a736962ba1a974ad027f87ef19f1286c28af7864c4759fa71ccf5bbcb0e46f15d99561db8c5f49836e36ec4a6ffbbf67116dadd7675c931226db4c2a545f75aedaecd213273724cabbf154a4db9d86109ad33395c647676867f98664d26d312f567a33e6e42c633c7e19bfdec4daf515937c2dbbf61371453bbb48fb786c383faffa253f2169580d8ccda2f60cae2bcceeb6fdf5b507d29ec699a4e483ecad1f8e2fdae9fb651cccfd996d483b5252335354305f6f78a89405c14f73e2c5b7777989d47cc9431ba16d262764d2a6993cc9a6f99cda49bfdec75cf4761b44dec61e4a4896b49bb98edadc7af737708b920abf120f14ea141f6e5a595b1f4768e2399e1c0da9545d36ee9fa7e38c63c542ca0c3d4e058fa8bd152276c2541766d7b28d47b0f1e17fb795c5f06bb0f8cfddcfe63953d95358daff00278abed435f49875ecf94e6ead484357f80c358f354f647f738d88260d3b26eda572246dc4bd8cc981d4706d450696b4d28ed3c4a1816b8b4b2715204d7a68ba6a4750e0aa44abd6398a8742deb4c1d055e11b9d394ab85f8c02587e40abf312ac60f17367dadadaf2332e9d08e662aec30e6819872dfe7ec68ddd7851c7a4c4ecb9ba6672786b28a1e3d9930a76efc57504de2adfc050b4f56cf6e1d9be1049f137cad264ddac86c2c8e8bbe3a83dc931fd23ca2c974d7a7450784b69dd12664df09a0925418d615e95c1d88d90c8ceba4f3b8ec5c32f5ee3c1af75e94aaee5ad5e23f91ab61a27f3724efb50d1b1617f40bfab54ec545c96d7f57b18fb9d3bd7f16522ce5e676c81f293bc21b431368b42a676ec11f25057411f1eecb9277f665720f7ac7f0384ba56636fbc6db628dd7174df089714ec993276e4d7e0d3f7e8dc5b12ad41a38f398f12ab9987c53ebb34446a3a0e82b847dca42120998970174f13271b0c724a51304e4616724bd713ae98c54f9ebf8be89c763d18c75c32d9da75472f997c89228f9058ac50133b8bd5cc1da8ed06fbbba8a5d2f51f073a2772411727af53682ae98878fec66f20f521fe0ab593a92d6986c451baa90b48a7c6b70b159e378e0f2a2c5bb0cb1bc6fc769a1364ceaf47cc661e26cdb50c3f3d16e2cc3fe39e95a3a79a91a4b1c5451ec84599bd840c4b11d256f30f8fe80a9585f0b46a0e627a9542dcc3666b9776cefb5b543256719347d7d97269f3d72e212e488f487992fa99492f319a178d7d9e097d546703b398bb27edb7401b410ede9639cd418976692
9346369b8fbee6421a236272b337f0784c935636552c7050db631bb0312accd1491bc724791a2ceaad7d491d0138ef63b839c1b6c856e2f047b401c56072af4e4abd5a2d0e77a88ed8c9ca53c6f4e5dc9bd3fd3b3012e90a900e62856a43cd3cc85fe1e4658cfd403a3566fd449cdaef555fb8af5eda966724ecb8ae3da382499e0c71448ad41033e41c9eb9b908ba76dae02436eb3d692bcbe097834832c08eba285042863d2807eac6388b3da161bb736a53e6fedbb763a315dba77a6fe170b9569a302d2827d22b3c82493ebaf61d9148d24723f03ad6dd98f538598389656ba093c72472726555a475f24b1875e3bb0f55e1e18ee75d5216c87574f694f24b39f89c9344ccde35e164154dd5813ac335c39138ed3b218c8d35395d478b98d34352b23be4ec44e4eff0029bef4fe636f8ef34632059aaf0ac658e61c392920470ae1aed0a8a728d15e91d14845eec9e5c6934f39d997f8612717c4e65adb09e934ce9df6a3274337d3216de23504da52fd6b2306e3b70f19427f1a1b5f152519615af66dbb08b99627a0649c6b74de3a80daf1c23d45d4d540423daf1ed45519d340310cb948c14d6649fda4a83fd04c87b3a76626707a36a16e4c61f1282f1af128e3d211440b5d9fb492842393cf736fe259dc5f159869d850fdc5971461a4c84f49a459093fb57cb7228a3d450b4d0bc5912db1b3f692761452917688f83c79bf45603f562f30ddfd4ccb586c866efe4de2075146eea1aaad64e3aca6b125a266d7b7ec9fe5b1a4b19d3a5607fa05761ca609eb022653c3ea21c4cfceb11271da78d30ae299912d7ce93ab97e2a4190cacb7dff008cc6e6a4a45572b56cb09a134edb626d3b3a7361590b7f13b737a95fcb60231064c854d5e40379a507a7e4b44740c95981e296210679a667919ddd032841440c037324f32e2def343f6e8cc77f51cb853d315453d3621bd07a6b9da0fec5deeff099fb12d277601c8f51844734f2583fe3eb64ed54683ab660287acab701ea2a33306629984d97aeadde12457878e2b5246dde3f953ff918b18e2ee044f213461349e69663d3321513287e15bb6f3bfb6288e7928744ec2c745d770cae366c65805fa5602791681140a581662469b29dac37d31c8d207625cbe45d58b11d68ef752c211dec94f903fe6713ff00c7b771271445c9d6463e32bdc9658d9485c8d90a894f37d3eee81c1b141e911d45d63896b1870fbf46e71b039d8e4194175767a3c4d2fbf776db4f7ace36e37545943d58dcbfe531bc45d486e32e6ee4af24f24cff00cde2beac7fbaf8728173d0264280b4b7da9509f212c7d13270c8f4cd9a21dba36b8ffc73d3a3aebaa8460c10fdd74ef56dfc555b3fa85909a39ec496a5f675043f3f80e14f950f718f36b18bb75
23fba76d3b265cbb43114f361b011e2e9bd352d45d518cfe9b945fa61990b7874edb5fa9d9f8f88b69d634fe3dd998bc957f01e9f27f0a74deca9ff00b4d7ddc3378e1a16e41f84c87e7b742e3bd4e742b7c15752d75fa920216963f2363156e8feadb3459afd54b36a2924298fb513e161fdd683c9010f12fc03046e3713a6431f26d7688b84ab3157d4d24edc5d90b6900f33e9eb638cca36b4a506e3d63976cc66fde25c0d9f90fb75b6c845e2b5f805493c567b328dfe256f9ed426f2c2edb6983c734cc9beea07d4cceba77af7d046fd758568fabff50e5c98fec12a07e5acefddfecddb3b1f19ff00006771780fcf076074526dfb4133c06397aaf1ca7e4949b6cdf7ed0db63053d8189b7ccff61d6325e1297c38bfb197508fe05d3d2b4b41db4fd9bef0539acb1038129c78cfda56d1f6dfcf98d3bed0b6bd9af6ba02e06cfe7899f5dc937cace8ee0fc07a66df09e66fabb6962ec157af9610ba0af8ea5ed61be03fc53ad2105a5a5a5a5a45f0b4b4b48bb50b5e34c43221f9644c85d66bff005ff01c6cfe9ef99727ef8e3dc4acc5e29b223f43762a8670c3f23a5c1715a5af69326ee6b6a8c2766ce33a523aa078789c72f8af4088d10e9653fc7f02a537a8a9de94de2956487ebbadb85937df5f0c1e3b0c2b4b4b4b8ad7666da08d144b8ebb13fc71dbafd38c434ccd511d559fa0d2e3064215ea496489f5f8174f58e517b2aded35db0d31596fed7d9f4a07f241783c77b827f85b4cdd8bb46a36f89155a92df9eaf46c42177a361902ed49684c47b5fa5e23274ff859140ba9e41a387fbaaf578b66a6e47f81622d7a6b8a28de53b5859eac5de61fec17dc3e5a94bc0b3317f685988246ecceb9277ec05a432e94b22e8fc1b56c6fa352545d7d8d61ab5ebf98fa2f350606c09318c920c41d7dd49fd5ac54afb56e4f0c17a4e72fe078eb3eaaad5061986c18b5d87c737690bff1a66d482fda79de4a94e6dd632edb5b54e84f90903a365e390c0d9c78ed37d4f8cacde8bd3a92bafd460d626031ac3ea6395aa750dfa11647a8b257da19ce526d8acc5cf822727fc0fa6a76e71fd062fb6b31b1878c5e49dda2197e20b208546eb4a12f14bbef56b1dcb18ac1478eaaf4d58a2c639ba1fd3722cdc9743e6432b85521080754661b2d7ed3b7976abc9138d91613a11686e596af1deb0e67f8256b075278a509e3a92ed8db60e3f320ba9bfc4fe51c7a76519ed5bfeddaeffa7f59ac7500d6f82aca5aebace93499b1a2cca85a9b0b660fd47b3e3cbf545ccbb13a96b01a9aaf06ac02724d1b0ca538d4832175d7dff06e99bac2e25c5fd5bb8691eb50c0d2c76ab3c49d6bb5d7d9573e43dba0ee352ea4d2d2ba71d583256defde53288b4fd89b69ddd94903a79783dbc8edc8d
cdff0612702c4dc7bd476bcaca4179998258822825b0bd36a46a719358a5e25918b8c719f028a092c26a91409ecbae9afd47f0407d798a68f3fd533e6dfb4adb1fb267e4da5a56240886f65b829ed14df85748f12ab3c442410b0ad271765c886311110066528f10ce46c35954bc7182fba82142da6f618f1785d0bf90ea74cd9b29b014a8b752d46a39cfc2ba5dfc708719466a8f1a1fbb331338f07d693277775d40fff008e9be157b3c15701918078adadf6d27760668e6b6ab619f950863acde6465c975bc0307517e158191fd2d5b1b509b1b4b45897078de664ecb5db2e5e590a3d2fb273676199c24feaf2090e6a25fd661516424b44183c9ca38cc4d2aee528945e1faa3f843f3dbafe330ea1fc2b004ef1452717ab610cff001624da39c99c66675cc59d859d5eac5099b2765a4c2eea3a33ccabf4fc922a982a30aa9622ac366c3cac53bc458c95e677abf12b78e48c932fd438bc79efc2ba77eab035ddd431383348ec8cb6893ba775b66464c48a9c2efe8a04d5e1140fc109484dc645e530786cede23621b41b2c37c1beb8dff828cd01afd40b052e7bf0ac09f0cac5590d5455f4a4894df09cdd727f755afcd4543e0293ab94f8b485e292bddd27979bd13e07eab6d607c89e3e09a4d2eb6363ea1fc2a290a1930cc56e9bc1c54eda721da38369a93129b1ccc881c0bd98e974abccdc7d530abd719dac97291b6a090956251caea08f9c7660d0bfc3f536bfaefe17fa6b91f5b8fb65a7b0fb789b920aece9ab833db8435763fab82e2b4b48798bc16e415eadc9a4d9af4c9ab21afa51b69464a94ade29cd9c64ff003ea532933bf85f45e53fa4f50dc2faa46e4a11d381b32b122b16113791de05e976869a0a6ce9e832f49c5dabaf0b2e0c98596bb0baaf678095ae4a59c628ec98c963f0ce94ccbe631bc57f8a23533393490edd878a16da8419470b218019e5005337cfb1fb7d9f9ae6b2d686ae1ff0dc0664b077a29c2c444e9f69d48c8fe1c1d467c5e29d93caca495116d6d33261401b470a20d390a74c4bab725ea2cfe1dd0d971765a462a404516d4702f0271e2b6b7de2f9402ca360653b8a98be7922f959eca7f4c82490a693f0ead664a93e1733066aa224c1c97a567415348a0d35814fdb4b484b8bc761914cca59368fe57d96673418c82f5b2bd6bf10c366a7c2d8c56561cb552fb87dc3eceda52c8fa98f7fb1a52ca104791eb18e3566cc9727fc4b1d919f176b0fd6b14e0c98dd93caea437745edda2904066eaac744577ad5c8e5c85a9c3f16ad92b749e1ebcb8317fcfea78e2eb5c74aff00f2dc6139f5951647d67480a5eb89dded7575d9cace6af5b444e65ffed8ff00ffc4003d110001030204030604040404070000000001000203041105122131101
341061420225051324061b1233071a1334291f0151624345262728081c1e1ffda0008010301013f01f5e240dd3a6017783ecb9ce5cf284c3aa041dbf3c9037466886ee0839aed8fccec9d2f46a0c2ed4a7b40e21aac9aeb26baff00992cec84799495ae7fc3a27484f54fd538b987337754d8c4f0e927982a6af82aff008675f6ebf2ce706ee9cf2fe19ca1aa2c56e17e0d726bb37e55557867922dd3dee71b94ccce1e545ae0754ed06aa9e364c2c54f46637a7b397a858662666fc19f7e87dfe51efc889277e16e0df0109b1dd38654c7e53f904868b95575ce90e58f409a0754eb1365491343157583bcab217ee9a0c2fb853cc6409f7ea83b5d161958e999cb94f9be49eeca11b9d780574d8fdd1d3c0533652ebc2275c5bc75b5265772dbb04e6d95d0dd32b0b5b9423779b9e0e1c1f173765c8e53bccb98e87cccdd5254b6aa1120fecfc8c86e51e2d1645eaf7f0dc857e0d394f8b10a8e445a6e546e321b052d3c8d172a360ea8b3314d65b89e0d7f29d7553cb959991d45961137226319d9df2278857e03c247823376f871792f2867b2c35adcd72aa4b72d964b2a7a7045caa988336e0d81c45d3865d13911982b906c9c2c55faaa3a8ef3087f5f90eab22b5f4562d56ba22de160ba70447187dbc389479a72999a1d90a973ceab3282701aa793985462ee4db06aa8233294d828e5f7528ea131de61752d2874799ab0a98c53728f5f909058dd31fee89b146ce09853878586c9ce478c1bf8714668d90279b85184ddd05757b2ef0eb5913752eba042072e5bc0d51690a2aa2c6e529b296c824081bea3f3dedcc38dd0d15efe3278c42c1005c6c1458357cdab623ff9d3ee99d9dac3f1587f7f44dec9d465bba468feaaa7b23513c45b148d72acc0313a13f8d09fbfd96ad3aaa681d31ba92131f832a1105a354925f6595b6d53e31d1362cc6c80b69f21236c7f24b4aba2500a8b01acadd436cdf72a93b39490ff00b8bbbf61fb290330f80ba8e1b9f60a6abc6ea0d83720586d15546d32554a5d7e89c79b1e5b26e1c2239f36509b5b4d7e5b6504feab10c330fc45b967841faf5feaaa7051417301bb3f75552b5da0e207073c353e42e4d013b85045cc96fd07c8bdb9820b22d50f034ea8eca3a692a1d9626dcac3fb272cfe6a97651f4dd51e0b4545fc366bee5594d27756991eeb354fda5a36e8c6977ecb0dc49d5cd27965a3dcaacc40b7f0e32856cd948075526175b54733e5cca8b09a6a1b4d2eae51b9b2ebd154d364d59b2c7f09eecfef707c2771ec86a805b2be6d02ee65c2e53e20c57574d6b9e6cd54b00a78f2f5f92959fcc131fd0a75815a10989e2cb75b1583f679f58c135568de81565f0b688a8a0bfd931b8f55ff00c83fa7ff00552b6be38f94e7171f729b0
54b0e6bdcaa8c3fbe0b556c9b4386d2eac6027faaa8aa7c9a0d022dbae5a60c86e139eecdaa9dee89cdcba28a5cec0557411bb342762ab699f4552e81dd3ec84964e7dd326c86e9f5ae76c9cf2ee11c6e95d958a9691b4fa9d5df292476d5bc05c2bd917dc20b0ca2ef73f9b60a0aa6e401da26398fd5a6e9f335ba26cfaecb16acabd194247d4a761f573bb35449fbdd51534548db052381fd10713b713aaac9a18e95b2c8561f8dd349680dc1faaaf61b87aed7511e4c75adfd0a0f2b3942ee4c1ee9c47454f48f9cdfa2861640dcacf967461cb94e08b0ab1401581537269f39ddca56e7610a9ae0a0f6b5c03ceea7823eaaa4b5a796c1c77e1b276274cd765ba63c3ce66ecb1595cf9f21d9bc221cc81bccf60b16a06d661f2530f6d10664d1c9fbad428a29a6f802a6c3037cd36bf44001a0f9ca11fe9e3ffa47db8582ab66788aa5abfe590a2ecc7314384d550c1a48e514cc98668cdd62b2164161d7861249611f555585bab9d9a23e6541d9e90481f55b0e9c718a08a3ac782dfaa387537fc3fb94ca4819b35016dbe7b0d7f328e33f4fb68afadb84cec91b9e7a2a6ab131ca7429a784afe5c6e7fb2738bce676eb0a796d4651d555d3f798b22186d49765b2a7a76d3c618140ec9283c4aed245695927bfa0e012e6a731fb1fbab6a992de42d29ed0f6969ea9ec7d24d94eed4d2988b43da5a7aa968a689f96d7586d1987f164dfc1b2130e5730a6b83c5c271b15da1873d267f6f41c0e731cc59efc26bc7502dd507b5c6c0ac5a8a39b2c8775b2893cd82ba60b0f0151b8be9dccf6540fbddaaa9d923cc156bbbcd049efe83472f26763d3764f682a49a4a2ae73c6f753913d3676febc213aa73332115b756564ed9593f40a9e611bb31d9371ca6a798e4692107c75f4dcc84dc14d6bb952b0fa150cfde2999270c670d74c79f08b9eaa8637b685ac937b2b26795e15acad745a80b94e8c06aa9ac8697479d5475b154fc0abdc5b01038766b3f25fed758b3b914ce73bafa1767aa3307539fd47fed62788cb433b58068a195b3b048cd8af759ace21145e1cc050364f94345dc5475d4f23b2b5daa74848b29a432c85ee5401c6a58d66e50c3217d398a5dcaff2c4c1fac832aa1a58a9630c8c681769ab448e10b7d0a82abb9d4b26e837fd17686939f0b6a63d6df65d9c9dae89d13fa153e80953b2cecc146ee8544ed32f0c62625e22e9c282964a8a7648f3b858860733273c8d41584609242ee6c9bfd93e27353b56aafaf65041941d54f33a790bdde87805789a3ee526e36fd3dbfbe8a1a4a5a73e46d9171276d154d337e31a0429f3fc250cd0c96720d73f61a2c530e7cc44b0ea7aaa1c0aaaaa41cc6e56f5253182368637609fe57dd66f652683313655f8d4507962d4aa9ab
96a9d779f4485ee8de1ed3a8586e2acaaf24da3fee9ceb3ac9cd0f6dca10c6d398055a7f1953565bf0e554f0dfccb3376babaaa9a285b9a57594fda3631b9606dcfd554d7d4d59bcae4edfd15bba0553e2f34360ff305498bd2cc329758fd5492306b752de4797159509e4885b3582ff14e56a0dd4d8e55cba35d65248f90e679ba69e0ff0088fa28dfc0d95ecf84aef539fe646690eee45c8395f80d1029df11f461a8e04a1c48ba0d564139351dfd1a33d15916ab5b85d375f0ede8fb269b8f0655b2bacc5037e0f3d3d218ecbc1c802871caac8bfdbd2838b50902ccd5982ce17302e622e27fec43ffc400391100010302040403050801040300000000010002030411051221311013224132505106142040d12330334261718191a1157280b1c1e1f0ffda0008010201013f01f3e869e5a976589b7549ecc54d47e23837fca6fb214c3c729ff0a4f65e85ba073bfb1f447d93a722e2423fa553ecc4ecb981e1dfe0fd14d04b4eec92b6c7efc90374658c6ee41c0edf32c63a5706305c9587fb3ad60e6571fe3eaaab13828dbcb845bf6586574b2baee1a22f73b64d1adca9aac30d9365e66cab6844a3a85d57503a94e61e1fbc9256c43a93ea9cef0e89cfba72712d399aa0c4e68b47f5050564351e03afa7cb5150cd5f272e11f4587e1d0e1cce91777aa7e791d65fe990976672a89194ccb3028310be8e4da86386e9ec648991868d16555944d91a4762abe89d4725bb1dbeeaa2b037a63dd39c49b94dcce1a221c0a768140c6ca2c54b4a58e4e6e4d42a0c40c9f652efebf2985e14fc45d73a306e7e8a9e9e2a5672e2161c1d2b1a8485cefd155c99cdb8465fd936578eea9a4ced5598e329a4c81525536ae3cc16298736b212d1bf644169b1f8c9005caa8ab2f3666c9a11d4d95346d0d55760ed165cdba6831bae14d21784fbf740aa0a932b7249bfc9619446ba70c3e1eea1e5c2d10c6341c250e3b2e5068ccf5598c323e889412ba717726b02b01c2090b6ed588b1e273758139d132ce59c15ed051f267e7b3677fdfc7553991d91bb22db2ba0995240b047a8dcf0770747ccd972b967559dd17531534e2a230f1f2383c6628330fcca08edd4e59c14f7860b955b399c646aa7c2b3393e99b4e2df0036374ca582a7a9c13e95ad1d0afd96214a2ae99d19fe3f7445b43f0d6cdca8f4ee984bce8a485e05d31a3ba2db941b6e27835dcb3753e491b74750b0d97932643b3be46189b1b1acf456d1465d9ac54f258d9ca386376a9b66e81543b349f0d34f95d64d3982e58ba2c58dd37bbd63bd1dafc3893ef206aa10dba9ed956550422d72a78f2f010b88ba22c8a3a857b1b222caea9a6e7c41ff0020db985920f40998b81272dea5a98e26734a33c758cbb10a910bad754f52254
ffc43c5adbac567745d2d587d53f3d8aa5a8cc2c5660bb2f6963be497f8f86b9979936f16c84ee71d566514a0354afce983a90b00a6f1290e89922907709875d5494e0b333561d272e5e59eff0021815409a9794efcab12c3339e644a0a5927a6c8f54cc9a927c9d96270b9afe63561d339c8756bc5af016274c67ea6aa0a2746733d4370fd130e6d131a0357b58472a303d7e1af6ecf09c6e1461350e17b2e73ad644a935d1729cb238044151d4168ca50932bc3c21aebf7f8555fba5482e3d274283504f8987a88550c6c9a151c2d8bc2986dc0aba6ea106176ca9e980d4a6b407279b3345ed0d489aa4463f2ff00e56c8d444dee8d5c7d97bfb3b052d532461639aacaf650446437524593e0ca84616813df7595bdd3a31d9363b9b2dbe4305acf7aa6ca776e9f45b276c88bbac9e32bb834f070cba954f55149272d182c06450c391b72a790df455ded152d0332976677a0553884b3c864f542f2bad2393594ccfd54d230e8c6af09bae6e6d2cb96fdc8593d55316b058aa8901d07103839d64e7dd342770a48f3c97f4f91c36b4d0ce1ff0097ba97ae30f66c86231b4f2dc98d8c0e629df1c87a166ca8390d557c0f14f995397098109b89b29e10e99d655dedb4510cb4accc7d4ecab71baeaf3f68fd3d069c1a33e81368e43b9534222eea38afa95cb6dd09e3668059493be5e91b22084c7df74f6907304100b657be817bb122e53a3caafc034b8d8282110b2ddfe4b00c505bdca73fedfa7d162f8638fda46a879b35398de87369aa32bb6588b646012316172b2a4653ba686c32752e5c7531e5ecb18ada6c3e731526ae1b9f45cd7d63b3cef47dd63fd53f944dc2ccc2992f2fc08cb33f729ad01665991d559345c6a88b14dea6ea8ddaeb20fb22fba6cb90a7d5128b89e0c63a4395aa9e9843af7f94c231a1503ddaacf5763ebffbff00bfdf76c6d66c14f1c32c9d41490472c79145869a6973314e40d5cb10c664a2a7223d09d027b09374411ba0d256550363de55cf8dbe00a47ba43aa03e18dae2fb052d33c75288f6551ea1071598add307aa2428299d37eca389b10b37e5a8b1ca9a36e43d4dfd5331da3947582d3fdfff007f4a9f11a57e8240b9d09d43c7f6aa24a6683cc78fed63552d9ea4b587a5a81b14f56d344d714c17d4fc5c97a22ca00036fc0e8e3647ab74d6db74edd6ca38e497c0141401bd52ea80b6df39278cf18cd9c9ecf456b716b1ced91696eea0177709f74c9b943552d58b599c628e39580b82f7183d1369a16ecd5b7cf4c2d21e2d17364f8f2f168b9b202da29c74a8df90dd73989efce6e9c2e3e0a377491e4354db3afc0b74ba06c6e810f6a28ad8dd364690a6933683e1cbad96dc291d67dbc86a5b76df83756ab150485ba70721c0fc2747dd4a3
ba60b9518c928f219066691c0141a248c04de97db8395ec8bd6657415d3754e6dc2f767bdba956313ece5f981f2295b91e470a79837a5ca5239970ae8ea38df804d89d26c84259ba60bbf8565b3050753ec3c8aadb6b3d430895a4a734b4d8f022e102ad63c036fb2313da2e4209a328b29080d24a33bb3e60bdf5b6db552bcbcdcaa38ac331f22959cc616aa47e57642ab1a736609a9a74b27b7ba70efc295ba66e123db1b880a2aa696f52a8a90e19420e057750c46575d35b945bc8eae2ca79813a47bf7e0c7f659adba2330d165b6ea0903742a5aa630686e513737286a380f4515397eae4c6066de49887e028ea03ba4a01036598a8fc28b7d13dfd958f00a7ad8e3db55356ba46900aa3767a68ddfa0f25ae17854d1f70a0ab923e976a10998e4dd7643416e0ec8357296aa067eaa4af75ed1b6c993195a1a555c39023a2c38de8e2fdbc96a85e17232665a26b2e9ad2dd8a0f78ee8bddeaa2a6e6ea54b439429204d3cb7053fdb37452c56365878cb4910fd0792eea78cc12b98531a1e5414cccaaa2311ecafc21a9e5296b83859194294872a376a41556066b854df80cfd87936314fa09c7f2b9a46ca1ae7b549506551bb33aca3a7616aac6f2ce8b31574d17562c7688b4cae0d40585bc9a463656963b62aae98d2ca632815182ed908decd6cbdf1e3445e653d4991b49b28e8e22d553072cf4a37eeb0ba624f39dfc79462345ef71f4f882735d1b8b5dbaa291ac3d4a49e073148466d15d0714caa7b519cbd5261cf91d9e61609a034651e5353430d58eb1afaa9704a867e19bff85ee558ddd850a3a93f90a661d52ffca9983cc7c44051e0d18fc475d43490c1e06ffc10ffc4003f1000010203040607060406030101000000010002031121101231510420223041611323324252607140508191b1f0143362a105247282c1e14353d17090ffda0008010100063f02ff00ec92e89f3fe954847e345d8fdc29f47f0bc112611a29986471aaa823d7cb784bd5554c31a0e72d7939a1c39a177608585f1fa7dbe8c284a19ae744d943bd3c9c1173b478a00a92585544b8d7de9789ba3eabab629c94e5bbae39aafb549a1031093c82d8601aa61c660764ee2df45d43dd0bd6b2c7e7c0704d6468575e4175d0e0e32ce9ef0bad409179f9a92145b520881bcafb449a0b8e410744abbc3c10a6e43a42f0a4d07e864c123187dabdf338fdd17431c00ec448ce633f76ccd1a382935a1b6042aa57d1afb8cc8dc60e2800d17a4017018ee6606a7451e1b62b3272e9747bf1f469578b99ebcb9fbaef4518f77571553ee36c4d228cff00af8a9012032d49ea8b0908ea44d220868807b83bbee89344cabd39bbd8a404d4e4b685b3181f61b90da5eee4a1b9c0ba28919cf8ea0010a2c11d513
3afd3c107a0388f07b99ac1c5506d115de61ae28b04ea295841441a11bfbb0db3e7c02201267c380d56a6d851d4a2c759d0a209b1c8b2a611ab1e78fb92eb7152e271dc5029c9555770d0536c253ad1107a1df5c86dbce418daf1279eb029b558a35d496a0d588d0d636391b311cdc13e1bc49ec3748e7ee3bceed3b865b8025441ce6a3390443772d299557af0922d6944da5a7028b4e23780013278057dfdb898b72dc62b1dc84160ab69fe22c20b241b11b412e7cfefe1ee168226de3b917949b20886ba6a6ea9dd0aa95faa333aa220f8ef3a4ba0b19c5c38fb23415346d7c28826c78bae1c94482fed30cbdc25db9a192da79526e39d9352dfb8291a1ddb1bb533521c703c6cc161ad5dd8574a28dacd28025ed3709184befebee16f313dcd549b41ec97b3dd438633aa0820648d11d4eca3452d66a08db43aae63c4dae12214480ec5871cfdc0d12972d5c352bbcd913532ddc4f8b7753c86765536a8d51b00cd368b046d0d1895392c1609bea9b444847710f486b7f2e8e74f87b8209fd02ca2a8b6a2da6ec043654eea9cb5dcdcf7513d2c0562b146c61e69b5b0d6dbc7d10a2344534734d52475b0b1d0de26d709109f06276db97b7c39114a192084d12db02ecdb31455dd0f54289c3927d370ff005dc9c88dc4af497691acd4ec68e20ac51450435c584816418e063b2e33f97f9f6f88c96d4e73cfeffcdb22a7648a2a4a5208c957735cd0927927827069aee1febb96c413a70071dce3a80f02a57911791b029a3ac01b63325370179b49d47b7c379a006b6e3ad8eec38200bd4ba445c4ee1fea7741dc703bdc554eee62d2d70041a1054487e1716d7dbeee0e87b3f0dcca7ec8f3c409eedd07161ad06075a9bcc6cc563b888e680017120012fdbdbe67b0ea39022a0f1f6c10fc58db286c73ce4d134661b0e5e328f49165fd2109bdcf77c82ac2fdca6ce0368813a342a7e8087f2f0a9fa420e84d74278c1ed7198d69c91a7b2c620c8861a8f4f7086f187b38fb1e1b9d8639e277448511e92335b95d135561886739bcab9061de393420e89284d2bac885c51e1cca774756d996b8ba4195851f6410bbcf3970f70b1d7a4c9ed7a7b0f243657614b568b0a299a95b2aa2c947675b3ed2d9bce5d4419732bac7d3258c94899f3b7ab82f7fa053fc3b87aaeb61965a1fa3452c2d4347d28745a40c327232f6535ea9a7607b89cc88fbd11b9e56d372751b308001132aaa5b40b69506a5560a8bb3b2ab545d76415d01488a2106038018b89e01073d9f888be288b65ad6019511bd19ae3936aa4d1761d844e45570cd4c508421463378c0e7ec9d1b2f08913070e1ee31121991faa6c46998395b416cd48aa2ecd93b426208cf75386db90bc6ec10fc43dd18aeaf47604ebc193c938
81761ab8cc2de97468ae84fcdaa474d7aebb4988ffee539cd52aa73b2eb997829c8ddb2bda1b8c161b99c4757c0314e8aeed3b2f7218515c7a377649346db236554b50d110752486d221a68a725d54237732a71e306f20aae738ad832765666ab64380ed15af0c129b5757a3359ea519c5e8db93519ba67352141abb0d2e538d103464a4d9b951a00449b4b5c26d2a58b78141ca638eb0437379e6bdd6e68bdd41c1b3a0f737451e275a302eef7fbd7a6a4d116ccba8aaa1bb4913820d64a50e2060ca4baa0f888884d10c22e79bc79aa95458d9b309c7d029c486f1fdaa43645b46cd761545c1995b47a677252860436f2532666d3a92709a9e2d5d19c46a61ab53ac58cda8d97018628c4886f3ce27dce083223884d851691fd3b5bcc6c691ad8d81ad1327804d89a6c4e89bff5b715b1a334bbc4fa947b2c013e14170d2221a53016f655e790c6f35282dbc7c456dbbe1bbba704d3dd437979eeba1747a29207189eea041911c426c38ce9469e27bdbac6c68152a6d98404467cada54ac6c993209916056230cc12b6b4380e39cca943e8607f4b668fe27498914653a6a5d85d644cf80537ba7b8210891a6d69c1aa5d10462429968c45a471e0a47b4da1dd92f70bf2a338952fcb85e007ebeedbb10ba2c1c2ecf04d2c8ad04d2eb8c8cf5e5601c1506a4d910adafdd600346255225df822c2f2e5b7323923d18bace13d4bcea008b21ecc3faee83089c366d39616104514685e175b2eec4faee49719018928b3460224a85cec3e1fbabf11e5eee7ef022146201ceabac810dedc9bb28749062b5dc43644227a4b9fd6394d5e1a443f8992a46867fb94ef897aacd3a27396a85228e8f1361d3a1cd173a80273f352cf56e8a431fbeb358c6973dd40020ed29fb5e062eacbd8ef59ae8a28f477036699e2e8c7d753497370be6dbc316d5070e3ad7e2bc31b9945ba34e23c8ed60029c576cf81bd9f7d42f8fd75c3b34d86f792d161d4b83e3aeed39edda71bacf4b62bfbf076c1b20c67fe4bf6227a26bd8439aea822c7436ba7a544126b72e7a9244b5e4b3c0e336a33870b96284f4690e327ff00a46701c227013a7cd194101dc0ce8bf36e0c9945d64473f8ed19fbf217c7ebaf3cac3afd1c065f3c792eb34801d935b345edeba18c6588b74097fd76e9cf761d19b58c85103e18ee44aabb0d90e09f1013462457988f38b9daad7f909a3c248d72dcd08917477b219c1c4535d909957bdc1a13213057bceccd982706894389b6db3f04e775fa39c336da3f85c175e776a291c395ae6eb9e5e428ada5d0e9f3d785ea8c38ac6c584ea39a57553e81f564f87253d5d1defc1b370f96a6863bd75d63349d16218515981080d334126278a13b1f8230b4081f8507fe4799bbe09cf7b8b9ee332
4f1b7d75de11197904b67473709ebb1d91b1de266d0b25680b458a7b0d749de982a616138051624333830fab67a6e1a503ac538675f20c2764e1ab3b403da6a927b723253b5b63745d35ae8905b46c56f69aaf7e289fd221ba69da1e840e8fa39ed38f69dff009ba9711ae0f90410644714c8b295e1396a4adbc15fe90097778a7bfc467ab2ef5999533babbc0a96b34f90ae7186658fc7549870cbe59293848e46c70d7ed6f0141e3c90f807bf56fafdfd355b728ba6bb76334565deb01ced050f6091c14c6afcfc8501ff00aa584f565958e09a6d27d85b0a18bce720628e92271c94ba31f257c7e59fd949a27633c8709f3bc4b6a79f1d4e46c69d40a2339fb046d31c267b0db74804776769bde1f21c4824f64cc57efeceadd7fcd00dc0236b4a61f18ddf4505b79c5031897bb9608f424c37f09e09d0a2b64e0a4153b42219dba4c475366ed979c9d2e34f21b66fb90dd47656068c4ae93b6de5a8e36dd3814d883ba5039ee9919cdeb636d1f4b61e900541ba5068f8afc3c5373478bdec8a0e69983c422e7b835a3894347d1eba2c3ef788abc512a59791213dd8915f54082889cc1e051bbd936bbd751f0dc27440654d5b90597b33c02da8c01e415f22fc3f136c00a812c2e0fa5b0e1f17c45202aa4ea2bba369ae0cf09aafe63497446e5c14a53b25c1aa66a7c89120388aed342054d578293705218a60cd4ec958f66a43830c4def3209b0983d4e7616b9b307828b07ba2a3d2c84dbd3d2200b8f16173c86b4624a3d1fe443a37ff519597620973526ba615e5fa8a2d9faf9159159da6e69b121baf31d81574a2a8aa9ba9253f8ea36627721976a197804ec11b458861bfeab6f4584e7e6090ae3dc2141f032d98aa93e815d18213c72539ed9f23bb45762e379bfe5514acaa2ec4298ab751a7929656e8ce7761fd59394ed7c6886eb1a264a8d1cf7cd3705c1735437ca9b8ccf91c1064460426c4719c41b2e90b688c8d85aee1661319a6bb9a9ad814cd758fe91d905260b83926e8ffc498e89768d8ccc7e2af35f1221f0862b92e87470690c1c7d771b46a8b45397159372f25446feb52fdf50dd59b8e3682077ace88bb63777218311f9354e3bc68ecc855c85c877dfe27d56950439cfa875e74a64913e1ebe4b26b22ee2a454c545925246d863f55b27f654da6635a64c975101f1078e526aebe27f6314a1b036dd22eb8baf86b8ccce54f5fbf4f25fa388b24a6da1522a7a81bc1b6b69eaa7066cf8a01ec556382ecb95dd1f467c477241d18b3456658b95e883a7899c4aabad000e48ea3cb9ce21d0da5b7b80e5f19f92e2b692067ab2e164a6aae47bcd3c757661b8aeb223618f9a9bc18c7f52bb098d863f48dc34df73afc16ba4e3d9a914f9792e2b7f4e1b9c5779764fc
d7e505b2c68f86a0dc163990c746c01ae69a918d6b9cfc970364baf1bb413df60b04662cc75e3c8ce4d683b2070fdfef2f25b2230c9ed37815062b800e7344e584f972d4c161648ee0ea8d4d32e96117bfe312187d7c9913467340768c40986c8169c3e38a3bb98d5c358ad2cb8106f4aad970f2668f11d1190a0bfab885e642e9e72ce48ea62b1b70582c3752b22458864c602e71e4a2b9a5ee697120c4ed1f5e7e4d9457dfd220c9aea1c3819ceb391f66d3223dbd236e5dbb9ce9fe7c9c6386078730c333e1cd322c337a1c468734e63758eed9a3437de870bb52f17dfd4f93e2683167d293d235ee7622405dfdbee5ec9b37ba678d93c07dfde29cf799b9c664f93d91a0bcc388c330e0ba585b2f147c338b4fb151c0c6346b653af3f98f9a7c77e2ef289890aad7483d9989cff00d7c532342f08bc3c2729fdfefbf2f88f6c360c5ce320a5a1b1b1bf5be63f64e8d19d7a23b1329794dba468eebaf1c38386454283a6cd91e774c5ee1e672dd973886b45493c139bd317913ec34fc933f0d09cc635e092e226f1c47245913498d118716b9e48f2bb4c0d222439700ea6787c4fcd4a2c2644781473693a77bfd4937f978d7e931490ceab6ba585fd6cff00cfbaafce70c6a587eeaa4de91c66409890f5f4fb92126c57b4b419b4607228f47a34368952f126aa70ee4096176b4e73a7ec8f49a4c4208ba5ad37411e8117389738d493c7ff00db2fffc4002d10010002020103030304030101010100000100112131411051617181912060a13050b1f040c1d1e1f17090ffda0008010100013f21ff00f64cac0652d3547fb73eb2c312b8ddf9969077e07e65426e9a47e2b7b8ac103829a2adaf722f4ef615e3edca4a19a76858a7405f4bccbd0b98a7304285c2e3a9666fd6670f272fc7f9c8005ae009be7c6ab9a8f0b96ab1fca2c07ac6b5d5e738eddafda0c1ea080efa8dd33a282b0967e3f74d89051d9bbf88bd08bcedf980e5ccb4635a550229d2ad8b05d0bb58972ce50857f94cb3775d1283a61d5ea46fb0ebdffeb13685001536cc1c42201ef2e347c4bc0ae32ae6969f202e956abba0339483684d1873e1edfb801dce6dd10416fb68f48c1dd2d89ccbb6012aaefc45b5316e59cc4207a06662889532abc4cb42351c71fe39d69d05b3d74155c3528ca068ed3889e285cc14108b530699611456245720d597ec7c10512f36ce1c14bdb4e3b998137710b901f0ef3fb6aed41687308eccc1b96c044a7c8fac7a91c658f95cb369df4351aa97084a8152e4c53b9cd530a4661dff008ae2382b173e3fbcc039c00e3cb071880db00e3ab64c50298cae0c758c74d8318acf05d35567672e4cca328b24c02dc0f57e5e7f6a75d2e96aaf58c054a8e9a4e190ab47d65497c1a9e695730
5e7ae9408afa8c36c45d1926bfc34402176bb7aeab86121e206012981516c1322232984a7a19e69848c5c243f12c6f3158139c969dc2e1b3868ed7fcea8fda2e5fb4442c94ef8319f5fa2fa175c2e9593125d4111d0a6d9835d75d928878623544333c32b6eff006ff8235638137b777a3c358bef3174d8c4ad5862f6a9e954670971084a6970dacc11a3de5cf789763bc66e692160ee499126a02f42adbc8b79f6edfb36d216fb430000b8adbf42c1874112dd2d8a53a1d1542a6ea711d34c49032b896ef84af2835a8f4615cd6a2aad92a42d69861f207f5c3312d3fc87d9800ccbb8ceb9e79570768418eaae6d4762dc63d418b7e65371894888a84adbd226f2c04a64a1982f1318417ad8d9870e430e3bce13cae9ff004d7e79fd90d9daf83cca13399f9ff9f41b6003ad30d9992012df294f1301886f105b488287d0f4cd0dc6aa1b2e024ab464b947357d60a01f93f58bb10ba05fefaca605dd669bfc4aa2743a573c333a8a0f674b2fb31de7a5830935732994e88ac2c5895513a621a85c346eaf60ebde5ddb6d5d0693f637e541b19feb1d0a3ad1186d0b8b28613df335a42595bed3279979059dc7d379df4c7137c54b1bdc432e19e4365e6a769094c153d98cfd476cea02d58f6cd42857899e7bff006ea26a32a6a5c49a3c2303f945598b6cf5babcf4dd3310bca3d598f9318f745608c598244e26909c4fc0672638fd887900e5588ea8225afd2423751afe61a44a989c3da5b85388e10a2ee98b79fa6a62c2e5441e06ae2f26f172e9acd26d2941e3febfbe3f506ef3b89e064cf9cd7c43945505e22c6a51d1013cd196e5dfd0188073295160a87926a295b538e23e3fbcc22999d09b1568a482cd3dd5d3e4be1dfec2c03bd46da00964f54af79604e2455a2259e01d0f59b51ca06e149758fd37b4c3198a66c364bb3de5607b0fe9e51405c2c0a1e2ee52a881b469ed389429d2a26b0635689f5599702e67a6435d2b32e1a138e605fca7cf4bfd00574ca6d73eff00d63f61f2a0dae29186a3aebad20f72cf339152d2b3da3c86f3721d52c1fd268b12f431e913a4fb547f3fa5cf6d925d06ff00acaa76864ed1e5d4917605ceea618292abb1d93ed15298e3e8b767a379914b504c25a9954188014cabeb97bcfbd563863bb743c1b1f729fd81866103943d573c553162343a51ba2666b0c11a527307ac199a56935a2fd7b97d12ec48d4388a16230ea3c5e4bfd2c7397b3f1cff001f89921f6907885456a4211b8ed53baf54ec49e21baa621b0c06aefd3b448436937de667b633d52a70e33e81030931264f110389604332c223e65aaeb1d184556106dac5673e717ace3fcf16c16ab457040d1360f47aa8680b3781eb2b88392025c2748a194feb5cc5dcb8b6c6a130cea25
b0a6998a6715652a3e70133c688883b31fa3562e198152823a8a42e4eff00c4405b993098d021317307226bbc3b14b2e81de5b422456eb7033627989afc252c6351b3686308cc84d63d0aa94a665bb238428b96a86bd5625302f05a6d63c8fc7f9e36ca780fc1fdefb9be9869728a22e128ba864972c00f6e233c3536579ed0c0384760ccd422077fac4cc33030c29822d02ad3182b3c740be12e7aa59982e664a8e77dfe8a12f46bcff6ff00af4acd3d007b99632d651080ac3898238f78c0f62e3b2e7a7fee412b95f499c480a3191c10f2673115426b52d74b2afa64da800c6a70452ce1a3da5ade300bc7a3fcfa0218a362bf82a1d5c6a6666c8c20f31a9bd4b41770d44be25a1909550f89660cf479fd5eaa82228b7062559b282f2d79e8d84d98c69334782629f96e6ff46d61ca853b4642b963765730cb9359be3b6d6f98f5283fe885871aef090b5f98a4ea52be65706652c75cca86ae3bcad834dc48352ef64144dca23442ed947975effe7bace5076d3d12e23d639f966f130333632ea664d32e09bcb4e205c69899b9a86d3e9230dcba149ccc7857762f3f062024be798f4f291499a953467f4ddff48cb7569763f01aad74cac16134a6fa77f557410701953743eb05b3158416cdf63c6c8ee485cba4487b4a91292514ff00711161e06a0b13b4572ed8a0386b85fe5ff3c8980441cd30fe3f12a36398c3a45562d937a85e07b3ea4fb17a1259e696bb63051a67312ff48220bb23e3f4df023e63605fa7f719979d154318fac01431f5ab88aa77f5ed02d453b8c49e53d197b2aa72630e995f8982512e05ca2a38182f166bd2ff00cf60ec467077f6ff00b1a580b03848102109425046ce90812fe5e7ab1fa01154d8a8c70457b9b74da2286ae59706bf3fc757a465d855ed0376aaee7c5c1a8aac64cd7fdb9521f7549edffb111be9bcff00d25c1d612d5d5efbeff8ed2c80b5e038acf78a1caaca2e2bde2606bfc98b3f1d38e3ade2b10f240b30963f1f52c01044ac8b6cc57bd198b001885aca862398a34cb91e73b1291b7ec20e50d6326b87c7fe4a8421d2b17d0084ca5e2cb43a31254e4ac12c651edd20522744bccbdd4add74876cd567314b9c3fe66b99eec357f9a94282a2cfb51454b5c557bc5ccec8a8ee12f4d47dd207ce898d51d6d87b459286c423d0c399733136859a8431c74c287abfa13dcbc5f995a97710440a82024abab76089a562b4ce1d6c3e7f61b30635cd3a643756ca81052aa2da68425a1387404bb7e83a83c0bb562591e3da29743d25db2921496c30d63050b66bebdcc53e423691859770b89176e1b98c57c421215cab7e10d97ea314d4f11da6e2c7c802d4bf148d20778c369c9315387d6070365fc12ef46bbe8e442452e2675472c67ac4d38
863e8604bfa1d28eb38ab8c67579abcfec4a71b6616d7f3ff00a74860ed2e2464a8583d14cc23b82a779ac750b6889e74e181193dad4bca017d15312ac765578e906a36cf1a986153729186c5454603965f81e89842796369b31027842ba8bfc481ee3e6f825541b8009e750d6626abb81db02e01170d9c46e66b5df09ba8b12500452c225952aba7e9a79a5fcc463ea8f4c44d517aae5f40e86e96e1079cf7fe37dbf63a283b1d0ecc175cbb579d873e2572b6c9908324311dcaec6505a82742d9c9254e70f445f44588eb799903310a1a483483988eab7328038b8080afa77e4b16ba93af647a0664d1161e41332a8e3885c1784ea70c1e08aade835a6ac94f77d633f2584aff002995f0b4b16a7a4bcc4640a778b61270b0524693998df2a20c4e36598b52ee5e9a88a881fe90a69321dc14840952a540e99b8c6a95f9aed0dd04b44d7e57f642a4a0c859e381bdc751d99a97a370d53995082584cb05c3a53dd9a3912843412235094112cb71c33131a2aa5325733647bc0f7d71510cb0e63ebeb2c596f05a2610d197626f402571006a7e3e6333b337329961c35300adb4dacb5e8e3e13d32aa563d120879eb298fa3a854f9f4fe2525f0f6085233a48e6d767be451b5cfa4a1d42e2d6a276cc499654e8d8426375702d91979cfd4b0f71ee3ad7fec6bdb7791f1f1fb30a20f5894e33cff59ccb72d998003da57623a665cda9918345c77e05d5640041e65c38cc0818c9001d4d11722b88aee0c0a29e358b043d2a14abe7989453950edbec4ab0a8b60e4567e6e163b2be54b888d92db2b9b4bd04d1a8ed90ed62f90f8d278e55bcb52ef2c14980e90ed2efa22008fd32f3da5c2767d3a18752a6766d01bd41cc33170188fd4536df573b8e43a71300938038ae3f6746ceb1291898708986377e1c45e52957115c2442917c928252ab953d272fcc140fb25d86622238c9d32815d1cca787cc11d31603a80b563e56cd77ef7887eadc3fcd1ad5c960083121a67eb32cdb2aaa29793e6505ce551117dbc916ca9d983a5f514dcbd4e523b3a6445036b89615bb87c4b46c7a4b17a7d13b2835288d1958ea7201173e057f032c3821c5e1b012ce33e3f6a76ceb129196d768051d9ad76f89a417d19725ab98b1bab4d02665127638945e778b6e2e3909731306c70ec740f71b313a8f6cb943f05314003cdafccb111ce9f0313b4e985baf785a47f45e65d73e0e0f695be95a758145657cafacff00ef29c8113c74e7276479843dfc8986e5332660095c7328d219bb88bbb95e8637379515a8f9eefdb4e501932d725f60d6a211207a654efda0c27981d302a8165991048d5ced032cc6a3cf4a241644d90cd5d4a56de12c08078cdef6576de5983bcd54b3ee12b66c597cbea2643395855bef7
33e09552fe9788ee31102a7f0408306a58111491d8e63a054566857d1a984e81c182e6f0a21ad1404626d82df1c1c9ff119aa72b5e0ec7ee1488cb01a946ef8a3d88d5a861557eadc4f8ba57ba9fc4b93af143fd2cf53c966821d17be1cca6fb208b5176f283c0bc04433567fb534e8626021f82280b9b6276ea1b6a56b1d2e57317dff000eae88416e0399e3083f97d481939258cad6e4c1eac6471a5a2722592d3e22dc116b47d2082024df89be6d5d5abb9a441302e5c18e15da586e2e27e5df34777c4d6252b6739c994c62abccc680aab81e6bbef3fbd7f6bca1c7573112c7a50fc66b80a3fefa2c1c6baab246fad0d728bcd0dbf3fc431d4c5a954ebe0db3e3a1fc11d2e7da0716b2b13a152a21b1ca8aa55b5e7a932d32be4d057c1c7b43174d680a6f9ce799547b72b50f48bada30951e7873c4c90dc8d05f6f4f33005c41862b7bfccb927792e541cf80f83f7cba71a182b940a3eabee59c351bae421d0e54b8d9bdbd2f6f6ae07758fb50b821f5b23ec642341e9d6d67097dddfe652b53c531300b75970749d4c041aaa1e9da5b063369f98d3eed16bf4330de37f61163b84f5bff007d16a0dfd1852e953baeb15ef1295118628e70e86a501eead1002b5bab3de98b50d182af351dbb9f3d1212d2272ae13d35f1d036820d68e63b3d7ccb4e9776ff00532ff4a8e3ec27ab0033b267f83f3f5f5cd27f0a8f091164f24ee5e92eee108a0b1d3b0eef7453b098753c3015ce7d2ce8c27b2f6767b9e2174c64f5ee6be606b2d353fe07e659590f6aeb5a7184c1afa582c9626a333b54fd8358822d52ebb1cbbfcc3a378ccdc68e7a78e060d90ab97ffb32a5cf42b4f31b314bef864fe65c1c9644897068402d5e22f9f7c8dbeedcbfa6fa7815943f27d0b52ee34dde19dafb044354c8a6b179cf44b20a954e27775b71c0c06591c3386efcbd6612754b9757bf469a9f601d939256edaffe0260cb6ec7db1a7d37f4b3a635543acaa11cd1a7ec172c8b06c653c89d9c75a278bf5dbeee7788530657e1531455b5f565b9311e1e834dc26da18e8a69ec92e9917f47597ae39bd92c2ba31299d92a3567e7ec20f88d46d4e0f8da7b4be3a2d116f0e95b42461e10e9ebd5f5b7f39977d1590d4c15688b2dca97d6a5a53f4dc1d8c2da2b32e4bbe98a4748982aeb9fb081db477629bc79398aba1d405cb6072631a358e07fde9e3f75f46622bf4ba0a6099cafd1ce988cba5eb5777608ac462cf4b894b00ec18cbdbec2624ba08b60e1c1cd332dd2a5665bf3512ca753b03764b3b0669d0cc717517b309cb8813eb1961c448916610bbd7404baa664e9ed2f8afc45c2fbab306d5f731d5ea623cfd87a0e582bb3f2bfa28ade07a7aeca96be3a754c6bd54f07aaf4ea58cba5af4b92de2668d
4f4aa0da7132e7894431b1e3bc30d4c3a9a7e21e2a69597b57739b3269fb0f72193e0ee8f0fd0201f0ea3379c9de5d2ae87bb814ced0d0faebfe42843193d20e961b6043a5867f0004e747c551888db2e1ae7e7f9854d12af4a710a2126a3c19a1e5639510ff0507094b07b7d87e67b36c357efcca9b0f2a0c41caf1eaec964d4b9cc583921bbd7f3cab79fcf007a03049cf4d8be2cb1286e5b31102c78e04c0d4c1a8090b857114e6dca562502ee3bf0f0e7096302225ab412cbd9c83def4976bc43d1e289fd0f2fd883706ab71630bf274e66381530270aca1d12e59e7a449568fcc276344fa4efd4af8bd5bfb6783d4626fdbb691952b7c5ebd03842c2d7e3d76a470d4f04a62e86bc4c87f258d2efa21c3ad2d25684f7d7c2106044355443b79e9dd88decdbf621ab3cc395e6b19c573c7acc01aa653f74ba98aa486d9145dd55cbe871153395ed086f251b75a8a194d12cd795de63d41f06a57328fc27bad4454457906d671a7a15d1b55011995f17befd408f5c7459390731c53d8212426d1dfb7d8ae197594b1ee43604b099f326a557c40b5e98179f996cd28268b5349bedc17f7d40d9675d5d0b1ce0ff0070f84c1a9e18c1c060acb396fbad0ec91a0346b0be917fa318fabccc34189c0d31f50819a0c96e3742b887ae4d2763bc552adaf2fd8d87cb586dac1f60fcc4b153149994ab95188be31494bf88bb08dc7c26a335bc2645be85a8056e5b4702fbd40110c0b6f22774043c1c743a980e1e97006372c04bf594b01ed0ed5c215a0be704bb8f27d8e819d68a461287384c9ff94fbc28746aa2a0e353c45d0bae6e18abbaa536abcc5cfd0a004f64abf94d8de5d4cdfc490e522d435497654ec3fdcc198b143f9956e4961e0f57b24bda38b2dd16ca3b399ccdf9bfe4c1fd9502e2af25ce68d7698070c05dd83e2119254b15866f9ca5ea9b3cdca2e2cdc00c138f13702d44d530dd6c8a254b19812202662bb877bd59ff00239f7cce652a566109e81b5c7d96f5074a94f931eded306dcdf91c22f68aae34d36e65d6ee094b64a7bbb7e3a2db712963dfda0f277882653a37d15c43bb013ce143dddfb4696eef80f983fd1e4124c7e6602615f48c6cd786aaf060bf657f238f61ff0072a16c02b231f7d1214ccbea6e3c91156731330f6815ef2e6255a03615a1cc3de8bca03163798d3e2331a70afa41ee4d50b8e1d17000797f6101979a344b8d33d454950097bd2a8aac9c517195c72fd95a28a339b4cff07e630e66233d050672e46e0a97183b41943d9895b58fa234302cd72774c4587b00ca56f9058f880fb01444799b2621816fccce6a2762609913b8ab01a0ec36aeebdfecb0da19bed7bf8e7fb98c4c32611e9c976c186ee0db312c070ed1735f28172fac3167bb987
b7788ede814c42336ae8d0c13187bc393c41426ae6283880b0bb76981b022ae069e4afb2ea5451178b37e2b6bdae3a6a2d6a170801a85b54b9514e7ea7dcdb88fcd71c531c208947f128040592a52154b95077185cb93390d12b576799973c687d974b0ba5748d91151de333e56d258948a370f8c34250d746f249980af48cb71f49d078e9bbad743ada008977302c8b86319a677331532e70c4db90a5f2ed77e6f5afb32a9040ac2e9bb17c76db70fc6e88107e49ff5103855d710db07bc5917d0aed2ac52085ee5f5c0b0b3a831675d2e725447c6353f2e1a858f4400d5bc0679de2ebecccf68faf92cac171aca16c6efc74469899619aa47d65d03c12ce2bcc991db47810a5102e276d11da574d46d338b98fee85d02dd78885de7b25c2ff2fb35eef17734a64ad61719bc72cd11bb89b046cb9f58d830219b26dd4d52006a8866074a9529edd0b2e5810bf333433d858c691abca9bafb3aa0522ba34d0b2e906acbc965d982e02a5859b974349dac1fa0415a705e9ad4ca2317b4aba829a9644b49620779562250de699e1a3e3ecf1b45908a2baf3805781d401181b8953904278989888e997ef185b84aed1e5a45350a3538706c102c048082cef9dd6f03c5d525c73bbe5cbf67f630113fbda16d256bff00687879f5b07a742146599cac03dba298741bc50e662726e70d965330dee7713002ec56392f0699854abd70017acd1d8f43ed1657261c11f40d0da9a144672a18a3ef11695f14e85ece855137082eaf5f5b544020e36cb861c500e3b0b335b3238aabc782615a8a30783ed3d0e4abd40727f772e9d5081d8c9fc2b9c1a08f4911d136fa2fa09f8a9d00dab28e176029c1d37c3ac6e0778c1f0b4e5dc789b6b818f65fb5f191f4ac0503a25a538cbbcf2892d58c337947c3da5ac8bf570c2d9acd60bf129d4eeafc0b793dabd9e682442f03c3468f51f9a712e86c5696a93bb5f02e1ecb3aa429b4bb25358c8a67c4d0da874b6f158ae3def894d45370ab719168ce5daad942f7c204a0771ff8a9dabbbfff006cbfffda000c03010002000300000010f3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cffc0ae93edbcf3cf3cf3cf3cf3cf3cf37e3df3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cee10d0fd2b6b23bf3cf3cf3cf3cf3cf34030438e79f3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3b66af9e1d8d847d9f3cf3cf3cf3cf3c8153d2af8280ff00cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3d611c95047440e1e2f6f3cf3cf3ce464b0683f41b8765cf3cf3cf3cf3cf3cf3cf3cf3cf3dbb68e945009cf48456696f3cf3cf386ec92789777447cb2057cf3cf3cf3cf3cf3cf3cf3cf165b3fd0852c15eac9688
cbd3cf3cf3b76c40bf3ff00bdc4238d37cf3cf3cf3cf3cf3cf3cf3cf0791a4a1ead9fedbd20316e7bcf3cf369c010fb7a19f71ea0d4dff3cf3cf3cf3cf3cf3cf3cf03f437334887ff00ada206f8a47cf3cf356e516ccedcaf8362fa1b0f3cf3cf3cf3cf3cf3cf3cf25c61f38cb6fdfc930d49d8d3cf3cf1bf829ba33fca4eb434a924dbcf3cf3cf3cf3cf3cf3cf28f50ad82577ee400fa52d373fdfbcba203a8eff00fed22bd98fa0a7bcf3cf3cf3cf3cf3cf3cf3c761dbab6bc7ee356fd06e56c176ba83d02eaff18eb10d7c23bff3cf3cf3cf3cf3cf3cf3cf3cbde2b8fbe3e0b3300f0ffa8e42fb1ce62e142d5dbb2dfdb127df3cf3cf3cf3cf3cf3cf3cf3cf3ed25581d3b14f73414b801d7cfb45f9dbcd2698a76dd7a77cf3cf3cf3cf3cf3cf3cf3cf3cf2e8ce99cafaaec0daa187b454ef9ee09cf8f64ca3d837fef3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf1df9fb7f55d1c87b17cafcef7c2dcd74be441a9b5f3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3c5de36a8422976b38f0667900c557ce7bcf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf88e8b1e8a9b250121de9c415b4f3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3c632567f85716b00bfbbe81cb0b1f3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3c717a7b2ae83407debaae5aaa729f3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf212fbc0f6e7840823b3b9adff008f3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3dbdff004f0ed08149435ca605ccf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf2ddf671de301701811e2fe555df3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3bc63482c07f8d57e681438b77f3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3ca07cc0542510c9dd2d36df3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3ce80fd6b8d82b67192257ef3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf4ed7a5af2a2fb67e883ef3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf794ea5ed07d17dec1cb5f3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf90cab769713d68b57735f3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf095d43215041934ec09cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3d6f3f8eeddbd0aea47cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf26dc3092db8bd67fcf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3f10f43fd957bcf3cf3cf3cf3cf3cf3cf3cf3cf3
cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cf3cffc4002b1101000201020503040301010100000000010011213141105161719181a1b14050d1f020c1e130f180ffda0008010301013f10fbf0b6a68e5c76441f509d123b45406d7fdf2e822b42f526993d9fa9505b1df963164328801c2ccc693650f175ff00a5c9e5d0e73058bdfcc66db30a3306ba0d1226fa96b79dfc5d8e98a6ecaaee98f0dfd17e985b8633a428250510af3391164148a759710610bff95ba5f36c7e7e3bc52f6b30c25560ccdb5a2be5311e9344c31613f31a3d7973efafd198ade3b6b809940415c10944a98a6e2aa8d64d7f9b87a096a29dcbb7bfe23330c621b6a68e803829693cc289ae904ec0d1e675ea794ceb6fd164f7962dc104b38223980688b2f80bcc448162554cc753f96919aff73fe6de653b65f4262178d00eaa6f4c1a92bd4da81170f432465cb750db71fbb7d0e07947b129817354cac83978eb1c520dd7856bfc929e44edbf10a0005c6884331e06c8eb51c364621aca4dcb93e89a79d3c7d0a557821506e4511b61a3f812e2254bc4195dd3f8a6c03ddfd2320649b385442e1e0e2ab4c1b84b50bab899225841865c7d0f73e2f5aebf40ea22eac835162998af8234e1a4d2164c3a954a66f16bfc4eae61f112db95c71ba40c334482370408752c250d41b22434c29361a7f334f259d5afa0efa94620ed23b30929844d2135897344c2704d11c45ccb59fe2807a58bf1fdcd1a5ce758ae9341081ab2329acb1ae4bcd4680462259ea3223e201341ff00be93acb8c1691aa2b544fe0cbae1ad90d66779c3076bb1ac00007f4d0ca3f3d5f827bc4fbb8efb1fef48c3647168d9a561cdf503d6a28a61b853cda384299ca086f10b83611b635660e020f687b7151cf88001b7d066f6781aceb06ce0f033085c39a58cbb11b1fca2bc1ab0aabbebee2fdf6ef072f2105eabab5eb17a3a057bb98dc018d875b778956c25303ba9cd22868b5015f788d53c943b0a7de28f173d1df99d6640e0661cba8644238b62342329ebbaf7dbdfe3e8705bcc1a6592c8e550219957c4f184b4bace91293a732f2d0f78788a3a8fbc45543260dd6a65d9ccc3cb9f69449344b1e86081ea776133f185d9e78d652b17aad7a3550cb8392f67a06fe61d7179202737c7fc8b419bb
8773a3ecf78883881835c105c4198608c032b5e5046e657aff9feeff4597c90a19926ac8870c67646c625b744096c86a9cde47bc2e869a861deb2bde2342fa248cc80ce0f42f35ef3dc61a7994aa55bcae3dc87801beaf7b0807a610364397132837e2202ac8c4e95b7789a889160d9e9d1dbf137dc61e6b23e236b8a6ef8788a8c93c0615afee7a41358d5d8e87e757a69f49d0d15758830c12b982674410a5e77af23d6553b85748052ec6e214cb01034256cb2f216742f1eb97b4cb85f34bf1ef1f22aeabbff004476eea11a347120a95234c1cde81fbd6377560694bcac5af5863e9a429dd4e8e9e1bf31e663441921b9518d41e4573fe39fc4a41ddddeff004cb5e8c741985da51a91ba108cf7fd36fcc5035813b4ace9817cf95f588aaaaf57e2e0e11595dfb5c1b844351c1416e93281ea18fdec41e96b46047c001df2fef4828d910a6d6cef45fbc14355ea327bcbc9526194c097948c83a73dbce9131adc9a1df9fc77808141f5953edfd2e01361994fdcc9e9191bd4617a6df88afa8e62845e80f2caf82e51c8fdd7943cfae9ed57c149a18789410034744efb311a06ba377df6af9811aa806e969eb9dab7b8b6ff0047ac5ecfd73f370050a3eb992d81fa76e38c85816bb1158f23affb2c20b8a8801745f047e96b2b0768047d0b21b5a9d47acbc2a73b2bf3ed3382995e6cef9579952e2c4afcd09e3ff7ec36d7f965f23159c7e72c4d1282798c70a7efa243c26f08e6685e11ece23c0c6c858fe3d62ce0580e5cefaff05459b4085c5402f6329061102d63e70fd86c3e0dd753fc5836591eda43f98f0ca6a5e48ae10c29b9b5c0a011d8cd1b8350e15c344066b97a450bb663bea95284e83e3ec28235483d9c3eccb86e3a2975163c15f51dbd49ab2101f3c33088710b28aca4c70858b9aa658daddda5cabc594781d7d6a72981cc4d939ca1358af7afb115ce529ee61f73804da00d5e49cfac30aacc3a9ad7b42325b30ab300a085cc4402b02321b196028db36758acea83c287e4577acff510d64ffe7d899769e0703b5d3dd61ba596f5ceced12cb0b21cdcbfa9927382cc6d033b92d46f50376191bea2fb5f089accac2e2d2aba567da229873d1da9e8c6afac1775db4f78e73e67758bde0d7d3fdaaecfd8ae4d4f261ef8c9d427712d66d6e74998bd0ecede650528a8d0c5a83296dc06ee05bd57f1fdc146c95ee01eae3fb8e87555b97b72ed2cad6df9679c0e96c204711be839e97fdbb13702f63ec6e06c56da3b9752fd7b32ad24b7eb2e194085de4b9f10b9deb1f94ccf778e0ff7d26396287c273eb0214dc0a6ba0e57db9c2f68003a1823652c58997504b77fd7dbe7a4718dcbf75f5fb2504b209b4aea761dbb3af4872ebe0a9a656c42b87223513068f2e9da055
b640b012ce84a7baf37fad59dc58c0f1abed2c2539183c10533afd9a61b25d1a39ea1dff33b5d587be904a04e9113d2b1718e71baba3c4aa008e47fe10c683a50fb46eb5d5b9643ecc1d0953316a6a19d9489297e66ae7963de596c21c9326780efecc2a18b51ee3b95c52908c208b135becdbae016124a22ee28a9519a92f57d9c5564cb70658ca328d115510654cf0db7da2c6748225902e900c18952a2185210542de7ed3a4456b8966f3ab11de3b117b13507ff00843fffc4002b1101000201020405050101010100000000010011213141105161718191a1b1d1204050c1f0e13080f1ffda0008010201013f10fcf5f3be869ddd0f188616ce6bae9463be7a6b1cc93a03dc76e9f107b77f87dbfec5b836e2cf91b6de37b41460347238c1af36a91da8b9fe9d1ee7fdf5554c001e24d4c3f737d44006eb065cb34ed8dc564e4639a92a141a000f22367119b58886b65b24ecd17941fef38832dbc4efe3a7cffd2f8b3ca7253d623b5b65d33032526e441f10d7cfdf5e99bb26bb861feed7f6db00655c0eed3aec52be0ca816e332f6cb58d87305a690d5b604a152bd9302872d94e8a2532805853e32c5ce65faee7f72ff00958a5f36c7cfb47c96b31065259ba89352749cc6c09b91117a39ba3d7dfbebf677dac56e5d5d6cbc977a0de74824e3cb9baaf37ff9a450d65a45c9883e408bca5bca0240e595cae079d21072b93f0e8f9ed1c1d2611d47eb449412fd57a980e58c0104b5c0a31c063454148a5a21d4b3a3ccf9f7d79fd90b4872cd8f1dd706aef543156a2a0e18f9565c1184ef812a5980288c301d48845cc1e3431c395c5cfe46ff00333defeb75a1f5654b65b4260dbc1f5a4e906f306b2bd4de9316094999a9dba9c9dcfb1a694ab7b183e7c586eccc15432c8c59a732557029d6134e0e4210644288aa85286b3542abe81a7c766224293e9c1eac3e65222d482cc68d0f8c591d6a28ec9467594aa5cfc4269e7a797d8f2ba0f23c3da2ca0cbd8728e624c359d1113e850cb0aa38839a03784911c1b1f1d7d6ff00b07d09b007bc75d8dd23462a00cce3233a6546e1242b0e25cb2586cd65c1ae8f73e75f1fb06b1769cf6251a82b20cb2d99a2666e598ef897652b4415b84101758e2e1333339f06a7efe9767a11b5c52946c92b8cbd8848b8616384a14a30c260353440d054d2f79f3a77afb0a185baf0722fa9d6b9dc6dc2313c9a4b386d0f6e4dd32441e6e389673278024082a2d6649021ea9f23fdfa5c0dd17dbf734a96398b351541992ce10ce589c106da340c1313c1b5408c40068ffdce80f24d9f069ed7370c8ca8504bbc8822188f2896aa6b15114b71b92620462ecc34b0ac51d0d34f7a3ed5edb4505b3189f0cfb41ee7fbac6ca57ca2622fde286165f0f13
8f596601b8d398380410b866628b9a0500fb002dbd76dde457718a6a9938a90883c3658e710c2d887ed1c83589bcc15718a0bdd3cdda069052bbb97afc467a0736096b0b862b781a88b1a91a948769563485326700b85c04c6705cb11b70aeae32f8f5fb142adb00dceda29a9058ad022688e47c66e365b4e22155c42a581a91d711619aaadc2f443563cf091e4d5f48e571741e92dbb65b0ede933807ac402abc881871a8a62171af30c624d668354da24430634209c19492a98625c2c76b086f65eff00e7d930d0373e2a9fe6a72229bf09f2211e57660918e01454d1146b9433c756c2e4734f220d5bae6fb6c45d479ff91d380e444b5544b57ae60503ca67dcb05b104c54a6003109b5434236008513546133103a26b3c0c85ac0ed9e7f8f9fb3146c8401b13068c2be47b2197aa007292d7311f3e2180db12af26737c3de35bddf9cc00a9908d4bb835b3a102d7a352c5b255de256bc48cce5c9a4e902950d1bc622f598a039c08d21121aae7f8e72affcbdfed88545a16b3b3cba367297ced8a7999f46bbeb3d7e9af7a944507a3e62a0f17a2ebb6af86b2c05804d177654328914a469111442b970d386b062ea20a759596af07df59aa6e80284ae04b7446a9539ede72812dcb6ff00600a147de3f35f7e16e92826f98500478654c628d41b5db8100f48741c44bd577e20d574f28ab7ef7e62361e39f7b800a0c7df557afbe65638508de21b1b224ef28f998001a406eedc0491771154b93892ee45fc0d4f39ed2f12822200da58cd1208a0a03696aba83c9fd0671158e22a98164ef23f0265cbc28f7b440b497e749ac105b2a2b7e8210b9a515054b3a4ff00023d123ac4348dcd2a16e76e0317031c88a9689b9722b46a8d6245031a4a92294bf0889176368a7a371a4cd8ab16a1666d1e63d671cf81c814e1773aa1ab63f057837c3fa8d47311eb1394a5a94618d8225c6542d9a70414c311b4d220801a24bb070bd72c6fc1573c1d3bcbec37ef0437099a12eb22184cc734a8237ab8e054da3096949b11f7e00622d231ef0007e0f23e1d7bf3fede0595c0035cc4c3cb1da409a9a99e5bc15c644fd203f711b519a0955accda170ed020d43f097b539b2663a7de585c4588a94b0e32acc5784b32928de5cced11436f225752f9625b0daf9d45fe16e5e49f1fb8c70082b51a7ac1ae084234cd401de0e96ae9f3a44508e6e5f8f48a1db33b0aa5d3a7d0afee7afe154e725f2cfea692216e55b994a2688fce2f57e6f0205950cd23b188794650403fd8bdff00b963f0a80a749b453e9a9e950c5b1c7247e00cd650a594d31cdc32c850eb2172dfe83f0cc87a63cd87d6bc8e049d2c12d6382845b2120e59bc532dc2c21459b431ae5a80036fc3093614ccb0d68f32649821713bc03
a4bb28754b5d9c1cad151b07ab9fe20eee8b6be63fae4f760974308f06d8019886c5f6816c60f571ba72b0050734eaf73639de7a6f058d014763f13e148309aefcb3a3888d99f37938f59596874cfb4c67a49ef16c57ba12adf5c7d31eb328bd807cfea0544699d5c757ff087ffc4002d1001000202010303020603010101000000010011213141516171108191a1c1205060b1d1f03040e1f17090ffda0008010100013f10ff00ec8291400da0dd295a69f8657e4af434abe1d1e735a689ce0b15cac28adb379ad3cd08a0150d0415974a6dd3786eb1757a006550704a30c9619e8d36ec8650163346eaebd1ad8f1c85855be889e47f4d8b6b1c5297da05e5b6687e332e3d946178dd5f329750695e446ab372d0986bbc120cd0e46aae9e63262201195ecdaef37c9c0118cb9a0adcef6305f264cffbc899c016aba0224d50d958bc85bd1df4a7a3040810d420dd8adf3bd9825b5f09213658614a5aa285bb3e9f86916a5a000e6196d221204cf0a11e447f34097b5a72a083a32c6f363551bec365c1cadac18d4360c5a4abbd7de1b501437a658a355bb86324ec53ab256a18cd562be22182295eb32add1a80ecbae8e7559dea8c76e2e52cae8991ff006b4c0abd42bb5feb863994b88072adb7d2c480555008d5006de682f6de6e165d2fa46c5ddf30500717a97a9fba1072b83b5a76aece1a08963847502e476c71c303b9118715b935620096a2bacafcbea7c255b406d41ae9e52138b915a61c1d556f7bd6a12f6aa2a9b0e1abc7fec37755e42cc11ac60ac36c6dad0d6e5c405cc9ab4714be200cb99772e812ccd62862fe95e26f20760160f4f111a32dba9febd8c02c931780ed1c99a8050bc919b4dddf4c43021001a7f4891574ed2f4e9e60c300957d20841d4b203476c130148f1312c722162b48360574e7b04a5e0ed0c4a959002c82b02178cda5a0c82d403a83f2dbe764209c3578291d8de4c6494b620aa106addbcee13ed13ed23c4ee713048ff0b547f7b4aeac9e55c4425b778c0403e5dee662ee5985468f694388fde651a962cb84810e0d8915d5b70a5d4d293a7faadbe45e285a0614a2f2553ac12d094d04f22abbe618a0c1d20f07c44f16f306a20ca9b820970179aa9aae5ac22f2c1d710134ea5a9bc4b35bb46d2b7baa8405e121a5d2da112b180800192987f2b3e396748f5a237db8e73a3ce371d18c4128b78699f020620f234bc06e25b3e88ac2be52e9f7c04249de604dacc612d443d1c09999155914aec87688a44a4d8ffa6972185b842e385c02f46a0fde0e00140070054b36938b28ae1ab4dccb9af78a0a4a8874e18180c379dc0bca41199764553032a6ff00ae38833948257f586d32c34a371ec00e1ee058ba2025620b3f282cdfc3745d5bd0c997129
63828cc0a51545b7270725b131e96956ea25828956aa602b1ce920b46a7250b05c23f031421a8a8b55f43ea0d60c1e62a283587f881d16e78b9404cc3b4ca080b0ad983cbbfd9ff0046993757a8b0b5d05a5ad07315882aa1d381440500bd5750ceae7500dc011700063cc25514ca7cc7e50eaa94352a560c6639ce6059e7d61aa96222f0cb80ec14d2070893aab8828a32ae62f040a25ad74d13282e72f30b00b781d0913c10035cadf92ab5549e86d765d02d76850d13423297acaf4dfe0d04a2cc21c912bccba972cb9c422e72c62065129c41d6a1582bcc3387b92d5bbccd6584ada01d565b0ed4aa657ac46aa2c4e42882b5b20f783bd2d303b843aace912c226755701ff382f42c728b7c4527569a1712bc02981906b5b616802b84ed85f425547082acbcb778c41305ae0b6e6930099e095924d42026a177028ad3df11489dae5aef55b622177bbe61a0f625a06c7ac22a093952c97e88852a03d0d08a15541144086380632c531636d71401f926cd315d3ca783fbb9c1ad82dd0106868dd5f56527aeda5fb100d46bcc7872f217172e6ac7103b2c72c559034f2412cfa600631d0353c6063d693a32961700757c4bedadba948afa409ee555695cd40385384a7a476dadc05c5c0a0ae13abd7b0747fccdab834ceb4342d16a0b2d0b609294e81068ad5f03ace504a494dc0b0c119e68bee73089acc2387c4460b966f70aadb2f313aecbd27d12e186b18494b0a6a50fbcb8302f104dc698a44ab9e603333d39ecc1f5d52926b36092e86d3a718dbf5aac0a3489635f91d4d9f5c45c5df39262ab3a9766a098efd11c5a8fca5f2519398301616063a40cca30d590a092a50cf3d3de1294652b5aa37131701610acde2fd56a9963228ba6076a7ec53eff0011ad54bb067b79c9f32d2ad94ec38859d4b88305762d5c84016744be4730cd06a6d63c89edfe43ad070834006d5e22e5d2d900b48badb9553835714fb6b96298443e86a12cb251da0d1d2395a672da648b7de3d8adf524b7b444a36461b412a4a66b38c213efd6438990851947689b0473894d533008315090140cac3202154cb5171fc88a0843f7ca7e39edb05a567a4501b97df81c198f6998dd456bd8dc3e3ead60feea5ed050b3516b42ecd9ef169371b5a0d67b439a81dcdd27dff02dc750d661f3114af4ff00b50219c0baf1adea3c4e56d93acbb6e562f0955c16e6247a66fa2c7b5aff009178b62c80129d51454c14b816871cb77e3311504777f88a644f312972f304cac5f4fbcaec3ea8d295759659cfe0ad9170f3105c65e269549d270125ca798b2deb9944362faabff6305aaf14ef11d2487cb51675feed56044b14b1bef0472840a5c54142828588f3f9093c2992b559fef89556482ad9d44039a542ed1ef1b
85a2094b32cc9e90eb52f4b8244eede1a0f6d71de346e24fb4605cacb68305e63b465ef16d566fd57d352842e093799752f6d08de035b8ee9467be00e1c5e2fbb100c6c693dbfc772081da69001083395cb2a0e9e023fdf2c6388ed509ca4adb4a8282621db9a8388e61990382a22003549a8b70dfe0c1b83ace09bf595a0b13701e04561d2079d0f785ceb02dcd408c2670dc0f791686c23b42d7de5ea2380752745301c668de9f9052aac05aa5e7a637afdf6e1230e64f799839bda4a26a6e3ea588e8e6de651162675f80acc0135f1dd8ac2b6c4429469ae232ca8317f1a5972dacccd711bb514a5c6b6632dc044413c446044b514062aeab46aef0f15fe2a10012a7292223828052a0ee3a3b05b7cf3145a635c30985a1c730a0051d2130c52c51ac447f7408a96b225f8fef58f9e35659a8669416420d16c9f505402d7820f74d4d38e669ef29a56d4c662618e157c46a58ce09445d4204a774ec9721bac5cb5d5b5e599ac3d0ca9131448588962e4ccb1c69a05a14c5ab28bc5d393f20b10b32a14555e2f5b8a1655694f022aec22cb7b39224b480a812a67f720c09d499a55d55b2865020654b506bde58af2fe05f56d29fa312923de78806e4348ec611663b19ee7a567d2803cc6e556816956589fdd74bff120d0b7055076d1be2526d6a20731bbac1c4396a5585d7796f0078f681b6b1c5c1642df797294b80dbb3b55d5bb88a45260eadbfdbeb098468545ca8d3c4ae9917a03d6d452e7d886b4c9465ed72fc2b3a222e415c563996d6ba6cbc730b3ec8b4432761a0a8fa316cafa3fbda5856af238859f0944d33888c6057688e5e5575894c5cb7202e05c8478ad9cacfe401140910b5b0183c12a20b812ef5d4a1bc6e9a1c3fdfde2eb3e97841511740a97179e91f048f58c8ace91261ae92a65266729645bf55a205b46e55b92f8a85fc5cab997cc2858e2bac0bc86e0bfa12a6d02715d1dd55e3de1205cc459588aaa57619d21ed395372c230b9aba786b9ac3013a451dcff000a1ed97cc03295d6dbbe987b5a3ac5eb1ab0728912056f074f31141e8739888130d60c5cc1aa5aef8961e93977062f1c16e2139617cc0a288375d3d04334046b39bf8fa4b52517c200d703c43d073e212eb1e174e7db705f8b36537e7fbc4772588207371f6dde7714b191801ca4a55046a5ccc6c3db349348b55fde84cae671640b292c5c90004d6b4801e8a1cd39c8367fbe1a9aca616d10a1cdb8bcdddac342c5dc146d625e93161b8806534ac1d21b5c0d99fa44159b11000f78953046944381170166b297dee03532d731c3ea55b6f746444a79bfc0b72acab08c3831d2293882de3ccd4595e22ba3b2cb0c7684060d13bab51073447a47de16fa3b452570c451bc90f365840b6155632aa
50671972e76fcff0086de45b2a8d8450e00c9c1757605d1314ebb45dc09862f2e69946531cb3355a9484e77d67c4ca66b2d38e2f1d3bc3552e189c9e974db3158cab7d2e8166d5d2cf3da14366cf66ee58e4d3cc1b7ce63147911c5bfd660a146ae3a480e1b350259712e0d97b38840ac6986323676897149b3ccbdf647f1c438ec153fbe235528bc8f58b962bc102ed050f00b55cb9ff79e20a05d861a31c85ab0614b36388d9b9ef00aed9ccd701a41fceeb0cde08108b5775729216348ff007ccb5860c9157a16ef0afeb18601d2587118e155eabd1961eb0cc60cc1c8b75a806868e36d9bfda1e62556e7247786004b5e37d32d4c8c77cb72f80965ca9394c26ae05425eef40e0840c468b0b2e72ff8689c78cef8a3871c78a6909b7f12a8459a39836d5ca008d7796342e5db530f4e1283d88ad997a11eb0d4e5cba9fb7fe4bfd0387684c158a2d89990b5db9b962edb66374b08b04d622361f32a04b23011e95a866bb48841a46c6235154d38738ef09ed19ea5c288c9d6398d80eee0c0ee81b99cf7a7fde3063e58c534d94b605a691a485d1f173b4d3b9dc44376782b3d59bda577964b364155afde8c3c13bcb9b4d177894f46cd45710d3919c131955316bd6adbde2031040bbff28b50015c0f1fdf12f9586db07ce7bfd0966066ddab5777aafde6f1c1ae60eea5d5465a48291a5c390335d6aa3b4c675366dfe27382eb506b222ecc0335c4fd982b3a85d3309435421abf40971f7424800c247685ff0071da1e86bc294040a11de9b70742022b822dbb8cb30881814f46337ce59856e219f647b45588c329b2ab30b0b9e158be5e1922943844c54e56f845b20038e03a2eff00dfbcd9235819b96c25b9b62c25ee296291252a411d29c47696aaa28b43d9545c9dd6af72d64439eb0b152c9ad4bd7ac5bf4544cd2841b950cc5af3a765e3550dd131b7a147d3d2a1431d2c554c08c3b30ef5bb799a909861166451d963fc79d8b9b09ab361070d50e168c4f0c6dc9190dbed2ea5cb9a5dc642f05c70622eaaa38041b1fc742d131c76861c250b0aef0b1b2f2c5cd3b06e2bfd812551ca15b6a13b978b6d030b2a0a45200832c0006494100020e02ad5bbff007d46ba1cd2cd436e5c3841bb878582946444d8c1f4841150e91d1a8b427e0550ba94c648695f5d660cdce1e89546e166a5175e52100d5116d666671cc2ba3092f4172e1564b6ea0f914fbbd6c6a08c582e838b4cf7855d438d628d070136d0889637355a454146554b0a2a8b01b2e8564cba41bc18298c30275864b5db04f312d2b312a464e16ec6c4a360046115a05b600a14b86cba7609a703845b64050a5c365d3b0420427641e0682588ab1620abb3bc2c18305885b5b477eac71b2a5d26a0dec2f35158601b1f7
fc37441186adc42c6c97e087620b4b5cc54543f163a30700c9cc1dc48cc4a86e4993b5a8eea040482269377dbf21262901b5050550da56b2aa301020822a8604cd665b21a835322bd228a7998a371783bc7a0b3d212c64427635a98643a348ec52e1f6352be09a61f1305fb2302774439a865890a740397b40eec590038b25392ae2f75347eaee55de9e3addbaacd7859152aa95861a46edbb3108f2aca429ae5555e55973c5d4afbe0fac6436676fe0947d0ca9ac79c6b89675142455c5f683161d0cc28cec601b945c60ba8679260b6877980567292a5200368f8bc47b0976f696b95b80c297a3cfe00b40db3b2471c9f319a987b4128d3880d32605bde5d160229045004b1970487623b9b2a6b88b1221475acd39014b5ad323f90988916acc4ec021bcf108b9a8c2e5d532d106b4ec97c225c0acc63e7d2a70b4de223df5a7d16218e70c532b7bcce9af2c3fbfc44d510ef4a8334063138d15cf596d48693d822b68d022ded9083c813808359439097658e570bdaf24e397167eed476afd083f9ae616cb692df862fc26fa2600a607f6daa70c41b2f06e28db3387750f96a16662f81336af4b2475dabd589b2b035700f3774399575d595e65efb43424a6bbc75936b6de3d6a59b05ae3c410b44bfc5bc47704c875702d2ee879301f14f4f4260dc20bec160618226e1181480da9a74d2ebf2207545d001aca11b5f2370149464cc71576711c08eb92a3dccbaef2c02fa256cc1da28f20a8b5289d388b563b31e8a416b8a8b620a6b9577fdef014d94b0dc3594f7318861b4706c38978b23c3879ac444afe88414af56002828e84ce0c45020f74a122ef1706cea476d8cb0e20517036512890c51cfb7f58dc1b9b605b09ec8fa8ebbb80e5e08bc0b3841f848feb71a43da5281307f8588b16e11e35fe258a8e7920dddd06159835ff504c912a44e6235b42c90e7bce735d972d73066097986e96c6254552d604bd0882f50a16a85ac4f1312a512f1724b38832c45b297d10a975440082961194628cdb236fc8b44f790cd9723fc2520c231974641a0584025a2eaea5b330b20dd5ac76b697cca440e21a02ef1538185d916215a166e59c74089614ce4d454020d23029978a7110d482bc4040b886341cf98354550f6fbf3356313c246163793d3e9297d2e893a50518823c783d56bd05e6eb293ed889f7793f986b58bb9fda6608da153d5651c02893ed514015aefc134163904296b05ce487df16a2a747ac34e98540fb32d67bb1be8205f4dda663a0fd4e221a174c08b406fb730db4522a7cf98ed5585fa5f58819560e180ae2747a9d628185dc2db4d74991b8cf74cd0bbde25145621167a45d1b0b2d559b22ac7a8f69588d44e610bf4d3e994272a1aa5127072d18abba1273a710d062
c06357468a003f23b33e8a89496c2321a42f1698d0cad48449856618a14bc732a8583a7994f435056a86c413b47c75ba8607655051151042c710e07552c704cc1f67961d79982b08934ff007f98eed04344b8c00d4170f189697d5d8cc0be509f2f31369c8adaf39942156bdd7fe7c798a569a4b2fb426be5974d5c410b34bc4b5aa9642fe5b38bd39fe9c0d913221f6889414696bcc01b3aaf910f029df2f99b6f305d2f68aadbde2ceaa5a9dd463e5c4dad5d0e4f2fda5c2dd4d2f762354ed6d1f01471548841a7da2f4e4e6a00ec23fa9de7084c7c0f72688ad1ebca227120a8c5fb206dc60ce4982a370f3950464bcebb45240d0bda2f15055a99f89421874fde5dee8b7728d7e1369b5d69d814345d0a28b2f64765138ea0a0e940a816e68d7e4c1920f09e816702d56d29cad1c788b982a2716ecdca860cec99d38620b0d5372b8a8be23315748b2838c9d3bcaeba86a10aa4c0cce34b45e3ac1d48f4635888f08c29bf34f98e0bf39453a15f8b3920f96a0482bccaaabc6b7df339540732b84af22a399bd8ca105d637b84fb9da879c5e7ec90e87e407cd473b17ab1f5cdb5cc7e11caf7b21b81e5088acf2060e5a22ef69abfb7d595fbb417f2fe0891336a597873178cc1ee48b4b1a07afa31c3ab35dc9997adcfb18f858bbf31d89459efd434a35c45aba1da6ea87ed144caee555d83ad4642a2f37a88146f60ccbe97eb57141b4b494c0738b144e19d49e0ca0300341c7e4e1da4e026c44d23ccd95a515090a280563474dd0c42733d90476d75839657796e2cbd2ddb994c0ea2b2c3a42dc660e4386602565996c462fd587116f9539879f452b5830b5306ea58cd9de05c0f86121f3e4b401959aac28298ab716ce297598f1af48fef017256d04eec5e041466af43e0b970465bf131933d618f24672e2042ed6caaa58a91f06dfa4ec21e27d8f47d5854abdf495305a5e3d0de40364a54218798bd8f8822e158f9837c530055ef17fce259b36f12e438954bd216c7bcc25d92b1832e3be96ec94a0ca88a2da1968daa85ca1cb06d1155d5fca4e841c00d889a479815503b695a50118a357a2ad092482f501ab871be0622aab139ca63d65cc051758632a4e5d4ba81963329cf937d25cf9a6b4f923eeb8a55ef503128f4655d23114bf6089371e011576c7c72ddb71c81200234d7308101492af54196dc22c09e5a7d23b2eba88f0554f68b4ba17cc001d31294df2a0edd86d69dd36ec7ccc97ee81f406081573cb16a2dfa9d8ea0307925a27487b863145d7a3c4c114adb7f55cd9ba6727a9dbd16e04b9594ef1d50512e71a7e20b4d2768e98c719263539811440c88548d4720861865ccfaf6fdaa28151458a19928c03d5b1528bd825945145dafe595eee04da2d0b480b99710d
0dc4bc18b5ab42ec2e0595cb698a150a398583537188e1c8174434ae354cc850cb501f15f240478067d05241a6cb2254bc4bf5f717fc46c7fb8f7e601555c0efa07598bebf43e4b8bc8abd6b2fac16f37ad1f231f316382f2af76e2d95f797896f0830c2ce809c1d3460bbf43b7cccd700341447f0b0ceb287b443b6f5883f5186400074856fb61eb6906c664611fc5d9f449b29c90ef5cc439b3a714fe62db96e09e112961945788cc2c135138c19ae23d7f1408b557001cb2dabf65ef5559555d17614336aa6dea5ad1ab168282f07e618500d0c150694305603415a3121db14a852af15c99c65c15df2b1aa4dc53a6d335685f1b6d5bb282eeab0740bb62e2d5478dacd63dccf88027103373f32cd509a0d0b2c2f39131d23e454b030b9ae98802f6381500ca19c970e6a9e48ecc0b8ad2943300238ca0a5298b2e98340160ceb81e8ea061b21058a5c1d0e0f894a9d6edfd7de6d0e497317ec15ab41093a9f23abec7a2d45bf532c91ed5c137d2f8c3d14b5f18f30b4a39438b194b183ee23ee389884497285de4bfb40b6047140c0a85675631bb06b7cebd781958de37f48421284991bc458a196000d2183f0962b4d0b5343740d0b5ac12cc965f78052e45b607c272ca11104a35bcb22b9abaa0fce45fa5c6bd2c4f1d2267b7d2b863279263406cdf4b735de250b95e3834f045a8f2422140b434a1df43d17f0e464b13503e9763c409fc702df6ca9325838466b2853f11e476889d02d577cd7aa07e61b72de88b11f458b064565038c6bbc7208b536ac5a8b704cb04621209e062eae5869bcaef34a40e5f56350c8bd2a956651bc52a57c825c651016b8b2fa912d6f15266815d03d19ce2f38ad9a00d4574ba6735c460e51482cd565c972a9a4a42b348f64590da734efa0707e78db81d0850260378cbb5cb96623d02fd42a014652fdb99a99693588fd08b314ca4b7d908cc9b32afa062f0061cd5ac07d5e2e007bdd794227b10532a9faa39d9dc62d45b95bd7ff22fd46300a9823407d88503666d3599bbc436c99f451a2e8ac797359d47a6a0a5bca5a8f899ff007823ede08b37e8e89416f0a38079f9af9fd0461c9bb2b6edb0a71d29559bb080330859e815e9612b603962b2ba4bb1d05f786ab2336847d66895a30654eb9f4cbe496a983e520f461b93dafd8382111124225226e502cc0516c16780bee7a02d12e11a0e44d9efd5e8050b8c188a9132c9ee1c62258d7a5e0ed67a2fe079948a22735321ee9fa089ab820d058bb0aa9a2f93580b9b54c14815e810db0201a7a9923fa5e48f641daba8ac795cbd5c88f59403b9e23a953285efe806299138d3ecb7ed0412044a9fb25801400c0eabf71fafa5904984e43d21856186254046e5a7f722e88e7baf429f597aa390434
55b5576beb633430155c3f856580599b458ca1fbf56ac6bf40900949d88468c1c3142af14e9373d112d1e088c609e89d673c5e6002223913995aa05b9c187b972a88e4788a871af1167bccbbb676d6239a1adc02abc09f6897c008588e981b17193196001b57820e4215284b49dd5e123177eb71f4282d56c1d78d6bafe0336ce482ad504ccb94b317855bfa8beff00a04ceae46da0164012c6d0a72d419acdc7b2ee5410c06b68112ca9950c1bb4e1846d341e465c23876d844c28d618be72302ba80c14bbc4a29c395250c715014d875c06f10021bd2925ef6d63792cfdf35cf3aa4e4157afe05afc2c821e418ea618e65ab393afa1b515769a1329842ae2dc98f039ff009fa04d031d40362269215778545264c8597a6a92930902a04d38b8d679efe815090d02afc0e9104097d5ce1cb38e92deb3b12c6beb05fe4c7989338721e8841a46c610725addcb1cddc14153032f12ab5f4b97f8162dfa40870d65c4ba852a05ef4c849858d32836e58b936a707eff003dbf410b4238148a4d85075be78194e1f4474dc6a72b82ac6d0b8cb770047c4a9d2e68f781025d2b18fef0204d398b2f508708ea57a3f32c489e56e641b65afa59e276221c46fd75236b40666baa2b28c71307a64a053c99212719aa2f065f7abafd0543be0520d414a1cad8d0ce20127240a86d4106aad8f8c2e8db059828013971a67fec0a94260cbe6040a9a1ed6f796dec8ea229514c1716cf86125388767a9c5098ae308710535e8ca6c111e2000a1eb0c2f0dfa7b1caf18086c5cb83960fd05bb9d01c7646c505e6b0ea547026e695058426dfb286c168a49467da595499a2e641eb021ec558e6d648826ec91d6037134a10ed8094128e92e16e89921a898be8892f75a95350f48ff7a7a043ba3216ce84282df407e619059cd97463d79a5bb4c0168ef514afe83b87a8104d1470d31d3102a05ca940aa40882363c9280942af370fad564c4480cfab251c069fda58e50c01d58fa242788108441ac413e91298b54ac3946b62640470d4b2ef01da88af7817531770d6d3a0d30a3bdd7c403f6c1163000651d5958f1ff6282a57231a036e23cc5ccaab27e83c9118af758e002d4c5f0e42fd02a0a6e26680aa2fe65b58dccd994cd9301840079f0248649444ad883e86308604b2004017378617b581b614836988b729af295e08b8b97413d8e63a89c89777371df2d3d0701e8ca33442f1b628b162b20b63ef9feec14c46081949f7609aaf72e678aae8db068dbb1413752405a194f7acf17fa0f9352ff50f4553b05c82c50c1fecc2f44aa24e63e640b8105340c05884e8110ac408624a5d10c1b17274ff00a089188df72e32d4ba900800d4eba58ce7f1d626084177312244653e2c67de534fe20da6b963592b27d
65d928784219bf088c105d9d30f970201d448aaac30075580a8eb4ba70e69588a184d0c5fa228778a28d1553742fb557d7f420fc0031682300a257098352b1e9c3153cf1048db5d7385d9e20545a51699964e9fc23a4e412563ad09541330a015de5a391fb42712f578d9f465d732d6158a63acaa5c0ebc12d0097783caa43dfaf5aceac6c359ef2c1861a8688f8b80a54e9da10a034fd9281db005946d5f67eb03da2ab661db7bbb6e13fbd075c67306ae281f48c7fec769376423ec9a9403654397017e6a2394da73fa108100a714a23a048d8c812c3bb805978a847360b8bae195c72e33965179aace2275278c056e0f898674ee08d9876969498788c768551dc7f88feb4fc63755dbd8dc14b64f90d423f862f431b6078942907e73e1750a0ddb573357cfb056fba43e9e8ca0a482dab02015b3062fc9c76877d573e629dacbba5806a2b76f332ef668c3004aba1de5e4d96979b69d269f3fa1409558008887445314e708e6710085a7ee236239111c90299d673059d2e19a8d75a881b5e18ab686538854391c444739ba7848005a3a5e6291afda53fb30892c4b1f566a777b060be73aff009083ed847f0c3a66962f38d34d7bd4a37c0413d28ddf39106904689b7b57cbb880c379fdc5700bacb0af30057bb258779023f942bde65a714dcb642e0ed7f895d88cd9f13e87de98e5956a5abd7f4364f6d4b4e62f194638b3a203a8ca605c4164662320a731e9152b98462c6c4d9138318e8f680ab1a487eecbf5a7fecccd78fdb88b0366a533f1e8cd235408040080e419c64eee02fe65a902b1a703e03d33399ffa0936c74d887060e1394460a6e900d9c996e5423e2ae7c5a7edfb62346ae56b3747433afd0e13fcd026c44c88f312dc8a1bb0d6ad6d58b555a329af328c5da70660c691db2f4ab2235946c712b059d2814aee8998d971c1e658881b3a5977f48a164d0ea4a9bccb7083ccb850e78ec1ca2d349d332b24800c4a3969d0f923951b843d1681f31237380516c516d3ad1eb99370340d99f4c02710eb62429161caf69b00cd3a055f42239e357125aad62dda72bcfed83cfe8ac97e5eb3b5814aac765be0a519af854055dd596007c45b61cdc5ce6735018728ea2885182ad060296432042d771843a1634c59e2fa4c65bbcdf58f436c454908014fa5d44b98c969880ac712d6f1a8fa568977deb07bc1578986879d7082597aed9ce4a21db73981ad06800028afd176cdbe8201c669c2ca5bd6487061d2ec983d7c86a1a2f6466222749c266bc4218e07f11b92f9831a1c3c42e61f4e8bf98350d600aaa6059ca83bfe4472ff0096ebf88270110e6e2dd6267ba09ed1f9fed6a185929402b7c0f76e207ab48f28ecc0507350be5db04979af9952cd2ef247b641b0d25a58082
c0297e8b807b03292c790a195828ed6aa40e5c665645433132fa2625a08d5c125b37322179088214653221d2afdd2dbfb406e3d18205331b55374298d7d62f8c8094dbde36020ae171a59f002dfb4af7a029fcca8ee08cbbd18f784cb08b8ba5687de654af7a04766b730632843d8313006de08c2464e253ab9399bc4a97323a74b66963dd15fa2f4316a41a8b176154359e4d60a7aa506e0f301168958b87916c6433f0c1c19cb8c6576d302a04c173f32e7ae219af332b326a78400306a8cc5c56734f9331c257f038c4548e74b7e8c4c0be942ff001b9920d971896139be66d08da5588e552a0332183ae6e209276f585d4facf972cfe83f45100a8cd7512e8d0e4e725618198c3d61452d7585d0d76883239ccd1103aa84201f24c2af740d3b148c8fd22a2c734009476d3c3fb4be279bd1120dc2cae1be18ce6ab9c712fd579eb50c1205d830ac73e61a14b4965ea8ca510aaa91b78a9dbf1e6559ba8c45ef116b0e83fa64442f0ad8ff0045a8c402094ade8a814838588a057e25e1af88abb774980d2a144234f1069289370d8b7f873797dc15ff00b1372c6707d6674c0e798d53b5d891da281de076848695705846e026acddce2c23822d461a8d6941b19419a85d43de01af5c1318bc6c3fa2b1d6285312911a430950fc8186b40d2bb16a8002c6345858a86e16b97dc93ad4c8d06ed0fef32ff52802bde887296d29ad0fdcf45c7a3a8cd9175a94d928318f885bb1c585cce20bef8bb80f38738c66016064423b304c660b066e01dd973041ec978a852dee341aa7ace52007a2d0c0a1de4b44d3f45ae63d9d84b72652cbaa2802346001a95af5e22a0e734cc8cf152cad0c34bb1a2cc26269aa19afeb38c611d4592d85afc7c4ba02ef38e618f794baefe2240b9cd27d25d9bae906e48adb46d98df64104f7479a2f748ee0acc15d65e9488e91dd3d5b867c214dbdcb0505f1400a3f458ad1bf105d82d4268581722dcece46f9ebcf98954eb70c58a6e60a1dd96f4a75491576decdfda66b67de1a620bc127897bdcba9903be9550584be4986d3c406502763ec4779405638831c7e2351a2aa0148966c8a4c3cff788c905869e267890ab52505683805c6213f9eaca818508a36dfe8d64b8b477139c160a951abb0f9950a33da355141f30167bd4716d31d11f50c4da24a2018d2da9fda99853dad95870618150e96408da78c3352c6f1a9a4e08bb0710d469eb016768e5416ef8000a16016acd7e8e37072ed600bd6e802c44b4a304049404b11a4139251a0f05cbca2ae2eb5f5864c2dea5b08d7c4bc1bf7950d6392528a791d46962bc5cb2eaccb96d7b2c242e0852ec90ea6af7e22db17f58caec878e0c2ae202d69fb30aa8938ba0cec3010b40b761fa3a822b56140b278166a
016ae20a5cb16a5a27ed15259784ed2b285933548fcf129a53fbc6bcaaf336b76c658978c6b843164acf48257bb3009027351c36bce3304a609b80d0241381f64a2821a9579aae81794a319575fa3c3f115cad387088a284451114887e29ab0d0eb09440038040688456b0c2c85b0d545f79a143bdccf50e3304c35d8620b99b5e9d425c82d6fac3420355cc0acf745496dee0b1771b1c468071538c4add81400485c61db450e900502a02dd0507e90cb0b962282ed2134a305b2c3a27a06f40b5619b5626314e9300dd1181639802d0f11d673554cb358b589e8a8bdbd514ab423dd28157955416805a87948da068a50b2e7208376450a7b727a03100501838fd27ad025562650b68c611044021535a3d564d16b4348bb22473825667116867de6cda965ae7d6e34223ac040d2055a180015583863aab605112ed0ee5165d0f4a88996bd8d5c403cd14983c2ea25a06907c87e97c650cfbb6268c4a6d32b24120ca9ad52968dabadd9269b6b0aee49a156b0039a8b9d46a101de5a12b9ba0a1467a8da857658e0b8b00e92b5338ae7230b9a52cb5229b0c91ea4410660b53b2e28c505356e5ca602296058ce6266e1a927a10a70037064b005d80b16151530e023e0692ab6a3955555ff00f6cbffd9 ================================================ FILE: resources/multicolumn-lorem-ipsum.txt ================================================ Two-Column Document with Lorem Ipsum Your Name January 3, 2024 Abstract pellentesque ante. Phasellus adipiscing semper elit. Proin fermentum massa ac quam. Sed diam turpis, This is a sample document with two columns filled molestie vitae, placerat a, molestie nec, leo. Maece- with Lorem Ipsum text. nas lacinia. Nam ipsum ligula, eleifend at, accumsan Lorem ipsum dolor sit amet, consectetuer adip- nec, suscipit a, ipsum. Morbi blandit ligula feugiat iscing elit. Ut purus elit, vestibulum ut, placerat magna. Nunc eleifend consequat lorem. Sed lacinia ac, adipiscing vitae, felis. Curabitur dictum gravida nulla vitae enim. Pellentesque tincidunt purus vel mauris. Nam arcu libero, nonummy eget, con- magna. Integer non enim. Praesent euismod nunc sectetuer id, vulputate a, magna. Donec vehicula eu purus. Donec bibendum quam in tellus. Nullam augue eu neque. Pellentesque habitant morbi tris- cursus pulvinar lectus. 
Donec et mi. Nam vulpu- tique senectus et netus et malesuada fames ac turpis tate metus eu enim. Vestibulum pellentesque felis eu egestas. Mauris ut leo. Cras viverra metus rhon- massa. cus sem. Nulla et lectus vestibulum urna fringilla Quisque ullamcorper placerat ipsum. Cras nibh. ultrices. Phasellus eu tellus sit amet tortor gravida Morbi vel justo vitae lacus tincidunt ultrices. Lorem placerat. Integer sapien est, iaculis in, pretium quis, ipsum dolor sit amet, consectetuer adipiscing elit. In viverra ac, nunc. Praesent eget sem vel leo ultri- hachabitasseplateadictumst. Integertempusconva- ces bibendum. Aenean faucibus. Morbi dolor nulla, llis augue. Etiam facilisis. Nunc elementum fermen- malesuada eu, pulvinar at, mollis ac, nulla. Cur- tum wisi. Aenean placerat. Ut imperdiet, enim sed abitur auctor semper nulla. Donec varius orci eget gravida sollicitudin, felis odio placerat quam, ac pul- risus. Duis nibh mi, congue eu, accumsan eleifend, vinar elit purus eget enim. Nunc vitae tortor. Proin sagittis quis, diam. Duis eget orci sit amet orci dig- tempus nibh sit amet nisl. Vivamus quis tortor vitae nissim rutrum. risus porta vehicula. Nam dui ligula, fringilla a, euismod sodales, sollic- itudin vel, wisi. Morbi auctor lorem non justo. Nam Fusce mauris. Vestibulum luctus nibh at lectus. lacus libero, pretium at, lobortis vitae, ultricies et, Sed bibendum, nulla a faucibus semper, leo velit ul- tellus. Donec aliquet, tortor sed accumsan biben- tricies tellus, ac venenatis arcu wisi vel nisl. Vestibu- dum, erat ligula aliquet magna, vitae ornare odio lum diam. Aliquam pellentesque, augue quis sagittis metus a mi. Morbi ac orci et nisl hendrerit mollis. posuere, turpis lacus congue quam, in hendrerit risus Suspendisse ut massa. Cras nec ante. Pellentesque eros eget felis. Maecenas eget erat in sapien mattis a nulla. Cum sociis natoque penatibus et magnis dis porttitor. Vestibulum porttitor. Nulla facilisi. Sed parturient montes, nascetur ridiculus mus. 
Aliquam a turpis eu lacus commodo facilisis. Morbi fringilla, tincidunturna. Nullaullamcorpervestibulumturpis. wisi in dignissim interdum, justo lectus sagittis dui, Pellentesque cursus luctus mauris. et vehicula libero dui cursus dui. Mauris tempor Nulla malesuada porttitor diam. Donec felis erat, ligula sed lacus. Duis cursus enim ut augue. Cras congue non, volutpat at, tincidunt tristique, libero. ac magna. Cras nulla. Nulla egestas. Curabitur a Vivamus viverra fermentum felis. Donec nonummy leo. Quisque egestas wisi eget nunc. Nam feugiat 1 ================================================ FILE: resources/toy.layout.txt ================================================ AWAY again1 AWAY again2 Something[cited] Single quote operator Double quote operator Last Txt ================================================ FILE: tests/__init__.py ================================================ import concurrent.futures import os import ssl import sys import urllib.request from pathlib import Path from typing import Optional from urllib.error import HTTPError if sys.version_info >= (3, 11): from typing import Self else: from typing_extensions import Self import yaml TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent RESOURCE_ROOT = PROJECT_ROOT / "resources" SAMPLE_ROOT = Path(PROJECT_ROOT) / "sample-files" def _get_data_from_url(url: str) -> bytes: ssl._create_default_https_context = ssl._create_unverified_context attempts = 0 while attempts < 3: try: with urllib.request.urlopen( # noqa: S310 url ) as response: return response.read() except HTTPError as e: if attempts < 3: attempts += 1 else: raise e raise ValueError(f"Unknown error handling {url}") # TODO: Make keyword-only and drop name being optional. def get_data_from_url(url: Optional[str] = None, name: Optional[str] = None) -> bytes: """ Download a File from a URL and return its contents. This function makes sure the PDF is not downloaded too often. 
This function is a last resort for PDF files where we are uncertain if we may add it for testing purposes to https://github.com/py-pdf/sample-files Args: url: location of the PDF file name: unique name across all files Returns: Read File as bytes """ if name is None: raise ValueError("A name must always be specified") if os.getenv("GITHUB_JOB", None) is not None: cache_dir = Path("tests", "pdf_cache").resolve() else: cache_dir = Path(__file__).parent / "pdf_cache" if not cache_dir.exists(): cache_dir.mkdir() cache_path = cache_dir / name if url is not None: if url.startswith("file://"): path = Path(url[7:].replace("\\", "/")) return path.read_bytes() if not cache_path.exists(): cache_path.write_bytes(_get_data_from_url(url)) return cache_path.read_bytes() def _strip_position(line: str) -> str: """ Remove the location information. The message WARNING pypdf._reader:_utils.py:364 Xref table not zero-indexed. becomes Xref table not zero-indexed. Args: line: the original line Returns: A line with stripped position """ line = ".py".join(line.split(".py:")[1:]) return " ".join(line.split(" ")[1:]) def normalize_warnings(caplog_text: str) -> list[str]: return [_strip_position(line) for line in caplog_text.strip().split("\n")] def is_sublist(child_list, parent_list): """ Check if child_list is a sublist of parent_list, with respect to * elements order * elements repetition Elements are compared using `==` """ if len(child_list) == 0: return True if len(parent_list) == 0: return False if parent_list[0] == child_list[0]: return is_sublist(child_list[1:], parent_list[1:]) return is_sublist(child_list, parent_list[1:]) def read_yaml_to_list_of_dicts(yaml_file: Path) -> list[dict[str, str]]: with open(yaml_file) as yaml_input: return yaml.safe_load(yaml_input) def download_test_pdfs(): """ Run this before the tests are executed to ensure you have everything locally. This is especially important to avoid pytest timeouts. 
""" pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml") with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: futures = [ executor.submit(get_data_from_url, pdf["url"], name=pdf["local_filename"]) for pdf in pdfs ] concurrent.futures.wait(futures) class PILContext: """Allow changing the PIL/Pillow configuration for some limited scope.""" def __init__(self) -> None: self._saved_load_truncated_images = False def __enter__(self) -> Self: # Allow loading incomplete images. from PIL import ImageFile # noqa: PLC0415 self._saved_load_truncated_images = ImageFile.LOAD_TRUNCATED_IMAGES ImageFile.LOAD_TRUNCATED_IMAGES = True return self def __exit__(self, type_, value, traceback) -> Optional[bool]: from PIL import ImageFile # noqa: PLC0415 ImageFile.LOAD_TRUNCATED_IMAGES = self._saved_load_truncated_images if type_: # Error. return None return True ================================================ FILE: tests/bench.py ================================================ """ Benchmark the speed of pypdf. The results are on https://py-pdf.github.io/pypdf/dev/bench/ Please keep in mind that the variance is high. """ from io import BytesIO from tempfile import NamedTemporaryFile import pytest import pypdf from pypdf import PageObject, PdfReader, PdfWriter, Transformation from pypdf.generic import Destination, read_string_from_stream from . 
def page_ops(pdf_path, password):
    """
    Run a fixed sequence of page operations on the first page of a PDF file.

    Args:
        pdf_path: file name relative to ``RESOURCE_ROOT``
        password: password to decrypt the file with; no decryption if falsy

    """
    pdf_path = RESOURCE_ROOT / pdf_path

    reader = PdfReader(pdf_path)
    writer = PdfWriter()

    if password:
        reader.decrypt(password)

    # From here on, work on the writer's copy of the page.
    page = reader.pages[0]
    page = writer.add_page(page)

    # Each transformation is followed by merging the page onto itself.
    op = Transformation().rotate(90).scale(1.2)
    page.add_transformation(op)
    page.merge_page(page)

    op = Transformation().scale(1).translate(tx=1, ty=1)
    page.add_transformation(op)
    page.merge_page(page)

    op = Transformation().rotate(90).scale(1).translate(tx=1, ty=1)
    page.add_transformation(op)
    page.merge_page(page)

    # The transformation can be given as a plain 6-tuple as well.
    page.add_transformation((1, 0, 0, 0, 0, 0))
    page.scale(2, 2)
    page.scale_by(0.5)
    page.scale_to(100, 100)

    page = writer.pages[0]
    page.compress_content_streams()
    page.extract_text()
def test_merge(benchmark):
    """
    Merge several PDF files into one document.

    Appending from paths, reader objects and file handles, an encrypted
    source, outline items, metadata and a named destination.
    """
    benchmark(merge)


def text_extraction(pdf_path):
    """
    Extract the text of all pages of a PDF file.

    Args:
        pdf_path: location of the PDF file

    Returns:
        The text of all pages, concatenated without separator

    """
    with open(pdf_path, mode="rb") as fd:
        reader = PdfReader(fd)
        # The pages have to be processed while the file is still open.
        return "".join(page.extract_text() for page in reader.pages)
""" benchmark(read_string_from_stream_performance) def image_new_property(data): reader = PdfReader(data) assert reader.pages[0].images.keys() == [ "/I0", "/I1", "/I2", "/I3", "/I4", "/I5", "/I6", "/I7", "/I8", "/I9", ["/TPL1", "/Image5"], ["/TPL2", "/Image53"], ["/TPL2", "/Image37"], ["/TPL2", "/Image49"], ["/TPL2", "/Image51"], ["/TPL2", "/Image39"], ["/TPL2", "/Image57"], ["/TPL2", "/Image55"], ["/TPL2", "/Image43"], ["/TPL2", "/Image30"], ["/TPL2", "/Image22"], ["/TPL2", "/Image41"], ["/TPL2", "/Image47"], ["/TPL2", "/Image45"], ["/TPL3", "/Image65"], ["/TPL3", "/Image30"], ["/TPL3", "/Image61"], ["/TPL4", "/Image30"], ["/TPL5", "/Image30"], ["/TPL6", "/Image30"], ["/TPL7", "/Image30"], ["/TPL8", "/Image30"], ["/TPL9", "/Image30"], ["/TPL10", "/Image30"], ["/TPL11", "/Image30"], ["/TPL12", "/Image30"], ] assert len(reader.pages[0].images.items()) == 36 assert reader.pages[0].images[0].name == "I0.png" assert len(reader.pages[0].images[-1].data) > 10000 assert reader.pages[0].images["/TPL1", "/Image5"].image.format == "JPEG" assert ( reader.pages[0].images["/I0"].indirect_reference.get_object() == reader.pages[0]["/Resources"]["/XObject"]["/I0"] ) list(reader.pages[0].images[0:2]) with pytest.raises(TypeError): reader.pages[0].images[b"0"] with pytest.raises(IndexError): reader.pages[0].images[9999] # just for test coverage: with pytest.raises(KeyError): reader.pages[0]._get_image(["test"], reader.pages[0]) assert list(PageObject(None, None).images) == [] @pytest.mark.enable_socket def test_image_new_property_performance(benchmark): url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf" name = "pdf_font_garbled.pdf" data = BytesIO(get_data_from_url(url, name=name)) benchmark(image_new_property, data) def image_extraction(data): reader = PdfReader(data) list(reader.pages[0].images) @pytest.mark.enable_socket def test_large_compressed_image_performance(benchmark): url = 
"https://github.com/py-pdf/pypdf/files/15306199/file_with_large_compressed_image.pdf" data = BytesIO(get_data_from_url(url, name="file_with_large_compressed_image.pdf")) benchmark(image_extraction, data) ================================================ FILE: tests/conftest.py ================================================ """Fixtures that are available automatically for all tests.""" import uuid import pytest @pytest.fixture(scope="session") def pdf_file_path(tmp_path_factory): return tmp_path_factory.mktemp("pypdf-data") / f"{uuid.uuid4()}.pdf" @pytest.fixture(scope="session") def txt_file_path(tmp_path_factory): return tmp_path_factory.mktemp("pypdf-data") / f"{uuid.uuid4()}.txt" ================================================ FILE: tests/example_files.yaml ================================================ - local_filename: 2201.00214.pdf url: https://arxiv.org/pdf/2201.00214.pdf - local_filename: ASurveyofImageClassificationBasedTechniques.pdf url: https://raw.githubusercontent.com/xyegithub/myBlog/12127c712ac2008782616c743224b187a4069477/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf - local_filename: Giacalone.pdf url: https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf - local_filename: iss1718.pdf url: https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf - local_filename: iss2077.pdf url: https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf - local_filename: pdf_font_garbled.pdf url: https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf - local_filename: The%20lean%20times%20in%20the%20Peruvian%20economy.pdf url: https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf - local_filename: tika-908104.pdf url: https://github.com/user-attachments/files/18382273/tika-908104.pdf - local_filename: tika-923406.pdf 
url: https://github.com/user-attachments/files/18382274/tika-923406.pdf - local_filename: tika-955562.pdf url: https://github.com/user-attachments/files/18382288/tika-955562.pdf - local_filename: tika-959173.pdf url: https://github.com/user-attachments/files/18382295/tika-959173.pdf - local_filename: waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf url: https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf - local_filename: tika-957144.pdf url: https://github.com/user-attachments/files/18382302/tika-957144.pdf - local_filename: ascii charset.pdf url: https://github.com/py-pdf/pypdf/files/9472500/main.pdf - local_filename: cmap1370.pdf url: https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf - local_filename: 02voc.pdf url: https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf - local_filename: iss1533.pdf url: https://github.com/py-pdf/pypdf/files/10376149/iss1533.pdf - local_filename: tstUCS2.pdf url: https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf - local_filename: tst-GBK_EUC.pdf url: https://github.com/py-pdf/pypdf/files/11315397/3.pdf - local_filename: math_latex.pdf url: https://github.com/py-pdf/pypdf/files/12163370/math-in-text-created-via-latex.pdf - local_filename: unixxx_glyphs.pdf url: https://arxiv.org/pdf/2201.00021.pdf - local_filename: TextAttack_paper.pdf url: https://arxiv.org/pdf/2005.05909.pdf - local_filename: iss2173.pdf url: https://github.com/py-pdf/pypdf/files/12552700/tt.pdf - local_filename: iss2290.pdf url: https://github.com/py-pdf/pypdf/files/13452885/example.pdf - local_filename: NewJersey.pdf url: https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf - local_filename: tika-952445.pdf url: https://github.com/user-attachments/files/18382348/tika-952445.pdf - local_filename: tika-921632.pdf url: https://github.com/user-attachments/files/18382354/tika-921632.pdf - local_filename: 
tika-976970.pdf url: https://github.com/user-attachments/files/18382397/tika-976970.pdf - local_filename: tika-914102.pdf url: https://github.com/user-attachments/files/18381687/tika-914102.pdf - local_filename: iss1737.pdf url: https://github.com/py-pdf/pypdf/files/11068604/tt1.pdf - local_filename: issue-1801.pdf url: https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf - local_filename: tika-924546.pdf url: https://github.com/user-attachments/files/18381697/tika-924546.pdf - local_filename: issue-1801.png url: https://user-images.githubusercontent.com/1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png - local_filename: grimm10 url: https://github.com/py-pdf/pypdf/files/11336817/grimm10.pdf - local_filename: labeled-edges-center-image.png url: https://user-images.githubusercontent.com/4083478/236685544-a1940b06-fb42-4bb1-b589-1e4ad429d68e.png - local_filename: watermark1.png url: https://user-images.githubusercontent.com/4083478/236793172-09340aef-3440-4c8a-af85-a91cdad27d46.png - local_filename: tika-977609.pdf url: https://github.com/user-attachments/files/18381754/tika-977609.pdf - local_filename: tifimage.png url: https://user-images.githubusercontent.com/4083478/236793166-288b4b59-dee3-49fd-a04e-410aab06199a.png - local_filename: tika-972174.pdf url: https://github.com/user-attachments/files/18381744/tika-972174.pdf - local_filename: tika-972174_p0-im0.png url: https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png - local_filename: Vitocal.pdf url: https://github.com/py-pdf/pypdf/files/11962229/DB-5368770_Vitocal_200-G.pdf - local_filename: VitocalImage.png url: https://user-images.githubusercontent.com/4083478/251283945-38c5b92c-cf94-473c-bb57-a51b74fc39be.jpg - local_filename: cmyk_deflate.pdf url: https://github.com/py-pdf/pypdf/files/12078533/cmyk2.pdf - local_filename: cmyk_deflate.tif url: https://github.com/py-pdf/pypdf/files/12078556/cmyk.tif.txt - local_filename: o1whh9b3.pdf url: 
https://github.com/py-pdf/pypdf/files/11578953/USC.EMBA.-.Pre-Season.and.Theme.I.pdf - local_filename: selbst.72916.pdf url: https://github.com/py-pdf/pypdf/files/14395695/selbst.72916.pdf - local_filename: iss1912.pdf url: https://github.com/py-pdf/pypdf/files/11845099/GeoTopo-komprimiert.pdf - local_filename: calRGB.pdf url: https://github.com/py-pdf/pypdf/files/12061061/tt.pdf - local_filename: 2023USDC.pdf url: https://github.com/py-pdf/pypdf/files/12090523/2023.USDC_Circle.Examination.Report.May.2023.pdf - local_filename: iss1982_im1.png url: https://github.com/py-pdf/pypdf/files/12144094/im1.png.txt - local_filename: iss1982_im2.png url: https://github.com/py-pdf/pypdf/files/12144093/im2.png.txt - local_filename: usa.png url: https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42 - local_filename: paid.pdf url: https://github.com/py-pdf/pypdf/files/12050253/tt.pdf - local_filename: Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf url: https://www.joinville.sc.gov.br/wp-content/uploads/2023/11/Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf - local_filename: iss2138.pdf url: https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf - local_filename: iss3268.pdf url: https://github.com/user-attachments/files/20060394/broken.pdf - local_filename: direct-link.pdf url: https://github.com/user-attachments/files/20348304/tst.pdf - local_filename: named-reference.pdf url: https://github.com/user-attachments/files/20455804/MinimalJob.pdf - local_filename: large_lzw_example_encoded.dat url: https://github.com/user-attachments/files/20923310/large_lzw_example_encoded.dat.txt - local_filename: issue-3419.pdf url: https://github.com/user-attachments/files/21578875/layout-parser-paper-with-empty-pages.pdf - local_filename: issue-3429.pdf url: https://github.com/user-attachments/files/21711469/bomb.pdf - local_filename: issue-3508.pdf url: https://github.com/user-attachments/files/23211824/repair-manual-thermo-230-300-350-2012-en.pdf - 
@pytest.mark.parametrize(
    ("source", "expected"),
    [
        (b"<00FE00FF>", "\xfe\xff"),
        (b"<00FE00FF00D6>", "\xfe\xff\xd6"),
    ]
)
def test_text_string_object__looks_like_bom(source: bytes, expected: str) -> None:
    """
    Read hex strings whose decoded text starts with U+00FE U+00FF.

    The expected values keep these two characters as regular text.
    """
    stream = BytesIO(source)
    result = read_hex_string_from_stream(stream)
    assert result == expected
import os import subprocess import sys from io import BytesIO from pathlib import Path from typing import Callable import pytest from pypdf import PdfReader, PdfWriter from pypdf.errors import LimitReachedError from pypdf.generic import ( ArrayObject, ContentStream, DictionaryObject, NameObject, NullObject, RectangleObject, StreamObject, TreeObject, ) from tests import RESOURCE_ROOT, get_data_from_url try: import resource except ImportError: resource = None def test_dictionary_object__get_next_object_position(): reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") # reader.xref = {0: {7: 15, 9: 10245, 12: 939, 14: 2999, 16: 4982, 18: 9949, 22: 11160}} assert DictionaryObject._get_next_object_position( position_before=12345, position_end=999999, generations=list(reader.xref), pdf=reader ) == 999999 # No value after 12345 in dictionary assert DictionaryObject._get_next_object_position( position_before=11111, position_end=999999, generations=list(reader.xref), pdf=reader ) == 11160 # First value after 11111 in dictionary. assert DictionaryObject._get_next_object_position( position_before=42, position_end=999999, generations=list(reader.xref), pdf=reader ) == 939 # First value after 42 in dictionary. # New generation. 
def test_array_object__clone_same_stream_multiple_times():
    """
    Clone two arrays referencing the same streams into the same writer.

    After replacing one of the shared source objects with a null object, the
    original arrays skip its data, while the clones still provide it.
    """
    writer = PdfWriter()

    # Unique streams.
    stream1 = StreamObject()
    stream1.set_data(b"Hello World!")
    stream2 = StreamObject()
    stream2.set_data(b"Lorem ipsum!")

    # Shared streams.
    shared_streams = [StreamObject() for _ in range(3)]
    for index, shared_stream in enumerate(shared_streams):
        shared_stream.set_data(f"Shared stream {index}".encode())

    # Add to writer.
    writer._add_object(stream1)
    writer._add_object(stream2)
    shared_references = [writer._add_object(shared_stream) for shared_stream in shared_streams]

    # Arrays.
    array1 = ArrayObject([stream1.indirect_reference, *shared_references])
    array2 = ArrayObject([stream2.indirect_reference, *shared_references])

    # Cloned.
    cloned1 = array1.clone(pdf_dest=writer)
    cloned2 = array2.clone(pdf_dest=writer)

    # Nullify one shared object.
    writer._replace_object(shared_references[1].indirect_reference, NullObject())

    # The first entry is always different. The remaining shared entries should be dedicated copies.
    assert cloned1[1:] != cloned2[1:]

    assert ContentStream(stream=array1, pdf=None).get_data() == b"Hello World!\nShared stream 0\nShared stream 2\n"
    assert ContentStream(stream=array2, pdf=None).get_data() == b"Lorem ipsum!\nShared stream 0\nShared stream 2\n"
    assert (
        ContentStream(stream=cloned1, pdf=None).get_data() ==
        b"Hello World!\nShared stream 0\nShared stream 1\nShared stream 2\n"
    )
    assert (
        ContentStream(stream=cloned2, pdf=None).get_data() ==
        b"Lorem ipsum!\nShared stream 0\nShared stream 1\nShared stream 2\n"
    )
def limit_virtual_memory() -> None: limit_kb = 1_000_000 limit_bytes = limit_kb * 1024 resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes)) return pdf_path_str, env, limit_virtual_memory @pytest.mark.enable_socket @pytest.mark.skipif(condition=resource is None, reason="Does not have 'resource' module.") @pytest.mark.skipif(sys.platform == "darwin", reason="RLIMIT_AS is unreliable.") def test_dictionary_object__read_from_stream__no_limit(tmp_path): pdf_path_str, env, limit_virtual_memory = _prepare_test_dictionary_object__read_from_stream__no_limit(tmp_path) source_file = tmp_path / "script.py" source_file.write_text( f""" import sys from pypdf import filters, PdfReader filters.MAX_DECLARED_STREAM_LENGTH = sys.maxsize with open({pdf_path_str!r}, mode="rb") as fd: reader = PdfReader(fd) print(reader.pages[0].extract_text()) """ ) result = subprocess.run( # noqa: S603 # We have the control here. [sys.executable, source_file], capture_output=True, env=env, text=True, preexec_fn=limit_virtual_memory, ) assert result.returncode == 1 assert result.stdout == "" assert result.stderr.replace("\r", "").endswith("\nMemoryError\n") @pytest.mark.enable_socket @pytest.mark.skipif(condition=resource is None, reason="Does not have 'resource' module.") @pytest.mark.skipif(sys.platform == "darwin", reason="RLIMIT_AS is unreliable.") def test_dictionary_object__read_from_stream__no_limit__path(tmp_path): pdf_path_str, env, limit_virtual_memory = _prepare_test_dictionary_object__read_from_stream__no_limit(tmp_path) source_file = tmp_path / "script.py" source_file.write_text( f""" import sys from pypdf import filters, PdfReader filters.MAX_DECLARED_STREAM_LENGTH = sys.maxsize reader = PdfReader({pdf_path_str!r}) print(reader.pages[0].extract_text()) """ ) result = subprocess.run( # noqa: S603 # We have the control here. 
def _get_array_based_buffer(stream_count: int, chunk_bytes: int) -> BytesIO:
    """
    Write a single-page PDF whose ``/Contents`` entry is an array of content streams.

    Args:
        stream_count: number of content streams inside the array
        chunk_bytes: number of ``A`` filler bytes inside each of the streams

    Returns:
        The buffer the document has been written to

    """
    writer = PdfWriter()
    page = writer.add_blank_page(width=10, height=10)

    streams = [ContentStream(stream=None, pdf=writer) for _ in range(stream_count)]
    # All streams carry the same data.
    chunk = b"q\n" + (b"A" * chunk_bytes) + b"\nQ\n"
    for stream in streams:
        stream.set_data(chunk)

    contents = ArrayObject([writer._add_object(stream) for stream in streams])
    page[NameObject("/Contents")] = contents

    buffer = BytesIO()
    writer.write(buffer)
    buffer.flush()
    return buffer
pypdf.generic import ( ArrayObject, ByteStringObject, DictionaryObject, EmbeddedFile, IndirectObject, NameObject, NullObject, NumberObject, TextStringObject, create_string_object, ) from tests import SAMPLE_ROOT, get_data_from_url PDFATTACH_BINARY = shutil.which("pdfattach") @pytest.mark.skipif(PDFATTACH_BINARY is None, reason="Requires poppler-utils") def test_embedded_file__basic(tmpdir): clean_path = SAMPLE_ROOT / "002-trivial-libre-office-writer" / "002-trivial-libre-office-writer.pdf" attached_path = tmpdir / "attached.pdf" file_path = tmpdir / "test.txt" file_path.write_binary(b"Hello World\n") subprocess.run([PDFATTACH_BINARY, clean_path, file_path, attached_path]) # noqa: S603 with PdfReader(str(attached_path)) as reader: attachment = next(iter(EmbeddedFile._load(reader.root_object))) assert attachment.name == "test.txt" assert attachment.alternative_name == "test.txt" assert attachment.description is None assert attachment.associated_file_relationship == AFRelationship.UNSPECIFIED assert attachment.subtype is None assert attachment.content == b"Hello World\n" assert attachment.size == 12 assert attachment.creation_date is None assert attachment.modification_date is None assert attachment.checksum is None assert repr(attachment) == "" def test_embedded_file__artificial(): # No alternative name. pdf_object = DictionaryObject(answer=42) attachment = EmbeddedFile(name="dummy", pdf_object=pdf_object) assert attachment.alternative_name is None # No /EF. with pytest.raises(PdfReadError, match=f"/EF entry not found: {pdf_object}"): _ = attachment._embedded_file # Empty /EF dictionary. pdf_object = DictionaryObject() pdf_object[NameObject("/EF")] = DictionaryObject() attachment = EmbeddedFile(name="dummy", pdf_object=pdf_object) with pytest.raises(PdfReadError, match=r"No /\(U\)F key found in file dictionary: {}"): _ = attachment._embedded_file # Missing /Params key. 
pdf_object[NameObject("/EF")] = DictionaryObject() pdf_object[NameObject("/EF")][NameObject("/F")] = DictionaryObject(answer=42) assert attachment._params == DictionaryObject() # An actual checksum is set. # Generated using `hashlib.md5(b"Hello World!\n").digest()` params = DictionaryObject() params[NameObject("/CheckSum")] = ByteStringObject(b"\x8d\xdd\x8b\xe4\xb1y\xa5)\xaf\xa5\xf2\xff\xaeK\x98X") pdf_object[NameObject("/EF")][NameObject("/F")][NameObject("/Params")] = params assert attachment.checksum == b"\x8d\xdd\x8b\xe4\xb1y\xa5)\xaf\xa5\xf2\xff\xaeK\x98X" @pytest.mark.enable_socket def test_embedded_file__kids(): # Generated using the instructions available from # https://medium.com/@pymupdf/zugferd-and-ghostscript-how-to-create-industry-standard-and-compliant-pdf-e-invoices-83c9fde31ee5 # Notes: # * Yes, we need the full paths. Otherwise, the output file will only have an empty page. # * The XML file has been a custom basic text file. # * The input PDF file has been the `002-trivial-libre-office-writer.pdf` file. 
url = "https://github.com/user-attachments/files/18691309/embedded_files_kids.pdf" name = "embedded_files_kids.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) attachments = list(EmbeddedFile._load(reader.root_object)) assert len(attachments) == 1 attachment = attachments[0] assert attachment.name == "factur-x.xml" assert attachment.alternative_name == "factur-x.xml" assert attachment.description == "ZUGFeRD electronic invoice" assert attachment.associated_file_relationship == AFRelationship.ALTERNATIVE assert attachment.subtype == "/text/xml" assert attachment.content.startswith(b"Hello World!\n\nLorem ipsum dolor sit amet, ") assert attachment.content.endswith(b"\ntakimata sanctus est Lorem ipsum dolor sit amet.\n") assert attachment.size == 606 assert attachment.creation_date is None assert attachment.modification_date == datetime.datetime( 2013, 1, 21, 8, 14, 33, tzinfo=datetime.timezone(datetime.timedelta(hours=1)) ) assert attachment.checksum is None assert repr(attachment) == "" # No /Names in /Kids. 
def test_embedded_file__name_is_read_only():
    """Assigning a new value to the ``name`` of an attachment raises ``AttributeError``."""
    writer = PdfWriter()
    embedded_file = writer.add_attachment("test.txt", b"content")

    # The name passed on creation can be read back.
    assert embedded_file.name == "test.txt"

    with pytest.raises(AttributeError):
        embedded_file.name = "new_name.txt"
embedded_file.alternative_name is None pdf_string = TextStringObject("PDF String") embedded_file.alternative_name = pdf_string assert embedded_file.alternative_name == "PDF String" def test_embedded_file__alternative_name__uf_key_only(): writer = PdfWriter() embedded_file = writer.add_attachment("test.txt", b"content") embedded_file.pdf_object[NameObject("/UF")] = create_string_object("original_uf") del embedded_file.pdf_object[NameObject("/F")] assert NameObject("/UF") in embedded_file.pdf_object assert NameObject("/F") not in embedded_file.pdf_object embedded_file.alternative_name = None assert embedded_file.pdf_object[NameObject("/UF")] == NullObject() assert NameObject("/F") not in embedded_file.pdf_object embedded_file.alternative_name = TextStringObject("new_uf") assert embedded_file.pdf_object[NameObject("/UF")] == create_string_object("new_uf") assert embedded_file.pdf_object[NameObject("/F")] == create_string_object("new_uf") def test_embedded_file__alternative_name__f_key_only(): writer = PdfWriter() embedded_file = writer.add_attachment("test.txt", b"content") embedded_file.pdf_object[NameObject("/F")] = create_string_object("original_f") if NameObject("/UF") in embedded_file.pdf_object: del embedded_file.pdf_object[NameObject("/UF")] assert NameObject("/F") in embedded_file.pdf_object assert NameObject("/UF") not in embedded_file.pdf_object embedded_file.alternative_name = None assert embedded_file.pdf_object[NameObject("/F")] == NullObject() assert NameObject("/UF") not in embedded_file.pdf_object embedded_file.alternative_name = TextStringObject("new_f") assert embedded_file.pdf_object[NameObject("/F")] == create_string_object("new_f") assert embedded_file.pdf_object[NameObject("/UF")] == create_string_object("new_f") def test_embedded_file__alternative_name__both_f_and_uf(): writer = PdfWriter() embedded_file = writer.add_attachment("test.txt", b"content") embedded_file.pdf_object[NameObject("/F")] = create_string_object("original_f") 
def test_embedded_file__description_setter():
    """Set, clear and set again the ``description`` of an attachment."""
    writer = PdfWriter()
    embedded_file = writer.add_attachment("test.txt", b"content")

    # The getter compares equal to the plain string value.
    embedded_file.description = TextStringObject("Test Description")
    assert embedded_file.description == "Test Description"

    # Clearing keeps the /Desc key and stores a null object there.
    embedded_file.description = None
    assert embedded_file.pdf_object[NameObject("/Desc")] == NullObject()

    # A new value can be assigned after clearing.
    pdf_string = TextStringObject("PDF Description")
    embedded_file.description = pdf_string
    assert embedded_file.description == "PDF Description"
embedded_file.content = "Lorem ipsum dolor sit amet" assert embedded_file.content == b"Lorem ipsum dolor sit amet" def test_embedded_file__size_setter(): writer = PdfWriter() embedded_file = writer.add_attachment("test.txt", b"content") embedded_file.size = NumberObject(1024) assert embedded_file.size == 1024 embedded_file.size = None assert embedded_file._ensure_params[NameObject("/Size")] == NullObject() num_obj = NumberObject(2048) embedded_file.size = num_obj assert embedded_file.size == 2048 def test_embedded_file__size_getter(): writer = PdfWriter() embedded_file = writer.add_attachment("test.txt", b"content") embedded_file._ensure_params[NameObject("/Size")] = NullObject() assert embedded_file.size is None embedded_file._ensure_params[NameObject("/Size")] = NumberObject(4096) retrieved_size = embedded_file.size assert retrieved_size == 4096 def test_embedded_file__creation_date_setter(): writer = PdfWriter() embedded_file = writer.add_attachment("test.txt", b"content") test_date = datetime.datetime(2023, 1, 1, 12, 0, 0) embedded_file.creation_date = test_date assert embedded_file.creation_date == test_date embedded_file.creation_date = None assert embedded_file._ensure_params[NameObject("/CreationDate")] == NullObject() def test_embedded_file__modification_date_setter(): writer = PdfWriter() embedded_file = writer.add_attachment("test.txt", b"content") test_date = datetime.datetime(2023, 1, 2, 12, 0, 0) embedded_file.modification_date = test_date assert embedded_file.modification_date == test_date embedded_file.modification_date = None assert embedded_file._ensure_params[NameObject("/ModDate")] == NullObject() def test_embedded_file__checksum_setter(): writer = PdfWriter() embedded_file = writer.add_attachment("test.txt", b"content") checksum_bytes = ByteStringObject(b"checksum_value") embedded_file.checksum = checksum_bytes assert embedded_file.checksum == b"checksum_value" embedded_file.checksum = None assert 
embedded_file._ensure_params[NameObject("/CheckSum")] == NullObject() byte_string = ByteStringObject(b"pdf_checksum") embedded_file.checksum = byte_string assert embedded_file.checksum == b"pdf_checksum" def test_embedded_file__associated_file_relationship_setter(): writer = PdfWriter() embedded_file = writer.add_attachment("test.txt", b"content") embedded_file.associated_file_relationship = NameObject("/Data") assert embedded_file.associated_file_relationship == "/Data" def test_embedded_file__setters_integration(): writer = PdfWriter() writer.add_blank_page(100, 100) embedded_file = writer.add_attachment("test.txt", b"Hello, World!") embedded_file.alternative_name = TextStringObject("Alternative Name") embedded_file.description = TextStringObject("Test Description") embedded_file.subtype = NameObject("/text/plain") embedded_file.size = NumberObject(13) creation_date = datetime.datetime(2023, 1, 1, 12, 0, 0) embedded_file.creation_date = creation_date modification_date = datetime.datetime(2023, 1, 2, 12, 0, 0) embedded_file.modification_date = modification_date embedded_file.checksum = ByteStringObject(b"checksum123") embedded_file.associated_file_relationship = NameObject(AFRelationship.DATA) # Make sure that this is an indirect object for PDF/A-3 compliance. 
assert embedded_file.pdf_object.indirect_reference == IndirectObject(6, 0, writer) pdf_bytes = BytesIO() writer.write(pdf_bytes) reader = PdfReader(pdf_bytes) assert "test.txt" in reader.attachments def test_embedded_file__null_object_handling(): writer = PdfWriter() embedded_file = writer.add_attachment("test.txt", b"content") embedded_file.alternative_name = TextStringObject("Name") embedded_file.description = TextStringObject("Description") embedded_file.subtype = NameObject("/text/plain") embedded_file.size = NumberObject(1024) embedded_file.checksum = ByteStringObject(b"checksum") embedded_file.alternative_name = None embedded_file.description = None embedded_file.subtype = None embedded_file.size = None embedded_file.checksum = None assert embedded_file.alternative_name is None assert embedded_file.description is None assert embedded_file.subtype is None assert embedded_file.size is None assert embedded_file.checksum is None def test_embedded_file__delete_without_parent(): attachment = EmbeddedFile(name="test.txt", pdf_object=DictionaryObject()) with pytest.raises(PyPdfError, match=r"^Parent required to delete file from document\.$"): attachment.delete() def test_embedded_file__delete_known(): writer = PdfWriter() writer.add_blank_page(100, 100) attachment = writer.add_attachment("test.txt", b"content") writer.add_attachment("test2.txt", b"content2") attachments = list(writer.attachment_list) assert len(attachments) == 2 attachment.delete() with pytest.raises(PdfReadError, match=r"^/EF entry not found: {}$"): _ = attachment.content attachments = list(writer.attachment_list) assert len(attachments) == 1 assert attachments[0].name == "test2.txt" # Delete second time. 
with pytest.raises(PyPdfError, match=r"^File not found in parent object\.$"): attachment.delete() def test_embedded_file__delete__no_indirect_reference(): writer = PdfWriter() writer.add_blank_page(100, 100) # Add an attachment and replace the indirect reference in the name tree # by the dictionary itself. This is how pypdf <= 6.1.0 would embed files # and thus should be supported as well. embedded_file = writer.add_attachment("test.txt", b"Hello, World!") assert embedded_file.pdf_object.indirect_reference == IndirectObject(6, 0, writer) embedded_file._parent[-1] = embedded_file.pdf_object.get_object() embedded_file.delete() attachments = list(writer.attachment_list) assert len(attachments) == 0 @pytest.mark.enable_socket def test_embedded_file__create__kids_based_name_tree(): """Test for issue #3473.""" url = "https://github.com/user-attachments/files/18691309/embedded_files_kids.pdf" name = "embedded_files_kids.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) writer.add_attachment("test.pdf", b"content") assert dict(writer.attachments) == { "factur-x.xml": [ ( b"Hello World!\n\nLorem ipsum dolor sit amet, consetetur sad" b"ipscing elitr, sed diam nonumy eirmod tempor\ninvidunt ut" b" labore et dolore magna aliquyam erat, sed diam voluptua" b". At vero eos et accusam\net justo duo dolores et ea rebu" b"m. Stet clita kasd gubergren, no sea takimata sanctus es" b"t Lorem\nipsum dolor sit amet. Lorem ipsum dolor sit amet" b", consetetur sadipscing elitr, sed diam\nnonumy eirmod te" b"mpor invidunt ut labore et dolore magna aliquyam erat, s" b"ed diam voluptua.\nAt vero eos et accusam et justo duo do" b"lores et ea rebum. 
Stet clita kasd gubergren, no sea\ntak" b"imata sanctus est Lorem ipsum dolor sit amet.\n" ) ], "test.pdf": [b"content"] } attachments = list(writer.attachment_list) assert len(attachments) == 2 assert writer.root_object["/Names"]["/EmbeddedFiles"]["/Names"] == [ "factur-x.xml", attachments[0].pdf_object.indirect_reference, "test.pdf", attachments[1].pdf_object.indirect_reference, ] def test_embedded_file__create__neither_kids_nor_names(): writer = PdfWriter() writer.add_blank_page(100, 100) # Add an attachment and remove the corresponding /Names key. writer.add_attachment("test.txt", b"Hello, World!") del writer.root_object["/Names"]["/EmbeddedFiles"]["/Names"] with pytest.raises(expected_exception=PdfReadError, match=r"^Got neither Names nor Kids in embedded files tree\.$"): writer.add_attachment("test2.txt", b"content2") def test_embedded_file__get_insertion_index(): # Empty list. assert EmbeddedFile._get_insertion_index(ArrayObject(), "test.txt") == 0 # One mismatching entry. assert EmbeddedFile._get_insertion_index( ArrayObject([TextStringObject("dummy.txt"), NullObject()]), "test.txt" ) == 2 assert EmbeddedFile._get_insertion_index( ArrayObject([TextStringObject("xxx.txt"), NullObject()]), "test.txt" ) == 0 # Multiple entries. assert EmbeddedFile._get_insertion_index( ArrayObject([TextStringObject("dummy.txt"), NullObject(), TextStringObject("xxx.txt"), NullObject()]), "test.txt" ) == 2 assert EmbeddedFile._get_insertion_index( ArrayObject([TextStringObject("xxx.txt"), NullObject(), TextStringObject("yyy.txt"), NullObject()]), "test.txt" ) == 0 assert EmbeddedFile._get_insertion_index( ArrayObject([TextStringObject("aaa.txt"), NullObject(), TextStringObject("bbb.txt"), NullObject()]), "test.txt" ) == 4 assert EmbeddedFile._get_insertion_index( ArrayObject([ TextStringObject("aaa.txt"), NullObject(), TextStringObject("test.txt"), NullObject(), TextStringObject("zzz.txt"), NullObject() ]), "test.txt" ) == 4 # Length. 
assert EmbeddedFile._get_insertion_index( ArrayObject([TextStringObject("a"), NullObject()]), "aa" ) == 2 assert EmbeddedFile._get_insertion_index( ArrayObject([TextStringObject("a"), NullObject()]), "a" ) == 2 assert EmbeddedFile._get_insertion_index( ArrayObject([TextStringObject("aaa"), NullObject()]), "aa" ) == 0 # Special characters. assert EmbeddedFile._get_insertion_index( ArrayObject([TextStringObject("café"), NullObject()]), "cafe" ) == 0 assert EmbeddedFile._get_insertion_index( ArrayObject([TextStringObject("Tun"), NullObject()]), "Tür" ) == 2 def test_embedded_file__order(): writer = PdfWriter() writer.add_blank_page(100, 100) attachment1 = writer.add_attachment("test.txt", "content") attachment2 = writer.add_attachment("abc.txt", "content") attachment3 = writer.add_attachment("xyz.txt", "content") attachment4 = writer.add_attachment("test.txt", "content2") assert dict(writer.attachments) == { "abc.txt": [b"content"], "test.txt": [b"content", b"content2"], "xyz.txt": [b"content"] } assert writer.root_object["/Names"]["/EmbeddedFiles"]["/Names"] == [ "abc.txt", attachment2.pdf_object.indirect_reference, "test.txt", attachment1.pdf_object.indirect_reference, "test.txt", attachment4.pdf_object.indirect_reference, "xyz.txt", attachment3.pdf_object.indirect_reference, ] ================================================ FILE: tests/generic/test_image_inline.py ================================================ """Test the pypdf.generic._image_inline module.""" from io import BytesIO import pytest from pypdf import PdfReader from pypdf.errors import PdfReadError from pypdf.generic._image_inline import is_followed_by_binary_data from tests import get_data_from_url def test_is_followed_by_binary_data(): # Empty/too short stream. stream = BytesIO() assert not is_followed_by_binary_data(stream) stream = BytesIO(b" q\n") assert not is_followed_by_binary_data(stream) # byte < 32 and no whitespace. 
stream = BytesIO(b"\x00\x11\x13\x37") assert is_followed_by_binary_data(stream) assert stream.read(1) == b"\x00" assert is_followed_by_binary_data(stream) assert stream.read(1) == b"\x11" assert is_followed_by_binary_data(stream) assert stream.read() == b"\x13\x37" # byte < 32, but whitespace. stream = BytesIO(b" q\n") assert not is_followed_by_binary_data(stream) # Whitespace only. stream = BytesIO(b" \n\n\n \n") assert not is_followed_by_binary_data(stream) # No `operator_end`. stream = BytesIO(b"\n\n\n\n\n\n\n\nBT\n") assert not is_followed_by_binary_data(stream) # Operator length is <= 3. stream = BytesIO(b"\n\n\n\n\n\n\nBT\n") assert not is_followed_by_binary_data(stream) # Operator length is > 3. stream = BytesIO(b"\n\n\n\n\nTEST\n") assert is_followed_by_binary_data(stream) # Just characters. stream = BytesIO(b" ABCDEF") assert is_followed_by_binary_data(stream) # No `operator_start`. stream = BytesIO(b"ABCDEFG") assert is_followed_by_binary_data(stream) # Name object. stream = BytesIO(b"/R10 gs\n/R12 cs\n") assert not is_followed_by_binary_data(stream) # Numbers. 
stream = BytesIO(b"1337 42 m\n") assert not is_followed_by_binary_data(stream) stream = BytesIO(b"1234.56 42 13 37 10 20 c\n") assert not is_followed_by_binary_data(stream) @pytest.mark.enable_socket def test_extract_inline_dct__early_end_of_file(): url = "https://github.com/user-attachments/files/23056988/inline_dct__early_eof.pdf" name = "inline_dct__early_eof.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] with pytest.raises(expected_exception=PdfReadError, match=r"^Unexpected end of stream$"): page.images[0].image.load() @pytest.mark.enable_socket def test_extract_inline_dct__multiple_eod(): url = "https://github.com/user-attachments/files/23900687/cedolini_esempio-1.pdf" name = "issue3517.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for page in reader.pages: for image in page.images: _ = image.image.load() ================================================ FILE: tests/generic/test_image_xobject.py ================================================ """Test the pypdf.generic._image_xobject module.""" from io import BytesIO import pytest from PIL import Image from pypdf import PdfReader from pypdf._utils import Version from pypdf.constants import FilterTypes, ImageAttributes, StreamAttributes from pypdf.errors import EmptyImageDataError, PdfReadError from pypdf.generic import ArrayObject, DecodedStreamObject, NameObject, NumberObject, StreamObject, TextStringObject from pypdf.generic._image_xobject import _extended_image_from_bytes, _handle_flate, _xobj_to_image from .. import RESOURCE_ROOT, get_data_from_url from ..utils import get_image_data @pytest.mark.enable_socket def test_get_imagemode_recursion_depth(): """Avoid infinite recursion for nested color spaces.""" url = "https://github.com/py-pdf/pypdf/files/12814018/out1.pdf" name = "issue2240.pdf" # Simple example: Just let the color space object reference itself. # The alternative would be to generate a chain of referencing objects. 
content = get_data_from_url(url, name=name) source = b"\n10 0 obj\n[ /DeviceN [ /HKS#2044#20K /Magenta /Yellow /Black ] 7 0 R 11 0 R 12 0 R ]\nendobj\n" target = b"\n10 0 obj\n[ /DeviceN [ /HKS#2044#20K /Magenta /Yellow /Black ] 10 0 R 11 0 R 12 0 R ]\nendobj\n" reader = PdfReader(BytesIO(content.replace(source, target))) with pytest.raises( PdfReadError, match=r"Color spaces nested too deeply\. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH\.", ): reader.pages[0].images[0] def test_handle_flate__image_mode_1(caplog): data = b"\x00\xe0\x00" lookup = DecodedStreamObject() expected_data = ( (66, 66, 66), (66, 66, 66), (66, 66, 66), (0, 19, 55), (0, 19, 55), (0, 19, 55), (66, 66, 66), (66, 66, 66), (66, 66, 66), ) # No trailing data. lookup.set_data(b"\x42\x42\x42\x00\x13\x37") result = _handle_flate( size=(3, 3), data=data, mode="1", color_space=ArrayObject( [NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup] ), colors=2, obj_as_text="dummy", ) assert expected_data == get_image_data(result[0]) assert not caplog.text # Trailing whitespace. lookup.set_data(b"\x42\x42\x42\x00\x13\x37 \x0a") result = _handle_flate( size=(3, 3), data=data, mode="1", color_space=ArrayObject( [NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup] ), colors=2, obj_as_text="dummy", ) assert expected_data == get_image_data(result[0]) assert not caplog.text # Trailing non-whitespace character. lookup.set_data(b"\x42\x42\x42\x00\x13\x37\x12") result = _handle_flate( size=(3, 3), data=data, mode="1", color_space=ArrayObject( [ NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup, ] ), colors=2, obj_as_text="dummy", ) assert expected_data == get_image_data(result[0]) assert "Too many lookup values: Expected 6, got 7." in caplog.text # Not enough lookup data. # `\xe0` of the original input (the middle part) does not use `0x37 = 55` for the lookup # here, but received a custom padding of `0`. 
lookup.set_data(b"\x42\x42\x42\x00\x13") caplog.clear() expected_short_data = tuple([entry if entry[0] == 66 else (0, 19, 0) for entry in expected_data]) result = _handle_flate( size=(3, 3), data=data, mode="1", color_space=ArrayObject( [ NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup, ] ), colors=2, obj_as_text="dummy", ) assert expected_short_data == get_image_data(result[0]) assert "Not enough lookup values: Expected 6, got 5." in caplog.text def test_extended_image_frombytes_zero_data(): mode = "RGB" size = (1, 1) data = b"" with pytest.raises(EmptyImageDataError, match=r"Data is 0 bytes, cannot process an image from empty data\."): _extended_image_from_bytes(mode, size, data) def test_handle_flate__autodesk_indexed(): reader = PdfReader(RESOURCE_ROOT / "AutoCad_Diagram.pdf") page = reader.pages[0] for name, image in page.images.items(): assert name.startswith("/") image.image.load() data = RESOURCE_ROOT.joinpath("AutoCad_Diagram.pdf").read_bytes() data = data.replace(b"/DeviceRGB\x00255", b"/DeviceRGB") reader = PdfReader(BytesIO(data)) page = reader.pages[0] with pytest.raises( PdfReadError, match=r"^Expected color space with 4 values, got 3: \['/Indexed', '/DeviceRGB', '\\x00\\x80\\x00\\x80\\x80耀" # noqa: E501 ): for name, _image in page.images.items(): # noqa: PERF102 assert name.startswith("/") @pytest.mark.enable_socket def test_get_mode_and_invert_color(): url = "https://github.com/user-attachments/files/18381726/tika-957721.pdf" name = "tika-957721.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[12] for _name, image in page.images.items(): # noqa: PERF102 image.image.load() @pytest.mark.enable_socket def test_get_imagemode__empty_array(): url = "https://github.com/user-attachments/files/23050451/poc.pdf" name = "issue3499.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] with pytest.raises(expected_exception=PdfReadError, match=r"^ColorSpace field 
not found in .+"): page.images[0].image.load() def test_p_image_with_alpha_mask(): # Generate the base image. Use TIFF as this is easy to do on the fly. image = Image.new(mode="P", size=(10, 10), color=0) image_data = BytesIO() image.save(image_data, format="tiff") # Set the common values. x_object = StreamObject() mask_object = StreamObject() for obj in [x_object, mask_object]: obj[NameObject(ImageAttributes.WIDTH)] = NumberObject(image.width) obj[NameObject(ImageAttributes.HEIGHT)] = NumberObject(image.height) obj[NameObject(StreamAttributes.FILTER)] = NameObject(FilterTypes.CCITT_FAX_DECODE) # Set the basic image data. x_object.set_data(image_data.getvalue()) x_object[NameObject(ImageAttributes.COLOR_SPACE)] = TextStringObject("palette") # Generate the mask image. Will be a diagonal white stripe. image = Image.new(mode="1", size=(image.width, image.height)) [image.putpixel((i, i), 1) for i in range(10)] image_data = BytesIO() image.save(image_data, format="tiff") # Set the mask data. mask_object.set_data(image_data.getvalue()) mask_object[NameObject(ImageAttributes.COLOR_SPACE)] = TextStringObject("1bit") # Add the mask to the image. x_object[NameObject("/SMask")] = mask_object # Generate the output image and make sure that the diagonal stripe is present. 
extension, data, image = _xobj_to_image(x_object) assert extension == ".png" assert data.startswith(b"\x89PNG") for i in range(10): for j in range(10): assert image.getpixel((i, j)) == (0, 0, 0, 255 * (i == j)) @pytest.mark.enable_socket def test_handle_flate__icc_based__image_mode_1(): url = "https://github.com/user-attachments/files/23756943/pypdf_bug_3534_iccbased.pdf" name = "issue3534.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] image = page.images[0].image assert image is not None image.load() assert image.size == (64, 64) assert image.mode == "1" for y in range(64): for x in range(64): # Determine which chess square this pixel belongs to square_x = x // 8 square_y = y // 8 is_black_square = (square_x + square_y) % 2 == 1 assert image.getpixel((x, y)) == 255 * int(not is_black_square) @pytest.mark.skipif( condition=Version(Image.__version__) < Version("12.1.0"), reason="Unsuitable Pillow version." ) def test_handle_jpx__explicit_decode(): stream = StreamObject() stream[NameObject("/BitsPerComponent")] = NumberObject(8) stream[NameObject("/ColorSpace")] = NameObject("/DeviceCMYK") stream[NameObject("/Decode")] = ArrayObject([1, 0, 1, 0, 1, 0, 1, 0]) stream[NameObject("/Filter")] = NameObject("/JPXDecode") stream[NameObject("/Height")] = NumberObject(16) stream[NameObject("/Width")] = NumberObject(16) image = Image.new(mode="CMYK", size=(16, 16)) [image.putpixel((i, i), 255) for i in range(16)] image_data = BytesIO() image.save(image_data, format="JPEG2000") stream.set_data(image_data.getvalue()) image.save(image_data, format="JPEG2000") result = _xobj_to_image(x_object=stream)[2] for y in range(16): for x in range(16): assert result.getpixel((x, y)) == (255 * (x != y), 255, 255, 255), (x, y) assert image.getpixel((x, y)) == (255 * (x == y), 0, 0, 0), (x, y) ================================================ FILE: tests/generic/test_link.py ================================================ """Test the 
pypdf.generic._link module.""" from io import BytesIO import pytest from pypdf import PageObject, PdfReader, PdfWriter from pypdf.generic import ArrayObject, NameObject, NullObject, extract_links from tests import get_data_from_url @pytest.mark.enable_socket def test_extract_links__null_object_in_old_page(): url = "https://github.com/user-attachments/files/25507697/sample.pdf" name = "issue3656.pdf" reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name))) writer = PdfWriter() writer.append(reader) def test_extract_links(caplog): page1 = PageObject() page2 = PageObject() # No annotations. assert extract_links(page1, page2) == [] assert caplog.messages == [] # Only old annotations. page1[NameObject("/Annots")] = NullObject() assert extract_links(page1, page2) == [] assert caplog.messages == [] caplog.clear() page1[NameObject("/Annots")] = ArrayObject([NullObject()]) assert extract_links(page1, page2) == [] assert caplog.messages == [] caplog.clear() # Both old and new annotations. page2[NameObject("/Annots")] = ArrayObject([NullObject()]) assert extract_links(page1, page2) == [] assert caplog.messages == [] # Same size. caplog.clear() page2[NameObject("/Annots")] = NullObject() assert extract_links(page1, page2) == [] assert caplog.messages == [] caplog.clear() # Only new annotations. 
del page1[NameObject("/Annots")] page2[NameObject("/Annots")] = ArrayObject([NullObject()]) assert extract_links(page1, page2) == [] assert caplog.messages == [] ================================================ FILE: tests/scripts/__init__.py ================================================ ================================================ FILE: tests/scripts/data/commits__version_4_0_1.json ================================================ [ { "sha": "b7bfd0d7eddfd0865a94cc9e7027df6596242cf7", "node_id": "C_kwDOAC-ZndoAKGI3YmZkMGQ3ZWRkZmQwODY1YTk0Y2M5ZTcwMjdkZjY1OTYyNDJjZjc", "commit": { "author": { "name": "rsinger417", "email": "159086296+rsinger417@users.noreply.github.com", "date": "2024-02-13T21:42:56Z" }, "committer": { "name": "GitHub", "email": "noreply@github.com", "date": "2024-02-13T21:42:56Z" }, "message": "BUG: Use NumberObject for /Border elements of annotations (#2451)\n\nAs defined in Table 164 – Entries common to all annotation dictionaries, the /Border Array consists of NumberObjects. 
Previously, pypdf used NameObject which is wrong.\r\n\r\nThe previous version caused a warning in the class NameObject: \"Incorrect first char in NameObject:({self})\".\r\n\r\nFixes #2444", "tree": { "sha": "e75b96ca1bb3e60a696bd57c5bb5aac9e7c5651b", "url": "https://api.github.com/repos/py-pdf/pypdf/git/trees/e75b96ca1bb3e60a696bd57c5bb5aac9e7c5651b" }, "url": "https://api.github.com/repos/py-pdf/pypdf/git/commits/b7bfd0d7eddfd0865a94cc9e7027df6596242cf7", "comment_count": 0, "verification": { "verified": true, "reason": "valid", "signature": "-----BEGIN PGP SIGNATURE-----\n\nwsFcBAABCAAQBQJly+JgCRC1aQ7uu5UhlAAA56IQAKmAJws3vIKAR1dTx6e8fgp+\nWFMSZub7HGMi0Wz6MKF+hp1fxTNheBwlWVvVHuIGggtg9QSFkpHmi5Qqn+IUHJq1\nI5Jst6Il3lCF2UXUboyN+XbS/lo6rXHriz+Yi7Xwgj+JulHnruFvFEU40AdKnI1w\n88Wh94KXEJqQ6nyP4R2qpDLLlhQ0/4FTIZCWfw8XK1vPmTQwP0ZroL5N7s1pq2s9\nBBDtvcxTE1EbWIyyMzAiNByxdaTakqNLRMq80saiArR4t6f1H4v8dgYep/6R5dxU\n2GGXjh6JOS6xNObrSNvuFanrgAxZoft255OGsU5Y/2yxryp+Bs1QO/PXYXch2ERN\nXyYQKxp886PRcL1vGukksqx5t8Oc781z7RHV/QCIJ5Ry66vC7zDmkk2+Eq6gwWMr\nHzTg3eQ2DL+I4CsNIezb470UOKdIWu9SdQmOrGeUAnQ0rB0V7VOe9n1buPmMP/e0\ngXcq/BNFaNWTCyIHv1XgB6G516k4zM1F5j1BF0GCrhrdX8lXMZUB+WI3V8CsRObI\naKdnE9aGBJPZCFN5O+92ntKt7tUQQLmLNPDgYZktzBg73ejRlpQ4zOBRBFiSNfPj\nRrNzBn1LFtSLxc7/MsP1lLl5NtI0oLWrZMjym3CcAJQYmqsZCv22b94R4hSorAwW\nGr2/wRH3JQVIToDU1/W4\n=JYMh\n-----END PGP SIGNATURE-----\n", "payload": "tree e75b96ca1bb3e60a696bd57c5bb5aac9e7c5651b\nparent 8cacb0fc8fee9920b0515d1289e6ee8191eb3f21\nauthor rsinger417 <159086296+rsinger417@users.noreply.github.com> 1707860576 -0600\ncommitter GitHub 1707860576 +0100\n\nBUG: Use NumberObject for /Border elements of annotations (#2451)\n\nAs defined in Table 164 – Entries common to all annotation dictionaries, the /Border Array consists of NumberObjects. 
Previously, pypdf used NameObject which is wrong.\r\n\r\nThe previous version caused a warning in the class NameObject: \"Incorrect first char in NameObject:({self})\".\r\n\r\nFixes #2444" } }, "url": "https://api.github.com/repos/py-pdf/pypdf/commits/b7bfd0d7eddfd0865a94cc9e7027df6596242cf7", "html_url": "https://github.com/py-pdf/pypdf/commit/b7bfd0d7eddfd0865a94cc9e7027df6596242cf7", "comments_url": "https://api.github.com/repos/py-pdf/pypdf/commits/b7bfd0d7eddfd0865a94cc9e7027df6596242cf7/comments", "author": { "login": "rsinger417", "id": 159086296, "node_id": "U_kgDOCXt22A", "avatar_url": "https://avatars.githubusercontent.com/u/159086296?v=4", "gravatar_id": "", "url": "https://api.github.com/users/rsinger417", "html_url": "https://github.com/rsinger417", "followers_url": "https://api.github.com/users/rsinger417/followers", "following_url": "https://api.github.com/users/rsinger417/following{/other_user}", "gists_url": "https://api.github.com/users/rsinger417/gists{/gist_id}", "starred_url": "https://api.github.com/users/rsinger417/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/rsinger417/subscriptions", "organizations_url": "https://api.github.com/users/rsinger417/orgs", "repos_url": "https://api.github.com/users/rsinger417/repos", "events_url": "https://api.github.com/users/rsinger417/events{/privacy}", "received_events_url": "https://api.github.com/users/rsinger417/received_events", "type": "User", "site_admin": false }, "committer": { "login": "web-flow", "id": 19864447, "node_id": "MDQ6VXNlcjE5ODY0NDQ3", "avatar_url": "https://avatars.githubusercontent.com/u/19864447?v=4", "gravatar_id": "", "url": "https://api.github.com/users/web-flow", "html_url": "https://github.com/web-flow", "followers_url": "https://api.github.com/users/web-flow/followers", "following_url": "https://api.github.com/users/web-flow/following{/other_user}", "gists_url": "https://api.github.com/users/web-flow/gists{/gist_id}", "starred_url": 
"https://api.github.com/users/web-flow/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/web-flow/subscriptions", "organizations_url": "https://api.github.com/users/web-flow/orgs", "repos_url": "https://api.github.com/users/web-flow/repos", "events_url": "https://api.github.com/users/web-flow/events{/privacy}", "received_events_url": "https://api.github.com/users/web-flow/received_events", "type": "User", "site_admin": false }, "parents": [ { "sha": "8cacb0fc8fee9920b0515d1289e6ee8191eb3f21", "url": "https://api.github.com/repos/py-pdf/pypdf/commits/8cacb0fc8fee9920b0515d1289e6ee8191eb3f21", "html_url": "https://github.com/py-pdf/pypdf/commit/8cacb0fc8fee9920b0515d1289e6ee8191eb3f21" } ] }, { "sha": "8cacb0fc8fee9920b0515d1289e6ee8191eb3f21", "node_id": "C_kwDOAC-ZndoAKDhjYWNiMGZjOGZlZTk5MjBiMDUxNWQxMjg5ZTZlZTgxOTFlYjNmMjE", "commit": { "author": { "name": "Stefan", "email": "96178532+stefan6419846@users.noreply.github.com", "date": "2024-02-13T21:33:37Z" }, "committer": { "name": "GitHub", "email": "noreply@github.com", "date": "2024-02-13T21:33:37Z" }, "message": "DOC: Document easier way to update metadata (#2454)", "tree": { "sha": "79408055102933a8d62a3d1ec49df9f25fd5e963", "url": "https://api.github.com/repos/py-pdf/pypdf/git/trees/79408055102933a8d62a3d1ec49df9f25fd5e963" }, "url": "https://api.github.com/repos/py-pdf/pypdf/git/commits/8cacb0fc8fee9920b0515d1289e6ee8191eb3f21", "comment_count": 0, "verification": { "verified": true, "reason": "valid", "signature": "-----BEGIN PGP 
SIGNATURE-----\n\nwsFcBAABCAAQBQJly+AxCRC1aQ7uu5UhlAAAd2kQAB8venK8xBYafzASXTRV2ye/\nOkGIVobepYja0lKIgZpipPlmDbDnHB2UptWRMpAd7rNiL9iYnSqBNxOmCvfux99/\nqx0h9XuYzSZ1KJ6cK43ab1ErSsrjvLpO/LsMmtakzZR7BrFUjO6mIE3YuU0GmhKM\nNUPngT+A6/Lxz6Z+UwqkeylkcDj+90gNAPiKY2yr+mKmg99RI5Xqvm7j++vT3bPF\nJQmr46w0aiGW30Von0JAtu/IvprGksrfHWALFIYMHnJCaXJdv2mPJ8mwiLew/o4L\n0uicPmwnDvS7VdCObi6EKbEP4ptgierco8pAMVRpkUpnmu8ObgT7ZzPLT6iay6U1\n2Gtc0zYXlcVSo4JQW9iE9zrGMk91m+BmIOZAhJsgfdz4DewCWCBxmz4+u0wlIlzN\n6JwwZQsW3Yq/P/gJ9qxBUKPe3SAcs3jz2VG3fiOt/HzAA6YLAUPUDxnhwvWhju5i\nLiQEApEnIri4OeNhqYmOjsEI3aV/3s6jE2fEiGPDkQW61yMAAiSVgZk3BcnFwZzL\nHrf+JWTRnosPFOhkRoTH3AOzmOWOKUCCUmVdC8nKn4Sp0tp+31HIH/h3LmVflBLy\nXHwPT/6OwW1yBzueYM6LWwovNlk3AS2g19fgylOmokIkrnlmi4nCwD30hM8plEFk\ni7hsSGE/rfsjTt5lTBip\n=1RO9\n-----END PGP SIGNATURE-----\n", "payload": "tree 79408055102933a8d62a3d1ec49df9f25fd5e963\nparent 3fb63f7e3839ce39ac98978c996f3086ba230a20\nauthor Stefan <96178532+stefan6419846@users.noreply.github.com> 1707860017 +0100\ncommitter GitHub 1707860017 +0100\n\nDOC: Document easier way to update metadata (#2454)\n\n" } }, "url": "https://api.github.com/repos/py-pdf/pypdf/commits/8cacb0fc8fee9920b0515d1289e6ee8191eb3f21", "html_url": "https://github.com/py-pdf/pypdf/commit/8cacb0fc8fee9920b0515d1289e6ee8191eb3f21", "comments_url": "https://api.github.com/repos/py-pdf/pypdf/commits/8cacb0fc8fee9920b0515d1289e6ee8191eb3f21/comments", "author": { "login": "stefan6419846", "id": 96178532, "node_id": "U_kgDOBbuRZA", "avatar_url": "https://avatars.githubusercontent.com/u/96178532?v=4", "gravatar_id": "", "url": "https://api.github.com/users/stefan6419846", "html_url": "https://github.com/stefan6419846", "followers_url": "https://api.github.com/users/stefan6419846/followers", "following_url": "https://api.github.com/users/stefan6419846/following{/other_user}", "gists_url": "https://api.github.com/users/stefan6419846/gists{/gist_id}", "starred_url": "https://api.github.com/users/stefan6419846/starred{/owner}{/repo}", 
"subscriptions_url": "https://api.github.com/users/stefan6419846/subscriptions", "organizations_url": "https://api.github.com/users/stefan6419846/orgs", "repos_url": "https://api.github.com/users/stefan6419846/repos", "events_url": "https://api.github.com/users/stefan6419846/events{/privacy}", "received_events_url": "https://api.github.com/users/stefan6419846/received_events", "type": "User", "site_admin": false }, "committer": { "login": "web-flow", "id": 19864447, "node_id": "MDQ6VXNlcjE5ODY0NDQ3", "avatar_url": "https://avatars.githubusercontent.com/u/19864447?v=4", "gravatar_id": "", "url": "https://api.github.com/users/web-flow", "html_url": "https://github.com/web-flow", "followers_url": "https://api.github.com/users/web-flow/followers", "following_url": "https://api.github.com/users/web-flow/following{/other_user}", "gists_url": "https://api.github.com/users/web-flow/gists{/gist_id}", "starred_url": "https://api.github.com/users/web-flow/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/web-flow/subscriptions", "organizations_url": "https://api.github.com/users/web-flow/orgs", "repos_url": "https://api.github.com/users/web-flow/repos", "events_url": "https://api.github.com/users/web-flow/events{/privacy}", "received_events_url": "https://api.github.com/users/web-flow/received_events", "type": "User", "site_admin": false }, "parents": [ { "sha": "3fb63f7e3839ce39ac98978c996f3086ba230a20", "url": "https://api.github.com/repos/py-pdf/pypdf/commits/3fb63f7e3839ce39ac98978c996f3086ba230a20", "html_url": "https://github.com/py-pdf/pypdf/commit/3fb63f7e3839ce39ac98978c996f3086ba230a20" } ] }, { "sha": "3fb63f7e3839ce39ac98978c996f3086ba230a20", "node_id": "C_kwDOAC-ZndoAKDNmYjYzZjdlMzgzOWNlMzlhYzk4OTc4Yzk5NmYzMDg2YmEyMzBhMjA", "commit": { "author": { "name": "Stefan", "email": "96178532+stefan6419846@users.noreply.github.com", "date": "2024-02-04T20:32:49Z" }, "committer": { "name": "GitHub", "email": "noreply@github.com", "date": 
"2024-02-04T20:32:49Z" }, "message": "TST: Avoid catching not emitted warnings (#2429)\n\nFix compatibility with pytest==8. \r\n\r\nRelevant upstream change: pytest-dev/pytest#9288\r\n\r\nFixes #2427", "tree": { "sha": "c96cab2f682f6db4c84440e26869b4d9de6a2bab", "url": "https://api.github.com/repos/py-pdf/pypdf/git/trees/c96cab2f682f6db4c84440e26869b4d9de6a2bab" }, "url": "https://api.github.com/repos/py-pdf/pypdf/git/commits/3fb63f7e3839ce39ac98978c996f3086ba230a20", "comment_count": 0, "verification": { "verified": true, "reason": "valid", "signature": "-----BEGIN PGP SIGNATURE-----\n\nwsFcBAABCAAQBQJlv/RxCRC1aQ7uu5UhlAAATI8QAB/yRz+hoeJVtjW/CePJo2Jv\n451gPAo66s7JMG+PwcCiI8KAAUEusDbrJAmdq8rfqnShSB83h/7g/s5oFr/1lFyh\noKkwoeMt6hGKtwEkTpa877gAWJ4ssRb1ymJoy7quPNlbFYtKngMC60Vc5TNEY1ZX\nQ1FdIG5rVRBsA5H7fP7k0q2QC6w/Ns6nftpPFIf3JSVnillJ/RKDLhEfPw6/PMi0\nnIJ2moTgTs6uyc4R0blR44BoElPd46ot/SQDcnHEwIQlWpfa2RIpulhF8qkO9fe3\neCRBQ7TZXjedsG+Da71QKxRWRFdwPqO+HI4u5EHNLIaw8z9450jtbz5H1NhNIB1s\nkIDTMgFXxGVuFKXfneduA6TAxrrJ12ONHcrUkN30y9AQ7Qe/B8LJ50iXQvo81SwZ\nqTFBluB6WiVuMMMT0pHgNCjsAEPvaagPa10qvjVokXh1rXlzQiNwqBWCbwj2b6f4\n8i3Vf9ufrK5p2WhsfO1aCW7Yc2C620sq66ic2Ck5cT2HLJA+cF1j7d7PT3/N0veo\ncnpPpAFeUs2A6R/zL0yJSoPV+BLM0BdahxfsBlT9pdrdqvBA7JIGOC9c3msSWBZY\n6GmfmsmWp0xdwYDJEzUL06shKjH6GlzhWvkjYuYH3myJBCoAjlUWCsJCvXWOD3iX\nPID6Cv+BtDfu80muR94A\n=bK1N\n-----END PGP SIGNATURE-----\n", "payload": "tree c96cab2f682f6db4c84440e26869b4d9de6a2bab\nparent 61b73d49778e8f0fb172d5323e67677c9974e420\nauthor Stefan <96178532+stefan6419846@users.noreply.github.com> 1707078769 +0100\ncommitter GitHub 1707078769 +0100\n\nTST: Avoid catching not emitted warnings (#2429)\n\nFix compatibility with pytest==8. 
\r\n\r\nRelevant upstream change: pytest-dev/pytest#9288\r\n\r\nFixes #2427" } }, "url": "https://api.github.com/repos/py-pdf/pypdf/commits/3fb63f7e3839ce39ac98978c996f3086ba230a20", "html_url": "https://github.com/py-pdf/pypdf/commit/3fb63f7e3839ce39ac98978c996f3086ba230a20", "comments_url": "https://api.github.com/repos/py-pdf/pypdf/commits/3fb63f7e3839ce39ac98978c996f3086ba230a20/comments", "author": { "login": "stefan6419846", "id": 96178532, "node_id": "U_kgDOBbuRZA", "avatar_url": "https://avatars.githubusercontent.com/u/96178532?v=4", "gravatar_id": "", "url": "https://api.github.com/users/stefan6419846", "html_url": "https://github.com/stefan6419846", "followers_url": "https://api.github.com/users/stefan6419846/followers", "following_url": "https://api.github.com/users/stefan6419846/following{/other_user}", "gists_url": "https://api.github.com/users/stefan6419846/gists{/gist_id}", "starred_url": "https://api.github.com/users/stefan6419846/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/stefan6419846/subscriptions", "organizations_url": "https://api.github.com/users/stefan6419846/orgs", "repos_url": "https://api.github.com/users/stefan6419846/repos", "events_url": "https://api.github.com/users/stefan6419846/events{/privacy}", "received_events_url": "https://api.github.com/users/stefan6419846/received_events", "type": "User", "site_admin": false }, "committer": { "login": "web-flow", "id": 19864447, "node_id": "MDQ6VXNlcjE5ODY0NDQ3", "avatar_url": "https://avatars.githubusercontent.com/u/19864447?v=4", "gravatar_id": "", "url": "https://api.github.com/users/web-flow", "html_url": "https://github.com/web-flow", "followers_url": "https://api.github.com/users/web-flow/followers", "following_url": "https://api.github.com/users/web-flow/following{/other_user}", "gists_url": "https://api.github.com/users/web-flow/gists{/gist_id}", "starred_url": "https://api.github.com/users/web-flow/starred{/owner}{/repo}", "subscriptions_url": 
"https://api.github.com/users/web-flow/subscriptions", "organizations_url": "https://api.github.com/users/web-flow/orgs", "repos_url": "https://api.github.com/users/web-flow/repos", "events_url": "https://api.github.com/users/web-flow/events{/privacy}", "received_events_url": "https://api.github.com/users/web-flow/received_events", "type": "User", "site_admin": false }, "parents": [ { "sha": "61b73d49778e8f0fb172d5323e67677c9974e420", "url": "https://api.github.com/repos/py-pdf/pypdf/commits/61b73d49778e8f0fb172d5323e67677c9974e420", "html_url": "https://github.com/py-pdf/pypdf/commit/61b73d49778e8f0fb172d5323e67677c9974e420" } ] }, { "sha": "61b73d49778e8f0fb172d5323e67677c9974e420", "node_id": "C_kwDOAC-ZndoAKDYxYjczZDQ5Nzc4ZThmMGZiMTcyZDUzMjNlNjc2NzdjOTk3NGU0MjA", "commit": { "author": { "name": "CWKSC", "email": "cwksc.person@gmail.com", "date": "2024-02-03T08:02:35Z" }, "committer": { "name": "GitHub", "email": "noreply@github.com", "date": "2024-02-03T08:02:35Z" }, "message": "DOC: Typo `Polyline` → `PolyLine` in adding-pdf-annotations.md (#2426)", "tree": { "sha": "9fb79466999d9d73c6ba15afdc76ce4d6f59c470", "url": "https://api.github.com/repos/py-pdf/pypdf/git/trees/9fb79466999d9d73c6ba15afdc76ce4d6f59c470" }, "url": "https://api.github.com/repos/py-pdf/pypdf/git/commits/61b73d49778e8f0fb172d5323e67677c9974e420", "comment_count": 0, "verification": { "verified": true, "reason": "valid", "signature": "-----BEGIN PGP 
SIGNATURE-----\n\nwsFcBAABCAAQBQJlvfMbCRC1aQ7uu5UhlAAA1X0QADKiCwRr4WJNPYlwgJKp/I4l\nO/6H/uQ5XO6fSvkLNchzU+017kgwEfaPoEunTvb0rpAVfwjJknytCCaR5duQQ7np\naP23J6gIViawM15qp20C53q+5r6NUZnerOIrMKMGLaRtsDMIePYT6zd5Q9KTnx5/\nhF6X+LMx5zKDuXHmRV8Jhmii+8IQA4Ekgv/t+UNmkqpVQig603/IzPTVnUkY+Gcu\nNEHb1W66bS5/BvMyrqwDx//Z0kpxJltNAoaVNAAz1+KgUm/NncBJcuR95U7ffGkO\neoi9UqlF06YO4mkA7ZbAUfgujWEDsbCsnFuVsKe5RJLeRvidHQl7YJQg36mWV+He\nNTMttZX2UJOiFLDeWeEoJ+DixBmXO5EbYsZlFDhGFizNAtY14zW/7RUioBao20DZ\ny8RmYmmJW5p39h4gEvDD6+62lYqz+2SIPPSQdPNmANn2OOge43KArfyNYHbg4M13\n6yLzMZuY61B5arfV0JdDlBdLncws3C7JjKljOfSCYCJ0/Bq8fKL5206k60U3jyru\nRCoTtHFIWn1vzHgOf9cJMiIPWTa8HxH2+2mvZbDxmT+p4J5qgfRJ0BUw1i/klWqt\n1OfmSgMgdkgPxczSjHd2gnnasClNy4yyrWsdDjRKaTEOMSIsb7DUm8UnD3oDs+nC\nKMudDi6gn5ASiZf+ZsA5\n=MAdq\n-----END PGP SIGNATURE-----\n", "payload": "tree 9fb79466999d9d73c6ba15afdc76ce4d6f59c470\nparent f851a532a5ec23b572d86bd7185b327a3fac6b58\nauthor CWKSC 1706947355 +0800\ncommitter GitHub 1706947355 +0100\n\nDOC: Typo `Polyline` → `PolyLine` in adding-pdf-annotations.md (#2426)\n\n" } }, "url": "https://api.github.com/repos/py-pdf/pypdf/commits/61b73d49778e8f0fb172d5323e67677c9974e420", "html_url": "https://github.com/py-pdf/pypdf/commit/61b73d49778e8f0fb172d5323e67677c9974e420", "comments_url": "https://api.github.com/repos/py-pdf/pypdf/commits/61b73d49778e8f0fb172d5323e67677c9974e420/comments", "author": { "login": "CWKSC", "id": 53114952, "node_id": "MDQ6VXNlcjUzMTE0OTUy", "avatar_url": "https://avatars.githubusercontent.com/u/53114952?v=4", "gravatar_id": "", "url": "https://api.github.com/users/CWKSC", "html_url": "https://github.com/CWKSC", "followers_url": "https://api.github.com/users/CWKSC/followers", "following_url": "https://api.github.com/users/CWKSC/following{/other_user}", "gists_url": "https://api.github.com/users/CWKSC/gists{/gist_id}", "starred_url": "https://api.github.com/users/CWKSC/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/CWKSC/subscriptions", 
"organizations_url": "https://api.github.com/users/CWKSC/orgs", "repos_url": "https://api.github.com/users/CWKSC/repos", "events_url": "https://api.github.com/users/CWKSC/events{/privacy}", "received_events_url": "https://api.github.com/users/CWKSC/received_events", "type": "User", "site_admin": false }, "committer": { "login": "web-flow", "id": 19864447, "node_id": "MDQ6VXNlcjE5ODY0NDQ3", "avatar_url": "https://avatars.githubusercontent.com/u/19864447?v=4", "gravatar_id": "", "url": "https://api.github.com/users/web-flow", "html_url": "https://github.com/web-flow", "followers_url": "https://api.github.com/users/web-flow/followers", "following_url": "https://api.github.com/users/web-flow/following{/other_user}", "gists_url": "https://api.github.com/users/web-flow/gists{/gist_id}", "starred_url": "https://api.github.com/users/web-flow/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/web-flow/subscriptions", "organizations_url": "https://api.github.com/users/web-flow/orgs", "repos_url": "https://api.github.com/users/web-flow/repos", "events_url": "https://api.github.com/users/web-flow/events{/privacy}", "received_events_url": "https://api.github.com/users/web-flow/received_events", "type": "User", "site_admin": false }, "parents": [ { "sha": "f851a532a5ec23b572d86bd7185b327a3fac6b58", "url": "https://api.github.com/repos/py-pdf/pypdf/commits/f851a532a5ec23b572d86bd7185b327a3fac6b58", "html_url": "https://github.com/py-pdf/pypdf/commit/f851a532a5ec23b572d86bd7185b327a3fac6b58" } ] }, { "sha": "f851a532a5ec23b572d86bd7185b327a3fac6b58", "node_id": "C_kwDOAC-ZndoAKGY4NTFhNTMyYTVlYzIzYjU3MmQ4NmJkNzE4NWIzMjdhM2ZhYzZiNTg", "commit": { "author": { "name": "dependabot[bot]", "email": "49699333+dependabot[bot]@users.noreply.github.com", "date": "2024-02-03T08:00:35Z" }, "committer": { "name": "GitHub", "email": "noreply@github.com", "date": "2024-02-03T08:00:35Z" }, "message": "DEV: Bump codecov/codecov-action from 3 to 4 (#2430)\n\nBumps 
[codecov/codecov-action](https://github.com/codecov/codecov-action) from 3 to 4.\r\n- [Release notes](https://github.com/codecov/codecov-action/releases)\r\n- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)\r\n- [Commits](https://github.com/codecov/codecov-action/compare/v3...v4)\r\n\r\n---\r\nupdated-dependencies:\r\n- dependency-name: codecov/codecov-action\r\n dependency-type: direct:production\r\n update-type: version-update:semver-major\r\n...\r\n\r\nSigned-off-by: dependabot[bot] \r\nCo-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>", "tree": { "sha": "fb40fe05c5f1a6679bc1e7a24b0f9fc55c150c88", "url": "https://api.github.com/repos/py-pdf/pypdf/git/trees/fb40fe05c5f1a6679bc1e7a24b0f9fc55c150c88" }, "url": "https://api.github.com/repos/py-pdf/pypdf/git/commits/f851a532a5ec23b572d86bd7185b327a3fac6b58", "comment_count": 0, "verification": { "verified": true, "reason": "valid", "signature": "-----BEGIN PGP SIGNATURE-----\n\nwsFcBAABCAAQBQJlvfKjCRC1aQ7uu5UhlAAA9NUQAGTOt3JzejSo6o5fHUrLreus\nv8TScA1B4nuWsJLH0nvGArZ8y8L/9JqG2fUTs3WGjY3PL9Dgn9fhmO+3dMcUDEav\nEtBXdNHsodAUvNHKh1d9ZwCK+jSzbO9tSKiY4enxqUHnr+0m0q3XQHkYLf9eUklE\n9/vi/OCV8JSptRkiS+VOsSrqfO+zqNUfnOxpNy6UNLPaNDwZyom6WROZE6yXLm1W\nE0rsG10rBEXyvhjF2E4znoEcN/5+OIJr87h1Jys7y3qMXOo61my6bEpHY+gZpBRQ\nN3xo3ptu4BhP0a4oI8iDjnQMQLS4cLN++LeMuUbWIEpKtiKkF5q/bGP3s1wniLTD\nSYh14z0jIaJ7QPdkOEK2/Fv9lx5tno66bFe4vKC4DSmX3itcqh/XOiPFPkgRAalj\nAd5g6hs1QlJErAwQShe6lzNDRnIDGoD6ZOaTMdxlbRNdwInr83Qz4Gt92D+dX4eQ\njln9Welx4xTuPnYv6Qhmdc69Kk2nyhRuTnCsI0jaoqDRSLQxlzCuuQMn7u5XyqSS\npSkWUYOw8zjrJd7ItPVe3YII5JIiRLEkHrDzTwGZAcy2E6GPMDLeXsx4K6GUhsfC\nXenOpPuoo6BDk/bhrkWb7klyYG09JQtum31bCpDp1qxafXh5jh9Y0mztZJ4gWjaF\n0NawJ3AozsNrioHxf6xz\n=0OMP\n-----END PGP SIGNATURE-----\n", "payload": "tree fb40fe05c5f1a6679bc1e7a24b0f9fc55c150c88\nparent 757932944f54ba661b89e0629ed3fc9d8345dbab\nauthor dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> 1706947235 
+0100\ncommitter GitHub 1706947235 +0100\n\nDEV: Bump codecov/codecov-action from 3 to 4 (#2430)\n\nBumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 3 to 4.\r\n- [Release notes](https://github.com/codecov/codecov-action/releases)\r\n- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)\r\n- [Commits](https://github.com/codecov/codecov-action/compare/v3...v4)\r\n\r\n---\r\nupdated-dependencies:\r\n- dependency-name: codecov/codecov-action\r\n dependency-type: direct:production\r\n update-type: version-update:semver-major\r\n...\r\n\r\nSigned-off-by: dependabot[bot] \r\nCo-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>" } }, "url": "https://api.github.com/repos/py-pdf/pypdf/commits/f851a532a5ec23b572d86bd7185b327a3fac6b58", "html_url": "https://github.com/py-pdf/pypdf/commit/f851a532a5ec23b572d86bd7185b327a3fac6b58", "comments_url": "https://api.github.com/repos/py-pdf/pypdf/commits/f851a532a5ec23b572d86bd7185b327a3fac6b58/comments", "author": { "login": "dependabot[bot]", "id": 49699333, "node_id": "MDM6Qm90NDk2OTkzMzM=", "avatar_url": "https://avatars.githubusercontent.com/in/29110?v=4", "gravatar_id": "", "url": "https://api.github.com/users/dependabot%5Bbot%5D", "html_url": "https://github.com/apps/dependabot", "followers_url": "https://api.github.com/users/dependabot%5Bbot%5D/followers", "following_url": "https://api.github.com/users/dependabot%5Bbot%5D/following{/other_user}", "gists_url": "https://api.github.com/users/dependabot%5Bbot%5D/gists{/gist_id}", "starred_url": "https://api.github.com/users/dependabot%5Bbot%5D/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/dependabot%5Bbot%5D/subscriptions", "organizations_url": "https://api.github.com/users/dependabot%5Bbot%5D/orgs", "repos_url": "https://api.github.com/users/dependabot%5Bbot%5D/repos", "events_url": "https://api.github.com/users/dependabot%5Bbot%5D/events{/privacy}", 
"received_events_url": "https://api.github.com/users/dependabot%5Bbot%5D/received_events", "type": "Bot", "site_admin": false }, "committer": { "login": "web-flow", "id": 19864447, "node_id": "MDQ6VXNlcjE5ODY0NDQ3", "avatar_url": "https://avatars.githubusercontent.com/u/19864447?v=4", "gravatar_id": "", "url": "https://api.github.com/users/web-flow", "html_url": "https://github.com/web-flow", "followers_url": "https://api.github.com/users/web-flow/followers", "following_url": "https://api.github.com/users/web-flow/following{/other_user}", "gists_url": "https://api.github.com/users/web-flow/gists{/gist_id}", "starred_url": "https://api.github.com/users/web-flow/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/web-flow/subscriptions", "organizations_url": "https://api.github.com/users/web-flow/orgs", "repos_url": "https://api.github.com/users/web-flow/repos", "events_url": "https://api.github.com/users/web-flow/events{/privacy}", "received_events_url": "https://api.github.com/users/web-flow/received_events", "type": "User", "site_admin": false }, "parents": [ { "sha": "757932944f54ba661b89e0629ed3fc9d8345dbab", "url": "https://api.github.com/repos/py-pdf/pypdf/commits/757932944f54ba661b89e0629ed3fc9d8345dbab", "html_url": "https://github.com/py-pdf/pypdf/commit/757932944f54ba661b89e0629ed3fc9d8345dbab" } ] } ] ================================================ FILE: tests/scripts/test_example_files.py ================================================ """Tests related to the example files.""" from operator import itemgetter from pathlib import Path from tests import read_yaml_to_list_of_dicts def test_consistency(): pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent.parent / "example_files.yaml") # Ensure the names are unique assert len(pdfs) == len(set(map(itemgetter("local_filename"), pdfs))) # Ensure the urls are unique assert len(pdfs) == len(set(map(itemgetter("url"), pdfs))) ================================================ FILE: 
tests/scripts/test_make_release.py ================================================ """Test the `make_release.py` script.""" import sys from pathlib import Path from unittest import mock import pytest DATA_PATH = Path(__file__).parent.resolve() / "data" # line starting with \ and ending with " have been observed on Windows GIT_LOG__VERSION_4_0_1 = """ b7bfd0d7eddfd0865a94cc9e7027df6596242cf7:::BUG: Use NumberObject for /Border elements of annotations (#2451):::rsinger417 8cacb0fc8fee9920b0515d1289e6ee8191eb3f21:::DOC: Document easier way to update metadata (#2454):::Stefan 3fb63f7e3839ce39ac98978c996f3086ba230a20:::TST: Avoid catching not emitted warnings (#2429):::Stefan \\61b73d49778e8f0fb172d5323e67677c9974e420:::DOC: Typo `Polyline` → `PolyLine` in adding-pdf-annotations.md (#2426):::CWKSC" f851a532a5ec23b572d86bd7185b327a3fac6b58:::DEV: Bump codecov/codecov-action from 3 to 4 (#2430):::dependabot[bot]""".encode() # noqa: E501 COMMITS__VERSION_4_0_1 = DATA_PATH.joinpath("commits__version_4_0_1.json") VERSION_3_9_PLUS = sys.version_info[:2] >= (3, 9) @pytest.mark.skipif(not VERSION_3_9_PLUS, reason="Function uses method removeprefix added in Python 3.9") @pytest.mark.parametrize( ("data", "expected"), [ ("", ""), ("# CHANGELOG", ""), ("# CHANGELOG ", ""), ("# CHANGELOG ", ""), ("## CHANGELOG", "## CHANGELOG"), ("CHANGELOG", "CHANGELOG"), ("# CHANGELOG #", "#"), ] ) def test_strip_header(data, expected): """Removal of the 'CHANGELOG' header.""" make_release = pytest.importorskip("make_release") assert make_release.strip_header(data) == expected def test_get_git_commits_since_tag(): make_release = pytest.importorskip("make_release") with open(COMMITS__VERSION_4_0_1, mode="rb") as commits, mock.patch( "urllib.request.urlopen", side_effect=lambda _: commits ), mock.patch("subprocess.check_output", return_value=GIT_LOG__VERSION_4_0_1): commits = make_release.get_git_commits_since_tag("4.0.1") assert commits == [ make_release.Change( 
commit_hash="b7bfd0d7eddfd0865a94cc9e7027df6596242cf7", prefix="BUG", message="Use NumberObject for /Border elements of annotations (#2451)", author="rsinger417", author_login="rsinger417", ), make_release.Change( commit_hash="8cacb0fc8fee9920b0515d1289e6ee8191eb3f21", prefix="DOC", message="Document easier way to update metadata (#2454)", author="Stefan", author_login="stefan6419846", ), make_release.Change( commit_hash="3fb63f7e3839ce39ac98978c996f3086ba230a20", prefix="TST", message="Avoid catching not emitted warnings (#2429)", author="Stefan", author_login="stefan6419846", ), make_release.Change( commit_hash="61b73d49778e8f0fb172d5323e67677c9974e420", prefix="DOC", message="Typo `Polyline` → `PolyLine` in adding-pdf-annotations.md (#2426)", author="CWKSC", author_login="CWKSC", ), make_release.Change( commit_hash="f851a532a5ec23b572d86bd7185b327a3fac6b58", prefix="DEV", message="Bump codecov/codecov-action from 3 to 4 (#2430)", author="dependabot[bot]", author_login="dependabot[bot]", ), ] def test_get_formatted_changes(): make_release = pytest.importorskip("make_release") with open(COMMITS__VERSION_4_0_1, mode="rb") as commits, mock.patch( "urllib.request.urlopen", side_effect=lambda _: commits ), mock.patch("subprocess.check_output", return_value=GIT_LOG__VERSION_4_0_1): output, output_with_user = make_release.get_formatted_changes("4.0.1") assert ( output == """ ### Bug Fixes (BUG) - Use NumberObject for /Border elements of annotations (#2451) ### Documentation (DOC) - Document easier way to update metadata (#2454) - Typo `Polyline` → `PolyLine` in adding-pdf-annotations.md (#2426) ### Developer Experience (DEV) - Bump codecov/codecov-action from 3 to 4 (#2430) ### Testing (TST) - Avoid catching not emitted warnings (#2429) """ ) assert ( output_with_user == """ ### Bug Fixes (BUG) - Use NumberObject for /Border elements of annotations (#2451) by @rsinger417 ### Documentation (DOC) - Document easier way to update metadata (#2454) by @stefan6419846 - Typo 
`Polyline` → `PolyLine` in adding-pdf-annotations.md (#2426) by @CWKSC ### Developer Experience (DEV) - Bump codecov/codecov-action from 3 to 4 (#2430) by @dependabot[bot] ### Testing (TST) - Avoid catching not emitted warnings (#2429) by @stefan6419846 """ ) def test_get_formatted_changes__other(): make_release = pytest.importorskip("make_release") changes = [ make_release.Change( commit_hash="f20c36eabd59ea661f30c5da35af7c9e435c7de9", prefix="", message="Improve lossless compression example (#2488)", author="j-t-1", author_login="j-t-1", ), make_release.Change( commit_hash="afbee382f8fd2b39588db6470b9b2b2c82905318", prefix="ENH", message="Add reattach_fields function (#2480)", author="pubpub-zz", author_login="pubpub-zz", ), make_release.Change( commit_hash="cd705f959064d8125397ddf4f7bdd2ea296f889f", prefix="FIX", message="Broken test due to expired test file URL (#2468)", author="pubpub-zz", author_login="pubpub-zz", ), ] with mock.patch.object( make_release, "get_git_commits_since_tag", return_value=changes ): output, output_with_user = make_release.get_formatted_changes("dummy") assert ( output == """ ### New Features (ENH) - Add reattach_fields function (#2480) ### Other - : Improve lossless compression example (#2488) - FIX: Broken test due to expired test file URL (#2468) """ ) assert ( output_with_user == """ ### New Features (ENH) - Add reattach_fields function (#2480) by @pubpub-zz ### Other - : Improve lossless compression example (#2488) by @j-t-1 - FIX: Broken test due to expired test file URL (#2468) by @pubpub-zz """ ) ================================================ FILE: tests/test_annotations.py ================================================ """Test the pypdf.annotations submodule.""" from io import BytesIO from pathlib import Path import pytest from pypdf import PdfReader, PdfWriter from pypdf.annotations import ( AnnotationDictionary, Ellipse, FreeText, Highlight, Line, Link, Polygon, PolyLine, Popup, Rectangle, Text, ) from pypdf.errors 
import PdfReadError from pypdf.generic import ArrayObject, FloatObject, NumberObject from . import RESOURCE_ROOT, get_data_from_url def test_ellipse(pdf_file_path): # Arrange pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Act ellipse_annotation = Ellipse( rect=(50, 550, 500, 650), interior_color="ff0000", ) writer.add_annotation(0, ellipse_annotation) # Assert: You need to inspect the file manually with open(pdf_file_path, "wb") as fp: writer.write(fp) def test_text(pdf_file_path): # Arrange pdf_path = RESOURCE_ROOT / "outline-without-title.pdf" reader = PdfReader(pdf_path) page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Act text_annotation = Text( text="Hello World\nThis is the second line!", rect=(50, 550, 500, 650), open=True, ) writer.add_annotation(0, text_annotation) # Assert: You need to inspect the file manually with open(pdf_file_path, "wb") as fp: writer.write(fp) def test_free_text(pdf_file_path): # Arrange pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Act free_text_annotation = FreeText( text="Hello World - bold and italic\nThis is the second line!", rect=(50, 550, 200, 650), font="Arial", bold=True, italic=True, font_size="20pt", font_color="00ff00", border_color=None, background_color=None, ) writer.add_annotation(0, free_text_annotation) free_text_annotation = FreeText( text="Another free text annotation (not bold, not italic)", rect=(500, 550, 200, 650), font="Arial", bold=False, italic=False, font_size="20pt", font_color="00ff00", border_color="0000ff", background_color="cdcdcd", ) writer.add_annotation(0, free_text_annotation) # Assert: You need to inspect the file manually with open(pdf_file_path, "wb") as fp: writer.write(fp) def test_free_text__font_specifier(): free_text_annotation = FreeText( text="Hello World", rect=(0, 0, 0, 0), ) assert 
free_text_annotation["/DS"] == "font: normal normal 14pt Helvetica;text-align:left;color:#000000" free_text_annotation = FreeText( text="Hello World", rect=(50, 550, 200, 650), font="Arial", bold=True, italic=True, font_size="20pt", font_color="00ff00", border_color=None, background_color=None, ) assert free_text_annotation["/DS"] == "font: italic bold 20pt Arial;text-align:left;color:#00ff00" def test_annotation_dictionary(): a = AnnotationDictionary() a.flags = 123 assert a.flags == 123 def test_polygon(pdf_file_path): # Arrange pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) page = reader.pages[0] writer = PdfWriter() writer.add_page(page) with pytest.raises(ValueError): Polygon( vertices=[], ) annotation = Polygon( vertices=[(50, 550), (200, 650), (70, 750), (50, 700)], ) writer.add_annotation(0, annotation) # Assert: You need to inspect the file manually with open(pdf_file_path, "wb") as fp: writer.write(fp) def test_polyline(pdf_file_path): # Arrange pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) page = reader.pages[0] writer = PdfWriter() writer.add_page(page) with pytest.raises( ValueError, match=r"A polyline needs at least 1 vertex with two coordinates", ): PolyLine( vertices=[], ) annotation = PolyLine( vertices=[(50, 550), (200, 650), (70, 750), (50, 700)], ) writer.add_annotation(0, annotation) # Assert: You need to inspect the file manually with open(pdf_file_path, "wb") as fp: writer.write(fp) def test_line(pdf_file_path): # Arrange pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Act line_annotation = Line( text="Hello World\nLine2", rect=(50, 550, 200, 650), p1=(50, 550), p2=(200, 650), ) writer.add_annotation(0, line_annotation) # Assert: You need to inspect the file manually with open(pdf_file_path, "wb") as fp: writer.write(fp) def test_rectangle(pdf_file_path): # Arrange pdf_path = RESOURCE_ROOT / 
"crazyones.pdf" reader = PdfReader(pdf_path) page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Act square_annotation = Rectangle( rect=(50, 550, 200, 650), interior_color="ff0000" ) writer.add_annotation(0, square_annotation) square_annotation = Rectangle(rect=(40, 400, 150, 450)) writer.add_annotation(0, square_annotation) # Assert: You need to inspect the file manually with open(pdf_file_path, "wb") as fp: writer.write(fp) def test_highlight(pdf_file_path): # Arrange pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Act highlight_annotation = Highlight( rect=(95.79332, 704.31777, 138.55779, 724.6855), highlight_color="ff0000", quad_points=ArrayObject( [ FloatObject(100.060779), FloatObject(723.55398), FloatObject(134.29033), FloatObject(723.55398), FloatObject(100.060779), FloatObject(705.4493), FloatObject(134.29033), FloatObject(705.4493), ] ), printing=False, ) writer.add_annotation(0, highlight_annotation) for annot in writer.pages[0]["/Annots"]: obj = annot.get_object() subtype = obj["/Subtype"] if subtype == "/Highlight": assert "/F" not in obj or obj["/F"] == NumberObject(0) writer.add_page(page) # Act highlight_annotation = Highlight( rect=(95.79332, 704.31777, 138.55779, 724.6855), highlight_color="ff0000", quad_points=ArrayObject( [ FloatObject(100.060779), FloatObject(723.55398), FloatObject(134.29033), FloatObject(723.55398), FloatObject(100.060779), FloatObject(705.4493), FloatObject(134.29033), FloatObject(705.4493), ] ), printing=True, ) writer.add_annotation(1, highlight_annotation) for annot in writer.pages[1]["/Annots"]: obj = annot.get_object() subtype = obj["/Subtype"] if subtype == "/Highlight": assert obj["/F"] == NumberObject(4) # Assert: You need to inspect the file manually with open(pdf_file_path, "wb") as fp: writer.write(fp) def test_link(pdf_file_path): # Arrange pdf_path = RESOURCE_ROOT / "outline-without-title.pdf" reader = 
PdfReader(pdf_path) page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Act # Part 1: Too many args with pytest.raises(ValueError): Link( rect=(50, 550, 200, 650), url="https://martin-thoma.com/", target_page_index=3, ) # Part 2: Too few args with pytest.raises(ValueError): Link( rect=(50, 550, 200, 650), ) # Part 3: External Link link_annotation = Link( rect=(50, 50, 100, 100), url="https://martin-thoma.com/", border=[1, 0, 6, [3, 2]], ) writer.add_annotation(0, link_annotation) # Part 4: Internal Link link_annotation = Link( rect=(100, 100, 300, 200), target_page_index=1, border=[50, 10, 4], ) writer.add_annotation(0, link_annotation) for page in reader.pages[1:]: writer.add_page(page) # Assert: You need to inspect the file manually with open(pdf_file_path, "wb") as fp: writer.write(fp) def test_popup(caplog): # Arrange pdf_path = RESOURCE_ROOT / "outline-without-title.pdf" reader = PdfReader(pdf_path) page = reader.pages[0] writer = PdfWriter() writer.add_page(page) # Act text_annotation = Text( title_bar="hello world", text="Hello World\nThis is the second line!", rect=(50, 550, 200, 650), open=True, ) ta = writer.add_annotation(0, text_annotation) popup_annotation = Popup( rect=(50, 550, 200, 650), open=True, parent=ta, # prefer to use for evolutivity ) writer.add_annotation(writer.pages[0], popup_annotation) Popup( rect=(50, 550, 200, 650), open=True, parent=True, # broken parameter # type: ignore ) assert "Unregistered Parent object : No Parent field set" in caplog.text target = "annotated-pdf-popup.pdf" writer.write(target) Path(target).unlink() # comment this out for manual inspection def test_markup_annotation_in_reply_to(): """Test that a reply annotation gets /IRT, /RT, and /NM after a write/read cycle.""" writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf") parent = Text( text="Parent comment", rect=(50, 550, 200, 650), open=True, ) parent_ref = writer.add_annotation(0, parent) reply = Text( text="Reply to parent", rect=(50, 
550, 200, 650), in_reply_to=parent_ref, ) writer.add_annotation(0, reply) assert "/IRT" in reply assert reply["/IRT"].get_object() is parent_ref assert reply["/RT"] == "/R" assert "/NM" in reply assert "/NM" not in parent_ref buf = BytesIO() writer.write(buf) reader2 = PdfReader(buf) annots = reader2.pages[0]["/Annots"] assert len(annots) == 2 reply_obj = annots[1].get_object() assert reply_obj["/IRT"].get_object()["/Contents"] == "Parent comment" assert reply_obj["/NM"] == reply["/NM"] def test_markup_annotation_in_reply_to_group_type(): """Test that a grouped annotation sets /RT to /Group.""" writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf") parent = Text( text="Parent", rect=(50, 550, 200, 650), ) parent_ref = writer.add_annotation(0, parent) grouped = Text( text="Grouped with parent", rect=(50, 550, 200, 650), in_reply_to=parent_ref, reply_type="Group", ) writer.add_annotation(0, grouped) assert grouped["/RT"] == "/Group" assert "/IRT" in grouped assert "/NM" in grouped def test_markup_annotation_name_without_reply(): """Test that annotation_name without in_reply_to raises ValueError.""" with pytest.raises(ValueError, match="annotation_name is only supported when in_reply_to is set"): Text( text="Named but not a reply", rect=(50, 550, 200, 650), annotation_name="my-unique-name", ) def test_markup_annotation_reply_type_without_reply(): """Test that non-default reply_type without in_reply_to raises ValueError.""" with pytest.raises(ValueError, match="reply_type is only meaningful when in_reply_to is set"): Text( text="Grouped but not a reply", rect=(50, 550, 200, 650), reply_type="Group", ) def test_markup_annotation_in_reply_to_custom_name(): """Test explicit annotation_name with in_reply_to.""" writer = PdfWriter() writer.add_blank_page(width=200, height=200) parent = Text(text="Parent", rect=(0, 0, 100, 100)) parent_ref = writer.add_annotation(0, parent) reply = Text( text="Reply", rect=(0, 0, 100, 100), in_reply_to=parent_ref, 
annotation_name="custom-reply-name", ) writer.add_annotation(0, reply) assert reply["/NM"] == "custom-reply-name" assert "/IRT" in reply def test_markup_annotation_in_reply_to_unregistered(): """Test that an unregistered parent raises ValueError.""" unregistered = Text(text="Not added to writer", rect=(0, 0, 100, 100)) with pytest.raises(ValueError, match="in_reply_to must be a registered annotation"): Text( text="Reply", rect=(0, 0, 100, 100), in_reply_to=unregistered, ) def test_markup_annotation_in_reply_to_indirect_object(): """Test passing an IndirectObject directly as in_reply_to.""" writer = PdfWriter() writer.add_blank_page(width=200, height=200) parent = Text(text="Parent", rect=(0, 0, 100, 100)) parent_ref = writer.add_annotation(0, parent) indirect_ref = parent_ref.indirect_reference reply = Text( text="Reply via IndirectObject", rect=(0, 0, 100, 100), in_reply_to=indirect_ref, ) writer.add_annotation(0, reply) assert "/IRT" in reply assert reply["/RT"] == "/R" assert "/NM" in reply buf = BytesIO() writer.write(buf) reader = PdfReader(buf) annots = reader.pages[0]["/Annots"] assert len(annots) == 2 reply_obj = annots[1].get_object() assert reply_obj["/IRT"].get_object()["/Contents"] == "Parent" assert reply_obj["/NM"] == reply["/NM"] @pytest.mark.enable_socket def test_outline_action_without_d_lenient(): reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf"))) assert len(reader.outline) == 2 @pytest.mark.enable_socket def test_outline_action_without_d_strict(pdf_file_path): reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf"))) reader.strict = True with pytest.raises(PdfReadError) as e: assert len(reader.outline) == 2 assert "Outline Action Missing /D" in str(e) ================================================ FILE: tests/test_appearance_stream.py ================================================ """Test the pypdf.generic._appearance_stream module.""" from pypdf.generic._appearance_stream import BaseStreamConfig, TextStreamAppearance 
def test_comb():
    """Check the exact content stream generated for comb text fields."""
    layout=BaseStreamConfig(rectangle=(0.0, 0.0, 197.285, 18.455))
    font_size = 10.0
    text = "01234567"
    max_length = 10
    is_comb = True
    appearance_stream = TextStreamAppearance(
        layout=layout, text=text, font_size=font_size, is_comb=is_comb, max_length=max_length
    )
    # Each character is emitted with its own Td/Tj pair.
    assert appearance_stream.get_data() == (
        b"q\n/Tx BMC \nq\n2 1 193.285 16.455 re\nW\nBT\n/Helv 10.0 Tf 0 g\n"
        b"7.084250000000001 5.637499999999999 Td\n(0) Tj\n"
        b"19.7285 0.0 Td\n(1) Tj\n"
        b"19.728500000000004 0.0 Td\n(2) Tj\n"
        b"19.728499999999997 0.0 Td\n(3) Tj\n"
        b"19.728499999999997 0.0 Td\n(4) Tj\n"
        b"19.728499999999997 0.0 Td\n(5) Tj\n"
        b"19.72850000000001 0.0 Td\n(6) Tj\n"
        b"19.728499999999997 0.0 Td\n(7) Tj\nET\nQ\nEMC\nQ\n"
    )

    # The same layout object is reused with a new rectangle.
    layout.rectangle = (0.0, 0.0, 20.852, 20.84)
    text = "AA"
    max_length = 1
    appearance_stream = TextStreamAppearance(
        layout=layout, text=text, font_size=font_size, is_comb=is_comb, max_length=max_length
    )
    # With max_length=1 the expected stream contains a single "(A) Tj" for the text "AA".
    assert appearance_stream.get_data() == (
        b"q\n/Tx BMC \nq\n2 1 16.852 18.84 re\nW\nBT\n/Helv 10.0 Tf 0 g\n7.091 6.83 Td\n(A) Tj\nET\nQ\nEMC\nQ\n"
    )


def test_scale_text():
    """Check the font size written to the stream for several rectangles and texts."""
    layout=BaseStreamConfig(rectangle=(0, 0, 9.1, 55.4))
    font_size = 10.1
    text = "Hello World"
    is_multiline = False
    appearance_stream = TextStreamAppearance(
        layout=layout, text=text, font_size=font_size, is_multiline=is_multiline
    )
    # An explicit font size is written unchanged.
    assert b"10.1 Tf" in appearance_stream.get_data()

    text = "This is a very very long sentence that probably will scale below the minimum font size"
    font_size = 0.0
    appearance_stream = TextStreamAppearance(
        layout=layout, text=text, font_size=font_size, is_multiline=is_multiline
    )
    assert b"4.0 Tf" in appearance_stream.get_data()

    layout.rectangle = (0, 0, 160, 360)
    font_size = 0.0
    # NOTE(review): whitespace inside this literal is part of the input text;
    # confirm the line breaks against the original fixture.
    text = """Welcome to pypdf pypdf is a free and open source pure-python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files. It can also add custom data, viewing options, and passwords to PDF files. pypdf can retrieve text and metadata from PDFs as well.
See pdfly for a CLI application that uses pypdf to interact with PDFs. """
    is_multiline = True
    appearance_stream = TextStreamAppearance(
        layout=layout, text=text, font_size=font_size, is_multiline=is_multiline
    )
    assert b"12 Tf" in appearance_stream.get_data()
    assert b"pypdf is a free and open" in appearance_stream.get_data()

    layout.rectangle = (0, 0, 160, 160)
    appearance_stream = TextStreamAppearance(
        layout=layout, text=text, font_size=font_size, is_multiline=is_multiline
    )
    assert b"9.8 Tf" in appearance_stream.get_data()

    layout.rectangle = (0, 0, 160, 12)
    appearance_stream = TextStreamAppearance(
        layout=layout, text=text, font_size=font_size, is_multiline=is_multiline
    )
    # NOTE(review): this `text` value is overwritten below before any stream is
    # built from it; only `selection` is used afterwards - confirm the intent.
    text = """Option A Option B Option C Option D """
    selection = "Option A"
    # This assertion still refers to the stream built for the (0, 0, 160, 12) rectangle.
    assert b"4.0 Tf" in appearance_stream.get_data()

    text = "pneumonoultramicroscopicsilicovolcanoconiosis"
    appearance_stream = TextStreamAppearance(
        layout=layout, text=text, selection=selection, font_size=font_size, is_multiline=is_multiline
    )
    assert b"7.3 Tf" in appearance_stream.get_data()

    layout.rectangle = (0, 0, 10, 100)
    text = "OneWord"
    appearance_stream = TextStreamAppearance(
        layout=layout, text=text, font_size=font_size, is_multiline=is_multiline
    )
    assert b"OneWord" in appearance_stream.get_data()


================================================
FILE: tests/test_cmap.py
================================================
"""Test the pypdf_cmap module."""
from io import BytesIO

import pytest

from pypdf import PdfReader, PdfWriter
from pypdf._cmap import get_encoding, parse_bfchar, parse_bfrange
from pypdf._codecs import charset_encoding
from pypdf._font import Font
from pypdf.errors import LimitReachedError
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NullObject, StreamObject

from .
import RESOURCE_ROOT, get_data_from_url


@pytest.mark.enable_socket
@pytest.mark.slow
@pytest.mark.parametrize(
    ("url", "name", "strict"),
    [
        # compute_space_width:
        (
            None,
            "tika-923406.pdf",
            False,
        ),
        # _parse_to_unicode_process_rg:
        (
            None,
            "tika-959173.pdf",
            False,
        ),
        (
            None,
            "tika-959173.pdf",
            True,
        ),
        # issue #1718:
        (
            None,
            "iss1718.pdf",
            False,
        ),
    ],
)
def test_text_extraction_slow(caplog, url: str, name: str, strict: bool):
    """Extract text from every page and expect nothing to be logged."""
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=strict)
    for page in reader.pages:
        page.extract_text()
    assert caplog.text == ""


@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "strict"),
    [
        # bfchar_on_2_chars: issue #1293
        (
            None,
            "ASurveyofImageClassificationBasedTechniques.pdf",
            False,
        ),
        # L40, get_font_width_from_default
        (
            None,
            "tika-908104.pdf",
            False,
        ),
        # multiline_bfrange / regression test for issue #1285:
        (
            None,
            "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf",
            False,
        ),
        (
            None,
            "Giacalone.pdf",
            False,
        ),
    ],
)
def test_text_extraction_fast(caplog, url: str, name: str, strict: bool):
    """Text extraction runs without exceptions or warnings"""
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=strict)
    for page in reader.pages:
        page.extract_text()
    assert caplog.text == ""


@pytest.mark.enable_socket
def test_parse_encoding_advanced_encoding_not_implemented(caplog):
    """A misspelled encoding name in the file is reported in the log."""
    reader = PdfReader(BytesIO(get_data_from_url(name="tika-957144.pdf")))
    for page in reader.pages:
        page.extract_text()
    # The correctly spelled encoding is /WinAnsiEncoding
    assert "Advanced encoding /WinAnsEncoding not implemented yet" in caplog.text


@pytest.mark.enable_socket
def test_ascii_charset():
    """The extracted text of the first page must not contain "/a"."""
    # Issue #1312
    reader = PdfReader(BytesIO(get_data_from_url(name="ascii charset.pdf")))
    assert "/a" not in reader.pages[0].extract_text()


@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "page_nb", "within_text"),
    [
        (
            None,
            "cmap1370.pdf",
            0,
            "",
        ),
        (
            None,
            "02voc.pdf",
            2,
            "Document delineation and character sequence decoding",
        ),
    ],
    ids=["iss1370", "iss1379"],
)
def test_text_extraction_of_specific_pages(
    url: str, name: str, page_nb: int, within_text
):
    """The given snippet is contained in the text of the selected page."""
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert within_text in reader.pages[page_nb].extract_text()


@pytest.mark.enable_socket
def test_iss1533():
    """Text extraction succeeds and the font maps "\\x01" to "Ü"."""
    reader = PdfReader(BytesIO(get_data_from_url(name="iss1533.pdf")))
    reader.pages[0].extract_text()  # no error
    font = Font.from_font_resource(reader.pages[0]["/Resources"]["/Font"]["/F"])
    assert font.character_map["\x01"] == "Ü"


@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "page_index", "within_text", "caplog_text"),
    [
        (
            None,
            "tstUCS2.pdf",
            1,
            ["2 / 12", "S0490520090001", "于博"],
            "",
        ),
        (
            None,
            "tst-GBK_EUC.pdf",
            0,
            ["NJA", "中华男科学杂志"],
            "Multiple definitions in dictionary at byte 0x5cb42 for key /MediaBox\n",
        ),
    ],
)
def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text):
    """All expected snippets are extracted and the expected log text is present."""
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    extracted = reader.pages[page_index].extract_text()  # no error
    for contained in within_text:
        assert contained in extracted
    # An empty `caplog_text` is trivially contained, so it only checks for presence.
    assert caplog_text in caplog.text


@pytest.mark.enable_socket
def test_latex():
    """Greek letters and math symbols are present in the extracted text."""
    reader = PdfReader(BytesIO(get_data_from_url(name="math_latex.pdf")))
    txt = reader.pages[0].extract_text()  # no error
    for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"):
        assert pat in txt
    # actually the ϕ and φ seems to be crossed in latex


@pytest.mark.enable_socket
def test_unixxx_glyphs():
    """The listed snippets are present in the extracted text of the first page."""
    reader = PdfReader(BytesIO(get_data_from_url(name="unixxx_glyphs.pdf")))
    txt = reader.pages[0].extract_text()  # no error
    for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"):
        assert pat in txt


@pytest.mark.enable_socket
def test_cmap_compute_space_width():
    """Text extraction of the first page completes without raising."""
    # issue 2137
    # original file URL:
    # url = "https://arxiv.org/pdf/2005.05909.pdf"
    # URL from github issue is too long to pass code type check, use original arxiv URL instead
    # url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(name="TextAttack_paper.pdf")))
    reader.pages[0].extract_text()  # no error


@pytest.mark.enable_socket
def test_tabs_in_cmap():
    """Issue #2173"""
    reader = PdfReader(BytesIO(get_data_from_url(name="iss2173.pdf")))
    reader.pages[0].extract_text()


@pytest.mark.enable_socket
def test_ignoring_non_put_entries():
    """Issue #2290"""
    reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf")))
    reader.pages[0].extract_text()


@pytest.mark.enable_socket
def test_eten_b5():
    """Issue #2356"""
    reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf")))
    # NOTE(review): the result of `startswith` is discarded, so nothing is
    # asserted here beyond "extraction does not raise". The file name is also
    # the one used for issue #2290 above - confirm both are intended.
    reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險")


def test_missing_entries_in_cmap():
    """
    Issue #2702: this issue is observed on damaged pdfs
    use of this file in test has been discarded as too slow/long
    we will create the same error from crazyones
    """
    pdf_path = RESOURCE_ROOT / "crazyones.pdf"
    reader = PdfReader(pdf_path)
    p = reader.pages[0]
    # Point /ToUnicode at object number 99999999 to simulate the damaged file.
    p["/Resources"]["/Font"]["/F1"][NameObject("/ToUnicode")] = IndirectObject(
        99999999, 0, reader
    )
    p.extract_text()


def test_null_missing_width():
    """For coverage of #2792"""
    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
    page = writer.pages[0]
    ft = page["/Resources"]["/Font"]["/F1"]
    # Empty /Widths plus a null /MissingWidth; extraction must still complete.
    ft[NameObject("/Widths")] = ArrayObject()
    ft["/FontDescriptor"][NameObject("/MissingWidth")] = NullObject()
    page.extract_text()


@pytest.mark.enable_socket
def test_unigb_utf16():
    """Cf #2812"""
    url = (
        "https://github.com/user-attachments/files/16767536/W020240105322424121296.pdf"
    )
    name = "iss2812.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert "《中国能源展望 2060(2024 年版)》编写委员会" in reader.pages[1].extract_text()


@pytest.mark.enable_socket
def test_too_many_differences():
    """Cf #2836"""
    url = (
        "https://github.com/user-attachments/files/16911741/dumb_extract_text_crash.pdf"
    )
    name = "iss2836.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert reader.pages[0].extract_text() == ""


@pytest.mark.enable_socket
def test_iss2925():
    """The expected sentence fragment is extracted from the fourth page."""
    url = (
        "https://github.com/user-attachments/files/17621508/2305.09315.pdf"
    )
    name = "iss2925.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert "slicing on the PDG to extract the relevant contextual" in reader.pages[3].extract_text()


@pytest.mark.enable_socket
def test_iss2966():
    """Regression test for issue #2966: indirect objects in fonts"""
    url = (
        "https://github.com/user-attachments/files/17904233/repro_out.pdf"
    )
    name = "iss2966.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert "Lorem ipsum dolor sit amet" in reader.pages[0].extract_text()


@pytest.mark.enable_socket
def test_binascii_odd_length_string(caplog):
    """Tests for #2216"""
    url = "https://github.com/user-attachments/files/18199642/iss2216.pdf"
    name = "iss2216.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    page = reader.pages[0]
    assert "\n(Many other theorems may\n" in page.extract_text()
    assert "Skipping broken line b'143f 143f 10300': Odd-length string\n" in caplog.text


@pytest.mark.enable_socket
def test_standard_encoding(caplog):
    """Tests for #3156"""
    url = "https://github.com/user-attachments/files/18983503/standard-encoding.pdf"
    name = "issue3156.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    page = reader.pages[0]
    assert page.extract_text() == "Lorem ipsum"
    assert "Advanced encoding" not in caplog.text


@pytest.mark.enable_socket
def test_function_in_font_widths(caplog):
    """Tests for #3153"""
    url = "https://github.com/user-attachments/files/18945709/Marseille_pypdf_level_0.2._compressed.pdf"
    name = "issue3153.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    page = reader.pages[455]
    assert "La vulnérabilité correspond aux conséquences potentielles" in page.extract_text()
    assert "Expected numeric value for width, got {'/Bounds': [0.25, 0.25]," in caplog.text


def test_get_encoding__encoding_value_is_none():
    """A null /Encoding yields the /StandardEncoding table and an empty mapping."""
    ft = DictionaryObject()
    ft[NameObject("/Encoding")] = NullObject()
    assert get_encoding(ft) == (
        dict(zip(range(256), charset_encoding["/StandardEncoding"])),
        {}
    )


def test_parse_bfchar(caplog):
    """A valid pair is mapped; an odd-length hex value maps to "" and is logged."""
    map_dict = {}
    int_entry = []
    parse_bfchar(line=b"057e 1337", map_dict=map_dict, int_entry=int_entry)
    parse_bfchar(line=b"056e 1f310", map_dict=map_dict, int_entry=int_entry)

    assert map_dict == {-1: 2, "ծ": "", "վ": "ጷ"}
    assert int_entry == [1406, 1390]
    assert caplog.messages == ["Got invalid hex string: Odd-length string (b'1f310')"]


def test_parse_bfrange__iteration_limit():
    """LimitReachedError is raised once the /ToUnicode size exceeds 100000 entries."""
    writer = PdfWriter()

    to_unicode = StreamObject()
    to_unicode.set_data(
        b"beginbfrange\n"
        b"<00000000> <001FFFFF> <00000000>\n"
        b"endbfrange\n"
    )
    font = writer._add_object(DictionaryObject({
        NameObject("/Type"): NameObject("/Font"),
        NameObject("/Subtype"): NameObject("/Type1"),
        NameObject("/BaseFont"): NameObject("/Helvetica"),
        NameObject("/ToUnicode"): to_unicode,
    }))

    page = writer.add_blank_page(width=100, height=100)
    page[NameObject("/Resources")] = DictionaryObject({
        NameObject("/Font"): DictionaryObject({
            NameObject("/F1"): font.indirect_reference,
        })
    })

    # Case without list, exceeding list directly.
    with pytest.raises(
        expected_exception=LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 2097152 > 100000\.$"
    ):
        _ = page.extract_text()

    # Use a pre-filled dummy list to simulate multiple calls where the upper bound does
    # not overflow, but the overall size does. Case without list.
    int_entry = [0] * 99_999
    map_dict = {}
    with pytest.raises(
        expected_exception=LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 165535 > 100000\.$"
    ):
        _ = parse_bfrange(line=b"0000 FFFF 0000", map_dict=map_dict, int_entry=int_entry, multiline_rg=None)
    assert map_dict == {-1: 2}

    # Exceeding from previous call.
    int_entry.append(1)
    map_dict = {}
    with pytest.raises(
        expected_exception=LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
    ):
        _ = parse_bfrange(line=b"00000000 00000000 00000000", map_dict=map_dict, int_entry=int_entry, multiline_rg=None)
    assert map_dict == {-1: 4}

    # multiline_rg
    int_entry = [0] * 99_995
    map_dict = {-1: 1}
    with pytest.raises(
        expected_exception=LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
    ):
        _ = parse_bfrange(
            line=b"0020 0021 0022 0023 0024 0025 0026 2019",
            map_dict=map_dict, int_entry=int_entry, multiline_rg=(32, 251)
        )
    # Entries mapped before the limit was hit remain in the dictionary.
    assert map_dict == {-1: 1, " ": " ", "!": "!", '"': '"', "#": "#", "$": "$"}

    # No multiline_rg, but list.
    int_entry = [0] * 99_995
    map_dict = {}
    with pytest.raises(
        expected_exception=LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
    ):
        _ = parse_bfrange(
            line=b"01 8A [ FFFD FFFD FFFD FFFF FFAB AAAA BBBB",
            map_dict=map_dict, int_entry=int_entry, multiline_rg=None
        )
    assert map_dict == {-1: 1, "\x01": "�", "\x02": "�", "\x03": "�", "\x04": "\uffff", "\x05": "ᆱ"}


def test_parse_bfchar__iteration_limit():
    """parse_bfchar raises LimitReachedError and leaves `map_dict` empty."""
    int_entry = [0] * 99_995
    map_dict = {}
    with pytest.raises(
        expected_exception=LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 100002 > 100000\.$"
    ):
        parse_bfchar(
            line=b"0003 0020 0008 0025 0009 0026 000A 0027 000B 0028 000C 0029 000D 002A",
            map_dict=map_dict, int_entry=int_entry,
        )
    assert map_dict == {}


================================================
FILE: tests/test_codecs.py
================================================
"""Test LZW-related code."""

from io import BytesIO

import pytest

from pypdf import PdfReader
from pypdf._codecs._codecs import LzwCodec
from pypdf.errors import LimitReachedError

from .
import RESOURCE_ROOT, get_data_from_url test_cases = [ pytest.param(b"", id="Empty input"), pytest.param(b"A", id="Single character"), pytest.param(b"AAAAAA", id="Repeating character"), pytest.param(b"Hello, World!", id="Simple text"), pytest.param(b"ABABABABABAB", id="Repeating pattern"), pytest.param(b"The quick brown fox jumps over the lazy dog", id="Longer text"), pytest.param(b"\x00\xFF\x00\xFF", id="Binary data"), pytest.param( b"BBBCBDBEBFBGBHBIBJBKBLBMBNBOBPBQBRBSBTBUBVBWBXBYBZB[B\\B]B^B_B`BaBbBcBdBeBfBgBhBiBjBkBlBmBnBoBpBqBrBsBtBuBvBwBxByCBCCCDCECFCGCHCICJCKCLCMCNCOCPCQCRCSCTCUCVCWCXCYCZC[C\\C]C^C_C`CaCbCcCdCeCfCgChCiCjCkClCmCnCoCpCqCrCsCtCuCvCwCxCyDBDCDDDEDFDGDHDIDJDKDLDMDNDODPDQDRDSDTDUDVDWDXDYDZD[D\\D]D^D_D`DaDbDcDdDeDfDgDhDiDjDkDlDmDnDoDpDqDrDsDtDuDvDwDxDyEBECEDEEEFEGEHEIEJEKELEMENEOEPEQERESETEUEVEWEXEYEZE[E\\E]E^E_E`EaEbEcEdEeEfEgEhEiEjEkElEmEnEoEpEqErEsEtEuEvEwExEyFBFCFDFEFFFGFHFIFJFKFLFMFNFOFPFQFRFSFTFUFVFWFXFYFZF[F\\F]F^F_F`FaFbFcFdFeFfFgFhFiFjFkFlFmFnFoFpFqFrFsFtFuFvFwFxFyGBGCGDGEGFGGGHGIGJGKGLGMGNGOGPGQGRGSGTGUGVGWGXGYGZG[G\\G]G^G_G`GaGbGcGdGeGfGgGhGiGjGkGlGmGnGoGpGqGrGsGtGuGvGwGxGyHBHCHDHEHFHGHHHIHJHKHLHMHNHOHPHQHRHSHTHUHVHWHXHYHZH[H\\H]H^H_H`HaHbHcHdHeHfHgHhHiHjHkHlHmHnHoHpHqHrHsHtHuHvHwHxHyIBICIDIEIFIGIHIIIJIKILIMINIOIPIQIRISITIUIVIWIXIYIZI[I\\I]I^I_I`IaIbIcIdIeIfIgIhIiIjIkIlImInIoIpIqIrIsItIuIvIwIxIyJBJCJDJEJFJGJHJIJJJKJLJMJNJOJPJQJRJSJTJUJVJWJXJYJZJ[J\\J]J^J_J`JaJbJcJdJeJfJgJhJiJjJkJlJmJnJoJpJqJrJsJtJuJvJwJxJyKBKCKDKEKFKGKHKIKJKKKLKMKNKOKPKQKRKSKTKUKVKWKXKYKZK[K\\K]K^K_K`KaKbKcKdKeKfKgKhKiKjKkKlKmKnKoKpKqKrKsKtKuKvKwKxKyLBLCLDLELFLGLHLILJLKLLLMLNLOLPLQLRLSLTLULVLWLXLYLZL[L\\L]L^L_L`LaLbLcLdLeLfLgLhLiLjLkLlLmLnLoLpLqLrLsLtLuLvLwLxLyMBMCMDMEMFMGMHMIMJMKMLMMMNMOMPMQMRMSMTMUMVMWMXMYMZM[M\\M]M^M_M`MaMbMcMdMeMfMgMhMiMjMkMlMmMnMoMpMqMrMsMtMuMvMwMxMyNBNCNDNENFNGNHNINJNKNLNMNNNONPNQNRNSNTNUNVNWNXNYNZN[N\\N]N^N_N`NaNbNcNdNeNfNgNhNiNjNkNlNmNnNoNpNqNrNsNtNuNvNwNxNyOBOCODOEOFOGOHOIOJOKOLOMONOOOPOQOROSOTOUOVOWOXOYOZO[O\\O]O^O_O`OaObOcOdOeOfOgOhOiOjOkOlOmOn
OoOpOqOrOsOtOuOvOwOxOyPBPCPDPEPFPGPHPIPJPKPLPMPNPOPPPQPRPSPTPUPVPWPXPYPZP[P\\P]P^P_P`PaPbPcPdPePfPgPhPiPjPkPlPmPnPoPpPqPrPsPtPuPvPwPxPyQBQCQDQEQFQGQHQIQJQKQLQMQNQOQPQQQRQSQTQUQVQWQXQYQZQ[Q\\Q]Q^Q_Q`QaQbQcQdQeQfQgQhQiQjQkQlQmQnQoQpQqQrQsQtQuQvQwQxQyRBRCRDRERFRGRHRIRJRKRLRMRNRORPRQRRRSRTRURVRWRXRYRZR[R\\R]R^R_R`RaRbRcRdReRfRgRhRiRjRkRlRmRnRoRpRqRrRsRtRuRvRwRxRySBSCSDSESFSGSHSISJSKSLSMSNSOSPSQSRSSSTSUSVSWSXSYSZS[S\\S]S^S_S`SaSbScSdSeSfSgShSiSjSkSlSmSnSoSpSqSrSsStSuSvSwSxSyTBTCTDTETFTGTHTITJTKTLTMTNTOTPTQTRTSTTTUTVTWTXTYTZT[T\\T]T^T_T`TaTbTcTdTeTfTgThTiTjTkTlTmTnToTpTqTrTsTtTuTvTwTxTyUBUCUDUEUFUGUHUIUJUKULUMUNUOUPUQURUSUTUUUVUWUXUYUZU[U\\U]U^U_U`UaUbUcUdUeUfUgUhUiUjUkUlUmUnUoUpUqUrUsUtUuUvUwUxUyVBVCVDVEVFVGVHVIVJVKVLVMVNVOVPVQVRVSVTVUVVVWVXVYVZV[V\\V]V^V_V`VaVbVcVdVeVfVgVhViVjVkVlVmVnVoVpVqVrVsVtVuVvVwVxVyWBWCWDWEWFWGWHWIWJWKWLWMWNWOWPWQWRWSWTWUWVWWWXWYWZW[W\\W]W^W_W`WaWbWcWdWeWfWgWhWiWjWkWlWmWnWoWpWqWrWsWtWuWvWwWxWyXBXCXDXEXFXGXHXIXJXKXLXMXNXOXPXQXRXSXTXUXVXWXXXYXZX[X\\X]X^X_X`XaXbXcXdXeXfXgXhXiXjXkXlXmXnXoXpXqXrXsXtXuXvXwXxXyYBYCYDYEYFYGYHYIYJYKYLYMYNYOYPYQYRYSYTYUYVYWYXYYYZY[Y\\Y]Y^Y_Y`YaYbYcYdYeYfYgYhYiYjYkYlYmYnYoYpYqYrYsYtYuYvYwYxYyZBZCZDZEZFZGZHZIZJZKZLZMZNZOZPZQZRZSZTZUZVZWZXZYZZZ[Z\\Z]Z^Z_Z`ZaZbZcZdZeZfZgZhZiZjZkZlZmZnZoZpZqZrZsZtZuZvZwZxZy[B[C[D[E[F[G[H[I[J[K[L[M[N[O[P[Q[R[S[T[U[V[W[X[Y[Z[[[\\[][^[_[`[a[b[c[d[e[f[g[h[i[j[k[l[m[n[o[p[q[r[s[t[u[v[w[x[y\\B\\C\\D\\E\\F\\G\\H\\I\\J\\K\\L\\M\\N\\O\\P\\Q\\R\\S\\T\\U\\V\\W\\X\\Y\\Z\\[\\\\\\]\\^\\_\\`\\a\\b\\c\\d\\e\\f\\g\\h\\i\\j\\k\\l\\m\\n\\o\\p\\q\\r\\s\\t\\u\\v\\w\\x\\y]B]C]D]E]F]G]H]I]J]K]L]M]N]O]P]Q]R]S]T]U]V]W]X]Y]Z][]\\]]]^]_]`]a]b]c]d]e]f]g]h]i]j]k]l]m]n]o]p]q]r]s]t]u]v]w]x]y^B^C^D^E^F^G^H^I^J^K^L^M^N^O^P^Q^R^S^T^U^V^W^X^Y^Z^[^\\^]^^^_^`^a^b^c^d^e^f^g^h^i^j^k^l^m^n^o^p^q^r^s^t^u^v^w^x^y_B_C_D_E_F_G_H_I_J_K_L_M_N_O_P_Q_R_S_T_U_V_W_X_Y_Z_[_\\_]_^___`_a_b_c_d_e_f_g_h_i_j_k_l_m_n_o_p_q_r_s_t_u_v_w_x_y`B`C`D`E`F`G`H`I`J`K`L`M`N`O`P`Q`R`S`T`U`V`W`X`Y`Z`[`\\`]`^`_```a`b`c`d`e`f`g`h`i`j`k`l`m`n`o`p`q`r`s`t`u`v`w`x`ya
BaCaDaEaFaGaHaIaJaKaLaMaNaOaPaQaRaSaTaUaVaWaXaYaZa[a\\a]a^a_a`aaabacadaeafagahaiajakalamanaoapaqarasatauavawaxaybBbCbDbEbFbGbHbIbJbKbLbMbNbObPbQbRbSbTbUbVbWbXbYbZb[b\\b]b^b_b`babbbcbdbebfbgbhbibjbkblbmbnbobpbqbrbsbtbubvbwbxbycBcCcDcEcFcGcHcIcJcKcLcMcNcOcPcQcRcScTcUcVcWcXcYcZc[c\\c]c^c_c`cacbcccdcecfcgchcicjckclcmcncocpcqcrcsctcucvcwcxcydBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsdtdudvdwdxdyeBeCeDeEeFeGeHeIeJeKeLeMeNeOePeQeReSeTeUeVeWeXeYeZe[e\\e]e^e_e`eaebecedeeefegeheiejekelemeneoepeqereseteuevewexeyfBfCfDfEfFfGfHfIfJfKfLfMfNfOfPfQfRfSfTfUfVfWfXfYfZf[f\\f]f^f_f`fafbfcfdfefffgfhfifjfkflfmfnfofpfqfrfsftfufvfwfxfygBgCgDgEgFgGgHgIgJgKgLgMgNgOgPgQgRgSgTgUgVgWgXgYgZg[g\\g]g^g_g`gagbgcgdgegfggghgigjgkglgmgngogpgqgrgsgtgugvgwgxgyhBhChDhEhFhGhHhIhJhKhLhMhNhOhPhQhRhShThUhVhWhXhYhZh[h\\h]h^h_h`hahbhchdhehfhghhhihjhkhlhmhnhohphqhrhshthuhvhwhxhyiBiCiDiEiFiGiHiIiJiKiLiMiNiOiPiQiRiSiTiUiViWiXiYiZi[i\\i]i^i_i`iaibicidieifigihiiijikiliminioipiqirisitiuiviwixiyjBjCjDjEjFjGjHjIjJjKjLjMjNjOjPjQjRjSjTjUjVjWjXjYjZj[j\\j]j^j_j`jajbjcjdjejfjgjhjijjjkjljmjnjojpjqjrjsjtjujvjwjxjykBkCkDkEkFkGkHkIkJkKkLkMkNkOkPkQkRkSkTkUkVkWkXkYkZk[k\\k]k^k_k`kakbkckdkekfkgkhkikjkkklkmknkokpkqkrksktkukvkwkxkylBlClDlElFlGlHlIlJlKlLlMlNlOlPlQlRlSlTlUlVlWlXlYlZl[l\\l]l^l_l`lalblcldlelflglhliljlklllmlnlolplqlrlsltlulvlwlxlymBmCmDmEmFmGmHmImJmKmLmMmNmOmPmQmRmSmTmUmVmWmXmYmZm[m\\m]m^m_m`mambmcmdmemfmgmhmimjmkmlmmmnmompmqmrmsmtmumvmwmxmynBnCnDnEnFnGnHnInJnKnLnMnNnOnPnQnRnSnTnUnVnWnXnYnZn[n\\n]n^n_n`nanbncndnenfngnhninjnknlnmnnnonpnqnrnsntnunvnwnxnyoBoCoDoEoFoGoHoIoJoKoLoMoNoOoPoQoRoSoToUoVoWoXoYoZo[o\\o]o^o_o`oaobocodoeofogohoiojokolomonooopoqorosotouovowoxoypBpCpDpEpFpGpHpIpJpKpLpMpNpOpPpQpRpSpTpUpVpWpXpYpZp[p\\p]p^p_p`papbpcpdpepfpgphpipjpkplpmpnpopppqprpsptpupvpwpxpyqBqCqDqEqFqGqHqIqJqKqLqMqNqOqPqQqRqSqTqUqVqWqXqYqZq[q\\q]q^q_q`qaqbqcqdqeqfqgqhqiqjqkqlqmqnqoqpqqqrqsqtquqvqwqxqyrBrCrDrErFrGrHrIrJrKrLrMrNrOrPrQrRrSrTrUrVrWrXrYrZr[r\\r]r^r_r`rarbrcrdrerfrgrhr
irjrkrlrmrnrorprqrrrsrtrurvrwrxrysBsCsDsEsFsGsHsIsJsKsLsMsNsOsPsQsRsSsTsUsVsWsXsYsZs[s\\s]s^s_s`sasbscsdsesfsgshsisjskslsmsnsospsqsrssstsusvswsxsytBtCtDtEtFtGtHtItJtKtLtMtNtOtPtQtRtStTtUtVtWtXtYtZt[t\\t]t^t_t`tatbtctdtetftgthtitjtktltmtntotptqtrtstttutvtwtxtyuBuCuDuEuFuGuHuIuJuKuLuMuNuOuPuQuRuSuTuUuVuWuXuYuZu[u\\u]u^u_u`uaubucudueufuguhuiujukulumunuoupuqurusutuuuvuwuxuyvBvCvDvEvFvGvHvIvJvKvLvMvNvOvPvQvRvSvTvUvVvWvXvYvZv[v\\v]v^v_v`vavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvywBwCwDwEwFwGwHwIwJwKwLwMwNwOwPwQwRwSwTwUwVwWwXwYwZw[w\\w]w^w_w`wawbwcwdwewfwgwhwiwjwkwlwmwnwowpwqwrwswtwuwvwwwxwyxBxCxDxExFxGxHxIxJxKxLxMxNxOxPxQxRxSxTxUxVxWxXxYxZx[x\\x]x^x_x`xaxbxcxdxexfxgxhxixjxkxlxmxnxoxpxqxrxsxtxuxvxwxxxyyByCyDyEyFyGyHyIyJyKyLyMyNyOyPyQyRySyTyUyVyWyXyYyZy[y\\y]y^y_y`yaybycydyeyfygyhyiyjykylymynyoypyqyrysytyuyvywyxyyBBBBBCBBDBBEBBFBBGBBHBBIBBJBBKBBLBBMBBNBBOBBPBBQBBRBBSBBTBBUBBVBBWBBXBBYBBZBB[BB\\BB]BB^BB_BB`BBaBBbBBcBBdBBeBBfBBgBBhBBiBBjBBkBBlBBmBBnBBoBBpBBqBBrBBsBBtBBuBBvBBwBBxBByBCBBCCBCDBCEBCFBCGBCHBCIBCJBCKBCLBCMBCNBCOBCPBCQBCRBCSBCTBCUBCVBCWBCXBCYBCZBC[BC\\BC]BC^BC_BC`BCaBCbBCcBCdBCeBCfBCgBChBCiBCjBCkBClBCmBCnBCoBCpBCqBCrBCsBCtBCuBCvBCwBCxBCyBDBBDCBDDBDEBDFBDGBDHBDIBDJBDKBDLBDMBDNBDOBDPBDQBDRBDSBDTBDUBDVBDWBDXBDYBDZBD[BD\\BD]BD^BD_BD`BDaBDbBDcBDdBDeBDfBDgBDhBDiBDjBDkBDlBDmBDnBDoBDpBDqBDrBDsBDtBDuBDvBDwBDxBDyBEBBECBEDBEEBEFBEGBEHBEIBEJBEKBELBEMBENBEOBEPBEQBERBESBETBEUBEVBEWBEXBEYBEZBE[BE\\BE]BE^BE_BE`BEaBEbBEcBEdBEeBEfBEgBEhBEiBEjBEkBElBEmBEnBEoBEpBEqBErBEsBEtBEuBEvBEwBExBEyBFBBFCBFDBFEBFFBFGBFHBFIBFJBFKBFLBFMBFNBFOBFPBFQBFRBFSBFTBFUBFVBFWBFXBFYBFZBF[BF\\BF]BF^BF_BF`BFaBFbBFcBFdBFeBFfBFgBFhBFiBFjBFkBFlBFmBFnBFoBFpBFqBFrBFsBFtBFuBFvBFwBFxBFyBGBBGCBGDBGEBGFBGGBGHBGIBGJBGKBGLBGMBGNBGOBGPBGQBGRBGSBGTBGUBGVBGWBGXBGYBGZBG[BG\\BG]BG^BG_BG`BGaBGbBGcBGdBGeBGfBGgBGhBGiBGjBGkBGlBGmBGnBGoBGpBGqBGrBGsBGtBGuBGvBGwBGxBGyBHBBHCBHDBHEBHFBHGBHHBHIBHJBHKBHLBHMBHNBHOBHPBHQBHRBHSBHTBHUBHVBHWBHXBHYBHZBH[BH\\BH]BH^BH_BH`BHaBHbBHcBHdBHeBHfBHgBHhBHiBHjBHkBHlBHmBHnBHoBHpBHqBHrBHsBHtBHuBHvBH
wBHxBHyBIBBICBIDBIEBIFBIGBIHBIIBIJBIKBILBIMBINBIOBIPBIQBIRBISBITBIUBIVBIWBIXBIYBIZBI[BI\\BI]BI^BI_BI`BIaBIbBIcBIdBIeBIfBIgBIhBIiBIjBIkBIlBImBInBIoBIpBIqBIrBIsBItBIuBIvBIwBIxBIyBJBBJCBJDBJEBJFBJGBJHBJIBJJBJKBJLBJMBJNBJOBJPBJQBJRBJSBJTBJUBJVBJWBJXBJYBJZBJ[BJ\\BJ]BJ^BJ_BJ`BJaBJbBJcBJdBJeBJfBJgBJhBJiBJjBJkBJlBJmBJnBJoBJpBJqBJrBJsBJtBJuBJvBJwBJxBJyBKBBKCBKDBKEBKFBKGBKHBKIBKJBKKBKLBKMBKNBKOBKPBKQBKRBKSBKTBKUBKVBKWBKXBKYBKZBK[BK\\BK]BK^BK_BK`BKaBKbBKcBKdBKeBKfBKgBKhBKiBKjBKkBKlBKmBKnBKoBKpBKqBKrBKsBKtBKuBKvBKwBKxBKyBLBBLCBLDBLEBLFBLGBLHBLIBLJBLKBLLBLMBLNBLOBLPBLQBLRBLSBLTBLUBLVBLWBLXBLYBLZBL[BL\\BL]BL^BL_BL`BLaBLbBLcBLdBLeBLfBLgBLhBLiBLjBLkBLlBLmBLnBLoBLpBLqBLrBLsBLtBLuBLvBLwBLxBLyBMBBMCBMDBMEBMFBMGBMHBMIBMJBMKBMLBMMBMNBMOBMPBMQBMRBMSBMTBMUBMVBMWBMXBMYBMZBM[BM\\BM]BM^BM_BM`BMaBMbBMcBMdBMeBMfBMgBMhBMiBMjBMkBMlBMmBMnBMoBMpBMqBMrBMsBMtBMuBMvBMwBMxBMyBNBBNCBNDBNEBNFBNGBNHBNIBNJBNKBNLBNMBNNBNOBNPBNQBNRBNSBNTBNUBNVBNWBNXBNYBNZBN[BN\\BN]BN^BN_", id="Table overflow", ), ] @pytest.mark.parametrize("data", test_cases) def test_encode_decode(data): """Decoder and encoder match.""" codec = LzwCodec() compressed_data = codec.encode(data) decoded = codec.decode(compressed_data) assert decoded == data @pytest.mark.parametrize( ("plain", "expected_encoded"), [ (b"", b"\x80@@"), (b"A", b"\x80\x10` "), (b"AAAAAA", b"\x80\x10`P8\x08"), (b"Hello, World!", b"\x80\x12\x0c\xa6\xc3a\xbcX +\x9b\xceF\xc3 \x86\x02"), ], ) def test_encode_lzw(plain, expected_encoded): codec = LzwCodec() actual_encoded = codec.encode(plain) assert actual_encoded == expected_encoded @pytest.mark.parametrize( ("encoded", "expected_decoded"), [ # _pack_codes_into_bytes([256, 65, 66, 67, 68, 256, 256, 69, 70, 71, 72, 257]) (b"\x80\x10HD2$\x02\x00E#\x11\xc9\x10\x10", b"ABCDEFGH"), # Clear twice. # _pack_codes_into_bytes([65, 66, 67, 68, 257]) (b" \x90\x88dH\x08", b"ABCD"), # No explicit initial clear marker. 
], ) def test_decode_lzw(encoded, expected_decoded): codec = LzwCodec() actual_decoded = codec.decode(encoded) assert actual_decoded == expected_decoded def test_lzw_decoder_table_overflow(caplog): path = RESOURCE_ROOT / "lzw_decoder_table_overflow.bin" codec = LzwCodec() assert codec.decode(path.read_bytes()).startswith( b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@' ) assert len(codec.decoding_table) == 4096 assert "Ignoring too large LZW table index." in caplog.text @pytest.mark.enable_socket @pytest.mark.timeout(timeout=15, method="thread") def test_lzw_decoder_large_stream_performance(caplog): LzwCodec().decode(get_data_from_url(name="large_lzw_example_encoded.dat")) @pytest.mark.enable_socket def test_lzw_decoder__output_limit(): url = "https://github.com/user-attachments/files/23057035/lzw__output_limit.pdf" name = "lzw__output_limit.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] with pytest.raises( expected_exception=LimitReachedError, match=r"^Limit reached while decompressing: 75000828 > 75000000$" ): page.images[0].image.load() ================================================ FILE: tests/test_constants.py ================================================ """Test the pypdf.constants module.""" import re from typing import Callable import pytest from pypdf.constants import PDF_KEYS, GraphicsStateParameters, UserAccessPermissions def test_slash_prefix(): """ Naming conventions of PDF_KEYS (constant names) are followed. 
This test function validates if PDF key names follow the required pattern: - Starts with a slash "/" - Followed by an uppercase letter - Contains alphanumeric characters (letters and digits) - The attribute name should be a case-insensitive match, with underscores removed """ pattern = re.compile(r"^\/[A-Z]+[a-zA-Z0-9]*$") for cls in PDF_KEYS: for attr in dir(cls): # Skip magic methods if attr.startswith("__") and attr.endswith("__"): continue # Skip methods constant_value = getattr(cls, attr) if isinstance(constant_value, Callable): continue assert constant_value.startswith("/") assert attr.replace("_", "").casefold() == constant_value[1:].casefold() # There are a few exceptions that may be lowercase if cls == GraphicsStateParameters and attr in ["ca", "op"]: continue assert pattern.match(constant_value) def test_user_access_permissions__dict_handling(): # Value is mix of configurable and reserved bits. # Reserved bits should not be part of the dictionary. as_dict = UserAccessPermissions(512 + 64 + 8).to_dict() assert as_dict == { "add_or_modify": False, "assemble_doc": False, "extract": False, "extract_text_and_graphics": True, "fill_form_fields": False, "modify": True, "print": False, "print_to_representation": False, } # Convert the dictionary back to an integer. # This should add the reserved bits automatically. permissions = UserAccessPermissions.from_dict(as_dict) assert permissions == 4294963912 # Roundtrip for valid dictionary. data = { "add_or_modify": True, "assemble_doc": False, "extract": False, "extract_text_and_graphics": True, "fill_form_fields": False, "modify": True, "print": False, "print_to_representation": True, } assert UserAccessPermissions.from_dict(data).to_dict() == data # Empty inputs. assert UserAccessPermissions.from_dict({}) == 4294963392 # Reserved bits. 
assert UserAccessPermissions(0).to_dict() == { "add_or_modify": False, "assemble_doc": False, "extract": False, "extract_text_and_graphics": False, "fill_form_fields": False, "modify": False, "print": False, "print_to_representation": False, } # Unknown dictionary keys. data = { "add_or_modify": True, "key1": False, "key2": True, } unknown = { "key1": False, "key2": True, } with pytest.raises( ValueError, match=f"Unknown dictionary keys: {unknown!r}" ): UserAccessPermissions.from_dict(data) def test_user_access_permissions__all(): all_permissions = UserAccessPermissions.all() all_int = int(all_permissions) all_string = bin(all_permissions) assert all_string.startswith("0b") assert len(all_string[2:]) == 32 # 32-bit integer assert all_int & UserAccessPermissions.R1 == 0 assert all_int & UserAccessPermissions.R2 == 0 assert all_int & UserAccessPermissions.PRINT == UserAccessPermissions.PRINT assert all_int & UserAccessPermissions.R7 == UserAccessPermissions.R7 assert all_int & UserAccessPermissions.R31 == UserAccessPermissions.R31 ================================================ FILE: tests/test_doc_common.py ================================================ """Test the pypdf._doc_common module.""" import itertools import re import shutil import subprocess from io import BytesIO from operator import itemgetter from pathlib import Path from unittest import mock import pytest from pypdf import PdfReader, PdfWriter from pypdf.errors import LimitReachedError, PdfReadError from pypdf.filters import FlateDecode from pypdf.generic import ( ArrayObject, DictionaryObject, EmbeddedFile, EncodedStreamObject, NameObject, NullObject, TextStringObject, ViewerPreferences, ) from tests import RESOURCE_ROOT, SAMPLE_ROOT, get_data_from_url PDFATTACH_BINARY = shutil.which("pdfattach") @pytest.mark.skipif(PDFATTACH_BINARY is None, reason="Requires poppler-utils") def test_attachments(tmpdir): tmpdir = Path(tmpdir) # No attachments. 
clean_path = SAMPLE_ROOT / "002-trivial-libre-office-writer" / "002-trivial-libre-office-writer.pdf" with PdfReader(clean_path) as pdf: assert pdf._list_attachments() == [] assert list(pdf.attachment_list) == [] # UF = name. attached_path = tmpdir / "attached.pdf" file_path = tmpdir / "test.txt" file_path.write_bytes(b"Hello World\n") subprocess.run([PDFATTACH_BINARY, clean_path, file_path, attached_path]) # noqa: S603 with PdfReader(attached_path) as pdf: assert pdf._list_attachments() == ["test.txt"] assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"} assert [(x.name, x.content) for x in pdf.attachment_list] == [("test.txt", b"Hello World\n")] assert next(pdf.attachment_list).alternative_name == "test.txt" # UF != name. different_path = tmpdir / "different.pdf" different_path.write_bytes(re.sub(rb" /UF [^/]+ /", b" /UF(my-file.txt) /", attached_path.read_bytes())) with PdfReader(different_path) as pdf: assert pdf._list_attachments() == ["test.txt", "my-file.txt"] assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"} assert pdf._get_attachments("my-file.txt") == {"my-file.txt": b"Hello World\n"} assert [(x.name, x.content) for x in pdf.attachment_list] == [("test.txt", b"Hello World\n")] assert next(pdf.attachment_list).alternative_name == "my-file.txt" # Only name. no_f_path = tmpdir / "no-f.pdf" no_f_path.write_bytes(re.sub(rb" /UF [^/]+ /", b" /", attached_path.read_bytes())) with PdfReader(no_f_path) as pdf: assert pdf._list_attachments() == ["test.txt"] assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"} assert [(x.name, x.content) for x in pdf.attachment_list] == [("test.txt", b"Hello World\n")] assert next(pdf.attachment_list).alternative_name is None # UF and F. 
uf_f_path = tmpdir / "uf-f.pdf" uf_f_path.write_bytes(attached_path.read_bytes().replace(b" /UF ", b"/F(file.txt) /UF ")) with PdfReader(uf_f_path) as pdf: assert pdf._list_attachments() == ["test.txt"] assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"} assert [(x.name, x.content) for x in pdf.attachment_list] == [("test.txt", b"Hello World\n")] assert next(pdf.attachment_list).alternative_name == "test.txt" # Only F. only_f_path = tmpdir / "f.pdf" only_f_path.write_bytes(attached_path.read_bytes().replace(b" /UF ", b" /F ")) with PdfReader(only_f_path) as pdf: assert pdf._list_attachments() == ["test.txt"] assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"} assert [(x.name, x.content) for x in pdf.attachment_list] == [("test.txt", b"Hello World\n")] assert next(pdf.attachment_list).alternative_name == "test.txt" def test_get_attachments__same_attachment_more_than_twice(): writer = PdfWriter() writer.add_blank_page(100, 100) for i in range(5): writer.add_attachment("test.txt", f"content{i}") assert writer._get_attachments("test.txt") == { "test.txt": [b"content0", b"content1", b"content2", b"content3", b"content4"] } assert [(x.name, x.content) for x in writer.attachment_list] == [ ("test.txt", b"content0"), ("test.txt", b"content1"), ("test.txt", b"content2"), ("test.txt", b"content3"), ("test.txt", b"content4"), ] def test_get_attachments__alternative_name_is_none(): writer = PdfWriter() attachment = EmbeddedFile(name="test.txt", pdf_object=writer.root_object) assert attachment.alternative_name is None with mock.patch( "pypdf._writer.PdfWriter.attachment_list", new_callable=mock.PropertyMock(return_value=[attachment]) ), mock.patch( "pypdf.generic._files.EmbeddedFile.content", new_callable=mock.PropertyMock(return_value=b"content") ): assert writer._get_attachments() == {"test.txt": b"content"} @pytest.mark.enable_socket def test_byte_encoded_named_destinations(): url = 
"https://github.com/user-attachments/files/19820164/pypdf_issue.pdf" name = "issue3261.pdf" reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name))) page = reader.pages[0] for annotation in page.annotations: if annotation.get("/Subtype") == "/Link": action = annotation["/A"] if action["/S"] == "/GoTo": named_dest = action["/D"] assert str(named_dest) in reader.named_destinations assert TextStringObject(named_dest) in reader.named_destinations assert reader.named_destinations == { "Doc-Start": { "/Title": "Doc-Start", "/Page": page.indirect_reference, "/Type": "/XYZ", "/Left": 133.768, "/Top": 667.198, "/Zoom": NullObject() }, "cite.dacÃ\xadk2025racerflightweightstaticdata": { "/Title": "cite.dacÃ\xadk2025racerflightweightstaticdata", "/Page": page.indirect_reference, "/Type": "/XYZ", "/Left": 133.768, "/Top": 614.424, "/Zoom": NullObject() }, # This is the same as the previous entry, but with `str(name)` instead of the title. "楣整搮捡귃㉫㈰爵捡牥汦杩瑨敷杩瑨瑳瑡捩慤慴": { "/Left": 133.768, "/Page": page.indirect_reference, "/Title": "cite.dacÃ\xadk2025racerflightweightstaticdata", "/Top": 614.424, "/Type": "/XYZ", "/Zoom": NullObject() }, "page.1": { "/Title": "page.1", "/Page": page.indirect_reference, "/Type": "/XYZ", "/Left": 132.768, "/Top": 705.06, "/Zoom": NullObject() }, "section*.1": { "/Title": "section*.1", "/Page": page.indirect_reference, "/Type": "/XYZ", "/Left": 133.768, "/Top": 642.222, "/Zoom": NullObject() } } def test_viewer_preferences__indirect_reference(): input_path = RESOURCE_ROOT / "git.pdf" reader = PdfReader(input_path) assert (0, 24) not in reader.resolved_objects viewer_preferences = reader.viewer_preferences assert isinstance(viewer_preferences, ViewerPreferences) assert viewer_preferences == {"/DisplayDocTitle": True} assert (0, 24) in reader.resolved_objects assert id(viewer_preferences) == id(reader.viewer_preferences) assert id(viewer_preferences) == id(reader.resolved_objects[(0, 24)]) @pytest.mark.enable_socket def 
test_named_destinations__tree_is_null_object(): url = "https://github.com/user-attachments/files/20885216/test.pdf" name = "issue3330.pdf" reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name))) assert reader.named_destinations == {} @pytest.mark.enable_socket def test_outline__issue3462(): url = "https://github.com/user-attachments/files/22293402/e371fffe0b_a7cccde95a.pdf" name = "issue3462.pdf" reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name))) outline_flat = list( itertools.chain.from_iterable( entry if isinstance(entry, list) else [entry] for entry in reader.outline ) ) assert list(map(itemgetter("/Title"), outline_flat)) == [ "AR 2021 - Daftar Isi", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "AR 2021 Book 001 (Highlights - Ikhtisar Saham)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "AR 2021 Book 002 (Laporan Manajemen)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "Page 7", "Page 8", "Page 9", "AR 2021 Book 003-1 (Profil Perusahaan)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "Page 7", "Page 8", "Page 9", "Page 10", "Page 11", "Page 12", "Page 13", "Page 14", "Page 15", "Page 16", "Page 17", "Page 18", "Page 19", "Page 20", "Page 21", "Page 22", "Page 23", "Page 24", "Page 25", "Page 26", "Page 27", "Page 28", "Page 29", "Page 30", "Page 31", "Page 32", "Page 33", "Page 34", "Page 35", "Page 36", "Page 37", "Page 38", "Page 39", "Page 40", "Page 41", "Page 42", "Page 43", "Page 44", "Page 45", "Page 46", "Page 47", "AR 2021 Book 003-2 (Sumber Daya Manusia)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "Page 7", "Page 8", "Page 9", "Page 10", "Page 11", "Page 12", "AR 2021 Book 003-3 (Komposisi pemegang saham)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "AR 2021 Book 003-4 (Kronologis Pencatatan Saham)", "Page 1", "Page 2", "AR 2021 Book 003-5 (Akuntan Publik Independen)", "Page 1", "Page 2", "Page 3", "AR 2021 Book 004 (Analisa dan Pembahasan 
Manajemen)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "Page 7", "Page 8", "Page 9", "Page 10", "Page 11", "Page 12", "Page 13", "Page 14", "Page 15", "Page 16", "Page 17", "Page 18", "Page 19", "Page 20", "Page 21", "AR 2021 Book 005-1 (Tata Kelola Perusahaan)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "Page 7", "Page 8", "Page 9", "Page 10", "Page 11", "Page 12", "AR 2021 Book 005-2 (Direksi-Komisaris)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "Page 7", "Page 8", "Page 9", "Page 10", "Page 11", "Page 12", "Page 13", "Page 14", "Page 15", "Page 16", "Page 17", "Page 18", "Page 19", "Page 20", "Page 21", "Page 22", "Page 23", "Page 24", "Page 25", "Page 26", "Page 27", "Page 28", "Page 29", "Page 30", "Page 31", "Page 32", "Page 33", "Page 34", "Page 35", "Page 36", "Page 37", "Page 38", "AR 2021 Book 005-3 (Komite Audit)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "Page 7", "Page 8", "Page 9", "AR 2021 Book 005-4 (Sekretaris Perusahaan)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "Page 7", "Page 8", "Page 9", "Page 10", "AR 2021 Book 005-5 (Unit Audit Internal)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "AR 2021 Book 005-6 (Sistem Pengendalian Internal)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "Page 7", "Page 8", "AR 2021 Book 005-7 (Program Saham)", "Page 1", "AR 2021 Book 005-8 ( Whistleblowing)", "Page 1", "Page 2", "Page 3", "Page 4", "Page 5", "Page 6", "Page 7", "Page 8", "Page 9", "Page 10", "Page 11", "Page 12", "Page 13", "Page 14", "Page 15", "Page 16", "Page 17", "Page 18", "Page 19", "Page 20", "Page 21", "Page 22", "Page 23", "Page 24", "Page 25", "AR 2021 Book 006 (Tanggung Jawab Sosial - CSR)", "Page 1", "Page 2", "AR 2021 Book 007-1 (LAPORAN KEUANGAN KONSOLIDASIAN)", "Page 1", "AR 2021 Book 007-2 (Isi Laporan Keuangan)", "AR 2021 Book 008 (Tanggung Jawab Atas Laporan Tahunan)", "Page 1", "Page 2" ] def 
test_flatten__cyclic_references(): path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(path) assert len(reader.pages) == 1 reader._flatten() # Make the first child point to the object itself. pages_object = reader.get_object(10) pages_object[NameObject("/Kids")][0].indirect_reference.idnum = 10 reader.resolved_objects[(10, 0)] = pages_object with pytest.raises(expected_exception=PdfReadError, match=r"^Detected cyclic page references\.$"): reader._flatten() @pytest.mark.enable_socket @pytest.mark.timeout(10) def test_get_outline__cyclic_references(caplog): url = "https://github.com/user-attachments/files/24859044/circular_outline.pdf" name = "circular_outline.pdf" reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name))) assert reader.outline == [ { "/%is_open%": True, "/Page": reader.pages[0].indirect_reference, "/Title": "Bookmark A", "/Type": "/Fit" }, { "/%is_open%": True, "/Page": reader.pages[0].indirect_reference, "/Title": "Bookmark B", "/Type": "/Fit" } ] assert caplog.messages[0].startswith("Detected cycle in outline structure for {") @pytest.mark.enable_socket @pytest.mark.timeout(10) def test_get_outline__cyclic_references__nested_handling(caplog): url = "https://github.com/user-attachments/files/24859044/circular_outline.pdf" name = "circular_outline.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url=url, name=name))) nested_outline = DictionaryObject() writer._add_object(nested_outline) nested_outline.update({ NameObject("/Title"): TextStringObject("Nested entry"), NameObject("/Parent"): writer.get_object(5), NameObject("/Dest"): ArrayObject([writer.pages[0].indirect_reference, NameObject("/Fit")]), NameObject("/Next"): writer.get_object(6), }) writer.get_object(5)[NameObject("/First")] = nested_outline.indirect_reference writer.get_object(6)[NameObject("/First")] = nested_outline.indirect_reference assert writer.outline == [ { "/%is_open%": True, "/Page": writer.pages[0].indirect_reference, "/Title": "Bookmark A", "/Type": 
"/Fit" }, [ { "/%is_open%": True, "/Page": writer.pages[0].indirect_reference, "/Title": "Nested entry", "/Type": "/Fit" }, { "/%is_open%": True, "/Page": writer.pages[0].indirect_reference, "/Title": "Bookmark B", "/Type": "/Fit" } ], { "/%is_open%": True, "/Page": writer.pages[0].indirect_reference, "/Title": "Bookmark B", "/Type": "/Fit" }, [ { "/%is_open%": True, "/Page": writer.pages[0].indirect_reference, "/Title": "Nested entry", "/Type": "/Fit" } ] ] assert caplog.messages[0].startswith("Detected cycle in outline structure for {") def test_xfa__decompression_limit(): payload = b"A" * 100_0000 compressed = FlateDecode.encode(payload, 9) writer = PdfWriter() writer.add_blank_page(width=72, height=72) stream = EncodedStreamObject() stream._data = compressed stream[NameObject("/Filter")] = NameObject("/FlateDecode") stream_reference = writer._add_object(stream) acro = DictionaryObject() acro[NameObject("/XFA")] = ArrayObject([TextStringObject("datasets"), stream_reference]) writer.root_object[NameObject("/AcroForm")] = writer._add_object(acro) data = BytesIO() writer.write(data) data.flush() reader = PdfReader(data) with mock.patch("pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH", 75_000), pytest.raises( expected_exception=LimitReachedError, match=r"^Limit reached while decompressing. 
902 bytes remaining.$" ): _ = reader.xfa ================================================ FILE: tests/test_encryption.py ================================================ """Test the pypdf._encryption module.""" import secrets from io import BytesIO import pytest import pypdf from pypdf import PasswordType, PdfReader, PdfWriter from pypdf._crypt_providers import crypt_provider from pypdf._crypt_providers._fallback import _DEPENDENCY_ERROR_STR from pypdf._encryption import AlgV5, CryptAES, CryptRC4 from pypdf.errors import DependencyError, PdfReadError from tests import RESOURCE_ROOT, SAMPLE_ROOT USE_CRYPTOGRAPHY = crypt_provider[0] == "cryptography" USE_PYCRYPTODOME = crypt_provider[0] == "pycryptodome" HAS_AES = USE_CRYPTOGRAPHY or USE_PYCRYPTODOME @pytest.mark.parametrize( ("name", "requires_aes"), [ # unencrypted pdf ("unencrypted.pdf", False), # created by: # qpdf --encrypt "" "" 40 -- unencrypted.pdf r2-empty-password.pdf ("r2-empty-password.pdf", False), # created by: # qpdf --encrypt "" "" 128 -- unencrypted.pdf r3-empty-password.pdf ("r3-empty-password.pdf", False), # created by: # qpdf --encrypt "asdfzxcv" "" 40 -- unencrypted.pdf r2-user-password.pdf ("r2-user-password.pdf", False), # created by: # qpdf --encrypt "" "asdfzxcv" 40 -- unencrypted.pdf r2-owner-password.pdf ("r2-owner-password.pdf", False), # created by: # qpdf --encrypt "asdfzxcv" "" 128 -- unencrypted.pdf r3-user-password.pdf ("r3-user-password.pdf", False), # created by: # qpdf --encrypt "asdfzxcv" "" 128 --force-V4 -- unencrypted.pdf r4-user-password.pdf ("r4-user-password.pdf", False), # created by: # qpdf --encrypt "" "asdfzxcv" 128 --force-V4 -- unencrypted.pdf r4-owner-password.pdf ("r4-owner-password.pdf", False), # created by: # qpdf --encrypt "asdfzxcv" "" 128 --use-aes=y -- unencrypted.pdf r4-aes-user-password.pdf ("r4-aes-user-password.pdf", True), # created by: # qpdf --encrypt "" "" 256 --force-R5 -- unencrypted.pdf r5-empty-password.pdf ("r5-empty-password.pdf", True), # 
created by: # qpdf --encrypt "asdfzxcv" "" 256 --force-R5 -- unencrypted.pdf r5-user-password.pdf ("r5-user-password.pdf", True), # created by: # qpdf --encrypt "" "asdfzxcv" 256 --force-R5 -- unencrypted.pdf r5-owner-password.pdf ("r5-owner-password.pdf", True), # created by: # qpdf --encrypt "" "" 256 -- unencrypted.pdf r6-empty-password.pdf ("r6-empty-password.pdf", True), # created by: # qpdf --encrypt "asdfzxcv" "" 256 -- unencrypted.pdf r6-user-password.pdf ("r6-user-password.pdf", True), # created by: # qpdf --encrypt "" "asdfzxcv" 256 -- unencrypted.pdf r6-owner-password.pdf ("r6-owner-password.pdf", True), ], ) def test_encryption(name, requires_aes): """ Encrypted PDFs are handled correctly. This test function ensures that: - If PyCryptodome or cryptography is not available and required, a DependencyError is raised - Encrypted PDFs are identified correctly - Decryption works for encrypted PDFs - Metadata is properly extracted from the decrypted PDF """ inputfile = RESOURCE_ROOT / "encryption" / name if requires_aes and not HAS_AES: with pytest.raises(DependencyError) as exc: ipdf = pypdf.PdfReader(inputfile) ipdf.decrypt("asdfzxcv") dd = dict(ipdf.metadata) assert exc.value.args[0] == _DEPENDENCY_ERROR_STR return ipdf = pypdf.PdfReader(inputfile) if str(inputfile).endswith("unencrypted.pdf"): assert not ipdf.is_encrypted else: assert ipdf.is_encrypted ipdf.decrypt("asdfzxcv") assert len(ipdf.pages) == 1 dd = dict(ipdf.metadata) # remove empty value entry dd = {x[0]: x[1] for x in dd.items() if x[1]} assert dd == { "/Author": "cheng", "/CreationDate": "D:20220414132421+05'24'", "/Creator": "WPS Writer", "/ModDate": "D:20220414132421+05'24'", "/SourceModified": "D:20220414132421+05'24'", "/Trapped": "/False", } @pytest.mark.parametrize( ("name", "user_passwd", "owner_passwd"), [ # created by # qpdf --encrypt "foo" "bar" 256 -- unencrypted.pdf r6-both-passwords.pdf ("r6-both-passwords.pdf", "foo", "bar"), ], ) @pytest.mark.skipif(not HAS_AES, reason="No AES 
implementation") def test_pdf_with_both_passwords(name, user_passwd, owner_passwd): """ PDFs with both user and owner passwords are handled correctly. This test function ensures that: - Encrypted PDFs with both user and owner passwords are identified correctly - Decryption works for both user and owner passwords - The correct password type is returned after decryption - The number of pages is correctly identified after decryption """ inputfile = RESOURCE_ROOT / "encryption" / name ipdf = pypdf.PdfReader(inputfile) assert ipdf.is_encrypted assert ipdf.decrypt(user_passwd) == PasswordType.USER_PASSWORD assert ipdf.decrypt(owner_passwd) == PasswordType.OWNER_PASSWORD assert len(ipdf.pages) == 1 @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") def test_aesv2_without_length_in_encrypt_dict(): """ AESV2-encrypted PDF without /Length in encrypt dict decrypts correctly. Some PDFs omit /Length in the main encrypt dict (defaulting to 40 bits), but AESV2 requires 128 bits. The key length should be read from the crypt filter dict instead. """ inputfile = RESOURCE_ROOT / "encryption" / "r4-aes-v2-no-key-length.pdf" reader = PdfReader(inputfile) assert reader.is_encrypted result = reader.decrypt("") assert result in (PasswordType.USER_PASSWORD, PasswordType.OWNER_PASSWORD) assert len(reader.pages) == 1 @pytest.mark.parametrize( ("pdffile", "password"), [ ("crazyones-encrypted-256.pdf", "password"), ("crazyones-encrypted-256.pdf", b"password"), ], ) @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") def test_read_page_from_encrypted_file_aes_256(pdffile, password): """ A page can be read from an encrypted file.
This is a regression test for issue 327: IndexError for get_page() of decrypted file """ path = RESOURCE_ROOT / pdffile pypdf.PdfReader(path, password=password).pages[0] @pytest.mark.parametrize( "names", [ ( [ "unencrypted.pdf", "r3-user-password.pdf", "r4-aes-user-password.pdf", "r5-user-password.pdf", ] ), ], ) @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_encrypted_pdfs(names): """Encrypted PDFs can be merged after decryption.""" merger = pypdf.PdfWriter() files = [RESOURCE_ROOT / "encryption" / x for x in names] pdfs = [pypdf.PdfReader(x) for x in files] for pdf in pdfs: if pdf.is_encrypted: pdf.decrypt("asdfzxcv") merger.append(pdf) # no need to write to file merger.close() @pytest.mark.skipif( USE_CRYPTOGRAPHY, reason="Limitations of cryptography. see https://github.com/pyca/cryptography/issues/2494", ) @pytest.mark.parametrize( "cryptcls", [ CryptRC4, ], ) def test_encrypt_decrypt_with_cipher_class(cryptcls): """Encryption and decryption using a cipher class work as expected.""" message = b"Hello World" key = bytes(0 for _ in range(128)) # b"secret key" crypt = cryptcls(key) assert crypt.decrypt(crypt.encrypt(message)) == message def test_attempt_decrypt_unencrypted_pdf(): """Attempting to decrypt an unencrypted PDF raises a PdfReadError.""" path = RESOURCE_ROOT / "crazyones.pdf" with pytest.raises(PdfReadError) as exc: PdfReader(path, password="nonexistent") assert exc.value.args[0] == "Not an encrypted file" @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") def test_alg_v5_generate_values(): """ Algorithm V5 values are generated without raising exceptions. This test function checks if there is an exception during the value generation. It does not verify that the content is correct. 
""" key = b"0123456789123451" values = AlgV5.generate_values( R=5, user_password=b"foo", owner_password=b"bar", key=key, p=0, metadata_encrypted=True, ) assert values == { "/U": values["/U"], "/UE": values["/UE"], "/O": values["/O"], "/OE": values["/OE"], "/Perms": values["/Perms"], } @pytest.mark.parametrize( ("alg", "requires_aes"), [ ("RC4-40", False), ("RC4-128", False), ("AES-128", True), ("AES-256-R5", True), ("AES-256", True), ("ABCD", False), ], ) def test_pdf_encrypt(pdf_file_path, alg, requires_aes): user_password = secrets.token_urlsafe(10) owner_password = secrets.token_urlsafe(10) reader = PdfReader(RESOURCE_ROOT / "encryption" / "unencrypted.pdf") page = reader.pages[0] text0 = page.extract_text() writer = PdfWriter() writer.add_page(page) # test with invalid algorithm name if alg == "ABCD": with pytest.raises(ValueError) as exc: writer.encrypt( user_password=user_password, owner_password=owner_password, algorithm=alg, ) assert exc.value.args[0] == "Algorithm 'ABCD' NOT supported" return if requires_aes and not HAS_AES: with pytest.raises(DependencyError) as exc: writer.encrypt( user_password=user_password, owner_password=owner_password, algorithm=alg, ) with open(pdf_file_path, "wb") as output_stream: writer.write(output_stream) assert exc.value.args[0] == _DEPENDENCY_ERROR_STR return writer.encrypt( user_password=user_password, owner_password=owner_password, algorithm=alg ) with open(pdf_file_path, "wb") as output_stream: writer.write(output_stream) reader = PdfReader(pdf_file_path) assert reader.is_encrypted assert reader.decrypt(owner_password) == PasswordType.OWNER_PASSWORD assert reader.decrypt(user_password) == PasswordType.USER_PASSWORD page = reader.pages[0] text1 = page.extract_text() assert text0 == text1 @pytest.mark.parametrize( "count", [1, 2, 3, 4, 5, 10], ) def test_pdf_encrypt_multiple(pdf_file_path, count): user_password = secrets.token_urlsafe(10) owner_password = secrets.token_urlsafe(10) reader = PdfReader(RESOURCE_ROOT / 
"encryption" / "unencrypted.pdf") page = reader.pages[0] text0 = page.extract_text() writer = PdfWriter() writer.add_page(page) if count == 1: owner_password = None for _i in range(count): writer.encrypt( user_password=user_password, owner_password=owner_password, algorithm="RC4-128", ) with open(pdf_file_path, "wb") as output_stream: writer.write(output_stream) reader = PdfReader(pdf_file_path) assert reader.is_encrypted if owner_password is None: # NOTICE: owner_password will set to user_password if it's None assert reader.decrypt(user_password) == PasswordType.OWNER_PASSWORD else: assert reader.decrypt(owner_password) == PasswordType.OWNER_PASSWORD assert reader.decrypt(user_password) == PasswordType.USER_PASSWORD page = reader.pages[0] text1 = page.extract_text() assert text0 == text1 @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") def test_aes_decrypt_corrupted_data(): """Just for robustness""" aes = CryptAES(secrets.token_bytes(16)) for num in [0, 17, 32]: aes.decrypt(secrets.token_bytes(num)) @pytest.mark.samples def test_encrypt_stream_dictionary(pdf_file_path): user_password = secrets.token_urlsafe(10) reader = PdfReader(SAMPLE_ROOT / "023-cmyk-image/cmyk-image.pdf") page = reader.pages[0] original_image_obj = reader.get_object(page.images["/I"].indirect_reference) writer = PdfWriter() writer.add_page(reader.pages[0]) writer.encrypt( user_password=user_password, owner_password=None, algorithm="RC4-128", ) with open(pdf_file_path, "wb") as output_stream: writer.write(output_stream) reader = PdfReader(pdf_file_path) assert reader.is_encrypted assert reader.decrypt(user_password) == PasswordType.OWNER_PASSWORD page = reader.pages[0] decrypted_image_obj = reader.get_object(page.images["/I"].indirect_reference) assert decrypted_image_obj["/ColorSpace"][3] == original_image_obj["/ColorSpace"][3] def test_are_permissions_valid_none_for_unencrypted(): """are_permissions_valid is None for unencrypted documents.""" reader = PdfReader(RESOURCE_ROOT / 
"encryption" / "unencrypted.pdf") assert reader.are_permissions_valid is None @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") def test_are_permissions_valid_none_before_decrypt(): """are_permissions_valid is None for encrypted documents before decrypt().""" reader = PdfReader(RESOURCE_ROOT / "encryption" / "r6-both-passwords.pdf") assert reader.are_permissions_valid is None @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") def test_are_permissions_valid_true_for_valid_r6(): """are_permissions_valid is True when /Perms integrity check passes.""" reader = PdfReader(RESOURCE_ROOT / "encryption" / "r6-owner-password.pdf") reader.decrypt("usersecret") assert reader.are_permissions_valid is True def test_are_permissions_valid_true_for_v4(): """are_permissions_valid defaults to True for V4 encryption (no /Perms field).""" writer = PdfWriter(clone_from=RESOURCE_ROOT / "encryption" / "unencrypted.pdf") writer.encrypt(user_password="user", owner_password="owner", algorithm="RC4-128") output = BytesIO() writer.write(output) reader = PdfReader(output) reader.decrypt("user") assert reader.are_permissions_valid is True @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") def test_are_permissions_valid_false_when_tampered(): """are_permissions_valid is False when /Perms has been tampered with.""" writer = PdfWriter(clone_from=RESOURCE_ROOT / "encryption" / "unencrypted.pdf") writer.encrypt(user_password="user", owner_password="owner", algorithm="AES-256") output = BytesIO() writer.write(output) # Tamper with /Perms by modifying the raw bytes data = bytearray(output.getvalue()) perms_marker = b"/Perms " idx = data.find(perms_marker) assert idx != -1, "/Perms not found in PDF" # Find the hex string value after /Perms and corrupt a byte start = data.index(b"<", idx) data[start + 2] ^= 0xFF # flip bits in the first byte of the hex string tampered = BytesIO(bytes(data)) reader = PdfReader(tampered) reader.decrypt("user") assert 
reader.are_permissions_valid is False ================================================ FILE: tests/test_filters.py ================================================ """Test the pypdf.filters module.""" import os import string import subprocess import sys import zlib from io import BytesIO from itertools import product as cartesian_product from pathlib import Path from typing import cast from unittest import mock import pytest from PIL import Image, ImageOps from pypdf import PdfReader, PdfWriter from pypdf.errors import DependencyError, DeprecationError, LimitReachedError, PdfReadError, PdfStreamError from pypdf.filters import ( ASCII85Decode, ASCIIHexDecode, CCITParameters, CCITTFaxDecode, CCITTParameters, FlateDecode, JBIG2Decode, RunLengthDecode, decode_stream_data, decompress, ) from pypdf.generic import ( ArrayObject, BooleanObject, ContentStream, DictionaryObject, IndirectObject, NameObject, NullObject, NumberObject, StreamObject, TextStringObject, ) from . import RESOURCE_ROOT, PILContext, get_data_from_url from .test_encryption import HAS_AES from .test_images import image_similarity from .utils import get_image_data filter_inputs = ( string.ascii_letters, string.ascii_lowercase, string.ascii_uppercase, string.digits, string.hexdigits, string.octdigits, string.punctuation, string.printable, string.whitespace, # Add more ) @pytest.mark.parametrize( ("predictor", "s"), list(cartesian_product([1], filter_inputs)) ) def test_flate_decode_encode(predictor, s): """FlateDecode encode() and decode() methods work as expected.""" codec = FlateDecode() s = s.encode() encoded = codec.encode(s) assert codec.decode(encoded, DictionaryObject({"/Predictor": predictor})) == s def test_flatedecode_unsupported_predictor(): """ FlateDecode raises PdfReadError for unsupported predictors. Predictor values outside the ranges [1, 2] and [10, 15] are not supported. Checks that a PdfReadError is raised when decoding with unsupported predictors. 
""" codec = FlateDecode() predictors = (-10, -1, 0, 3, 9, 16, 20, 100) for predictor, s in cartesian_product(predictors, filter_inputs): s = s.encode() with pytest.raises(PdfReadError): codec.decode(codec.encode(s), DictionaryObject({NameObject("/Predictor"): NumberObject(predictor)})) @pytest.mark.parametrize( ("data", "expected"), [ (">", b""), ( "6162636465666768696a6b6c6d6e6f707172737475767778797a>", string.ascii_lowercase.encode(), ), ( "4142434445464748494a4b4c4d4e4f505152535455565758595a>", string.ascii_uppercase.encode(), ), ( "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464748494a4b4c4d4e4f505152535455565758595a>", string.ascii_letters.encode(), ), ("30313233343536373839>", string.digits.encode()), ( "3 031323334353637 3839>", string.digits.encode(), ), # Same as previous, but whitespaced ("30313233343536373839616263646566414243444546>", string.hexdigits.encode()), ("20090a0d0b0c>", string.whitespace.encode()), # Odd number of hexadecimal digits behaves as if a 0 (zero) followed the last digit ("3938373635343332313>", string.digits[::-1].encode()), ], ids=[ "empty", "ascii_lowercase", "ascii_uppercase", "ascii_letters", "digits", "digits_whitespace", "hexdigits", "whitespace", "odd_number", ], ) def test_ascii_hex_decode_method(data, expected): """ Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the correct output is returned. 
""" assert ASCIIHexDecode.decode(data) == expected def test_ascii_hex_decode_missing_eod(caplog): """ASCIIHexDecode.decode() logs warning when no EOD character is present.""" ASCIIHexDecode.decode("") assert "missing EOD in ASCIIHexDecode, check if output is OK" in caplog.text @pytest.mark.enable_socket def test_decode_ahx(): """ See #1979 Gray Image in CMYK : requiring reverse """ reader = PdfReader(BytesIO(get_data_from_url(name="NewJersey.pdf"))) for p in reader.pages: _ = list(p.images.keys()) def test_ascii85decode_with_overflow(): inputs = ( v + "~>" for v in "\x01\x02\x03\x04\x05\x06\x07\x08\x0e\x0f" "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a" "\x1b\x1c\x1d\x1e\x1fvwxy{|}~\x7f\x80\x81\x82" "\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d" "\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98" "\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0¡¢£¤¥¦§¨©ª«¬" "\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇ" ) for i in inputs: with pytest.raises(ValueError): ASCII85Decode.decode(i) def test_ascii85decode_five_zero_bytes(): """ ASCII85Decode handles the special case of five zero bytes correctly. ISO 32000-1:2008 §7.4.3: «As a special case, if all five bytes are 0, they shall be represented by the character with code 122 (z) instead of by five exclamation points (!!!!!).» """ inputs = ("z", "zz", "zzz") exp_outputs = ( b"\x00\x00\x00\x00", b"\x00\x00\x00\x00" * 2, b"\x00\x00\x00\x00" * 3, ) assert ASCII85Decode.decode("!!!!!~>") == ASCII85Decode.decode("z~>") for expected, i in zip(exp_outputs, inputs): assert ASCII85Decode.decode(i + "~>") == expected def test_ccitparameters(): with pytest.raises( DeprecationError, match=r"CCITParameters is deprecated and was removed in pypdf 6\.0\.0\. 
Use CCITTParameters instead", ): CCITParameters() def test_ccittparameters(): params = CCITTParameters() assert params.K == 0 # zero is the default according to page 78 assert params.BlackIs1 is False assert params.group == 3 @pytest.mark.parametrize( ("parameters", "expected_k", "expected_black_is_1"), [ (None, 0, False), ( ArrayObject([{"/K": NumberObject(1)}, {"/Columns": NumberObject(13)}, {"/BlackIs1": BooleanObject(True)}]), 1, True ), ], ) def test_ccitt_get_parameters(parameters, expected_k, expected_black_is_1): parameters = CCITTFaxDecode._get_parameters(parameters=parameters, rows=0) assert parameters.K == expected_k # noqa: SIM300 assert parameters.BlackIs1 == expected_black_is_1 def test_ccitt_get_parameters__indirect_object(): class Pdf: def get_object(self, reference) -> NumberObject: return NumberObject(42) parameters = CCITTFaxDecode._get_parameters( parameters=None, rows=IndirectObject(13, 1, Pdf()) ) assert parameters.rows == 42 def test_ccitt_fax_decode(): data = b"" parameters = DictionaryObject( {"/K": NumberObject(-1), "/Columns": NumberObject(17)} ) # This is the header of an empty TIFF image. 
assert CCITTFaxDecode.decode(data, parameters) == ( b"II*\x00\x08\x00\x00\x00\x08\x00\x00\x01\x04\x00\x01\x00\x00\x00\x11\x00" b"\x00\x00\x01\x01\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x01" b"\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x03\x01\x03\x00\x01\x00" b"\x00\x00\x04\x00\x00\x00\x06\x01\x03\x00\x01\x00\x00\x00\x00\x00" b"\x00\x00\x11\x01\x04\x00\x01\x00\x00\x00l\x00\x00\x00\x16\x01" b"\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x17\x01\x04\x00\x01\x00" b"\x00\x00\x00\x00\x00\x00\x00\x00" ) @pytest.mark.enable_socket def test_decompress_zlib_error(caplog): reader = PdfReader(BytesIO(get_data_from_url(name="tika-952445.pdf"))) for page in reader.pages: page.extract_text() assert "incorrect startxref pointer(3)" in caplog.text @pytest.mark.enable_socket def test_lzw_decode_neg1(): reader = PdfReader(BytesIO(get_data_from_url(name="tika-921632.pdf"))) page = reader.pages[47] assert page.extract_text().startswith("Chapter 2") @pytest.mark.enable_socket def test_issue_399(): reader = PdfReader(BytesIO(get_data_from_url(name="tika-976970.pdf"))) reader.pages[1].extract_text() @pytest.mark.enable_socket def test_image_without_pillow(tmp_path): env = os.environ.copy() env["COVERAGE_PROCESS_START"] = "pyproject.toml" name = "tika-914102.pdf" pdf_path = Path(__file__).parent / "pdf_cache" / name pdf_path_str = pdf_path.resolve().as_posix() source_file = tmp_path / "script.py" source_file.write_text( f""" import sys from pypdf import PdfReader import pytest sys.modules["PIL"] = None reader = PdfReader("{pdf_path_str}", strict=True) for page in reader.pages: with pytest.raises(ImportError) as exc: page.images[0] assert exc.value.args[0] == ( "pillow is required to do image extraction. " "It can be installed via 'pip install pypdf[image]'" ), exc.value.args[0] """ ) try: env["PYTHONPATH"] = "." + os.pathsep + env["PYTHONPATH"] except KeyError: env["PYTHONPATH"] = "." result = subprocess.run( # noqa: S603 # We have the control here. 
[sys.executable, source_file], capture_output=True, env=env, ) assert result.returncode == 0 assert result.stdout == b"" assert ( result.stderr.replace(b"\r", b"") == b"Superfluous whitespace found in object header b'4' b'0'\n" ) @pytest.mark.enable_socket def test_issue_1737(): reader = PdfReader(BytesIO(get_data_from_url(name="iss1737.pdf"))) reader.pages[0]["/Resources"]["/XObject"]["/Im0"].get_data() reader.pages[0]["/Resources"]["/XObject"]["/Im1"].get_data() reader.pages[0]["/Resources"]["/XObject"]["/Im2"].get_data() @pytest.mark.enable_socket def test_pa_image_extraction(): """ PNG images with PA mode can be extracted. This is a regression test for issue #1801 """ reader = PdfReader(BytesIO(get_data_from_url(name="issue-1801.pdf"))) page0 = reader.pages[0] images = page0.images assert len(images) == 1 assert images[0].name == "Im1.png" # Ensure visual appearance expected_data = BytesIO(get_data_from_url(name="issue-1801.png")) assert image_similarity(expected_data, images[0].image) == 1 @pytest.mark.enable_socket def test_1bit_image_extraction(): """Cf issue #1814""" reader = PdfReader(BytesIO(get_data_from_url(name="grimm10"))) for p in reader.pages: p.images @pytest.mark.enable_socket def test_png_transparency_reverse(): """Cf issue #1599""" pdf_path = RESOURCE_ROOT / "labeled-edges-center-image.pdf" reader = PdfReader(pdf_path) refimg = Image.open( BytesIO(get_data_from_url(name="labeled-edges-center-image.png")) ) data = reader.pages[0].images[0] img = Image.open(BytesIO(data.data)) assert ".jp2" in data.name assert get_image_data(img) == get_image_data(refimg) @pytest.mark.enable_socket def test_iss1787(): """Cf issue #1787""" reader = PdfReader(BytesIO(get_data_from_url(name="pdf_font_garbled.pdf"))) refimg = Image.open(BytesIO(get_data_from_url(name="watermark1.png"))) data = reader.pages[0].images[0] img = Image.open(BytesIO(data.data)) assert ".png" in data.name assert get_image_data(img) == get_image_data(refimg) obj = 
data.indirect_reference.get_object() obj["/DecodeParms"][NameObject("/Columns")] = NumberObject(1000) obj.decoded_self = None with pytest.raises(expected_exception=PdfReadError, match=r"^Unsupported PNG filter 244$"): _ = reader.pages[0].images[0] @pytest.mark.enable_socket def test_tiff_predictor(): """Decode Tiff Predictor 2 Images""" reader = PdfReader(BytesIO(get_data_from_url(name="tika-977609.pdf"))) refimg = Image.open(BytesIO(get_data_from_url(name="tifimage.png"))) data = reader.pages[0].images[0] img = Image.open(BytesIO(data.data)) assert ".png" in data.name assert get_image_data(img) == get_image_data(refimg) @pytest.mark.enable_socket def test_rgba(): """Decode RGB with transparency""" with PILContext(): reader = PdfReader(BytesIO(get_data_from_url(name="tika-972174.pdf"))) data = reader.pages[0].images[0] assert ".jp2" in data.name similarity = image_similarity( data.image, BytesIO(get_data_from_url(name="tika-972174_p0-im0.png")) ) assert similarity > 0.99 @pytest.mark.enable_socket @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") def test_cmyk(): """Decode CMYK""" # JPEG compression reader = PdfReader(BytesIO(get_data_from_url(name="Vitocal.pdf"))) refimg = BytesIO(get_data_from_url(name="VitocalImage.png")) data = reader.pages[1].images[0] assert data.image.mode == "CMYK" assert ".jpg" in data.name assert image_similarity(data.image, refimg) > 0.99 # deflate reader = PdfReader(BytesIO(get_data_from_url(name="cmyk_deflate.pdf"))) refimg = BytesIO(get_data_from_url(name="cmyk_deflate.tif")) data = reader.pages[0].images[0] assert data.image.mode == "CMYK" assert ".tif" in data.name assert image_similarity(data.image, refimg) > 0.999 # lossless compression expected @pytest.mark.enable_socket def test_iss1863(): """Test doc from iss1863""" reader = PdfReader(BytesIO(get_data_from_url(name="o1whh9b3.pdf"))) for p in reader.pages: for i in p.images: i.name @pytest.mark.enable_socket def test_read_images(): reader = 
PdfReader(BytesIO(get_data_from_url(name="selbst.72916.pdf"))) page = reader.pages[0] for _ in page.images: pass @pytest.mark.enable_socket def test_cascaded_filters_images(): reader = PdfReader(BytesIO(get_data_from_url(name="iss1912.pdf"))) # for focus, analyse the page 23 for p in reader.pages: for i in p.images: _ = i.name, i.image @pytest.mark.enable_socket def test_calrgb(): reader = PdfReader(BytesIO(get_data_from_url(name="calRGB.pdf"))) reader.pages[0].images[0] @pytest.mark.enable_socket def test_index_lookup(): """The lookup is provided as an str and bytes""" reader = PdfReader(BytesIO(get_data_from_url(name="2023USDC.pdf"))) # TextStringObject Lookup refimg = BytesIO(get_data_from_url(name="iss1982_im1.png")) data = reader.pages[0].images[-1] assert data.image.mode == "RGB" assert image_similarity(data.image, refimg) > 0.999 # ByteStringObject Lookup refimg = BytesIO(get_data_from_url(name="iss1982_im2.png")) data = reader.pages[-1].images[-1] assert data.image.mode == "RGB" assert image_similarity(data.image, refimg) > 0.999 # indexed CMYK images # currently with a TODO as we convert the palette to RGB reader = PdfReader(BytesIO(get_data_from_url(name="tika-972174.pdf"))) refimg = Image.open(BytesIO(get_data_from_url(name="usa.png"))) data = reader.pages[0].images["/Im3"] # assert data.image.mode == "PA" but currently "RGBA" assert image_similarity(data.image, refimg) > 0.999 @pytest.mark.enable_socket def test_2bits_image(): """From #1954, test with 2bits image. 
TODO: 4bits also""" reader = PdfReader(BytesIO(get_data_from_url(name="paid.pdf"))) url_png = "https://user-images.githubusercontent.com/4083478/253568117-ca95cc85-9dea-4145-a5e0-032f1c1aa322.png" name_png = "Paid.png" refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[0].images[0] assert image_similarity(data.image, refimg) > 0.99 @pytest.mark.enable_socket def test_gray_devicen_cmyk(): """ Cf #1979 Gray Image in CMYK : requiring reverse """ url = "https://github.com/py-pdf/pypdf/files/12080338/example_121.pdf" name = "gray_cmyk.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/254545494-42df4949-1557-4f2d-acca-6be6e8de1122.png" name_png = "velo.png" refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[0].images[0] assert data.image.mode == "L" assert image_similarity(data.image, refimg) > 0.999 @pytest.mark.enable_socket def test_runlengthdecode(): """From #1954, test with 2bits image. 
TODO: 4bits also""" url = "https://github.com/py-pdf/pypdf/files/12159941/out.pdf" name = "RunLengthDecode.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/255940800-6d63972e-a3d6-4cf9-aa6f-0793af24cded.png" name_png = "RunLengthDecode.png" refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[0].images[0] assert image_similarity(data.image, refimg) > 0.999 url = "https://github.com/py-pdf/pypdf/files/12162905/out.pdf" name = "FailedRLE1.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].images[0] url = "https://github.com/py-pdf/pypdf/files/12162926/out.pdf" name = "FailedRLE2.pdf" reader.pages[0].images[0] @pytest.mark.enable_socket def test_gray_separation_cmyk(): """ Cf #1955 Gray Image in Separation/RGB : requiring reverse """ url = "https://github.com/py-pdf/pypdf/files/12143372/tt.pdf" name = "TestWithSeparationBlack.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/254545494-42df4949-1557-4f2d-acca-6be6e8de1122.png" name_png = "velo.png" # reused refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[0].images[0] assert data.image.mode == "L" assert image_similarity(data.image, refimg) > 0.999 @pytest.mark.enable_socket def test_singleton_device(): """From #2023""" url = "https://github.com/py-pdf/pypdf/files/12177287/tt.pdf" name = "pypdf_with_arr_deviceRGB.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].images[0] @pytest.mark.enable_socket def test_jpx_no_spacecode(): """From #2061""" url = "https://github.com/py-pdf/pypdf/files/12253581/tt2.pdf" name = "jpx_no_spacecode.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) im = reader.pages[0].images[0] # create an object without filter and without colorspace # just for coverage del 
im.indirect_reference.get_object()["/Filter"] with pytest.raises(PdfReadError) as exc: reader.pages[0].images[0] assert exc.value.args[0].startswith("ColorSpace field not found") @pytest.mark.enable_socket def test_encodedstream_lookup(): """From #2124""" url = "https://github.com/py-pdf/pypdf/files/12455580/10.pdf" name = "iss2124.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[12].images[0] @pytest.mark.enable_socket def test_convert_1_to_la(): """From #2165""" url = "https://github.com/py-pdf/pypdf/files/12543290/whitepaper.WBT.token.blockchain.whitepaper.pdf" name = "iss2165.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for i in reader.pages[13].images: _ = i @pytest.mark.enable_socket def test_nested_device_n_color_space(): """From #2240""" url = "https://github.com/py-pdf/pypdf/files/12814018/out1.pdf" name = "issue2240.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].images[0] @pytest.mark.enable_socket @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") def test_flate_decode_with_image_mode_1(): """From #2248""" url = "https://github.com/py-pdf/pypdf/files/12847339/Prototype-Declaration-VDE4110-HYD-5000-20000-ZSS-DE.pdf" name = "issue2248.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for image in reader.pages[7].images: _ = image @pytest.mark.enable_socket def test_flate_decode_with_image_mode_1__whitespace_at_end_of_lookup(): """From #2331""" url = "https://github.com/py-pdf/pypdf/files/13611048/out1.pdf" name = "issue2331.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].images[0] @pytest.mark.enable_socket def test_ascii85decode__invalid_end__recoverable(caplog): """From #2996""" url = "https://github.com/user-attachments/files/18050808/1af7d56a-5c8c-4914-85b3-b2536a5525cd.pdf" name = "issue2996.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[1] assert 
page.extract_text() == "" assert "Ignoring missing Ascii85 end marker." in caplog.text def test_ascii85decode__non_recoverable(caplog): # Without our custom handling, this would complain about the final `~>` being missing. data = "äöüß" with pytest.raises(ValueError, match="Non-Ascii85 digit found: Ã"): ASCII85Decode.decode(data) assert "Ignoring missing Ascii85 end marker." in caplog.text caplog.clear() data += "~>" with pytest.raises(ValueError, match="Non-Ascii85 digit found: Ã"): ASCII85Decode.decode(data) assert caplog.text == "" def test_ascii85decode__ignore_whitespaces(caplog): """Whitespace characters must be silently ignored""" data = b"Cqa;:3k~\n>" result = ASCII85Decode.decode(data) assert result == b"l\xbe`\x8d:" @pytest.mark.enable_socket def test_ccitt_fax_decode__black_is_1(): url = "https://github.com/user-attachments/files/19288881/imagemagick-CCITTFaxDecode_BlackIs1-true.pdf" name = "issue3193.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) other_reader = PdfReader(RESOURCE_ROOT / "imagemagick-CCITTFaxDecode.pdf") actual_image = reader.pages[0].images[0].image expected_image_inverted = other_reader.pages[0].images[0].image expected_pixels = get_image_data(ImageOps.invert(expected_image_inverted)) actual_pixels = get_image_data(actual_image) assert expected_pixels == actual_pixels # AttributeError: 'NullObject' object has no attribute 'get' data_modified = get_data_from_url(url, name=name).replace( b"/DecodeParms [ << /K -1 /BlackIs1 true /Columns 16 /Rows 16 >> ]", b"/DecodeParms [ null ]" ) reader = PdfReader(BytesIO(data_modified)) _ = reader.pages[0].images[0].image @pytest.mark.enable_socket def test_flate_decode__image_is_none_due_to_size_limit(caplog): url = "https://github.com/user-attachments/files/19464256/file.pdf" name = "issue3220.pdf" with mock.patch("pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH", 0): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) images = reader.pages[0].images assert len(images) == 1 
image = images[0] assert image.name == "Im0.png" assert image.image is None assert ( "Failed loading image: Image size (180000000 pixels) exceeds limit of " "178956970 pixels, could be decompression bomb DOS attack." ) in caplog.messages @pytest.mark.enable_socket def test_flate_decode__not_rectangular(caplog): url = "https://github.com/user-attachments/files/19663603/issue3241_compressed.txt" name = "issue3241.txt" data = get_data_from_url(url, name=name) decode_parms = DictionaryObject() decode_parms[NameObject("/Predictor")] = NumberObject(15) decode_parms[NameObject("/Columns")] = NumberObject(4881) actual = FlateDecode.decode(data=data, decode_parms=decode_parms) actual_image = Image.frombytes(mode="1", size=(4881, 81), data=actual) url = "https://github.com/user-attachments/assets/c5695850-c076-4255-ab72-7c86851a4a04" name = "issue3241.png" expected_data = BytesIO(get_data_from_url(url, name=name)) assert image_similarity(expected_data, actual_image) == 1 assert caplog.messages == ["Image data is not rectangular. 
Adding padding."] def test_jbig2decode__binary_errors(): with mock.patch("pypdf.filters.JBIG2DEC_BINARY", None), \ pytest.raises(DependencyError, match=r"jbig2dec binary is not available\."): JBIG2Decode.decode(b"dummy") result = subprocess.CompletedProcess( args=["dummy"], returncode=0, stdout=b"", stderr=( b"jbig2dec: unrecognized option '--embedded'\n" b"Usage: jbig2dec [options] \n" b" or jbig2dec [options] \n" ) ) with mock.patch("pypdf.filters.subprocess.run", return_value=result), \ mock.patch("pypdf.filters.JBIG2DEC_BINARY", "/usr/bin/jbig2dec"), \ pytest.raises(DependencyError, match=r"jbig2dec>=0.19 is required\."): JBIG2Decode.decode(b"dummy") result = subprocess.CompletedProcess( args=["dummy"], returncode=0, stdout=b"", stderr=( b"jbig2dec: unrecognized option '-M'\n" b"Usage: jbig2dec [options] \n" b" or jbig2dec [options] \n" ) ) with mock.patch("pypdf.filters.subprocess.run", return_value=result), \ mock.patch("pypdf.filters.JBIG2DEC_BINARY", "/usr/bin/jbig2dec"), \ pytest.raises(DependencyError, match=r"jbig2dec>=0.19 is required\."): JBIG2Decode.decode(b"dummy") @pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec") def test_jbig2decode__edge_cases(caplog): image_data = ( b'\x00\x00\x00\x010\x00\x01\x00\x00\x00\x13\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x06"' b'\x00\x01\x00\x00\x00\x1c\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x9f\xa8_\xff\xac' ) jbig2_globals = b"\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x18\x00\x00\x03\xff\xfd\xff\x02\xfe\xfe\xfe\x00\x00\x00\x01\x00\x00\x00\x01R\xd0u7\xff\xac" # noqa: E501 # Validation: Is our image data valid? 
content_stream = ContentStream(stream=None, pdf=None) content_stream.set_data(jbig2_globals) result = JBIG2Decode.decode(image_data, decode_parms=DictionaryObject({"/JBIG2Globals": content_stream})) image = Image.open(BytesIO(result), formats=("PNG", "PPM")) for x in range(5): for y in range(5): assert image.getpixel((x, y)) == (255 if x < 3 else 0), (x, y) assert caplog.messages == [] # No decode_params. Completely white image. result = JBIG2Decode.decode(image_data) image = Image.open(BytesIO(result), formats=("PNG", "PPM")) for x in range(5): for y in range(5): assert image.getpixel((x, y)) == 255, (x, y) assert caplog.messages == [ "jbig2dec WARNING text region refers to no symbol dictionaries (segment 0x00000002)", "jbig2dec WARNING ignoring out of range symbol ID (0/0) (segment 0x00000002)" ] caplog.clear() # JBIG2Globals is NULL. Completely white image. result = JBIG2Decode.decode(image_data, decode_parms=DictionaryObject({"/JBIG2Globals": NullObject()})) image = Image.open(BytesIO(result), formats=("PNG", "PPM")) for x in range(5): for y in range(5): assert image.getpixel((x, y)) == 255, (x, y) assert caplog.messages == [ "jbig2dec WARNING text region refers to no symbol dictionaries (segment 0x00000002)", "jbig2dec WARNING ignoring out of range symbol ID (0/0) (segment 0x00000002)" ] caplog.clear() # JBIG2Globals is DictionaryObject. Completely white image. result = JBIG2Decode.decode(image_data, decode_parms=DictionaryObject({"/JBIG2Globals": DictionaryObject()})) image = Image.open(BytesIO(result), formats=("PNG", "PPM")) for x in range(5): for y in range(5): assert image.getpixel((x, y)) == 255, (x, y) assert caplog.messages == [ "jbig2dec WARNING text region refers to no symbol dictionaries (segment 0x00000002)", "jbig2dec WARNING ignoring out of range symbol ID (0/0) (segment 0x00000002)" ] caplog.clear() # Invalid input. with pytest.raises(PdfStreamError, match=r"Unable to decode JBIG2 data\. 
Exit code: 1"): JBIG2Decode.decode(b"aaaaaa") assert caplog.messages == [ "jbig2dec FATAL ERROR page has no image, cannot be completed", "jbig2dec WARNING unable to complete page" ] @pytest.mark.timeout(timeout=30, method="thread") @pytest.mark.enable_socket def test_flate_decode_stream_with_faulty_tail_bytes(): """ Test for #3332 The test ensures two things: 1. stream can be decoded at all 2. decoding doesn't fall through to the last fallback in the try-except blocks, which is too slow and takes ages for this stream """ data = get_data_from_url( url="https://github.com/user-attachments/files/20901522/faulty_stream_tail_example.1.pdf", name="faulty_stream_tail_example.1.pdf" ) expected = get_data_from_url( url="https://github.com/user-attachments/files/20941717/decoded.dat.txt", name="faulty_stream_tail_example.1.decoded.dat" ) reader = PdfReader(BytesIO(data)) obj = reader.get_object(IndirectObject(182, 0, reader)) assert cast(StreamObject, obj).get_data() == expected @pytest.mark.enable_socket def test_rle_decode_with_faulty_tail_byte_in_multi_encoded_stream(caplog): """ Test for #3355 The test ensures that the inner RLE encoded stream can be decoded, because this stream contains an extra faulty newline byte at the end that can be ignored during decoding. """ data = get_data_from_url( url="https://github.com/user-attachments/files/21038398/test_data_rle.txt", name="multi_decoding_example_with_faulty_tail_byte.pdf" ) reader = PdfReader(BytesIO(data)) obj = reader.get_object(IndirectObject(60, 0, reader)) cast(StreamObject, obj).get_data() assert "Found trailing newline in stream data, check if output is OK" in caplog.messages @pytest.mark.enable_socket def test_rle_decode_exception_with_corrupted_stream(caplog): """ Additional test for #3355 This test must report the EOD warning during RLE decoding and ensures that we do not fail during code coverage analyses in the git PR pipeline.
""" data = get_data_from_url( url="https://github.com/user-attachments/files/21052626/rle_stream_with_error.txt", name="rle_stream_with_error.txt" ) decoded = RunLengthDecode.decode(data) assert decoded.startswith(b"\x01\x01\x01\x01\x01\x01\x01\x02\x02\x02\x02\x02\x02\x02\x03\x03") assert decoded.endswith(b"\x87\x83\x83\x83\x83\x83\x83\x83]]]]]]]RRRRRRRX\xa5") assert len(decoded) == 1048576 assert caplog.messages == ["Early EOD in RunLengthDecode, check if output is OK"] def test_decompress(): data = string.printable.encode("utf-8") + string.printable[::-1].encode("utf-8") compressed = FlateDecode.encode(data) # Decompress regularly. decompressed = decompress(compressed) assert decompressed == data # Decompress byte-wise. with mock.patch("pypdf.filters._decompress_with_limit", side_effect=zlib.error): decompressed = decompress(compressed) assert decompressed == data # Decompress byte-wise with very low output limit. with mock.patch("pypdf.filters._decompress_with_limit", side_effect=zlib.error), \ mock.patch("pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH", len(compressed) - 13), \ pytest.raises( LimitReachedError, match=r"^Limit reached while decompressing\. 12 bytes remaining\.$" ): decompress(compressed) # Decompress byte-wise with input limit. with mock.patch("pypdf.filters.ZLIB_MAX_RECOVERY_INPUT_LENGTH", 1000), \ pytest.raises( LimitReachedError, match=r"^Recovery limit reached while decompressing\. 
336 bytes remaining\.$" ): decompress(b"A" * 1337) def test_decompress__logging_on_invalid_data(caplog): """We do not like suddenly getting empty outputs for non-empty inputs without a warning.""" codec = FlateDecode() encoded = codec.encode(b"My test string") assert len(encoded) > 5 assert codec.decode(encoded[5:]) == b"" assert caplog.messages == ["Error -3 while decompressing data: incorrect header check"] def test_ccittfaxdecode__ccf_inline(): writer = PdfWriter(clone_from=RESOURCE_ROOT / "jpeg.pdf") page = writer.pages[0] writer.remove_images() image_data = ( b"\nBI\n /W 16\n /H 16\n /CS /G\n /BPC 1\n /F [/CCF]\n" b" /DP [ << /K -1 /BlackIs1 false /Columns 16 /Rows 16 >> ]\nID\n" b"&\xa0\xbf\xcc9\x14|G#\x1f\xff\xf1\xcc9\x18\xfe\xbbX\xfc\x00@\x04" b"\nEI\n" ) content_stream = page.get_contents() content_stream.set_data( content_stream.get_data().replace(b"/Im4 Do", b"").replace(b"\nET", image_data) ) page.replace_contents(content_stream) expected = PdfReader(RESOURCE_ROOT / "imagemagick-CCITTFaxDecode.pdf").pages[0].images[0].image assert get_image_data(expected) == get_image_data(page.images[0].image) def test_dctdecode__dct_inline(): writer = PdfWriter(clone_from=RESOURCE_ROOT / "jpeg.pdf") page = writer.pages[0] writer.remove_images() image_data = ( b"\nBI\n /W 16\n /H 16\n /CS /G\n /BPC 8\n /F [/DCT]\nID\n" b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x01,\x01,\x00\x00\xff\xfe\x00\x13Created with GIMP\xff\xe2" b"\x02\xb0ICC_PROFILE\x00\x01\x01\x00\x00\x02\xa0lcms\x040\x00\x00mntrRGB XYZ \x07\xe6\x00\x04\x00\x0f\x00" b"\t\x00\x1d\x007acspAPPL\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" b"\x00\x00\x00\x00\x00\x00\xf6\xd6\x00\x01\x00\x00\x00\x00\xd3-lcms\x00\x00\x00\x00\x00\x00\x00\x00\x00" b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\rdesc\x00\x00\x01 \x00\x00\x00@cprt\x00\x00\x01`" 
b"\x00\x00\x006wtpt\x00\x00\x01\x98\x00\x00\x00\x14chad\x00\x00\x01\xac\x00\x00\x00,rXYZ\x00\x00\x01\xd8" b"\x00\x00\x00\x14bXYZ\x00\x00\x01\xec\x00\x00\x00\x14gXYZ\x00\x00\x02\x00\x00\x00\x00\x14rTRC\x00\x00" b"\x02\x14\x00\x00\x00 gTRC\x00\x00\x02\x14\x00\x00\x00 bTRC\x00\x00\x02\x14\x00\x00\x00 chrm\x00\x00" b"\x024\x00\x00\x00$dmnd\x00\x00\x02X\x00\x00\x00$dmdd\x00\x00\x02|\x00\x00\x00$mluc\x00\x00\x00\x00" b"\x00\x00\x00\x01\x00\x00\x00\x0cenUS\x00\x00\x00$\x00\x00\x00\x1c\x00G\x00I\x00M\x00P\x00 \x00b\x00" b"u\x00i\x00l\x00t\x00-\x00i\x00n\x00 \x00s\x00R\x00G\x00Bmluc\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00" b"\x00\x0cenUS\x00\x00\x00\x1a\x00\x00\x00\x1c\x00P\x00u\x00b\x00l\x00i\x00c\x00 \x00D\x00o\x00m\x00a" b"\x00i\x00n\x00\x00XYZ \x00\x00\x00\x00\x00\x00\xf6\xd6\x00\x01\x00\x00\x00\x00\xd3-sf32\x00\x00\x00" b"\x00\x00\x01\x0cB\x00\x00\x05\xde\xff\xff\xf3%\x00\x00\x07\x93\x00\x00\xfd\x90\xff\xff\xfb\xa1\xff" b"\xff\xfd\xa2\x00\x00\x03\xdc\x00\x00\xc0nXYZ \x00\x00\x00\x00\x00\x00o\xa0\x00\x008\xf5\x00\x00\x03" b"\x90XYZ \x00\x00\x00\x00\x00\x00$\x9f\x00\x00\x0f\x84\x00\x00\xb6\xc4XYZ \x00\x00\x00\x00\x00\x00b" b"\x97\x00\x00\xb7\x87\x00\x00\x18\xd9para\x00\x00\x00\x00\x00\x03\x00\x00\x00\x02ff\x00\x00\xf2\xa7" b"\x00\x00\rY\x00\x00\x13\xd0\x00\x00\n[chrm\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\xa3\xd7\x00\x00T|" b"\x00\x00L\xcd\x00\x00\x99\x9a\x00\x00&g\x00\x00\x0f\\mluc\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00" b"\x00\x0cenUS\x00\x00\x00\x08\x00\x00\x00\x1c\x00G\x00I\x00M\x00Pmluc\x00\x00\x00\x00\x00\x00\x00" b"\x01\x00\x00\x00\x0cenUS\x00\x00\x00\x08\x00\x00\x00\x1c\x00s\x00R\x00G\x00B\xff\xdb\x00C\x00\x01" b"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01" b"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01" b"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xff\xc0\x00\x0b\x08\x00\x10\x00\x10" 
b"\x01\x01\x11\x00\xff\xc4\x00\x17\x00\x00\x03\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" b"\x00\x06\x07\x08\n\xff\xc4\x00\x1d\x10\x00\x03\x00\x03\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00" b"\x00\x00\x05\x06\x07\x01\x04\x08\x02\x03\x13\x15\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xc4D\x0eA" b"\x8e\x91\xa8\xf3\xcf5N\xb5\x7f\x87k\xbc_\x96\xe3\x83]\x9c\\\xff\x00\x19f1^=:A\x98jm.\x03\x9f\x10" b"mW\xc2\xcbYF\xd2T\x06\xef,OXfX`^\x18\x0ez\xb4U \x91\x17\xd4\xf6\xbe\xc2\xb7\x85s:{\xa1\x8f\xec;}" b"\x8f-l/1|\x19\x86|\x14\xc5+j\x8cm\xf0\xde\x10\xba\x7f\xa5=\xe2\x86\xd8\x18\r\xed$o\xab2h\xbc\xad" b"\x8cS\x18\xba\xd8,\xb2\xa3\xbf\xd9\xd8I\x84+\x07\x9d\x1ay\x1cr\xba\x81\nu\x0f\xa7yk\xa0%5\xf2\xf4" b"\xf4\x9e\x8d\xe6\x19\x90+s;P\xfd\xd1\xb3\x8f\xac\xf8\x0e@5\xf5\x8f(i\xc3\x0e\xf3\xd3\xbc\xf5\xa5" b"\xed:\x85<$\xee\xd1@%i\xde\x1ao\xdaF$\t?Vq\xce\x92\xde\xe1\xbd\x14H\x8a'\"\x8d\xbf75\xaef\x90\xc3|" b"\xe8~\x82\x04\xab+3O.\xdeX&\xac\xf2t\x89\xcf\xd3\xfa\x85\xbdFu=\x8e*\xa9\xfb!\x96\xed\xfa\xe3S\xe5A" b"\xf2\xa8\xf5\xe8\xd7\x85\xa5\x05\t\xf8a\xff\x00\xff\xd9" b"\nEI\n" ) content_stream = page.get_contents() content_stream.set_data( content_stream.get_data().replace(b"/Im4 Do", b"").replace(b"\nET", image_data) ) page.replace_contents(content_stream) expected = PdfReader(RESOURCE_ROOT / "imagemagick-images.pdf").pages[3].images[0].image assert get_image_data(expected) == get_image_data(page.images[0].image) def test_deprecate_inline_image_filters(): stream = ContentStream(stream=None, pdf=None) stream.set_data(b"&\xa0\xbf\xcc9\x14|G#\x1f\xff\xf1\xcc9\x18\xfe\xbbX\xfc\x00@\x04") # The abbreviations do not work here, which is one of the reasons for the deprecation. 
stream[NameObject("/Width")] = NumberObject(16) stream[NameObject("/Height")] = NumberObject(16) stream[NameObject("/ColorSpace")] = NameObject("/DeviceGray") stream[NameObject("/BitsPerComponent")] = NumberObject(1) stream[NameObject("/Filter")] = NameObject("/CCF") stream[NameObject("/DecodeParams")] = ArrayObject( [ DictionaryObject( { NameObject("/K"): NumberObject(-1), NameObject("/BlackIs1"): TextStringObject("false"), NameObject("/Columns"): NumberObject(16), NameObject("/Rows"): NumberObject(16), } ) ] ) with pytest.warns( expected_warning=DeprecationWarning, match=r"^The filter name /CCF is deprecated and will be removed in pypdf 7\.0\.0\. Use /CCITTFaxDecode instead\.$" # noqa: E501 ): decode_stream_data(stream) stream[NameObject("/Filter")] = NameObject("/CCITTFaxDecode") assert decode_stream_data(stream).startswith(b"II*") def test_flatedecode__columns_is_zero(): codec = FlateDecode() data = b"Hello World!" parameters = DictionaryObject({ NameObject("/Predictor"): NumberObject(13), NameObject("/Columns"): NumberObject(0) }) with pytest.raises(expected_exception=PdfReadError, match=r"^Expected positive number for /Columns, got 0!$"): codec.decode(codec.encode(data), parameters) def test_runlengthdecode__decode_limit(): uncompressed_size = 76 * 1024 * 1024 # 76 MB target runs = uncompressed_size // 128 encoded = (b"\x81A" * runs) + b"\x80" with pytest.raises(expected_exception=LimitReachedError, match=r"^Limit reached while decompressing\.$"): RunLengthDecode.decode(encoded) uncompressed_size = 5 * 1024 runs = uncompressed_size // 128 encoded = (b"\x81A" * runs) + b"\x80" # Use a very low limit for this exact comparison, otherwise *pytest* takes ages to render a failure diff. 
with mock.patch("pypdf.filters.RUN_LENGTH_MAX_OUTPUT_LENGTH", uncompressed_size): assert RunLengthDecode.decode(encoded) == b"A" * uncompressed_size @pytest.mark.timeout(10) def test_asciihexdecode__speed(): encoded = (b"41" * 1_200_000) + b">" ASCIIHexDecode.decode(encoded) ================================================ FILE: tests/test_font.py ================================================ """Test font-related functionality.""" from pypdf._font import Font from pypdf.generic import DictionaryObject, NameObject def test_font_descriptor(): font_res = DictionaryObject({ NameObject("/BaseFont"): NameObject("/Helvetica"), NameObject("/Subtype"): NameObject("/Type1") }) my_font = Font.from_font_resource(font_res) assert my_font.font_descriptor.family == "Helvetica" assert my_font.font_descriptor.weight == "Medium" assert my_font.font_descriptor.ascent == 718 assert my_font.font_descriptor.descent == -207 test_string = "This is a long sentence. !@%%^€€€. çûįö¶´" charwidth = my_font.text_width(test_string) assert charwidth == 19251 font_res[NameObject("/BaseFont")] = NameObject("/Palatino") my_font = Font.from_font_resource(font_res) assert my_font.font_descriptor.weight == "Unknown" font_res[NameObject("/BaseFont")] = NameObject("/Courier-Bold") my_font = Font.from_font_resource(font_res) assert my_font.font_descriptor.italic_angle == 0 assert my_font.font_descriptor.flags == 33 assert my_font.font_descriptor.bbox == (-113.0, -250.0, 749.0, 801.0) ================================================ FILE: tests/test_forms.py ================================================ """Test form-related functionality. 
Separate file to keep overview.""" from io import BytesIO import pytest from pypdf import PdfReader, PdfWriter from tests import get_data_from_url @pytest.mark.enable_socket def test_form_button__v_value_should_be_name_object(): url = "https://github.com/user-attachments/files/18736500/blank-form.pdf" name = "issue3115.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter(clone_from=reader) writer.update_page_form_field_values( writer.pages[0], {"Other": "/On"}, auto_regenerate=False, ) stream = BytesIO() writer.write(stream) # Wrong: `/V (/On)`. assert b"\n/V /On\n" in stream.getvalue() ================================================ FILE: tests/test_generic.py ================================================ """Test the pypdf.generic module.""" import codecs import gc import weakref from base64 import a85encode from copy import deepcopy from io import BytesIO import pytest from pypdf import PdfReader, PdfWriter from pypdf.constants import CheckboxRadioButtonAttributes from pypdf.errors import DeprecationError, PdfReadError, PdfStreamError from pypdf.generic import ( ArrayObject, BooleanObject, ByteStringObject, ContentStream, DecodedStreamObject, Destination, DictionaryObject, Fit, FloatObject, IndirectObject, NameObject, NullObject, NumberObject, OutlineItem, PdfObject, RectangleObject, StreamObject, TextStringObject, TreeObject, create_string_object, encode_pdfdocencoding, is_null_or_none, read_hex_string_from_stream, read_object, read_string_from_stream, ) from pypdf.generic._image_inline import ( extract_inline__ascii85_decode, extract_inline__ascii_hex_decode, extract_inline__dct_decode, extract_inline__run_length_decode, ) from . 
import RESOURCE_ROOT, get_data_from_url from .utils import ReaderDummy class ChildDummy(DictionaryObject): @property def indirect_reference(self): return self def test_float_object_exception(caplog): assert FloatObject("abc") == 0 assert caplog.text != "" def test_number_object_exception(caplog): assert NumberObject("0,0") == 0 assert caplog.text != "" def test_number_object_no_exception(): NumberObject(2**100_000_000) def test_create_string_object_exception(): with pytest.raises(TypeError) as exc: create_string_object(123) assert ( # typeguard is not running exc.value.args[0] == "create_string_object should have str or unicode arg" ) or ( # typeguard is enabled 'type of argument "string" must be one of (str, bytes); got int instead' in exc.value.args[0] ) @pytest.mark.parametrize( ("value", "expected", "tell"), [(b"true", b"true", 4), (b"false", b"false", 5)] ) def test_boolean_object(value, expected, tell): stream = BytesIO(value) assert BooleanObject.read_from_stream(stream).value == (expected == b"true") stream.seek(0, 0) assert stream.read() == expected assert stream.tell() == tell def test_boolean_object_write(): stream = BytesIO() boolobj = BooleanObject(None) boolobj.write_to_stream(stream) stream.seek(0, 0) assert stream.read() == b"false" def test_boolean_eq(): boolobj = BooleanObject(True) assert (boolobj == True) is True # noqa: E712 assert (boolobj == False) is False # noqa: E712 assert (boolobj == "True") is False hash1 = hash(boolobj) assert hash1 == hash(boolobj) boolobj = BooleanObject(False) assert (boolobj == True) is False # noqa: E712 assert (boolobj == False) is True # noqa: E712 assert (boolobj == "True") is False assert hash1 != hash(boolobj) def test_boolean_object_exception(): stream = BytesIO(b"False") with pytest.raises(PdfReadError) as exc: BooleanObject.read_from_stream(stream) assert exc.value.args[0] == "Could not read Boolean object" def test_array_object_exception(): stream = BytesIO(b"False") with pytest.raises(PdfReadError) as 
exc: ArrayObject.read_from_stream(stream, None) assert exc.value.args[0] == "Could not read array" def test_null_object_exception(): stream = BytesIO(b"notnull") with pytest.raises(PdfReadError) as exc: NullObject.read_from_stream(stream) assert exc.value.args[0] == "Could not read Null object" @pytest.mark.parametrize("value", [b"", b"False", b"foo ", b"foo ", b"foo bar"]) def test_indirect_object_premature(value): stream = BytesIO(value) with pytest.raises(PdfStreamError) as exc: IndirectObject.read_from_stream(stream, None) assert exc.value.args[0] == "Stream has ended unexpectedly" def test_read_hex_string_from_stream(): stream = BytesIO(b"a1>") assert read_hex_string_from_stream(stream) == "\x10" def test_read_hex_string_from_stream_exception(): stream = BytesIO(b"") with pytest.raises(PdfStreamError) as exc: read_hex_string_from_stream(stream) assert exc.value.args[0] == "Stream has ended unexpectedly" def test_read_string_from_stream_exception(): stream = BytesIO(b"x") with pytest.raises(PdfStreamError) as exc: read_string_from_stream(stream) assert exc.value.args[0] == "Stream has ended unexpectedly" def test_read_string_from_stream_not_in_escapedict_no_digit(): stream = BytesIO(b"x\\y") with pytest.raises(PdfReadError) as exc: read_string_from_stream(stream) assert exc.value.args[0] == "Stream has ended unexpectedly" def test_read_string_from_stream_multichar_eol(): stream = BytesIO(b"x\\\n )") assert read_string_from_stream(stream) == " " def test_read_string_from_stream_multichar_eol2(): stream = BytesIO(b"x\\\n\n)") assert read_string_from_stream(stream) == "" def test_read_string_from_stream_excape_digit(): stream = BytesIO(b"x\\1a )") assert read_string_from_stream(stream) == "\x01a " def test_read_string_from_stream_excape_digit2(): stream = BytesIO(b"(hello \\1\\2\\3\\4)") assert read_string_from_stream(stream) == "hello \x01\x02\x03\x04" def test_name_object(caplog): stream = BytesIO(b"x") with pytest.raises(PdfReadError) as exc: 
NameObject.read_from_stream(stream, None) assert exc.value.args[0] == "Name read error" with pytest.raises( DeprecationError, match=r"surfix is deprecated and was removed in pypdf 5\.0\.0\. Use prefix instead\.", ): _ = NameObject.surfix assert ( NameObject.read_from_stream( BytesIO(b"/A;Name_With-Various***Characters?"), None ) == "/A;Name_With-Various***Characters?" ) assert ( NameObject.read_from_stream(BytesIO(b"/paired#28#29parentheses"), None) == "/paired()parentheses" ) assert NameObject.read_from_stream(BytesIO(b"/A#42"), None) == "/AB" assert ( NameObject.read_from_stream( BytesIO(b"/#f1j#d4#aa#0c#ce#87#b4#b3#b0#23J#86#fe#2a#b2jYJ#94"), ReaderDummy(), ) == "/ñjÔª\x0cÎ\x87´³°#J\x86þ*²jYJ\x94" ) assert (NameObject.read_from_stream(BytesIO(b"/#JA#231f"), None)) == "/#JA#1f" assert ( NameObject.read_from_stream( BytesIO(b"/#e4#bd#a0#e5#a5#bd#e4#b8#96#e7#95#8c"), None ) ) == "/你好世界" # test PDFDocEncoding (latin-1) assert ( NameObject.read_from_stream(BytesIO(b"/DocuSign\xae"), None) ) == "/DocuSign®" # test write b = BytesIO() NameObject("/hello").write_to_stream(b) assert bytes(b.getbuffer()) == b"/hello" caplog.clear() b = BytesIO() with pytest.raises( expected_exception=DeprecationError, match=r"Incorrect first char in NameObject, should start with '/': \(hello\) is deprecated and was" ): NameObject("hello").write_to_stream(b) caplog.clear() b = BytesIO() NameObject("/DIJMAC+Arial Black#1").write_to_stream(b) assert bytes(b.getbuffer()) == b"/DIJMAC+Arial#20Black#231" assert caplog.text == "" caplog.clear() b = BytesIO() NameObject("/你好世界 (%)").write_to_stream(b) assert bytes(b.getbuffer()) == b"/#E4#BD#A0#E5#A5#BD#E4#B8#96#E7#95#8C#20#28#25#29" assert caplog.text == "" caplog.clear() b = BytesIO() NameObject("/{foo}(baz)[qux]#/%").write_to_stream(b) assert bytes(b.getbuffer()) == b"/#7Bfoo#7D#3Cbar#3E#28baz#29#5Bqux#5D#23#2F#25" assert caplog.text == "" def test_destination_fit_r(): d = Destination( TextStringObject("title"), NullObject(), 
Fit.fit_rectangle(0, 0, 0, 0) ) assert d.title == NameObject("title") assert d.typ == "/FitR" assert d.zoom is None assert d.left == FloatObject(0) assert d.right == FloatObject(0) assert d.top == FloatObject(0) assert d.bottom == FloatObject(0) assert list(d) == [] d.empty_tree() def test_destination_fit_v(): d = Destination(NameObject("title"), NullObject(), Fit.fit_vertically(left=0)) writer = PdfWriter() writer.add_named_destination_object(d) # Trigger Exception Destination(NameObject("title"), NullObject(), Fit.fit_vertically(left=None)) def test_outline_item_write_to_stream(): stream = BytesIO() oi = OutlineItem(NameObject("title"), NullObject(), Fit.fit_vertically(left=0)) oi.write_to_stream(stream) stream.seek(0, 0) assert stream.read() == b"<<\n/Title (title)\n/Dest [ null /FitV 0.0 ]\n>>" def test_encode_pdfdocencoding_keyerror(): with pytest.raises(UnicodeEncodeError) as exc: encode_pdfdocencoding("😀") assert exc.value.args[0] == "pdfdocencoding" @pytest.mark.parametrize("test_input", ["", "data"]) def test_encode_pdfdocencoding_returns_bytes(test_input): """ Test that encode_pdfdocencoding() always returns bytes because bytearray is duck type compatible with bytes in mypy """ out = encode_pdfdocencoding(test_input) assert isinstance(out, bytes) def test_read_object_comment_exception(): stream = BytesIO(b"% foobar") pdf = None with pytest.raises(PdfStreamError) as exc: read_object(stream, pdf) assert exc.value.args[0] == "File ended unexpectedly." 
def test_read_object_empty(): stream = BytesIO(b"endobj") pdf = None assert isinstance(read_object(stream, pdf), NullObject) def test_read_object_empty_in_array(): stream = BytesIO(b"[endobj") pdf = None result = read_object(stream, pdf) assert isinstance(result, ArrayObject) assert len(result) == 1 assert isinstance(result[0], NullObject) def test_read_object_invalid(): stream = BytesIO(b"hello") pdf = None with pytest.raises(PdfReadError) as exc: read_object(stream, pdf) assert "hello" in exc.value.args[0] def test_read_object_comment(): stream = BytesIO(b"% foobar\n1 ") pdf = None out = read_object(stream, pdf) assert out == 1 def test_bytestringobject(): bo = ByteStringObject("stream", encoding="utf-8") stream = BytesIO(b"") bo.write_to_stream(stream) stream.seek(0, 0) assert stream.read() == b"<73747265616d>" # TODO: how can we verify this? def test_dictionaryobject_key_is_no_pdfobject(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) with pytest.raises(ValueError) as exc: do["foo"] = NameObject("/GoTo") assert exc.value.args[0] == "Key must be a PdfObject" def test_dictionaryobject_xmp_meta(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) assert do.xmp_metadata is None def test_dictionaryobject_value_is_no_pdfobject(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) with pytest.raises(ValueError) as exc: do[NameObject("/S")] = "/GoTo" assert exc.value.args[0] == "Value must be a PdfObject" def test_dictionaryobject_setdefault_key_is_no_pdfobject(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) with pytest.raises(ValueError) as exc: do.setdefault("foo", NameObject("/GoTo")) assert exc.value.args[0] == "Key must be a PdfObject" def test_dictionaryobject_setdefault_value_is_no_pdfobject(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) with pytest.raises(ValueError) as exc: do.setdefault(NameObject("/S"), "/GoTo") assert exc.value.args[0] == "Value must be a PdfObject" def 
test_dictionaryobject_setdefault_value(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) do.setdefault(NameObject("/S"), NameObject("/GoTo")) def test_dictionaryobject_read_from_stream(): stream = BytesIO(b"<< /S /GoTo >>") pdf = None out = DictionaryObject.read_from_stream(stream, pdf) assert out.get_object() == {NameObject("/S"): NameObject("/GoTo")} def test_dictionaryobject_read_from_stream_broken(): stream = BytesIO(b"< /S /GoTo >>") pdf = None with pytest.raises(PdfReadError) as exc: DictionaryObject.read_from_stream(stream, pdf) assert ( exc.value.args[0] == "Dictionary read error at byte 0x2: stream must begin with '<<'" ) def test_dictionaryobject_read_from_stream_unexpected_end(): stream = BytesIO(b"<< \x00/S /GoTo") pdf = None with pytest.raises(PdfStreamError) as exc: DictionaryObject.read_from_stream(stream, pdf) assert exc.value.args[0] == "Stream has ended unexpectedly" def test_dictionaryobject_read_from_stream_stream_no_newline(): stream = BytesIO(b"<< /S /GoTo >>stream") pdf = None with pytest.raises(PdfReadError) as exc: DictionaryObject.read_from_stream(stream, pdf) assert exc.value.args[0] == "Stream data must be followed by a newline" @pytest.mark.parametrize(("strict"), [(True), (False)]) def test_dictionaryobject_read_from_stream_stream_no_stream_length(strict, caplog): stream = BytesIO(b"<< /S /GoTo >>stream\n123456789endstream abcd") class Tst: # to replace pdf strict = False pdf = Tst() pdf.strict = strict if strict: with pytest.raises(PdfReadError) as exc: DictionaryObject.read_from_stream(stream, pdf) assert exc.value.args[0] == "Stream length not defined" else: o = DictionaryObject.read_from_stream(stream, pdf) assert "Stream length not defined" in caplog.text assert o.get_data() == b"123456789" @pytest.mark.parametrize( ("strict", "length", "should_fail"), [ (True, 6, False), (True, 10, False), (True, 4, True), (False, 6, False), (False, 10, False), ], ) def test_dictionaryobject_read_from_stream_stream_stream_valid( 
strict, length, should_fail ): stream = BytesIO(b"<< /S /GoTo /Length %d >>stream\nBT /F1\nendstream\n" % length) class Tst: # to replace pdf strict = True pdf = Tst() pdf.strict = strict with pytest.raises(PdfReadError) as exc: do = DictionaryObject.read_from_stream(stream, pdf) # TODO: What should happen with the stream? assert do == {"/S": "/GoTo"} if length in (6, 10): assert b"BT /F1" in do.get_data() raise PdfReadError("__ALLGOOD__") assert should_fail ^ (exc.value.args[0] == "__ALLGOOD__")


def test_rectangleobject():
    """Read the four corners of a RectangleObject, then move corners and edges."""
    ro = RectangleObject((1, 2, 3, 4))
    assert ro.lower_left == (1, 2)
    assert ro.lower_right == (3, 2)
    assert ro.upper_left == (1, 4)
    assert ro.upper_right == (3, 4)

    ro.lower_left = (5, 6)
    assert ro.lower_left == (5, 6)
    # Shifting the bottom and left edges moves the lower-left corner.
    ro.bottom -= 2
    ro.left -= 2
    assert ro.lower_left == (3, 4)

    ro.lower_right = (7, 8)
    assert ro.lower_right == (7, 8)

    ro.upper_left = (9, 11)
    assert ro.upper_left == (9, 11)

    ro.upper_right = (13, 17)
    assert ro.upper_right == (13, 17)
    # Shifting the top and right edges moves the upper-right corner.
    ro.top += 1
    ro.right += 1
    assert ro.upper_right == (14, 18)


def test_textstringobject_exc():
    """A plain ASCII text string returns its bytes unchanged."""
    tso = TextStringObject("foo")
    assert tso.get_original_bytes() == b"foo"


def test_textstringobject_autodetect_utf16():
    """With autodetect_utf16 set, the configured BOM selects the UTF-16 byte order."""
    tso = TextStringObject("foo")
    tso.autodetect_utf16 = True
    tso.utf16_bom = codecs.BOM_UTF16_BE
    assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o"
    tso.utf16_bom = codecs.BOM_UTF16_LE
    assert tso.get_original_bytes() == b"\xff\xfef\x00o\x00o\x00"
    assert tso.get_encoded_bytes() == b"\xff\xfef\x00o\x00o\x00"


def test_textstringobject__numbers_as_input():
    """Constructing a TextStringObject from an int or a float must not raise."""
    _ = TextStringObject(42)
    _ = TextStringObject(13.37)


def test_remove_child_not_in_tree():
    """Removing a child from an empty tree raises ValueError."""
    tree = TreeObject()
    with pytest.raises(ValueError) as exc:
        tree.remove_child(ChildDummy())
    assert exc.value.args[0] == "Removed child does not appear to be a tree item"


def test_remove_child_not_in_that_tree():
    """Check both removal error messages: before and after the child is added."""
    tree = TreeObject()
    tree.indirect_reference = NullObject()
    child = TreeObject()
    child.indirect_reference = NullObject()
    # The child has not been added to any tree yet.
    with pytest.raises(ValueError) as exc:
        child.remove_from_tree()
    assert exc.value.args[0] == "Removed child does not appear to be a tree item"
    tree.add_child(child, ReaderDummy())
    with pytest.raises(ValueError) as exc:
        tree.remove_child(child)
    assert exc.value.args[0] == "Removed child is not a member of this tree"


def test_remove_child_not_found_in_tree():
    """A child that names the tree as /Parent but was never added cannot be removed."""
    class ChildDummy(DictionaryObject):
        @property
        def indirect_reference(self) -> "ChildDummy":
            return self

    tree = TreeObject()
    tree.indirect_reference = NullObject()
    child = ChildDummy(TreeObject())
    tree.add_child(child, ReaderDummy())
    child2 = ChildDummy(TreeObject())
    child2[NameObject("/Parent")] = tree
    with pytest.raises(ValueError) as exc:
        tree.remove_child(child2)
    assert exc.value.args[0] == "Removal couldn't find item in tree"


def test_remove_child_found_in_tree():
    """Add and remove children at the end, start and middle, checking /Count each time."""
    writer = PdfWriter()

    # Add Tree
    tree = TreeObject()
    writer._add_object(tree)

    # Add first child
    # It's important to set a value, otherwise the writer.get_reference will
    # return the same object when a second child is added.
    child1 = TreeObject()
    child1[NameObject("/Foo")] = TextStringObject("bar")
    child1_ref = writer._add_object(child1)
    tree.add_child(child1_ref, writer)
    assert tree[NameObject("/Count")] == 1
    assert len(list(tree.children())) == 1

    # Add second child
    child2 = TreeObject()
    child2[NameObject("/Foo")] = TextStringObject("baz")
    child2_ref = writer._add_object(child2)
    tree.add_child(child2_ref, writer)
    assert tree[NameObject("/Count")] == 2
    assert len(list(tree.children())) == 2

    # Remove last child
    tree.remove_child(child2_ref)
    assert tree[NameObject("/Count")] == 1
    assert len(list(tree.children())) == 1

    # Add new child
    child3 = TreeObject()
    child3[NameObject("/Foo")] = TextStringObject("3")
    child3_ref = writer._add_object(child3)
    tree.add_child(child3_ref, writer)
    assert tree[NameObject("/Count")] == 2
    assert len(list(tree.children())) == 2

    # Remove first child
    child1 = tree[NameObject("/First")]
    tree.remove_child(child1)
    assert tree[NameObject("/Count")] == 1
    assert len(list(tree.children())) == 1

    child4 = TreeObject()
    child4[NameObject("/Foo")] = TextStringObject("4")
    child4_ref = writer._add_object(child4)
    tree.add_child(child4_ref, writer)
    assert tree[NameObject("/Count")] == 2
    assert len(list(tree.children())) == 2

    child5 = TreeObject()
    child5[NameObject("/Foo")] = TextStringObject("5")
    child5_ref = writer._add_object(child5)
    tree.add_child(child5_ref, writer)
    assert tree[NameObject("/Count")] == 3
    assert len(list(tree.children())) == 3

    # Remove middle child
    child4.remove_from_tree()
    assert tree[NameObject("/Count")] == 2
    assert len(list(tree.children())) == 2

    tree.empty_tree()


def test_remove_child_in_tree():
    """Add, remove and re-add the writer's most recently registered object."""
    pdf = RESOURCE_ROOT / "form.pdf"

    tree = TreeObject()
    reader = PdfReader(pdf)
    writer = PdfWriter()
    writer._add_object(tree)
    writer.add_page(reader.pages[0])
    writer.add_outline_item("foo", page_number=0)
    # Last entry of the writer's object list after the outline item was added.
    obj = writer._objects[-1]
    tree.add_child(obj, writer)
    tree.remove_child(obj)
    tree.add_child(obj, writer)
    tree.empty_tree()


@pytest.mark.enable_socket
@pytest.mark.parametrize( ("url", "name", "caplog_content"), [ ( # parse_content_stream_peek_percentage "https://github.com/user-attachments/files/18381763/tika-985770.pdf", "tika-985770.pdf", "", ), ( # read_inline_image_no_has_q "https://github.com/user-attachments/files/18381775/tika-998719.pdf", "tika-998719.pdf", "", ), ( # read_inline_image_loc_neg_1 "https://github.com/user-attachments/files/18381706/tika-935066.pdf", "tika-935066.pdf", "", ), ( # object_read_from_stream_unicode_error "https://github.com/user-attachments/files/18381750/tika-974966.pdf", "tika-974966.pdf", "", ), ( # dict_read_from_stream "https://github.com/user-attachments/files/18381762/tika-984877.pdf", "tika-984877.pdf", "Multiple definitions in dictionary at byte 0x1084 for key /Length", ), ], ids=[ "parse_content_stream_peek_percentage", "read_inline_image_no_has_q", "read_inline_image_loc_neg_1", "object_read_from_stream_unicode_error", "dict_read_from_stream", ], ) def test_extract_text(caplog, url: str, name: str, caplog_content: str): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for page in reader.pages: page.extract_text() if caplog_content == "": assert caplog_content == caplog.text else: assert caplog_content in caplog.text @pytest.mark.slow @pytest.mark.enable_socket def test_text_string_write_to_stream(): url = "https://github.com/user-attachments/files/18381698/tika-924562.pdf" name = "tika-924562.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.clone_document_from_reader(reader) for page in writer.pages: page.compress_content_streams() @pytest.mark.enable_socket def test_bool_repr(tmp_path): url = "https://github.com/user-attachments/files/18381703/tika-932449.pdf" name = "tika-932449.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) write_path = tmp_path / "tmp-fields-report.txt" with open(write_path, "w") as fp: fields = reader.get_fields(fileobj=fp) assert fields assert list(fields.keys()) 
== ["USGPOSignature"] with open(write_path) as fp: data = fp.read() assert data.startswith( "Field Name: USGPOSignature\nField Type: Signature\nField Flags: 1\n" "Value: {'/Type': '/Sig', '/Filter': '/Adobe.PPKLite', " "'/SubFilter':" ) @pytest.mark.enable_socket def test_issue_997(pdf_file_path): url = ( "https://github.com/py-pdf/pypdf/files/8908874/" "Exhibit_A-2_930_Enterprise_Zone_Tax_Credits_final.pdf" ) name = "gh-issue-997.pdf" merger = PdfWriter() merger.append(BytesIO(get_data_from_url(url, name=name))) # here the error raises with open(pdf_file_path, "wb") as f: merger.write(f) merger.close() # Strict merger = PdfWriter() merger.append(BytesIO(get_data_from_url(url, name=name))) # here the error raises with open(pdf_file_path, "wb") as f: merger.write(f) merger.close() def test_checkboxradiobuttonattributes_opt(): assert "/Opt" in CheckboxRadioButtonAttributes.attributes_dict() def test_name_object_invalid_decode(): charsets = deepcopy(NameObject.CHARSETS) try: NameObject.CHARSETS = ("utf-8",) stream = BytesIO(b"/\x80\x02\x03") # strict: with pytest.raises(PdfReadError) as exc: NameObject.read_from_stream(stream, ReaderDummy(strict=True)) assert "Illegal character in NameObject " in exc.value.args[0] # non-strict: stream.seek(0) NameObject.read_from_stream(stream, ReaderDummy(strict=False)) finally: NameObject.CHARSETS = charsets def test_indirect_object_invalid_read(): stream = BytesIO(b"0 1 s") with pytest.raises(PdfReadError) as exc: IndirectObject.read_from_stream(stream, ReaderDummy()) assert exc.value.args[0] == "Error reading indirect object reference at byte 0x5" def test_create_string_object_utf16_bom(): # utf16-be result = create_string_object( b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00" ) assert result == "PaperPort 14\x00" assert result.autodetect_utf16 is True assert result.utf16_bom == b"\xfe\xff" assert ( result.get_encoded_bytes() == b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 
\x001\x004\x00\x00" ) # utf16-le result = create_string_object( b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" ) assert result == "PaperPort 14\x00" assert result.autodetect_utf16 is True assert result.utf16_bom == b"\xff\xfe" assert ( result.get_encoded_bytes() == b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" ) result = TextStringObject( b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" ) assert result == "PaperPort 14\x00" assert result.autodetect_utf16 is True assert result.utf16_bom == b"\xff\xfe" assert ( result.get_encoded_bytes() == b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" ) # utf16-be without bom result = TextStringObject("ÿ") result.autodetect_utf16 = True result.utf16_bom = b"" assert result.get_encoded_bytes() == b"\x00\xFF" assert result.original_bytes == b"\x00\xFF" def test_create_string_object_force(): assert create_string_object(b"Hello World", []) == "Hello World" assert create_string_object(b"Hello World", {72: "A"}) == "Aello World" assert create_string_object(b"Hello World", "utf8") == "Hello World" @pytest.mark.parametrize( ("value", "expected"), [ ("0.000000", "0.0"), ("0.0", "0.0"), ("1.0", "1"), ("0.123000", "0.123"), ("0.000123000", "0.000123"), ("0.0", "0.0"), ("0", "0.0"), ("1", "1"), ("1.0", "1"), ("1.01", "1.01"), ("1.010", "1.01"), ("0000.0000", "0.0"), ("0.10101010", "0.1010101"), ("50000000000", "50000000000"), ("99900000000000000123", "99900000000000000000"), ("99900000000000000123.456000", "99900000000000000000"), ("0.00000000000000000000123", "0.00000000000000000000123"), ("0.00000123", "0.00000123"), ("0.00000000000000000000123000", "0.00000000000000000000123"), ("-4.6", "-4.6"), # from #1910 # ( # "50032481330523882508234.00000000000000000000123000", # "50032481330523882508234.00000000000000000000123", # ), # ( # 
"928457298572093487502198745102973402987412908743.75249875981374981237498213740000", # "928457298572093487502198745102973402987412908743.7524987598137498123749821374", # ), ], ) def test_float_object_decimal_to_string(value, expected): assert repr(FloatObject(value)) == expected


def test_cloning(caplog):
    """Clone dictionaries, references, arrays and streams into a writer, tracking object counts."""
    writer = PdfWriter()
    with pytest.raises(Exception) as exc:
        PdfObject().clone(writer)
    assert "PdfObject does not implement .clone so far" in exc.value.args[0]

    obj1 = DictionaryObject()
    obj1.indirect_reference = None
    n = len(writer._objects)
    # The first clone registers exactly one new object in the writer.
    obj2 = obj1.clone(writer)
    assert len(writer._objects) == n + 1
    # Cloning again without forcing does not add further objects.
    obj3 = obj2.clone(writer)
    assert len(writer._objects) == n + 1
    assert obj2.indirect_reference == obj3.indirect_reference
    obj3 = obj2.indirect_reference.clone(writer)
    assert len(writer._objects) == n + 1
    assert obj2.indirect_reference == obj3.indirect_reference
    assert (
        obj2.indirect_reference
        == obj2._reference_clone(obj2, writer).indirect_reference
    )
    assert len(writer._objects) == n + 1
    assert obj2.indirect_reference == obj3.indirect_reference

    # Passing True as second argument results in a new, distinct object.
    obj3 = obj2.indirect_reference.clone(writer, True)
    assert len(writer._objects) == n + 2
    assert obj2.indirect_reference != obj3.indirect_reference

    arr1 = ArrayObject([obj2])
    arr2 = arr1.clone(writer)
    arr3 = arr2.clone(writer)
    assert arr2 == arr3
    obj10 = StreamObject()
    arr1 = ArrayObject([obj10])
    obj11 = obj10.clone(writer)
    assert arr1[0] == obj11

    obj20 = DictionaryObject(
        {NameObject("/Test"): NumberObject(1), NameObject("/Test2"): StreamObject()}
    )
    obj21 = obj20.clone(writer, ignore_fields=None)
    assert "/Test" in obj21
    # The stream value is reachable through an indirect reference in the clone.
    assert isinstance(obj21.get("/Test2"), IndirectObject)


def test_cloning_indirect_obj_keeps_hard_reference():
    """
    Reported in #3450

    Ensure that cloning an IndirectObject keeps a hard reference to
    the underlying object, preventing its deallocation, which could allow
    `id(obj)` to return the same value for different objects.
    """
    writer1 = PdfWriter()
    indirect_object = IndirectObject(1, 0, writer1)

    # Create a weak reference to the underlying object to test later
    # if it is still alive in memory or not
    obj_weakref = weakref.ref(indirect_object.pdf)
    assert obj_weakref() is not None

    writer2 = PdfWriter()
    indirect_object.clone(writer2)

    # Mimic indirect_object/writer1 going out of scope and being
    # garbage collected. Clone should have kept a hard reference to
    # it, preventing its deallocation.
    del indirect_object
    del writer1
    gc.collect()
    assert obj_weakref() is not None


def test_cloning_null_obj_keeps_hard_reference():
    """
    Ensure that cloning a NullObject keeps a hard reference to
    the underlying object, preventing its deallocation, which could allow
    `id(obj)` to return the same value for different objects.
    """
    writer1 = PdfWriter()
    indirect_object = IndirectObject(1, 0, writer1)
    null_obj = NullObject()
    null_obj.indirect_reference = indirect_object

    # Create a weak reference to the underlying object to test later
    # if it is still alive in memory or not
    obj_weakref = weakref.ref(indirect_object.pdf)
    assert obj_weakref() is not None

    writer2 = PdfWriter()
    null_obj.clone(writer2)

    # Mimic indirect_object/writer1 going out of scope and being
    # garbage collected. Clone should have kept a hard reference to
    # it, preventing its deallocation.
    del indirect_object
    del writer1
    del null_obj
    gc.collect()
    assert obj_weakref() is not None


@pytest.mark.enable_socket
def test_append_with_indirectobject_not_pointing(caplog):
    """
    Reported in #1631

    Appending still works when the referenced object 43 0 is not defined;
    the problem is only logged.
    """
    url = "https://github.com/py-pdf/pypdf/files/10729142/document.pdf"
    name = "tst_iss1631.pdf"
    data = BytesIO(get_data_from_url(url, name=name))
    reader = PdfReader(data, strict=False)
    writer = PdfWriter()
    writer.append(reader)
    assert "Object 43 0 not defined." in caplog.text


@pytest.mark.enable_socket
def test_iss1615_1673():
    """
    Test cases where /N is not indicating chains of objects
    test also where /N,... are not part of chains
    """
    # #1615
    url = "https://github.com/py-pdf/pypdf/files/10671366/graph_letter.pdf"
    name = "graph_letter.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    writer = PdfWriter()
    writer.append(reader)
    # The /N entry of the colour space object must still be there after appending.
    assert (
        "/N"
        in writer.pages[0]["/Annots"][0]
        .get_object()["/AP"]["/N"]["/Resources"]["/ColorSpace"]["/Cs1"][1]
        .get_object()
    )
    # #1673
    url = "https://github.com/py-pdf/pypdf/files/10848750/budgeting-loan-form-sf500.pdf"
    name = "budgeting-loan-form-sf500.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)


@pytest.mark.enable_socket
def test_destination_withoutzoom():
    """Cf issue #1832"""
    url = "https://github.com/user-attachments/files/15605648/2021_book_security.pdf"
    name = "2021_book_security.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    # Accessing the outline is the operation under test; the value is not used.
    reader.outline

    out = BytesIO()
    writer = PdfWriter(clone_from=reader)
    writer.write(out)


def test_encodedstream_set_data(): """ EncodedStreamObject.set_data to extend data stream works. Checks also the flate_encode.
""" pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) co = reader.pages[0]["/Contents"][0].get_object() co.set_data(b"%hello\n" + co.get_data()) assert b"hello" in co.get_data() b = BytesIO() co.write_to_stream(b) b.seek(0) aa = read_object(b, None) assert b"hello" in aa.get_data() assert aa["/Filter"] == "/FlateDecode" assert "/DecodeParms" not in aa bb = aa.flate_encode() assert b"hello" in bb.get_data() assert bb["/Filter"] == ["/FlateDecode", "/FlateDecode"] assert str(bb["/DecodeParms"]) == "[NullObject, NullObject]" bb[NameObject("/Test")] = NameObject("/MyTest") cc = bb.flate_encode() assert bb["/Filter"] == ["/FlateDecode", "/FlateDecode"] assert b"hello" in cc.get_data() assert cc["/Filter"] == ["/FlateDecode", "/FlateDecode", "/FlateDecode"] assert str(cc["/DecodeParms"]) == "[NullObject, NullObject, NullObject]" assert cc[NameObject("/Test")] == "/MyTest" with pytest.raises(TypeError): aa.set_data("toto") aa[NameObject("/Filter")] = NameObject("/JPXEncode") with pytest.raises(PdfReadError): aa.set_data(b"toto")


@pytest.mark.enable_socket
def test_set_data_2():
    """
    Modify a stream not yet loaded and
    where the filter is ["/FlateDecode"]
    """
    url = "https://github.com/user-attachments/files/16796095/f5471sm-2.pdf"
    name = "iss2780.pdf"
    writer = PdfWriter(BytesIO(get_data_from_url(url, name=name)))
    writer.root_object["/AcroForm"]["/XFA"][7].set_data(b"test")
    # The filter array is kept and the new data is returned on read-back.
    assert writer.root_object["/AcroForm"]["/XFA"][7].get_object()["/Filter"] == [
        "/FlateDecode"
    ]
    assert writer.root_object["/AcroForm"]["/XFA"][7].get_object().get_data() == b"test"


@pytest.mark.enable_socket
def test_calling_indirect_objects():
    """Cope with cases where attributes/items are called from indirectObject"""
    url = "https://github.com/user-attachments/files/15605648/2021_book_security.pdf"
    name = "2021_book_security.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    # Item and method access through indirect references must not raise.
    reader.trailer.get("/Info")["/Creator"]
    reader.pages[0]["/Contents"][0].get_data()
    writer = PdfWriter(clone_from=reader)
    ind = writer._add_object(writer)
    assert ind.fileobj == writer.fileobj
    with pytest.raises(AttributeError):
        ind.not_existing_attribute
    # create an IndirectObject referencing an IndirectObject.
    writer._objects.append(writer.pages[0].indirect_reference)
    ind = IndirectObject(len(writer._objects), 0, writer)
    with pytest.raises(PdfStreamError):
        ind["/Type"]


@pytest.mark.enable_socket
def test_indirect_object_page_dimensions():
    """The media box of the first page of the issue2287 document is read correctly."""
    url = "https://github.com/py-pdf/pypdf/files/13302338/Zymeworks_Corporate.Presentation_FINAL1101.pdf.pdf"
    name = "issue2287.pdf"
    data = BytesIO(get_data_from_url(url, name=name))
    reader = PdfReader(data, strict=False)
    mediabox = reader.pages[0].mediabox
    assert mediabox == RectangleObject((0, 0, 792, 612))


def test_indirect_object_contains():
    """The ``in`` operator works directly on an IndirectObject."""
    writer = PdfWriter()
    indirect_object = IndirectObject(1, 0, writer)
    assert "foo" not in indirect_object
    assert "/Producer" in indirect_object


def test_indirect_object_iter():
    """Iterating an IndirectObject yields the keys of the referenced object."""
    writer = PdfWriter()
    indirect_object = IndirectObject(1, 0, writer)
    assert "foo" not in list(indirect_object)
    assert "/Producer" in list(indirect_object)


def test_array_operators():
    """Exercise ``+``, ``+=`` and ``-=`` on ArrayObject with several operand types."""
    a = ArrayObject(
        [
            NumberObject(1),
            NumberObject(2),
            NumberObject(3),
            NumberObject(4),
        ]
    )
    # `+` returns a new array and leaves the left operand unchanged.
    b = a + 5
    assert isinstance(b, ArrayObject)
    assert b == [1, 2, 3, 4, 5]
    assert a == [1, 2, 3, 4]
    a -= 2
    a += "abc"
    a -= (3, 4)
    a += ["d", "e"]
    a += BooleanObject(True)
    assert a == [1, "abc", "d", "e", True]
    # A string starting with "/" is stored as a name, other strings as text strings.
    a += "/toto"
    assert isinstance(a[-1], NameObject)
    assert isinstance(a[1], TextStringObject)
    a += b"1234"
    assert a[-1] == ByteStringObject(b"1234")
    # Subtracting a value that is not in the array leaves the length unchanged.
    la = len(a)
    a -= 300
    assert len(a) == la


def test_unitary_extract_inline_buffer_invalid(): with pytest.raises(PdfReadError): extract_inline__ascii_hex_decode(BytesIO()) with pytest.raises(PdfReadError): extract_inline__ascii_hex_decode(BytesIO(4095 * b"00" + b" ")) with pytest.raises(PdfReadError): extract_inline__ascii_hex_decode(BytesIO(b"00")) with pytest.raises(PdfReadError):
extract_inline__ascii85_decode(BytesIO()) with pytest.raises(PdfReadError): extract_inline__ascii85_decode(BytesIO(a85encode(b"1"))) with pytest.raises(PdfReadError): extract_inline__ascii85_decode(BytesIO(a85encode(b"1") + b"~> Q")) with pytest.raises(PdfReadError): extract_inline__ascii85_decode(BytesIO(a85encode(b"1234578" * 990))) with pytest.raises(PdfReadError): extract_inline__run_length_decode(BytesIO()) with pytest.raises(PdfReadError): extract_inline__run_length_decode(BytesIO(b"\x01\x01\x80")) with pytest.raises(PdfReadError): extract_inline__dct_decode(BytesIO(b"\xFF\xD9"))


def test_unitary_extract_inline():
    """Run the inline-image extractors and ContentStream parsing on hand-built data."""
    # NOTE(review): the triple-quoted byte literals below are reproduced on one
    # line exactly as they appear here; confirm whether they originally
    # contained raw line breaks.
    # AHx
    b = 16000 * b"00"
    assert len(extract_inline__ascii_hex_decode(BytesIO(b + b" EI"))) == len(b)
    with pytest.raises(PdfReadError):
        extract_inline__ascii_hex_decode(BytesIO(b + b"> "))
    # RL
    b = 8200 * b"\x00\xAB" + b"\x80"
    assert len(extract_inline__run_length_decode(BytesIO(b + b" EI"))) == len(b)

    # default
    # EIDD instead of EI; using A85
    b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm BI\n/W 16 /H 16 /BPC 8 /CS /RGB /F [/A85 /Fl]\nID Gar8O(o6*is8QV#;;JAuTq2lQ8J;%6#\'d5b"Q[+ZD?\'\\+CGj9~> EIDD Q\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n"""
    ec = DecodedStreamObject()
    ec.set_data(b)
    co = ContentStream(ec, None)
    # Reading `operations` triggers the parsing and therefore the error.
    with pytest.raises(PdfReadError) as exc:
        co.operations
    assert "EI stream not found" in exc.value.args[0]
    # EIDD instead of EI; using /Fl (default extraction)
    b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm BI\n/W 16 /H 16 /BPC 8 /CS /RGB /F /Fl \nID Gar8O(o6*is8QV#;;JAuTq2lQ8J;%6#\'d5b"Q[+ZD?\'\\+CGj9~> EIDD Q\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n"""
    ec = DecodedStreamObject()
    ec.set_data(b)
    co = ContentStream(ec, None)
    with pytest.raises(PdfReadError) as exc:
        co.operations
    assert "Unexpected end of stream" in exc.value.args[0]

    b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm BI\n/W 16 /H 16 /BPC 8 /CS /RGB /F /Fl \nID Gar8O(o6*is8QV#;;JAuTq2lQ8J;%6#\'d5b"Q[+ZD?\'\\+CGj9~>EI BT\nQ\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n"""
    ec = DecodedStreamObject()
    ec.set_data(b)
    co = ContentStream(ec, None)
    with pytest.raises(PdfReadError) as exc:
        co.operations
    assert "Unexpected end of stream" in exc.value.args[0]

    # No filter: the 16 bytes between `ID ` and `EI` are returned as image data.
    b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm BI\n/W 4 /H 4 /CS /G \nID abcdefghijklmnopEI Q\nQ\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n"""
    ec = DecodedStreamObject()
    ec.set_data(b)
    co = ContentStream(ec, None)
    assert co.operations[7][0]["data"] == b"abcdefghijklmnop"

    # Same data without a /CS entry.
    b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm BI\n/W 4 /H 4 \nID abcdefghijklmnopEI Q\nQ\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n"""
    ec = DecodedStreamObject()
    ec.set_data(b)
    co = ContentStream(ec, None)
    assert co.operations[7][0]["data"] == b"abcdefghijklmnop"


def test_missing_hashbin():
    """hash_bin() of NullObject and ByteStringObject is the hash of a (class, value) tuple."""
    assert NullObject().hash_bin() == hash((NullObject,))
    assert hash(NullObject()) == NullObject().hash_bin()
    t = ByteStringObject(b"123")
    assert t.hash_bin() == hash((ByteStringObject, b"123"))


def test_is_null_or_none():
    """is_null_or_none() is true for NullObject, missing keys and unknown objects."""
    assert is_null_or_none(NullObject())
    assert not is_null_or_none(PdfObject())
    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    # used with get
    assert is_null_or_none(reader.root_object.get("/do_no_exist"))
    # object unknown...
    assert is_null_or_none(IndirectObject(99999, 0, reader).get_object())
    # ... or which has been replaced with NullObject
    writer = PdfWriter(reader)
    writer.pages[0]["/Contents"].append(writer._add_object(NullObject()))
    assert is_null_or_none(writer.pages[0]["/Contents"][-1])


def test_coverage_arrayobject():
    """Replicate and clone arrays holding plain ints and stream objects."""
    writer = PdfWriter()
    a = ArrayObject([1])
    assert isinstance(a.replicate(writer)[0], int)
    assert isinstance(a.clone(writer)[0], int)
    a.indirect_reference = IndirectObject(1, 0, writer)
    assert isinstance(a.clone(writer)[0], int)

    r = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    a = ArrayObject([r.pages[0]["/Contents"][0].get_object()])
    aa = a.clone(writer)
    # The cloned array holds an indirect reference for the stream element.
    assert isinstance(aa[0], IndirectObject)
    for k, v in aa.items():
        assert isinstance(k, int)
        assert isinstance(v, PdfObject)


def test_coverage_streamobject():
    """Replicate and clone stream objects in several states of ``decoded_self``."""
    writer = PdfWriter()
    s = StreamObject()
    # Also cover the case where the attribute is missing altogether.
    del s.decoded_self
    s.replicate(writer)
    s.clone(writer)

    co = ContentStream(None, None)
    co.replicate(writer)
    co.clone(writer, False, None)
    co.indirect_reference = IndirectObject(1, 0, writer)
    assert co == co.clone(writer)

    r = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    co = r.pages[0].get_contents()
    co[NameObject("/testkey")] = NameObject("/test")
    co.decoded_self = None
    assert "/testkey" in co.replicate(writer)
    co = r.pages[0].get_contents()
    co[NameObject("/testkey")] = NameObject("/test")
    co.decoded_self = DecodedStreamObject()
    assert "/testkey" in co.replicate(writer)


def test_contentstream_arrayobject_containing_nullobject(caplog):
    """A NullObject inside the stream array is skipped without logging anything."""
    stream_object = DecodedStreamObject()
    stream_object.set_data(b"Hello World!")

    input_stream = ArrayObject([NullObject(), stream_object])
    content_stream = ContentStream(stream=input_stream, pdf=None)
    assert content_stream.get_data() == b"Hello World!\n"
    assert caplog.text == ""


@pytest.mark.enable_socket
def test_build_link__go_to_action_without_destination():
    """Every page of the issue-3419 document can be added to a new writer."""
    reader = PdfReader(BytesIO(get_data_from_url(name="issue-3419.pdf")))
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
    assert len(writer.pages) == len(reader.pages)


@pytest.mark.enable_socket def
test_dictionaryobject__length_0_stream(): """Test for issue #3052.""" url = "https://github.com/user-attachments/files/18734105/correct.pdf" name = "issue3052.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) output = BytesIO() writer.write(output) assert b"\n8 0 obj\n<<\n/Length 0\n>>\nstream\n\nendstream\nendobj\n" in output.getvalue() ================================================ FILE: tests/test_images.py ================================================ """ Tests which ensure that image extraction works properly go here. Typically, tests in here should compare the extracted images count, names, and/or the actual image data with the expected value. """ from io import BytesIO from pathlib import Path from typing import Union from unittest import mock from zipfile import ZipFile import pytest from PIL import Image, ImageChops, ImageDraw from pypdf import PageObject, PdfReader, PdfWriter from pypdf.errors import LimitReachedError from pypdf.filters import JBIG2Decode from pypdf.generic import ContentStream, NameObject, NullObject from . import RESOURCE_ROOT, SAMPLE_ROOT, get_data_from_url from .utils import get_image_data def open_image(path: Union[Path, Image.Image, BytesIO]) -> Image.Image: if isinstance(path, Image.Image): img = path else: if isinstance(path, Path): assert path.exists() with Image.open(path) as img: img = ( img.copy() ) # Opened image should be copied to avoid issues with file closing return img def image_size(image: Image.Image): buffer = BytesIO() image.save(buffer, format=image.format) return buffer.tell() def image_similarity( path1: Union[Path, Image.Image, BytesIO], path2: Union[Path, Image.Image, BytesIO] ) -> float: """ Check image similarity. A value of "0" means the images are different. A value of 1 means they are identical. A value above 0.9 means they are almost the same. This can be used to ensure visual similarity. 
""" # Open the images using Pillow image1 = open_image(path1) image2 = open_image(path2) # Check if the images have the same dimensions if image1.size != image2.size: return 0 # Check if the color modes are the same if image1.mode != image2.mode: return 0 # Calculate the Mean Squared Error (MSE) diff = ImageChops.difference(image1, image2) pixels = get_image_data(diff) if isinstance(pixels[0], tuple): mse = sum(sum((c / 255.0) ** 2 for c in p) for p in pixels) / ( len(pixels) * len(pixels[0]) ) else: mse = sum((p / 255.0) ** 2 for p in pixels) / len(pixels) return 1 - mse @pytest.mark.samples def test_image_similarity_one(): path_a = SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png" path_b = path_a assert image_similarity(path_a, path_b) == 1 @pytest.mark.samples def test_image_similarity_zero(): path_a = SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png" path_b = SAMPLE_ROOT / "009-pdflatex-geotopo/page-23-Im2.png" assert image_similarity(path_a, path_b) == 0 @pytest.mark.samples def test_image_similarity_mid(): path_a = SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png" img_b = Image.open(path_a) draw = ImageDraw.Draw(img_b) # Fill the rectangle with black color draw.rectangle([0, 0, 100, 100], fill=(0, 0, 0)) sim1 = image_similarity(path_a, img_b) assert sim1 > 0.9 assert sim1 > 0 assert sim1 < 1 draw.rectangle([0, 0, 200, 200], fill=(0, 0, 0)) sim2 = image_similarity(path_a, img_b) assert sim2 < sim1 assert sim2 > 0 @pytest.mark.enable_socket def test_image_new_property(): name = "pdf_font_garbled.pdf" reader = PdfReader(BytesIO(get_data_from_url(name=name))) assert reader.pages[0].images.keys() == [ "/I0", "/I1", "/I2", "/I3", "/I4", "/I5", "/I6", "/I7", "/I8", "/I9", ["/TPL1", "/Image5"], ["/TPL2", "/Image53"], ["/TPL2", "/Image37"], ["/TPL2", "/Image49"], ["/TPL2", "/Image51"], ["/TPL2", "/Image39"], ["/TPL2", "/Image57"], ["/TPL2", "/Image55"], ["/TPL2", "/Image43"], ["/TPL2", "/Image30"], ["/TPL2", "/Image22"], 
["/TPL2", "/Image41"], ["/TPL2", "/Image47"], ["/TPL2", "/Image45"], ["/TPL3", "/Image65"], ["/TPL3", "/Image30"], ["/TPL3", "/Image61"], ["/TPL4", "/Image30"], ["/TPL5", "/Image30"], ["/TPL6", "/Image30"], ["/TPL7", "/Image30"], ["/TPL8", "/Image30"], ["/TPL9", "/Image30"], ["/TPL10", "/Image30"], ["/TPL11", "/Image30"], ["/TPL12", "/Image30"], ] assert len(reader.pages[0].images.items()) == 36 assert reader.pages[0].images[0].name == "I0.png" expected_image_url = "https://github.com/user-attachments/assets/3bf25760-2113-4e25-b4c2-fc1d3a84a263" expected_image_name = "pdf_font_garbled_image30.png" expected_image_data = BytesIO(get_data_from_url(url=expected_image_url, name=expected_image_name)) assert image_similarity( expected_image_data, reader.pages[0].images[-1].image ) == 1 assert reader.pages[0].images["/TPL1", "/Image5"].image.format == "JPEG" assert ( reader.pages[0].images["/I0"].indirect_reference.get_object() == reader.pages[0]["/Resources"]["/XObject"]["/I0"] ) list(reader.pages[0].images[0:2]) with pytest.raises(TypeError): reader.pages[0].images[b"0"] with pytest.raises(IndexError): reader.pages[0].images[9999] # just for test coverage: with pytest.raises(KeyError): reader.pages[0]._get_image(["test"], reader.pages[0]) assert list(PageObject(None, None).images) == [] @pytest.mark.parametrize( ("src", "page_index", "image_key", "expected"), [ ( SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf", 23, "/Im2", SAMPLE_ROOT / "009-pdflatex-geotopo/page-23-Im2.png", ), ( SAMPLE_ROOT / "003-pdflatex-image/pdflatex-image.pdf", 0, "/Im1", SAMPLE_ROOT / "003-pdflatex-image/page-0-Im1.jpg", ), ( SAMPLE_ROOT / "018-base64-image/base64image.pdf", 0, "/QuickPDFImd32aa1ab", SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png", ), ( SAMPLE_ROOT / "019-grayscale-image/grayscale-image.pdf", 0, "/X0", SAMPLE_ROOT / "019-grayscale-image/page-0-X0.png", ), ], ids=[ "009-pdflatex-geotopo/page-23-Im2.png", "003-pdflatex-image/page-0-Im1.jpg", 
"018-base64-image/page-0-QuickPDFImd32aa1ab.png", "019-grayscale-image/page-0-X0.png", ], ) @pytest.mark.samples def test_image_extraction(src, page_index, image_key, expected): reader = PdfReader(src) actual_image = reader.pages[page_index].images[image_key] if not expected.exists(): # A little helper for test generation with open(f"page-{page_index}-{actual_image.name}", "wb") as fp: fp.write(actual_image.data) assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99 def test_get_inline_image_without_xobject_resources(): page = PageObject(None, None) inline_image = object() with mock.patch.object(page, "_get_inline_images", return_value={"~0~": inline_image}): assert page._get_image("~0~") is inline_image def test_get_inline_image_without_xobject_resources_raises_when_missing(): page = PageObject(None, None) with ( mock.patch.object(page, "_get_inline_images", return_value=None), pytest.raises(KeyError, match="No inline image can be found"), ): page._get_image("~0~") def test_get_xobject_image_without_xobject_resources_raises(): page = PageObject(None, None) with pytest.raises( KeyError, match="Cannot access image object /Im0 without XObject resources", ): page._get_image("/Im0") @pytest.mark.enable_socket @pytest.mark.timeout(30) def test_loop_in_image_keys(): """Cf #2077""" reader = PdfReader(BytesIO(get_data_from_url(name="iss2077.pdf"))) reader.pages[0]["/Resources"]["/XObject"][NameObject("/toto")] = NullObject() reader.pages[0].images.keys() @pytest.mark.enable_socket def test_devicen_cmyk_black_only(): """Cf #2321""" url = "https://github.com/py-pdf/pypdf/files/13501846/Addressing_Adversarial_Attacks.pdf" name = "iss2321.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/assets/4083478/cc2dabc1-86e6-4179-a8a4-2b0efea124be" name = "iss2321_img0.pdf" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[5].images[0].image, img) >= 0.99 url = 
"https://github.com/py-pdf/pypdf/assets/4083478/6b64a949-42be-40d5-9eea-95707f350d89" name = "iss2321_img1.pdf" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[10].images[0].image, img) >= 0.99 @pytest.mark.enable_socket def test_bi_in_text(): """Cf #2456""" url = "https://github.com/py-pdf/pypdf/files/14322910/BI_text_with_one_image.pdf" name = "BI_text_with_one_image.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert reader.pages[0].images.keys() == ["~0~"] assert reader.pages[0].images[0].name == "~0~.png" @pytest.mark.enable_socket def test_cmyk_no_filter(): """Cf #2522""" url = "https://github.com/py-pdf/pypdf/files/14614887/out3.pdf" name = "iss2522.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].images[0].image @pytest.mark.enable_socket def test_separation_1byte_to_rgb_inverted(): """Cf #2343""" url = "https://github.com/py-pdf/pypdf/files/13679585/test2_P038-038.pdf" name = "iss2343.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/assets/4083478/b7f41897-96ef-4ea6-b165-5ef307a92b87" name = "iss2343.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[0].images[0].image, img) >= 0.99 obj = reader.pages[0].images[0].indirect_reference.get_object() obj.set_data(obj.get_data() + b"\x00") with pytest.raises(ValueError): reader.pages[0].images[0] @pytest.mark.enable_socket def test_data_with_lf(): """Cf #2343""" url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf" name = "iss2343b.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/assets/4083478/1120b0cf-a67a-403f-aa1a-9a191cbc087f" name = "iss2343b0.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[8].images[9].image, img) == 1.0 @pytest.mark.enable_socket def test_oserror(): 
"""Cf #2265""" url = "https://github.com/py-pdf/pypdf/files/13127130/Binance.discovery.responses.2.gov.uscourts.dcd.256060.140.1.pdf" name = "iss2265.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[2].images[1] # Due to errors in translation in pillow we may not get # the correct image. Therefore we cannot use `image_similarity`. @pytest.mark.parametrize( ("pdf", "pdf_name", "images", "images_name", "filtr"), [ ( "https://github.com/py-pdf/pypdf/files/13127197/FTX.Claim.SC30.01072023101624File595287144.pdf", "iss2266a.pdf", "https://github.com/py-pdf/pypdf/files/14967061/iss2266a_images.zip", "iss2266a_images.zip", ((0, 0), (1, 0), (4, 0), (9, 0)), # random pick-up to speed up test ), ( "https://github.com/py-pdf/pypdf/files/13127242/FTX.Claim.Skybridge.Capital.30062023113350File971325116.pdf", "iss2266b.pdf", "https://github.com/py-pdf/pypdf/files/14967099/iss2266b_images.zip", "iss2266b_images.zip", ((0, 0), (1, 0), (4, 0), (9, 0)), # random pick-up to speed up test ), ], ) @pytest.mark.enable_socket def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr): """ Code to create zipfile: import pypdf;zipfile with pypdf.PdfReader("____inputfile___") as r: with zipfile.ZipFile("__outputzip___","w") as z: for p in r.pages: for ii,i in enumerate(p.images): print(i.name) b=BytesIO() i.image.save(b,"JPEG") z.writestr(f"image_{p.page_number}_{ii}_{i.name}",b.getbuffer()) """ url = pdf name = pdf_name reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = images name = images_name print(pdf_name, images_name) # noqa: T201 with ZipFile(BytesIO(get_data_from_url(url, name=name)), "r") as zf: for fn in zf.namelist(): sp = fn.split("_") p, i = int(sp[1]), int(sp[2]) if filtr is not None and (p, i) not in filtr: continue print(fn) # noqa: T201 img = Image.open(BytesIO(zf.read(fn))) assert image_similarity(reader.pages[p].images[i].image, img) >= 0.99 @pytest.mark.enable_socket @pytest.mark.timeout(30) def 
test_large_compressed_image(): url = "https://github.com/py-pdf/pypdf/files/15306199/file_with_large_compressed_image.pdf" reader = PdfReader( BytesIO(get_data_from_url(url, name="file_with_large_compressed_image.pdf")) ) list(reader.pages[0].images) @pytest.mark.enable_socket def test_ff_fe_starting_lut(): """Cf issue #2660""" url = "https://github.com/py-pdf/pypdf/files/15385628/original_before_merge.pdf" name = "iss2660.pdf" writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) b = BytesIO() writer.write(b) reader = PdfReader(b) url = "https://github.com/py-pdf/pypdf/assets/4083478/6150700d-87fd-43a2-8695-c2c05a44838c" name = "iss2660.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(writer.pages[1].images[0].image, img) == 1.0 assert image_similarity(reader.pages[1].images[0].image, img) == 1.0 @pytest.mark.enable_socket def test_inline_image_extraction(): """Cf #2598""" url = "https://github.com/py-pdf/pypdf/files/14982414/lebo102.pdf" name = "iss2598.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # there is no error because images are correctly extracted reader.pages[1].extract_text() reader.pages[2].extract_text() reader.pages[3].extract_text() url = "https://github.com/py-pdf/pypdf/files/15210011/Pages.62.73.from.0560-22_WSP.Plan_July.2022_Version.1.pdf" name = "iss2598a.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].extract_text() reader.pages[1].extract_text() url = "https://github.com/mozilla/pdf.js/raw/master/test/pdfs/issue14256.pdf" name = "iss2598b.pdf" writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/assets/4083478/71bc5053-cfc7-44ba-b7be-8e2333e2c749" name = "iss2598b.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) for i in range(8): assert image_similarity(writer.pages[0].images[i].image, img) == 1 writer.pages[0].extract_text() # check recalculation of inline images 
assert writer.pages[0].inline_images is not None writer.pages[0].merge_scaled_page(writer.pages[0], 0.25) assert writer.pages[0].inline_images is None reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") writer.pages[0].merge_page(reader.pages[0]) assert list(writer.pages[0].images.keys()) == [ "/Im0", "~0~", "~1~", "~2~", "~3~", "~4~", "~5~", "~6~", "~7~", "~8~", "~9~", "~10~", "~11~", "~12~", "~13~", "~14~", "~15~", ] url = "https://github.com/py-pdf/pypdf/files/15233597/bug1065245.pdf" name = "iss2598c.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/assets/4083478/bfb221be-11bd-46fe-8129-55a58088a4b6" name = "iss2598c.jpg" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[0].images[0].image, img) >= 0.99 url = "https://github.com/py-pdf/pypdf/files/15282904/tt.pdf" name = "iss2598d.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/assets/4083478/1a770e1b-9ad2-4125-89ae-6069992dda23" name = "iss2598d.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[0].images[0].image, img) == 1 @pytest.mark.enable_socket def test_extract_image_from_object(caplog): url = "https://github.com/py-pdf/pypdf/files/15176076/B2.pdf" name = "iss2613.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) image = reader.pages[0]["/Resources"]["/Pattern"]["/P1"]["/Resources"]["/XObject"][ "/X1" ].decode_as_image() assert isinstance(image, Image.Image) with pytest.raises(Exception): co = reader.pages[0].get_contents() co.decode_as_image() assert "does not seem to be an Image" in caplog.text caplog.clear() co.indirect_reference = "for_test" with pytest.raises(Exception): co = reader.pages[0].get_contents() co.decode_as_image() assert "does not seem to be an Image" in caplog.text def test_extract_jpeg_with_explicit_quality(): reader = 
PdfReader(RESOURCE_ROOT / "side-by-side-subfig.pdf") page = reader.pages[0] x_object = page["/Resources"]["/XObject"]["/Im1"] assert x_object["/Filter"] == "/DCTDecode" image = x_object.decode_as_image() assert isinstance(image, Image.Image) assert image.format == "JPEG" small_image = x_object.decode_as_image(pillow_parameters={"quality": 75}) assert image_size(small_image) < image_size(image) @pytest.mark.enable_socket def test_4bits_images(caplog): url = "https://github.com/user-attachments/files/16624406/tt.pdf" name = "iss2411.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/user-attachments/assets/53058564-9a28-4e4a-818f-a6528013d7dc" name = "iss2411.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[0].images[1].image, img) == 1.0 @pytest.mark.enable_socket def test_no_filter_with_colorspace_as_list(): """Tests for #2998""" url = "https://github.com/user-attachments/files/18058571/9bf7a2e2-72c8-4ac1-b8ae-164df16c8cef.pdf" name = "iss2998.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] page.images.items() def test_contentstream__read_inline_image__fallback_is_successful(): stream = ContentStream(stream=None, pdf=None) stream.set_data( b"""Q q 9.6 0 0 4.8 5523.6 1031 cm BI /CS /RGB /W 2 /H 1 /BPC 8 ID \x8b\x8b\x8b\xfe\xfe\xfe EI Q /R413 gs """ ) page = PageObject(pdf=None) with mock.patch.object(page, "get_contents", return_value=stream): images = page._get_inline_images() assert list(images) == ["~0~"] assert images["~0~"].data == ( b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x02\x00\x00\x00\x01\x08\x02\x00\x00\x00{@\xe8\xdd\x00\x00\x00\x0f" b"IDATx\x9cc\xe8\xee\xee\xfe\xf7\xef\x1f\x00\x0e \x04\x9cpr_\x96\x00\x00\x00\x00IEND\xaeB`\x82" ) @pytest.mark.enable_socket def test_inline_image_containing_ei_in_body(): """Tests for #3107""" expected = """\nID 
><8d>£^H<8e><8b>¢AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA¡^BêMEI E^N^^<8a>^AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA^D <8b>²: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA5>^D é^EAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD<98>AAAAAA<8d><82> AAAAAAAA^B EI\nQ\n""".encode("latin1") # noqa: E501 url = "https://github.com/user-attachments/files/18943249/testing.pdf" name = "issue3107.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter(clone_from=reader) for page in writer.pages: page.transfer_rotation_to_content() output = BytesIO() writer.write(output) assert expected in output.getvalue() @pytest.mark.enable_socket @pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec") def test_jbig2decode(): url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf" name = "jbig2.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] image = next(iter(page.images)) assert image.image.size == (5138, 6630) assert image.image.mode == "1" assert image.image.format == "PNG" url = "https://github.com/user-attachments/assets/d6f88c80-a2e0-4ea9-b1e0-34442041d004" name = "jbig2.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(image.image, img) >= 0.999 @pytest.mark.enable_socket @pytest.mark.skipif(condition=not 
JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec") def test_jbig2decode__jbig2globals(): url = "https://github.com/user-attachments/files/20119148/out.pdf" name = "jbig2_globals.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] image = next(iter(page.images)) assert image.image.size == (1067, 1067) assert image.image.mode == "1" assert image.image.format == "PNG" url = "https://github.com/user-attachments/assets/7ac41ee3-9c13-44cf-aa74-8f106287e354" name = "jbig2_globals.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) # Wrong image: 0.9618265964800714 assert image_similarity(image.image, img) >= 0.999 @pytest.mark.enable_socket @pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec") def test_jbig2decode__memory_limit(): url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf" name = "jbig2.pdf" error_messages = [ # Version 0.20 ( r"^Memory limit reached while reading JBIG2 data:\n" r"jbig2dec FATAL ERROR memory: limit reached: limit: 5000000 \(4 Mbyte\) used: 4329386 \(4 Mbyte\) allocation: 4263106 \(4 Mbyte\)\n" # noqa: E501 r"jbig2dec FATAL ERROR failed to allocate image data buffer \(stride=643, height=6630\)" ), # Version 0.19 ( r"^Memory limit reached while reading JBIG2 data:\n" r"jbig2dec FATAL ERROR failed to allocate image data buffer \(stride=643, height=6630\)" ), ] with mock.patch("pypdf.filters.JBIG2_MAX_OUTPUT_LENGTH", 5_000_000): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] with pytest.raises(expected_exception=LimitReachedError, match=rf"({'|'.join(error_messages)})"): _ = next(iter(page.images)) @pytest.mark.enable_socket def test_get_ids_image__resources_is_none(): url = "https://github.com/user-attachments/files/18381726/tika-957721.pdf" name = "tika-957721.pdf" reader = 
PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[2] assert list(page.images.items()) == [] ================================================ FILE: tests/test_javascript.py ================================================ """Test topics around the usage of JavaScript in PDF documents.""" from typing import Any import pytest from pypdf import PdfReader, PdfWriter from tests import RESOURCE_ROOT @pytest.fixture def pdf_file_writer(): reader = PdfReader(RESOURCE_ROOT / "issue-604.pdf") writer = PdfWriter() writer.append_pages_from_reader(reader) return writer def test_add_js(pdf_file_writer): pdf_file_writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") assert ( "/Names" in pdf_file_writer._root_object ), "add_js should add a name catalog in the root object." assert ( "/JavaScript" in pdf_file_writer._root_object["/Names"] ), "add_js should add a JavaScript name tree under the name catalog." def test_added_js(pdf_file_writer): def get_javascript_name() -> Any: assert "/Names" in pdf_file_writer._root_object assert "/JavaScript" in pdf_file_writer._root_object["/Names"] assert "/Names" in pdf_file_writer._root_object["/Names"]["/JavaScript"] return pdf_file_writer._root_object["/Names"]["/JavaScript"]["/Names"][ -2 ] # return -2 in order to get the latest javascript pdf_file_writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") first_js = get_javascript_name() pdf_file_writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") second_js = get_javascript_name() assert ( first_js != second_js ), "add_js should add to the previous script in the catalog." 
================================================ FILE: tests/test_merger.py ================================================
"""Test merging PDF functionality."""
from io import BytesIO
from pathlib import Path

import pytest

import pypdf
from pypdf import PdfReader, PdfWriter
from pypdf.generic import ArrayObject, Destination, DictionaryObject, Fit, NameObject, NullObject

from . import RESOURCE_ROOT, get_data_from_url
from .test_encryption import HAS_AES


def merger_operate(merger):
    """
    Run a fixed sequence of merge operations against ``merger``.

    Appends and merges several resource PDFs (given as path, reader and file
    handle), checks that raw bytes are rejected, adds outline items with the
    different ``Fit`` variants, and finally sets metadata, a named
    destination, page layout and page mode. The statement order matters: the
    outline positions asserted here and in ``check_outline`` depend on it.
    """
    pdf_path = RESOURCE_ROOT / "crazyones.pdf"
    outline = RESOURCE_ROOT / "pdflatex-outline.pdf"
    pdf_forms = RESOURCE_ROOT / "pdflatex-forms.pdf"
    pdf_pw = RESOURCE_ROOT / "libreoffice-writer-password.pdf"

    merger.append(pdf_path)
    merger.append(outline)
    # An empty page range: nothing is selected from this file.
    merger.append(pdf_path, pages=pypdf.pagerange.PageRange(slice(0, 0)))
    merger.append(pdf_forms)
    merger.merge(0, pdf_path, import_outline=False)
    # Raw bytes are not an accepted input type.
    with pytest.raises(NotImplementedError) as exc:
        with open(pdf_path, "rb") as fp:
            data = fp.read()
            merger.append(data)
    assert exc.value.args[0].startswith(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path"
    )

    # Merging an encrypted file
    reader = pypdf.PdfReader(pdf_pw)
    reader.decrypt("openpassword")
    merger.append(reader)

    # PdfReader object:
    r = pypdf.PdfReader(pdf_path)
    merger.append(r, outline_item="foo", pages=list(range(len(r.pages))))

    # File handle
    with open(pdf_path, "rb") as fh:
        merger.append(fh)

    # to force to build outlines and ensure the add_outline_item is
    # at end of the list
    merger.write(BytesIO())
    outline_item = merger.add_outline_item("An outline item", 0)
    oi2 = merger.add_outline_item(
        "deeper", 0, parent=outline_item, italic=True, bold=True
    )
    merger.add_outline_item(
        "Let's see", 2, oi2, (255, 255, 0), True, True, Fit.fit_box_vertically(left=12)
    )
    # One child outline item per remaining Fit variant.
    merger.add_outline_item(
        "The XYZ fit",
        0,
        outline_item,
        (255, 0, 15),
        True,
        True,
        Fit.xyz(left=10, top=20, zoom=3),
    )
    merger.add_outline_item(
        "The FitH fit",
        0,
        outline_item,
        (255, 0, 15),
        True,
        True,
        Fit.fit_horizontally(top=10),
    )
    merger.add_outline_item(
        "The FitV fit",
        0,
        outline_item,
        (255, 0, 15),
        True,
        True,
        Fit.fit_vertically(left=10),
    )
    merger.add_outline_item(
        "The FitR fit",
        0,
        outline_item,
        (255, 0, 15),
        True,
        True,
        Fit.fit_rectangle(left=10, bottom=20, right=30, top=40),
    )
    merger.add_outline_item(
        "The FitB fit", 0, outline_item, (255, 0, 15), True, True, Fit.fit_box()
    )
    merger.add_outline_item(
        "The FitBH fit",
        0,
        outline_item,
        (255, 0, 15),
        True,
        True,
        Fit.fit_box_horizontally(top=10),
    )
    merger.add_outline_item(
        "The FitBV fit",
        0,
        outline_item,
        (255, 0, 15),
        True,
        True,
        Fit.fit_box_vertically(left=10),
    )

    found_oi = merger.find_outline_item("nothing here")
    assert found_oi is None
    # "foo" is the outline item created for the PdfReader append above.
    found_oi = merger.find_outline_item("foo")
    assert found_oi == [9]

    merger.add_metadata({"/Author": "Martin Thoma"})
    merger.add_named_destination("/Title", 0)
    merger.set_page_layout("/SinglePage")
    merger.page_mode = "/UseThumbs"


def check_outline(tmp_path):
    """Read the PDF at ``tmp_path`` and assert its top-level outline titles."""
    # Check if outline is correct
    reader = pypdf.PdfReader(tmp_path)
    # Only Destination entries are compared; any other outline entries are skipped.
    assert [el.title for el in reader.outline if isinstance(el, Destination)] == [
        "Foo",
        "Bar",
        "Baz",
        "Foo",
        "Bar",
        "Baz",
        "Foo",
        "Bar",
        "Baz",
        "foo",
        "An outline item",  # this has been moved to the end; is that normal?
    ]

    # TODO: There seem to be no destinations for those links?


# File name used for the merged output inside pytest's tmp_path.
tmp_filename = "dont_commit_merged.pdf"


def test_merger_operations_by_traditional_usage_with_writer(tmp_path):
    """Create the writer, write to a path and close it explicitly."""
    # Arrange
    merger = PdfWriter()
    merger_operate(merger)
    path = tmp_path / tmp_filename

    # Act
    merger.write(path)
    merger.close()

    # Assert
    check_outline(path)


def test_merger_operations_by_semi_traditional_usage_with_writer(tmp_path):
    """Use the writer as a context manager but call ``write`` explicitly."""
    path = tmp_path / tmp_filename

    with PdfWriter() as merger:
        merger_operate(merger)
        merger.write(path)  # Act

    # Assert
    assert Path(path).is_file()
    check_outline(path)


def test_merger_operation_by_new_usage_with_writer(tmp_path):
    """Pass ``fileobj`` to the writer and make no explicit ``write`` call."""
    path = tmp_path / tmp_filename
    with PdfWriter(fileobj=path) as merger:
        merger_operate(merger)

    # Assert
    assert Path(path).is_file()
    check_outline(path)


def test_merge_page_exception_with_writer():
    """A string ``pages`` argument is rejected with a TypeError."""
    merger = pypdf.PdfWriter()
    pdf_path = RESOURCE_ROOT / "crazyones.pdf"
    with pytest.raises(TypeError) as exc:
        merger.merge(0, pdf_path, pages="a:b")
    assert (
        exc.value.args[0]
        == '"pages" must be a tuple of (start, stop[, step]) or a list'
    )
    merger.close()


def test_merge_page_tuple_with_writer():
    """A ``(start, stop)`` tuple is accepted for ``pages``."""
    merger = pypdf.PdfWriter()
    pdf_path = RESOURCE_ROOT / "crazyones.pdf"
    merger.merge(0, pdf_path, pages=(0, 1))
    merger.close()


def test_merge_write_closed_fh_with_writer(pdf_file_path):
    """The writer can still be written to and modified after ``close()``."""
    merger = pypdf.PdfWriter()
    pdf_path = RESOURCE_ROOT / "crazyones.pdf"
    merger.append(pdf_path)

    merger.close()
    # Everything below runs on the already closed writer.
    merger.write(pdf_file_path)
    merger.add_metadata({"author": "Martin Thoma"})
    merger.set_page_layout("/SinglePage")
    merger.page_mode = "/UseNone"
    merger.add_outline_item("An outline item", 0)


@pytest.mark.enable_socket def test_trim_outline_list_with_writer(pdf_file_path): url = "https://github.com/user-attachments/files/18381771/tika-995175.pdf" name = "tika-995175.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
@pytest.mark.enable_socket
@pytest.mark.slow
def test1_with_writer(pdf_file_path):
    """Append tika-923621.pdf to a writer and write it out; passes if nothing raises."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18381696/tika-923621.pdf",
        name="tika-923621.pdf",
    )
    source = PdfReader(BytesIO(pdf_bytes))

    writer = PdfWriter()
    writer.append(source)
    writer.write(pdf_file_path)
    writer.close()
@pytest.mark.enable_socket
def test_articles_with_writer(caplog):
    """Append tika-924666.pdf with ``(2, 10)`` as second argument and check the threads."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18381699/tika-924666.pdf",
        name="924666.pdf",
    )
    writer = PdfWriter()
    writer.append(PdfReader(BytesIO(pdf_bytes)), (2, 10))

    # Round-trip through an in-memory buffer and inspect the written document.
    buffer = BytesIO()
    writer.write(buffer)
    result = PdfReader(buffer)

    assert len(result.threads) == 4
    first_thread = result.threads[0].get_object()
    assert first_thread["/F"]["/P"] == result.pages[0]
@pytest.mark.enable_socket
def test_direct_link_preserved_reordering(pdf_file_path):
    """A direct link still points at its target page after another page is inserted."""
    # Base document: any PDF would do here.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # This PDF has a direct link from its page 1 to its page 2.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    for linked_page in linked.pages:
        writer.add_page(linked_page)

    # Insert one more page at index 3 to shift the page order.
    writer.insert_page(base.pages[0], 3)
    writer.write(pdf_file_path)

    written = PdfReader(pdf_file_path)
    link_annotation = written.pages[2]["/Annots"][0].get_object()
    assert link_annotation["/Subtype"] == "/Link"

    # First element of /Dest: indirect reference of the page referred to.
    target_ref = link_annotation["/Dest"][0]
    shifted_page = written.flattened_pages[4]  # it moved one out
    assert (
        target_ref == shifted_page.indirect_reference
    ), "Link from page 3 to page 5 is broken"
def test_merge__null_destination():
    """Tests for issue #3444."""
    source_path = RESOURCE_ROOT / "crazyones.pdf"
    writer = PdfWriter(clone_from=source_path)
    writer2 = PdfWriter(clone_from=source_path)

    # Build a /GoTo link annotation whose /D entry is whatever
    # ``writer._add_object`` returns for a NullObject. Note that the object is
    # registered in ``writer`` while the annotation is attached to ``writer2``.
    link = DictionaryObject()
    link[NameObject("/Subtype")] = NameObject("/Link")
    action = DictionaryObject()
    link[NameObject("/A")] = action
    action[NameObject("/S")] = NameObject("/GoTo")
    action[NameObject("/D")] = writer._add_object(NullObject())
    first_page = writer2.pages[0]
    first_page[NameObject("/Annots")] = ArrayObject([link])

    # Serialize ``writer2`` and merge the result into ``writer``.
    serialized = BytesIO()
    writer2.write(serialized)
    serialized.seek(0)
    writer.merge(position=1, fileobj=serialized)

    assert writer.pages[0].annotations is None
def get_all_sample_files():
    """
    Load the metadata file of the sample-files collection.

    Returns:
        The decoded content of ``SAMPLE_ROOT / "files.json"``, or
        ``{"data": []}`` when that file does not exist, so that code iterating
        over ``["data"]`` simply sees no entries.

    """
    meta_file = Path(SAMPLE_ROOT / "files.json")
    if not meta_file.is_file():
        return {"data": []}
    # Read as UTF-8 explicitly instead of relying on the locale's default
    # encoding, which differs between platforms.
    with meta_file.open(encoding="utf-8") as fp:
        return json.load(fp)
""" if pdf_path.startswith("http"): pdf_path = BytesIO(get_data_from_url(pdf_path, pdf_path.split("/")[-1])) else: pdf_path = RESOURCE_ROOT / pdf_path reader = PdfReader(pdf_path) writer = PdfWriter() if password: reader.decrypt(password) writer.clone_document_from_reader(reader) page: PageObject = writer.pages[0] t = Transformation().translate(50, 100).rotate(90) assert abs(t.ctm[4] + 100) < 0.01 assert abs(t.ctm[5] - 50) < 0.01 transformation = ( Transformation() .rotate(90) .scale(1) .translate(1, 1) .transform(Transformation((1, 0, 0, -1, 0, 0))) ) page.add_transformation(transformation, expand=True) page.add_transformation((1, 0, 0, 0, 0, 0)) page.scale(2, 2) page.scale_by(0.5) page.scale_to(100, 100) page.compress_content_streams() page.extract_text() page.scale_by(0.5) page.scale_to(100, 100) page.extract_text() @pytest.mark.parametrize( ("angle", "expected_width", "expected_height"), [ (175, 680, 844), (45, 994, 994), (-80, 888, 742), ], ) def test_mediabox_expansion_after_rotation( angle: float, expected_width: int, expected_height: int ): """ Mediabox dimensions after rotation at a non-right angle with expansion are correct. 
def test_transformation_equivalence():
    """Merging a transformed page gives the same page in both call styles."""
    writer_base = PdfWriter(clone_from=RESOURCE_ROOT / "labeled-edges-center-image.pdf")
    base_page = writer_base.pages[0]

    writer_add = PdfWriter(clone_from=RESOURCE_ROOT / "box.pdf")
    box_page = writer_add.pages[0]

    op = Transformation().scale(2).rotate(45)

    # Option 1: The new way
    box_new = deepcopy(box_page)
    merged_new = deepcopy(base_page)
    box_new.add_transformation(op, expand=True)
    merged_new.merge_page(box_new, expand=False)

    # Option 2: The old way
    box_old = deepcopy(box_page)
    merged_old = deepcopy(base_page)
    merged_old.merge_transformed_page(box_old, op, expand=False)
    box_old.add_transformation(op)
    merged_old.merge_page(box_old)

    # Should be the same
    contents_key = NameObject(PG.CONTENTS)
    assert merged_new[contents_key] == merged_old[contents_key]
    assert merged_new.mediabox == merged_old.mediabox
    assert merged_new.trimbox == merged_old.trimbox
    annots_key = NameObject(PG.ANNOTS)
    assert merged_new.get(annots_key) == merged_old.get(annots_key)
    resources_key = NameObject(PG.RESOURCES)
    compare_dict_objects(merged_new[resources_key], merged_old[resources_key])
def compare_dict_objects(d1, d2):
    """
    Assert that two (possibly nested) dictionaries are equal.

    Both arguments must have the same set of keys. A value of ``d1`` that is a
    ``DictionaryObject`` is compared recursively with its counterpart in
    ``d2``; every other value has to compare equal to its counterpart.
    """
    keys_left = sorted(d1.keys())
    keys_right = sorted(d2.keys())
    assert keys_left == keys_right

    for key in d1:
        # Index lookups (rather than ``items()``) are kept on purpose so the
        # values are obtained exactly the way callers would read them.
        if not isinstance(d1[key], DictionaryObject):
            assert d1[key] == d2[key]
            continue
        compare_dict_objects(d1[key], d2[key])
def test_page_properties():
    """All five page boxes of crazyones.pdf match, and ``bleedbox`` is assignable."""
    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    page = reader.pages[0]

    # Checked in the same order as the boxes are listed here.
    for box_name in ("mediabox", "cropbox", "bleedbox", "trimbox", "artbox"):
        assert getattr(page, box_name) == RectangleObject((0, 0, 612, 792))

    replacement = (0, 1, 100, 101)
    page.bleedbox = RectangleObject(replacement)
    assert page.bleedbox == RectangleObject(replacement)
@pytest.mark.enable_socket
@pytest.mark.slow
@pytest.mark.parametrize(
    ("url", "name"),
    [
        # keyerror_potentially_empty_page
        # (this case used to be listed twice; the exact duplicate was dropped)
        (
            "https://github.com/user-attachments/files/18381736/tika-964029.pdf",
            "tika-964029.pdf",
        ),
        # 1140 / 1141:
        (
            "https://github.com/user-attachments/files/18381702/tika-932446.pdf",
            "tika-932446.pdf",
        ),
        # iss 1134:
        (
            "https://github.com/py-pdf/pypdf/files/9150656/ST.2019.PDF",
            "iss_1134.pdf",
        ),
        # iss 1:
        (
            "https://github.com/py-pdf/pypdf/files/9432350/Work.Flow.From.Check.to.QA.pdf",
            "WFCA.pdf",
        ),
        # single_quote_op
        # The name must differ from "tika-964029.pdf" above: other tests call
        # get_data_from_url(name=...) without a URL, so the name presumably
        # identifies the stored file and two URLs must not share one
        # (NOTE(review): confirm against get_data_from_url).
        (
            "https://github.com/py-pdf/pypdf/files/9428434/TelemetryTX_EM.pdf",
            "TelemetryTX_EM.pdf",
        ),
        # no_resources
        (
            # https://www.itu.int/rec/T-REC-X.25-199610-I/en
            "https://github.com/py-pdf/pypdf/files/12423313/T-REC-X.25-199610-I.PDF-E.pdf",
            "T-REC-X.25-199610-I!!PDF-E.pdf",
        ),
    ],
)
def test_extract_text(url, name):
    """Extract the text of every page; the test passes if nothing raises."""
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    for page in reader.pages:
        page.extract_text()
reader = PdfReader(RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf") page_lrs_model = reader.pages[6] # We ignore the invisible large rectangles. def ignore_large_rectangles(r) -> bool: return r.w < 400 and r.h < 400 (texts, rectangles) = extract_text_and_rectangles( page_lrs_model, rect_filter=ignore_large_rectangles ) # We see ten rectangles (5 tabs, 5 boxes) but there are 64 rectangles # (including some invisible ones). assert len(rectangles) == 60 rectangle2texts = {} for t in texts: for r in rectangles: if r.contains(t.x, t.y): texts = rectangle2texts.setdefault(r, []) texts.append(t.text.strip()) break # Five boxes and the figure-description below. assert len(rectangle2texts) == 6 box_texts = [" ".join(texts) for texts in rectangle2texts.values()] assert "Hydro Network" in box_texts assert "Hydro Events" in box_texts assert "Metadata" in box_texts assert "Hydrography" in box_texts assert "Toponymy (external model)" in box_texts # Test 2: Parse table "REVISION HISTORY" on page 3. page_revisions = reader.pages[2] # We ignore the second table, therefore: r.y > 350 def filter_first_table(r) -> bool: return r.w > 1 and r.h > 1 and r.w < 400 and r.h < 400 and r.y > 350 (texts, rectangles) = extract_text_and_rectangles( page_revisions, rect_filter=filter_first_table ) rows = extract_table(texts, rectangles) assert len(rows) == 9 assert extract_cell_text(rows[0][0]) == "Date" assert extract_cell_text(rows[0][1]) == "Version" assert extract_cell_text(rows[0][2]) == "Description" assert extract_cell_text(rows[1][0]) == "September 2002" # The line break between "English review;" # and "Remove" is not detected. assert ( extract_cell_text(rows[6][2]) == "English review;Remove the UML model for the Segmented view." ) assert extract_cell_text(rows[7][2]) == "Update from the March Workshop comments." # Check the fonts. We check: /F2 9.96 Tf [...] 
[(Dat)-2(e)] TJ text_dat_of_date = rows[0][0][0] assert text_dat_of_date.font_dict is not None assert text_dat_of_date.font_dict["/Name"] == "/F2" assert text_dat_of_date.get_base_font() == "/Arial,Bold" assert text_dat_of_date.font_dict["/Encoding"] == "/WinAnsiEncoding" assert text_dat_of_date.font_size == 9.96 # Check: /F1 9.96 Tf [...] [(S)4(ep)4(t)-10(em)-20(be)4(r)-3( 20)4(02)] TJ texts = rows[1][0][0] assert texts.font_dict is not None assert texts.font_dict["/Name"] == "/F1" assert texts.get_base_font() == "/Arial" assert texts.font_dict["/Encoding"] == "/WinAnsiEncoding" assert text_dat_of_date.font_size == 9.96 # Test 3: Read a table in a document using a non-translating # but scaling Tm-operand reader = PdfReader(RESOURCE_ROOT / "Sample_Td-matrix.pdf") page_td_model = reader.pages[0] # We store the translations of the Td-executions. list_td = [] def visitor_td(op, args, cm, tm) -> None: if op == b"Td": list_td.append((tm[4], tm[5])) page_td_model.extract_text(visitor_operand_after=visitor_td) assert len(list_td) == 4 # Check the translations of the four Td-executions. 
@pytest.mark.enable_socket
def test_get_fonts2():
    """Check the fonts reported by ``_get_fonts`` for pages 2 and 3 of WS_T.483.8-2016.pdf."""
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/12618104/WS_T.483.8-2016.pdf",
        name="WS_T.483.8-2016.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))

    # First tuple element: expected font names; second element: an empty set.
    expected_second_page = {
        "/E-HZ9-PK7483a5-Identity-H",
        "/SSJ-PK748200005d9-Identity-H",
        "/QGNGZS+FzBookMaker1DlFont10536872415",
        "/E-BZ9-PK748344-Identity-H",
        "/E-FZ9-PK74836f-Identity-H",
        "/O9-PK748464-Identity-H",
        "/QGNGZR+FzBookMaker0DlFont00536872414",
        "/SSJ-PK748200005db-Identity-H",
        "/F-BZ9-PK7483cb-Identity-H",
        "/SSJ-PK748200005da-Identity-H",
        "/H-SS9-PK748200005e0-Identity-H",
        "/H-HT9-PK748200005e1-Identity-H",
    }
    assert reader.pages[1]._get_fonts() == (expected_second_page, set())

    expected_third_page = {
        "/E-HZ9-PK7483a5-Identity-H",
        "/E-FZ9-PK74836f-Identity-H",
        "/E-BZ9-PK748344-Identity-H",
        "/QGNGZT+FzBookMaker0DlFont00536872418",
        "/O9-PK748464-Identity-H",
        "/F-BZ9-PK7483cb-Identity-H",
        "/H-SS9-PK748200005e0-Identity-H",
        "/QGNGZU+FzBookMaker1DlFont10536872420",
        "/H-HT9-PK748200005e1-Identity-H",
    }
    assert reader.pages[2]._get_fonts() == (expected_third_page, set())
@pytest.mark.samples
@pytest.mark.enable_socket
def test_old_habibi():
    """
    Extract Latin and Arabic text from the habibi sample file.

    The input is read from ``SAMPLE_ROOT``, so the test carries the
    ``samples`` marker like the other tests that read from ``SAMPLE_ROOT``.
    """
    # this habibi has multiple characters associated with the h
    reader = PdfReader(SAMPLE_ROOT / "015-arabic/habibi.pdf")
    txt = reader.pages[0].extract_text()  # very odd file
    # extract from acrobat reader "حَبيبي habibi􀀃􀏲􀎒􀏴􀎒􀎣􀋴
    assert "habibi" in txt
    assert "حَبيبي" in txt
"/Border": ArrayObject([0, 0, 0]), "/Rect": [ 92.043, 771.389, 217.757, 785.189, ], } assert set(expected.keys()) == set(annot.keys()) del expected["/Rect"] del annot["/Rect"] assert annot == expected @pytest.mark.enable_socket def test_no_resources(): url = "https://github.com/py-pdf/pypdf/files/9572045/108.pdf" name = "108.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) page_one = writer.pages[0] page_two = writer.pages[0] page_one.merge_page(page_two) def test_merge_page_reproducible_with_proc_set(): page1 = PageObject.create_blank_page(width=100, height=100) page2 = PageObject.create_blank_page(width=100, height=100) ordered = sorted(NameObject(f"/{x}") for x in range(20)) shuffled = list(ordered) shuffle(shuffled) # each page has some overlap in their /ProcSet, and they're in a weird order page1[NameObject("/Resources")][NameObject("/ProcSet")] = ArrayObject(shuffled[:15]) page2[NameObject("/Resources")][NameObject("/ProcSet")] = ArrayObject(shuffled[5:]) page1.merge_page(page2) assert page1[NameObject("/Resources")][NameObject("/ProcSet")] == ordered @pytest.mark.parametrize( ("apage1", "apage2", "expected_result", "expected_renames"), [ # simple cases: pytest.param({}, {}, {}, {}, id="no resources"), pytest.param( {"/1": "/v1"}, {"/2": "/v2"}, {"/1": "/v1", "/2": "/v2"}, {}, id="no overlap", ), pytest.param( {"/x": "/v"}, {"/x": "/v"}, {"/x": "/v"}, {}, id="overlap, matching values" ), pytest.param( {"/x": "/v1"}, {"/x": "/v2"}, {"/x": "/v1", "/x-0": "/v2"}, {"/x": "/x-0"}, id="overlap, different values", ), # carefully crafted names that match the renaming pattern: pytest.param( {"/x": "/v1", "/x-0": "/v1", "/x-1": "/v1"}, {"/x": "/v2"}, { "/x": "/v1", "/x-0": "/v1", "/x-1": "/v1", "/x-2": "/v2", }, {"/x": "/x-2"}, id="crafted, different values", ), pytest.param( {"/x": "/v1", "/x-0": "/v1", "/x-1": "/v"}, {"/x": "/v"}, {"/x": "/v1", "/x-0": "/v1", "/x-1": "/v"}, {"/x": "/x-1"}, id="crafted, matching value in chain", ), 
def test_merge_page_resources_smoke_test():
    """Merge two blank pages with overlapping /Properties and check the renaming."""
    # Arrange
    page1 = PageObject.create_blank_page(width=100, height=100)
    page2 = PageObject.create_blank_page(width=100, height=100)

    nm = NameObject

    # set up some dummy resources that overlap (or not) between the two pages
    # (note, all the edge cases are tested in test_merge_resources)
    props1 = DictionaryObject(
        {
            nm("/just1"): nm("/just1-value"),
            nm("/overlap-matching"): nm("/overlap-matching-value"),
            nm("/overlap-different"): nm("/overlap-different-value1"),
        }
    )
    page1[nm("/Resources")][nm("/Properties")] = props1
    props2 = DictionaryObject(
        {
            nm("/just2"): nm("/just2-value"),
            nm("/overlap-matching"): nm("/overlap-matching-value"),
            nm("/overlap-different"): nm("/overlap-different-value2"),
        }
    )
    page2[nm("/Resources")][nm("/Properties")] = props2

    # use these keys for some "operations", to validate renaming
    # (the operand name doesn't matter)
    contents1 = ContentStream(None, None)
    page1[nm("/Contents")] = contents1
    contents1.operations = [(ArrayObject(props1.keys()), b"page1-contents")]
    contents2 = ContentStream(None, None)
    page2[nm("/Contents")] = contents2
    contents2.operations = [(ArrayObject(props2.keys()), b"page2-contents")]

    expected_properties = {
        "/just1": "/just1-value",
        "/just2": "/just2-value",
        "/overlap-matching": "/overlap-matching-value",
        "/overlap-different": "/overlap-different-value1",
        "/overlap-different-0": "/overlap-different-value2",
    }
    # Built before the merge on purpose: the key snapshot of ``props1`` must
    # be taken while it is still untouched.
    renamed_operands = ArrayObject(
        [
            nm("/just2"),
            nm("/overlap-matching"),
            nm("/overlap-different-0"),
        ]
    )
    expected_operations = [
        # no renaming
        (ArrayObject(props1.keys()), b"page1-contents"),
        # some renaming
        (renamed_operands, b"page2-contents"),
    ]

    # Act
    page1.merge_page(page2)

    # Assert
    assert page1[nm("/Resources")][nm("/Properties")] == expected_properties

    markers = (b"page1-contents", b"page2-contents")
    relevant_operations = []
    for operands, operator in page1.get_contents().operations:
        if operator in markers:
            relevant_operations.append((operands, operator))
    assert relevant_operations == expected_operations
page is a clone assert inserted_blank.page_number == len(writer.pages) - 1 writer.remove_page(inserted_blank.indirect_reference) assert inserted_blank.page_number is None inserted_blank = writer.add_page(blank) del writer._pages.get_object()["/Kids"][-1] assert inserted_blank.page_number is not None def test_pages_printing(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) assert str(reader.pages) == "[PageObject(0)]" assert len(reader.pages[0].images) == 0 with pytest.raises(KeyError): reader.pages[0].images["~1~"] @pytest.mark.enable_socket def test_del_pages(): url = "https://github.com/user-attachments/files/18381712/tika-941536.pdf" name = "tika-941536.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) ll = len(writer.pages) pp = writer.pages[1].indirect_reference del writer.pages[1] assert len(writer.pages) == ll - 1 pages = writer._pages.get_object() assert pages["/Count"] == ll - 1 assert len(pages["/Kids"]) == ll - 1 assert pp not in pages["/Kids"] del writer.pages[-2] with pytest.raises(TypeError): del writer.pages["aa"] with pytest.raises(IndexError): del writer.pages[9999] pp = tuple(p.indirect_reference for p in writer.pages[3:5]) ll = len(writer.pages) del writer.pages[3:5] assert len(writer.pages) == ll - 2 for p in pp: assert p not in pages["/Kids"] # del whole arborescence reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # error case pp = reader.pages[2] i = pp["/Parent"].get_object()["/Kids"].index(pp.indirect_reference) del pp["/Parent"].get_object()["/Kids"][i] with pytest.raises(PdfReadError): del reader.pages[2] url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf" name = "iss2343b.pdf" writer = PdfWriter(BytesIO(get_data_from_url(url, name=name)), incremental=True) node, idx = writer._get_page_in_node(53) assert (node.indirect_reference.idnum, idx) == (11776, 1) node, idx = writer._get_page_in_node(10000) assert (node.indirect_reference.idnum, idx) == (11769, -1) 
with pytest.raises(PyPdfError): writer._get_page_in_node(-1) del writer.pages[4] # to propagate among /Pages del writer.pages[:] assert len(writer.pages) == 0 assert len(writer.root_object["/Pages"]["/Kids"]) == 0 assert len(writer.flattened_pages) == 0 def test_pdf_pages_missing_type(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) del reader.trailer["/Root"]["/Pages"]["/Kids"][0].get_object()["/Type"] reader.pages[0] writer = PdfWriter(clone_from=reader) writer.pages[0] @pytest.mark.enable_socket def test_merge_with_stream_wrapped_in_save_restore(): """Test for issue #2587""" url = "https://github.com/py-pdf/pypdf/files/14895914/blank_portrait.pdf" name = "blank_portrait.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) page_one = writer.pages[0] assert page_one.get_contents().get_data() == b"q Q" page_two = writer.pages[0] page_one.merge_page(page_two) assert b"QQ" not in page_one.get_contents().get_data() @pytest.mark.samples def test_compression(): """Test for issue #1897""" def create_stamp_pdf() -> BytesIO: pytest.importorskip("fpdf") from fpdf import FPDF # noqa: PLC0415 pdf = FPDF() pdf.add_page() pdf.set_font("helvetica", "B", 16) pdf.cell(40, 10, "Hello World!") byte_string = pdf.output() return BytesIO(byte_string) template = PdfReader(create_stamp_pdf()) template_page = template.pages[0] writer = PdfWriter() writer.append(SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf", [1]) nb1 = len(writer._objects) # 1 page only is modified for page in writer.pages: page.merge_page(template_page) # font is added; +1 streamobjects + 1 ArrayObject assert len(writer._objects) == nb1 + 1 + 2 for page in writer.pages: page.compress_content_streams() # objects are recycled assert len(writer._objects) == nb1 + 1 + 2 contents = writer.pages[0]["/Contents"] writer.pages[0].replace_contents(None) writer.pages[0].replace_contents(None) assert isinstance( writer._objects[contents.indirect_reference.idnum - 1], NullObject ) 
def test_merge_with_no_resources(): """Test for issue #2147""" writer = PdfWriter() p0 = writer.add_blank_page(900, 1200) del p0["/Resources"] p1 = writer.add_blank_page(900, 1200) del p1["/Resources"] writer.pages[0].merge_page(p1) def test_get_contents_from_nullobject(): """Issue #2157""" writer = PdfWriter() page1 = writer.add_blank_page(100, 100) page1[NameObject("/Contents")] = writer._add_object(NullObject()) assert page1.get_contents() is None page2 = writer.add_blank_page(100, 100) page1.merge_page(page2, over=True) @pytest.mark.enable_socket def test_pos_text_in_textvisitor(): """See #2200""" url = "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf" name = "test_text_pos.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) p = () def visitor_body2(text, cm, tm, fontdict, fontsize) -> None: nonlocal p if text.startswith("5425."): p = (tm[4], tm[5]) reader.pages[0].extract_text(visitor_text=visitor_body2) assert abs(p[0] - 323.5) < 0.1 assert abs(p[1] - 457.4) < 0.1 @pytest.mark.enable_socket def test_pos_text_in_textvisitor2(): """See #2075""" url = "https://github.com/py-pdf/pypdf/files/12318042/LegIndex-page6.pdf" name = "LegIndex-page6.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) x_lvl = 26 lst = [] def visitor_lvl(text, cm, tm, fontdict, fontsize) -> None: nonlocal x_lvl, lst if abs(tm[4] - x_lvl) < 2 and tm[5] < 740 and tm[5] > 210: lst.append(text.strip(" \n")) reader.pages[0].extract_text(visitor_text=visitor_lvl) assert lst == [ "ACUPUNCTURE BOARD", "ACUPUNCTURISTS AND ACUPUNCTURE", "ADMINISTRATIVE LAW AND PROCEDURE", "ADMINISTRATIVE LAW, OFFICE OF", "ADOPTION", "ADULT EDUCATION", "ADVERTISING. 
See also MARKETING; and particular subject matter (e.g.,", ] x_lvl = 35 lst = [] reader.pages[0].extract_text(visitor_text=visitor_lvl) assert lst == [ "members, AB 1264", "assistants, acupuncture, AB 1264", "complaints, investigations, etc., AB 1264", "day, california acupuncture, HR 48", "massage services, asian, AB 1264", "supervising acupuncturists, AB 1264", "supportive acupuncture services, basic, AB 1264", "rules and regulations—", "professional assistants and employees: employment and compensation, AB 916", "adults, adoption of, AB 1756", "agencies, organizations, etc.: requirements, prohibitions, etc., SB 807", "assistance programs, adoption: nonminor dependents, SB 9", "birth certificates, AB 1302", "contact agreements, postadoption—", "facilitators, adoption, AB 120", "failed adoptions: reproductive loss leave, SB 848", "hearings, adoption finalization: remote proceedings, technology, etc., SB 21", "native american tribes, AB 120", "parental rights, reinstatement of, AB 20", "parents, prospective adoptive: criminal background checks, SB 824", "services, adult educational, SB 877", "week, adult education, ACR 31", "alcoholic beverages: tied-house restrictions, AB 546", "campaign re social equity, civil rights, etc., SB 447", "cannabis, AB 794", "elections. 
See ELECTIONS.", "false, misleading, etc., advertising—", "hotels, short-term rentals, etc., advertised rates: mandatory fee disclosures, SB 683", "housing rental properties advertised rates: disclosures, SB 611", ] @pytest.mark.enable_socket def test_missing_basefont_in_type3(): """Cf #2289""" url = "https://github.com/py-pdf/pypdf/files/13307713/missing-base-font.pdf" name = "missing-base-font.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0]._get_fonts() def test_invalid_index(): src_abs = RESOURCE_ROOT / "git.pdf" reader = PdfReader(src_abs) with pytest.raises(TypeError): _ = reader.pages["0"] def test_negative_index(): src_abs = RESOURCE_ROOT / "git.pdf" reader = PdfReader(src_abs) assert reader.pages[0] == reader.pages[-1] def test_get_contents_as_bytes(): writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") co = writer.pages[0]["/Contents"][0] expected = co.get_data() assert writer.pages[0]._get_contents_as_bytes() == expected writer.pages[0][NameObject("/Contents")] = writer.pages[0]["/Contents"][0] assert writer.pages[0]._get_contents_as_bytes() == expected del writer.pages[0]["/Contents"] assert writer.pages[0]._get_contents_as_bytes() is None def test_recursive_get_page_from_node(): writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True) writer.root_object["/Pages"].get_object()[ NameObject("/Parent") ] = writer.root_object["/Pages"].indirect_reference with pytest.raises(PyPdfError): writer.add_page(writer.pages[0]) writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True) writer.insert_page(writer.pages[0], -1) with pytest.raises(ValueError): writer.insert_page(writer.pages[0], -10) def test_get_contents__none_type(): # We can observe this in reality as well, but these documents might be # confidential. Thus use a more complex dummy implementation here while # assigning a value of `None` is not possible from code, but from PDFs # itself. 
class MyPage(PageObject): def __contains__(self, item) -> bool: assert item == "/Contents" return True def __getitem__(self, item) -> Any: assert item == "/Contents" page = MyPage() assert page.get_contents() is None def test_extract_text__none_type(): class MyPage(PageObject): def __getitem__(self, item) -> Any: if item == "/Contents": return None return super().__getitem__(item) page = MyPage() resources = DictionaryObject() none_reference = IndirectObject(1, 0, None) resources[NameObject("/Font")] = none_reference page[NameObject("/Resources")] = resources with mock.patch.object(none_reference, "get_object", return_value=None): assert page.extract_text() == "" @pytest.mark.enable_socket def test_scale_by(): """Tests for #3487""" url = "https://github.com/user-attachments/files/22685841/input.pdf" name = "issue3487.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) original_box = RectangleObject((0, 0, 595.275604, 841.88974)) expected_box = RectangleObject((0.0, 0.0, 297.637802, 420.94487)) for page in writer.pages: assert page.artbox == original_box assert page.bleedbox == original_box assert page.cropbox == original_box assert page.mediabox == original_box assert page.trimbox == original_box page.scale_by(0.5) assert page.artbox == expected_box assert page.bleedbox == expected_box assert page.cropbox == expected_box assert page.mediabox == expected_box assert page.trimbox == expected_box @pytest.mark.enable_socket @pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript") def test_box_rendering(tmp_path): """Tests for issue #3487.""" url = "https://github.com/user-attachments/files/22685841/input.pdf" name = "issue3487.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) for page in writer.pages: page.scale_by(0.5) target_png_path = tmp_path / "target.png" url = "https://github.com/user-attachments/assets/e9c2271c-bfc3-4a6f-8c91-ffefa24502e2" name = "issue3487.png" 
target_png_path.write_bytes(get_data_from_url(url, name=name)) pdf_path = tmp_path / "out.pdf" writer.write(pdf_path) for box in ["Art", "Bleed", "Crop", "Media", "Trim"]: png_path = tmp_path / f"{box}.png" # False positive: https://github.com/PyCQA/bandit/issues/333 subprocess.run( # noqa: S603 [ GHOSTSCRIPT_BINARY, f"-dUse{box}Box", "-dFirstPage=1", "-dLastPage=1", "-sDEVICE=pngalpha", "-o", png_path, pdf_path, ] ) assert png_path.is_file(), box assert image_similarity(png_path, target_png_path) >= 0.95, box def test_delete_non_existent_annotations(): writer = PdfWriter() writer.add_blank_page(width=100, height=100) page = writer.pages[0] assert page.annotations is None page.annotations = None assert page.annotations is None def test_replace_contents_on_reader(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) page = reader.pages[0] content_stream = ContentStream(stream=None, pdf=reader) content_stream.set_data(b"Test data") expected_message = ( "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated and " "will be removed in pypdf 7.0.0. Attach the page to the writer first or use `PdfWriter(clone_from=...)` " "directly. The existing approach has proved being unreliable." ) with pytest.warns(DeprecationWarning, match=rf"^{re.escape(expected_message)}$"): page.replace_contents(content_stream) @pytest.mark.enable_socket @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_replace_contents_on_reader__indirect_reference(): url = "https://github.com/user-attachments/files/24195534/test.pdf" name = "issue3568.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() lhs = reader.get_page(3) writer.add_page(lhs) lhs = reader.get_page(1) lhs.merge_page(PageObject.create_blank_page(reader)) writer.add_page(lhs) def test_merge_page__coverage(): # Test with some otherwise untested cases. # Own resources are missing. 
page = PageObject.create_blank_page(width=10, height=10) del page[PageAttributes.RESOURCES] page.merge_page(PageObject.create_blank_page(width=10, height=10)) # Other resources are missing. page = PageObject.create_blank_page(width=10, height=10) del page[PageAttributes.RESOURCES] PageObject.create_blank_page(width=10, height=10).merge_page(page) # No expansion. page = PageObject.create_blank_page(width=10, height=10) page.merge_page(PageObject.create_blank_page(width=20, height=30)) assert page.mediabox == RectangleObject((0.0, 0.0, 10, 10)) # With expansion. page = PageObject.create_blank_page(width=10, height=10) page.merge_page(PageObject.create_blank_page(width=20, height=5), expand=True) assert page.mediabox == RectangleObject((0.0, 0.0, 20, 10)) # With transformation. path = RESOURCE_ROOT / "crazyones.pdf" page = PdfWriter(clone_from=path).pages[0] page.indirect_reference = None page2 = PageObject.create_blank_page(width=20, height=5) transformation = Transformation().rotate(90) page2.merge_transformed_page(page, ctm=transformation, expand=True) assert page2.mediabox == RectangleObject((-792, 0.0, 20, 612)) page2 = PageObject.create_blank_page(width=20, height=5) page2.merge_transformed_page(page, ctm=transformation.ctm, expand=True) assert page2.mediabox == RectangleObject((-792, 0.0, 20, 612)) # Not over. page = PdfWriter(clone_from=path).pages[0] page.indirect_reference = None page2 = PageObject.create_blank_page(width=20, height=5) page2.merge_page(page, over=False) @pytest.mark.enable_socket def test_importing_without_pillow(tmp_path): env = os.environ.copy() env["COVERAGE_PROCESS_START"] = "pyproject.toml" source_file = tmp_path / "script.py" source_file.write_text( """ import sys sys.modules["PIL"] = None from pypdf import PageObject from pypdf._page import pil_not_imported print(pil_not_imported) """ ) try: env["PYTHONPATH"] = "." + os.pathsep + env["PYTHONPATH"] except KeyError: env["PYTHONPATH"] = "." 
result = subprocess.run( # noqa: S603 # We have the control here. [sys.executable, source_file], capture_output=True, env=env, ) assert result.returncode == 0 assert result.stdout.replace(b"\r\n", b"\n") == b"True\n" assert result.stderr == b"" @pytest.mark.enable_socket def test_replace_contents__null_object_cloning_error(): url = "https://github.com/user-attachments/files/25240822/ML-4.30.24.pdf" name = "issue3632.pdf" reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name))) writer = PdfWriter() for page in reader.pages: new_page = writer.add_page(page) new_page.scale_by(1) page4_idnum = writer.pages[3].indirect_reference.idnum assert isinstance(writer.get_object(page4_idnum)["/Contents"], ContentStream) assert isinstance(writer.get_object(page4_idnum + 1), NullObject) data = BytesIO() writer.write(data) reader = PdfReader(data) assert len(reader.pages) == 10 def test_get_rectangle__size_handling(caplog): """ See issue #2991 and related ones. We would previously generate invalid page boxes when they were part of the `/Pages` instead of the `/Page` due to re-using the same target object, while appending to the existing "full" object. To keep compatibility with our old code, allow these boxes to have more than four entries. 
""" reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") page = reader.pages[0] assert page.mediabox == RectangleObject((0, 0, 612, 792)) assert caplog.messages == [] reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") page = reader.pages[0] page[NameObject("/MediaBox")] = ArrayObject([0, 0, 13, 37, 0, 0, 13, 37]) assert page.mediabox == RectangleObject((0, 0, 13, 37)) assert "Expected four values, got 8: [0, 0, 13, 37, 0, 0, 13, 37]\n" in caplog.text ================================================ FILE: tests/test_page_labels.py ================================================ """Test the pypdf._page_labels module.""" from io import BytesIO import pytest from pypdf import PdfReader from pypdf._page_labels import ( get_label_from_nums, index2label, number2lowercase_letter, number2lowercase_roman_numeral, number2uppercase_letter, number2uppercase_roman_numeral, nums_clear_range, nums_insert, nums_next, ) from pypdf.generic import ( ArrayObject, DictionaryObject, NameObject, NullObject, NumberObject, ) from . 
import RESOURCE_ROOT, get_data_from_url @pytest.mark.parametrize( ("number", "expected"), [ (1, "I"), (2, "II"), (3, "III"), (4, "IV"), (5, "V"), (6, "VI"), (7, "VII"), (8, "VIII"), (9, "IX"), (10, "X"), ], ) def test_number2uppercase_roman_numeral(number, expected): assert number2uppercase_roman_numeral(number) == expected def test_number2lowercase_roman_numeral(): assert number2lowercase_roman_numeral(123) == "cxxiii" @pytest.mark.parametrize( ("number", "expected"), [ (1, "a"), (2, "b"), (3, "c"), (25, "y"), (26, "z"), (27, "aa"), (28, "ab"), ], ) def test_number2lowercase_letter(number, expected): assert number2lowercase_letter(number) == expected def test_number2uppercase_letter(): with pytest.raises(ValueError): number2uppercase_letter(-1) @pytest.mark.enable_socket def test_index2label(caplog): name = "waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf" r = PdfReader(BytesIO(get_data_from_url(name=name))) assert index2label(r, 1) == "ii" assert index2label(r, 9) == "6" # very silly data to get test cover r.trailer["/Root"]["/PageLabels"]["/Nums"].append(8) r.trailer["/Root"]["/PageLabels"]["/Nums"].append(NullObject()) assert index2label(r, 9) == "10" with pytest.raises(ValueError): nums_clear_range( NumberObject(10), 8, r.trailer["/Root"]["/PageLabels"]["/Nums"] ) r.trailer["/Root"]["/PageLabels"]["/Nums"].append(8) with pytest.raises(ValueError): nums_next(NumberObject(10), r.trailer["/Root"]["/PageLabels"]["/Nums"]) with pytest.raises(ValueError): nums_clear_range( NumberObject(10), 8, r.trailer["/Root"]["/PageLabels"]["/Nums"] ) with pytest.raises(ValueError): nums_insert( NumberObject(10), DictionaryObject(), r.trailer["/Root"]["/PageLabels"]["/Nums"], ) del r.trailer["/Root"]["/PageLabels"]["/Nums"] assert index2label(r, 1) == "2" caplog.clear() r.trailer["/Root"]["/PageLabels"][NameObject("/Kids")] = NullObject() assert index2label(r, 1) == "2" assert caplog.text != "" @pytest.mark.enable_socket def test_index2label_kids(): url = 
"https://github.com/py-pdf/pypdf/files/14858124/Terminologie_Epochen.Schwerpunkte.Umsetzungen.pdf" r = PdfReader(BytesIO(get_data_from_url(url=url, name="index2label_kids.pdf"))) expected = [ "C1", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", *list(map(str, range(1, 284))) ] for x in ["20", "44", "58", "82", "94", "116", "154", "166", "192", "224", "250"]: # Some page labels are unused. Removing them is still easier than copying the # whole list itself here. expected.remove(x) assert r.page_labels == expected @pytest.mark.enable_socket def test_index2label_kids__recursive(caplog): url = "https://github.com/py-pdf/pypdf/files/14842446/tt1.pdf" r = PdfReader( BytesIO(get_data_from_url(url=url, name="index2label_kids_recursive.pdf")) ) expected = [ "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "17", "18", "19", ] assert r.page_labels == expected assert caplog.text != "" def test_get_label_from_nums__empty_nums_list(): dictionary_object = DictionaryObject() dictionary_object[NameObject("/Nums")] = ArrayObject() assert get_label_from_nums(dictionary_object, 13) == "14" def test_index2label__empty_kids_list(): reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") number_tree = DictionaryObject() number_tree[NameObject("/Kids")] = ArrayObject() root = reader.root_object root[NameObject("/PageLabels")] = number_tree assert index2label(reader, 42) == "43" ================================================ FILE: tests/test_pagerange.py ================================================ """Test the pypdf.pagerange module.""" import pytest from pypdf.pagerange import PageRange, ParseError, parse_filename_page_ranges def test_equality(): pr1 = PageRange(slice(0, 5)) pr2 = PageRange(slice(0, 5)) assert pr1 == pr2 def test_hash(): pr1 = PageRange(slice(0, 5)) pr2 = PageRange(slice(0, 5)) pr3 = PageRange(slice(10, 11)) pr4 = PageRange(slice(10, 11, 1)) assert hash(pr1) == hash(pr2) assert 
hash(pr1) != hash(pr3) # Consider this different for now, although slicing with step size of 1 and `None` should be identical. assert hash(pr3) != hash(pr4) @pytest.mark.parametrize( ("page_range", "expected"), [(slice(0, 5), "0:5"), (slice(0, 5, 2), "0:5:2"), ("-1", "-1:"), ("0", "0")], ) def test_str(page_range, expected): assert str(PageRange(page_range)) == expected @pytest.mark.parametrize( ("page_range", "expected"), [(slice(0, 5), "PageRange('0:5')"), (slice(0, 5, 2), "PageRange('0:5:2')")], ) def test_repr(page_range, expected): assert repr(PageRange(page_range)) == expected def test_equality_other_objectc(): pr1 = PageRange(slice(0, 5)) pr2 = "PageRange(slice(0, 5))" assert pr1 != pr2 def test_idempotency(): pr = PageRange(slice(0, 5)) pr2 = PageRange(pr) assert pr == pr2 @pytest.mark.parametrize( ("range_str", "expected"), [ ("42", slice(42, 43)), ("1:2", slice(1, 2)), ], ) def test_str_init(range_str, expected): pr = PageRange(range_str) assert pr._slice == expected assert PageRange.valid def test_str_init_error(): init_str = "1-2" assert PageRange.valid(init_str) is False with pytest.raises(ParseError) as exc: PageRange(init_str) assert exc.value.args[0] == "1-2" @pytest.mark.parametrize( ("params", "expected"), [ (["foo.pdf", "1:5"], [("foo.pdf", PageRange("1:5"))]), ( ["foo.pdf", "1:5", "bar.pdf"], [("foo.pdf", PageRange("1:5")), ("bar.pdf", PageRange(":"))], ), ], ) def test_parse_filename_page_ranges(params, expected): assert parse_filename_page_ranges(params) == expected def test_parse_filename_page_ranges_err(): with pytest.raises(ValueError) as exc: parse_filename_page_ranges(["1:5", "foo.pdf"]) assert ( exc.value.args[0] == "The first argument must be a filename, not a page range." 
) @pytest.mark.parametrize( ("a", "b", "expected"), [ (PageRange(slice(0, 5)), PageRange(slice(2, 10)), slice(0, 10)), (PageRange(slice(0, 5)), PageRange(slice(2, 3)), slice(0, 5)), (PageRange(slice(0, 5)), PageRange(slice(5, 10)), slice(0, 10)), ], ) def test_addition(a, b, expected): pr1 = PageRange(a) pr2 = PageRange(b) assert pr1 + pr2 == PageRange(expected) assert pr2 + pr1 == PageRange(expected) # addition is commutative @pytest.mark.parametrize( ("a", "b"), [ (PageRange(slice(0, 5)), PageRange(slice(7, 10))), (PageRange(slice(7, 10)), PageRange(slice(0, 5))), ], ) def test_addition_gap(a: PageRange, b: PageRange): with pytest.raises(ValueError) as exc: a + b assert exc.value.args[0] == "Can't add PageRanges with gap" def test_addition_non_page_range(): with pytest.raises(TypeError) as exc: PageRange(slice(0, 5)) + "2:7" assert exc.value.args[0] == "Can't add PageRange and " def test_addition_stride(): a = PageRange(slice(0, 5, 2)) b = PageRange(slice(7, 9)) with pytest.raises(ValueError) as exc: a + b assert exc.value.args[0] == "Can't add PageRange with stride" ================================================ FILE: tests/test_papersizes.py ================================================ """Test the pypdf.papersizes module.""" import pytest from pypdf import papersizes def test_din_a0_paper_size(): """The dimensions and area of the DIN A0 paper size are correct.""" dim = papersizes.PaperSize.A0 area_square_pixels = float(dim.width) * dim.height # 72 pixels is 1 inch area_square_inch = area_square_pixels / 72**2 # 25.4 millimeter is equal to 1 inches area_square_mm = area_square_inch * (25.4) ** 2 assert abs(area_square_mm - 999949) < 100 conversion_factor = 72 / 25.4 assert (dim.width - 841 * conversion_factor) < 1 assert (dim.width - 1189 * conversion_factor) < 1 @pytest.mark.parametrize("dimensions", papersizes._din_a) def test_din_a_aspect_ratio(dimensions): """The aspect ratio of DIN A paper sizes is correct.""" assert abs(dimensions.height - 
dimensions.width * 2**0.5) <= 2.5 @pytest.mark.parametrize( ("dimensions_a", "dimensions_b"), list(zip(papersizes._din_a, papersizes._din_a[1:])), ) def test_din_a_size_doubling(dimensions_a, dimensions_b): """The height of a DIN A paper size doubles when moving to the next size.""" assert abs(dimensions_a.height - 2 * dimensions_b.width) <= 4 ================================================ FILE: tests/test_pdfa.py ================================================ """Ensure that pypdf doesn't break PDF/A compliance.""" from io import BytesIO from pathlib import Path from typing import Optional import pytest from pypdf import PdfReader, PdfWriter from tests import SAMPLE_ROOT def is_pdfa1b_compliant(src: BytesIO): """Check if a PDF is PDF/A-1b compliant.""" def document_information_has_analogous_xml(src: BytesIO) -> bool: reader = PdfReader(src) meta = reader.metadata xmp = reader.xmp_metadata if not meta: return True if not xmp: return False if meta.title and not xmp.dc_title: return meta.title == xmp.dc_title return True return document_information_has_analogous_xml(src) @pytest.mark.samples @pytest.mark.parametrize( ("src", "diagnostic_write_name"), [ (SAMPLE_ROOT / "021-pdfa/crazyones-pdfa.pdf", None), ], ) def test_pdfa(src: Path, diagnostic_write_name: Optional[str]): with open(src, "rb") as fp: data = BytesIO(fp.read()) reader = PdfReader(src) assert is_pdfa1b_compliant(data) writer = PdfWriter() writer.clone_document_from_reader(reader) stream = BytesIO() writer.write(stream) stream.seek(0) assert is_pdfa1b_compliant(stream) if diagnostic_write_name: with open(diagnostic_write_name, "wb") as fp: stream.seek(0) fp.write(stream.read()) ================================================ FILE: tests/test_protocols.py ================================================ """Test the pypdf._protocols module.""" from pypdf._protocols import PdfObjectProtocol class IPdfObjectProtocol(PdfObjectProtocol): pass def test_pdfobjectprotocol(): o = IPdfObjectProtocol() assert 
o.clone(None, False, None) is None assert o._reference_clone(None, None) is None assert o.get_object() is None assert o.hash_value() is None assert o.write_to_stream(None) is None ================================================ FILE: tests/test_reader.py ================================================ """Test the pypdf._reader module.""" import io import sys import time from io import BytesIO from pathlib import Path from typing import Union import pytest from pypdf import PdfReader, PdfWriter from pypdf._crypt_providers import crypt_provider from pypdf._reader import convert_to_int from pypdf.constants import ImageAttributes as IA from pypdf.constants import PageAttributes as PG from pypdf.constants import UserAccessPermissions as UAP from pypdf.errors import ( DeprecationError, EmptyFileError, FileNotDecryptedError, LimitReachedError, PdfReadError, PdfStreamError, WrongPasswordError, ) from pypdf.generic import ( ArrayObject, Destination, DictionaryObject, IndirectObject, NameObject, NumberObject, TextStringObject, ) from . 
import RESOURCE_ROOT, SAMPLE_ROOT, get_data_from_url, normalize_warnings HAS_AES = crypt_provider[0] in ["pycryptodome", "cryptography"] NestedList = Union[int, None, list["NestedList"]] @pytest.mark.parametrize( ("src", "num_pages"), [("selenium-pypdf-issue-177.pdf", 1), ("pdflatex-outline.pdf", 4)], ) def test_get_num_pages(src, num_pages): src = RESOURCE_ROOT / src with PdfReader(src) as reader: assert len(reader.pages) == num_pages # from #1911 assert "/Size" in reader.trailer @pytest.mark.parametrize( ("pdf_path", "expected"), [ ( RESOURCE_ROOT / "crazyones.pdf", { "/CreationDate": "D:20150604133406-06'00'", "/Creator": " XeTeX output 2015.06.04:1334", "/Producer": "xdvipdfmx (20140317)", }, ), ( RESOURCE_ROOT / "metadata.pdf", { "/CreationDate": "D:20220415093243+02'00'", "/ModDate": "D:20220415093243+02'00'", "/Creator": "pdflatex, or other tool", "/Producer": "Latex with hyperref, or other system", "/Author": "Martin Thoma", "/Keywords": "Some Keywords, other keywords; more keywords", "/Subject": "The Subject", "/Title": "The Title", "/Trapped": "/False", "/PTEX.Fullbanner": ( "This is pdfTeX, Version " "3.141592653-2.6-1.40.23 (TeX Live 2021) " "kpathsea version 6.3.3" ), }, ), ], ids=["crazyones", "metadata"], ) def test_read_metadata(pdf_path, expected): with open(pdf_path, "rb") as inputfile: reader = PdfReader(inputfile) docinfo = reader.metadata assert docinfo is not None metadict = dict(docinfo) assert metadict == expected docinfo.title docinfo.title_raw docinfo.author docinfo.author_raw docinfo.creator docinfo.creator_raw docinfo.producer docinfo.producer_raw docinfo.subject docinfo.subject_raw docinfo.creation_date docinfo.creation_date_raw docinfo.modification_date docinfo.modification_date_raw docinfo.keywords docinfo.keywords_raw if "/Title" in metadict: assert isinstance(docinfo.title, str) assert metadict["/Title"] == docinfo.title def test_read_metadata_title_is_utf8(): with open(RESOURCE_ROOT / "bytes.pdf", "rb") as inputfile: reader = 
PdfReader(inputfile) title = reader.metadata.title # Should be a str. assert title == "Microsoft Word - トランスバース社買収電話会議英語Final.docx" def test_iss1943(): with PdfReader(RESOURCE_ROOT / "crazyones.pdf") as reader: docinfo = reader.metadata docinfo.update( { NameObject("/CreationDate"): TextStringObject( "D:20230705005151Z00'00'" ), NameObject("/ModDate"): TextStringObject("D:20230705005151Z00'00'"), } ) docinfo.creation_date docinfo.creation_date_raw docinfo.modification_date docinfo.modification_date_raw docinfo.update({NameObject("/CreationDate"): NumberObject(1)}) assert docinfo.creation_date is None @pytest.mark.samples @pytest.mark.parametrize( "pdf_path", [SAMPLE_ROOT / "017-unreadable-meta-data/unreadablemetadata.pdf"] ) def test_broken_meta_data(pdf_path): with open(pdf_path, "rb") as f: reader = PdfReader(f) assert reader.metadata is None with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as f: b = f.read(-1) reader = PdfReader(BytesIO(b.replace(b"/Info 2 0 R", b"/Info 2 "))) with pytest.raises(PdfReadError) as exc: reader.metadata assert "does not point to a document information dictionary" in repr(exc) @pytest.mark.parametrize( "src", [ RESOURCE_ROOT / "crazyones.pdf", RESOURCE_ROOT / "commented.pdf", ], ) def test_get_annotations(src): with PdfReader(src) as reader: for page in reader.pages: if PG.ANNOTS in page: for annot in page[PG.ANNOTS]: subtype = annot.get_object()[IA.SUBTYPE] if subtype == "/Text": annot.get_object()[PG.CONTENTS] @pytest.mark.parametrize( ("src", "nb_attachments"), [ (RESOURCE_ROOT / "attachment.pdf", 1), (RESOURCE_ROOT / "crazyones.pdf", 0), ], ) def test_get_attachments(src, nb_attachments): reader = PdfReader(src) attachments = {} for page in reader.pages: if PG.ANNOTS in page: for annotation in page[PG.ANNOTS]: annotobj = annotation.get_object() if annotobj[IA.SUBTYPE] == "/FileAttachment": fileobj = annotobj["/FS"] attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].get_data() assert len(attachments) == nb_attachments 
@pytest.mark.samples
@pytest.mark.parametrize(
    ("src", "expected_images"),
    [
        ("pdflatex-outline.pdf", []),
        ("crazyones.pdf", []),
        ("git.pdf", ["Image9.png"]),
        pytest.param(
            "imagemagick-lzw.pdf",
            ["Im0.png"],
            marks=pytest.mark.xfail(reason="broken image extraction"),
        ),
        ("imagemagick-ASCII85Decode.pdf", ["Im0.png"]),
        ("imagemagick-CCITTFaxDecode.pdf", ["Im0.tiff"]),
        (SAMPLE_ROOT / "019-grayscale-image/grayscale-image.pdf", ["X0.png"]),
    ],
)
def test_get_images(src, expected_images):
    """
    Check the names and formats of the images found on the first page.

    Each extracted image must carry the expected name, and the extension of
    that name must match the format Pillow detects in the image data.
    """
    from PIL import Image  # noqa: PLC0415

    # ``src`` is either a name relative to RESOURCE_ROOT or an already
    # complete path built from SAMPLE_ROOT.
    src_abs = RESOURCE_ROOT / src
    reader = PdfReader(src_abs)

    page = reader.pages[0]

    images_extracted = page.images
    assert len(images_extracted) == len(expected_images)
    for image, expected_image in zip(images_extracted, expected_images):
        assert image.name == expected_image
        # Use the context manager so the decoded image is released again.
        with Image.open(io.BytesIO(image.data)) as pil_image:
            assert image.name.split(".")[-1].upper() == pil_image.format
" "ID numbers for objects will be corrected.", ], ), # all nominal => no fail (True, True, -1, True, ""), # Prev=0 => fail expected ( False, False, -1, False, [ "startxref on same line as offset", ], ), ( False, True, -1, False, [ "startxref on same line as offset", "/Prev=0 in the trailer - assuming there is no previous xref table", ], ), # Prev =0 => no strict so tolerant (True, False, 0, True, ""), # error on startxref, in strict => fail expected (True, True, 0, True, ""), ( False, False, 0, False, [ "startxref on same line as offset", "incorrect startxref pointer(1)", "parsing for Object Streams", ], ), # error on startxref, but no strict => xref rebuilt,no fail ( False, True, 0, False, [ "startxref on same line as offset", "incorrect startxref pointer(1)", "parsing for Object Streams", ], ), ], ) def test_get_images_raw( caplog, strict, with_prev_0, startx_correction, should_fail, warning_msgs ): pdf_data = ( b"%%PDF-1.7\n" b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n" b"2 0 obj << >> endobj\n" b"3 0 obj << >> endobj\n" b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]" b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R" b" /Resources << /Font << >> >>" b" /Rotate 0 /Type /Page >> endobj\n" b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n" b"xref 1 5\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"trailer << %s/Root 5 0 R /Size 6 >>\n" b"startxref %d\n" b"%%%%EOF" ) pdf_data = pdf_data % ( # - 1 below in the find because of the double % pdf_data.find(b"1 0 obj") - 1, pdf_data.find(b"2 0 obj") - 1, pdf_data.find(b"3 0 obj") - 1, pdf_data.find(b"4 0 obj") - 1, pdf_data.find(b"5 0 obj") - 1, b"/Prev 0 " if with_prev_0 else b"", # startx_correction should be -1 due to double % at the beginning # inducing an error on startxref computation pdf_data.find(b"xref") + startx_correction, ) pdf_stream = io.BytesIO(pdf_data) if should_fail: with pytest.raises(PdfReadError) as exc: 
def test_issue297(caplog):
    """Strict parsing rejects the file; non-strict parsing only logs warnings."""
    pdf_path = RESOURCE_ROOT / "issue-297.pdf"

    with pytest.raises(PdfReadError) as exc:
        PdfReader(pdf_path, strict=True)
    assert caplog.text == ""
    assert "Broken xref table" in exc.value.args[0]

    expected_warnings = [
        "incorrect startxref pointer(1)",
        "parsing for Object Streams",
    ]
    reader = PdfReader(pdf_path, strict=False)
    assert normalize_warnings(caplog.text) == expected_warnings
    # The first page must be accessible after the tolerant read.
    reader.pages[0]
@pytest.mark.parametrize(
    ("src", "expected", "expected_get_fields"),
    [
        (
            "form.pdf",
            {"foo": ""},
            {"foo": {"/DV": "", "/FT": "/Tx", "/T": "foo", "/V": ""}},
        ),
        (
            "form_acrobatReader.pdf",
            {"foo": "Bar"},
            {"foo": {"/DV": "", "/FT": "/Tx", "/T": "foo", "/V": "Bar"}},
        ),
        (
            "form_evince.pdf",
            {"foo": "bar"},
            {"foo": {"/DV": "", "/FT": "/Tx", "/T": "foo", "/V": "bar"}},
        ),
        (
            "crazyones.pdf",
            {},
            None,
        ),
    ],
)
def test_get_form(src, expected, expected_get_fields, txt_file_path):
    """Check if we can read out form data."""
    reader = PdfReader(RESOURCE_ROOT / src)
    assert reader.get_form_text_fields() == expected

    # get_fields() additionally receives a text file object via ``fileobj``.
    with open(txt_file_path, "w") as report:
        fields = reader.get_fields(fileobj=report)

    assert fields == expected_get_fields

    # Just access the attributes; a document without fields yields None.
    attribute_names = (
        "field_type",
        "parent",
        "kids",
        "name",
        "alternate_name",
        "mapping_name",
        "flags",
        "value",
        "default_value",
        "additional_actions",
    )
    for field in (fields or {}).values():
        for attribute_name in attribute_names:
            getattr(field, attribute_name)
def test_read_malformed_header(caplog):
    """A stream not starting with ``%PDF-`` fails when strict, warns otherwise."""
    garbage = b"foo"

    with pytest.raises(PdfReadError) as exc:
        PdfReader(io.BytesIO(garbage), strict=True)
    assert exc.value.args[0] == "PDF starts with 'foo', but '%PDF-' expected"

    caplog.clear()
    # Whatever the tolerant read raises afterwards is ignored on purpose;
    # only the first logged message is checked.
    try:
        PdfReader(io.BytesIO(garbage), strict=False)
    except Exception:
        pass
    first_message = caplog.messages[0]
    assert first_message.startswith("invalid pdf header")
def test_circular_xref_prev_reference(caplog):
    """Circular /Prev in trailer should be detected, not loop forever (#3654)."""
    template = (
        b"%%PDF-1.7\n"
        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
        b"xref 1 5\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"trailer << /Prev %d /Root 5 0 R /Size 6 >>\n"
        b"startxref %d\n"
        b"%%%%EOF"
    )
    # Offsets are looked up in the unformatted template, as before.
    object_offsets = tuple(
        template.find(b"%d 0 obj" % number) for number in range(1, 6)
    )
    xref_offset = template.find(b"xref") - 1
    # /Prev and startxref carry the same offset, which makes the chain circular.
    pdf_data = template % (*object_offsets, xref_offset, xref_offset)

    PdfReader(io.BytesIO(pdf_data))
    assert "Circular xref chain detected" in caplog.text
b"startxref %d\n" b"%%%%EOF" ) pdf_data = pdf_data % ( pdf_data.find(b"1 0 obj"), pdf_data.find(b"2 0 obj"), pdf_data.find(b"3 0 obj"), pdf_data.find(b"4 0 obj"), pdf_data.find(b"5 0 obj"), # Removed for this test: pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) with pytest.raises(PdfReadError) as exc: PdfReader(pdf_stream, strict=True) assert exc.value.args[0] == "startxref not found" def test_read_unknown_zero_pages(caplog): pdf_data = ( b"%%PDF-1.7\n" b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n" b"2 0 obj << >> endobj\n" b"3 0 obj << >> endobj\n" b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]" b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R" b" /Resources << /Font << >> >>" b" /Rotate 0 /Type /Page >> endobj\n" # Pages 0 0 is the key point: b"5 0 obj << /Pages 0 0 R /Type /Catalog >> endobj\n" b"xref 1 5\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"trailer << /Root 5 1 R /Size 6 >>\n" b"startxref %d\n" b"%%%%EOF" ) pdf_data = pdf_data % ( pdf_data.find(b"1 0 obj") - 1, pdf_data.find(b"2 0 obj") - 1, pdf_data.find(b"3 0 obj") - 1, pdf_data.find(b"4 0 obj") - 1, pdf_data.find(b"5 0 obj") - 1, pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) reader = PdfReader(pdf_stream, strict=True) warnings = [ "startxref on same line as offset", "Xref table not zero-indexed. ID numbers for objects will be corrected.", ] assert normalize_warnings(caplog.text) == warnings with pytest.raises(PdfReadError) as exc: len(reader.pages) assert exc.value.args[0] == "Could not find object." 
def test_do_not_get_stuck_on_large_files_without_start_xref():
    """
    Tests for the absence of a DoS bug, where a large file without a
    startxref mark would cause the library to hang for minutes to hours.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments happening while the test runs.
    start_time = time.perf_counter()
    broken_stream = BytesIO(b"\0" * 5 * 1000 * 1000)
    with pytest.raises(PdfReadError):
        PdfReader(broken_stream)
    parse_duration = time.perf_counter() - start_time
    # parsing is expected to take less than a second on a modern cpu, but
    # include a large tolerance to account for busy or slow systems
    assert parse_duration < 60
@pytest.mark.parametrize(
    "strict",
    [True, False],
)
def test_issue604(caplog, strict):
    """
    Test with invalid destinations.

    In strict mode reading the outline must raise; otherwise a warning is
    logged and every outline entry can still be mapped to a page number.
    """
    with open(RESOURCE_ROOT / "issue-604.pdf", "rb") as f:
        pdf = PdfReader(f, strict=strict)
        if strict:
            with pytest.raises(PdfReadError) as exc:
                pdf.outline
            # A plain assert reports the actual message on failure instead of
            # raising a generic Exception.
            assert "Unknown Destination" in exc.value.args[0]
            return  # outline is not correct

        outline = pdf.outline
        msg = [
            "Unknown destination: 'ms_Thyroid_2_2020_071520_watermarked.pdf' [0, 1]"
        ]
        assert normalize_warnings(caplog.text) == msg

        def get_dest_pages(x) -> NestedList:
            """Return 1-based page numbers, mirroring the outline nesting."""
            if isinstance(x, list):
                return [get_dest_pages(y) for y in x]
            destination_page_number = pdf.get_destination_page_number(x)
            if destination_page_number is None:
                return destination_page_number
            return destination_page_number + 1

        # oi can be a destination or a list; the values are not compared,
        # resolving them only has to succeed.
        for oi in outline:
            get_dest_pages(oi)
" r"Use user_access_permissions instead" ), ): assert reader.decode_permissions(4) == print_ modify = base.copy() modify["modify"] = True with pytest.raises( DeprecationError, match=( r"decode_permissions is deprecated and was removed in pypdf 5\.0\.0\. " r"Use user_access_permissions instead" ), ): assert reader.decode_permissions(8) == modify @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") def test_user_access_permissions(): # Not encrypted. reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") assert reader.user_access_permissions is None # Encrypted. reader = PdfReader(RESOURCE_ROOT / "encryption" / "r6-owner-password.pdf") assert reader.user_access_permissions == UAP.all() # Custom writer permissions. writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf") writer.encrypt( user_password="", owner_password="abc", permissions_flag=UAP.PRINT | UAP.FILL_FORM_FIELDS, ) output = BytesIO() writer.write(output) reader = PdfReader(output) assert reader.user_access_permissions == (UAP.PRINT | UAP.FILL_FORM_FIELDS) # All writer permissions. 
def test_pages_attribute():
    """Check slicing and out-of-range indexing of ``PdfReader.pages``."""
    pdf_path = RESOURCE_ROOT / "crazyones.pdf"
    reader = PdfReader(pdf_path)

    # Test if getting as slice throws an error
    assert len(reader.pages[:]) == 1

    with pytest.raises(IndexError) as exc:
        reader.pages[-1000]
    assert exc.value.args[0] == "Sequence index out of range"

    # Bind the exception info again; without ``as exc`` the assertion below
    # would re-check the exception from the negative-index case above.
    with pytest.raises(IndexError) as exc:
        reader.pages[1000]
    assert exc.value.args[0] == "Sequence index out of range"
@pytest.mark.enable_socket
def test_extract_text_xref_issue_2(caplog):
    """Extract text from every page and compare the logged warnings."""
    # pdf/0264cf510015b2a4b395a15cb23c001e.pdf
    url = "https://github.com/user-attachments/files/18381758/tika-981961.pdf"
    pdf_bytes = get_data_from_url(url, name="tika-981961.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))
    for page in reader.pages:
        page.extract_text()
    expected_warnings = [
        "incorrect startxref pointer(2)",
        "parsing for Object Streams",
    ]
    assert normalize_warnings(caplog.text) == expected_warnings
@pytest.mark.enable_socket
def test_get_full_qualified_fields():
    """Compare field keys with and without full qualification."""
    url = "https://github.com/py-pdf/pypdf/files/10142389/fields_with_dots.pdf"
    name = "fields_with_dots.pdf"
    pdf_bytes = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes))

    # Text fields with the flag set: the dotted key is present.
    qualified_text = reader.get_form_text_fields(True)
    assert qualified_text is not None
    assert "customer.name" in qualified_text

    # Text fields with the flag cleared: only the short key is present.
    partial_text = reader.get_form_text_fields(False)
    assert partial_text is not None
    assert "customer.name" not in partial_text
    assert "name" in partial_text

    # get_fields(True): dotted key, while /T still holds the short name.
    all_fields = reader.get_fields(True)
    assert all_fields is not None
    assert "customer.name" in all_fields
    assert all_fields["customer.name"]["/T"] == "name"
@pytest.mark.enable_socket
def test_xfa_non_empty():
    """Check the XFA keys, and their order, of a sample that carries XFA data."""
    url = "https://github.com/user-attachments/files/18381713/tika-942050.pdf"
    name = "tika-942050.pdf"
    pdf_bytes = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes))
    expected_keys = [
        "preamble",
        "config",
        "template",
        "PDFSecurity",
        "datasets",
        "postamble",
    ]
    assert list(reader.xfa.keys()) == expected_keys
def get_outline_property(outline, attribute_name: str):
    """
    Collect one attribute from every destination of a nested outline.

    Args:
        outline: A list whose items are ``Destination`` objects or further
            lists of the same kind.
        attribute_name: Name of the attribute read from each destination.

    Returns:
        A list with the same nesting as ``outline`` that holds the attribute
        values in place of the destinations.

    Raises:
        ValueError: If ``outline``, or any nested item that is not a
            ``Destination``, is not a list.
    """
    if not isinstance(outline, list):
        raise ValueError(f"got {type(outline)}")
    return [
        getattr(entry, attribute_name)
        if isinstance(entry, Destination)
        else get_outline_property(entry, attribute_name)
        for entry in outline
    ]
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name"),
    [
        # 1st case : the named_dest are stored directly as a dictionary, PDF 1.1 style
        (
            "https://github.com/py-pdf/pypdf/files/9197028/lorem_ipsum.pdf",
            "lorem_ipsum.pdf",
        ),
        # 2nd case : Dest below names and with Kids...
        (
            "https://github.com/py-pdf/pypdf/files/11714214/PDF32000_2008.pdf",
            "PDF32000_2008.pdf",
        ),
        # 3rd case : Dests with Name tree (TODO: Add this case)
    ],
    ids=["stored_directly", "dest_below_names_with_kids"],
)
def test_named_destination(url, name):
    """Each sample must expose at least one named destination."""
    pdf_bytes = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes))
    assert len(reader.named_destinations) > 0
@pytest.mark.enable_socket
def test_corrupted_xref_table():
    """Text extraction must work for both BreezeManual samples."""
    # issue #1292
    samples = (
        (
            "https://github.com/py-pdf/pypdf/files/9444747/BreezeManual.orig.pdf",
            "BreezeMan1.pdf",
        ),
        (
            "https://github.com/py-pdf/pypdf/files/9444748/BreezeManual.failed.pdf",
            "BreezeMan2.pdf",
        ),
    )
    for url, name in samples:
        reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
        reader.pages[0].extract_text()
@pytest.mark.enable_socket
def test_build_outline_item(caplog):
    """A numeric /Dest is dropped with a warning, or rejected when strict."""
    url = "https://github.com/py-pdf/pypdf/files/9464742/shiv_resume.pdf"
    name = "shiv_resume.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    def make_node():
        # Build a new dictionary for every call, as the original test did.
        return DictionaryObject(
            {
                NameObject("/Title"): TextStringObject("Toto"),
                NameObject("/Dest"): NumberObject(2),
            }
        )

    outline = reader._build_outline_item(make_node())
    assert "Removed unexpected destination 2 from destination" in caplog.text
    assert outline["/Title"] == "Toto"

    reader.strict = True
    with pytest.raises(PdfReadError) as exc:
        reader._build_outline_item(make_node())
    assert "Unexpected destination 2" in exc.value.args[0]
@pytest.mark.enable_socket
def test_iss1559():
    """Text extraction must not raise on any page of the sample."""
    url = "https://github.com/py-pdf/pypdf/files/10441992/default.pdf"
    name = "iss1559.pdf"
    pdf_bytes = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes))
    for page in reader.pages:
        page.extract_text()
pdf_data.find(b"4 0 obj"),
        pdf_data.find(b"5 0 obj"),
        b"/Prev 0 " if with_prev_0 else b"",
        pdf_data.find(b"xref") - 1,
    )
    PdfReader(io.BytesIO(pdf_data))


@pytest.mark.enable_socket
def test_iss1756():
    """Read the trailer ``/ID`` entry of the sample file named ``iss1756.pdf``."""
    url = "https://github.com/py-pdf/pypdf/files/11105591/641-Attachment-B-Pediatric-Cardiac-Arrest-8-1-2019.pdf"
    name = "iss1756.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    reader.trailer["/ID"]
    # removed to cope with missing cryptodome during commit check : len(reader.pages)


@pytest.mark.enable_socket
@pytest.mark.timeout(30)
def test_iss1825():
    """Text extraction from the first page must finish within the 30 s timeout."""
    url = "https://github.com/py-pdf/pypdf/files/11367871/MiFO_LFO_FEIS_NOA_Published.3.pdf"
    name = "iss1825.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    page = reader.pages[0]
    page.extract_text()


@pytest.mark.enable_socket
def test_iss2082():
    """
    The untouched sample is readable; after corrupting the first ``xref``
    keyword, constructing the reader must raise ``PdfReadError``.
    """
    url = "https://github.com/py-pdf/pypdf/files/12317939/test.pdf"
    name = "iss2082.pdf"
    b = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(b))
    reader.pages[0].extract_text()

    bb = bytearray(b)
    # Overwrite the third byte of the first "xref" occurrence ("xref" -> "xrEf").
    bb[b.find(b"xref") + 2] = ord(b"E")
    with pytest.raises(PdfReadError):
        reader = PdfReader(BytesIO(bb))


@pytest.mark.enable_socket
def test_issue_140():
    """The sample file is expected to expose 54 pages."""
    url = "https://github.com/py-pdf/pypdf/files/12168578/bad_pdf_example.pdf"
    name = "issue-140.pdf"
    b = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(b))
    assert len(reader.pages) == 54


@pytest.mark.enable_socket
def test_xyz_with_missing_param():
    """Cf #2236"""
    url = "https://github.com/py-pdf/pypdf/files/12795356/tt1.pdf"
    name = "issue2236.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert reader.outline[0]["/Left"] == 820
    assert reader.outline[0]["/Top"] == 0
    assert reader.outline[1]["/Left"] == 0
    # NOTE(review): this repeats the outline[0] "/Top" check from two lines
    # above; outline[1]["/Top"] may have been intended - confirm against the
    # sample file before changing it.
    assert reader.outline[0]["/Top"] == 0


@pytest.mark.enable_socket
def test_corrupted_xref():
    """The root object of the sample must still resolve to a ``/Catalog``."""
    url = "https://github.com/py-pdf/pypdf/files/14628314/iss2516.pdf"
    name = "iss2516.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert reader.root_object["/Type"] == "/Catalog"


@pytest.mark.enable_socket
def test_truncated_xref(caplog):
    """Opening the sample must log that the xref table is being rebuilt."""
    url = "https://github.com/py-pdf/pypdf/files/14843553/002-trivial-libre-office-writer-broken.pdf"
    name = "iss2575.pdf"
    PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert "Invalid/Truncated xref table. Rebuilding it." in caplog.text


@pytest.mark.enable_socket
def test_damaged_pdf():
    """
    Non-strict reading can count the pages; strict reading raises
    ``PdfReadError`` reporting the object ID mismatch.
    """
    url = "https://github.com/py-pdf/pypdf/files/15186107/malformed_pdf.pdf"
    name = "malformed_pdf.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False)
    len(reader.pages)
    strict_reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=True)
    with pytest.raises(PdfReadError) as exc:
        len(strict_reader.pages)
    # Checked after the block: code following the raising call inside the
    # ``with`` body would never run.
    assert (
        exc.value.args[0]
        == "Expected object ID (21 0) does not match actual (-1 -1)."
    )


@pytest.mark.enable_socket
@pytest.mark.timeout(10)
def test_looping_form(caplog):
    """Cf iss 2643"""
    url = "https://github.com/py-pdf/pypdf/files/15306053/inheritance.pdf"
    name = "iss2643.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False)
    flds = reader.get_fields()
    assert all(
        x in flds
        for x in (
            "Text10",
            "Text10.0.0.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1",
            "amt1.0",
            "amt1.1",
            "DSS#3pg3#0hgu7",
        )
    )
    writer = PdfWriter(reader)
    # Append the first kid of field 5 to its own /Kids list a second time, so
    # the same node is reachable twice when the fields are collected again.
    writer.root_object["/AcroForm"]["/Fields"][5]["/Kids"].append(
        writer.root_object["/AcroForm"]["/Fields"][5]["/Kids"][0]
    )
    flds2 = writer.get_fields()
    assert "Text68.0 already parsed" in caplog.text
    assert list(flds.keys()) == list(flds2.keys())


def test_context_manager_with_stream():
    """
    A caller-supplied stream must stay open both inside and after the
    ``with PdfReader(...)`` block.
    """
    # Template of a minimal PDF; the %010d / %d placeholders are filled below
    # with the byte offsets found in the template itself.
    pdf_data = (
        b"%%PDF-1.7\n"
        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
        b"xref 1 5\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"trailer << /Root 5 0 R /Size 6 >>\n"
        b"startxref %d\n"
        b"%%%%EOF"
    )
    pdf_data = pdf_data % (
        pdf_data.find(b"1 0 obj"),
        pdf_data.find(b"2 0 obj"),
        pdf_data.find(b"3 0 obj"),
        pdf_data.find(b"4 0 obj"),
        pdf_data.find(b"5 0 obj"),
        pdf_data.find(b"xref") - 1,
    )
    pdf_stream = io.BytesIO(pdf_data)
    with PdfReader(pdf_stream) as reader:
        assert not reader.stream.closed
    assert not pdf_stream.closed


@pytest.mark.enable_socket
@pytest.mark.timeout(10)
def test_iss2761():
    """
    Text extraction from the first page is expected to raise
    ``PdfReadError`` within the 10 s timeout.
    """
    url = "https://github.com/user-attachments/files/16312198/crash-b26d05712a29b241ac6f9dc7fff57428ba2d1a04.pdf"
    name = "iss2761.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False)
    with pytest.raises(PdfReadError):
        reader.pages[0].extract_text()


@pytest.mark.enable_socket
def test_iss2817():
    """Test for rebuilding Xref_ObjStm"""
    url = "https://github.com/user-attachments/files/16764070/crash-7e1356f1179b4198337f282304cb611aea26a199.pdf"
    name = "iss2817.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert (
        reader.pages[0]["/Annots"][0].get_object()["/Contents"]
        == "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B"
    )


@pytest.mark.enable_socket
def test_truncated_files(caplog):
    """Cf #2853"""
    url = "https://github.com/user-attachments/files/16796095/f5471sm-2.pdf"
    name = "iss2780.pdf"  # reused
    b = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(b))
    assert caplog.text == ""
    # remove \n at end of file : invisible
    reader = PdfReader(BytesIO(b[:-1]))
    assert caplog.text == ""
    # truncate but still detectable
    # (drops the last 2, 3, 4 and 5 bytes in turn)
    for i in range(-2, -6, -1):
        caplog.clear()
        reader = PdfReader(BytesIO(b[:i]))
        assert "EOF marker seems truncated" in caplog.text
        assert reader._startxref == 100993
    # remove completely EOF : we will not read last section
    caplog.clear()
    reader = PdfReader(BytesIO(b[:-6]))
    assert "CAUTION: startxref found while searching for %%EOF" in caplog.text
    assert reader._startxref < 100993
@pytest.mark.enable_socket
def test_comments_in_array(caplog):
    """
    Cf #2843: this deals with comments

    The first page must load without any log output. Once the reader's
    stream is swapped for one cut off after 1149 bytes, loading the page
    must raise ``PdfStreamError``.
    """
    url = "https://github.com/user-attachments/files/16992416/crash-2347912aa2a6f0fab5df4ebc8a424735d5d0d128.pdf"
    name = "iss2843.pdf"  # reused
    b = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(b))
    reader.pages[0]
    assert caplog.text == ""
    reader = PdfReader(BytesIO(b))
    # The reader is built from the full data first; only afterwards is its
    # stream replaced by the truncated copy.
    reader.stream = BytesIO(b[:1149])
    with pytest.raises(PdfStreamError):
        reader.pages[0]


@pytest.mark.enable_socket
def test_space_in_names_to_continue_processing(caplog):
    """
    This deals with space not encoded in names inducing errors.

    Also covers case where NameObject not met for key.
    """
    url = "https://github.com/user-attachments/files/17095516/crash-e108c4f677040b61e12fa9f1cfde025d704c9b0d.pdf"
    name = "iss2866.pdf"  # reused
    b = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(b))
    obj = reader.get_object(70)
    assert all(
        x in obj
        for x in (
            "/BaseFont",
            "/DescendantFonts",
            "/Encoding",
            "/Subtype",
            "/ToUnicode",
            "/Type",
        )
    )
    assert obj["/BaseFont"] == "/AASGAA+Arial,Unicode"  # MS is missing to meet spec
    assert 'PdfReadError("Invalid Elementary Object starting with' in caplog.text

    caplog.clear()

    # Patch nine bytes of the file in place (same length, so later offsets are
    # unchanged); "/BaseFont" is no longer among the keys checked below.
    b = b[:264] + b"(Inv) /d " + b[273:]
    reader = PdfReader(BytesIO(b))
    obj = reader.get_object(70)
    assert all(
        x in obj
        for x in ["/DescendantFonts", "/Encoding", "/Subtype", "/ToUnicode", "/Type"]
    )
    assert all(
        x in caplog.text
        for x in (
            "Expecting a NameObject for key but",
            'PdfReadError("Invalid Elementary Object starting with',
        )
    )
    # In strict mode the same patched object must raise instead of only logging.
    reader = PdfReader(BytesIO(b), strict=True)
    with pytest.raises(PdfReadError):
        obj = reader.get_object(70)


@pytest.mark.enable_socket
def test_unbalanced_brackets_in_dictionary_object(caplog):
    """Cf #2877"""
    url = "https://github.com/user-attachments/files/17162634/7f40cb209fb97d1782bffcefc5e7be40.pdf"
    name = "iss2877.pdf"  # reused
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert len(reader.pages) == 43  # note: /Count = 46 but 3 kids are
None


@pytest.mark.enable_socket
def test_repair_root(caplog):
    """Cf #2877"""
    # NOTE(review): the docstring cites #2877 while the sample is stored as
    # "iss2875.pdf" - confirm which issue number is the right one.
    url = "https://github.com/user-attachments/files/17162216/crash-6620e8b1abfe3da639b654595da859b87f985748.pdf"
    name = "iss2875.pdf"
    b = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(b))
    assert len(reader.pages) == 1
    assert all(
        msg in caplog.text
        for msg in (
            "Invalid Root object",
            'Searching object with "/Catalog" key',
            "Root found at IndirectObject(2, 0,",
        )
    )

    # no /Root Entry
    reader = PdfReader(BytesIO(b.replace(b"/Root", b"/Roo ")))
    # Cleared after constructing the reader, so only messages emitted while
    # the pages are accessed are inspected below.
    caplog.clear()
    assert len(reader.pages) == 1
    assert all(
        msg in caplog.text
        for msg in (
            'Cannot find "/Root" key in trailer',
            'Searching object with "/Catalog" key',
            "Root found at IndirectObject(2, 0,",
        )
    )

    # Invalid /Root Entry
    caplog.clear()
    reader = PdfReader(
        BytesIO(
            b.replace(b"/Root 1 0 R", b"/Root 2 0 R").replace(b"/Catalog/Pages 3 0 R", b"/Catalo ")
        )
    )
    with pytest.raises(PdfReadError):
        len(reader.pages)
    assert all(
        msg in caplog.text
        for msg in (
            "Invalid Root object in trailer",
            'Searching object with "/Catalog" key',
        )
    )

    # Invalid /Root Entry + error in get_object
    caplog.clear()
    data = b.replace(b"/Root 1 0 R", b"/Root 2 0 R").replace(b"/Catalog/Pages 3 0 R", b"/Catalo ")
    # Overwrite the single byte at offset 5124 of the already patched data.
    data = data[:5124] + b"A" + data[5125:]
    reader = PdfReader(BytesIO(data))
    with pytest.raises(PdfReadError):
        len(reader.pages)
    assert all(
        msg in caplog.text
        for msg in (
            "Invalid Root object in trailer",
            'Searching object with "/Catalog" key',
        )
    )

    # Invalid /Root Entry without /Type, but /Pages.
    caplog.clear()
    reader = PdfReader(
        BytesIO(
            b.replace(b"/Root 1 0 R", b"/Root 2 0 R").replace(b"/Catalog", b"/Catalo ")
        )
    )
    assert len(reader.pages) == 1
    assert all(
        msg in caplog.text
        for msg in (
            "Invalid Root object in trailer",
            'Searching object with "/Catalog" key',
            # The expected message embeds id(reader).
            f"Possible root found at IndirectObject(2, 0, {id(reader)}), but missing /Catalog key"
        )
    )


@pytest.mark.enable_socket
def test_issue3151(caplog):
    """Tests for #3151"""
    url = "https://github.com/user-attachments/files/18941494/bible.pdf"
    name = "issue3151.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert len(reader.pages) == 742


@pytest.mark.enable_socket
def test_issue2886(caplog):
    """Tests for #2886"""
    url = "https://github.com/user-attachments/files/17187711/crash-e8a85d82de01cab5eb44e7993304d8b9d1544970.pdf"
    name = "issue2886.pdf"

    # The error is expected from the constructor itself.
    with pytest.raises(PdfReadError, match=r"Unexpected empty line in Xref table\."):
        _ = PdfReader(BytesIO(get_data_from_url(url, name=name)))


@pytest.mark.enable_socket
def test_infinite_loop_for_length_value():
    """Tests for #3112"""
    url = "https://github.com/user-attachments/files/19106009/Special.n.15.du.jeudi.22.fevrier.2024.pdf"
    name = "issue3112.pdf"

    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    writer = PdfWriter()
    # \d+ stands for the third IndirectObject field, which is not a fixed value.
    with pytest.raises(PdfReadError, match=r"^Detected loop with self reference for IndirectObject\(165, 0, \d+\)\.$"):
        writer.add_page(reader.pages[0])


def test_trailer_cannot_be_read():
    """Replacing ``/Type/XRef`` with ``/Type/Invalid`` must raise a trailer read error."""
    path = RESOURCE_ROOT / "crazyones.pdf"
    data = path.read_bytes().replace(b"/Type/XRef", b"/Type/Invalid")
    with pytest.raises(PdfReadError, match=r"^Trailer cannot be read: Unexpected type '/Invalid'$"):
        reader = PdfReader(BytesIO(data))
        list(reader.pages)


@pytest.mark.enable_socket
def test_read_pdf15_xref_stream():
    data = get_data_from_url(name="issue-3429.pdf")
    with pytest.raises(PdfReadError, match=r"^Trailer cannot be read: Size missing from XRef stream {"):
        PdfReader(BytesIO(data))

    data_modified = data.replace(b"/XRef/",
b"/XRef/Size/2/") with pytest.raises( PdfReadError, match=r"^Trailer cannot be read: Limit reached while decompressing\. 1545392 bytes remaining\.$" ): PdfReader(BytesIO(data_modified)) @pytest.mark.enable_socket def test_read_standard_xref_table__two_whitespace_characters_between_offset_and_generation(): """Tests for #3482""" url = "https://github.com/user-attachments/files/22591813/helloworld.pdf" name = "issue3482.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert len(reader.pages) == 1 assert reader.pages[0].extract_text() == "Hello World!" @pytest.mark.enable_socket def test_root_object_recovery_limit(caplog): url = "https://github.com/user-attachments/files/24525509/root_object_recovery_limit.pdf" name = "root_object_recovery_limit.pdf" data = get_data_from_url(url, name=name) # Default limit. reader = PdfReader(BytesIO(data)) with pytest.raises( expected_exception=LimitReachedError, match=r"^Maximum Root object recovery limit reached\.$" ): _ = list(reader.pages) message_numbers = { int(message.split(" ", maxsplit=2)[1]) for message in caplog.messages if message.startswith("Object ") and message.endswith(" 0 not defined.") } assert sorted(message_numbers) == list(range(5, 10001)) # Custom limit. caplog.clear() reader = PdfReader(BytesIO(data), root_object_recovery_limit=42) with pytest.raises( expected_exception=LimitReachedError, match=r"^Maximum Root object recovery limit reached\.$" ): _ = list(reader.pages) message_numbers = { int(message.split(" ", maxsplit=2)[1]) for message in caplog.messages if message.startswith("Object ") and message.endswith(" 0 not defined.") } assert sorted(message_numbers) == list(range(5, 43)) # No limit. Do not run actual process for speed reasons. reader = PdfReader(BytesIO(data), root_object_recovery_limit=None) assert reader._root_object_recovery_limit == sys.maxsize # Strict mode. 
with pytest.raises(expected_exception=PdfReadError, match=r"^Broken xref table$"): reader = PdfReader(BytesIO(data), strict=True) _ = list(reader.pages) @pytest.mark.timeout(10) def test_rebuild_xref_table__speed(): total_len = 2_000_790 middle = b"\nstartxref 1\n % " leading_len = 0x55E # 1374 leading = b" " * leading_len trailing = b" " * (total_len - leading_len - len(middle)) data = leading + middle + trailing reader = PdfReader(BytesIO(data)) with pytest.raises(expected_exception=PdfReadError, match=r"^Cannot find Root object in pdf$"): _ = list(reader.pages) def test_find_pdf_objects(): data = ( b" \n" b" 11 0 obj\n" b" 12 0 obj\n" b"13 1 obj\n" b"ob\n" b"ab obj\n" b"42 1337 obj \n" b"\n" ) result = list(PdfReader._find_pdf_objects(data)) assert result == [(11, 0, 8), (12, 0, 19), (13, 1, 28), (42, 1337, 49)] @pytest.mark.parametrize( ("data", "expected"), [ (b"\n\ntrailer", []), (b"\n\ntrailer abc", []), (b"\n\ntrailer <<", [10]), (b"\n\ntrailer << /Key null >>\n\n trailer << /Key 42 >>\n", [10, 37]) ] ) def test_find_pdf_trailers(data: bytes, expected: list[int]): result = list(PdfReader._find_pdf_trailers(data)) assert result == expected def test_objstm_batch_parse_caches_all_objects(): """Resolving one ObjStm object should batch-cache all siblings.""" reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") assert len(reader.xref_objStm) > 0 obj_ids = list(reader.xref_objStm.keys()) first_obj = reader.get_object(obj_ids[0]) assert first_obj is not None for idnum in obj_ids[1:]: cached = reader.cache_get_indirect_object(0, idnum) assert cached is not None, f"Object {idnum} was not batch-cached" def test_objstm_cache_hit_returns_target(): """Second call to _get_object_from_stream should return cached objects.""" reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") obj_ids = list(reader.xref_objStm.keys()) # Trigger batch parse reader.get_object(obj_ids[0]) # Call again — all objects are already cached second_id = obj_ids[1] ref = IndirectObject(second_id, 0, 
reader) result = reader._get_object_from_stream(ref) assert result is reader.cache_get_indirect_object(0, second_id) def test_objstm_skips_cache_for_overridden_objects(): """Objects removed from xref_objStm should not be cached during batch parse.""" reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") obj_ids = list(reader.xref_objStm.keys()) assert len(obj_ids) >= 2 # Simulate an incremental update overriding one object removed_id = obj_ids[-1] saved_entry = reader.xref_objStm.pop(removed_id) reader.resolved_objects.clear() result = reader.get_object(obj_ids[0]) assert result is not None assert reader.cache_get_indirect_object(0, removed_id) is None assert reader.cache_get_indirect_object(0, obj_ids[0]) is not None reader.xref_objStm[removed_id] = saved_entry ================================================ FILE: tests/test_text_extraction.py ================================================ """ Testing the text-extraction submodule and ensuring the quality of text extraction. The tested code might be in _page.py. """ import re from dataclasses import asdict from io import BytesIO from unittest.mock import patch import pytest from pypdf import PdfReader, PdfWriter, mult from pypdf._font import Font from pypdf._text_extraction import set_custom_rtl from pypdf._text_extraction._layout_mode._fixed_width_page import text_show_operations from pypdf.errors import PdfReadError from pypdf.generic import ContentStream from . 
import RESOURCE_ROOT, SAMPLE_ROOT, get_data_from_url @pytest.mark.samples @pytest.mark.parametrize(("visitor_text"), [None, lambda a, b, c, d, e: None]) # noqa: ARG005 def test_multi_language(visitor_text): reader = PdfReader(RESOURCE_ROOT / "multilang.pdf") txt = reader.pages[0].extract_text(visitor_text=visitor_text) assert "Hello World" in txt, "English not correctly extracted" # iss #1296 assert "مرحبا بالعالم" in txt, "Arabic not correctly extracted" assert "Привет, мир" in txt, "Russian not correctly extracted" assert "你好世界" in txt, "Chinese not correctly extracted" assert "สวัสดีชาวโลก" in txt, "Thai not correctly extracted" assert "こんにちは世界" in txt, "Japanese not correctly extracted" # check customizations set_custom_rtl(None, None, "Russian:") assert ":naissuR" in reader.pages[0].extract_text( visitor_text=visitor_text ), "(1) CUSTOM_RTL_SPECIAL_CHARS failed" set_custom_rtl(None, None, [ord(x) for x in "Russian:"]) assert ":naissuR" in reader.pages[0].extract_text( visitor_text=visitor_text ), "(2) CUSTOM_RTL_SPECIAL_CHARS failed" set_custom_rtl(0, 255, None) assert ":hsilgnE" in reader.pages[0].extract_text( visitor_text=visitor_text ), "CUSTOM_RTL_MIN/MAX failed" set_custom_rtl("A", "z", []) assert ":hsilgnE" in reader.pages[0].extract_text( visitor_text=visitor_text ), "CUSTOM_RTL_MIN/MAX failed" set_custom_rtl(-1, -1, []) # to prevent further errors reader = PdfReader(SAMPLE_ROOT / "015-arabic/habibi-rotated.pdf") assert "habibi" in reader.pages[0].extract_text(visitor_text=visitor_text) assert "حَبيبي" in reader.pages[0].extract_text(visitor_text=visitor_text) assert "habibi" in reader.pages[1].extract_text(visitor_text=visitor_text) assert "حَبيبي" in reader.pages[1].extract_text(visitor_text=visitor_text) assert "habibi" in reader.pages[2].extract_text(visitor_text=visitor_text) assert "حَبيبي" in reader.pages[2].extract_text(visitor_text=visitor_text) assert "habibi" in reader.pages[3].extract_text(visitor_text=visitor_text) assert "حَبيبي" in 
reader.pages[3].extract_text(visitor_text=visitor_text) @pytest.mark.parametrize( ("file_name", "constraints"), [ ( "inkscape-abc.pdf", { "A": lambda x, y: 0 < x < 94 and 189 < y < 283, # In upper left "B": lambda x, y: 94 < x < 189 and 94 < y < 189, # In the center "C": lambda x, y: 189 < x < 283 and 0 < y < 94, # In lower right }, ) ], ) def test_visitor_text_matrices(file_name, constraints): """ Checks if the matrices given to the visitor_text function when calling `extract_text` on the first page of `file_name` match some given constraints. `constraints` is a dictionary mapping a line of text to a constraint that should evaluate to `True` on its expected x,y-coordinates. """ reader = PdfReader(RESOURCE_ROOT / file_name) lines = [] def visitor_text(text, cm, tm, font_dict, font_size) -> None: ctm = mult(tm, cm) x = ctm[4] # mult(tm, cm)[4] y = ctm[5] # mult(tm, cm)[5] lines.append({"text": text, "x": x, "y": y}) reader.pages[0].extract_text(visitor_text=visitor_text) for text, constraint in constraints.items(): matches = [li for li in lines if li["text"].strip() == text] assert len(matches) <= 1, f"Multiple lines match {text}" assert len(matches) >= 1, f"No lines match {text}" x = matches[0]["x"] y = matches[0]["y"] assert constraint(x, y), f'Line "{text}" is wrong at x:{x}, y:{y}' @pytest.mark.xfail(reason="known whitespace issue #2336") @pytest.mark.enable_socket def test_issue_2336(): name = "Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf" reader = PdfReader(BytesIO(get_data_from_url(name=name))) page = reader.pages[0] actual_text = page.extract_text() assert "Beira Rio" in actual_text def test_font_class_to_dict(): font = Font( name = "Unknown", space_width=8, character_map={}, encoding = "utf-16-be" ) assert asdict(font) == { "name": "Unknown", "character_map": {}, "encoding": "utf-16-be", "sub_type": "Unknown", "font_descriptor": { "name": "Unknown", "family": "Unknown", "weight": "Unknown", "ascent": 700.0, "descent": -200.0, "cap_height": 600.0, 
"x_height": 500.0, "italic_angle": 0.0, "flags": 32, "bbox": ( -100.0, -200.0, 1000.0, 900.0, ), }, "character_widths": {"default": 500}, "space_width": 8, "interpretable": True, } @pytest.mark.enable_socket @patch("pypdf._text_extraction._layout_mode._fixed_width_page.logger_warning") def test_uninterpretable_type3_font(mock_logger_warning): url = "https://github.com/user-attachments/files/18551904/UninterpretableType3Font.pdf" name = "UninterpretableType3Font.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] assert page.extract_text(extraction_mode="layout") == "" mock_logger_warning.assert_called_with( "PDF contains an uninterpretable font. Output will be incomplete.", "pypdf._text_extraction._layout_mode._fixed_width_page" ) @pytest.mark.enable_socket def test_layout_mode_epic_page_fonts(): url = "https://github.com/py-pdf/pypdf/files/13836944/Epic.Page.PDF" name = "Epic Page.PDF" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) expected = (RESOURCE_ROOT / "Epic.Page.layout.txt").read_text(encoding="utf-8") assert expected == reader.pages[0].extract_text(extraction_mode="layout") def test_layout_mode_uncommon_operators(): # Coverage for layout mode Tc, Tz, Ts, ', ", TD, TL, and Tw reader = PdfReader(RESOURCE_ROOT / "toy.pdf") expected = (RESOURCE_ROOT / "toy.layout.txt").read_text(encoding="utf-8") assert expected == reader.pages[0].extract_text(extraction_mode="layout") @pytest.mark.enable_socket def test_layout_mode_type0_font_widths(): # Cover both the 'int int int' and 'int [int int ...]' formats for Type0 # /DescendantFonts /W array entries. 
url = "https://github.com/py-pdf/pypdf/files/13533204/Claim.Maker.Alerts.Guide_pg2.PDF" name = "Claim Maker Alerts Guide_pg2.PDF" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) expected = (RESOURCE_ROOT / "Claim Maker Alerts Guide_pg2.layout.txt").read_text( encoding="utf-8" ) assert expected == reader.pages[0].extract_text(extraction_mode="layout") @pytest.mark.enable_socket def test_layout_mode_indirect_sequence_font_widths(caplog): # Cover the situation where the sequence for font widths is an IndirectObject # https://github.com/py-pdf/pypdf/pull/2788 url = "https://github.com/user-attachments/files/16491621/2788_example.pdf" name = "2788_example.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert reader.pages[0].extract_text(extraction_mode="layout") == "" url = "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf" name = "2788_example_malformed.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].extract_text(extraction_mode="layout") assert "Invalid font width definition" in caplog.text def dummy_visitor_text(text, ctm, tm, fd, fs): pass @patch("pypdf._page.logger_warning") def test_layout_mode_warnings(mock_logger_warning): # Check that a warning is issued when an argument is ignored reader = PdfReader(RESOURCE_ROOT / "hello-world.pdf") page = reader.pages[0] page.extract_text(extraction_mode="plain", visitor_text=dummy_visitor_text) mock_logger_warning.assert_not_called() page.extract_text(extraction_mode="layout", visitor_text=dummy_visitor_text) mock_logger_warning.assert_called_with( "Argument visitor_text is ignored in layout mode", "pypdf._page" ) @pytest.mark.enable_socket def test_space_with_one_unit_smaller_than_font_width(): """Tests for #1328""" url = "https://github.com/py-pdf/pypdf/files/9498481/0004.pdf" name = "iss1328.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] extracted = page.extract_text() assert 
"Reporting crude oil leak.\n" in extracted @pytest.mark.enable_socket def test_space_position_calculation(): """Tests for #1153""" url = "https://github.com/py-pdf/pypdf/files/9164743/file-0.pdf" name = "iss1153.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[3] extracted = page.extract_text() assert "Shortly after the Geneva BOF session, the" in extracted def test_text_leading_height_unit(): """Tests for #2262""" reader = PdfReader(RESOURCE_ROOT / "toy.pdf") page = reader.pages[0] extracted = page.extract_text() assert "Something[cited]\n" in extracted def test_layout_mode_space_vertically_font_height_weight(): """Tests layout mode with vertical space and font height weight (issue #2915)""" with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as inputfile: # Load PDF file from file reader = PdfReader(inputfile) page = reader.pages[0] # Normal behaviour with open(RESOURCE_ROOT / "crazyones_layout_vertical_space.txt", "rb") as pdftext_file: pdftext = pdftext_file.read() text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=True).encode("utf-8") # Compare the text of the PDF to a known source for expected_line, actual_line in zip(text.splitlines(), pdftext.splitlines()): assert expected_line == actual_line pdftext = pdftext.replace(b"\r\n", b"\n") # fix for windows assert text == pdftext # Blank lines are added to truly separate paragraphs with open(RESOURCE_ROOT / "crazyones_layout_vertical_space_font_height_weight.txt", "rb") as pdftext_file: pdftext = pdftext_file.read() text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=True, layout_mode_font_height_weight=0.85).encode("utf-8") # Compare the text of the PDF to a known source for expected_line, actual_line in zip(text.splitlines(), pdftext.splitlines()): assert expected_line == actual_line pdftext = pdftext.replace(b"\r\n", b"\n") # fix for windows assert text == pdftext @pytest.mark.enable_socket def 
test_infinite_loop_arrays(): """Tests for #2928""" url = "https://github.com/user-attachments/files/17576546/arrayabruptending.pdf" name = "arrayabruptending.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] extracted = page.extract_text() assert "RNA structure comparison" in extracted @pytest.mark.enable_socket def test_content_stream_is_dictionary_object(caplog): """Tests for #2995""" url = "https://github.com/user-attachments/files/18049322/6fa5fd46-5f98-4a67-800d-5e2362b0164f.pdf" name = "iss2995.pdf" data = get_data_from_url(url, name=name) reader = PdfReader(BytesIO(data)) page = reader.pages[0] assert "\nYours faithfully \n" in page.extract_text() assert "Expected StreamObject, got DictionaryObject instead. Data might be wrong." in caplog.text caplog.clear() reader = PdfReader(BytesIO(data), strict=True) page = reader.pages[0] with pytest.raises(PdfReadError) as exception: page.extract_text() assert ( "Invalid Elementary Object starting with b\\'\\\\x18\\' @3557: b\\'ateDecode/Length 629\\\\x18ck[" in exception.value.args[0] ) @pytest.mark.enable_socket def test_tz_with_no_operands(): """Tests for #2975""" url = "https://github.com/user-attachments/files/17974120/9E5E080E-C8DB-4A6B-822B-9A67DC04E526-120438.pdf" name = "iss2975.pdf" data = get_data_from_url(url, name=name) reader = PdfReader(BytesIO(data)) page = reader.pages[1] assert "\nThankyouforyourattentiontothismatter.\n" in page.extract_text() @pytest.mark.enable_socket def test_iss3060(): """Test for not throwing 'font not set: is PDF missing a Tf operator'""" url = "https://github.com/user-attachments/files/18482531/test-anon.pdf" name = "iss3060.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # pypdf.errors.PdfReadError: font not set: is PDF missing a Tf operator? 
txt = reader.pages[0].extract_text(extraction_mode="layout") assert txt.startswith(" *******") @pytest.mark.enable_socket def test_iss3074(): """Test for not throwing 'ZeroDivisionError: float division by zero'""" url = "https://github.com/user-attachments/files/18533211/test-anon.pdf" name = "iss3074.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # pypdf.errors.PdfReadError: ZeroDivisionError: float division by zero txt = reader.pages[0].extract_text(extraction_mode="layout") assert txt.strip().startswith("AAAAAA") @pytest.mark.enable_socket def test_layout_mode_text_state(): """Ensure the text state is stored and reset with q/Q operators.""" # Get the PDF from issue #3212 url = "https://github.com/user-attachments/files/19396790/garbled.pdf" name = "garbled-font.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # Get the txt from issue #3212 and normalize line endings txt_url = "https://github.com/user-attachments/files/19510731/garbled-font.layout.txt" txt_name = "garbled-font.layout.txt" expected = get_data_from_url(txt_url, name=txt_name).decode("utf-8").replace("\r\n", "\n") # Ignore differences in rendering of spaces to work around older differences between the # old layout mode Font code and the new Font class in calculating and dealing with the # fallback width for a character that has no width defined in character_widths. 
assert expected.replace(" ", "") == reader.pages[0].extract_text(extraction_mode="layout").replace(" ", "")


@pytest.mark.enable_socket
def test_rotated_line_wrap():
    """Ensure correct 2D translation of rotated text after a line wrap."""
    # Get the PDF from issue #3247
    url = "https://github.com/user-attachments/files/19696918/link16-line-wrap.sanitized.pdf"
    name = "link16-line-wrap.sanitized.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    # Get the txt from issue #3247 and normalize line endings
    txt_url = "https://github.com/user-attachments/files/19696917/link16-line-wrap.sanitized.expected.txt"
    txt_name = "link16-line-wrap.sanitized.expected.txt"
    expected = get_data_from_url(txt_url, name=txt_name).decode("utf-8").replace("\r\n", "\n")
    assert expected == reader.pages[0].extract_text()


@pytest.mark.parametrize(
    ("op", "msg"),
    [
        (b"BT", "Unbalanced target operations, expected b'ET'."),
        (b"q", "Unbalanced target operations, expected b'Q'."),
    ],
)
def test_layout_mode_warns_on_malformed_content_stream(op, msg, caplog):
    """Ensures that imbalanced q/Q or BT/ET is handled gracefully."""
    # A single opening operator with no operands and no closing counterpart.
    text_show_operations(ops=iter([([], op)]), fonts={})
    assert caplog.records
    assert caplog.records[-1].msg == msg


def test_process_operation__cm_multiplication_issue():
    """Test for #3262."""
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf")
    page = writer.pages[0]
    content = page.get_contents().get_data()
    # Swap the six-operand "cm" for one carrying only three operands; the
    # assertion below expects the leading text to be extracted regardless.
    content = content.replace(b" 1 0 0 1 72 720 cm ", b" 0.70278 65.3 163.36 cm ")
    stream = ContentStream(stream=None, pdf=writer)
    stream.set_data(content)
    page.replace_contents(stream)
    assert page.extract_text().startswith("The Crazy Ones\nOctober 14, 1998\n")


@pytest.mark.enable_socket
def test_rotated_layout_mode(caplog):
    """Ensures text extraction of rotated pages, as in issue #3270."""
    url = "https://github.com/user-attachments/files/19981120/rotated-page.pdf"
    name = "rotated-page.pdf"
    writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name)))
page = writer.pages[0] page.transfer_rotation_to_content() text = page.extract_text(extraction_mode="layout") assert not caplog.records, "No warnings should be issued" assert text, "Text matching the page rotation should be extracted" assert re.search(r"\r?\n +69\r?\n +UNCLASSIFIED$", text), "Contents should be in expected layout" @pytest.mark.enable_socket @pytest.mark.filterwarnings("ignore::pypdf.errors.PdfReadWarning") def test_extract_text__none_objects(): url = "https://github.com/user-attachments/files/18381726/tika-957721.pdf" name = "tika-957721.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].extract_text() reader.pages[8].extract_text() @pytest.mark.enable_socket def test_extract_text__with_visitor_text(): def visitor_text(*args, **kwargs): # noqa: ANN002, ANN003, ANN202 pass url = "https://github.com/user-attachments/files/18381718/tika-952016.pdf" name = "tika-952016.pdf" stream = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(stream) page = reader.pages[0] page.extract_text(visitor_text=visitor_text) reader = PdfReader(BytesIO(get_data_from_url(name="TextAttack_paper.pdf"))) page = reader.pages[0] page.extract_text(visitor_text=visitor_text) @pytest.mark.enable_socket def test_extract_text__restore_cm_stack_pop_error(): url = "https://github.com/user-attachments/files/18381737/tika-966635.pdf" name = "tika-966635.pdf" stream = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(stream) page = reader.pages[10] # There is a previous error we already omit ("pop from empty list"), thus # check for the message explicitly here. 
with pytest.raises(IndexError, match="list index out of range"): page.extract_text() @pytest.mark.timeout(60) @pytest.mark.enable_socket def test_slow_huge_string(): """Tests for #3541""" url = "https://github.com/user-attachments/files/23855795/file.pdf" name = "issue-3541.pdf" stream = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(stream) page = reader.pages[0] _ = page.extract_text(extraction_mode="layout") @pytest.mark.enable_socket def test_extract_text_with_missing_font_bbox(): url = "https://github.com/user-attachments/files/24611650/bbox_bug_emoji.pdf" name = "issue-3599.pdf" stream = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(stream) page = reader.pages[0] text = page.extract_text() assert "🎉" in text ================================================ FILE: tests/test_utils.py ================================================ """Test the pypdf._utils module.""" import functools import io import re import subprocess import sys from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Any, Callable import pytest import pypdf._utils from pypdf._utils import ( File, Version, _get_max_pdf_version_header, _human_readable_bytes, check_if_whitespace_only, classproperty, deprecate_with_replacement, deprecation_no_replacement, format_iso8824_date, logger_error, mark_location, matrix_multiply, parse_iso8824_date, read_block_backwards, read_previous_line, read_until_regex, read_until_whitespace, rename_kwargs, skip_over_comment, skip_over_whitespace, ) from pypdf.errors import DeprecationError, PdfReadError, PdfStreamError from pypdf.generic import DictionaryObject, NameObject, TextStringObject from . 
def test_read_until_regex_match_in_second_chunk():
    """A delimiter located past the first 16 bytes is found in the second chunk."""
    prefix = b"0123456789abcdefghij"
    assert len(prefix) == 20

    delimiter = re.compile(b" ")
    source = io.BytesIO(prefix + b" rest")

    # Everything in front of the delimiter is returned ...
    assert read_until_regex(source, delimiter) == prefix
    # ... and the stream is left positioned on the delimiter itself.
    assert source.tell() == 20
def test_read_until_regex_tail_overlap_is_fixed():
    """The tail overlap stays at 16 bytes while the chunk size grows.

    Chunks are read with sizes 16, 32 and 64 (112 bytes in total), so the
    second chunk ends at offset 48. A 16-byte pattern starting at offset 47
    begins on the last byte of the second chunk and reaches into the third
    one. This only works if the tail kept from chunk 2 covers at least
    16 bytes.
    """
    needle = b"ABCDEFGHIJKLMNOP"  # 16 bytes
    assert len(needle) == 16

    # Needle occupies bytes 47-62 and therefore crosses the boundary at 48.
    filler = b"x" * 47
    source = io.BytesIO(filler + needle + b"rest")
    matcher = re.compile(re.escape(needle))

    assert read_until_regex(source, matcher) == filler
    assert source.tell() == 47
def test_read_previous_line2():
    """Run ``test_read_previous_line`` on inputs larger than the I/O buffer.

    The cases are invoked directly: for an unknown reason, errors are
    reported if these parameters are passed through pytest.
    """
    block = 2 * io.DEFAULT_BUFFER_SIZE
    newlines = b"\n" * block
    letters = b"d" * block

    # Include a block full of newlines...
    test_read_previous_line(b"abc" + newlines + b"d", block + 4, b"d", 3)

    # Include a block full of non-newline characters
    test_read_previous_line(b"abc\n" + letters, block + 4, letters, 3)

    # Both
    test_read_previous_line(b"abcxyz" + newlines + letters, 2 * block + 6, letters, 6)
def test_logger_error(caplog):
    """``logger_error`` fills the ``%(encoding)s`` placeholder from its keyword argument."""
    template = "Advanced encoding %(encoding)s not implemented yet"
    cases = (
        (
            NameObject("/Invalid"),
            "Advanced encoding /Invalid not implemented yet",
        ),
        (
            DictionaryObject({NameObject("/key"): TextStringObject("value")}),
            "Advanced encoding {'/key': 'value'} not implemented yet",
        ),
    )
    for value, logged_text in cases:
        logger_error(template, source=__name__, encoding=value)
        assert logged_text in caplog.text
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("D:20210318000756", "2021-03-18T00:07:56"),
        ("20210318000756", "2021-03-18T00:07:56"),
        ("D:2021", "2021-01-01T00:00:00"),
        ("D:202103", "2021-03-01T00:00:00"),
        ("D:20210304", "2021-03-04T00:00:00"),
        ("D:2021030402", "2021-03-04T02:00:00"),
        ("D:20210408054711", "2021-04-08T05:47:11"),
        ("D:20210408054711Z", "2021-04-08T05:47:11+00:00"),
        ("D:20210408054711Z00", "2021-04-08T05:47:11+00:00"),
        ("D:20210408054711Z0000", "2021-04-08T05:47:11+00:00"),
        ("D:20210408075331+02'00'", "2021-04-08T07:53:31+02:00"),
        ("D:20210408075331-03'00'", "2021-04-08T07:53:31-03:00"),
    ],
)
def test_parse_datetime(text, expected):
    """The parsed date, rendered as ISO text, starts with the expected string."""
    parsed = parse_iso8824_date(text)
    # isoformat() followed by the raw "%z" offset; only the leading part that
    # is as long as the expectation takes part in the comparison.
    rendered = parsed.isoformat() + parsed.strftime("%z")
    assert rendered.startswith(expected)
def test_is_sublist():
    """Check ``is_sublist`` against a table of (candidate, full list, verdict)."""
    cases = [
        # Basic checks:
        ([0, 1], [0, 1, 2], True),
        ([0, 2], [0, 1, 2], True),
        ([1, 2], [0, 1, 2], True),
        ([0, 3], [0, 1, 2], False),
        # Ensure order is checked:
        ([1, 0], [0, 1, 2], False),
        # Ensure duplicates are handled:
        ([0, 1, 1], [0, 1, 1, 2], True),
        ([0, 1, 1], [0, 1, 2], False),
        # Edge cases with empty lists:
        ([], [0, 1, 2], True),
        ([0, 1], [], False),
        # Self-sublist edge case:
        ([0, 1, 2], [0, 1, 2], True),
    ]
    for candidate, full_list, verdict in cases:
        assert is_sublist(candidate, full_list) is verdict
def test_version_compare_lt_str():
    """Ordering a ``Version`` against a plain ``str`` raises ``ValueError``.

    The error message is expected to name the type of the rejected operand.
    """
    version = Version("1.0")
    with pytest.raises(ValueError) as exc:
        version < "1.0"  # noqa: B015
    # The expected text previously ended in a dangling space: the trailing
    # "<class 'str'>" had been lost, so the comparison could not match a
    # message that includes the operand type.
    # NOTE(review): assumes the message is built from ``type(other)`` - confirm
    # against ``Version.__lt__``.
    assert exc.value.args[0] == "Version cannot be compared against <class 'str'>"
def test_pdfreader_file_load():
    """
    Load ``crazyones.pdf`` and compare its text with ``crazyones.txt``.

    Expected outcome: the file loads and the UTF-8 encoded text of the first
    page matches the reference file, first line by line and then as a whole
    once CRLF line endings in the reference are normalized.
    """
    reference_path = RESOURCE_ROOT / "crazyones.txt"

    with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as pdf_file:
        first_page = PdfReader(pdf_file).pages[0]

        # Known-good text the extraction is compared against
        with open(reference_path, "rb") as reference_file:
            reference = reference_file.read()

        extracted = first_page.extract_text().encode("utf-8")

        # Pairwise line comparison; stops at the shorter of the two inputs
        line_pairs = zip(extracted.splitlines(), reference.splitlines())
        for extracted_line, reference_line in line_pairs:
            assert extracted_line == reference_line

        # fix for windows
        assert extracted == reference.replace(b"\r\n", b"\n")
def test_rotate_45():
    """Rotating a page by 45 degrees raises ``ValueError`` with the known message."""
    expected_message = "Rotation angle must be a multiple of 90"
    with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as pdf_file:
        first_page = PdfReader(pdf_file).pages[0]
        with pytest.raises(ValueError) as exc_info:
            first_page.rotate(45)
        assert exc_info.value.args[0] == expected_message
def test_transform_compress_identical_objects():
    """Scaled pages keep their own text after ``compress_identical_objects``."""
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "two-different-pages.pdf")
    for page in writer.pages:
        page.add_transformation(Transformation().scale(sx=0.8, sy=0.8))
    writer.compress_identical_objects()

    output = BytesIO()
    writer.write(output)

    # Read the written document back and check both pages.
    reread = PdfReader(output)
    page_texts = [reread.pages[index].extract_text() for index in (0, 1)]
    assert page_texts[0].strip() == "1"
    assert page_texts[1].strip() == "2"
@pytest.mark.samples
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("base_path", "overlay_path"),
    [
        (
            "resources/crazyones.pdf",
            "sample-files/013-reportlab-overlay/reportlab-overlay.pdf",
        ),
        (
            "https://github.com/user-attachments/files/18381707/tika-935981.pdf",
            "sample-files/013-reportlab-overlay/reportlab-overlay.pdf",
        ),
    ],
)
def test_overlay(pdf_file_path, base_path, overlay_path):
    """Merge the first overlay page onto every base page and write the result."""
    # A base given as URL is downloaded; anything else is a path below the project root.
    base_source = (
        BytesIO(get_data_from_url(base_path, name="tika-935981.pdf"))
        if base_path.startswith("http")
        else PROJECT_ROOT / base_path
    )
    writer = PdfWriter(clone_from=base_source)
    overlay_page = PdfReader(PROJECT_ROOT / overlay_path).pages[0]

    for target_page in writer.pages:
        target_page.merge_page(overlay_page)

    with open(pdf_file_path, "wb") as output_file:
        writer.write(output_file)
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "expected_metadata"),
    [
        (
            "https://github.com/user-attachments/files/18381708/tika-935996.pdf",
            "tika-935996.pdf",
            {
                "/Author": "Unknown",
                "/CreationDate": "Thursday, May 06, 1999 3:56:54 PM",
                "/Creator": r"C:\DEB\6338",
                "/Keywords": "",
                "/Producer": "Acrobat PDFWriter 3.02 for Windows",
                "/Subject": "",
                "/Title": r"C:\DEB\6338-6R.PDF",
            },
        )
    ],
)
def test_get_metadata(url, name, expected_metadata):
    """The metadata read from the document equals the expected mapping."""
    pdf_stream = BytesIO(get_data_from_url(url, name=name))
    metadata = PdfReader(pdf_stream).metadata
    assert expected_metadata == metadata
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name"),
    [
        (
            "https://github.com/user-attachments/files/18381733/tika-961883.pdf",
            "tika-961883.pdf",
        ),
    ],
)
def test_get_fields_warns(tmp_path, caplog, url, name):
    """``get_fields`` yields no fields here and logs the known warnings."""
    expected_warnings = [
        "Ignoring wrong pointing object 1 65536 (offset 0)",
        "Ignoring wrong pointing object 2 65536 (offset 0)",
        "Object 2 0 not defined.",
    ]
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    # The field report goes to a throw-away text file.
    with open(tmp_path / "tmp.txt", "w") as report:
        fields = reader.get_fields(fileobj=report)

    assert fields == {}
    assert normalize_warnings(caplog.text) == expected_warnings
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name"),
    [
        (
            "https://github.com/user-attachments/files/18381767/tika-994636.pdf",
            "tika-994636.pdf",
        ),
        (
            "https://github.com/user-attachments/files/18381719/tika-952133.pdf",
            "tika-952133.pdf",
        ),
        (  # JPXDecode
            "https://github.com/user-attachments/files/18381688/tika-914568.pdf",
            "tika-914568.pdf",
        ),
        (
            "https://github.com/user-attachments/files/18381718/tika-952016.pdf",
            "tika-952016.pdf",
        ),
        (
            "https://github.com/user-attachments/files/18382223/965118.pdf",
            "tika-965118.pdf",
        ),
        (
            "https://github.com/user-attachments/files/18381729/tika-959184.pdf",
            "tika-959184.pdf",
        ),
        (
            "https://github.com/user-attachments/files/18381727/tika-958496.pdf",
            "tika-958496.pdf",
        ),
        (
            "https://github.com/user-attachments/files/18381744/tika-972174.pdf",
            "tika-972174.pdf",
        ),
        (
            "https://github.com/user-attachments/files/18381745/tika-972243.pdf",
            "tika-972243.pdf",
        ),
        (
            "https://github.com/user-attachments/files/18381743/tika-969502.pdf",
            "tika-969502.pdf",
        ),
        ("https://arxiv.org/pdf/2201.00214.pdf", "arxiv-2201.00214.pdf"),
    ],
)
def test_image_extraction(url, name):
    """Write every image of every page to ``extracted-images`` without errors.

    The written files are removed afterwards, also when extraction fails
    midway, so a failing run does not leave files behind in the working
    directory.
    """
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    images_extracted = []
    root = Path("extracted-images")
    # exist_ok: do not fail if the directory shows up between check and creation.
    root.mkdir(exist_ok=True)

    do_cleanup = True  # set this to False for manual inspection
    try:
        with PILContext():
            for page in reader.pages:
                for image in page.images:
                    filename = root / image.name
                    # Register the path before writing so that a partially
                    # written file is removed as well.
                    images_extracted.append(filename)
                    filename.write_bytes(image.data)
    finally:
        # Cleanup
        if do_cleanup:
            for filepath in images_extracted:
                if filepath.exists():
                    filepath.unlink()
in reader.pages: for image in page.images: filename = root / image.name with open(filename, "wb") as img: img.write(image.data) images_extracted.append(filename) # Cleanup do_cleanup = True # set this to False for manual inspection if do_cleanup: for filepath in images_extracted: if Path(filepath).exists(): Path(filepath).unlink() @pytest.mark.enable_socket @pytest.mark.parametrize( ("url", "name"), [ ( "https://github.com/user-attachments/files/18381692/tika-918137.pdf", "tika-918137.pdf", ), ( "https://github.com/user-attachments/files/22596566/7552c42e9280b4476e59e77acc0bc812.pdf", "7552c42e9280b4476e59e77acc0bc812.pdf", ), ], ) def test_get_outline(url, name): data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) reader.outline @pytest.mark.enable_socket @pytest.mark.parametrize( ("url", "name"), [ ( "https://github.com/user-attachments/files/18381707/tika-935981.pdf", "tika-935981.pdf", ), ( "https://github.com/user-attachments/files/18381709/tika-937334.pdf", "tika-937334.pdf", ), ], ) def test_get_xfa(url, name): data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) reader.xfa @pytest.mark.enable_socket @pytest.mark.parametrize( ("url", "name", "strict"), [ ( "https://github.com/user-attachments/files/18381765/tika-988698.pdf", "tika-988698.pdf", False, ), ( "https://github.com/user-attachments/files/18382162/914133.pdf", "tika-914133.pdf", False, ), ( "https://github.com/user-attachments/files/18381685/tika-912552.pdf", "tika-912552.pdf", False, ), ( "https://github.com/user-attachments/files/18381687/tika-914102.pdf", "tika-914102.pdf", True, ), ], ) def test_get_fonts(url, name, strict): data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=strict) for page in reader.pages: page._get_fonts() @pytest.mark.enable_socket @pytest.mark.parametrize( ("url", "name", "strict"), [ ( "https://github.com/user-attachments/files/18382060/tika-942303.pdf", "tika-942303.pdf", True, ), ( 
"https://github.com/user-attachments/files/18381707/tika-935981.pdf", "tika-935981.pdf", True, ), ( "https://github.com/user-attachments/files/18381738/tika-967399.pdf", "tika-967399.pdf", True, ), ( "https://github.com/user-attachments/files/18381707/tika-935981.pdf", "tika-935981.pdf", False, ), ], ) def test_get_xmp(url, name, strict): data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=strict) xmp_info = reader.xmp_metadata if xmp_info: xmp_info.dc_contributor xmp_info.dc_coverage xmp_info.dc_creator xmp_info.dc_date xmp_info.dc_description xmp_info.dc_format xmp_info.dc_identifier xmp_info.dc_language xmp_info.dc_publisher xmp_info.dc_relation xmp_info.dc_rights xmp_info.dc_source xmp_info.dc_subject xmp_info.dc_title xmp_info.dc_type xmp_info.pdf_keywords xmp_info.pdf_pdfversion xmp_info.pdf_producer xmp_info.xmp_create_date xmp_info.xmp_modify_date xmp_info.xmp_metadata_date xmp_info.xmp_creator_tool xmp_info.xmpmm_document_id xmp_info.xmpmm_instance_id xmp_info.custom_properties @pytest.mark.enable_socket def test_tounicode_is_identity(): url = "https://github.com/py-pdf/pypdf/files/9998335/FP_Thesis.pdf" name = "FP_Thesis.pdf" data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=False) reader.pages[0].extract_text() @pytest.mark.enable_socket def test_append_forms(): # from #1538 writer = PdfWriter() url = "https://github.com/py-pdf/pypdf/files/10367412/pdfa.pdf" name = "form_a.pdf" reader1 = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader1.add_form_topname("form_a") writer.append(reader1) url = "https://github.com/py-pdf/pypdf/files/10367413/pdfb.pdf" name = "form_b.pdf" reader2 = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader2.add_form_topname("form_b") writer.append(reader2) b = BytesIO() writer.write(b) reader = PdfReader(b) assert len(reader.get_form_text_fields()) == len( reader1.get_form_text_fields() ) + len(reader2.get_form_text_fields()) 
@pytest.mark.enable_socket
def test_extra_test_iss1541():
    """
    Exercise content-stream round-tripping on the ``tst_iss1541.pdf`` sample.

    The test first extracts text from page 0, then twice rebuilds the page's
    ``/Contents`` as a ``ContentStream``, injects an extra operator, writes it
    out and parses it back. Finally it corrupts the raw file bytes and expects
    text extraction to fail with ``PdfReadError("Unexpected end of stream")``.
    """
    url = "https://github.com/py-pdf/pypdf/files/10418158/tst_iss1541.pdf"
    name = "tst_iss1541.pdf"
    data = BytesIO(get_data_from_url(url, name=name))
    reader = PdfReader(data, strict=False)
    reader.pages[0].extract_text()

    # Round trip 1: insert an "EMC" operation just before the last operation,
    # serialize, and make sure the result can be parsed again.
    cs = ContentStream(reader.pages[0]["/Contents"], None, None)
    cs.operations.insert(-1, ([], b"EMC"))
    stream = BytesIO()
    cs.write_to_stream(stream)
    stream.seek(0)
    # Accessing ``.operations`` is the check: it must not raise.
    ContentStream(read_object(stream, None, None), None, None).operations

    # Round trip 2: same procedure with the operator name "E!C".
    cs = ContentStream(reader.pages[0]["/Contents"], None, None)
    cs.operations.insert(-1, ([], b"E!C"))
    stream = BytesIO()
    cs.write_to_stream(stream)
    stream.seek(0)
    ContentStream(read_object(stream, None, None), None, None).operations

    # Re-open the same bytes with every b"EI \n" replaced by b"E! \n".
    # NOTE(review): presumably this removes the inline-image end marker so the
    # parser runs off the end of the stream -- confirm against the parser.
    b = BytesIO(data.getbuffer())
    reader = PdfReader(
        BytesIO(bytes(b.getbuffer()).replace(b"EI \n", b"E! \n")), strict=False
    )
    with pytest.raises(PdfReadError) as exc:
        reader.pages[0].extract_text()
    assert exc.value.args[0] == "Unexpected end of stream"


@pytest.mark.enable_socket
def test_fields_returning_stream():
    """
    Check the ``TimeStampData`` form text field of the ``tst_iss424.pdf`` sample.

    This problem was reported in #424.
    """
    url = "https://github.com/mstamy2/PyPDF2/files/1948267/Simple.form.pdf"
    name = "tst_iss424.pdf"
    data = BytesIO(get_data_from_url(url, name=name))
    reader = PdfReader(data, strict=False)
    assert "BtchIssQATit_time" in reader.get_form_text_fields()["TimeStampData"]


def test_replace_image(tmp_path):
    """
    Replace an image inside a ``PdfWriter`` page and check the error paths.

    The first image of ``jpeg.pdf`` is written over the first image of
    ``labeled-edges-center-image.pdf``; the result is compared with the source
    image, then replaced again with ``quality=20`` and expected to differ more.
    The remaining assertions cover the ``TypeError``/``ImportError`` branches
    of ``replace``.

    The ``tmp_path`` fixture is requested but not used in this body.
    """
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "labeled-edges-center-image.pdf")
    reader = PdfReader(RESOURCE_ROOT / "jpeg.pdf")
    img = reader.pages[0].images[0].image
    # Branch on the major component of ``pil_version``.
    if int(pil_version.split(".")[0]) < 9:
        img = img.convert("RGB")
    writer.pages[0].images[0].replace(img)
    b = BytesIO()
    writer.write(b)
    reader2 = PdfReader(b)
    if int(pil_version.split(".")[0]) >= 9:
        assert reader2.pages[0].images[0].image.mode == "RGBA"
    # very simple image distance evaluation
    # (sum of the grayscale difference data divided by size[0] * size[1])
    diff = ImageChops.difference(reader2.pages[0].images[0].image, img)
    d = sum(get_image_data(diff.convert("L"))) / (diff.size[0] * diff.size[1])
    assert d < 1.5
    img = img.convert("RGB")  # quality does not apply to RGBA/JP2
    writer.pages[0].images[0].replace(img, quality=20)
    diff = ImageChops.difference(writer.pages[0].images[0].image, img)
    d1 = sum(get_image_data(diff.convert("L"))) / (diff.size[0] * diff.size[1])
    # The quality=20 replacement must be further from the source than the first one.
    assert d1 > d
    # extra tests for coverage
    # An image obtained through a PdfReader cannot be replaced.
    with pytest.raises(TypeError) as exc:
        reader.pages[0].images[0].replace(img)
    assert exc.value.args[0] == "Cannot update an image not belonging to a PdfWriter."
    i = writer.pages[0].images[0]
    with pytest.raises(TypeError) as exc:
        i.replace(reader.pages[0].images[0])  # missing .image
    assert exc.value.args[0] == "new_image shall be a PIL Image"
    i.indirect_reference = None  # to behave like an inline image
    with pytest.raises(TypeError) as exc:
        i.replace(reader.pages[0].images[0].image)
    assert exc.value.args[0] == "Cannot update an inline image."

    import pypdf  # noqa: PLC0415

    # Temporarily set the module flag and expect ImportError from replace();
    # ``finally`` resets the flag to False even if the assertion fails.
    try:
        pypdf._page.pil_not_imported = True
        with pytest.raises(ImportError) as exc:
            i.replace(reader.pages[0].images[0].image)
    finally:
        pypdf._page.pil_not_imported = False


@pytest.mark.enable_socket
def test_inline_images():
    """
    Exercise access to page images, including inline images, via ``page.images``.

    Covers: comparing one extracted image with a reference PNG, ``KeyError``
    for an unknown image key, ``PdfReadError`` after a colour space entry is
    deleted, a dangling ``BI`` operator, the refusal to replace an inline
    image, slicing of ``page.images``, and the image count of a second file.

    NOTE(review): the docstring here used to read "This problem was reported
    in #424", identical to ``test_fields_returning_stream``; that looks like a
    copy-paste -- confirm the right issue reference.
    """
    url = "https://arxiv.org/pdf/2201.00151.pdf"
    name = "2201.00151.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    url = "https://github.com/py-pdf/pypdf/assets/4083478/28e8b87c-be2c-40d9-9c86-15c7819021bf"
    name = "inline4.png"
    img_ref = Image.open(BytesIO(get_data_from_url(url, name=name)))
    assert get_image_data(reader.pages[1].images[4].image) == get_image_data(img_ref)
    with pytest.raises(KeyError):
        reader.pages[0].images["~999~"]
    # Remove a colour space entry from the page resources, then look the
    # image up again and expect a read error.
    del reader.pages[1]["/Resources"]["/ColorSpace"]["/R124"]
    reader.pages[1].inline_images = None  # to force recalculation
    with pytest.raises(PdfReadError):
        reader.pages[1].images["~1~"]

    # Append a lone "BI" operation to page 0; listing the image keys must
    # still work (no exception expected).
    co = reader.pages[0].get_contents()
    co.operations.append(([], b"BI"))
    reader.pages[0][NameObject("/Contents")] = co
    reader.pages[0].images.keys()

    with pytest.raises(TypeError) as exc:
        reader.pages[0].images[0].replace(img_ref)
    assert exc.value.args[0] == "Cannot update an inline image."

    # Iterate a slice of the images mapping; ``_a`` is only filled, never read.
    _a = {}
    for x, y in reader.pages[2].images[0:-2].items():
        _a[x] = y  # noqa: PERF403  # Testing code and easier to read this way.
    # Only the exception type is checked here; ``exc`` is not inspected.
    with pytest.raises(KeyError) as exc:
        reader.pages[2]._get_image(("test",))

    url = "https://github.com/py-pdf/pypdf/files/15233597/bug1065245.pdf"
    name = "iss2598c.pdf"  # test data also used in test_images.py/test_inline_image_extraction()
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert len(reader.pages[0].images) == 3


@pytest.mark.enable_socket
def test_issue1899():
    """
    Extract text from every page of ``lv2018tconv.pdf``.

    There are no assertions: the test passes when no page raises.
    """
    url = "https://github.com/py-pdf/pypdf/files/11801077/lv2018tconv.pdf"
    name = "lv2018tconv.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    for i, page in enumerate(reader.pages):
        # Emits the page index before extraction -- presumably so a failing
        # page can be identified from the captured output.
        print(i)
        page.extract_text()


@pytest.mark.enable_socket
def test_cr_with_cm_operation():
    """Issue #2138: the expected header text must appear in page 0's extracted text."""
    url = "https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf"
    name = "iss2138.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    # NOTE(review): the literal below is reproduced exactly as it appears in
    # this copy of the file; if the copy was whitespace-flattened, the original
    # may contain line breaks between the fields -- confirm before relying on it.
    assert (
        """STATUS: FNL STYLE: 1172 1172 KNIT SHORTIE SUMMER-B 2023 Company: AMERICAN EAGLE OUTFITTERS Division / Dept: 50 / 170 Season: SUMMER-B 2023"""
        in reader.pages[0].extract_text()
    )
    # currently there is still a white space on last line missing
    # so we can not do a full comparison.
def remove_trailing_whitespace(text: str) -> str: text = text.strip() return "\n".join(line.rstrip() for line in text.splitlines()) @pytest.mark.samples @pytest.mark.parametrize( ("pdf_path", "expected_path"), [ ( SAMPLE_ROOT / "026-latex-multicolumn/multicolumn.pdf", RESOURCE_ROOT / "multicolumn-lorem-ipsum.txt", ), ( SAMPLE_ROOT / "010-pdflatex-forms/pdflatex-forms.pdf", RESOURCE_ROOT / "010-pdflatex-forms.txt", ), ], ) def test_text_extraction_layout_mode(pdf_path, expected_path): reader = PdfReader(pdf_path) actual = reader.pages[0].extract_text(extraction_mode="layout") expected = expected_path.read_text(encoding="utf-8") # We don't care about trailing whitespace assert remove_trailing_whitespace(actual) == remove_trailing_whitespace(expected) @pytest.mark.enable_socket def test_layout_mode_space_vertically(): reader = PdfReader(BytesIO(get_data_from_url(name="iss2138.pdf"))) # remove automatically added final newline expected = ( (RESOURCE_ROOT / "AEO.1172.layout.txt").read_text(encoding="utf-8").rstrip() ) assert expected == reader.pages[0].extract_text( extraction_mode="layout", layout_mode_space_vertically=False ) @pytest.mark.enable_socket @pytest.mark.parametrize( ("rotation", "strip_rotated"), [(90, True), (180, False), (270, True)] ) def test_layout_mode_rotations(rotation, strip_rotated): writer = PdfWriter(clone_from=BytesIO(get_data_from_url(name="iss2138.pdf"))) rotated_page = writer.pages[0].rotate(rotation) rotated_page.transfer_rotation_to_content() expected = "" if not strip_rotated: expected = ( (RESOURCE_ROOT / "AEO.1172.layout.rot180.txt") .read_text(encoding="utf-8") .rstrip() ) # remove automatically added final newline assert expected == rotated_page.extract_text( extraction_mode="layout", layout_mode_space_vertically=False, layout_mode_strip_rotated=strip_rotated, ) def test_text_extraction_invalid_mode(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) with pytest.raises(ValueError, match="Invalid text extraction 
mode"): reader.pages[0].extract_text(extraction_mode="foo") # type: ignore @pytest.mark.enable_socket def test_get_page_showing_field(): """ Uses testfile from #2452 in order to get fields on multiple pages, choices boxes,... """ url = "https://github.com/py-pdf/pypdf/files/14031491/Form_Structure_v50.pdf" name = "iss2452.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name))) writer = PdfWriter(clone_from=reader) # validate with Field: only works on Reader (no get_fields on writer yet) fld = reader.get_fields() assert [ p.page_number for p in reader.get_pages_showing_field(fld["FormVersion"]) ] == [0] # validate with dictionary object # NRCategory field is a radio box assert [ p.page_number for p in reader.get_pages_showing_field( reader.trailer["/Root"]["/AcroForm"]["/Fields"][8].get_object() ) ] == [0, 0, 0, 0, 0] assert [ p.page_number for p in writer.get_pages_showing_field( writer._root_object["/AcroForm"]["/Fields"][8].get_object() ) ] == [0, 0, 0, 0, 0] # validate with IndirectObject # SiteID field is a textbox on multiple pages assert [ p.page_number for p in reader.get_pages_showing_field( reader.trailer["/Root"]["/AcroForm"]["/Fields"][99] ) ] == [0, 1] assert [ p.page_number for p in writer.get_pages_showing_field( writer._root_object["/AcroForm"]["/Fields"][99] ) ] == [0, 1] # test directly on the widget: assert [ p.page_number for p in reader.get_pages_showing_field( reader.trailer["/Root"]["/AcroForm"]["/Fields"][99]["/Kids"][1] ) ] == [1] assert [ p.page_number for p in writer.get_pages_showing_field( writer._root_object["/AcroForm"]["/Fields"][99]["/Kids"][1] ) ] == [1] # Exceptions: # Invalid Object with pytest.raises(ValueError) as exc: reader.get_pages_showing_field(None) with pytest.raises(ValueError) as exc: writer.get_pages_showing_field(None) assert "Field type is invalid" in exc.value.args[0] # Damage Field del reader.trailer["/Root"]["/AcroForm"]["/Fields"][1].get_object()["/FT"] del 
writer._root_object["/AcroForm"]["/Fields"][1].get_object()["/FT"] with pytest.raises(ValueError) as exc: reader.get_pages_showing_field( reader.trailer["/Root"]["/AcroForm"]["/Fields"][1] ) with pytest.raises(ValueError) as exc: writer.get_pages_showing_field(writer._root_object["/AcroForm"]["/Fields"][1]) assert "Field is not valid" in exc.value.args[0] # missing Parent in field del reader.trailer["/Root"]["/AcroForm"]["/Fields"][99]["/Kids"][1].get_object()[ "/Parent" ] del writer._root_object["/AcroForm"]["/Fields"][99]["/Kids"][1].get_object()[ "/Parent" ] with pytest.raises(ValueError) as exc: reader.get_pages_showing_field( reader.trailer["/Root"]["/AcroForm"]["/Fields"][1] ) with pytest.raises(ValueError) as exc: writer.get_pages_showing_field(writer._root_object["/AcroForm"]["/Fields"][1]) # remove "/P" (optional) del reader.trailer["/Root"]["/AcroForm"]["/Fields"][8]["/Kids"][1].get_object()[ "/P" ] del writer._root_object["/AcroForm"]["/Fields"][8]["/Kids"][1].get_object()["/P"] assert [ p.page_number for p in reader.get_pages_showing_field( reader.trailer["/Root"]["/AcroForm"]["/Fields"][8]["/Kids"][1] ) ] == [0] assert [ p.page_number for p in writer.get_pages_showing_field( writer._root_object["/AcroForm"]["/Fields"][8]["/Kids"][1] ) ] == [0] assert [ p.page_number for p in reader.get_pages_showing_field( reader.trailer["/Root"]["/AcroForm"]["/Fields"][8].get_object() ) ] == [0, 0, 0, 0, 0] assert [ p.page_number for p in writer.get_pages_showing_field( writer._root_object["/AcroForm"]["/Fields"][8].get_object() ) ] == [0, 0, 0, 0, 0] # Grouping fields reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()[ NameObject("/Kids") ] = ArrayObject([reader.trailer["/Root"]["/AcroForm"]["/Fields"][0]]) del reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()["/T"] del reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()["/P"] del reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()["/Subtype"] 
writer._root_object["/AcroForm"]["/Fields"].append( writer._add_object( DictionaryObject( { NameObject("/T"): TextStringObject("grouping"), NameObject("/FT"): NameObject("/Tx"), NameObject("/Kids"): ArrayObject( [reader.trailer["/Root"]["/AcroForm"]["/Fields"][0]] ), } ) ) ) assert [ p.page_number for p in reader.get_pages_showing_field( reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1] ) ] == [] assert [ p.page_number for p in writer.get_pages_showing_field( writer._root_object["/AcroForm"]["/Fields"][-1] ) ] == [] @pytest.mark.enable_socket def test_extract_empty_page(): """Cf #2533""" url = "https://github.com/py-pdf/pypdf/files/14718318/test.pdf" name = "iss2533.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name))) assert reader.pages[1].extract_text(extraction_mode="layout") == "" @pytest.mark.enable_socket def test_iss2815(): """Cf #2815""" url = "https://github.com/user-attachments/files/16760725/crash-c1920c7a064649e1191d7879952ec252473fc7e6.pdf" name = "iss2815.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name))) assert reader.pages[0].extract_text() == "test command with wrong number of args" ================================================ FILE: tests/test_writer.py ================================================ """Test the pypdf._writer module.""" import re import shutil import subprocess from io import BytesIO from pathlib import Path from tempfile import NamedTemporaryFile from typing import Any from unittest import mock import pytest from pypdf import ( ImageType, ObjectDeletionFlag, PageObject, PdfReader, PdfWriter, Transformation, ) from pypdf.annotations import Link from pypdf.errors import DeprecationError, PageSizeNotDefinedError, PdfReadError, PyPdfError from pypdf.generic import ( ArrayObject, ByteStringObject, ContentStream, DecodedStreamObject, Destination, DictionaryObject, Fit, IndirectObject, NameObject, NullObject, NumberObject, RectangleObject, StreamObject, TextStringObject, ) from . 
import RESOURCE_ROOT, SAMPLE_ROOT, get_data_from_url, is_sublist from .test_images import image_similarity GHOSTSCRIPT_BINARY = shutil.which("gs") def _get_write_target(convert) -> Any: target = convert if callable(convert): with NamedTemporaryFile(suffix=".pdf", delete=False) as temporary: target = temporary.name return target def test_writer_exception_non_binary(tmp_path, caplog): src = RESOURCE_ROOT / "pdflatex-outline.pdf" reader = PdfReader(src) writer = PdfWriter() writer.add_page(reader.pages[0]) with open(tmp_path / "out.txt", "w") as fp, pytest.raises(TypeError): writer.write_stream(fp) ending = "to write to is not in binary mode. It may not be written to correctly.\n" assert caplog.text.endswith(ending) def test_writer_clone(): src = RESOURCE_ROOT / "pdflatex-outline.pdf" reader = PdfReader(src) writer = PdfWriter(clone_from=reader) assert len(writer.pages) == 4 assert "PageObject" in str(type(writer.pages[0])) writer = PdfWriter(clone_from=src) assert len(writer.pages) == 4 assert "PageObject" in str(type(writer.pages[0])) def test_clone_metadata(): src = RESOURCE_ROOT / "pdflatex-outline.pdf" reader = PdfReader(src) writer = PdfWriter(clone_from=reader) writer.add_metadata({"/foo": "bar"}) assert writer.metadata == { **reader.metadata, "/foo": "bar", } writer = PdfWriter() writer.clone_document_from_reader(reader) writer.add_metadata({"/foo": "bar"}) assert writer.metadata == { **reader.metadata, "/foo": "bar", } writer.metadata = None writer.add_metadata({"/foo": "bar"}) assert writer.metadata == {"/foo": "bar"} writer = PdfWriter() writer.clone_reader_document_root(reader) writer.add_metadata({"/foo": "bar"}) assert writer.metadata == {"/foo": "bar"} def test_writer_clone_bookmarks(): # Arrange src = RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf" reader = PdfReader(src) writer = PdfWriter() # Act + test cat cat = "" def cat1(p) -> None: nonlocal cat cat += p.__repr__() writer.clone_document_from_reader(reader, cat1) assert 
"/Page" in cat assert writer.pages[0].raw_get("/Parent") == writer._pages writer.add_outline_item("Page 1", 0) writer.add_outline_item("Page 2", 1) # Assert bytes_stream = BytesIO() writer.write(bytes_stream) bytes_stream.seek(0) reader2 = PdfReader(bytes_stream) assert len(reader2.pages) == len(reader.pages) assert len(reader2.outline) == 2 # test with append writer = PdfWriter() writer.append(reader) writer.add_outline_item("Page 1", 0) writer.add_outline_item("Page 2", 1) # Assert bytes_stream = BytesIO() writer.write(bytes_stream) bytes_stream.seek(0) reader2 = PdfReader(bytes_stream) assert len(reader2.pages) == len(reader.pages) assert len(reader2.outline) == 2 def writer_operate(writer: PdfWriter) -> None: """ To test the writer that initialized by each of the four usages. Args: writer: A PdfWriter object """ pdf_path = RESOURCE_ROOT / "crazyones.pdf" pdf_outline_path = RESOURCE_ROOT / "pdflatex-outline.pdf" reader = PdfReader(pdf_path) reader_outline = PdfReader(pdf_outline_path) page = reader.pages[0] with pytest.raises(PageSizeNotDefinedError) as exc: writer.add_blank_page() assert exc.value.args == () writer.insert_page(page, 1) writer.insert_page(reader_outline.pages[0], 0) writer.add_outline_item_destination(page) writer.remove_links() writer.add_outline_item_destination(page) oi = writer.add_outline_item( "An outline item", 0, None, (255, 0, 15), True, True, Fit.fit_box_vertically(10) ) writer.add_outline_item( "The XYZ fit", 0, oi, (255, 0, 15), True, True, Fit.xyz(left=10, top=20, zoom=3) ) writer.add_outline_item( "The XYZ fit no args", 0, oi, (255, 0, 15), True, True, Fit.xyz() ) writer.add_outline_item( "The FitH fit", 0, oi, (255, 0, 15), True, True, Fit.fit_horizontally(top=10) ) writer.add_outline_item( "The FitV fit", 0, oi, (255, 0, 15), True, True, Fit.fit_vertically(left=10) ) writer.add_outline_item( "The FitR fit", 0, oi, (255, 0, 15), True, True, Fit.fit_rectangle(left=10, bottom=20, right=30, top=40), ) writer.add_outline_item( "The 
FitB fit", 0, oi, (255, 0, 15), True, True, Fit.fit_box() ) writer.add_outline_item( "The FitBH fit", 0, oi, (255, 0, 15), True, True, Fit.fit_box_horizontally(top=10), ) writer.add_outline_item( "The FitBV fit", 0, oi, (255, 0, 15), True, True, Fit.fit_box_vertically(left=10), ) writer.add_blank_page() writer.add_uri(2, "https://example.com", RectangleObject([0, 0, 100, 100])) writer.add_uri(2, "https://example.com", RectangleObject([0, 0, 100, 100])) writer.add_annotation( page_number=2, annotation=Link(target_page_index=1, rect=RectangleObject([0, 0, 100, 100])), ) assert writer._get_page_layout() is None writer.page_layout = "broken" assert writer.page_layout == "broken" writer.page_layout = NameObject("/SinglePage") assert writer._get_page_layout() == "/SinglePage" assert writer._get_page_mode() is None writer.page_mode = "/UseNone" assert writer._get_page_mode() == "/UseNone" writer.page_mode = NameObject("/UseOC") assert writer._get_page_mode() == "/UseOC" writer.insert_blank_page(width=100, height=100) page = writer.insert_blank_page(width=100) assert page.mediabox.height == 100 page = writer.insert_blank_page(height=100) assert page.mediabox.width == 100 writer.insert_blank_page() # without parameters writer.remove_images() writer.add_metadata(reader.metadata) writer.add_metadata({"/Author": "Martin Thoma"}) writer.add_metadata({"/MyCustom": 1234}) writer.add_attachment("foobar.gif", b"foobarcontent") # Check that every key in _idnum_hash is correct objects_hash = [o.hash_value() for o in writer._objects] for k, v in writer._idnum_hash.items(): assert v.pdf == writer assert k in objects_hash, f"Missing {v}" def test_insert_blank_page(): writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf") old_page_count = len(writer.pages) old_page = writer.pages[0] page = writer.insert_blank_page(index=0) assert len(writer.pages) == old_page_count + 1 assert page.mediabox.width == old_page.mediabox.width assert page.mediabox.height == old_page.mediabox.height 
old_page = writer.pages[0] page = writer.insert_blank_page(width=10, index=0) assert len(writer.pages) == old_page_count + 2 assert page.mediabox.width == 10 assert page.mediabox.height == old_page.mediabox.height old_page = writer.pages[0] page = writer.insert_blank_page(width=-10, index=0) assert len(writer.pages) == old_page_count + 3 assert page.mediabox.width == old_page.mediabox.width assert page.mediabox.height == old_page.mediabox.height old_page = writer.pages[0] page = writer.insert_blank_page(height=20, index=0) assert len(writer.pages) == old_page_count + 4 assert page.mediabox.width == old_page.mediabox.width assert page.mediabox.height == 20 old_page = writer.pages[0] page = writer.insert_blank_page(height=-20, index=0) assert len(writer.pages) == old_page_count + 5 assert page.mediabox.width == old_page.mediabox.width assert page.mediabox.height == old_page.mediabox.height page = writer.insert_blank_page(width=30, height=40, index=0) assert len(writer.pages) == old_page_count + 6 assert page.mediabox.width == 30 assert page.mediabox.height == 40 old_page = writer.pages[0] page = writer.insert_blank_page(width=-30, height=-40, index=0) assert len(writer.pages) == old_page_count + 7 assert page.mediabox.width == old_page.mediabox.width assert page.mediabox.height == old_page.mediabox.height page = writer.insert_blank_page(width=50, height=60, index=len(writer.pages)) assert len(writer.pages) == old_page_count + 8 assert page.mediabox.width == 50 assert page.mediabox.height == 60 old_page = writer.pages[0] page = writer.insert_blank_page(width=-50, height=-60, index=-len(writer.pages)) assert len(writer.pages) == old_page_count + 9 assert page.mediabox.width == old_page.mediabox.width assert page.mediabox.height == old_page.mediabox.height page = writer.insert_blank_page(width=70, height=80, index=len(writer.pages) // 2) assert len(writer.pages) == old_page_count + 10 assert page.mediabox.width == 70 assert page.mediabox.height == 80 page = 
writer.insert_blank_page(width=70, height=80, index=-len(writer.pages) // 2) assert len(writer.pages) == old_page_count + 11 assert page.mediabox.width == 70 assert page.mediabox.height == 80 num_pages = len(writer.pages) with pytest.raises( IndexError, match=re.escape(f"Index should be in range [-{num_pages}, {num_pages}]"), ): page = writer.insert_blank_page(width=90, height=100, index=len(writer.pages) + 1) with pytest.raises( IndexError, match=re.escape(f"Index should be in range [-{num_pages}, {num_pages}]"), ): page = writer.insert_blank_page(width=-90, height=-100, index=-len(writer.pages) - 1) @pytest.mark.parametrize( ("convert", "needs_cleanup"), [ (str, True), (Path, True), (BytesIO(), False), ], ) def test_writer_operations_by_traditional_usage(convert, needs_cleanup): write_data_here = _get_write_target(convert) writer = PdfWriter() writer_operate(writer) # finally, write "output" to pypdf-output.pdf if needs_cleanup: with open(write_data_here, "wb") as output_stream: writer.write(output_stream) else: output_stream = write_data_here writer.write(output_stream) if needs_cleanup: Path(write_data_here).unlink() @pytest.mark.parametrize( ("convert", "needs_cleanup"), [ (str, True), (Path, True), (BytesIO(), False), ], ) def test_writer_operations_by_semi_traditional_usage(convert, needs_cleanup): write_data_here = _get_write_target(convert) with PdfWriter() as writer: writer_operate(writer) # finally, write "output" to pypdf-output.pdf if needs_cleanup: with open(write_data_here, "wb") as output_stream: writer.write(output_stream) else: output_stream = write_data_here writer.write(output_stream) if needs_cleanup: Path(write_data_here).unlink() @pytest.mark.parametrize( ("convert", "needs_cleanup"), [ (str, True), (Path, True), (BytesIO(), False), ], ) def test_writer_operations_by_semi_new_traditional_usage(convert, needs_cleanup): write_data_here = _get_write_target(convert) with PdfWriter() as writer: writer_operate(writer) # finally, write "output" to 
pypdf-output.pdf writer.write(write_data_here) if needs_cleanup: Path(write_data_here).unlink() @pytest.mark.parametrize( ("convert", "needs_cleanup"), [ (str, True), (Path, True), (BytesIO(), False), ], ) def test_writer_operation_by_new_usage(convert, needs_cleanup): write_data_here = _get_write_target(convert) # This includes write "output" to pypdf-output.pdf with PdfWriter(write_data_here) as writer: writer_operate(writer) if needs_cleanup: Path(write_data_here).unlink() @pytest.mark.parametrize( "input_path", [ "side-by-side-subfig.pdf", "reportlab-inline-image.pdf", ], ) def test_remove_images(pdf_file_path, input_path): pdf_path = RESOURCE_ROOT / input_path reader = PdfReader(pdf_path) writer = PdfWriter() page = reader.pages[0] writer.insert_page(page, 0) writer.remove_images() page_contents_stream = writer.pages[0]["/Contents"]._data assert len(page_contents_stream.strip()) # finally, write "output" to pypdf-output.pdf with open(pdf_file_path, "wb") as output_stream: writer.write(output_stream) with open(pdf_file_path, "rb") as input_stream: reader = PdfReader(input_stream) if input_path == "side-by-side-subfig.pdf": extracted_text = reader.pages[0].extract_text() assert extracted_text assert "Lorem ipsum dolor sit amet" in extracted_text @pytest.mark.enable_socket def test_remove_images_sub_level(): """Cf #2035""" url = "https://github.com/py-pdf/pypdf/files/12394781/2210.03142-1.pdf" name = "iss2103.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) writer.remove_images() assert ( len( [ o.get_object() for o in writer.pages[0]["/Resources"]["/XObject"]["/Fm1"][ "/Resources" ]["/XObject"]["/Im1"]["/Resources"]["/XObject"].values() if not isinstance(o.get_object(), NullObject) ] ) == 0 ) @pytest.mark.parametrize( "input_path", [ "side-by-side-subfig.pdf", "reportlab-inline-image.pdf", ], ) def test_remove_text(input_path, pdf_file_path): pdf_path = RESOURCE_ROOT / input_path reader = PdfReader(pdf_path) writer = PdfWriter() 
page = reader.pages[0] writer.insert_page(page, 0) writer.remove_text() # finally, write "output" to pypdf-output.pdf with open(pdf_file_path, "wb") as output_stream: writer.write(output_stream) def test_remove_text_all_operators(pdf_file_path): stream = ( b"BT " b"/F0 36 Tf " b"50 706 Td " b"36 TL " b"(The Tj operator) Tj " b'1 2 (The double quote operator) " ' b"(The single quote operator) ' " b"ET" ) pdf_data = ( b"%%PDF-1.7\n" b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n" b"2 0 obj << >> endobj\n" b"3 0 obj << >> endobj\n" b"4 0 obj << /Length %d >>\n" b"stream\n" + (b"%s\n" % stream) + b"endstream\n" b"endobj\n" b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n" b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R" b" /Resources << /Font << >> >>" b" /Rotate 0 /Type /Page >> endobj\n" b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n" b"xref 1 6\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"%010d 00000 n\n" b"trailer << /Root 6 0 R /Size 6 >>\n" b"startxref\n%d\n" b"%%%%EOF" ) startx_correction = -1 pdf_data = pdf_data % ( len(stream), pdf_data.find(b"1 0 obj") + startx_correction, pdf_data.find(b"2 0 obj") + startx_correction, pdf_data.find(b"3 0 obj") + startx_correction, pdf_data.find(b"4 0 obj") + startx_correction, pdf_data.find(b"5 0 obj") + startx_correction, pdf_data.find(b"6 0 obj") + startx_correction, # startx_correction should be -1 due to double % at the beginning # inducing an error on startxref computation pdf_data.find(b"xref"), ) pdf_stream = BytesIO(pdf_data) reader = PdfReader(pdf_stream, strict=False) writer = PdfWriter() page = reader.pages[0] writer.insert_page(page, 0) writer.remove_text() # finally, write "output" to pypdf-output.pdf with open(pdf_file_path, "wb") as output_stream: writer.write(output_stream) def test_write_metadata(pdf_file_path): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) writer = PdfWriter() 
@pytest.mark.parametrize(
    ("use_128bit", "user_password", "owner_password"),
    [(True, "userpwd", "ownerpwd"), (False, "userpwd", "ownerpwd")],
)
def test_encrypt(use_128bit, user_password, owner_password, pdf_file_path):
    """
    Encrypt a one-page document and read it back with each password.

    Args:
        use_128bit: Forwarded to ``PdfWriter.encrypt``.
        user_password: User password, as ``str``.
        owner_password: Owner password, as ``str``.
        pdf_file_path: Path the encrypted file is written to.
    """
    reader = PdfReader(RESOURCE_ROOT / "form.pdf")
    writer = PdfWriter()

    page = reader.pages[0]
    orig_text = page.extract_text()

    writer.add_page(page)

    # NOTE(review): encrypt() is called twice with the same settings,
    # presumably to cover encrypting an already encrypted writer - confirm.
    writer.encrypt(
        owner_password=owner_password,
        user_password=user_password,
        use_128bit=use_128bit,
    )
    writer.encrypt(
        user_password=user_password,
        owner_password=owner_password,
        use_128bit=use_128bit,
    )

    # write "output" to pypdf-output.pdf
    with open(pdf_file_path, "wb") as output_stream:
        writer.write(output_stream)

    # Test that the data is not there in clear text
    with open(pdf_file_path, "rb") as input_stream:
        data = input_stream.read()
    assert b"foo" not in data

    # Both passwords must open the file, given as str and as bytes. The
    # parametrized values are used (rather than repeated literals) so these
    # checks keep matching whatever passwords the test was called with.
    for password in (
        user_password,
        owner_password,
        user_password.encode(),
        owner_password.encode(),
    ):
        reader = PdfReader(pdf_file_path, password=password)
        new_text = reader.pages[0].extract_text()
        assert reader.metadata.get("/Producer") == "pypdf"
        assert new_text == orig_text
def test_add_named_destination(pdf_file_path):
    """Add named destinations and inspect the resulting named destination root."""
    reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf")
    writer = PdfWriter()
    assert writer.get_named_dest_root() == []

    for source_page in reader.pages:
        writer.add_page(source_page)

    # Adding pages alone does not create any named destination.
    assert writer.get_named_dest_root() == []

    writer.add_named_destination(TextStringObject("A named dest"), 2)
    writer.add_named_destination(TextStringObject("A named dest2"), 2)
    writer.add_named_destination(TextStringObject("A named dest3"), page_number=2)
    writer.add_named_destination(TextStringObject("A named dest3"), page_number=2)

    root = writer.get_named_dest_root()
    # The root alternates names and references: name at i, reference at i + 1.
    for name_index, expected_name in ((0, "A named dest"), (2, "A named dest2")):
        assert root[name_index] == expected_name
        action_ref = root[name_index + 1]
        assert action_ref.pdf == writer
        assert action_ref.get_object()["/S"] == NameObject("/GoTo")
        assert action_ref.get_object()["/D"][0] == writer.pages[2].indirect_reference
    assert root[4] == "A named dest3"

    # get_object accepts an object number as well as a reference ...
    assert writer.get_object(root[1].idnum) == writer.get_object(root[1])
    # ... but refuses a reference that belongs to the reader.
    with pytest.raises(ValueError) as exc:
        writer.get_object(reader.pages[0].indirect_reference)
    assert exc.value.args[0] == "PDF must be self"

    with open(pdf_file_path, "wb") as sink:
        writer.write(sink)
def test_add_uri(pdf_file_path):
    """Add URI links with RectangleObject, string and list rectangles, then write the file."""
    reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf")
    writer = PdfWriter()

    for source_page in reader.pages:
        writer.add_page(source_page)

    annotations_doc = (
        "https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html"
    )
    # (page number, uri, rectangle, border)
    link_specs = [
        (
            1,
            "http://www.example.com",
            RectangleObject([0, 0, 100, 100]),
            [1, 2, 3, [4]],
        ),
        (
            2,
            "https://pypdf.readthedocs.io/en/latest/",
            RectangleObject([20, 30, 50, 80]),
            [1, 2, 3],
        ),
        (3, annotations_doc, "[ 200 300 250 350 ]", [0, 0, 0]),
        (3, annotations_doc, [100, 200, 150, 250], [0, 0, 0]),
    ]
    for page_number, uri, rect, border in link_specs:
        writer.add_uri(page_number, uri, rect, border=border)

    with open(pdf_file_path, "wb") as sink:
        writer.write(sink)
def test_io_streams():
    """This is the example from the docs ("Streaming data")."""
    source_path = RESOURCE_ROOT / "pdflatex-outline.pdf"
    in_memory_pdf = BytesIO(source_path.read_bytes())

    # Reading works from an in-memory stream.
    reader = PdfReader(in_memory_pdf)
    assert len(reader.pages) == 4

    # Writing works to an in-memory stream, too.
    writer = PdfWriter()
    with BytesIO() as sink:
        writer.write(sink)
def test_pdf_header():
    """Check the header of a fresh writer, after adding a page, and after assigning bytes."""
    writer = PdfWriter()
    initial_header = writer.pdf_header
    assert initial_header == "%PDF-1.3"

    source = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    writer.add_page(source.pages[0])
    header_after_page = writer.pdf_header
    assert header_after_page == "%PDF-1.5"

    # The header is assigned as bytes and read back as str.
    writer.pdf_header = b"%PDF-1.6"
    header_after_assignment = writer.pdf_header
    assert header_after_assignment == "%PDF-1.6"
@pytest.mark.samples
def test_colors_in_outline_item(pdf_file_path):
    """Outline colors given as hex string, "#"-prefixed hex string or RGB tuple read back equal."""
    source = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf")
    writer = PdfWriter()
    writer.clone_document_from_reader(source)

    purple_rgb = (0.5019607843137255, 0.0, 0.5019607843137255)
    # (title, page number, color)
    outline_specs = (
        ("First Outline Item", 2, "800080"),
        ("Second Outline Item", 3, "#800080"),
        ("Third Outline Item", 4, purple_rgb),
    )
    for title, page_number, color in outline_specs:
        writer.add_outline_item(title, page_number=page_number, color=color)

    with open(pdf_file_path, "wb") as sink:
        writer.write(sink)

    # Colors are compared as strings formatted to five decimals.
    expected_color = [f"{component:.5f}" for component in purple_rgb]
    reread = PdfReader(pdf_file_path)
    for item in reread.outline:
        actual_color = [f"{component:.5f}" for component in item.color]
        assert actual_color == expected_color
def test_startup_dest():
    """
    Exercise the ``open_destination`` property of ``PdfWriter``.

    Covers assigning a page, a destination object, a string and ``None``,
    and reading the property back after ``/OpenAction`` was modified
    directly in the root object.
    """
    pdf_file_writer = PdfWriter()
    pdf_file_writer.append_pages_from_reader(PdfReader(RESOURCE_ROOT / "issue-604.pdf"))

    # No open destination until one is assigned.
    assert pdf_file_writer.open_destination is None
    pdf_file_writer.open_destination = pdf_file_writer.pages[9]
    # also checked using Acrobat to verify the right page is opened
    op = pdf_file_writer.root_object["/OpenAction"]
    assert op[0] == pdf_file_writer.pages[9].indirect_reference
    assert op[1] == "/Fit"
    op = pdf_file_writer.open_destination
    assert op.raw_get("/Page") == pdf_file_writer.pages[9].indirect_reference
    assert op["/Type"] == "/Fit"
    # Assigning the value that was just read back must round-trip.
    pdf_file_writer.open_destination = op
    assert pdf_file_writer.open_destination == op

    # irrelevant, just for coverage
    pdf_file_writer.root_object[NameObject("/OpenAction")][0] = NumberObject(0)
    pdf_file_writer.open_destination
    # With the first /OpenAction element deleted, reading the property raises.
    with pytest.raises(Exception) as exc:
        del pdf_file_writer.root_object[NameObject("/OpenAction")][0]
        pdf_file_writer.open_destination
    assert "Invalid Destination" in str(exc.value)

    pdf_file_writer.open_destination = "Test"
    # also checked using Acrobat to verify open_destination
    op = pdf_file_writer.root_object["/OpenAction"]
    assert isinstance(op, TextStringObject)
    assert op == "Test"
    op = pdf_file_writer.open_destination
    assert isinstance(op, TextStringObject)
    assert op == "Test"

    # irrelevant, this is just for coverage
    pdf_file_writer.root_object[NameObject("/OpenAction")] = NumberObject(0)
    assert pdf_file_writer.open_destination is None
    pdf_file_writer.open_destination = None
    assert "/OpenAction" not in pdf_file_writer.root_object
    # Assigning None again, now that /OpenAction is absent.
    pdf_file_writer.open_destination = None
def test_threads_empty():
    """A fresh writer exposes an empty ``threads`` array; a second read compares equal."""
    writer = PdfWriter()

    first_read = writer.threads
    assert isinstance(first_read, ArrayObject)
    assert not len(first_read)

    second_read = writer.threads
    assert first_read == second_read
@pytest.mark.samples
def test_set_page_label(pdf_file_path):
    """
    Exercise ``PdfWriter.set_page_label`` on three input files.

    Each scenario sets labels on a fresh writer, writes it to
    ``pdf_file_path`` and compares the ``page_labels`` read back from that
    file with the expected list. Invalid arguments are checked to raise
    ``ValueError`` with the matching message.
    """
    src = RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"  # File without labels
    reader = PdfReader(src)

    expected = [
        "i",
        "ii",
        "1",
        "2",
        "A",
        "B",
        "1",
        "2",
        "3",
        "4",
        "A",
        "i",
        "I",
        "II",
        "1",
        "2",
        "3",
        "I",
        "II",
    ]

    # Tests full length with labels assigned at first and last elements
    # Tests different labels assigned to consecutive ranges
    writer = PdfWriter(reader, full=True)
    writer.set_page_label(0, 1, "/r")
    writer.set_page_label(4, 5, "/A")
    writer.set_page_label(10, 10, "/A")
    writer.set_page_label(11, 11, "/r")
    writer.set_page_label(12, 13, "/R")
    writer.set_page_label(17, 18, "/R")
    writer.write(pdf_file_path)
    assert PdfReader(pdf_file_path).page_labels == expected

    writer = PdfWriter()  # Same labels, different set order
    writer.clone_document_from_reader(reader)
    writer.set_page_label(17, 18, "/R")
    writer.set_page_label(4, 5, "/A")
    writer.set_page_label(10, 10, "/A")
    writer.set_page_label(0, 1, "/r")
    writer.set_page_label(12, 13, "/R")
    writer.set_page_label(11, 11, "/r")
    writer.write(pdf_file_path)
    assert PdfReader(pdf_file_path).page_labels == expected

    # Tests labels assigned only in the middle
    # Tests label assigned to a range already containing labelled ranges
    expected = ["1", "2", "i", "ii", "iii", "iv", "v", "1"]
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.set_page_label(3, 4, "/a")
    writer.set_page_label(5, 5, "/A")
    writer.set_page_label(2, 6, "/r")
    writer.write(pdf_file_path)
    # Only the first len(expected) labels are compared.
    assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected

    # Tests labels assigned inside a previously existing range
    expected = ["1", "2", "i", "a", "b", "A", "1", "1", "2"]
    # Ones repeat because user did not cover the entire original range
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.set_page_label(2, 6, "/r")
    writer.set_page_label(3, 4, "/a")
    writer.set_page_label(5, 5, "/A")
    writer.write(pdf_file_path)
    assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected

    # Tests invalid user input
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    with pytest.raises(
        ValueError, match="At least one of style and prefix must be given"
    ):
        writer.set_page_label(0, 5, start=2)
    with pytest.raises(
        ValueError, match="page_index_from must be greater or equal than 0"
    ):
        writer.set_page_label(-1, 5, "/r")
    with pytest.raises(
        ValueError, match="page_index_to must be greater or equal than page_index_from"
    ):
        writer.set_page_label(5, 0, "/r")
    with pytest.raises(ValueError, match="page_index_to exceeds number of pages"):
        writer.set_page_label(0, 19, "/r")
    with pytest.raises(
        ValueError, match="If given, start must be greater or equal than one"
    ):
        writer.set_page_label(0, 5, "/r", start=-1)

    # Remove the output file before switching to the next input document.
    pdf_file_path.unlink()

    src = (
        SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf"
    )  # File with pre existing labels
    reader = PdfReader(src)

    # Tests adding labels to existing ones
    expected = ["i", "ii", "A", "B", "1"]
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.set_page_label(2, 3, "/A")
    writer.write(pdf_file_path)
    assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected

    # Tests replacing existing labels
    expected = ["A", "B", "1", "1", "2"]
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.set_page_label(0, 1, "/A")
    writer.write(pdf_file_path)
    assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected

    pdf_file_path.unlink()

    # Tests prefix and start.
    src = RESOURCE_ROOT / "issue-604.pdf"  # File without page labels
    reader = PdfReader(src)
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)

    writer.set_page_label(0, 0, prefix="FRONT")
    writer.set_page_label(1, 2, "/D", start=2)
    writer.set_page_label(3, 6, prefix="UPDATES")
    writer.set_page_label(7, 10, "/D", prefix="THYR-")
    writer.set_page_label(11, 21, "/D", prefix="PAP-")
    writer.set_page_label(22, 30, "/D", prefix="FOLL-")
    writer.set_page_label(31, 39, "/D", prefix="HURT-")

    # This last scenario only writes the file; nothing is read back.
    writer.write(pdf_file_path)
@pytest.mark.enable_socket
def test_new_removes():
    """
    Exercise the removal helpers of ``PdfWriter``.

    Covers ``remove_images``, ``remove_text`` (with and without
    ``font_names``), ``remove_objects_from_page`` with several
    ``ObjectDeletionFlag`` values, ``remove_links`` and
    ``remove_annotations``.
    """
    url = "https://github.com/py-pdf/pypdf/files/10807951/tt.pdf"
    name = "iss1650.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    # Removing images drops the /Im0 drawing but keeps the form and the text.
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.remove_images()
    b = BytesIO()
    writer.write(b)
    bb = bytes(b.getbuffer())
    assert b"/Im0 Do" not in bb
    assert b"/Fm0 Do" in bb
    assert b" TJ" in bb

    # Removing text keeps the image but drops the text operators.
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.remove_text()
    b = BytesIO()
    writer.write(b)
    bb = bytes(b.getbuffer())
    assert b"/Im0" in bb
    assert b"Chap" not in bb
    assert b" TJ" not in bb

    # Test removing text in a specified font
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    b = BytesIO()
    writer.write(b)
    temp_reader = PdfReader(b)
    text = temp_reader.pages[0].extract_text()
    assert "Arbeitsschritt" in text
    assert "Modelltechnik" in text
    writer.remove_text(font_names=["LiberationSans-Bold"])
    b = BytesIO()
    writer.write(b)
    temp_reader = PdfReader(b)
    text = temp_reader.pages[0].extract_text()
    assert "Arbeitsschritt" not in text
    assert "Modelltechnik" in text

    # Test removing text in a specified font that doesn't exist (nothing should happen)
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    b = BytesIO()
    writer.write(b)
    temp_reader = PdfReader(b)
    text = temp_reader.pages[0].extract_text()
    assert "Arbeitsschritt" in text
    assert "Modelltechnik" in text
    writer.remove_text(font_names=["ComicSans-Oblique"])
    b = BytesIO()
    writer.write(b)
    temp_reader = PdfReader(b)
    text = temp_reader.pages[0].extract_text()
    assert "Arbeitsschritt" in text
    assert "Modelltechnik" in text

    # A second document is appended to the last writer for the annotation checks.
    url = "https://github.com/py-pdf/pypdf/files/10832029/tt2.pdf"
    name = "GeoBaseWithComments.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    writer.append(reader)
    writer.remove_objects_from_page(writer.pages[0], [ObjectDeletionFlag.LINKS])
    # The subtype of link annotations is "/Link"; the former "/Links" matches
    # no annotation subtype, so that assertion could never fail.
    assert "/Link" not in [
        a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]
    ]
    writer.remove_objects_from_page(writer.pages[0], ObjectDeletionFlag.ATTACHMENTS)
    assert "/FileAttachment" not in [
        a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]
    ]

    # Add a "/3D" annotation by hand so that its removal can be observed.
    writer.pages[0]["/Annots"].append(
        DictionaryObject({NameObject("/Subtype"): TextStringObject("/3D")})
    )
    assert "/3D" in [a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]]
    writer.remove_objects_from_page(writer.pages[0], ObjectDeletionFlag.OBJECTS_3D)
    assert "/3D" not in [a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]]

    writer.remove_links()
    assert len(writer.pages[0]["/Annots"]) == 0
    assert len(writer.pages[3]["/Annots"]) == 0

    # Only checked to run without raising.
    writer.remove_annotations("/Text")
def test_update_form_fields(caplog, tmp_path):
    """
    Exercise ``update_page_form_field_values`` on ``FormTestFromOo.pdf``.

    First part: fill fields of page 0 over several calls, write the file and
    check the values, appearance streams and ``/_States_`` read back.
    Second part: on a fresh clone, remove ``/DA`` and the ``/DR`` font of a
    field, fill text the font encoding cannot represent, and check both the
    appearance stream and the logged warning.
    """
    write_data_here = tmp_path / "out.pdf"
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "FormTestFromOo.pdf")
    writer.update_page_form_field_values(
        writer.pages[0],
        {
            "CheckBox1": "/Yes",
            "Text1": "mon Text1",
            "Text2": "ligne1\nligne2",
            "RadioGroup1": "/2",
            "RdoS1": "/",
            "Combo1": "!!monCombo!!",
            "Liste1": "Liste2",
            "Liste2": ["Lst1", "Lst3"],
            "DropList1": "DropListe3",
        },
        auto_regenerate=False,
        flatten=True,
    )
    # Drop an existing appearance stream and the page fonts before updating again.
    del writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"]
    del writer.pages[0]["/Resources"]["/Font"]
    writer.update_page_form_field_values(
        writer.pages[0],
        {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"},
        auto_regenerate=False,
    )
    # None values: the assertions below still expect the previously set text.
    writer.update_page_form_field_values(
        writer.pages[0],
        {"Text1": None, "Text2": None},
        auto_regenerate=False,
        flatten=True,
    )

    writer.write(write_data_here)
    reader = PdfReader(write_data_here)
    flds = reader.get_fields()
    assert flds["CheckBox1"]["/V"] == "/Yes"
    assert flds["CheckBox1"].indirect_reference.get_object()["/AS"] == "/Yes"
    assert (
        b"(my Text1)"
        in flds["Text1"].indirect_reference.get_object()["/AP"]["/N"].get_data()
    )
    assert flds["Text2"]["/V"] == "ligne1\nligne2\nligne3"
    assert (
        b"(ligne3)"
        in flds["Text2"].indirect_reference.get_object()["/AP"]["/N"].get_data()
    )
    assert flds["RadioGroup1"]["/V"] == "/2"
    assert flds["RadioGroup1"]["/Kids"][0].get_object()["/AS"] == "/Off"
    assert flds["RadioGroup1"]["/Kids"][1].get_object()["/AS"] == "/2"
    assert all(x in flds["Liste2"]["/V"] for x in ["Lst1", "Lst3"])

    assert all(x in flds["CheckBox1"]["/_States_"] for x in ["/Off", "/Yes"])
    assert all(x in flds["RadioGroup1"]["/_States_"] for x in ["/1", "/2", "/3"])
    assert all(x in flds["Liste1"]["/_States_"] for x in ["Liste1", "Liste2", "Liste3"])

    # Second part: start again from a fresh clone of the form.
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "FormTestFromOo.pdf")
    writer.add_annotation(
        page_number=0,
        annotation=Link(target_page_index=1, rect=RectangleObject([0, 0, 100, 100])),
    )
    # A blank page is inserted at index 0, so the form page becomes pages[1].
    writer.insert_blank_page(100, 100, 0)
    del writer.root_object["/AcroForm"]["/Fields"][1].get_object()["/DA"]
    del writer.root_object["/AcroForm"]["/Fields"][1].get_object()["/DR"]["/Font"]
    writer.update_page_form_field_values(
        [writer.pages[0], writer.pages[1]],
        {"Text1": "!مرحبا بالعالم", "Text2": "ligne1\nligne2\nligne3"},
        auto_regenerate=False,
    )
    assert b"/Helvetica " in writer.pages[1]["/Annots"][1]["/AP"]["/N"].get_data()
    assert "Text string '!مرحبا بالعالم' contains characters not supported by font encoding." in caplog.text
    # Passing None as the page argument is exercised as well.
    writer.update_page_form_field_values(
        None,
        {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"},
        auto_regenerate=False,
        flatten=True
    )

    Path(write_data_here).unlink()
The test_add_apstream_object code already correctly checks that _merge_content_stream_to_page works for an emtpy page. """ writer = PdfWriter() page = writer.add_blank_page(100, 100) new_content = b"BT /F1 12 Tf (Hello World) Tj ET" # Call the method under test writer._merge_content_stream_to_page(page, new_content) more_content = b"BT /F1 12 Tf (Hello Again, World) Tj ET" writer._merge_content_stream_to_page(page, more_content) contents_obj = page[NameObject("/Contents")] stream = contents_obj.get_object() assert isinstance(stream, StreamObject) assert stream.get_data() == b"BT /F1 12 Tf (Hello World) Tj ET\nBT /F1 12 Tf (Hello Again, World) Tj ET" new_stream_obj = StreamObject() new_stream_obj.set_data(new_content) content = ArrayObject() content.append(new_stream_obj) page[NameObject("/Contents")] = writer._add_object(content) writer._merge_content_stream_to_page(page, more_content) contents_obj = page[NameObject("/Contents")] array = contents_obj.get_object() assert isinstance(array, ArrayObject) contents = page[NameObject("/Contents")].get_object() assert contents[0].get_object().get_data() == new_content assert contents[1].get_object().get_data() == more_content @pytest.mark.enable_socket def test_update_form_fields2(caplog): my_files = { "test1": { "name": "Test1 Form", "url": "https://github.com/py-pdf/pypdf/files/14817365/test1.pdf", "path": "iss2234a.pdf", "usage": { "fields": { "First Name": "Reed", "Middle Name": "R", "MM": "04", "DD": "21", "YY": "24", "Initial": "RRG", # "I DO NOT Agree": null, # "Last Name": null }, }, }, "test2": { "name": "Test2 Form", "url": "https://github.com/py-pdf/pypdf/files/14817366/test2.pdf", "path": "iss2234b.pdf", "usage": { "fields": { "p2 First Name": "Joe", "p2 Middle Name": "S", "p2 MM": "03", "p2 DD": "31", "p2 YY": "24", "Initial": "JSS", # "p2 I DO NOT Agree": "null", "p2 Last Name": "Smith", "p3 First Name": "شهرزاد", "p3 Middle Name": "R", "p3 MM": "01", "p3 DD": "25", "p3 YY": "21", }, }, }, } merger = 
PdfWriter() for file in my_files: reader = PdfReader( BytesIO(get_data_from_url(my_files[file]["url"], name=my_files[file]["path"])) ) reader.add_form_topname(file) writer = PdfWriter(clone_from=reader) writer.update_page_form_field_values( None, my_files[file]["usage"]["fields"], auto_regenerate=True ) merger.append(writer) assert merger.get_form_text_fields(True) == { "test1.First Name": "Reed", "test1.Middle Name": "R", "test1.MM": "04", "test1.DD": "21", "test1.YY": "24", "test1.Initial": "RRG", "test1.I DO NOT Agree": None, "test1.Last Name": None, "test2.p2 First Name": "Joe", "test2.p2 Middle Name": "S", "test2.p2 MM": "03", "test2.p2 DD": "31", "test2.p2 YY": "24", "test2.Initial": "JSS", "test2.p2 I DO NOT Agree": None, "test2.p2 Last Name": "Smith", "test2.p3 First Name": "شهرزاد", "test2.p3 Middle Name": "R", "test2.p3 MM": "01", "test2.p3 DD": "25", "test2.p3 YY": "21", } assert "Text string 'شهرزاد' contains characters not supported by font encoding." in caplog.text @pytest.mark.enable_socket def test_iss1862(): # The file here has "/B" entry to define the font in a object below the page # The excluded field shall be considered only at first level (page) and not # below url = "https://github.com/py-pdf/pypdf/files/11708801/intro.pdf" name = "iss1862.pdf" writer = PdfWriter() writer.append(BytesIO(get_data_from_url(url, name=name))) # check that "/B" is in the font writer.pages[0]["/Resources"]["/Font"]["/F1"]["/CharProcs"]["/B"].get_data() def test_empty_objects_before_cloning(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) writer = PdfWriter(clone_from=reader) nb_obj_reader = len(reader.xref_objStm) + sum( len(reader.xref[i]) for i in reader.xref ) nb_obj_reader -= 1 # for trailer nb_obj_reader -= len( {x: 1 for x, y in reader.xref_objStm.values()} ) # to remove object streams assert len(writer._objects) == nb_obj_reader @pytest.mark.enable_socket def test_watermark(): url = 
"https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" name = "bgwatermark.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" name = "srcwatermark.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) for p in writer.pages: p.merge_page(reader.pages[0], over=False) assert isinstance(p["/Contents"], ArrayObject) assert isinstance(p["/Contents"][0], IndirectObject) b = BytesIO() writer.write(b) assert len(b.getvalue()) < 2.1 * 1024 * 1024 @pytest.mark.enable_socket @pytest.mark.timeout(4) def test_watermarking_speed(): url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" name = "bgwatermark.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://arxiv.org/pdf/2201.00214.pdf" name = "2201.00214.pdf" writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) for p in writer.pages: p.merge_page(reader.pages[0], over=False) out_pdf_bytesio = BytesIO() writer.write(out_pdf_bytesio) pdf_size_in_mib = len(out_pdf_bytesio.getvalue()) / 1024 / 1024 assert pdf_size_in_mib < 20 @pytest.mark.enable_socket @pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript") def test_watermark_rendering(tmp_path): """Ensure the visual appearance of watermarking stays correct.""" url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" name = "bgwatermark.pdf" watermark = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0] url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" name = "srcwatermark.pdf" page = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0] writer = PdfWriter() page = writer.add_page(page) page.merge_page(watermark, over=False) target_png_path = tmp_path / "target.png" url = "https://github.com/py-pdf/pypdf/assets/96178532/d5c72d0e-7047-4504-bbf6-bc591c80d7c0" name = "dstwatermark.png" target_png_path.write_bytes(get_data_from_url(url, name=name)) pdf_path = 
tmp_path / "out.pdf" png_path = tmp_path / "out.png" writer.write(pdf_path) # False positive: https://github.com/PyCQA/bandit/issues/333 subprocess.run( # noqa: S603 [ GHOSTSCRIPT_BINARY, "-sDEVICE=pngalpha", "-o", png_path, pdf_path, ] ) assert png_path.is_file() assert image_similarity(png_path, target_png_path) >= 0.95 @pytest.mark.samples @pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript") def test_watermarking_reportlab_rendering(tmp_path): """ This test is showing a rotated+mirrored watermark in pypdf==3.15.4. Replacing the generate_base with e.g. the crazyones did not show the issue. """ base_path = SAMPLE_ROOT / "022-pdfkit/pdfkit.pdf" watermark_path = SAMPLE_ROOT / "013-reportlab-overlay/reportlab-overlay.pdf" reader = PdfReader(base_path) base_page = reader.pages[0] watermark = PdfReader(watermark_path).pages[0] writer = PdfWriter() base_page = writer.add_page(base_page) base_page.merge_page(watermark) target_png_path = RESOURCE_ROOT / "test_watermarking_reportlab_rendering.png" pdf_path = tmp_path / "out.pdf" png_path = tmp_path / "test_watermarking_reportlab_rendering.png" writer.write(pdf_path) # False positive: https://github.com/PyCQA/bandit/issues/333 subprocess.run( # noqa: S603 [ GHOSTSCRIPT_BINARY, "-r120", "-sDEVICE=pngalpha", "-o", png_path, pdf_path, ] ) assert png_path.is_file() assert image_similarity(png_path, target_png_path) >= 0.999 @pytest.mark.enable_socket def test_da_missing_in_annot(): url = "https://github.com/py-pdf/pypdf/files/12136285/Building.Division.Permit.Application.pdf" name = "BuildingDivisionPermitApplication.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter(clone_from=reader) writer.update_page_form_field_values( writer.pages[0], {"PCN-1": "0"}, auto_regenerate=False ) b = BytesIO() writer.write(b) reader = PdfReader(BytesIO(b.getvalue())) ff = reader.get_fields() # check for autosize processing assert ( b" 0 Tf" not in 
ff["PCN-1"].indirect_reference.get_object()["/AP"]["/N"].get_data() ) f2 = writer.get_object(ff["PCN-2"].indirect_reference.idnum) f2[NameObject("/Parent")] = writer.get_object( ff["PCN-1"].indirect_reference.idnum ).indirect_reference writer.update_page_form_field_values( writer.pages[0], {"PCN-2": "1"}, auto_regenerate=False ) def test_missing_fields(pdf_file_path): reader = PdfReader(RESOURCE_ROOT / "form.pdf") writer = PdfWriter() writer.add_page(reader.pages[0]) with pytest.raises(PyPdfError) as exc: writer.update_page_form_field_values( writer.pages[0], {"foo": "some filled in text"}, flags=1 ) assert exc.value.args[0] == "No /AcroForm dictionary in PDF of PdfWriter Object" writer = PdfWriter() writer.append(reader, [0]) del writer.root_object["/AcroForm"]["/Fields"] with pytest.raises(PyPdfError) as exc: writer.update_page_form_field_values( writer.pages[0], {"foo": "some filled in text"}, flags=1 ) assert exc.value.args[0] == "No /Fields dictionary in PDF of PdfWriter Object" def test_missing_info(): reader = PdfReader(RESOURCE_ROOT / "missing_info.pdf") writer = PdfWriter(clone_from=reader) assert len(writer.pages) == len(reader.pages) assert writer.metadata is None b = BytesIO() writer.write(b) assert b"/Info" not in b.getvalue() reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") writer.metadata = reader.metadata assert dict(writer._info) == dict(reader._info) assert writer.metadata == reader.metadata b = BytesIO() writer.write(b) assert b"/Info" in b.getvalue() writer.metadata = {} writer._info = DictionaryObject() # for code coverage b = BytesIO() writer.write(b) assert b"/Info" in b.getvalue() assert writer.metadata == {} writer.metadata = None writer.metadata = None # for code coverage assert writer.metadata is None assert PdfWriter().metadata == {"/Producer": "pypdf"} b = BytesIO() writer.write(b) assert b"/Info" not in b.getvalue() @pytest.mark.enable_socket def test_germanfields(): """Cf #2035""" url = 
"https://github.com/py-pdf/pypdf/files/12194195/test.pdf" name = "germanfields.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter(clone_from=reader) form_fields = {"Text Box 1": "test æ ø å"} writer.update_page_form_field_values( writer.pages[0], form_fields, auto_regenerate=False ) bytes_stream = BytesIO() writer.write(bytes_stream) bytes_stream.seek(0) reader2 = PdfReader(bytes_stream) assert ( b"test \xe6 \xf8 \xe5" in reader2.get_fields()["Text Box 1"] .indirect_reference.get_object()["/AP"]["/N"] .get_data() ) @pytest.mark.enable_socket def test_no_t_in_articles(): """Cf #2078""" url = "https://github.com/py-pdf/pypdf/files/12311735/bad.pdf" name = "iss2078.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader) @pytest.mark.enable_socket def test_no_i_in_articles(): """Cf #2089""" url = "https://github.com/py-pdf/pypdf/files/12352793/kim2002.pdf" name = "iss2089.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader) @pytest.mark.enable_socket def test_damaged_pdf_length_returning_none(): """ Cf #140 https://github.com/py-pdf/pypdf/issues/140#issuecomment-1685380549 """ url = "https://github.com/py-pdf/pypdf/files/12168578/bad_pdf_example.pdf" name = "iss140_bad_pdf.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader) @pytest.mark.enable_socket def test_viewerpreferences(): """Add Tests for ViewerPreferences""" url = "https://github.com/py-pdf/pypdf/files/9175966/2015._pb_decode_pg0.pdf" name = "2015._pb_decode_pg0.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) v = reader.viewer_preferences assert v.center_window == True # noqa: E712 writer = PdfWriter(clone_from=reader) v = writer.viewer_preferences assert v.center_window == True # noqa: E712 v.center_window = False assert ( writer.root_object["/ViewerPreferences"]["/CenterWindow"] == 
False # noqa: E712 ) assert v.print_area == "/CropBox" with pytest.raises(ValueError): v.non_fullscreen_pagemode = "toto" with pytest.raises(ValueError): v.non_fullscreen_pagemode = "/toto" v.non_fullscreen_pagemode = "/UseOutlines" assert ( writer.root_object["/ViewerPreferences"]["/NonFullScreenPageMode"] == "/UseOutlines" ) writer = PdfWriter(clone_from=reader) v = writer.viewer_preferences assert v.center_window == True # noqa: E712 v.center_window = False assert ( writer.root_object["/ViewerPreferences"]["/CenterWindow"] == False # noqa: E712 ) writer = PdfWriter(clone_from=reader) writer.root_object[NameObject("/ViewerPreferences")] = writer._add_object( writer.root_object["/ViewerPreferences"] ) v = writer.viewer_preferences v.center_window = False assert ( writer.root_object["/ViewerPreferences"]["/CenterWindow"] == False # noqa: E712 ) v.num_copies = 1 assert v.num_copies == 1 assert v.print_pagerange is None with pytest.raises(ValueError): v.print_pagerange = "toto" v.print_pagerange = ArrayObject() assert len(v.print_pagerange) == 0 writer.create_viewer_preferences() assert len(writer.root_object["/ViewerPreferences"]) == 0 writer.viewer_preferences.direction = "/R2L" assert len(writer.root_object["/ViewerPreferences"]) == 1 assert writer.viewer_preferences.enforce == [] assert "/Enforce" not in writer.viewer_preferences writer.viewer_preferences.enforce += writer.viewer_preferences.PRINT_SCALING assert writer.viewer_preferences["/Enforce"] == ["/PrintScaling"] writer.viewer_preferences.enforce = None assert "/Enforce" not in writer.viewer_preferences writer.viewer_preferences.enforce = None del reader.trailer["/Root"]["/ViewerPreferences"] assert reader.viewer_preferences is None writer = PdfWriter(clone_from=reader) assert writer.viewer_preferences is None def test_extra_spaces_in_da_text(caplog): writer = PdfWriter(clone_from=RESOURCE_ROOT / "form.pdf") t = writer.pages[0]["/Annots"][0].get_object()["/DA"] t = t.replace("/Helv", "/Helv ") 
writer.pages[0]["/Annots"][0].get_object()[NameObject("/DA")] = TextStringObject(t) writer.update_page_form_field_values( writer.pages[0], {"foo": "abcd"}, auto_regenerate=False ) t = writer.pages[0]["/Annots"][0].get_object()["/AP"]["/N"].get_data() assert "Font dictionary for not found." not in caplog.text assert b"/Helv" in t assert b"(abcd)" in t @pytest.mark.enable_socket def test_object_contains_indirect_reference_to_self(): url = "https://github.com/py-pdf/pypdf/files/12389243/testbook.pdf" name = "iss2102.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() width, height = 595, 841 outpage = writer.add_blank_page(width, height) outpage.merge_page(reader.pages[6]) writer.append(reader) def test_remove_image_per_type(): writer = PdfWriter(clone_from=RESOURCE_ROOT / "reportlab-inline-image.pdf") writer.remove_images(ImageType.INLINE_IMAGES) assert all( x not in writer.pages[0].get_contents().get_data() for x in (b"BI", b"ID", b"EI") ) writer.remove_images() writer = PdfWriter(clone_from=RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf") writer.remove_images(ImageType.DRAWING_IMAGES) assert all( x not in writer.pages[1].get_contents().get_data() for x in (b" re\n", b"W*", b"f*") ) assert all( x in writer.pages[1].get_contents().get_data() for x in (b" TJ\n", b"rg", b"Tm") ) assert all( x not in writer.pages[9]["/Resources"]["/XObject"]["/Meta84"].get_data() for x in (b" re\n", b"W*", b"f*") ) writer.remove_images(ImageType.XOBJECT_IMAGES) assert b"Do\n" not in writer.pages[0].get_contents().get_data() assert len(writer.pages[0]["/Resources"]["/XObject"]) == 0 @pytest.mark.enable_socket def test_add_outlines_on_empty_dict(): """Cf #2233""" def _get_parent_bookmark(current_indent, history_indent, bookmarks) -> Any: """The parent of A is the nearest bookmark whose indent is smaller than A's""" assert len(history_indent) == len(bookmarks) if current_indent == 0: return None for i in range(len(history_indent) - 1, -1, -1): # 
len(history_indent) - 1 ===> 0 if history_indent[i] < current_indent: return bookmarks[i] return None bookmark_lines = """1 FUNDAMENTALS OF RADIATIVE TRANSFER 1 1.1 The Electromagnetic Spectrum; Elementary Properties of Radiation 1 1.2 Radiative Flux 2 Macroscopic Description of the Propagation of Radiation 2 Flux from an Isotropic Source-The Inverse Square Law 2 1.3 The Specific Intensity and Its Moments 3 Definition of Specific Intensity or Brightness 3 Net Flux and Momentum Flux 4 Radiative Energy Density 5 Radiation Pressure in an Enclosure Containing an Isotropic Radiation Field 6 Constancy of Specific Zntensiw Along Rays in Free Space 7 Proof of the Inverse Square Law for a Uniformly Bright Sphere 7 1.4 Radiative Transfer 8 Emission 9 Absorption 9 The Radiative Transfer Equation 11 Optical Depth and Source Function 12 Mean Free Path 14 Radiation Force 15 1.5 Thermal Radiation 15 Blackbody Radiation 15 Kirchhof's Law for Thermal Emission 16 Thermodynamics of Blackbody Radiation 17 The Planck Spectrum 20 Properties of the Planck Law 23 Characteristic Temperatures Related to Planck Spectrum 25 1.6 The Einstein Coefficients 27 Definition of Coefficients 27 Relations between Einstein Coefficients 29 Absorption and Emission Coefficients in Terms of Einstein Coefficients 30 1.7 Scattering Effects; Random Walks 33 Pure Scattering 33 Combined Scattering and Absorption 36 1.8 Radiative Diffusion 39 The Rosseland Approximation 39 The Eddington Approximation; Two-Stream Approximation 42 PROBLEMS 45 REFERENCES 50 2 BASIC THEORY OF RADIATION FIELDS 51 2.1 Review of Maxwell’s Equations 51 2.2 Plane Electromagnetic Waves 55 2.3 The Radiation Spectrum 58 2.4 Polarization and Stokes Parameters 62 Monochromatic Waves 62 Quasi-monochromatic Waves 65 2.5 Electromagnetic Potentials 69 2.6 Applicability of Transfer Theory and the Geometrical Optics Limit 72 PROBLEMS 74 REFERENCES 76""" url = "https://github.com/py-pdf/pypdf/files/12797067/test-12.pdf" name = "iss2233.pdf" reader = 
PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter(clone_from=reader) bookmarks, history_indent = [], [] for line in bookmark_lines.split("\n"): line2 = re.split(r"\s+", line.strip()) indent_size = len(line) - len(line.lstrip()) parent = _get_parent_bookmark(indent_size, history_indent, bookmarks) history_indent.append(indent_size) title, page = " ".join(line2[:-1]), int(line2[-1]) - 1 new_bookmark = writer.add_outline_item(title, page, parent=parent) bookmarks.append(new_bookmark) def test_merging_many_temporary_files(caplog): def create_number_pdf(_n) -> BytesIO: pytest.importorskip("fpdf") from fpdf import FPDF # noqa: PLC0415 pdf = FPDF() pdf.add_page() pdf.set_font("helvetica", "B", 16) pdf.cell(40, 10, str(_n)) byte_string = pdf.output() return BytesIO(byte_string) writer = PdfWriter() for n in range(100): reader = PdfReader(create_number_pdf(n)) for page in reader.pages: # Should only be one page. writer.add_page(page) pg = PageObject.create_blank_page(writer, 1000, 1000) pg1 = writer.add_page(pg) assert len(writer.pages) == 101 caplog.clear() writer.remove_page(pg) assert "Cannot find page in pages" in caplog.text assert len(writer.pages) == 101 writer.remove_page(pg1) assert len(writer.pages) == 100 out = BytesIO() writer.write(out) out.seek(0) reader = PdfReader(out) for n, page in enumerate(reader.pages): text = page.extract_text() assert text == str(n) # test completed to validate remove_page writer.remove_page(writer.pages[-1], True) writer2 = PdfWriter() writer2.remove_page(0) writer2.flattened_pages = None writer2.remove_page(0) caplog.clear() writer.remove_page(writer.pages[-1]["/Contents"].indirect_reference) assert "IndirectObject is not referencing a page" in caplog.text caplog.clear() pg = PageObject.create_blank_page(writer, 1000, 1000) writer.remove_page(pg) assert "Cannot find page in pages" in caplog.text caplog.clear() writer.remove_page(999999) assert "Page number is out of range" in caplog.text pg = 
PageObject.create_blank_page(writer, 1000, 1000) pg = writer._add_object(pg) writer.flattened_pages.append(pg) caplog.clear() writer.remove_page(pg) assert "Cannot find page in pages" in caplog.text @pytest.mark.enable_socket def test_reattach_fields(): """ Test Reattach function addressed in #2453 """ url = "https://github.com/py-pdf/pypdf/files/14241368/ExampleForm.pdf" name = "iss2453.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() for p in reader.pages: writer.add_page(p) assert len(writer.reattach_fields()) == 15 assert len(writer.reattach_fields()) == 0 # nothing to append anymore assert len(writer.root_object["/AcroForm"]["/Fields"]) == 15 writer = PdfWriter(clone_from=reader) assert len(writer.reattach_fields()) == 7 writer.reattach_fields() assert len(writer.root_object["/AcroForm"]["/Fields"]) == 15 writer = PdfWriter() for p in reader.pages: writer.add_page(p) ano = writer.pages[0]["/Annots"][0].get_object() del ano.indirect_reference writer.pages[0]["/Annots"][0] = ano assert isinstance(writer.pages[0]["/Annots"][0], DictionaryObject) assert len(writer.reattach_fields(writer.pages[0])) == 6 assert isinstance(writer.pages[0]["/Annots"][0], IndirectObject) del writer.pages[1]["/Annots"] assert len(writer.reattach_fields(writer.pages[1])) == 0 def test_get_pagenumber_from_indirectobject(): """Test test_get_pagenumber_from_indirectobject""" pdf_path = RESOURCE_ROOT / "crazyones.pdf" writer = PdfWriter(clone_from=pdf_path) assert writer._get_page_number_by_indirect(None) is None assert writer._get_page_number_by_indirect(NullObject()) is None ind = writer.pages[0].indirect_reference assert writer._get_page_number_by_indirect(ind) == 0 assert writer._get_page_number_by_indirect(ind.idnum) == 0 assert writer._get_page_number_by_indirect(ind.idnum + 1) is None def test_replace_object(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) writer = PdfWriter(clone_from=reader) with 
pytest.raises(ValueError): writer._replace_object(reader.pages[0].indirect_reference, reader.pages[0]) writer._replace_object(writer.pages[0].indirect_reference, reader.pages[0]) pg = PageObject.create_blank_page(writer, 1000, 1000) writer._replace_object(writer.pages[0].indirect_reference, pg) # mainly for coverage reader = PdfReader(pdf_path) # reload a new instance with pytest.raises(ValueError): reader._replace_object(writer.pages[0].indirect_reference, reader.pages[0]) with pytest.raises(ValueError): reader._replace_object(IndirectObject(9999, 9999, reader), reader.pages[0]) reader._replace_object(reader.pages[0].indirect_reference, reader.pages[0]) pg = PageObject.create_blank_page(writer, 1000, 1000) reader._replace_object(reader.pages[0].indirect_reference, pg) pg = PageObject.create_blank_page(None, 1000, 1000) pg[NameObject("/Contents")] = writer.pages[0]["/Contents"] writer._add_object(pg) writer.add_page(pg) def test_mime_jupyter(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) writer = PdfWriter(clone_from=reader) assert reader._repr_mimebundle_(("include",), ("exclude",)) == {} assert writer._repr_mimebundle_(("include",), ("exclude",)) == {} def test_init_without_named_arg(): """Test to use file_obj argument and not clone_from""" pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) writer = PdfWriter(clone_from=reader) nb = len(writer._objects) writer = PdfWriter(reader) assert len(writer._objects) == nb with open(pdf_path, "rb") as f: writer = PdfWriter(f) f.seek(0, 0) by = BytesIO(f.read()) assert len(writer._objects) == nb writer = PdfWriter(pdf_path) assert len(writer._objects) == nb writer = PdfWriter(str(pdf_path)) assert len(writer._objects) == nb writer = PdfWriter(by) assert len(writer._objects) == nb @pytest.mark.enable_socket def test_i_in_choice_fields(): """Cf #2611""" url = "https://github.com/py-pdf/pypdf/files/15176321/FRA.F.6180.150.pdf" name = "iss2611.pdf" writer = 
PdfWriter(BytesIO(get_data_from_url(url, name=name))) assert "/I" in writer.get_fields()["State"].indirect_reference.get_object() writer.update_page_form_field_values( writer.pages[0], {"State": "NY"}, auto_regenerate=False ) assert "/I" not in writer.get_fields()["State"].indirect_reference.get_object() def test_selfont(): writer = PdfWriter(clone_from=RESOURCE_ROOT / "FormTestFromOo.pdf") writer.update_page_form_field_values( writer.pages[0], {"Text1": ("Text_1", "", 5), "Text2": ("Text_2", "/F3", 0)}, auto_regenerate=False, ) assert ( b"/F3 5 Tf" in writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"].get_data() ) assert ( b"Text_1" in writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"].get_data() ) assert ( b"/F3 12.0 Tf" in writer.pages[0]["/Annots"][2].get_object()["/AP"]["/N"].get_data() ) assert ( b"Text_2" in writer.pages[0]["/Annots"][2].get_object()["/AP"]["/N"].get_data() ) @pytest.mark.enable_socket def test_no_resource_for_14_std_fonts(): """Cf #2670""" url = "https://github.com/py-pdf/pypdf/files/15405390/f1040.pdf" name = "iss2670.pdf" writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) p = writer.pages[0] for a in p["/Annots"]: a = a.get_object() if a["/FT"] == "/Tx": writer.update_page_form_field_values( p, {a["/T"]: "Brooks"}, auto_regenerate=False ) assert "/Helvetica" in a["/AP"]["/N"]["/Resources"]["/Font"] @pytest.mark.enable_socket def test_field_box_upside_down(): """Cf #2724""" url = "https://github.com/user-attachments/files/15996356/FRA.F.6180.55.pdf" name = "iss2724.pdf" writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) writer.update_page_form_field_values(None, {"FreightTrainMiles": "0"}) assert writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"].get_data() == ( b"q\n/Tx BMC \nq\n2 1 102.29520000000001 9.835000000000036 re\n" b"W\nBT\n/Arial 8.0 Tf 0 g\n2 3.0455000000000183 Td\n(0) Tj\nET\n" b"Q\nEMC\nQ\n" ) box = writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"]["/BBox"] assert box[2] > 0 
assert box[3] > 0 @pytest.mark.enable_socket def test_matrix_entry_in_field_annots(): """Cf #2731""" url = "https://github.com/user-attachments/files/16036514/template.pdf" name = "iss2731.pdf" writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) writer.update_page_form_field_values( writer.pages[0], {"Stellenbezeichnung_1": "some filled in text"}, auto_regenerate=False, ) assert "/Matrix" in writer.pages[0]["/Annots"][5].get_object()["/AP"]["/N"] @pytest.mark.enable_socket def test_compress_identical_objects(): """Cf #2728 and #2794""" url = "https://github.com/user-attachments/files/16575458/tt2.pdf" name = "iss2794.pdf" in_bytes = BytesIO(get_data_from_url(url, name=name)) writer = PdfWriter(in_bytes) writer.compress_identical_objects(remove_orphans=False) out1 = BytesIO() writer.write(out1) assert 0.5 * len(in_bytes.getvalue()) > len(out1.getvalue()) writer.remove_page( 1 ) # page0 contains fields which keep reference to the deleted page out2 = BytesIO() writer.write(out2) assert len(out1.getvalue()) - 100 < len(out2.getvalue()) writer.compress_identical_objects(remove_identicals=False) out3 = BytesIO() writer.write(out3) assert len(out2.getvalue()) > len(out3.getvalue()) def test_set_need_appearances_writer(): """Minimal test for coverage""" writer = PdfWriter() writer.set_need_appearances_writer() def test_utf16_metadata(): """See #2754""" writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") writer.add_metadata( { "/Subject": "Invoice №AI_047", } ) b = BytesIO() writer.write(b) b.seek(0) reader = PdfReader(b) assert reader.metadata.subject == "Invoice №AI_047" bb = b.getvalue() i = bb.find(b"/Subject") assert bb[i : i + 100] == ( b"/Subject (\\376\\377\\000I\\000n\\000v\\000o\\000i\\000c\\000e" b"\\000 \\041\\026\\000A\\000I\\000\\137\\0000\\0004\\0007)" ) @pytest.mark.enable_socket def test_increment_writer(caplog): """Tests for #2811""" writer = PdfWriter( RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf", incremental=True, ) # 
Contains JBIG2 not decoded for the moment assert writer.list_objects_in_increment() == [] # no flowdown of properties # test writing with empty increment b = BytesIO() writer.write(b) with open( RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf", "rb" ) as f: assert b.getvalue() == f.read(-1) b.seek(0) writer2 = PdfWriter(b, incremental=True) assert len([x for x in writer2._objects if x is not None]) == len( [x for x in writer._objects if x is not None] ) writer2.add_metadata({"/Author": "test"}) assert len(writer2.list_objects_in_increment()) == 1 b = BytesIO() writer2.write(b) # modify one object writer.pages[0][NameObject("/MediaBox")] = ArrayObject( [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] ) assert writer.list_objects_in_increment() == [IndirectObject(4, 0, writer)] b = BytesIO() writer.write(b) writer.pages[5][NameObject("/MediaBox")] = ArrayObject( [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] ) assert len(writer.list_objects_in_increment()) == 2 # modify object IndirectObject(5,0) : for coverage writer.get_object(5)[NameObject("/ForTestOnly")] = NameObject("/ForTestOnly") b = BytesIO() writer.write(b) assert b.getvalue().startswith(writer._reader.stream.getvalue()) b.seek(0) reader = PdfReader(b) assert reader.pages[0]["/MediaBox"] == ArrayObject( [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] ) assert "/ForTestOnly" in reader.get_object(5) with pytest.raises(PyPdfError): writer = PdfWriter(1, incremental=True) b.seek(0) writer = PdfWriter(b, incremental=True) assert writer.list_objects_in_increment() == [] # no flowdown of properties writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True) # 1 object is modified: page 0 inherits MediaBox so is changed assert len(writer.list_objects_in_increment()) == 1 b = BytesIO() writer.write(b) writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False) # 1 object is modified: page 0 inherits 
MediaBox so is changed assert len(writer.list_objects_in_increment()) == len(writer._objects) # insert pages in a tree url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf" name = "iss2343b.pdf" writer = PdfWriter(BytesIO(get_data_from_url(url, name=name)), incremental=True) reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") pg = writer.insert_page(reader.pages[0], 4) assert ( pg.raw_get("/Parent") == writer.root_object["/Pages"]["/Kids"][0].get_object()["/Kids"][0] ) assert pg["/Parent"]["/Count"] == 8 assert writer.root_object["/Pages"]["/Count"] == 285 assert len(writer.flattened_pages) == 285 # clone without info writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True) assert len(writer.list_objects_in_increment()) == 0 assert writer.metadata is None writer.metadata = {} assert writer.metadata == {} assert len(writer.list_objects_in_increment()) == 1 writer.metadata = None assert len(writer.list_objects_in_increment()) == 0 assert writer.metadata is None b = BytesIO() writer.write(b) @pytest.mark.enable_socket def test_append_pdf_with_dest_without_page(caplog): """Tests for #2842""" url = "https://github.com/user-attachments/files/16990834/test.pdf" name = "iss2842.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader) assert "/__WKANCHOR_8" not in writer.named_destinations assert len(writer.named_destinations) == 3 @pytest.mark.enable_socket def test_destination_is_nullobject(): """Tests for #2958""" url = "https://github.com/user-attachments/files/17822279/C0.00.-.COVER.SHEET.pdf" name = "iss2958.pdf" source_data = BytesIO(get_data_from_url(url, name=name)) writer = PdfWriter() writer.append(source_data) @pytest.mark.enable_socket def test_destination_page_is_none(): """Tests for #2963""" url = "https://github.com/user-attachments/files/17879461/3.pdf" name = "iss2963.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader) 
def test_stream_not_closed():
    """Streams handed to PdfWriter must still be open afterwards (#2905)."""
    sample_pdf = RESOURCE_ROOT / "pdflatex-outline.pdf"

    # Scenario 1: the temporary file is only passed to ``write``.
    with NamedTemporaryFile(suffix=".pdf") as scratch:
        with PdfReader(sample_pdf) as reader:
            with PdfWriter() as writer:
                first_page = reader.pages[0]
                writer.add_page(first_page)
                writer.write(scratch)
        # Checked once the reader/writer contexts have exited, while the
        # temporary file's own context is still active.
        assert not scratch.file.closed

    # Scenario 2: the writer is constructed around the temporary file object.
    with NamedTemporaryFile(suffix=".pdf") as out_file:
        with PdfWriter(out_file.file) as writer:
            writer.add_blank_page(100, 100)
        assert not out_file.file.closed

    # Scenario 3: the writer is constructed around a plain binary file handle.
    with open(sample_pdf, "rb") as handle:
        with PdfWriter(handle):
            pass
        assert not handle.closed


def test_auto_write(tmp_path):
    """Leaving a path-bound PdfWriter context produces a non-empty file (#2905)."""
    output_path = tmp_path / "out.pdf"
    with PdfWriter(output_path) as writer:
        writer.add_blank_page(100, 100)
    written_size = output_path.stat().st_size
    assert written_size > 0


def test_deprecate_with_as():
    """Reading or assigning ``with_as_usage`` raises DeprecationError (#2905)."""
    removal_message = r"with_as_usage is deprecated and was removed in pypdf 5\.0"
    with PdfWriter() as writer:
        # Attribute read.
        with pytest.raises(expected_exception=DeprecationError, match=removal_message):
            _ = writer.with_as_usage

        # Attribute write.
        with pytest.raises(expected_exception=DeprecationError, match=removal_message):
            writer.with_as_usage = False  # old code allowed setting this, so...
@pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript")
@pytest.mark.enable_socket
def test_inline_image_q_operator_handling(tmp_path):
    """
    Test for #2927

    Transfers the page rotation into the content of every page, renders the
    result with Ghostscript and compares it against a reference PNG.
    """
    pdf_url = "https://github.com/user-attachments/files/17614880/test_clean.pdf"
    pdf_name = "iss2927.pdf"
    pdf_data = BytesIO(get_data_from_url(pdf_url, name=pdf_name))
    png_url = "https://github.com/user-attachments/assets/abe16f48-9afa-4179-b1e8-62be27b95c26"
    png_name = "iss2927.png"
    expected_png_path = tmp_path / "expected.png"
    expected_png_path.write_bytes(get_data_from_url(png_url, name=png_name))

    writer = PdfWriter()
    writer.append(pdf_data)
    for page in writer.pages:
        page.transfer_rotation_to_content()

    pdf_path = tmp_path / "out.pdf"
    png_path = tmp_path / "actual.png"
    writer.write(pdf_path)

    # False positive: https://github.com/PyCQA/bandit/issues/333
    subprocess.run(  # noqa: S603
        [
            GHOSTSCRIPT_BINARY,
            "-r120",
            "-sDEVICE=pngalpha",
            "-o",
            png_path,
            pdf_path,
        ]
    )
    assert png_path.is_file()
    assert image_similarity(png_path, expected_png_path) >= 0.99999


def test_insert_filtered_annotations__annotations_are_none():
    """Passing ``annots=None`` yields an empty annotation list."""
    writer = PdfWriter()
    writer.add_blank_page(72, 72)

    stream = BytesIO()
    writer.write(stream)
    reader = PdfReader(stream)
    assert writer._insert_filtered_annotations(
        annots=None, page=PageObject(), pages={}, reader=reader
    ) == []


def test_incremental_read():
    """
    Test for #3116

    Re-opens a freshly written PDF in incremental mode, once without and once
    with a modification, and checks the object bookkeeping of the writer.
    """
    writer = PdfWriter()
    writer.add_blank_page(72, 72)

    stream0 = BytesIO()
    writer.write(stream0)

    reader = PdfReader(stream0)
    # 1 = Catalog, 2 = Pages, 3 = New Page, 4 = Info, Size == 5
    assert reader.trailer["/Size"] == 5

    stream0.seek(0, 0)
    writer = PdfWriter(stream0, incremental=True)
    assert len(writer._objects) == 4
    assert writer._objects[-1] is not None

    stream1 = BytesIO()
    writer.write(stream1)
    # nothing modified, so nothing added = ideal situation
    # Compare against the *source* bytes; comparing stream1 with itself
    # would always pass and verify nothing.
    assert stream1.getvalue() == stream0.getvalue()

    stream0.seek(0, 0)
    writer = PdfWriter(stream0, incremental=True)
    assert len(writer._objects) == 4
    assert writer._objects[-1] is not None
    writer.add_blank_page(72, 72)
    assert len(writer._objects) == 5

    stream1 = BytesIO()
    writer.write(stream1)
    # 2 = Pages, 5 = New Page, 6 = XRef, Size == 7
    # XRef is created on write and not counted
    assert len(writer._objects) == 5


def test_compress_identical_objects__after_remove_images():
    """Test for #3237"""
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "AutoCad_Diagram.pdf")
    writer.remove_images()
    writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)


def test_merge__process_named_dests__no_dests_in_source_file():
    """Test for #3279"""
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf")

    # Hacky solution to avoid attribute errors.
    names = DictionaryObject()
    names.indirect_reference = names
    writer.root_object[NameObject("/Names")] = names

    reader = PdfReader(RESOURCE_ROOT / "hello-world.pdf")
    destination = Destination(title="test.pdf", page=reader.pages[0], fit=Fit("/Fit"))
    with mock.patch.object(reader, "_get_named_destinations", return_value={"test.pdf": destination}):
        writer.append(reader)

    # The page now points to the appended one.
assert writer.named_destinations == { "test.pdf": Destination(title="test.pdf", page=writer.pages[1].indirect_reference, fit=Fit("/Fit")) } def test_insert_filtered_annotations__link_without_destination(): """Test for #3211""" writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf") reader = PdfReader(RESOURCE_ROOT / "hello-world.pdf") annotations = [ DictionaryObject({ "/A": DictionaryObject({"/S": NameObject("/GoTo"), "/D": None}), "/BS": {"/S": "/S", "/Type": "/Border", "/W": 0}, "/Border": [0, 0, 0], "/H": "/I", "/Rect": [68.6001, 653.405, 526.2, 671.054], "/StructParent": 9, "/Subtype": NameObject("/Link"), "/Type": NameObject("/Annot") }) ] result = writer._insert_filtered_annotations( annots=annotations, page=writer.pages[0], pages={}, reader=reader ) assert result == [] writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf") del annotations[0]["/A"]["/D"] result = writer._insert_filtered_annotations( annots=annotations, page=writer.pages[0], pages={}, reader=reader ) assert result == [] @pytest.mark.enable_socket def test_insert_filtered_annotations__annotations_are_no_list(caplog): """Tests for #3320""" url = "https://github.com/user-attachments/files/20818089/bugpdf.pdf" name = "issue3320.pdf" source_data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(source_data) writer = PdfWriter() writer.append(reader) font_file2 = reader.get_object(36).indirect_reference assert caplog.messages == [ ( f"Expected annotation arrays: {{'/FontFile2': {font_file2!r}, " "'/Descent': -269, '/CapHeight': 714, '/FontWeight': " "300, '/FontName': '/JQJGLF+OpenSans-Light', '/ItalicAngle': 0, '/StemV': " "48, '/Type': '/FontDescriptor', '/FontBBox': [-521, -269, 1140, 1048], " "'/FontFamily': 'Open Sans Light', '/Flags': 32, '/XHeight': 531, " "'/Ascent': 1048, '/FontStretch': '/Normal'} []. Ignoring annotations." 
), ( f"Expected list of annotations, got {{'/FontFile2': {font_file2!r}, " "'/Descent': -269, '/CapHeight': 714, '/FontWeight': 300, '/FontName': '/JQJGLF+OpenSans-Light', " "'/ItalicAngle': 0, '/StemV': 48, '/Type': '/FontDescriptor', '/FontBBox': [-521, -269, 1140, 1048], " "'/FontFamily': 'Open Sans Light', '/Flags': 32, '/XHeight': 531, '/Ascent': 1048, '/FontStretch': " "'/Normal'} of type DictionaryObject." ) ] def test_unterminated_object__with_incremental_writer(): """Test for #3118""" reader = PdfReader(RESOURCE_ROOT / "bytes.pdf") writer = PdfWriter(reader, incremental=True) writer.add_blank_page(72, 72) fi = BytesIO() writer.write(fi) b = fi.getvalue() assert b[-39:] == b"\nendstream\nendobj\nstartxref\n1240\n%%EOF\n" def test_wrong_size_in_incremental_pdf(caplog): source_data = RESOURCE_ROOT.joinpath("crazyones.pdf").read_bytes() writer = PdfWriter(BytesIO(source_data), incremental=True) writer._add_object(DictionaryObject()) incremental_data = BytesIO() writer.write(incremental_data) modified_data = incremental_data.getvalue().replace(b"/Size 25", b"/Size 2") writer = PdfWriter(BytesIO(modified_data), incremental=False) assert "Object count 19 exceeds defined trailer size 2" in caplog.text assert len(writer._objects) == 20 caplog.clear() writer = PdfWriter(incremental=False, strict=True) with pytest.raises(expected_exception=PdfReadError, match=r"^Object count 19 exceeds defined trailer size 2$"): writer.clone_reader_document_root(reader=PdfReader(BytesIO(modified_data))) with pytest.raises(expected_exception=PdfReadError, match=r"^Got index error while flattening\.$"): PdfWriter(BytesIO(modified_data), incremental=True) @pytest.mark.enable_socket def test_flatten_form_field_without_font_in_resources(): """ This test is a regression test for issue #3553. Flatten form field with /Resources lacking /Font. 
""" reader = PdfReader(BytesIO(get_data_from_url(name="issue-3553.pdf"))) writer = PdfWriter() writer.append(reader) writer.update_page_form_field_values( writer.pages[0], {"Unique reference numberRow1": "test"}, flatten=True, ) b = BytesIO() writer.write(b) reader = PdfReader(b) form_text_fields = reader.get_form_text_fields() assert form_text_fields["Unique reference numberRow1"] == "test" def test_merge_with_null_acroform_does_not_raise_typeerror(): """ Source PDFs may contain '/AcroForm null'. Test for issue #3598. """ src_writer = PdfWriter() src_writer.add_blank_page(72, 72) src_writer.root_object[NameObject("/AcroForm")] = NullObject() src_bytes = BytesIO() src_writer.write(src_bytes) src_bytes.seek(0) source = PdfReader(src_bytes) target = PdfWriter() target.merge(0, source) assert "/AcroForm" not in target.root_object def test_compress_identical_objects__info_is_none(): writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf") writer.compress_identical_objects() writer.metadata = None writer.compress_identical_objects() @pytest.mark.enable_socket def test_flatten_form_field_with_signature(): """ This test is a regression test for issue #3633. Flatten form field with /Sig. """ writer = PdfWriter(BytesIO(get_data_from_url(name="issue-3633.pdf"))) writer.update_page_form_field_values( writer.pages[0], {"signature": "test"}, flatten=True, ) b = BytesIO() writer.write(b) _ = PdfReader(b) ================================================ FILE: tests/test_xmp.py ================================================ """Test the pypdf.xmp module.""" from datetime import datetime, timedelta, timezone from io import BytesIO import pytest import pypdf.generic import pypdf.xmp from pypdf import PdfReader, PdfWriter from pypdf.errors import PdfReadError, XmpDocumentError from pypdf.generic import ContentStream, NameObject, StreamObject from pypdf.xmp import XmpInformation from . 
import RESOURCE_ROOT, SAMPLE_ROOT, get_data_from_url @pytest.mark.samples @pytest.mark.parametrize( "src", [ (SAMPLE_ROOT / "020-xmp/output_with_metadata_pymupdf.pdf"), ], ) def test_read_xmp_metadata_samples(src): reader = PdfReader(src) xmp = reader.xmp_metadata assert xmp assert xmp.dc_contributor == [] assert xmp.dc_creator == ["John Doe"] assert xmp.dc_source == "Martin Thoma" # attribute node assert xmp.dc_description == {"x-default": "This is a text"} assert xmp.dc_date == [datetime(1990, 4, 28, 0, 0)] assert xmp.dc_title == {"x-default": "Sample PDF with XMP Metadata"} assert xmp.custom_properties == { "Style": "FooBarStyle", "other": "worlds", "⏰": "time", } @pytest.mark.samples def test_writer_xmp_metadata_samples(): writer = PdfWriter(SAMPLE_ROOT / "020-xmp/output_with_metadata_pymupdf.pdf") xmp = writer.xmp_metadata assert xmp assert xmp.dc_contributor == [] assert xmp.dc_creator == ["John Doe"] assert xmp.dc_source == "Martin Thoma" # attribute node assert xmp.dc_description == {"x-default": "This is a text"} assert xmp.dc_date == [datetime(1990, 4, 28, 0, 0)] assert xmp.dc_title == {"x-default": "Sample PDF with XMP Metadata"} assert xmp.custom_properties == { "Style": "FooBarStyle", "other": "worlds", "⏰": "time", } co = pypdf.generic.ContentStream(None, None) co.set_data( xmp.stream.get_data().replace( b'dc:source="Martin Thoma"', b'dc:source="Pubpub-Zz"' ) ) writer.xmp_metadata = pypdf.xmp.XmpInformation(co) b = BytesIO() writer.write(b) reader = PdfReader(b) xmp2 = reader.xmp_metadata assert xmp2.dc_source == "Pubpub-Zz" @pytest.mark.parametrize( ("src", "has_xmp"), [ (RESOURCE_ROOT / "commented-xmp.pdf", True), (RESOURCE_ROOT / "crazyones.pdf", False), ], ) def test_read_xmp_metadata(src, has_xmp): """Read XMP metadata from PDF files.""" reader = PdfReader(src) xmp = reader.xmp_metadata assert (xmp is None) == (not has_xmp) if has_xmp: for _ in xmp.get_element( about_uri="", namespace=pypdf.xmp.RDF_NAMESPACE, name="Artist" ): pass assert 
get_all_tiff(xmp) == {"tiff:Artist": ["me"]} assert xmp.dc_contributor == [] def get_all_tiff(xmp: pypdf.xmp.XmpInformation): """Return all TIFF metadata as a dictionary.""" data = {} tiff_ns = xmp.get_nodes_in_namespace( about_uri="", namespace="http://ns.adobe.com/tiff/1.0/" ) for tag in tiff_ns: contents = [content.data for content in tag.childNodes] data[tag.tagName] = contents return data def test_converter_date(): """ _converter_date returns the correct datetime. This is a regression test for issue #774. """ date = pypdf.xmp._converter_date("2021-04-28T12:23:34.123Z") assert date == datetime(2021, 4, 28, 12, 23, 34, 123000) with pytest.raises(ValueError) as exc: pypdf.xmp._converter_date("today") assert exc.value.args[0].startswith("Invalid date format") date = pypdf.xmp._converter_date("2021-04-28T12:23:01-03:00") assert date == datetime(2021, 4, 28, 15, 23, 1) def test_modify_date(): """ xmp_modify_date is extracted correctly. This is a regression test for issue #914. """ path = RESOURCE_ROOT / "issue-914-xmp-data.pdf" reader = PdfReader(path) assert reader.xmp_metadata.xmp_modify_date == datetime(2022, 4, 9, 15, 22, 43) @pytest.mark.parametrize( "x", ["a", 42, 3.141, False, True], ) def test_identity_function(x): """The identity is returning its input.""" assert pypdf.xmp._identity(x) == x @pytest.mark.enable_socket @pytest.mark.parametrize( ("url", "name", "xmpmm_instance_id"), [ ( None, "tika-955562.pdf", "uuid:ca96e032-c2af-49bd-a71c-95889bafbf1d", ) ], ) def test_xmpmm_instance_id(url, name, xmpmm_instance_id): """XMPMM instance id is correctly extracted.""" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.xmpmm_instance_id == xmpmm_instance_id # cache hit: assert xmp_metadata.xmpmm_instance_id == xmpmm_instance_id @pytest.mark.enable_socket def test_xmp_dc_description_extraction(): """XMP dc_description is correctly extracted.""" url = 
"https://github.com/user-attachments/files/18381721/tika-953770.pdf" name = "tika-953770.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.dc_description == { "x-default": "U.S. Title 50 Certification Form" } # cache hit: assert xmp_metadata.dc_description == { "x-default": "U.S. Title 50 Certification Form" } @pytest.mark.enable_socket def test_dc_creator_extraction(): """XMP dc_creator is correctly extracted.""" url = "https://github.com/user-attachments/files/18381721/tika-953770.pdf" name = "tika-953770.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.dc_creator == ["U.S. Fish and Wildlife Service"] # cache hit: assert xmp_metadata.dc_creator == ["U.S. Fish and Wildlife Service"] @pytest.mark.enable_socket def test_custom_properties_extraction(): """XMP custom_properties is correctly extracted.""" url = "https://github.com/user-attachments/files/18381764/tika-986065.pdf" name = "tika-986065.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.custom_properties == {"Style": "Searchable Image (Exact)"} # cache hit: assert xmp_metadata.custom_properties == {"Style": "Searchable Image (Exact)"} @pytest.mark.enable_socket def test_dc_subject_extraction(): """XMP dc_subject is correctly extracted.""" url = "https://github.com/user-attachments/files/18381730/tika-959519.pdf" name = "tika-959519.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.dc_subject == [ "P&P", "manual", "1240.2325", "CVM", "PROCEDURES ON MEDIA INQUIRIES", "animal", "media", "procedures", "inquiries", ] # Cache hit: assert xmp_metadata.dc_subject == [ "P&P", "manual", "1240.2325", "CVM", "PROCEDURES ON MEDIA INQUIRIES", "animal", "media", "procedures", "inquiries", ] @pytest.mark.enable_socket def 
test_invalid_xmp_information_handling(): """ Invalid XML in xmp_metadata is gracefully handled. This is a regression test for issue #585. """ url = "https://github.com/py-pdf/pypdf/files/5536984/test.pdf" name = "pypdf-5536984.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) with pytest.raises(PdfReadError) as exc: reader.xmp_metadata assert exc.value.args[0].startswith("XML in XmpInformation was invalid") @pytest.mark.samples def test_pdfa_xmp_metadata_with_values(): """Test PDF/A XMP metadata extraction from a file with PDF/A metadata.""" reader = PdfReader(SAMPLE_ROOT / "021-pdfa" / "crazyones-pdfa.pdf") xmp = reader.xmp_metadata assert xmp is not None assert xmp.pdfaid_part == "1" assert xmp.pdfaid_conformance == "B" @pytest.mark.samples def test_pdfa_xmp_metadata_without_values(): """Test PDF/A XMP metadata extraction from a file without PDF/A metadata.""" reader = PdfReader(SAMPLE_ROOT / "020-xmp" / "output_with_metadata_pymupdf.pdf") xmp = reader.xmp_metadata assert xmp is not None assert xmp.pdfaid_part is None assert xmp.pdfaid_conformance is None @pytest.mark.enable_socket def test_xmp_metadata__content_stream_is_dictionary_object(): url = "https://github.com/user-attachments/files/18943249/testing.pdf" name = "issue3107.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) with pytest.raises( PdfReadError, match="XML in XmpInformation was invalid: 'DictionaryObject' object has no attribute 'get_data'" ): assert reader.xmp_metadata is not None @pytest.mark.enable_socket def test_dc_creator__bag_instead_of_seq(): url = "https://github.com/user-attachments/files/18381698/tika-924562.pdf" name = "tika-924562.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert reader.xmp_metadata is not None assert reader.xmp_metadata.dc_creator == ["William J. 
Hussar"] @pytest.mark.enable_socket def test_dc_language__no_bag_container(): reader = PdfReader(BytesIO(get_data_from_url(name="iss2138.pdf"))) assert reader.xmp_metadata is not None assert reader.xmp_metadata.dc_language == ["x-unknown"] def test_reading_does_not_destroy_root_object(): """Test for #3391.""" writer = PdfWriter(clone_from=RESOURCE_ROOT / "commented-xmp.pdf") xmp = writer.xmp_metadata assert xmp is not None assert not isinstance(writer.root_object["/Metadata"], XmpInformation) assert isinstance(writer.root_object["/Metadata"].get_object(), StreamObject) output = BytesIO() writer.write(output) output_bytes = output.getvalue() assert b"\n/Metadata 27 0 R\n" in output_bytes def test_xmp_information__write_to_stream(): writer = PdfWriter(clone_from=RESOURCE_ROOT / "commented-xmp.pdf") xmp = writer.xmp_metadata output = BytesIO() with pytest.warns( DeprecationWarning, match=( r"^XmpInformation\.write_to_stream is deprecated and will be removed in pypdf 6\.0\.0\. " r"Use PdfWriter\.xmp_metadata instead\.$" ) ): xmp.write_to_stream(output) output_bytes = output.getvalue() assert output_bytes.startswith(b"<<\n/Type /Metadata\n/Subtype /XML\n/Length 2786\n>>\nstream\nme", b"Foo Bar") xmp_metadata.stream.set_data(new_metadata) output = BytesIO() writer.write(output) output_bytes = output.getvalue() reader = PdfReader(BytesIO(output_bytes)) assert get_all_tiff(reader.xmp_metadata) == {"tiff:Artist": ["Foo Bar"]} # Fix metadata not being an IndirectObject before. 
writer = PdfWriter(clone_from=RESOURCE_ROOT / "commented-xmp.pdf") writer.root_object[NameObject("/Metadata")] = writer.root_object["/Metadata"].get_object() assert "/XML" in str(writer.root_object) writer.xmp_metadata = new_metadata output = BytesIO() writer.write(output) output_bytes = output.getvalue() reader = PdfReader(BytesIO(output_bytes)) assert get_all_tiff(reader.xmp_metadata) == {"tiff:Artist": ["Foo Bar"]} assert "/XML" not in str(writer.root_object) def test_xmp_information__create(): """Test XmpInformation.create() classmethod.""" xmp = XmpInformation.create() assert xmp is not None assert xmp.dc_title == {} assert xmp.dc_creator == [] assert xmp.dc_description == {} assert xmp.xmp_create_date is None assert xmp.pdf_producer is None def test_xmp_information__set_dc_title(): """Test setting dc:title metadata.""" xmp = XmpInformation.create() title_values = {"x-default": "Test Title", "en": "Test Title EN"} xmp.dc_title = title_values assert xmp.dc_title == title_values xmp.dc_title = None assert xmp.dc_title is None or xmp.dc_title == {} def test_xmp_information__set_dc_creator(): """Test setting dc:creator metadata.""" xmp = XmpInformation.create() creators = ["Author One", "Author Two"] xmp.dc_creator = creators assert xmp.dc_creator == creators xmp.dc_creator = None assert xmp.dc_creator is None or xmp.dc_creator == [] def test_xmp_information__set_dc_description(): """Test setting dc:description metadata.""" xmp = XmpInformation.create() description_values = {"x-default": "Test Description", "en": "Test Description EN"} xmp.dc_description = description_values assert xmp.dc_description == description_values xmp.dc_description = None assert xmp.dc_description is None or xmp.dc_description == {} def test_xmp_information__set_dc_subject(): """Test setting dc:subject metadata.""" xmp = XmpInformation.create() subjects = ["keyword1", "keyword2", "keyword3"] xmp.dc_subject = subjects assert xmp.dc_subject == subjects xmp.dc_subject = None assert 
xmp.dc_subject is None or xmp.dc_subject == [] def test_xmp_information__set_dc_date(): """Test setting dc:date metadata.""" xmp = XmpInformation.create() test_date = datetime(2023, 12, 25, 10, 30, 45) xmp.dc_date = [test_date] stored_dates = xmp.dc_date assert len(stored_dates) == 1 date_string = "2023-12-25T10:30:45.000000Z" xmp.dc_date = [date_string] stored_dates = xmp.dc_date assert len(stored_dates) == 1 xmp.dc_date = None assert xmp.dc_date is None or xmp.dc_date == [] def test_xmp_information__set_single_fields(): """Test setting single-value metadata fields.""" xmp = XmpInformation.create() xmp.dc_coverage = "Global coverage" assert xmp.dc_coverage == "Global coverage" xmp.dc_coverage = None assert xmp.dc_coverage is None xmp.dc_format = "application/pdf" assert xmp.dc_format == "application/pdf" xmp.dc_format = None assert xmp.dc_format is None xmp.dc_identifier = "unique-id-123" assert xmp.dc_identifier == "unique-id-123" xmp.dc_identifier = None assert xmp.dc_identifier is None xmp.dc_source = "Original Source" assert xmp.dc_source == "Original Source" xmp.dc_source = None assert xmp.dc_source is None def test_xmp_information__set_bag_fields(): """Test setting bag (unordered array) metadata fields.""" xmp = XmpInformation.create() contributors = ["Contributor One", "Contributor Two"] xmp.dc_contributor = contributors assert xmp.dc_contributor == contributors xmp.dc_contributor = None assert xmp.dc_contributor is None or xmp.dc_contributor == [] languages = ["en", "fr", "de"] xmp.dc_language = languages assert xmp.dc_language == languages xmp.dc_language = None assert xmp.dc_language is None or xmp.dc_language == [] publishers = ["Publisher One", "Publisher Two"] xmp.dc_publisher = publishers assert xmp.dc_publisher == publishers xmp.dc_publisher = None assert xmp.dc_publisher is None or xmp.dc_publisher == [] relations = ["Related Doc 1", "Related Doc 2"] xmp.dc_relation = relations assert xmp.dc_relation == relations xmp.dc_relation = None assert 
xmp.dc_relation is None or xmp.dc_relation == [] types = ["Document", "Text"] xmp.dc_type = types assert xmp.dc_type == types xmp.dc_type = None assert xmp.dc_type is None or xmp.dc_type == [] def test_xmp_information__set_dc_rights(): """Test setting dc:rights metadata.""" xmp = XmpInformation.create() rights_values = {"x-default": "All rights reserved", "en": "All rights reserved EN"} xmp.dc_rights = rights_values assert xmp.dc_rights == rights_values xmp.dc_rights = None assert xmp.dc_rights is None or xmp.dc_rights == {} def test_xmp_information__set_pdf_fields(): """Test setting PDF namespace metadata fields.""" xmp = XmpInformation.create() xmp.pdf_keywords = "keyword1, keyword2, keyword3" assert xmp.pdf_keywords == "keyword1, keyword2, keyword3" xmp.pdf_keywords = None assert xmp.pdf_keywords is None xmp.pdf_pdfversion = "1.4" assert xmp.pdf_pdfversion == "1.4" xmp.pdf_pdfversion = None assert xmp.pdf_pdfversion is None xmp.pdf_producer = "pypdf" assert xmp.pdf_producer == "pypdf" xmp.pdf_producer = None assert xmp.pdf_producer is None def test_xmp_information__set_xmp_date_fields(): """Test setting XMP date metadata fields.""" xmp = XmpInformation.create() test_date = datetime(2023, 12, 25, 10, 30, 45) aware_date = datetime(2023, 1, 1, 12, 0, 0, tzinfo=timezone(timedelta(hours=-5))) xmp.xmp_create_date = test_date stored_date = xmp.xmp_create_date assert isinstance(stored_date, datetime) xmp.xmp_create_date = aware_date stored_date = xmp.xmp_create_date assert stored_date == datetime(2023, 1, 1, 17, 0, 0) xmp.xmp_create_date = None assert xmp.xmp_create_date is None xmp.xmp_modify_date = test_date stored_date = xmp.xmp_modify_date assert isinstance(stored_date, datetime) xmp.xmp_modify_date = aware_date stored_date = xmp.xmp_modify_date assert stored_date == datetime(2023, 1, 1, 17, 0, 0) xmp.xmp_modify_date = None assert xmp.xmp_modify_date is None xmp.xmp_metadata_date = test_date stored_date = xmp.xmp_metadata_date assert isinstance(stored_date, 
datetime) xmp.xmp_metadata_date = aware_date stored_date = xmp.xmp_metadata_date assert stored_date == datetime(2023, 1, 1, 17, 0, 0) xmp.xmp_metadata_date = None assert xmp.xmp_metadata_date is None def test_xmp_information__set_xmp_creator_tool(): """Test setting xmp:CreatorTool metadata.""" xmp = XmpInformation.create() xmp.xmp_creator_tool = "pypdf" assert xmp.xmp_creator_tool == "pypdf" xmp.xmp_creator_tool = None assert xmp.xmp_creator_tool is None def test_xmp_information__set_xmpmm_fields(): """Test setting XMPMM namespace metadata fields.""" xmp = XmpInformation.create() doc_id = "uuid:12345678-1234-1234-1234-123456789abc" xmp.xmpmm_document_id = doc_id assert xmp.xmpmm_document_id == doc_id xmp.xmpmm_document_id = None assert xmp.xmpmm_document_id is None instance_id = "uuid:87654321-4321-4321-4321-cba987654321" xmp.xmpmm_instance_id = instance_id assert xmp.xmpmm_instance_id == instance_id xmp.xmpmm_instance_id = None assert xmp.xmpmm_instance_id is None def test_xmp_information__set_pdfaid_fields(): """Test setting PDF/A ID namespace metadata fields.""" xmp = XmpInformation.create() xmp.pdfaid_part = "1" assert xmp.pdfaid_part == "1" xmp.pdfaid_part = None assert xmp.pdfaid_part is None xmp.pdfaid_conformance = "B" assert xmp.pdfaid_conformance == "B" xmp.pdfaid_conformance = None assert xmp.pdfaid_conformance is None def test_xmp_information__create_with_writer(): """Test using XmpInformation.create() with PdfWriter.""" xmp = XmpInformation.create() xmp.dc_title = {"x-default": "Created with pypdf"} xmp.dc_creator = ["pypdf user"] xmp.pdf_producer = "pypdf library" writer = PdfWriter() writer.add_blank_page(612, 792) writer.xmp_metadata = xmp output = BytesIO() writer.write(output) output_bytes = output.getvalue() reader = PdfReader(BytesIO(output_bytes)) xmp_read = reader.xmp_metadata assert xmp_read is not None assert xmp_read.dc_title == {"x-default": "Created with pypdf"} assert xmp_read.dc_creator == ["pypdf user"] assert xmp_read.pdf_producer == 
"pypdf library" def test_xmp_information__namespace_prefix(): """Test _get_namespace_prefix method.""" xmp = XmpInformation.create() assert xmp._get_namespace_prefix(pypdf.xmp.DC_NAMESPACE) == "dc" assert xmp._get_namespace_prefix(pypdf.xmp.XMP_NAMESPACE) == "xmp" assert xmp._get_namespace_prefix(pypdf.xmp.PDF_NAMESPACE) == "pdf" assert xmp._get_namespace_prefix(pypdf.xmp.XMPMM_NAMESPACE) == "xmpMM" assert xmp._get_namespace_prefix(pypdf.xmp.PDFAID_NAMESPACE) == "pdfaid" assert xmp._get_namespace_prefix(pypdf.xmp.PDFX_NAMESPACE) == "pdfx" assert xmp._get_namespace_prefix("unknown://namespace") == "unknown" def test_xmp_information__owner_document_none_errors(): xmp = XmpInformation.create() original_owner = xmp.rdf_root.ownerDocument try: for desc in list(xmp.rdf_root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")): xmp.rdf_root.removeChild(desc) xmp.rdf_root.ownerDocument = None with pytest.raises(XmpDocumentError, match="XMP Document is None"): xmp._get_or_create_description() with pytest.raises(XmpDocumentError, match="XMP Document is None"): xmp._update_stream() xmp.rdf_root.ownerDocument = original_owner for desc in list(xmp.rdf_root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")): xmp.rdf_root.removeChild(desc) xmp.rdf_root.ownerDocument = None with pytest.raises(XmpDocumentError, match="XMP Document is None"): xmp.dc_coverage = "test coverage" xmp.rdf_root.ownerDocument = original_owner for desc in list(xmp.rdf_root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")): xmp.rdf_root.removeChild(desc) xmp.rdf_root.ownerDocument = None with pytest.raises(XmpDocumentError, match="XMP Document is None"): xmp.dc_contributor = ["contributor"] xmp.rdf_root.ownerDocument = original_owner for desc in list(xmp.rdf_root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")): xmp.rdf_root.removeChild(desc) xmp.rdf_root.ownerDocument = None with pytest.raises(XmpDocumentError, match="XMP Document is None"): 
xmp.dc_creator = ["creator"] xmp.rdf_root.ownerDocument = original_owner for desc in list(xmp.rdf_root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")): xmp.rdf_root.removeChild(desc) xmp.rdf_root.ownerDocument = None with pytest.raises(XmpDocumentError, match="XMP Document is None"): xmp.dc_title = {"x-default": "title"} xmp.rdf_root.ownerDocument = original_owner desc = xmp._get_or_create_description() desc.setAttribute("test-attr", "test-value") xmp.rdf_root.ownerDocument = None with pytest.raises(XmpDocumentError, match="XMP Document is None"): xmp._set_single_value("test-namespace", "test-attr", "new-value") xmp.rdf_root.ownerDocument = original_owner desc = xmp._get_or_create_description() xmp.rdf_root.ownerDocument = None with pytest.raises(XmpDocumentError, match="XMP Document is None"): xmp._set_bag_values("test-namespace", "test-name", ["value"]) xmp.rdf_root.ownerDocument = original_owner desc = xmp._get_or_create_description() xmp.rdf_root.ownerDocument = None with pytest.raises(XmpDocumentError, match="XMP Document is None"): xmp._set_seq_values("test-namespace", "test-name", ["value"]) xmp.rdf_root.ownerDocument = original_owner desc = xmp._get_or_create_description() xmp.rdf_root.ownerDocument = None with pytest.raises(XmpDocumentError, match="XMP Document is None"): xmp._set_langalt_values("test-namespace", "test-name", {"x-default": "value"}) finally: xmp.rdf_root.ownerDocument = original_owner def test_xmp_information__remove_existing_attribute(): xmp = XmpInformation.create() xmp.dc_coverage = "initial coverage" assert xmp.dc_coverage == "initial coverage" xmp.dc_coverage = "updated coverage" assert xmp.dc_coverage == "updated coverage" xmp.dc_coverage = None assert xmp.dc_coverage is None desc = xmp._get_or_create_description() desc.setAttributeNS(pypdf.xmp.DC_NAMESPACE, "dc:coverage", "original attribute") assert desc.getAttributeNS(pypdf.xmp.DC_NAMESPACE, "coverage") == "original attribute" xmp.dc_coverage = "new element value" 
assert xmp.dc_coverage == "new element value" assert desc.getAttributeNS(pypdf.xmp.DC_NAMESPACE, "coverage") == "" elements = desc.getElementsByTagNameNS(pypdf.xmp.DC_NAMESPACE, "coverage") assert len(elements) == 1 assert elements[0].firstChild.data == "new element value" def test_xmp_information__edge_case_coverage(): xmp = XmpInformation.create() xmp.dc_contributor = [] assert xmp.dc_contributor == [] xmp.dc_creator = [] assert xmp.dc_creator == [] xmp.dc_title = {} assert xmp.dc_title == {} xmp.dc_contributor = None assert xmp.dc_contributor == [] xmp.dc_creator = None assert xmp.dc_creator == [] xmp.dc_title = None assert xmp.dc_title == {} def test_xmp_information__create_new_description(): """Test creating new description elements.""" xmp = XmpInformation.create() for desc in list(xmp.rdf_root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")): xmp.rdf_root.removeChild(desc) desc = xmp._get_or_create_description("test-uri") assert desc.getAttributeNS(pypdf.xmp.RDF_NAMESPACE, "about") == "test-uri" assert desc.tagName == "rdf:Description" assert desc.namespaceURI == pypdf.xmp.RDF_NAMESPACE def test_xmp_information__get_text_skips_non_text_nodes(): xmp = XmpInformation.create() doc = xmp.rdf_root.ownerDocument el = doc.createElementNS(pypdf.xmp.DC_NAMESPACE, "dc:test") el.appendChild(doc.createTextNode("hello")) el.appendChild(doc.createElement("ignored-node")) el.appendChild(doc.createTextNode(" world")) assert xmp._get_text(el) == "hello world" def test_xmp_information__get_or_create_description_mismatch_about_uri(): xmp = XmpInformation.create() existing = xmp._get_or_create_description() existing.setAttributeNS(pypdf.xmp.RDF_NAMESPACE, "rdf:about", "foo-uri") new_desc = xmp._get_or_create_description("bar-uri") assert new_desc is not existing assert new_desc.getAttributeNS(pypdf.xmp.RDF_NAMESPACE, "about") == "bar-uri" all_desc = list(xmp.rdf_root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")) about_values = 
{d.getAttributeNS(pypdf.xmp.RDF_NAMESPACE, "about") for d in all_desc} assert {"foo-uri", "bar-uri"}.issubset(about_values) def test_xmp_information__attribute_handling(): """Test attribute node removal and creation (line 479, 484, 506, 535, 564).""" xmp = XmpInformation.create() for desc in list(xmp.rdf_root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")): xmp.rdf_root.removeChild(desc) xmp.dc_coverage = "test coverage" assert xmp.dc_coverage == "test coverage" xmp.dc_contributor = ["contributor1", "contributor2"] assert xmp.dc_contributor == ["contributor1", "contributor2"] xmp.dc_creator = ["creator1", "creator2"] assert xmp.dc_creator == ["creator1", "creator2"] xmp.dc_title = {"x-default": "Test Title", "en": "Test Title EN"} assert xmp.dc_title == {"x-default": "Test Title", "en": "Test Title EN"} xmp.dc_format = "application/pdf" assert xmp.dc_format == "application/pdf" xmp.dc_format = "text/plain" assert xmp.dc_format == "text/plain" def test_xmp_information__create_and_set_metadata(): xmp = XmpInformation.create() for desc in list(xmp.rdf_root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")): xmp.rdf_root.removeChild(desc) desc = xmp._get_or_create_description() desc.setAttribute("test", "value") xmp.dc_source = "original" xmp.dc_source = "modified" assert xmp.dc_source == "modified" for desc in list(xmp.rdf_root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")): xmp.rdf_root.removeChild(desc) xmp.dc_contributor = ["test1"] xmp.dc_creator = ["test2"] xmp.dc_title = {"x-default": "test3"} assert xmp.dc_contributor == ["test1"] assert xmp.dc_creator == ["test2"] assert xmp.dc_title == {"x-default": "test3"} def test_xmp_information__external_entity_expansion(tmpdir): path = tmpdir / "secret.txt" path.write("VERY SECRET") stream = ContentStream(pdf=None, stream=None) stream.set_data(f""" ]> &xxe;abc """.encode()) xmp = XmpInformation(stream) assert xmp.dc_creator == ["abc"] @pytest.mark.timeout(10) def 
class PositionedText:
    """
    Specify a text with coordinates, font-dictionary and font-size.

    The font-dictionary may be None in case of an unknown font.
    """

    def __init__(self, text, x, y, font_dict, font_size) -> None:
        # TODO: \0-replace: Encoding issue in some files?
        # NUL characters are dropped from the stored text.
        self.text = text.replace("\0", "")
        self.x = x
        self.y = y
        self.font_dict = font_dict
        self.font_size = font_size

    def get_base_font(self) -> str:
        """
        Gets the base font of the text.

        Return UNKNOWN in case of an unknown font, i.e. when there is no
        font-dictionary or it has no "/BaseFont" entry.
        """
        if self.font_dict is None or "/BaseFont" not in self.font_dict:
            return "UNKNOWN"
        return self.font_dict["/BaseFont"]


class Rectangle:
    """Specify a rectangle by its origin (x, y), width w and height h."""

    def __init__(self, x, y, w, h) -> None:
        # Every argument must offer as_numeric(); only the numeric values are kept.
        self.x = x.as_numeric()
        self.y = y.as_numeric()
        self.w = w.as_numeric()
        self.h = h.as_numeric()

    def contains(self, x, y) -> bool:
        """Return True if the point (x, y) lies inside the rectangle, borders included."""
        inside_horizontally = self.x <= x <= (self.x + self.w)
        inside_vertically = self.y <= y <= (self.y + self.h)
        return inside_horizontally and inside_vertically


def extract_text_and_rectangles(
    page: "PageObject", rect_filter=None
) -> tuple[list[PositionedText], list[Rectangle]]:
    """
    Extracts texts and rectangles of a page of type pypdf._page.PageObject.

    This function supports simple coordinate transformations only.
    The optional rect_filter-lambda can be used to filter wanted rectangles.
    rect_filter has Rectangle as argument and must return a boolean.

    It returns a tuple containing a list of extracted texts and
    a list of extracted rectangles.
    """
    # The page annotation is a string so that defining this function does not
    # need the PageObject name to be resolvable.
    logger = logging.getLogger("extract_text_and_rectangles")
    rectangles = []
    texts = []

    def collect_rectangle(op, args, cm_matrix, tm_matrix) -> None:
        # Called before each operator; only the "re" operator adds a rectangle.
        logger.debug("before: %s at %s, %s", op, cm_matrix, tm_matrix)
        if op != b"re":
            return
        logger.debug(" add rectangle: %s", args)
        candidate = Rectangle(args[0], args[1], args[2], args[3])
        if rect_filter is None or rect_filter(candidate):
            rectangles.append(candidate)

    def collect_text(text, cm_matrix, tm_matrix, font_dict, font_size) -> None:
        # Whitespace-only text is ignored.
        if text.strip() == "":
            return
        logger.debug("at %s, %s, font size=%s", cm_matrix, tm_matrix, font_size)
        # Elements 4 and 5 of the text matrix are used as the text position.
        texts.append(
            PositionedText(text, tm_matrix[4], tm_matrix[5], font_dict, font_size)
        )

    page.extract_text(
        visitor_operand_before=collect_rectangle, visitor_text=collect_text
    )

    return texts, rectangles


def extract_table(
    texts: list[PositionedText], rectangles: list[Rectangle]
) -> list[list[list[PositionedText]]]:
    """
    Extracts a table containing text.

    It is expected that each cell is marked by a rectangle-object.
    It is expected that the page contains one table only.
    A rectangle is only used as a cell if at least 3 rectangles share its
    x-coordinate and at least 2 rectangles share its y-coordinate.

    A list of rows is returned, ordered from the highest y-value to the
    lowest; inside a row the cells are ordered by ascending x-value.
    Each row contains a list of cells.
    Each cell contains a list of PositionedText-elements; a cell without
    any text is represented by an empty string.

    The rectangles argument is not modified.
    """
    logger = logging.getLogger("extractTable")

    # Step 1: Count number of x- and y-coordinates of rectangles.
    # Remove duplicate rectangles. The new list is rectangles_filtered.
    col2count = {}
    row2count = {}
    seen_keys = set()
    rectangles_filtered = []
    for r in rectangles:
        # Coordinates may be inaccurate, we have to round.
        # cell: x=72.264, y=386.57, w=93.96, h=46.584
        # cell: x=72.271, y=386.56, w=93.96, h=46.59
        key = f"{round(r.x, 0)} {round(r.y, 0)} {round(r.w, 0)} {round(r.h, 0)}"
        if key in seen_keys:
            # Ignore duplicate rectangles
            continue
        seen_keys.add(key)
        col2count[r.x] = col2count.get(r.x, 0) + 1
        row2count[r.y] = row2count.get(r.y, 0) + 1
        rectangles_filtered.append(r)

    # Step 2: Look for texts in rectangles.
    # A text is assigned to the first rectangle that contains it.
    rectangle2texts = {}
    for text in texts:
        for r in rectangles_filtered:
            if r.contains(text.x, text.y):
                rectangle2texts.setdefault(r, []).append(text)
                break

    # PDF: y = 0 is expected at the bottom of the page.
    # So the header-row is expected to have the highest y-value.
    # The de-duplicated list is the one iterated below, so that is the list
    # which has to be ordered; the caller's list is left untouched.
    rectangles_filtered.sort(key=lambda r: (-r.y, r.x))

    # Step 3: Build the list of rows containing list of cell-texts.
    rows = []
    curr_y = None
    curr_row = None
    for r in rectangles_filtered:
        if col2count[r.x] < 3 or row2count[r.y] < 2:
            # Too few rectangles share this x or y: not part of the table.
            continue
        if curr_y is None or r.y != curr_y:
            # next row
            curr_y = r.y
            curr_row = []
            rows.append(curr_row)
        logger.debug("cell: x=%s, y=%s, w=%s, h=%s", r.x, r.y, r.w, r.h)
        if r not in rectangle2texts:
            # Kept as an empty string for backward compatibility.
            curr_row.append("")
            continue
        curr_row.append(list(rectangle2texts[r]))

    return rows


def extract_cell_text(cell_texts: list[PositionedText]) -> str:
    """Joins the text-objects of a cell and strips surrounding whitespace."""
    return "".join(t.text for t in cell_texts).strip()


def get_image_data(
    image: "Image.Image", band: Union[int, None] = None
) -> Union[tuple[tuple[int, ...], ...], tuple[float, ...]]:
    """
    Return the pixel data of an image, optionally restricted to one band.

    Uses image.get_flattened_data(band=band) and falls back to
    tuple(image.getdata(band=band)) if that raises AttributeError.
    """
    try:
        return image.get_flattened_data(band=band)
    except AttributeError:
        # For Pillow < 12.1.0
        return tuple(image.getdata(band=band))


class ReaderDummy:
    """Minimal stand-in for a reader object, for tests that only need its interface."""

    def __init__(self, strict=False) -> None:
        self.strict = strict

    def get_object(self, indirect_reference):
        """Return a new, empty DictionaryObject for any reference."""
        return DictionaryObject()

    def get_reference(self, obj):
        """Return an IndirectObject with idnum 1 and generation 1 bound to this dummy."""
        return IndirectObject(idnum=1, generation=1, pdf=self)