Repository: jsvine/pdfplumber
Branch: stable
Commit: 98041536932c
Files: 60
Total size: 385.6 KB
Directory structure:
gitextract_36f6_7t4/
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug-report.md
│ │ ├── config.yml
│ │ └── feature-request.md
│ └── workflows/
│ └── tests.yml
├── .gitignore
├── CHANGELOG.md
├── CITATION.cff
├── CONTRIBUTING.md
├── LICENSE.txt
├── MANIFEST.in
├── Makefile
├── README.md
├── codecov.yml
├── docs/
│ ├── colors.md
│ ├── repairing.md
│ └── structure.md
├── pdfplumber/
│ ├── __init__.py
│ ├── _typing.py
│ ├── _version.py
│ ├── cli.py
│ ├── container.py
│ ├── convert.py
│ ├── ctm.py
│ ├── display.py
│ ├── page.py
│ ├── pdf.py
│ ├── py.typed
│ ├── repair.py
│ ├── structure.py
│ ├── table.py
│ └── utils/
│ ├── __init__.py
│ ├── clustering.py
│ ├── exceptions.py
│ ├── generic.py
│ ├── geometry.py
│ ├── pdfinternals.py
│ └── text.py
├── requirements-dev.txt
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests/
├── comparisons/
│ ├── scotus-transcript-p1-cropped.txt
│ └── scotus-transcript-p1.txt
├── pdfs/
│ └── make_xref.py
├── test_basics.py
├── test_ca_warn_report.py
├── test_convert.py
├── test_ctm.py
├── test_dedupe_chars.py
├── test_display.py
├── test_issues.py
├── test_laparams.py
├── test_list_metadata.py
├── test_mcids.py
├── test_nics_report.py
├── test_oss_fuzz.py
├── test_repair.py
├── test_structure.py
├── test_table.py
└── test_utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/ISSUE_TEMPLATE/bug-report.md
================================================
---
name: Bug report
about: Use this if you observe a specific problem with pdfplumber's code or results
title: ''
labels: bug
assignees: ''
---
## Describe the bug
*A clear and concise description of what the bug is.*
## Have you tried [repairing](https://github.com/jsvine/pdfplumber/blob/stable/docs/repairing.md) the PDF?
*Please try running your code with `pdfplumber.open(..., repair=True)` before submitting a bug report.*
## Code to reproduce the problem
*Paste it here, or attach a Python file.*
## PDF file
*Please attach any PDFs necessary to reproduce the problem.*
*If you need to redact text in a sensitive PDF, you can run it through [JoshData/pdf-redactor](https://github.com/JoshData/pdf-redactor).*
## Expected behavior
*What did you expect the result __should__ have been?*
## Actual behavior
*What actually happened, instead?*
## Screenshots
*If applicable, add screenshots to help explain your problem.*
## Environment
- pdfplumber version: [e.g., 0.5.22]
- Python version: [e.g., 3.8.1]
- OS: [e.g., Mac, Linux, etc.]
## Additional context
*Add any other context/notes about the problem here.*
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
- name: Troubleshooting, etc.
url: https://github.com/jsvine/pdfplumber/discussions
about: Use 'Discussions' to request assistance, ask questions, etc.
================================================
FILE: .github/ISSUE_TEMPLATE/feature-request.md
================================================
---
name: Feature request
about: Suggest a feature or improvement
title: ''
labels: feature-request
assignees: ''
---
Please describe, in as much detail as possible, your proposal and how it would improve your experience with pdfplumber.
================================================
FILE: .github/workflows/tests.yml
================================================
name: Tests
on: [push, pull_request]
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.12
uses: actions/setup-python@v4
with:
python-version: 3.12
- name: Configure pip caching
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt')}}-${{ hashFiles('**/requirements-dev.txt') }}
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements-dev.txt
- name: Validate against psf/black
run: python -m black --check pdfplumber tests
- name: Validate against isort
run: python -m isort --profile black --check-only pdfplumber tests
- name: Validate against flake8
run: python -m flake8 pdfplumber tests
- name: Check type annotations via mypy
run: python -m mypy --strict --implicit-reexport pdfplumber
test:
needs: lint
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install ghostscript
run: sudo apt update && sudo apt install ghostscript
- name: Configure pip caching
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt')}}-${{ hashFiles('**/requirements-dev.txt') }}
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements-dev.txt
- name: Run tests
run: |
python -m pytest -n auto
python -m coverage html
- name: Upload code coverage
uses: codecov/codecov-action@v3
if: matrix.python-version == 3.9
- name: Build package
run: python setup.py build sdist
================================================
FILE: .gitignore
================================================
venv/
notebooks/
nonpublic/
.ipynb_checkpoints
.DS_Store
.idea/
.pytest_cache/
.mypy_cache/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
================================================
FILE: CHANGELOG.md
================================================
# Changelog
All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/).
## Unreleased
- Upgrade `pdfminer.six` from `20251230` to `20260107`. ([07a5ff6](https://github.com/jsvine/pdfplumber/commit/07a5ff6))
## [0.11.9] - 2026-01-05
### Changed
- Upgrade `pdfminer.six` from `20251107` to `20251230`. ([75bbed3](https://github.com/jsvine/pdfplumber/commit/75bbed3) + [1524ce4](https://github.com/jsvine/pdfplumber/commit/1524ce4) + [26687c3](https://github.com/jsvine/pdfplumber/commit/26687c3) + [9555532](https://github.com/jsvine/pdfplumber/commit/9555532))
## [0.11.8] - 2025-11-08
### Added
- Add `edge_min_length_prefilter` table setting for initial edge filtering. Lowering this setting enables capturing small edge segments (e.g., dashed lines) that would be filtered out with the default minimum length of 1. Raising this setting would be less common but plausible. (h/t @bronislav). ([#1274](https://github.com/jsvine/pdfplumber/issues/1274)).
### Changed
- Upgrade `pdfminer.six` from `20250506` to `20251107` (h/t @henry-renner-v). ([0079187](https://github.com/jsvine/pdfplumber/pull/1348/commits/0079187ab5493f4440147cde83ee627cab081079))
## [0.11.7] - 2025-06-12
### Added
- Add access to `Page.trimbox`, `Page.bleedbox`, and `Page.artbox` (h/t @samuelbradshaw). ([#1313](https://github.com/jsvine/pdfplumber/issues/1313) + [7e364e6](https://github.com/jsvine/pdfplumber/commit/7e364e6193c6e8bafa9b46587c0fdd4a46405399))
### Changed
- Upgrade `pdfminer.six` from `20250327` to `20250506`. ([4c7e092](https://github.com/jsvine/pdfplumber/commit/4c7e092))
### Removed
- Remove `stroking_pattern` and `non_stroking_pattern` object attributes, due to changes in `pdfminer.six`. ([4c7e092](https://github.com/jsvine/pdfplumber/commit/4c7e092))
## [0.11.6] - 2025-03-27
### Changed
- Upgrade `pdfminer.six` from `20231228` to `20250327` ([3fcb493](https://github.com/jsvine/pdfplumber/commit/3fcb493) + [12a73a2](https://github.com/jsvine/pdfplumber/commit/12a73a2))
- Use csv.QUOTE_MINIMAL for .to_csv(...) ([980494a](https://github.com/jsvine/pdfplumber/commit/980494a))
### Fixed
- Fix bug with `use_text_flow=True` text extraction (h/t @samuelbradshaw)([#1279](https://github.com/jsvine/pdfplumber/issues/1279) + [e15ed98](https://github.com/jsvine/pdfplumber/commit/e15ed98))
- Catch exceptions from pdfminer and malformed PDFs ([43ccc5b](https://github.com/jsvine/pdfplumber/commit/43ccc5b))
- More broadly handle RecursionError ([748ff31](https://github.com/jsvine/pdfplumber/commit/748ff31))
### Removed
- Remove test_issue_1089 ([#1263](https://github.com/jsvine/pdfplumber/issues/1263) + [7e28e76](https://github.com/jsvine/pdfplumber/commit/7e28e76))
## [0.11.5] - 2025-01-01
### Added
- Add `--format text` options to CLI (in addition to previously-available `csv` and `json`) (h/t @brandonrobertz). ([#1235](https://github.com/jsvine/pdfplumber/pull/1235))
- Add `raise_unicode_errors: bool` parameter to `pdfplumber.open()` to allow bypassing `UnicodeDecodeError`s in annotation-parsing and generate warnings instead (h/t @stolarczyk). ([#1195](https://github.com/jsvine/pdfplumber/issues/1195))
- Add `name` property to `image` objects (h/t @djr2015). ([#1201](https://github.com/jsvine/pdfplumber/discussions/1201))
### Fixed
- Fix `PageImage.debug_tablefinder(...)` so that its main keyword argument is named the same (`table_settings=`) as other related `Page` methods (h/t @n-traore). ([#1237](https://github.com/jsvine/pdfplumber/issues/1237))
## [0.11.4] - 2024-08-18
### Fixed
- Fix one type hint so that it doesn't throw error on Python 3.8 (h/t @andrekeller). ([#1184](https://github.com/jsvine/pdfplumber/issues/1184))
## [0.11.3] - 2024-08-07
### Added
- Add `Table.columns`, analogous to `Table.rows` (h/t @Pk13055). ([#1050](https://github.com/jsvine/pdfplumber/issues/1050) + [d39302f](https://github.com/jsvine/pdfplumber/commit/d39302f))
- Add `Page.extract_words(return_chars=True)`, mirroring `Page.search(..., return_chars=True)`; if this argument is passed, each word dictionary will include an additional key-value pair: `"chars": [char_object, ...]` (h/t @cmdlineluser). ([#1173](https://github.com/jsvine/pdfplumber/issues/1173) + [1496cbd](https://github.com/jsvine/pdfplumber/commit/1496cbd))
- Add `pdfplumber.open(unicode_norm="NFC"/"NFD"/"NFKC"/"NFKD")`, where the values are the [four options for Unicode normalization](https://unicode.org/reports/tr15/#Normalization_Forms_Table) (h/t @petermr + @agusluques). ([#905](https://github.com/jsvine/pdfplumber/issues/905) + [03a477f](https://github.com/jsvine/pdfplumber/commit/03a477f))
### Changed
- Change default setting `pdfplumber.repair(...)` passes to Ghostscript's `-dPDFSETTINGS` parameter, from `prepress` to `default`, and make that setting modifiable via `.repair(setting=...)`, where the value is one of `"default"`, `"prepress"`, `"printer"`, or `"ebook"` (h/t @Laubeee). ([#874](https://github.com/jsvine/pdfplumber/issues/874) + [48cab3f](https://github.com/jsvine/pdfplumber/commit/48cab3f))
### Fixed
- Fix handling of object coordinates when `mediabox` does not begin at `(0,0)` (h/t @wodny). ([#1181](https://github.com/jsvine/pdfplumber/issues/1181) + [9025c3f](https://github.com/jsvine/pdfplumber/commit/9025c3f) + [046bd87](https://github.com/jsvine/pdfplumber/commit/046bd87))
- Fix error on getting `.annots`/`.hyperlinks` from `CroppedPage` (due to missing `.rotation` and `.initial_doctop` attributes) (h/t @Safrone). ([#1171](https://github.com/jsvine/pdfplumber/issues/1171) + [e5737d2](https://github.com/jsvine/pdfplumber/commit/e5737d2))
- Fix problem where `Page.crop(...)` was not cropping `.annots/.hyperlinks` (h/t @Safrone). ([#1171](https://github.com/jsvine/pdfplumber/issues/1171) + [22494e8](https://github.com/jsvine/pdfplumber/commit/22494e8))
- Fix calculation of coordinates for `.annots` on `CroppedPage`s. ([0bbb340](https://github.com/jsvine/pdfplumber/commit/0bbb340) + [b16acc3](https://github.com/jsvine/pdfplumber/commit/b16acc3))
- Dereference structure element attributes (h/t @dhdaines). ([#1169](https://github.com/jsvine/pdfplumber/pull/1169) + [3f16180](https://github.com/jsvine/pdfplumber/commit/3f16180))
- Fix `Page.get_attr(...)` so that it fully resolves references before determining whether the attribute's value is `None` (h/t @zzhangyun + @mkl-public). ([#1176](https://github.com/jsvine/pdfplumber/issues/1176) + [c20cd3b](https://github.com/jsvine/pdfplumber/commit/c20cd3b))
## [0.11.2] - 2024-07-06
### Added
- Add `extra_attrs` parameter to `.dedupe_chars(...)` to adjust the properties used when deduplicating (h/t @QuentinAndre11). ([#1114](https://github.com/jsvine/pdfplumber/issues/1114))
### Development Changes
- Remove testing for Python 3.8, add testing for Python 3.12. ([944eaed](https://github.com/jsvine/pdfplumber/commit/944eaed))
- Upgrade `flake8`, `pytest`, and `pytest-cov` — and add `setuptools` and `py` as explicit dev requirements (for Python 3.12).
## [0.11.1] - 2024-06-11
### Fixed
- Fix `.open(..., repair=True)` subprocess args (to avoid stderr being captured) ([70534a7](https://github.com/jsvine/pdfplumber/commit/70534a7))
- Fix coordinates of annots on rotated pages ([aaa35c9](https://github.com/jsvine/pdfplumber/commit/aaa35c9))
- Fix handling `PDFDocEncoding` failures in `decode_text(...)`([#1147](https://github.com/jsvine/pdfplumber/issues/1147) + [4daf0aa](https://github.com/jsvine/pdfplumber/commit/4daf0aa))
- Add `.get_textmap.cache_clear()` to `page.close()` ([0a26f05](https://github.com/jsvine/pdfplumber/commit/0a26f05))
## [0.11.0] - 2024-03-07
### Added
- Add `{line,char}_dir{,rotated,render}` params, to provide better support for non–top-to-bottom, left-to-right text (h/t @afriedman412). ([850fd45](https://github.com/jsvine/pdfplumber/commit/850fd45))
- Add `curve["path"]` and `curve["dash"]`, thanks to `pdfminer.six` upgrade (see below). ([1820247](https://github.com/jsvine/pdfplumber/commit/1820247))
### Changed
- Upgrade `pdfminer.six` from `20221105` to `20231228`. ([cd2f768](https://github.com/jsvine/pdfplumber/commit/cd2f768))
- Change value of `word["direction"]` from `{1,-1}` to `{"ltr","rtl","ttb","btt"}`. ([850fd45](https://github.com/jsvine/pdfplumber/commit/850fd45))
- Deprecate `vertical_ttb`, `horizontal_ltr` in favor of `char_dir` and `char_dir_rotated`.([850fd45](https://github.com/jsvine/pdfplumber/commit/850fd45))
### Fixed
- Fix layout-caching issue caused by `0bfffc2`. ([#1097](https://github.com/jsvine/pdfplumber/pull/1097) + [efca277](https://github.com/jsvine/pdfplumber/commit/efca277))
- Fix missing ParentTree edge-case. ([#1094](https://github.com/jsvine/pdfplumber/pull/1094))
## [0.10.4] - 2024-02-10
### Added
- Add `x_tolerance_ratio` parameter to `extract_text` and similar functions, to account for text size when spacing characters (instead of a fixed number of pixels) (h/t @afriedman412). ([#1041](https://github.com/jsvine/pdfplumber/pulls/1041))
- Add support for PDF 1.3 logical structure via `Page.structure_tree` (h/t @dhdaines). ([#963](https://github.com/jsvine/pdfplumber/pulls/963))
- Add "gswin64c" as another possible Ghostscript executable in `repair.py` (h/t @echedey-ls). ([#1032](https://github.com/jsvine/pdfplumber/issues/1032))
- Re-add `Page.close()` method, have `PDF.close()` close all pages as well, and improve relevant documentation (h/t @luketudge). ([#1042](https://github.com/jsvine/pdfplumber/issues/1042))
- Add `force_mediabox` parameter to `Page.to_image(...)`. ([#1054](https://github.com/jsvine/pdfplumber/issues/1054))
### Fixed
- Standardize handling of cropbox, fixing various issues with PageImage. ([#1054](https://github.com/jsvine/pdfplumber/issues/1054))
- Fix `Page.get_textmap` caching to allow for `extra_attrs=[...]`, by preconverting list kwargs to tuples. ([#1030](https://github.com/jsvine/pdfplumber/issues/1030))
- Explicitly close `pypdfium2.PdfDocument` in `get_page_image` (h/t @dhdaines). ([#1090](https://github.com/jsvine/pdfplumber/pull/1090))
- In `PDFPageAggregatorWithMarkedContent.tag_cur_item`, check `self.cur_item._objs` length before trying to access `[-1]`. ([4f39d03](https://github.com/jsvine/pdfplumber/commit/4f39d03))
## [0.10.3] - 2023-10-26
### Added
- Add support for marked-content sequences, represented by `mcid` and `tag` attributes on `char`/`rect`/`line`/`curve`/`image` objects (h/t @dhdaines). ([#961](https://github.com/jsvine/pdfplumber/pulls/961))
- Add `gs_path` argument to `pdfplumber.open(...)` and `pdfplumber.repair(...)`, to allow passing a custom Ghostscript path to be used for repairing. ([#953](https://github.com/jsvine/pdfplumber/issues/953))
### Fixed
- Respect `use_text_flow` in `extract_text` (h/t @dhdaines). ([#983](https://github.com/jsvine/pdfplumber/pulls/983))
## [0.10.2] - 2023-07-29
### Added
- Add `PDF.path`: A `Path` object for PDFs loaded by passing a path (unless `repair=True`), and `None` otherwise. ([30a52cb](https://github.com/jsvine/pdfplumber/commit/30a52cb) + [#948](https://github.com/jsvine/pdfplumber/issues/948))
- Accept `Iterable` objects for geometry utils (h/t @dhdaines). ([53bee23](https://github.com/jsvine/pdfplumber/commit/53bee23) + [#945](https://github.com/jsvine/pdfplumber/pulls/945))
### Changed
- Use pypdfium2's *public* (not private) `.render(...)` method (h/t @mara004). ([28f4ebe](https://github.com/jsvine/pdfplumber/commit/28f4ebe) + [#899](https://github.com/jsvine/pdfplumber/discussions/899#discussioncomment-6520928))
### Fixed
- Fix `.to_image()` for `ZipExtFile`s (h/t @Urbener). ([30a52cb](https://github.com/jsvine/pdfplumber/commit/30a52cb) + [#948](https://github.com/jsvine/pdfplumber/issues/948))
## [0.10.1] - 2023-07-19
### Added
- Add `antialias` boolean parameter to `Page.to_image(...)` and associated methods (h/t @cmdlineluser). ([7e28931](https://github.com/jsvine/pdfplumber/commit/7e28931))
## [0.10.0] - 2023-07-16
### Changed
- Normalize color representation to `tuple[float|int, ...]` ([#917](https://github.com/jsvine/pdfplumber/issues/917)). ([57d51bb](https://github.com/jsvine/pdfplumber/commit/57d51bb))
- Replace Wand with pypdfium2 for page.to_image(...). ([b049373](https://github.com/jsvine/pdfplumber/commit/b049373))
### Added
- Add `pdfplumber.repair(...)` and `.open(repair=True)` ([#824](https://github.com/jsvine/pdfplumber/issues/824)). ([db6ae97](https://github.com/jsvine/pdfplumber/commit/db6ae97))
- Add Page.find_table(...) ([#873](https://github.com/jsvine/pdfplumber/issues/873)). ([3772af6](https://github.com/jsvine/pdfplumber/commit/3772af6))
- Add `quantize=True`, `colors=256`, `bits=8` arguments/defaults to `PageImage.save(...)`. ([b049373](https://github.com/jsvine/pdfplumber/commit/b049373))
- Extract and handle patterns + (some) color spaces. ([97ca4b0](https://github.com/jsvine/pdfplumber/commit/97ca4b0))
### Removed
- Remove support for Python 3.7 ([EOL'ed June 2023](https://endoflife.date/python)). ([c9d24d5](https://github.com/jsvine/pdfplumber/commit/c9d24d5))
- Remove vestigial 'font' and 'name' properties from PDF objects. ([6d62054](https://github.com/jsvine/pdfplumber/commit/6d62054))
### Fixed
- Fix bug for re-crops that use relative=True ([#914](https://github.com/jsvine/pdfplumber/issues/914)). ([0de6da9](https://github.com/jsvine/pdfplumber/commit/0de6da9))
- Handle `use_text_flow` more consistently ([#912](https://github.com/jsvine/pdfplumber/issues/912)). ([b1db5b8](https://github.com/jsvine/pdfplumber/commit/b1db5b8))
## [0.9.0] - 2023-04-13
### Changed
- Make word segmentation (via `WordExtractor.char_begins_new_word(...)`) more explicit and rigorous; should help in catching edge-cases in the future. ([6acd580](https://github.com/jsvine/pdfplumber/commit/6acd580) + [ebb93ea](https://github.com/jsvine/pdfplumber/commit/ebb93ea) + [#840](https://github.com/jsvine/pdfplumber/discussions/840#discussioncomment-5312166))
- Use `curve_edge` objects (instead of just `line` and `rect_edge` objects) in default table-detection strategy. ([6f6b465](https://github.com/jsvine/pdfplumber/commit/6f6b465) + [#858](https://github.com/jsvine/pdfplumber/discussions/858))
- By default, expand ligatures into their constituent letters (e.g., `ﬃ` to `ffi`), and add the `expand_ligatures` boolean parameter to text-extraction methods. ([86e935d](https://github.com/jsvine/pdfplumber/commit/86e935d) + [#598](https://github.com/jsvine/pdfplumber/issues/598))
### Added
- Add `Page.extract_text_lines(...)` method. ([4b37397](https://github.com/jsvine/pdfplumber/commit/4b37397) + [#852](https://github.com/jsvine/pdfplumber/discussions/852))
- Add `main_group`, `return_groups`, `return_chars` parameters to `Page.search(...)`. ([4b37397](https://github.com/jsvine/pdfplumber/commit/4b37397))
- Add `.curve_edges` property to `PDF` and `Page`. ([6f6b465](https://github.com/jsvine/pdfplumber/commit/6f6b465))
### Fixed
- Fix handling of bytes-typed fontnames. ([9441ff7](https://github.com/jsvine/pdfplumber/commit/9441ff7) + [#461](https://github.com/jsvine/pdfplumber/discussions/461) + [#842](https://github.com/jsvine/pdfplumber/discussions/842))
- Fix handling of whitespace-only and empty results of `Page.search(...)`. ([6f6b465](https://github.com/jsvine/pdfplumber/commit/6f6b465) + [#853](https://github.com/jsvine/pdfplumber/discussions/853))
## [0.8.1] - 2023-04-08
### Fixed
- Fix `x0>x1`/etc. error for when drawing rect fills, per new Pillow version ([db136b7](https://github.com/jsvine/pdfplumber/commit/db136b7))
## [0.8.0] - 2023-02-13
### Changed
- Change the (still experimental) `Page/utils.extract_text(layout=True)` approach so that it pads, to the degree necessary, the ends of lines with spaces and the end of the text with blank lines to achieve better mimicry of page layout. ([d3662de](https://github.com/jsvine/pdfplumber/commit/d3662de))
- Refactor handling of `pts` attribute and, in doing so, deprecate the `curve_obj["points"]` attribute, and fix `PageImage.draw_line(...)`'s handling of diagonal lines. ([216bedd](https://github.com/jsvine/pdfplumber/commit/216bedd))
- Breaking change: In `Page.extract_table[s](...)`, `keep_blank_chars` must now be passed as `text_keep_blank_chars`, for consistency's sake. ([c4e1b29](https://github.com/jsvine/pdfplumber/commit/c4e1b29))
### Added
- Add `Page.extract_table[s](...)` support for all `Page.extract_text(...)` keyword arguments. ([c4e1b29](https://github.com/jsvine/pdfplumber/commit/c4e1b29))
- Add `height` and `width` keyword arguments to `Page.to_image(...)`. ([#798](https://github.com/jsvine/pdfplumber/issues/798) + [93f7dbd](https://github.com/jsvine/pdfplumber/commit/93f7dbd))
- Add `layout_width`, `layout_width_chars`, `layout_height`, and `layout_height_chars` parameters to `Page/utils.extract_text(layout=True)`. ([d3662de](https://github.com/jsvine/pdfplumber/commit/d3662de))
- Add CITATION.cff. ([#755](https://github.com/jsvine/pdfplumber/issues/755)) [h/t @joaoccruz]
### Fixed
- Fix simple edge-case for when page rotation is (incorrectly) set to `None`. ([#811](https://github.com/jsvine/pdfplumber/pull/811)) [h/t @toshi1127]
### Development Changes
- Convert `utils.py` into `utils/` submodules. Retains same interface, just an improvement in organization. ([6351d97](https://github.com/jsvine/pdfplumber/commit/6351d97))
- Fix typing hints to include io.BytesIO. ([d4107f6](https://github.com/jsvine/pdfplumber/commit/d4107f6)) [h/t @conitrade-as]
- Refactor text-extraction utilities, paving way for better consistency across various entrypoints to text extraction (e.g., via `utils.extract_text(...)`, via `Page.extract_text(...)`, via `Page.extract_table(...)`). ([3424b57](https://github.com/jsvine/pdfplumber/commit/3424b57))
## [0.7.6] - 2022-11-22
### Changed
- Bump pinned `pdfminer.six` version to `20221105`. ([e63a038](https://github.com/jsvine/pdfplumber/commit/e63a038))
### Fixed
- Restore `text` attribute to `.textboxhorizontal`/etc., regression introduced in `9587cc7` / `v0.6.2`. ([8a0c126](https://github.com/jsvine/pdfplumber/commit/8a0c126))
- Fix `lru_cache` usage, which is [discouraged for class methods](https://rednafi.github.io/reflections/dont-wrap-instance-methods-with-functoolslru_cache-decorator-in-python.html) due to garbage-collection issues. ([e3142a0](https://github.com/jsvine/pdfplumber/commit/e3142a0))
### Development Changes
- Upgrade `nbexec` development requirement from `0.1.0` to `0.2.0`. ([30dac25](https://github.com/jsvine/pdfplumber/commit/30dac25))
## [0.7.5] - 2022-10-01
### Added
- Add `PageImage.show()` as alias for `PageImage.annotated.show()`. ([#715](https://github.com/jsvine/pdfplumber/discussions/715) + [5c7787b](https://github.com/jsvine/pdfplumber/commit/5c7787b))
### Fixed
- Fixed issue where `py.typed` file was not included in PyPI distribution. ([#698](https://github.com/jsvine/pdfplumber/issues/698) + [#703](https://github.com/jsvine/pdfplumber/pull/703) + [6908487](https://github.com/jsvine/pdfplumber/commit/6908487)) [h/t @jhonatan-lopes]
- Reinstated the ability to call `utils.cluster_objects(...)` with any hashable value (`str`, `int`, `tuple`, etc.) as the `key_fn` parameter, reverting breaking change in [58b1ab1](https://github.com/jsvine/pdfplumber/commit/58b1ab1). ([#691](https://github.com/jsvine/pdfplumber/issues/691) + [1e97656](https://github.com/jsvine/pdfplumber/commit/1e97656)) [h/t @jfuruness]
### Development Changes
- Update Wand version in `requirements.txt` from `>=0.6.7` to `>=0.6.10`. ([#713](https://github.com/jsvine/pdfplumber/issues/713) + [3457d79](https://github.com/jsvine/pdfplumber/commit/3457d79))
## [0.7.4] - 2022-07-19
### Added
- Add `utils.outside_bbox(...)` and `Page.outside_bbox(...)` method, which are the inverse of `utils.within_bbox(...)` and `Page.within_bbox(...)`. ([#369](https://github.com/jsvine/pdfplumber/issues/369) + [3ab1cc4](https://github.com/jsvine/pdfplumber/commit/3ab1cc4))
- Add `strict=True/False` parameter to `Page.crop(...)`, `Page.within_bbox(...)`, and `Page.outside_bbox(...)`; default is `True`, while `False` bypasses the `test_proposed_bbox(...)` check. ([#421](https://github.com/jsvine/pdfplumber/issues/421) + [71ad60f](https://github.com/jsvine/pdfplumber/commit/71ad60f))
- Add more guidance to exception when `.to_image(...)` raises `PIL.Image.DecompressionBombError`. ([#413](https://github.com/jsvine/pdfplumber/issues/413) + [b6ff9e8](https://github.com/jsvine/pdfplumber/commit/b6ff9e8))
### Fixed
- Fix `PageImage` conversions for PDFs with `cmyk` colorspaces; convert them to `rgb` earlier in the process. ([28330da](https://github.com/jsvine/pdfplumber/commit/28330da))
## [0.7.3] - 2022-07-18
### Fixed
- Quick fix for transparency issue in visual debugging mode. ([b98dd7c](https://github.com/jsvine/pdfplumber/commit/b98dd7c))
## [0.7.2] - 2022-07-17
### Added
- Add `split_at_punctuation` parameter to `.extract_words(...)` and `.extract_text(...)`. ([#682](https://github.com/jsvine/pdfplumber/issues/682)) [h/t @lolipopshock]
- Add README.md link to @hbh112233abc's [Chinese translation of README.md](https://github.com/hbh112233abc/pdfplumber/blob/stable/README-CN.md). ([#674](https://github.com/jsvine/pdfplumber/issues/674))
### Changed
- Change `.to_image(...)`'s approach, preferring to composite with a white background instead of removing the alpha channel. ([1cd1f9a](https://github.com/jsvine/pdfplumber/commit/1cd1f9a))
### Fixed
- Fix bug in `LayoutEngine.calculate(...)` when processing char objects with len>1 representations, such as ligatures. ([#683](https://github.com/jsvine/pdfplumber/issues/683))
## [0.7.1] - 2022-05-31
### Fixed
- Fix bug when calling `PageImage.debug_tablefinder()` (i.e., with no parameters). ([#659](https://github.com/jsvine/pdfplumber/issues/659) + [063e2ed](https://github.com/jsvine/pdfplumber/commit/063e2ed)) [h/t @rneumann7]
### Development Changes
- Add `Makefile` target for `examples`, as well as dev requirements to support re-running the example notebooks automatically. ([ef065a7](https://github.com/jsvine/pdfplumber/commit/ef065a7))
## [0.7.0] - 2022-05-27
### Added
- Add `"matrix"` property to `char` objects, representing the current transformation matrix. ([ae6f99e](https://github.com/jsvine/pdfplumber/commit/ae6f99e))
- Add `pdfplumber.ctm` submodule with class `CTM`, to calculate scale, skew, and translation of a current transformation matrix obtained from a `char`'s `"matrix"` property. ([ae6f99e](https://github.com/jsvine/pdfplumber/commit/ae6f99e))
- Add `page.search(...)`, an *experimental feature* that allows you to search a page's text via regular expressions and non-regex strings, returning the text, any regex matches, the bounding box coordinates, and the char objects themselves. ([#201](https://github.com/jsvine/pdfplumber/issues/201) + [58b1ab1](https://github.com/jsvine/pdfplumber/commit/58b1ab1))
- Add `--include-attrs`/`--exclude-attrs` to CLI (and corresponding params to `.to_json(...)`, `.to_csv(...)`, and `Serializer`). ([4deac25](https://github.com/jsvine/pdfplumber/commit/4deac25))
- Add `py.typed` for PEP561 compatibility and detection of typing hints by mypy. ([ca795d1](https://github.com/jsvine/pdfplumber/commit/ca795d1)) [h/t @jhonatan-lopes]
### Changed
- Bump pinned `pdfminer.six` version to `20220524`. ([486cea8](https://github.com/jsvine/pdfplumber/commit/486cea8))
### Removed
- Remove `utils.collate_chars(...)`, the old name (and then alias) for `utils.extract_text(...)`. ([24f3532](https://github.com/jsvine/pdfplumber/commit/24f3532))
- Remove `utils._itemgetter(...)`, an internal-use method previously used by `utils.cluster_objects(...)`. ([58b1ab1](https://github.com/jsvine/pdfplumber/commit/58b1ab1))
### Fixed
- Fix `IndexError` bug for `.extract_text(layout=True)` on pages without text. ([#658](https://github.com/jsvine/pdfplumber/issues/658) + [ad3df11](https://github.com/jsvine/pdfplumber/commit/ad3df11)) [h/t @ethanscorey]
## [0.6.2] - 2022-05-06
### Added
- Add type annotations, and refactor parts of the library accordingly. ([9587cc7](https://github.com/jsvine/pdfplumber/commit/9587cc7d2292a1eae7a0150ab406f9365944266f))
- Add enforcement of type annotations via `mypy --strict`. ([cdfdb87](https://github.com/jsvine/pdfplumber/commit/cdfdb87a215fed6cdc0db3a218c35bf18d399cbe))
- Add final bits of test coverage. ([feb9d08](https://github.com/jsvine/pdfplumber/commit/feb9d082d7afb31edd0838cb93666d1e71c119da))
- Add `TableSettings` class, a behind-the-scenes handler for managing and validating table-extraction settings. ([9587cc7](https://github.com/jsvine/pdfplumber/commit/9587cc7d2292a1eae7a0150ab406f9365944266f))
### Changed
- Rename the positional argument to `.to_csv(...)` and `.to_json(...)` from `types` to `object_types`. ([9587cc7](https://github.com/jsvine/pdfplumber/commit/9587cc7d2292a1eae7a0150ab406f9365944266f))
- Tweak the output of `.to_json(...)` so that, if an object type is not present for a given page, it has no key in the page's object representation. ([9587cc7](https://github.com/jsvine/pdfplumber/commit/9587cc7d2292a1eae7a0150ab406f9365944266f))
### Removed
- Remove `utils.filter_objects(...)` and move the functionality to within the `FilteredPage.objects` property calculation, the only part of the library that used it. ([9587cc7](https://github.com/jsvine/pdfplumber/commit/9587cc7d2292a1eae7a0150ab406f9365944266f))
- Remove code that sets `pdfminer.pdftypes.STRICT = True` and `pdfminer.pdfinterp.STRICT = True`, since that [has now been the default for a while](https://github.com/pdfminer/pdfminer.six/commit/9439a3a31a347836aad1c1226168156125d9505f). ([9587cc7](https://github.com/jsvine/pdfplumber/commit/9587cc7d2292a1eae7a0150ab406f9365944266f))
## [0.6.1] - 2022-04-23
### Changed
- Bump pinned `pdfminer.six` version to `20220319`. ([e434ed0](https://github.com/jsvine/pdfplumber/commit/e434ed0b196f1f2c0b7f76e8ea2663e40c99e93c))
- Bump minimum `Pillow` version to `>=9.1`. ([d88eff1](https://github.com/jsvine/pdfplumber/commit/d88eff1e5354baa219ebff244fd4ab0e74db49c5))
- Drop support for Python 3.6 (EOL Dec. 2021) ([a32473e](https://github.com/jsvine/pdfplumber/commit/a32473ee5f9113d5c5a96b30270cafc58d170f46))
### Fixed
- If `pdfplumber.open(...)` opens a file but a `pdfminer.pdfparser.PSException` is raised during the process, `pdfplumber` now makes sure to close that file. ([#581](https://github.com/jsvine/pdfplumber/pull/581) + [#578](https://github.com/jsvine/pdfplumber/issues/578)) [h/t @johnhuge]
- Fix incompatibility with `Pillow>=9.1`. ([#637](https://github.com/jsvine/pdfplumber/issues/637))
## [0.6.0] - 2021-12-21
### Added
- Add `.extract_text(layout=True)`, an *experimental feature* which attempts to mimic the structural layout of the text on the page. ([#10](https://github.com/jsvine/pdfplumber/issues/10))
- Add `utils.merge_bboxes(bboxes)`, which returns the smallest bounding box that contains all bounding boxes in the `bboxes` argument. ([f8d5e70](https://github.com/jsvine/pdfplumber/commit/f8d5e70a509aa9ed3ee565d7d3f97bb5ec67f5a5))
- Add `--precision` argument to CLI ([#520](https://github.com/jsvine/pdfplumber/pull/520))
- Add `snap_x_tolerance` and `snap_y_tolerance` to table extraction settings. ([#51](https://github.com/jsvine/pdfplumber/pull/51) + [#475](https://github.com/jsvine/pdfplumber/issues/475)) [h/t @dustindall]
- Add `join_x_tolerance` and `join_y_tolerance` to table extraction settings. ([cbb34ce](https://github.com/jsvine/pdfplumber/commit/cbb34ce28b9b66d8d709304bbd0de267d82d75f3))
### Changed
- Upgrade `pdfminer.six` from `20200517` to `20211012`; see [that library's changelog](https://github.com/pdfminer/pdfminer.six/blob/develop/CHANGELOG.md) for details, but a key difference is an improvement in how it assigns `line`, `rect`, and `curve` objects. (Diagonal two-point lines, for instance, are now `line` objects instead of `curve` objects.) ([#515](https://github.com/jsvine/pdfplumber/pull/515))
- Remove Decimal-ization of parsed object attributes, which are now represented with as much precision as is returned by `pdfminer.six` ([#346](https://github.com/jsvine/pdfplumber/discussions/346) + [#520](https://github.com/jsvine/pdfplumber/pull/520))
- `.extract_text(...)` returns `""` instead of `None` when character list is empty. ([#482](https://github.com/jsvine/pdfplumber/issues/482) + [cb9900b](https://github.com/jsvine/pdfplumber/commit/cb9900b49706e96df520dbd1067c2a57a4cdb20d)) [h/t @tungph]
- `.extract_words(...)` now includes `doctop` among the attributes it returns for each word. ([66fef89](https://github.com/jsvine/pdfplumber/commit/66fef89b670cf95d13a5e23040c7bf9339944c01))
- Change behavior of horizontal `text_strategy`, so that it uses the top and bottom of *every* word, not just the top of every word and the bottom of the last. ([#467](https://github.com/jsvine/pdfplumber/pull/467) + [#466](https://github.com/jsvine/pdfplumber/issues/466) + [#265](https://github.com/jsvine/pdfplumber/issues/265)) [h/t @bobluda + @samkit-jain]
- Change `table.merge_edges(...)` behavior when `join_tolerance` (and `x`/`y` variants) `<= 0`, so that joining is attempted regardless, to handle cases of overlapping lines. ([cbb34ce](https://github.com/jsvine/pdfplumber/commit/cbb34ce28b9b66d8d709304bbd0de267d82d75f3))
- Raise error if certain table-extraction settings are negative. ([aa2d594](https://github.com/jsvine/pdfplumber/commit/aa2d594d3b3352dbcef503e4df2e045d69fc2511))
### Fixed
- Fix slowdown in `.extract_words(...)`/`WordExtractor.iter_chars_to_words(...)` on very long words, caused by repeatedly re-calculating bounding box. ([#483](https://github.com/jsvine/pdfplumber/discussions/483))
- Handle `UnicodeDecodeError` when trying to decode utf-16-encoded annotations ([#463](https://github.com/jsvine/pdfplumber/issues/463)) [h/t @tungph]
- Fix crash when extracting tables with null values in `(text|intersection)_(x|y)_tolerance` settings. ([#539](https://github.com/jsvine/pdfplumber/discussions/539)) [h/t @yoavxyoav]
### Removed
- Remove `pdfplumber.load(...)` method, which has been deprecated since `0.5.23` ([54cbbc5](https://github.com/jsvine/pdfplumber/commit/54cbbc5321b42f3976b2ac750c25b7b2ec6045d7))
### Development Changes
- Add `CONTRIBUTING.md` ([#428](https://github.com/jsvine/pdfplumber/pull/428))
- Enforce import order via [`isort`](https://pycqa.github.io/isort/index.html) ([d72b879](https://github.com/jsvine/pdfplumber/commit/d72b879665b410bd0f9c436d54ae60b3989489d5))
- Update Pillow and Wand versions in `requirements.txt` ([cae6924](https://github.com/jsvine/pdfplumber/commit/cae69246c53e49f95c1adbb5dffb3d49e726c5df))
- Update all dependency versions in `requirements-dev.txt` ([2f7e7ee](https://github.com/jsvine/pdfplumber/commit/2f7e7ee49172d681f34269a0db0276dffefa6386))
## [0.5.28] — 2021-05-08
### Added
- Add `--laparams` flag to CLI. ([#407](https://github.com/jsvine/pdfplumber/pull/407))
### Changed
- Change `.convert_csv(...)` to order objects first by page number, rather than object type. ([#407](https://github.com/jsvine/pdfplumber/pull/407))
- Change `.convert_csv(...)`, `.convert_json(...)`, and CLI so that, by default, they return all available object types, rather than those in a predefined default list. ([#407](https://github.com/jsvine/pdfplumber/pull/407))
### Fixed
- Fix `.extract_text(...)` so that it can accept generator objects as its main parameter. ([#385](https://github.com/jsvine/pdfplumber/pull/385)) [h/t @alexreg]
- Fix page-parsing so that `LTAnno` objects (which have no bounding-box coordinates) are not extracted. (Was only an issue when setting `laparams`.) ([#388](https://github.com/jsvine/pdfplumber/issues/383))
- Fix `Page.extract_table(...)` so that it honors text tolerance settings ([#415](https://github.com/jsvine/pdfplumber/issues/415)) [h/t @trifling]
## [0.5.27] — 2021-02-28
### Fixed
- Fix regression (introduced in `0.5.26`/[b1849f4](https://github.com/jsvine/pdfplumber/commit/b1849f4)) in closing files opened by `PDF.open`
- Reinstate access to higher-level layout objects (such as `textboxhorizontal`) when `laparams` is passed to `pdfplumber.open(...)`. Had been removed in `0.5.24` via [1f87898](https://github.com/jsvine/pdfplumber/commit/1f878988576017b64f5cd77e1eb21b401124c699). ([#359](https://github.com/jsvine/pdfplumber/issues/359) + [#364](https://github.com/jsvine/pdfplumber/pull/364))
### Development Changes
- Add a `python setup.py build sdist` test to main GitHub action. ([#365](https://github.com/jsvine/pdfplumber/pull/365))
## [0.5.26] — 2021-02-10
### Added
- Add `Page.close/__enter__/__exit__` methods, by generalizing that behavior through the `Container` class ([b1849f4](https://github.com/jsvine/pdfplumber/commit/b1849f4))
### Changed
- Change handling of floating point numbers; no longer convert them to `Decimal` objects and do not round them
- Change `TableFinder` to return tables in order of topmost-and-then-leftmost, rather than leftmost-and-then-topmost ([#336](https://github.com/jsvine/pdfplumber/issues/336))
- Change `Page.to_image()`'s handling of alpha layer, to remove aliasing artifacts ([#340](https://github.com/jsvine/pdfplumber/pull/340)) [h/t @arlyon]
### Development Changes
- Enforce `psf/black` and `flake8` on `tests/` ([#327](https://github.com/jsvine/pdfplumber/pull/327))
## [0.5.25] — 2020-12-09
### Added
- Add new boolean argument `strict_metadata` (default `False`) to `pdfplumber.open(...)` method for handling metadata resolution failures ([f2c510d](https://github.com/jsvine/pdfplumber/commit/f2c510d))
### Fixed
- Fix metadata extraction to handle integer/floating-point values ([cb32478](https://github.com/jsvine/pdfplumber/commit/cb32478)) ([#297](https://github.com/jsvine/pdfplumber/issues/297))
- Fix metadata extraction to handle nested metadata values ([2d9415](https://github.com/jsvine/pdfplumber/commit/2d9415)) ([#316](https://github.com/jsvine/pdfplumber/issues/316))
- Explicitly load text as utf-8 in `setup.py` ([7854328](https://github.com/jsvine/pdfplumber/commit/7854328)) ([#304](https://github.com/jsvine/pdfplumber/issues/304))
- Fix `pdfplumber.open(...)` so that it does not close file objects passed to it ([408605f](https://github.com/jsvine/pdfplumber/commit/408605f)) ([#312](https://github.com/jsvine/pdfplumber/issues/312))
## [0.5.24] — 2020-10-20
### Added
- Added `extra_attrs=[...]` parameter to `.extract_words(...)` ([c8b200e](https://github.com/jsvine/pdfplumber/commit/c8b200e)) ([#28](https://github.com/jsvine/pdfplumber/issues/28))
- Added `utils/page.dedupe_chars(...)` ([04fd56a](https://github.com/jsvine/pdfplumber/commit/04fd56a) + [b132d45](https://github.com/jsvine/pdfplumber/commit/b132d45)) ([#71](https://github.com/jsvine/pdfplumber/issues/71))
### Changed
- Change character attribute `upright` from `int` to `bool` (per original `pdfminer.six` representation) ([1f87898](https://github.com/jsvine/pdfplumber/commit/1f87898))
- Remove access and reference to `Container.figures`, given that they are not fundamental objects ([8e74cb9](https://github.com/jsvine/pdfplumber/commit/8e74cb9))
### Fixed
- Decimalize "simple" `explicit_horizontal_lines`/`explicit_vertical_lines` descs passed to `TableFinder` methods ([bc40779](https://github.com/jsvine/pdfplumber/commit/bc40779)) ([#290](https://github.com/jsvine/pdfplumber/issues/290))
### Development Changes
- Refactor/simplify `Page.process_objects` ([1f87898](https://github.com/jsvine/pdfplumber/commit/1f87898)), `utils.extract_words` ([c8b200e](https://github.com/jsvine/pdfplumber/commit/c8b200e)), and `convert.serialize` ([a74d3bc](https://github.com/jsvine/pdfplumber/commit/a74d3bc))
- Remove `test_issues.py:test_pr_77` ([917467a](https://github.com/jsvine/pdfplumber/commit/917467a)) and narrow `test_ca_warn_report:test_objects` ([6233bbd](https://github.com/jsvine/pdfplumber/commit/6233bbd)) to speed up tests
## [0.5.23] — 2020-08-15
### Added
- Add `utils.resolve` (non-recursive .resolve_all) ([7a90630](https://github.com/jsvine/pdfplumber/commit/7a90630))
- Add `page.annots` and `page.hyperlinks`, replacing non-functional `page.annos`, and mirroring pdfminer's language ("annot" vs. "anno"). ([aa03961](https://github.com/jsvine/pdfplumber/commit/aa03961))
- Add `page/pdf.to_json` and `page/pdf.to_csv` ([cbc91c6](https://github.com/jsvine/pdfplumber/commit/cbc91c6))
- Add `relative=True/False` parameter to `.crop` and `.within_bbox`; those methods also now raise exceptions for invalid and out-of-page bounding boxes. ([047ad34](https://github.com/jsvine/pdfplumber/commit/047ad34)) [h/t @samkit-jain]
### Changed
- Mark `pdfplumber.from_path` and `pdfplumber.load` as deprecated; now `pdfplumber.open` is the canonical way to load a PDF. ([00e789b](https://github.com/jsvine/pdfplumber/commit/00e789b))
- Simplify the logic in "text" table-finding strategies; in edge cases, may result in changes to results. ([d224202](https://github.com/jsvine/pdfplumber/commit/d224202))
- Drop support for Python 3.5 ([baf1033](https://github.com/jsvine/pdfplumber/commit/baf1033))
### Fixed
- Fix `.extract_words`, which had been returning incorrect results when `horizontal_ltr = False` ([d16aa13](https://github.com/jsvine/pdfplumber/commit/d16aa13))
- Fix `utils.resize_object`, which had been failing in various permutations ([d16aa13](https://github.com/jsvine/pdfplumber/commit/d16aa13))
- Fix `lines_strict` table-finding strategy, which a typo had prevented from being usable ([f0c9b85](https://github.com/jsvine/pdfplumber/commit/f0c9b85))
- Fix `utils.resolve_all` to guard against two known sources of infinite recursion ([cbc91c6](https://github.com/jsvine/pdfplumber/commit/cbc91c6))
### Development Changes
- Rename default branch to "stable," to clarify its purpose
- Reformat code with psf/black ([1258e09](https://github.com/jsvine/pdfplumber/commit/1258e09))
- Add code linting via psf/black and flake8 ([1258e09](https://github.com/jsvine/pdfplumber/commit/1258e09))
- Switch from nosetests to pytest ([1ac16dd](https://github.com/jsvine/pdfplumber/commit/1ac16dd))
- Switch from pipenv to standard requirements.txt + python -m venv ([48eaa51](https://github.com/jsvine/pdfplumber/commit/48eaa51))
- Add GitHub action for tests + codecov ([b148fd1](https://github.com/jsvine/pdfplumber/commit/b148fd1))
- Add Makefile for building development virtual environment and running tests ([4c69c58](https://github.com/jsvine/pdfplumber/commit/4c69c58))
- Add badges to README.md ([9e42dc3](https://github.com/jsvine/pdfplumber/commit/9e42dc3))
- Add Trove classifiers for Python versions to setup.py ([6946e8d](https://github.com/jsvine/pdfplumber/commit/6946e8d))
- Add MANIFEST.in ([eafc15c](https://github.com/jsvine/pdfplumber/commit/eafc15c))
- Add GitHub issue templates ([c4156d6](https://github.com/jsvine/pdfplumber/commit/c4156d6))
- Remove `pandas` from dev requirements and tests ([a5e7d7f](https://github.com/jsvine/pdfplumber/commit/a5e7d7f))
## [0.5.22] — 2020-07-18
### Changed
- Upgraded `pdfminer.six` requirement to `==20200517` ([cddbff7](https://github.com/jsvine/pdfplumber/commit/cddbff7)) [h/t @youngquan]
### Added
- Add support for `non_stroking_color` attribute on `char` objects ([0254da3](https://github.com/jsvine/pdfplumber/commit/0254da3)) [h/t @idan-david]
## [0.5.21] — 2020-05-27
### Fixed
- Fix `Page.extract_table(...)` to return `None` instead of crashing when no table is found ([d64afa8](https://github.com/jsvine/pdfplumber/commit/d64afa8)) [h/t @stucka]
## [0.5.20] — 2020-04-29
### Fixed
- Fix `.get_page_image` to prefer paths over streams, when possible ([ab957de](https://github.com/jsvine/pdfplumber/commit/ab957de)) [h/t @ubmarco]
- Local-fix pdfminer.six's `.resolve_all` to handle tuples and simplify ([85f422d](https://github.com/jsvine/pdfplumber/commit/85f422d))
### Changed
- Remove support for Python 2 and Python <3.3
## [0.5.19] — 2020-04-16
### Changed
- Add `utils.decimalize` performance improvement ([830d117](https://github.com/jsvine/pdfplumber/commit/830d117)) [h/t @ubmarco]
### Fixed
- Fix un-referenced method when using "text" table-finding strategy ([2a0c4a2](https://github.com/jsvine/pdfplumber/commit/2a0c4a2))
- Add missing object type `rect_edge` to `obj_to_edges()` ([0edc6bf](https://github.com/jsvine/pdfplumber/commit/0edc6bf))
## [0.5.18] — 2020-04-01
### Changed
- Allow `rect` and `curve` objects also to be passed to "explicit_..._lines" setting when table-finding. (And disallow other types of dicts to be passed.)
### Fixed
- Fix `utils.extract_text` bug introduced in prior version
## [0.5.17] — 2020-04-01
### Fixed
- Fix and simplify obj-in-bbox logic (see commit [25672961](https://github.com/jsvine/pdfplumber/commit/25672961))
- Improve/fix the way `utils.extract_text` handles vertical text (see commit [8a5d858b](https://github.com/jsvine/pdfplumber/commit/8a5d858b)) [h/t @dwalton76]
- Have `Page.to_image` use bytes stream instead of file path (Issue [#124](https://github.com/jsvine/pdfplumber/issues/124) / PR [#179](https://github.com/jsvine/pdfplumber/pull/179)) [h/t @cheungpat]
- Fix issue [#176](https://github.com/jsvine/pdfplumber/issues/176), in which `Page.extract_tables` did not pass kwargs to `Table.extract` [h/t @jsfenfen]
## [0.5.16] — 2020-01-12
### Fixed
- Prevent custom LAParams from raising exception (Issue [#168](https://github.com/jsvine/pdfplumber/issues/168) / PR [#169](https://github.com/jsvine/pdfplumber/pull/169)) [h/t @frascuchon]
- Add `six` as explicit dependency (for now)
## [0.5.15] — 2020-01-05
### Changed
- Upgrade `pdfminer.six` requirement to `==20200104`
- Upgrade `pillow` requirement to `>=7.0.0`
- Remove Python 2.7 and 3.4 from `tox` tests
## [0.5.14] — 2019-10-06
### Fixed
- Fix sorting bug in `page.extract_table()`
- Fix support for password-protected PDFs (PR [#138](https://github.com/jsvine/pdfplumber/pull/138))
## [0.5.13] — 2019-08-29
### Fixed
- Fixed PDF object resolution for rotation (PR [#136](https://github.com/jsvine/pdfplumber/pull/136))
## [0.5.12] — 2019-04-14
### Added
- `cdecimal` support for Python 2
- Support for password-protected PDFs
## [0.5.11] — 2018-11-13
### Added
- Caching for `.decimalize()` method
### Changed
- Upgrade to `pdfminer.six==20181108`
- Make whitespace checking more robust (PR [#88](https://github.com/jsvine/pdfplumber/pull/88))
### Fixed
- Fix issue [#75](https://github.com/jsvine/pdfplumber/issues/75) (`.to_image()` custom arguments)
- Fix issue raised in PR [#77](https://github.com/jsvine/pdfplumber/pull/77) (PDFObjRef resolution), and general class of problems
- Fix issue [#90](https://github.com/jsvine/pdfplumber/issues/90), and general class of problems, by explicitly typecasting each kind of PDF Object
## [0.5.10] — 2018-08-03
### Fixed
- Fix bug in which, when calling get_page_image(...), the alpha channel could make the whole page black out.
## [0.5.9] — 2018-07-10
### Fixed
- Fix issue [#67](https://github.com/jsvine/pdfplumber/issues/67), in which bool-type metadata were handled incorrectly
## [0.5.8] — 2018-03-06
### Fixed
- Fix issue [#53](https://github.com/jsvine/pdfplumber/issues/53), in which non-decimalize-able (non_)stroking_color properties were raising errors.
## [0.5.7] — 2018-01-20
### Added
- `.travis.yml`, but failing on `.to_image()`
### Changed
- Move from defunct `pycrypto` to `pycryptodome`
- Update `pdfminer.six` to `20170720`
## [0.5.6] — 2017-11-21
### Fixed
- Fix issue [#41](https://github.com/jsvine/pdfplumber/issues/41), in which PDF-object-referenced cropboxes/mediaboxes weren't being fully resolved.
## [0.5.5] — 2017-05-10
### Added
- Access to `__version__` from main namespace
### Fixed
- Fix issue [#33](https://github.com/jsvine/pdfplumber/issues/33), by checking `decode_text`'s argument type
## [0.5.4] — 2017-04-27
### Fixed
- Pin `pdfminer.six` to version `20151013` (for now), fixing incompatibility
## [0.5.3] — 2017-02-27
### Fixed
- Allow `import pdfplumber` even if ImageMagick not installed.
## [0.5.2] — 2017-02-27
### Added
- Access to `curve` points. (E.g., `page.curves[0]["points"]`.)
- Ability for `.draw_line` to draw `curve` points.
### Changed
- Disaggregated "min_words_vertical" (default: 3) and "min_words_horizontal" (default: 1), removing "text_word_threshold".
- Internally, made `utils.decimalize` a bit more robust; now throws errors on non-decimalizable items.
- Now explicitly ignoring some (obscure) `pdfminer` object attributes.
- Changed raw input for `.draw_line` from a bounding box to `((x, y), (x, y))`, for consistency with `curve["points"]` and with `Pillow`'s underlying method.
### Fixed
- Fixed typo bug when `.rect_edges` is called before `.edges`
## [0.5.1] — 2017-02-26
### Added
- Quick-draw `PageImage` methods: `.draw_vline`, `.draw_vlines`, `.draw_hline`, and `.draw_hlines`.
- Boolean parameter `keep_blank_chars` for `.extract_words(...)` and `TableFinder` settings.
### Changed
- Increased default `text_tolerance` and `intersection_tolerance` TableFinder values from 1 to 3.
### Fixed
- Properly handle conversion of PDFs with transparency to `pillow` images.
- Properly handle `pandas` DataFrames as inputs to multi-draw commands (e.g., `PageImage.draw_rects(...)`).
## [0.5.0] - 2017-02-25
### Added
- Visual debugging features, via `Page.to_image(...)` and `PageImage`. (Introduces `wand` and `pillow` as package requirements.)
- More powerful options for extracting data from tables. See changes below.
### Changed
- Entirely overhaul the table-extraction methods. Now based on [Anssi Nurminen's master's thesis](http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3).
- Disentangle `.crop` from `.intersects_bbox` and `.within_bbox`.
- Change default `x_tolerance` and `y_tolerance` for word extraction from `5` to `3`
### Fixed
- Fix bug stemming from non-decimalized page heights. [h/t @jsfenfen]
## [0.4.6] - 2017-01-26
### Added
- Provide access to `Page.page_number`
### Changed
- Use `.page_number` instead of `.page_id` as primary identifier. [h/t @jsfenfen]
- Change default `x_tolerance` and `y_tolerance` for word extraction from `0` to `5`
### Fixed
- Provide proper support for rotated pages
## [0.4.5] - 2016-12-09
### Fixed
- Fix bug stemming from when metadata includes a PostScript literal. [h/t @boblannon]
## [0.4.4] - Mistakenly skipped
Whoops.
## [0.4.3] - 2016-04-12
### Changed
- When extracting table cells, use chars' midpoints instead of top-points.
### Fixed
- Fix find_gutters — should ignore `" "` chars
================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
title: pdfplumber
type: software
version: 0.11.9
date-released: "2026-01-05"
authors:
  - family-names: "Singer-Vine"
    given-names: "Jeremy"
    email: "jsvine@gmail.com"
  - name: "The pdfplumber contributors"
repository-code: "https://github.com/jsvine/pdfplumber"
url: "https://github.com/jsvine/pdfplumber"
license: MIT
abstract: >-
  Plumb a PDF for detailed information about each char, rectangle,
  line, et cetera — and easily extract text and tables.
keywords:
- "pdf"
- "pdf parsing"
- "table extraction"
================================================
FILE: CONTRIBUTING.md
================================================
# Contribution Guidelines
Thank you for your interest in `pdfplumber`! Before submitting an issue or filing a pull request, please consult the brief notes and instructions below.
## Creating issues
- If you are __troubleshooting__ a specific PDF and have not identified a clear bug, please [open a discussion](https://github.com/jsvine/pdfplumber/discussions) instead of an issue.
- Malformed PDFs can often cause problems that cannot be directly fixed in `pdfplumber`. For that reason, please __try repairing__ your PDF using [Ghostscript](https://www.ghostscript.com/) before filing a bug report. To do so, run `gs -o repaired.pdf -sDEVICE=pdfwrite original.pdf`, replacing `original.pdf` with your PDF's actual filename.
- If your issue relates to __text not being displayed__ correctly, please compare the output to [`pdfminer.six`'s `pdf2txt` command](https://pdfminersix.readthedocs.io/en/latest/tutorial/commandline.html). If you're seeing the same problems there, please consult that repository instead of this one, because `pdfplumber` depends on `pdfminer.six` for text extraction.
- Please do fill out all requested sections of the __issue template__; doing so will help the maintainers and community more efficiently respond.
## Submitting pull requests
- If you would like to propose a change that is more __complex__ than a simple bug-fix, please [first open a discussion](https://github.com/jsvine/pdfplumber/discussions). If you are submitting a __simple__ bugfix, typo correction, et cetera, feel free to open a pull request directly.
- PRs should be submitted against the __`develop` branch__ only.
- PRs should contain one or more __tests__ that support the changes. The tests should pass with the new code but fail on the commits prior. For guidance, see the existing tests in the `tests/` directory. To execute the tests, run `make tests` or `python -m pytest`.
- Python code in PRs should conform to [`psf/black`](https://black.readthedocs.io/en/stable/), [`isort`](https://pycqa.github.io/isort/index.html), and [`flake8`](https://pypi.org/project/flake8/) __formatting__ guidelines. To automatically reformat your code accordingly, run `make format`. To test the formatting and `flake8` compliance, run `make lint`.
- Please add yourself to the [list of contributors](https://github.com/jsvine/pdfplumber#acknowledgments--contributors).
- Please also update the [CHANGELOG.md](https://github.com/jsvine/pdfplumber/blob/develop/CHANGELOG.md).
================================================
FILE: LICENSE.txt
================================================
The MIT License (MIT)
Copyright (c) 2015, Jeremy Singer-Vine
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: MANIFEST.in
================================================
include LICENSE.txt
include README.md
include requirements.txt
include requirements-dev.txt
include pdfplumber/py.typed
================================================
FILE: Makefile
================================================
# Development tasks for pdfplumber.
#
# Every tool is run from the virtual environment located at ${VENV}.
# Override the location with, e.g., `make VENV=/path/to/env venv`.

# None of these targets produce a file named after themselves, so declare
# them all phony; otherwise a stray file or directory with a matching name
# (e.g. `tests/`) would make the target look up to date.
.PHONY: venv tests check-black check-isort check-flake check-mypy lint format examples build

VENV ?= .venv
PYTHON = ${VENV}/bin/python

# Create the virtual environment and install runtime + development
# dependencies, plus pdfplumber itself in editable mode.
venv:
	# Must be created at ${VENV}: every command below (and every other
	# target) looks for its executables under ${VENV}/bin.
	python3 -m venv ${VENV}
	${VENV}/bin/pip install --upgrade pip
	${VENV}/bin/pip install -r requirements.txt
	${VENV}/bin/pip install -r requirements-dev.txt
	${VENV}/bin/pip install -e .

# Run the test suite (`-n auto` is passed through to pytest), then render
# the coverage report as HTML.
tests:
	${PYTHON} -m pytest -n auto
	${PYTHON} -m coverage html

# Individual lint checks; none of these modify files.
check-black:
	${VENV}/bin/black --check pdfplumber tests

check-isort:
	${VENV}/bin/isort --profile black --check-only pdfplumber tests

check-flake:
	${VENV}/bin/flake8 pdfplumber tests

check-mypy:
	${VENV}/bin/mypy --strict --implicit-reexport pdfplumber

# Run every lint check.
lint: check-flake check-mypy check-black check-isort

# Rewrite the source files in place using black and isort.
format:
	${VENV}/bin/black pdfplumber tests
	${VENV}/bin/isort --profile black pdfplumber tests

# Run nbexec over the example notebooks.
examples:
	${VENV}/bin/nbexec examples/notebooks

# Build the distribution packages via `python -m build`.
build:
	${PYTHON} -m build
================================================
FILE: README.md
================================================
# pdfplumber
[PyPI version](https://pypi.python.org/pypi/pdfplumber) [Code coverage](https://codecov.io/gh/jsvine/pdfplumber/branch/stable) [Supported Python versions](https://pypi.python.org/pypi/pdfplumber)
Plumb a PDF for detailed information about each text character, rectangle, and line. Plus: Table extraction and visual debugging.
Works best on machine-generated, rather than scanned, PDFs. Built on [`pdfminer.six`](https://github.com/goulu/pdfminer).
Currently [tested](tests/) on [Python 3.10, 3.11, 3.12, 3.13, 3.14](.github/workflows/tests.yml).
Translations of this document are available in: [Chinese (by @hbh112233abc)](https://github.com/hbh112233abc/pdfplumber/blob/stable/README-CN.md).
__To report a bug__ or request a feature, please [file an issue](https://github.com/jsvine/pdfplumber/issues/new/choose). __To ask a question__ or request assistance with a specific PDF, please [use the discussions forum](https://github.com/jsvine/pdfplumber/discussions).
## Table of Contents
- [Installation](#installation)
- [Command line interface](#command-line-interface)
- [Python library](#python-library)
- [Visual debugging](#visual-debugging)
- [Extracting text](#extracting-text)
- [Extracting tables](#extracting-tables)
- [Extracting form values](#extracting-form-values)
- [Demonstrations](#demonstrations)
- [Comparison to other libraries](#comparison-to-other-libraries)
- [Acknowledgments / Contributors](#acknowledgments--contributors)
- [Contributing](#contributing)
## Installation
```sh
pip install pdfplumber
```
## Command line interface
### Basic example
```sh
curl "https://raw.githubusercontent.com/jsvine/pdfplumber/stable/examples/pdfs/background-checks.pdf" > background-checks.pdf
pdfplumber background-checks.pdf > background-checks.csv
```
The output will be a CSV containing info about every character, line, and rectangle in the PDF.
### Options
| Argument | Description |
|----------|-------------|
|`--format [format]`| `csv`, `json`, or `text`. The `csv` and `json` formats return information about each object. Of those two, the `json` format returns more information; it includes PDF-level and page-level metadata, plus dictionary-nested attributes. The `text` option returns a plain-text representation of the PDF, using `Page.extract_text(layout=True)`.|
|`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.|
|`--types [list of object types to extract]`| Choices are `char`, `rect`, `line`, `curve`, `image`, `annot`, et cetera. Defaults to all available.|
|`--laparams`| A JSON-formatted string (e.g., `'{"detect_vertical": true}'`) to pass to `pdfplumber.open(..., laparams=...)`.|
|`--precision [integer]`| The number of decimal places to round floating-point numbers. Defaults to no rounding.|
## Python library
### Basic example
```python
import pdfplumber
with pdfplumber.open("path/to/file.pdf") as pdf:
first_page = pdf.pages[0]
print(first_page.chars[0])
```
### Loading a PDF
To start working with a PDF, call `pdfplumber.open(x)`, where `x` can be a:
- path to your PDF file
- file object, loaded as bytes
- file-like object, loaded as bytes
The `open` method returns an instance of the `pdfplumber.PDF` class.
To load a password-protected PDF, pass the `password` keyword argument, e.g., `pdfplumber.open("file.pdf", password = "test")`.
To pass layout analysis parameters to `pdfminer.six`'s layout engine, use the `laparams` keyword argument, e.g., `pdfplumber.open("file.pdf", laparams = { "line_overlap": 0.7 })`.
To [pre-normalize Unicode text](https://unicode.org/reports/tr15/), pass `unicode_norm=...`, where `...` is one of the [four Unicode normalization forms](https://unicode.org/reports/tr15/#Normalization_Forms_Table): `"NFC"`, `"NFD"`, `"NFKC"`, or `"NFKD"`.
Invalid metadata values are treated as a warning by default. If that is not intended, pass `strict_metadata=True` to the `open` method and `pdfplumber.open` will raise an exception if it is unable to parse the metadata.
### The `pdfplumber.PDF` class
The top-level `pdfplumber.PDF` class represents a single PDF and has two main properties:
| Property | Description |
|----------|-------------|
|`.metadata`| A dictionary of metadata key/value pairs, drawn from the PDF's `Info` trailers. Typically includes "CreationDate," "ModDate," "Producer," et cetera.|
|`.pages`| A list containing one `pdfplumber.Page` instance per page loaded.|
... and also has the following method:
| Method | Description |
|--------|-------------|
|`.close()`| Calling this method calls `Page.close()` on each page, and also closes the file stream (except in cases when the stream is external, i.e., already opened and passed directly to `pdfplumber`). |
### The `pdfplumber.Page` class
The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll do with `pdfplumber` will revolve around this class. It has these main properties:
| Property | Description |
|----------|-------------|
|`.page_number`| The sequential page number, starting with `1` for the first page, `2` for the second, and so on.|
|`.width`| The page's width.|
|`.height`| The page's height.|
|`.objects` / `.chars` / `.lines` / `.rects` / `.curves` / `.images`| Each of these properties is a list, and each list contains one dictionary for each such object embedded on the page. For more detail, see "[Objects](#objects)" below.|
... and these main methods:
| Method | Description |
|--------|-------------|
|`.crop(bounding_box, relative=False, strict=True)`| Returns a version of the page cropped to the bounding box, which should be expressed as a 4-tuple with the values `(x0, top, x1, bottom)`. Cropped pages retain objects that fall at least partly within the bounding box. If an object falls only partly within the box, its dimensions are sliced to fit the bounding box. If `relative=True`, the bounding box is calculated as an offset from the top-left of the page's bounding box, rather than an absolute positioning. (See [Issue #245](https://github.com/jsvine/pdfplumber/issues/245) for a visual example and explanation.) When `strict=True` (the default), the crop's bounding box must fall entirely within the page's bounding box.|
|`.within_bbox(bounding_box, relative=False, strict=True)`| Similar to `.crop`, but only retains objects that fall *entirely within* the bounding box.|
|`.outside_bbox(bounding_box, relative=False, strict=True)`| Similar to `.crop` and `.within_bbox`, but only retains objects that fall *entirely outside* the bounding box.|
|`.filter(test_function)`| Returns a version of the page with only the `.objects` for which `test_function(obj)` returns `True`.|
... and also has the following method:
| Method | Description |
|--------|-------------|
|`.close()`| By default, `Page` objects cache their layout and object information to avoid having to reprocess it. When parsing large PDFs, however, these cached properties can require a lot of memory. You can use this method to flush the cache and release the memory.|
Additional methods are described in the sections below:
- [Visual debugging](#visual-debugging)
- [Extracting text](#extracting-text)
- [Extracting tables](#extracting-tables)
### Objects
Each instance of `pdfplumber.PDF` and `pdfplumber.Page` provides access to several types of PDF objects, all derived from [`pdfminer.six`](https://github.com/pdfminer/pdfminer.six/) PDF parsing. The following properties each return a Python list of the matching objects:
- `.chars`, each representing a single text character.
- `.lines`, each representing a single 1-dimensional line.
- `.rects`, each representing a single 2-dimensional rectangle.
- `.curves`, each representing any series of connected points that `pdfminer.six` does not recognize as a line or rectangle.
- `.images`, each representing an image.
- `.annots`, each representing a single PDF annotation (cf. Section 8.4 of the [official PDF specification](https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf) for details)
- `.hyperlinks`, each representing a single PDF annotation of the subtype `Link` and having a `URI` action attribute
Each object is represented as a simple Python `dict`, with the following properties:
#### `char` properties
| Property | Description |
|----------|-------------|
|`page_number`| Page number on which this character was found.|
|`text`| E.g., "z", or "Z" or " ".|
|`fontname`| Name of the character's font face.|
|`size`| Font size.|
|`adv`| Equal to text width * the font size * scaling factor.|
|`upright`| Whether the character is upright.|
|`height`| Height of the character.|
|`width`| Width of the character.|
|`x0`| Distance of left side of character from left side of page.|
|`x1`| Distance of right side of character from left side of page.|
|`y0`| Distance of bottom of character from bottom of page.|
|`y1`| Distance of top of character from bottom of page.|
|`top`| Distance of top of character from top of page.|
|`bottom`| Distance of bottom of the character from top of page.|
|`doctop`| Distance of top of character from top of document.|
|`matrix`| The "current transformation matrix" for this character. (See below for details.)|
|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this character if any (otherwise `None`). *Experimental attribute.*|
|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this character if any (otherwise `None`). *Experimental attribute.*|
|`ncs`|TKTK|
|`stroking_pattern`|TKTK|
|`non_stroking_pattern`|TKTK|
|`stroking_color`|The color of the character's outline (i.e., stroke). See [docs/colors.md](docs/colors.md) for details.|
|`non_stroking_color`|The character's interior color. See [docs/colors.md](docs/colors.md) for details.|
|`object_type`| "char"|
__Note__: A character’s `matrix` property represents the “current transformation matrix,” as described in Section 4.2.2 of the [PDF Reference](https://ghostscript.com/~robin/pdf_reference17.pdf) (6th Ed.). The matrix controls the character’s scale, skew, and positional translation. Rotation is a combination of scale and skew, but in most cases can be considered equal to the x-axis skew. The `pdfplumber.ctm` submodule defines a class, `CTM`, that assists with these calculations. For instance:
```python
from pdfplumber.ctm import CTM
my_char = pdf.pages[0].chars[3]
my_char_ctm = CTM(*my_char["matrix"])
my_char_rotation = my_char_ctm.skew_x
```
#### `line` properties
| Property | Description |
|----------|-------------|
|`page_number`| Page number on which this line was found.|
|`height`| Height of line.|
|`width`| Width of line.|
|`x0`| Distance of left-side extremity from left side of page.|
|`x1`| Distance of right-side extremity from left side of page.|
|`y0`| Distance of bottom extremity from bottom of page.|
|`y1`| Distance of top extremity from bottom of page.|
|`top`| Distance of top of line from top of page.|
|`bottom`| Distance of bottom of the line from top of page.|
|`doctop`| Distance of top of line from top of document.|
|`linewidth`| Thickness of line.|
|`stroking_color`|The color of the line. See [docs/colors.md](docs/colors.md) for details.|
|`non_stroking_color`|The non-stroking color specified for the line’s path. See [docs/colors.md](docs/colors.md) for details.|
|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this line if any (otherwise `None`). *Experimental attribute.*|
|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this line if any (otherwise `None`). *Experimental attribute.*|
|`object_type`| "line"|
#### `rect` properties
| Property | Description |
|----------|-------------|
|`page_number`| Page number on which this rectangle was found.|
|`height`| Height of rectangle.|
|`width`| Width of rectangle.|
|`x0`| Distance of left side of rectangle from left side of page.|
|`x1`| Distance of right side of rectangle from left side of page.|
|`y0`| Distance of bottom of rectangle from bottom of page.|
|`y1`| Distance of top of rectangle from bottom of page.|
|`top`| Distance of top of rectangle from top of page.|
|`bottom`| Distance of bottom of the rectangle from top of page.|
|`doctop`| Distance of top of rectangle from top of document.|
|`linewidth`| Thickness of line.|
|`stroking_color`|The color of the rectangle's outline. See [docs/colors.md](docs/colors.md) for details.|
|`non_stroking_color`|The rectangle’s fill color. See [docs/colors.md](docs/colors.md) for details.|
|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this rect if any (otherwise `None`). *Experimental attribute.*|
|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this rect if any (otherwise `None`). *Experimental attribute.*|
|`object_type`| "rect"|
#### `curve` properties
| Property | Description |
|----------|-------------|
|`page_number`| Page number on which this curve was found.|
|`pts`| A list of `(x, top)` tuples indicating the *points on the curve*.|
|`path`| A list of `(cmd, *(x, top))` tuples *describing the full path description*, including (for example) control points used in Bezier curves.|
|`height`| Height of curve's bounding box.|
|`width`| Width of curve's bounding box.|
|`x0`| Distance of curve's left-most point from left side of page.|
|`x1`| Distance of curve's right-most point from left side of the page.|
|`y0`| Distance of curve's lowest point from bottom of page.|
|`y1`| Distance of curve's highest point from bottom of page.|
|`top`| Distance of curve's highest point from top of page.|
|`bottom`| Distance of curve's lowest point from top of page.|
|`doctop`| Distance of curve's highest point from top of document.|
|`linewidth`| Thickness of line.|
|`fill`| Whether the shape defined by the curve's path is filled.|
|`stroking_color`|The color of the curve's outline. See [docs/colors.md](docs/colors.md) for details.|
|`non_stroking_color`|The curve’s fill color. See [docs/colors.md](docs/colors.md) for details.|
|`dash`|A `([dash_array], dash_phase)` tuple describing the curve's dash style. See [Table 4.6 of the PDF specification](https://ghostscript.com/~robin/pdf_reference17.pdf#page=218) for details.|
|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this curve if any (otherwise `None`). *Experimental attribute.*|
|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this curve if any (otherwise `None`). *Experimental attribute.*|
|`object_type`| "curve"|
#### Derived properties
Additionally, both `pdfplumber.PDF` and `pdfplumber.Page` provide access to several derived lists of objects: `.rect_edges` (which decomposes each rectangle into its four lines), `.curve_edges` (which does the same for `curve` objects), and `.edges` (which combines `.rect_edges`, `.curve_edges`, and `.lines`).
#### `image` properties
*Note: Although the positioning and characteristics of `image` objects are available via `pdfplumber`, this library does not provide direct support for reconstructing image content. For that, please see [this suggestion](https://github.com/jsvine/pdfplumber/discussions/496#discussioncomment-1259772).*
| Property | Description |
|----------|-------------|
|`page_number`| Page number on which the image was found.|
|`height`| Height of the image.|
|`width`| Width of the image.|
|`x0`| Distance of left side of the image from left side of page.|
|`x1`| Distance of right side of the image from left side of page.|
|`y0`| Distance of bottom of the image from bottom of page.|
|`y1`| Distance of top of the image from bottom of page.|
|`top`| Distance of top of the image from top of page.|
|`bottom`| Distance of bottom of the image from top of page.|
|`doctop`| Distance of top of the image from top of document.|
|`srcsize`| The image's original dimensions, as a `(width, height)` tuple.|
|`colorspace`| Color domain of the image (e.g., RGB).|
|`bits`| The number of bits per color component; e.g., 8 corresponds to 256 possible values for each color component (R, G, and B in an RGB color space).|
|`stream`| Pixel values of the image, as a `pdfminer.pdftypes.PDFStream` object.|
|`imagemask`| A nullable boolean; if `True`, "specifies that the image data is to be used as a stencil mask for painting in the current color."|
|`name`| "The name by which this image XObject is referenced in the XObject subdictionary of the current resource dictionary." [🔗](https://ghostscript.com/~robin/pdf_reference17.pdf#page=340) |
|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this image if any (otherwise `None`). *Experimental attribute.*|
|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this image if any (otherwise `None`). *Experimental attribute.*|
|`object_type`| "image"|
### Obtaining higher-level layout objects via `pdfminer.six`
If you pass the `pdfminer.six`-handling `laparams` parameter to `pdfplumber.open(...)`, then each page's `.objects` dictionary will also contain `pdfminer.six`'s higher-level layout objects, such as `"textboxhorizontal"`.
## Visual debugging
`pdfplumber`'s visual debugging tools can be helpful in understanding the structure of a PDF and the objects that have been extracted from it.
### Creating a `PageImage` with `.to_image()`
To turn any page (including cropped pages) into a `PageImage` object, call `my_page.to_image()`. You can optionally pass *one* of the following keyword arguments:
- `resolution`: The desired number of pixels per inch. Default: `72`. Type: `int`.
- `width`: The desired image width in pixels. Default: unset, determined by `resolution`. Type: `int`.
- `height`: The desired image height in pixels. Default: unset, determined by `resolution`. Type: `int`.
- `antialias`: Whether to use antialiasing when creating the image. Setting to `True` creates images with less-jagged text and graphics, but with larger file sizes. Default: `False`. Type: `bool`.
- `force_mediabox`: Use the page's `.mediabox` dimensions, rather than the `.cropbox` dimensions. Default: `False`. Type: `bool`.
For instance:
```python
im = my_pdf.pages[0].to_image(resolution=150)
```
From a script or REPL, `im.show()` will open the image in your local image viewer. But `PageImage` objects also play nicely with Jupyter notebooks; they automatically render as cell outputs. For example:

*Note*: `.to_image(...)` works as expected with `Page.crop(...)`/`CroppedPage` instances, but is unable to incorporate changes made via `Page.filter(...)`/`FilteredPage` instances.
### Basic `PageImage` methods
| Method | Description |
|--------|-------------|
|`im.reset()`| Clears anything you've drawn so far.|
|`im.copy()`| Copies the image to a new `PageImage` object.|
|`im.show()`| Opens the image in your local image viewer.|
|`im.save(path_or_fileobject, format="PNG", quantize=True, colors=256, bits=8)`| Saves the annotated image as a PNG file. The default arguments quantize the image to a palette of 256 colors, saving the PNG with 8-bit color depth. You can disable quantization by passing `quantize=False` or adjust the size of the color palette by passing `colors=N`.|
### Drawing methods
You can pass explicit coordinates or any `pdfplumber` PDF object (e.g., char, line, rect) to these methods.
| Single-object method | Bulk method | Description |
|----------------------|-------------|-------------|
|`im.draw_line(line, stroke={color}, stroke_width=1)`| `im.draw_lines(list_of_lines, **kwargs)`| Draws a line from a `line`, `curve`, or a 2-tuple of 2-tuples (e.g., `((x, y), (x, y))`).|
|`im.draw_vline(location, stroke={color}, stroke_width=1)`| `im.draw_vlines(list_of_locations, **kwargs)`| Draws a vertical line at the x-coordinate indicated by `location`.|
|`im.draw_hline(location, stroke={color}, stroke_width=1)`| `im.draw_hlines(list_of_locations, **kwargs)`| Draws a horizontal line at the y-coordinate indicated by `location`.|
|`im.draw_rect(bbox_or_obj, fill={color}, stroke={color}, stroke_width=1)`| `im.draw_rects(list_of_rects, **kwargs)`| Draws a rectangle from a `rect`, `char`, etc., or 4-tuple bounding box.|
|`im.draw_circle(center_or_obj, radius=5, fill={color}, stroke={color})`| `im.draw_circles(list_of_circles, **kwargs)`| Draws a circle at `(x, y)` coordinate or at the center of a `char`, `rect`, etc.|
Note: The methods above are built on Pillow's [`ImageDraw` methods](http://pillow.readthedocs.io/en/latest/reference/ImageDraw.html), but the parameters have been tweaked for consistency with SVG's `fill`/`stroke`/`stroke_width` nomenclature.
### Visually debugging the table-finder
`im.debug_tablefinder(table_settings={})` will return a version of the PageImage with the detected lines (in red), intersections (circles), and tables (light blue) overlaid.
## Extracting text
`pdfplumber` can extract text from any given page (including cropped and derived pages). It can also attempt to preserve the layout of that text, as well as to identify the coordinates of words and search queries. `Page` objects can call the following text-extraction methods:
| Method | Description |
|--------|-------------|
|`.extract_text(x_tolerance=3, x_tolerance_ratio=None, y_tolerance=3, layout=False, x_density=7.25, y_density=13, line_dir_render=None, char_dir_render=None, **kwargs)`| Collates all of the page's character objects into a single string.
When `layout=False`: Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. (If `x_tolerance_ratio` is not `None`, the extractor uses a dynamic `x_tolerance` equal to `x_tolerance_ratio * previous_character["size"]`.) Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`.
When `layout=True` (*experimental feature*): Attempts to mimic the structural layout of the text on the page(s), using `x_density` and `y_density` to determine the minimum number of characters/newlines per "point," the PDF unit of measurement. Passing `line_dir_render="ttb"/"btt"/"ltr"/"rtl"` and/or `char_dir_render="ttb"/"btt"/"ltr"/"rtl"` will output the lines/characters in a different direction than the default. All remaining `**kwargs` are passed to `.extract_words(...)` (see below), the first step in calculating the layout.
|
|`.extract_text_simple(x_tolerance=3, y_tolerance=3)`| A slightly faster but less flexible version of `.extract_text(...)`, using a simpler logic.|
|`.extract_words(x_tolerance=3, x_tolerance_ratio=None, y_tolerance=3, keep_blank_chars=False, use_text_flow=False, line_dir="ttb", char_dir="ltr", line_dir_rotated="ttb", char_dir_rotated="ltr", extra_attrs=[], split_at_punctuation=False, expand_ligatures=True, return_chars=False)`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the difference between the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. (If `x_tolerance_ratio` is not `None`, the extractor uses a dynamic `x_tolerance` equal to `x_tolerance_ratio * previous_character["size"]`.) A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. Changing `keep_blank_chars` to `True` will mean that blank characters are treated as part of a word, not as a space between words. Changing `use_text_flow` to `True` will use the PDF's underlying flow of characters as a guide for ordering and segmenting the words, rather than presorting the characters by x/y position. (This mimics how dragging a cursor highlights text in a PDF; as with that, the order does not always appear to be logical.) The arguments `line_dir` and `char_dir` tell this method the direction in which lines/characters are expected to be read; valid options are "ttb" (top-to-bottom), "btt" (bottom-to-top), "ltr" (left-to-right), and "rtl" (right-to-left). The `line_dir_rotated` and `char_dir_rotated` arguments are similar, but for text that has been rotated. Passing a list of `extra_attrs` (e.g., `["fontname", "size"]`) will restrict each word to characters that share exactly the same value for each of those [attributes](#char-properties), and the resulting word dicts will indicate those attributes. 
Setting `split_at_punctuation` to `True` will enforce breaking tokens at the punctuation characters specified by `string.punctuation`; or you can specify the list of separating punctuation by passing a string, e.g., split_at_punctuation='!"&\'()*+,.:;<=>?@[\]^\`\{\|\}~'. Unless you set `expand_ligatures=False`, ligatures such as `ﬁ` will be expanded into their constituent letters (e.g., `fi`). Passing `return_chars=True` will add, to each word dictionary, a list of its constituent characters, as a list in the `"chars"` field.|
|`.extract_text_lines(layout=False, strip=True, return_chars=True, **kwargs)`|*Experimental feature* that returns a list of dictionaries representing the lines of text on the page. The `strip` parameter works analogously to Python's `str.strip()` method, and returns `text` attributes without their surrounding whitespace. (Only relevant when `layout = True`.) Setting `return_chars` to `False` will exclude the individual character objects from the returned text-line dicts. The remaining `**kwargs` are those you would pass to `.extract_text(layout=True, ...)`.|
|`.search(pattern, regex=True, case=True, main_group=0, return_groups=True, return_chars=True, layout=False, **kwargs)`|*Experimental feature* that allows you to search a page's text, returning a list of all instances that match the query. For each instance, the response dictionary object contains the matching text, any regex group matches, the bounding box coordinates, and the char objects themselves. `pattern` can be a compiled regular expression, an uncompiled regular expression, or a non-regex string. If `regex` is `False`, the pattern is treated as a non-regex string. If `case` is `False`, the search is performed in a case-insensitive manner. Setting `main_group` restricts the results to a specific regex group within the `pattern` (default of `0` means the entire match). Setting `return_groups` and/or `return_chars` to `False` will exclude the lists of the matched regex groups and/or characters from being added (as `"groups"` and `"chars"` to the return dicts). The `layout` parameter operates as it does for `.extract_text(...)`. The remaining `**kwargs` are those you would pass to `.extract_text(layout=True, ...)`. __Note__: Zero-width and all-whitespace matches are discarded, because they (generally) have no explicit position on the page. |
|`.dedupe_chars(tolerance=1, extra_attrs=("fontname", "size"))`| Returns a version of the page with duplicate chars — those sharing the same text, positioning (within `tolerance` x/y), and `extra_attrs` as other characters — removed. (See [Issue #71](https://github.com/jsvine/pdfplumber/issues/71) to understand the motivation.)|
## Extracting tables
`pdfplumber`'s approach to table detection borrows heavily from [Anssi Nurminen's master's thesis](https://trepo.tuni.fi/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3), and is inspired by [Tabula](https://github.com/tabulapdf/tabula-extractor/issues/16). It works like this:
1. For any given PDF page, find the lines that are (a) explicitly defined and/or (b) implied by the alignment of words on the page.
2. Merge overlapping, or nearly-overlapping, lines.
3. Find the intersections of all those lines.
4. Find the most granular set of rectangles (i.e., cells) that use these intersections as their vertices.
5. Group contiguous cells into tables.
### Table-extraction methods
`pdfplumber.Page` objects can call the following table methods:
| Method | Description |
|--------|-------------|
|`.find_tables(table_settings={})`|Returns a list of `Table` objects. The `Table` object provides access to the `.cells`, `.rows`, `.columns`, and `.bbox` properties, as well as the `.extract(x_tolerance=3, y_tolerance=3)` method.|
|`.find_table(table_settings={})`|Similar to `.find_tables(...)`, but returns the *largest* table on the page, as a `Table` object. If multiple tables have the same size — as measured by the number of cells — this method returns the table closest to the top of the page.|
|`.extract_tables(table_settings={})`|Returns the text extracted from *all* tables found on the page, represented as a list of lists of lists, with the structure `table -> row -> cell`.|
|`.extract_table(table_settings={})`|Returns the text extracted from the *largest* table on the page (see `.find_table(...)` above), represented as a list of lists, with the structure `row -> cell`.|
|`.debug_tablefinder(table_settings={})`|Returns an instance of the `TableFinder` class, with access to the `.edges`, `.intersections`, `.cells`, and `.tables` properties.|
For example:
```python
pdf = pdfplumber.open("path/to/my.pdf")
page = pdf.pages[0]
page.extract_table()
```
[Click here for a more detailed example.](examples/notebooks/extract-table-ca-warn-report.ipynb)
### Table-extraction settings
By default, `extract_tables` uses the page's vertical and horizontal lines (or rectangle edges) as cell-separators. But the method is highly customizable via the `table_settings` argument. The possible settings, and their defaults:
```python
{
"vertical_strategy": "lines",
"horizontal_strategy": "lines",
"explicit_vertical_lines": [],
"explicit_horizontal_lines": [],
"snap_tolerance": 3,
"snap_x_tolerance": 3,
"snap_y_tolerance": 3,
"join_tolerance": 3,
"join_x_tolerance": 3,
"join_y_tolerance": 3,
"edge_min_length": 3,
"edge_min_length_prefilter": 1,
"min_words_vertical": 3,
"min_words_horizontal": 1,
"intersection_tolerance": 3,
"intersection_x_tolerance": 3,
"intersection_y_tolerance": 3,
"text_tolerance": 3,
"text_x_tolerance": 3,
"text_y_tolerance": 3,
"text_*": …, # See below
}
```
| Setting | Description |
|---------|-------------|
|`"vertical_strategy"`| Either `"lines"`, `"lines_strict"`, `"text"`, or `"explicit"`. See explanation below.|
|`"horizontal_strategy"`| Either `"lines"`, `"lines_strict"`, `"text"`, or `"explicit"`. See explanation below.|
|`"explicit_vertical_lines"`| A list of vertical lines that explicitly demarcate cells in the table. Can be used in combination with any of the strategies above. Items in the list should be either numbers — indicating the `x` coordinate of a line the full height of the page — or `line`/`rect`/`curve` objects.|
|`"explicit_horizontal_lines"`| A list of horizontal lines that explicitly demarcate cells in the table. Can be used in combination with any of the strategies above. Items in the list should be either numbers — indicating the `y` coordinate of a line the full width of the page — or `line`/`rect`/`curve` objects.|
|`"snap_tolerance"`, `"snap_x_tolerance"`, `"snap_y_tolerance"`| Parallel lines within `snap_tolerance` points will be "snapped" to the same horizontal or vertical position.|
|`"join_tolerance"`, `"join_x_tolerance"`, `"join_y_tolerance"`| Line segments on the same infinite line, and whose ends are within `join_tolerance` of one another, will be "joined" into a single line segment.|
|`"edge_min_length"`| Edges shorter than `edge_min_length` will be discarded before attempting to reconstruct the table.|
|`"edge_min_length_prefilter"`| Edges shorter than `edge_min_length_prefilter` will be discarded during initial edge filtering from the page. Lowering this value (e.g., to `0.5`) can help capture small dashed lines that might otherwise be filtered out.|
|`"min_words_vertical"`| When using `"vertical_strategy": "text"`, at least `min_words_vertical` words must share the same alignment.|
|`"min_words_horizontal"`| When using `"horizontal_strategy": "text"`, at least `min_words_horizontal` words must share the same alignment.|
|`"intersection_tolerance"`, `"intersection_x_tolerance"`, `"intersection_y_tolerance"`| When combining edges into cells, orthogonal edges must be within `intersection_tolerance` points to be considered intersecting.|
|`"text_*"`| All settings prefixed with `text_` are then used when extracting text from each discovered table. All possible arguments to `Page.extract_text(...)` are also valid here.|
|`"text_x_tolerance"`, `"text_y_tolerance"`| These `text_`-prefixed settings *also* apply to the table-identification algorithm when the `text` strategy is used. I.e., when that algorithm searches for words, it will expect the individual letters in each word to be no more than `text_x_tolerance`/`text_y_tolerance` points apart.|
### Table-extraction strategies
Both `vertical_strategy` and `horizontal_strategy` accept the following options:
| Strategy | Description |
|----------|-------------|
| `"lines"` | Use the page's graphical lines — including the sides of rectangle objects — as the borders of potential table-cells. |
| `"lines_strict"` | Use the page's graphical lines — but *not* the sides of rectangle objects — as the borders of potential table-cells. |
| `"text"` | For `vertical_strategy`: Deduce the (imaginary) lines that connect the left, right, or center of words on the page, and use those lines as the borders of potential table-cells. For `horizontal_strategy`, the same but using the tops of words. |
| `"explicit"` | Only use the lines explicitly defined in `explicit_vertical_lines` / `explicit_horizontal_lines`. |
### Notes
- Often it's helpful to crop a page — `Page.crop(bounding_box)` — before trying to extract the table.
- Table extraction for `pdfplumber` was radically redesigned for `v0.5.0`, and introduced breaking changes.
## Extracting form values
Sometimes PDF files can contain forms that include inputs that people can fill out and save. While values in form fields appear like other text in a PDF file, form data is handled differently. If you want the gory details, see page 671 of this [specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf).
`pdfplumber` doesn't have an interface for working with form data, but you can access it using `pdfplumber`'s wrappers around `pdfminer`.
For example, this snippet will retrieve form field names and values and store them in a dictionary.
```python
import pdfplumber
from pdfplumber.utils.pdfinternals import resolve_and_decode, resolve
pdf = pdfplumber.open("document_with_form.pdf")
def parse_field_helper(form_data, field, prefix=None):
    """Append the AcroForm name/value data found in `field` to `form_data`.

    Child fields (listed under "Kids") are parsed recursively, with this
    field's dotted name passed down as their prefix.
    """
    node = field.resolve()

    # Build the dotted name from whichever of prefix / "T" is non-empty.
    own_name = resolve_and_decode(node.get("T"))
    field_name = '.'.join(part for part in (prefix, own_name) if part)

    if "Kids" in node:
        for kid_field in node["Kids"]:
            parse_field_helper(form_data, kid_field, prefix=field_name)

    # "T" is a field-name, but it's sometimes absent.
    # "TU" is the "alternate field name" and is often more human-readable;
    # your PDF may have one, the other, or both.
    if not ("T" in node or "TU" in node):
        return

    if node.get("TU"):
        alternate_field_name = resolve_and_decode(node.get("TU"))
    else:
        alternate_field_name = None

    if 'V' in node:
        field_value = resolve_and_decode(node["V"])
    else:
        field_value = None

    form_data.append([field_name, alternate_field_name, field_value])


form_data = []
fields = resolve(resolve(pdf.doc.catalog["AcroForm"])["Fields"])
for field in fields:
    parse_field_helper(form_data, field)
```
Once you run this script, `form_data` is a list containing a three-element list for each form element. For instance, a PDF form with a city and state field might look like this.
```
[
['STATE.0', 'enter STATE', 'CA'],
['section 2 accident infoRmation.1.0',
'enter city of accident',
'SAN FRANCISCO']
]
```
*Thanks to [@jeremybmerrill](https://github.com/jeremybmerrill) for helping to maintain the form-parsing code above.*
## Demonstrations
- [Using `extract_table` on a California Worker Adjustment and Retraining Notification (WARN) report](examples/notebooks/extract-table-ca-warn-report.ipynb). Demonstrates basic visual debugging and table extraction.
- [Using `extract_table` on the FBI's National Instant Criminal Background Check System PDFs](examples/notebooks/extract-table-nics.ipynb). Demonstrates how to use visual debugging to find optimal table extraction settings. Also demonstrates `Page.crop(...)` and `Page.extract_text(...)`.
- [Inspecting and visualizing `curve` objects](examples/notebooks/ag-energy-roundup-curves.ipynb).
- [Extracting fixed-width data from a San Jose PD firearm search report](examples/notebooks/san-jose-pd-firearm-report.ipynb), an example of using `Page.extract_text(...)`.
## Comparison to other libraries
Several other Python libraries help users to extract information from PDFs. As a broad overview, `pdfplumber` distinguishes itself from other PDF processing libraries by combining these features:
- Easy access to detailed information about each PDF object
- Higher-level, customizable methods for extracting text and tables
- Tightly integrated visual debugging
- Other useful utility functions, such as filtering objects via a crop-box
It's also helpful to know what features `pdfplumber` does __not__ provide:
- PDF *generation*
- PDF *modification*
- Optical character recognition (OCR)
- Strong support for extracting tables from OCR'ed documents
### Specific comparisons
- [`pdfminer.six`](https://github.com/pdfminer/pdfminer.six) provides the foundation for `pdfplumber`. It primarily focuses on parsing PDFs, analyzing PDF layouts and object positioning, and extracting text. It does not provide tools for table extraction or visual debugging. License: [MIT](https://github.com/pdfminer/pdfminer.six?tab=MIT-1-ov-file).
- [`PyPDF2`](https://github.com/mstamy2/PyPDF2) is a pure-Python library "capable of splitting, merging, cropping, and transforming the pages of PDF files. It can also add custom data, viewing options, and passwords to PDF files." It can extract page text, but does not provide easy access to shape objects (rectangles, lines, etc.), table-extraction, or visual debugging tools. License: [BSD](https://github.com/py-pdf/pypdf?tab=License-1-ov-file#readme).
- [`pymupdf`](https://pymupdf.readthedocs.io/) is substantially faster than `pdfminer.six` (and thus also `pdfplumber`) and can generate and modify PDFs, but the library requires installation of non-Python software (MuPDF). It also does not enable easy access to shape objects (rectangles, lines, etc.), and does not provide table-extraction or visual debugging tools. License: [AGPL](https://pymupdf.readthedocs.io/en/latest/about.html#license-and-copyright).
- [`camelot`](https://github.com/camelot-dev/camelot), [`tabula-py`](https://github.com/chezou/tabula-py), and [`pdftables`](https://github.com/drj11/pdftables) all focus primarily on extracting tables. In some cases, they may be better suited to the particular tables you are trying to extract. License: [MIT](https://github.com/camelot-dev/camelot?tab=MIT-1-ov-file#readme) (`camelot`), [MIT](https://github.com/chezou/tabula-py?tab=MIT-1-ov-file#readme) (`tabula-py`), [BSD](https://github.com/drj11/pdftables?tab=BSD-2-Clause-1-ov-file#readme) (`pdftables`).
## Acknowledgments / Contributors
Many thanks to the following users who've contributed ideas, features, and fixes:
- [Jacob Fenton](https://github.com/jsfenfen)
- [Dan Nguyen](https://github.com/dannguyen)
- [Jeff Barrera](https://github.com/jeffbarrera)
- [Bob Lannon](https://github.com/boblannon)
- [Dustin Tindall](https://github.com/dustindall)
- [@yevgnen](https://github.com/Yevgnen)
- [@meldonization](https://github.com/meldonization)
- [Oisín Moran](https://github.com/OisinMoran)
- [Samkit Jain](https://github.com/samkit-jain)
- [Francisco Aranda](https://github.com/frascuchon)
- [Kwok-kuen Cheung](https://github.com/cheungpat)
- [Marco](https://github.com/ubmarco)
- [Idan David](https://github.com/idan-david)
- [@xv44586](https://github.com/xv44586)
- [Alexander Regueiro](https://github.com/alexreg)
- [Daniel Peña](https://github.com/trifling)
- [@bobluda](https://github.com/bobluda)
- [@ramcdona](https://github.com/ramcdona)
- [@johnhuge](https://github.com/johnhuge)
- [Jhonatan Lopes](https://github.com/jhonatan-lopes)
- [Ethan Corey](https://github.com/ethanscorey)
- [Shannon Shen](https://github.com/lolipopshock)
- [Matsumoto Toshi](https://github.com/toshi1127)
- [John West](https://github.com/jwestwsj)
- [David Huggins-Daines](https://github.com/dhdaines)
- [Jeremy B. Merrill](https://github.com/jeremybmerrill)
- [Echedey Luis](https://github.com/echedey-ls)
- [Andy Friedman](https://github.com/afriedman412)
- [Aron Weiler](https://github.com/aronweiler)
- [Quentin André](https://github.com/QuentinAndre11)
- [Léo Roux](https://github.com/leorouxx)
- [@wodny](https://github.com/wodny)
- [Michal Stolarczyk](https://github.com/stolarczyk)
- [Brandon Roberts](https://github.com/brandonrobertz)
- [@ennamarie19](https://github.com/ennamarie19)
- [Anton Ilin](https://github.com/bronislav)
## Contributing
Pull requests are welcome, but please submit a proposal issue first, as the library is in active development.
Current maintainers:
- [Jeremy Singer-Vine](https://github.com/jsvine)
- [Samkit Jain](https://github.com/samkit-jain)
================================================
FILE: codecov.yml
================================================
codecov:
branch: stable
================================================
FILE: docs/colors.md
================================================
# Colors
In the PDF specification, as well as in `pdfplumber`, most graphical objects can have two color attributes:
- `stroking_color`: The color of the object's outline
- `non_stroking_color`: The color of the object's interior, or "fill"
In the PDF specification, colors have both a "color space" and a "color value".
## Color Spaces
Valid color spaces are grouped into three categories:
- Device color spaces
- `DeviceGray`
- `DeviceRGB`
- `DeviceCMYK`
- CIE-based color spaces
- `CalGray`
- `CalRGB`
- `Lab`
- `ICCBased`
- Special color spaces
- `Indexed`
- `Pattern`
- `Separation`
- `DeviceN`
To read more about the differences between those color spaces, see section 4.5 [here](https://ghostscript.com/~robin/pdf_reference17.pdf).
`pdfplumber` aims to expose those color spaces as `scs` (stroking color space) and `ncs` (non-stroking color space), represented as a __string__.
__Caveat__: The only information `pdfplumber` can __currently__ expose is the non-stroking color space for `char` objects. The rest (stroking color space for `char` objects and either color space for the other types of objects) will require a pull request to `pdfminer.six`.
## Color Values
The color value determines *what specific color* in the color space should be used. With the exception of the "special color spaces," these color values are specified as a series of numbers. For `DeviceRGB`, for example, the color values are three numbers, representing the intensities of red, green, and blue.
In `pdfplumber`, those color values are exposed as `stroking_color` and `non_stroking_color`, represented as a __tuple of numbers__.
The pattern specified by the `Pattern` color space is exposed via the `non_stroking_pattern` and `stroking_pattern` attributes.
================================================
FILE: docs/repairing.md
================================================
# Repairing Malformed PDFs
Many parsing issues can be traced back to malformed PDFs.
Malformed PDFs can often be [fixed via Ghostscript](https://superuser.com/questions/278562/how-can-i-fix-repair-a-corrupted-pdf-file).
`pdfplumber` lets you automatically run those repairs, in several ways:
- `pdfplumber.open(..., repair=True)` will repair your PDF on the fly (but not save the repaired version to disk).
- `pdfplumber.repair(path_to_pdf)` will return a `BytesIO` object holding the bytes of a repaired version of the original file.
- `pdfplumber.repair(path_to_pdf, outfile="path/to/repaired.pdf")` will write a repaired version of the original file to the indicated `outfile` path.
## Custom parameters
- `gs_path=...`: You can pass a custom path for the Ghostscript executable, helpful in case `pdfplumber` is unable to auto-detect your copy of Ghostscript.
================================================
FILE: docs/structure.md
================================================
# Structure Tree
Since PDF 1.3 it is possible for a PDF to contain logical structure,
contained in a *structure tree*. In conjunction with PDF 1.2 [marked
content sections](#marked-content-sections) this forms the basis of
Tagged PDF and other accessibility features.
Unfortunately, since all of these standards are optional and variably
implemented in PDF authoring tools, and are frequently not enabled by
default, it is not possible to rely on them to extract the structure
of a PDF and associated content. Nonetheless they can be useful as
features for a heuristic or machine-learning based system, or for
extracting particular structures such as tables.
Since `pdfplumber`'s API is page-based, the structure is available for
a particular page, using the `structure_tree` attribute:
with pdfplumber.open(pdffile) as pdf:
for element in pdf.pages[0].structure_tree:
print(element["type"], element["mcids"])
for child in element.children:
print(child["type"], child["mcids"])
The `type` field contains the type of the structure element - the
standard structure types can be seen in section 10.7.3 of [the PDF 1.7
reference
document](https://ghostscript.com/~robin/pdf_reference17.pdf#page=898),
but usually they are rather HTML-like, if created by a recent PDF
authoring tool (notably, older tools may simply produce `P` for
everything).
The `mcids` field contains the list of marked content section IDs
corresponding to this element.
The `lang` field is often present as well, and contains a language
code for the text content, e.g. `"EN-US"` or `"FR-CA"`.
The `alt_text` field will be present if the author has helpfully added
alternate text to an image. In some cases, `actual_text` may also be
present.
There are also various attributes that may be in the `attributes`
field. Some of these are quite useful indeed, such as `BBox`, which
gives you the bounding box of a `Table`, `Figure`, or `Image`. You
can see a full list of these [in the PDF
spec](https://ghostscript.com/~robin/pdf_reference17.pdf#page=916).
Note that the `BBox` is in PDF coordinate space with the origin at the
bottom left of the page. To convert it to `pdfplumber`'s space you
can do, for example:
x0, y0, x1, y1 = element['attributes']['BBox']
top = page.height - y1
bottom = page.height - y0
doctop = page.initial_doctop + top
bbox = (x0, top, x1, bottom)
It is also possible to get the structure tree for the entire document.
In this case, because marked content IDs are specific to a given page,
each element will also have a `page_number` attribute, which is the
number of the page containing (partially or completely) this element,
indexed from 1 (for consistency with `pdfplumber.Page`).
You can also access the underlying `PDFStructTree` object for more
flexibility, including visual debugging. For instance, to plot the
bounding boxes of the contents of all of the `TD` elements on the
first page of a document:
page = pdf.pages[0]
stree = PDFStructTree(pdf, page)
img = page.to_image()
img.draw_rects(stree.element_bbox(td) for td in table.find_all("TD"))
The `find_all` method works rather like the same method in
[BeautifulSoup](https://beautiful-soup-4.readthedocs.io/en/latest/#searching-the-tree) -
it takes an element name, a regular expression, or a matching
function.
================================================
FILE: pdfplumber/__init__.py
================================================
# Names exported by `from pdfplumber import *`. Every entry must be bound in
# this module; listing an unbound name (as the former "set_debug" entry did)
# makes the star-import raise AttributeError.
__all__ = [
    "__version__",
    "utils",
    "pdfminer",
    "open",
    "repair",
]
import pdfminer
import pdfminer.pdftypes
from . import utils
from ._version import __version__
from .pdf import PDF
from .repair import repair
# Package-level convenience alias, so callers can write `pdfplumber.open(...)`.
# Note that this intentionally shadows the builtin `open` inside this module.
open = PDF.open
================================================
FILE: pdfplumber/_typing.py
================================================
from typing import Any, Dict, Iterable, List, Literal, Sequence, Tuple, Union
# Shared type aliases used across the package.

# Generic read-only sequence.
T_seq = Sequence
# Any real number.
T_num = Union[int, float]
# A pair of numbers.
T_point = Tuple[T_num, T_num]
# Four numbers; display.py unpacks values of this type as (x0, top, x1, bottom).
T_bbox = Tuple[T_num, T_num, T_num, T_num]
# A single object represented as a string-keyed dictionary.
T_obj = Dict[str, Any]
# A list / an arbitrary iterable of such objects.
T_obj_list = List[T_obj]
T_obj_iter = Iterable[T_obj]
# One of four direction codes — presumably text directions (left-to-right,
# right-to-left, top-to-bottom, bottom-to-top); confirm against the users of T_dir.
T_dir = Union[Literal["ltr"], Literal["rtl"], Literal["ttb"], Literal["btt"]]
================================================
FILE: pdfplumber/_version.py
================================================
# Version as a tuple of integers; the dotted string below is derived from it.
version_info = (0, 11, 9)
# Dotted version string built from `version_info`, e.g. "0.11.9".
__version__ = ".".join(map(str, version_info))
================================================
FILE: pdfplumber/cli.py
================================================
#!/usr/bin/env python
import argparse
import json
import sys
from collections import defaultdict, deque
from itertools import chain
from typing import Any, DefaultDict, Dict, List
from .pdf import PDF
# When the program is invoked without any arguments, behave as if `--help`
# had been passed so that argparse prints the usage text.
if len(sys.argv) == 1:
    sys.argv.append("--help")
def parse_page_spec(p_str: str) -> List[int]:
    """Expand a page specification into a list of page numbers.

    A plain number ("5") yields a single-element list; a dash-separated
    range ("2-4") yields every page from start to end, inclusive.
    """
    if "-" not in p_str:
        return [int(p_str)]

    first, last = (int(part) for part in p_str.split("-"))
    return list(range(first, last + 1))
def parse_args(args_raw: List[str]) -> argparse.Namespace:
    """Parse the command-line arguments of the `pdfplumber` CLI.

    The `--pages` values (each parsed by `parse_page_spec` into a list) are
    flattened into one list of page numbers before the namespace is returned.
    """
    ignored_note = (
        "All other arguments except --pages, --laparams, and --indent will be ignored"
    )

    parser = argparse.ArgumentParser("pdfplumber")
    parser.add_argument("infile", nargs="?", type=argparse.FileType("rb"))

    # --structure and --structure-text cannot be combined.
    structure_modes = parser.add_mutually_exclusive_group()
    structure_modes.add_argument(
        "--structure",
        action="store_true",
        help="Write the structure tree as JSON. " + ignored_note,
    )
    structure_modes.add_argument(
        "--structure-text",
        action="store_true",
        help="Write the structure tree as JSON including text contents. "
        + ignored_note,
    )

    parser.add_argument("--format", choices=["csv", "json", "text"], default="csv")
    parser.add_argument("--types", nargs="+")
    parser.add_argument(
        "--include-attrs",
        nargs="+",
        help="Include *only* these object attributes in output.",
    )
    parser.add_argument(
        "--exclude-attrs",
        nargs="+",
        help="Exclude these object attributes from output.",
    )
    parser.add_argument("--laparams", type=json.loads)
    parser.add_argument("--precision", type=int)
    parser.add_argument("--pages", nargs="+", type=parse_page_spec)
    parser.add_argument(
        "--indent", type=int, help="Indent level for JSON pretty-printing."
    )

    parsed = parser.parse_args(args_raw)
    if parsed.pages is not None:
        parsed.pages = list(chain.from_iterable(parsed.pages))
    return parsed
def add_text_to_mcids(pdf: PDF, data: List[Dict[str, Any]]) -> None:
    """Attach a "text" list to every structure element that has "mcids".

    The text of each page's chars is first grouped by their "mcid" value;
    the element tree in `data` is then walked breadth-first and modified
    in place.
    """
    # page number -> mcid -> concatenated text of the chars with that mcid
    text_by_page: DefaultDict[int, Any] = defaultdict(lambda: defaultdict(str))
    for page in pdf.pages:
        page_text = text_by_page[page.page_number]
        for char in page.chars:
            mcid = char.get("mcid")
            if mcid is not None:
                page_text[mcid] += char["text"]

    queue = deque(data)
    while queue:
        element = queue.popleft()
        if "children" in element:
            queue.extend(element["children"])

        page_number = element.get("page_number")
        if page_number is None:
            continue

        page_text = text_by_page[page_number]
        if "mcids" in element:
            element["text"] = [page_text[mcid] for mcid in element["mcids"]]
def main(args_raw: List[str] = sys.argv[1:]) -> None:
    """Run the command-line interface and write the result to stdout.

    Depending on the parsed arguments this prints the structure tree
    (optionally with text), or the document's objects as CSV, plain text
    or JSON.
    """
    args = parse_args(args_raw)

    with PDF.open(args.infile, pages=args.pages, laparams=args.laparams) as pdf:
        if args.structure:
            print(json.dumps(pdf.structure_tree, indent=args.indent))
            return

        if args.structure_text:
            tree = pdf.structure_tree
            add_text_to_mcids(pdf, tree)
            print(json.dumps(tree, indent=args.indent, ensure_ascii=False))
            return

        if args.format == "text":
            for page in pdf.pages:
                print(page.extract_text(layout=True))
            return

        # Options shared by the CSV and JSON exporters.
        export_options = {
            "precision": args.precision,
            "include_attrs": args.include_attrs,
            "exclude_attrs": args.exclude_attrs,
        }
        if args.format == "csv":
            pdf.to_csv(sys.stdout, args.types, **export_options)
        else:
            pdf.to_json(
                sys.stdout, args.types, indent=args.indent, **export_options
            )


if __name__ == "__main__":
    main()
================================================
FILE: pdfplumber/container.py
================================================
import csv
import json
from io import StringIO
from itertools import chain
from typing import Any, Dict, List, Optional, Set, TextIO
from . import utils
from ._typing import T_obj, T_obj_list
from .convert import CSV_COLS_REQUIRED, CSV_COLS_TO_PREPEND, Serializer
class Container(object):
    """Shared behaviour for things that hold PDF objects.

    Subclasses provide `pages`, `objects` and `to_dict`; this class derives
    per-type accessors, edge lists and JSON/CSV export from them.
    """

    # Attribute names removed by `flush_cache()` when no explicit list is given.
    cached_properties = ["_rect_edges", "_curve_edges", "_edges", "_objects"]

    @property
    def pages(self) -> Optional[List[Any]]:  # pragma: nocover
        """The contained pages, or None; must be provided by subclasses."""
        raise NotImplementedError

    @property
    def objects(self) -> Dict[str, T_obj_list]:  # pragma: nocover
        """Mapping of object type to objects; must be provided by subclasses."""
        raise NotImplementedError

    def to_dict(
        self, object_types: Optional[List[str]] = None
    ) -> Dict[str, Any]:  # pragma: nocover
        """Dictionary representation; must be provided by subclasses."""
        raise NotImplementedError

    def flush_cache(self, properties: Optional[List[str]] = None) -> None:
        """Delete the given cached attributes (default: `cached_properties`)."""
        names = properties if properties is not None else self.cached_properties
        for name in names:
            if hasattr(self, name):
                delattr(self, name)

    # --- Per-type accessors: each returns [] when the type is absent. ---

    @property
    def rects(self) -> T_obj_list:
        return self.objects.get("rect", [])

    @property
    def lines(self) -> T_obj_list:
        return self.objects.get("line", [])

    @property
    def curves(self) -> T_obj_list:
        return self.objects.get("curve", [])

    @property
    def images(self) -> T_obj_list:
        return self.objects.get("image", [])

    @property
    def chars(self) -> T_obj_list:
        return self.objects.get("char", [])

    @property
    def textboxverticals(self) -> T_obj_list:
        return self.objects.get("textboxvertical", [])

    @property
    def textboxhorizontals(self) -> T_obj_list:
        return self.objects.get("textboxhorizontal", [])

    @property
    def textlineverticals(self) -> T_obj_list:
        return self.objects.get("textlinevertical", [])

    @property
    def textlinehorizontals(self) -> T_obj_list:
        return self.objects.get("textlinehorizontal", [])

    # --- Edge lists, computed once and stored on the instance. ---

    @property
    def rect_edges(self) -> T_obj_list:
        """Edges of all rects (via `utils.rect_to_edges`), cached."""
        if not hasattr(self, "_rect_edges"):
            per_rect = (utils.rect_to_edges(rect) for rect in self.rects)
            self._rect_edges: T_obj_list = list(chain.from_iterable(per_rect))
        return self._rect_edges

    @property
    def curve_edges(self) -> T_obj_list:
        """Edges of all curves (via `utils.curve_to_edges`), cached."""
        if not hasattr(self, "_curve_edges"):
            per_curve = (utils.curve_to_edges(curve) for curve in self.curves)
            self._curve_edges: T_obj_list = list(chain.from_iterable(per_curve))
        return self._curve_edges

    @property
    def edges(self) -> T_obj_list:
        """Line edges followed by rect edges and curve edges, cached."""
        if not hasattr(self, "_edges"):
            from_lines = [utils.line_to_edge(line) for line in self.lines]
            self._edges: T_obj_list = from_lines + self.rect_edges + self.curve_edges
        return self._edges

    @property
    def horizontal_edges(self) -> T_obj_list:
        """The edges whose "orientation" is "h"."""
        return [edge for edge in self.edges if edge["orientation"] == "h"]

    @property
    def vertical_edges(self) -> T_obj_list:
        """The edges whose "orientation" is "v"."""
        return [edge for edge in self.edges if edge["orientation"] == "v"]

    def to_json(
        self,
        stream: Optional[TextIO] = None,
        object_types: Optional[List[str]] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
        precision: Optional[int] = None,
        indent: Optional[int] = None,
    ) -> Optional[str]:
        """Serialize `to_dict(object_types)` as JSON.

        Returns the JSON string when `stream` is None; otherwise writes to
        `stream` and returns None.
        """
        serializer = Serializer(
            precision=precision,
            include_attrs=include_attrs,
            exclude_attrs=exclude_attrs,
        )
        payload = serializer.serialize(self.to_dict(object_types))

        if stream is None:
            return json.dumps(payload, indent=indent)

        json.dump(payload, stream, indent=indent)
        return None

    def to_csv(
        self,
        stream: Optional[TextIO] = None,
        object_types: Optional[List[str]] = None,
        precision: Optional[int] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
    ) -> Optional[str]:
        """Write the objects of every page as CSV rows.

        Returns the CSV text when `stream` is None; otherwise writes to
        `stream` and returns None.
        """
        to_string = stream is None
        if stream is None:
            stream = StringIO()

        if object_types is None:
            object_types = list(self.objects.keys()) + ["annot"]

        rows: List[Any] = []
        fields: Set[str] = set()

        # A container without pages of its own is treated as a single page.
        pages = [self] if self.pages is None else self.pages

        serializer = Serializer(
            precision=precision,
            include_attrs=include_attrs,
            exclude_attrs=exclude_attrs,
        )

        for page in pages:
            for object_type in object_types:
                objs = getattr(page, object_type + "s")
                if not len(objs):
                    continue
                rows += serializer.serialize(objs)
                # Column names come from the first object of each type;
                # dict-valued attributes do not become columns.
                fields.update(
                    key for key, value in objs[0].items() if type(value) is not dict
                )

        already_placed = set(CSV_COLS_REQUIRED + CSV_COLS_TO_PREPEND)
        optional_cols = CSV_COLS_TO_PREPEND + sorted(fields - already_placed)
        cols = CSV_COLS_REQUIRED + [
            col for col in optional_cols if serializer.attr_filter(col)
        ]

        writer = csv.DictWriter(
            stream,
            fieldnames=cols,
            extrasaction="ignore",
            quoting=csv.QUOTE_MINIMAL,
            escapechar="\\",
        )
        writer.writeheader()
        writer.writerows(rows)

        if not to_string:
            return None

        stream.seek(0)
        return stream.read()
================================================
FILE: pdfplumber/convert.py
================================================
import base64
from typing import Any, Callable, Dict, List, Optional, Tuple
from pdfminer.psparser import PSLiteral
from .utils import decode_text
# Encodings that `Serializer.do_bytes` iterates over, in order, when
# converting `bytes` values to `str`.
ENCODINGS_TO_TRY = [
"utf-8",
"latin-1",
"utf-16",
"utf-16le",
]
# Columns that are always part of the output: `get_attr_filter` always keeps
# them when `include_attrs` is used and refuses to exclude them.
CSV_COLS_REQUIRED = [
"object_type",
]
# Columns that `Container.to_csv` places right after the required ones, in
# this order, ahead of the remaining (alphabetically sorted) columns.
CSV_COLS_TO_PREPEND = [
"page_number",
"x0",
"x1",
"y0",
"y1",
"doctop",
"top",
"bottom",
"width",
"height",
]
def get_attr_filter(
    include_attrs: Optional[List[str]] = None, exclude_attrs: Optional[List[str]] = None
) -> Callable[[str], bool]:
    """Build a predicate deciding whether an attribute name is kept.

    With `include_attrs`, only those names (plus the required columns) pass.
    With `exclude_attrs`, everything except those names passes. With
    neither, every name passes.

    Raises:
        ValueError: if both lists are given, or if `exclude_attrs` names a
            required column.
    """
    if include_attrs is not None and exclude_attrs is not None:
        raise ValueError(
            "Cannot specify `include_attrs` and `exclude_attrs` at the same time."
        )

    if include_attrs is not None:
        allowed = set(CSV_COLS_REQUIRED + include_attrs)
        return lambda attr: attr in allowed

    if exclude_attrs is not None:
        nonexcludable = set(exclude_attrs).intersection(set(CSV_COLS_REQUIRED))
        if len(nonexcludable):
            raise ValueError(
                f"Cannot exclude these required properties: {list(nonexcludable)}"
            )
        blocked = set(exclude_attrs)
        return lambda attr: attr not in blocked

    return lambda attr: True
def to_b64(data: bytes) -> str:
    """Return the Base64 encoding of `data` as an ASCII string."""
    encoded = base64.b64encode(data)
    return encoded.decode("ascii")
class Serializer:
    """Convert object data into values suitable for JSON/CSV output.

    `serialize()` dispatches on the exact type name of a value to a matching
    `do_<typename>` method; values without a converter are stringified.

    Args:
        precision: number of decimals floats are rounded to (None = no rounding).
        include_attrs / exclude_attrs: passed to `get_attr_filter` to decide
            which keys of object dictionaries are kept.
    """

    def __init__(
        self,
        precision: Optional[int] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
    ):
        self.precision = precision
        self.attr_filter = get_attr_filter(
            include_attrs=include_attrs, exclude_attrs=exclude_attrs
        )

    def serialize(self, obj: Any) -> Any:
        """Return the serializable form of `obj`."""
        if obj is None:
            return None

        t = type(obj)

        # Basic types don't need to be converted
        if t in (int, str):
            return obj

        # Use one of the custom converters, if possible
        fn = getattr(self, f"do_{t.__name__}", None)
        if fn is not None:
            return fn(obj)

        # Otherwise, just use the string-representation
        return str(obj)

    def do_float(self, x: float) -> float:
        """Round to `precision` decimals, if a precision was configured."""
        return x if self.precision is None else round(x, self.precision)

    def do_bool(self, x: bool) -> int:
        """Booleans are emitted as 0 / 1."""
        return int(x)

    def do_list(self, obj: List[Any]) -> List[Any]:
        return [self.serialize(x) for x in obj]

    def do_tuple(self, obj: Tuple[Any, ...]) -> Tuple[Any, ...]:
        return tuple(self.serialize(x) for x in obj)

    def do_dict(self, obj: Dict[str, Any]) -> Dict[str, Any]:
        """Serialize the values; only dicts with an "object_type" key are
        subject to the attribute filter."""
        if "object_type" in obj.keys():
            return {k: self.serialize(v) for k, v in obj.items() if self.attr_filter(k)}
        else:
            return {k: self.serialize(v) for k, v in obj.items()}

    def do_PDFStream(self, obj: Any) -> Dict[str, Optional[str]]:
        """Represent a stream by its Base64-encoded raw data (None if empty)."""
        return {"rawdata": to_b64(obj.rawdata) if obj.rawdata else None}

    def do_PSLiteral(self, obj: PSLiteral) -> str:
        return decode_text(obj.name)

    def do_bytes(self, obj: bytes) -> Optional[str]:
        """Decode `obj` with the first encoding in ENCODINGS_TO_TRY that works.

        A failed attempt moves on to the next encoding. (Previously the first
        failure returned None, so the fallback encodings were never tried.)
        """
        for e in ENCODINGS_TO_TRY:
            try:
                return obj.decode(e)
            except UnicodeDecodeError:  # pragma: no cover
                continue
        # If none of the decodings work, raise whatever error
        # decoding with utf-8 causes
        obj.decode(ENCODINGS_TO_TRY[0])  # pragma: no cover
        return None  # pragma: no cover
================================================
FILE: pdfplumber/ctm.py
================================================
import math
from typing import NamedTuple
# For more details, see the PDF Reference, 6th Ed., Section 4.2.2 ("Common
# Transformations")
class CTM(NamedTuple):
    """The six numbers (a, b, c, d, e, f) of a transformation matrix, with
    derived scale, skew (in degrees) and translation values."""

    a: float
    b: float
    c: float
    d: float
    e: float
    f: float

    @property
    def scale_x(self) -> float:
        """Length of the (a, b) vector."""
        return math.sqrt(self.a**2 + self.b**2)

    @property
    def scale_y(self) -> float:
        """Length of the (c, d) vector."""
        return math.sqrt(self.c**2 + self.d**2)

    @property
    def skew_x(self) -> float:
        """Angle of the (c, d) vector in degrees, minus 90."""
        angle = math.atan2(self.d, self.c)
        return (angle * 180 / math.pi) - 90

    @property
    def skew_y(self) -> float:
        """Angle of the (a, b) vector in degrees."""
        angle = math.atan2(self.b, self.a)
        return angle * 180 / math.pi

    @property
    def translation_x(self) -> float:
        return self.e

    @property
    def translation_y(self) -> float:
        return self.f
================================================
FILE: pdfplumber/display.py
================================================
import pathlib
from io import BufferedReader, BytesIO
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
import PIL.Image
import PIL.ImageDraw
import pypdfium2 # type: ignore
from . import utils
from ._typing import T_bbox, T_num, T_obj, T_obj_list, T_point, T_seq
from .table import T_table_settings, Table, TableFinder, TableSettings
from .utils.exceptions import MalformedPDFException
if TYPE_CHECKING: # pragma: nocover
import pandas as pd
from .page import Page
class COLORS:
    """Named color tuples used for drawing."""

    # Opaque three-component colors.
    RED = (255, 0, 0)
    GREEN = (0, 255, 0)
    BLUE = (0, 0, 255)
    # Four-component color whose last (alpha) component is zero.
    TRANSPARENT = (0, 0, 0, 0)
# Default drawing style: blue fill and red stroke, each extended with a
# fourth (alpha) component.
DEFAULT_FILL = COLORS.BLUE + (50,)
DEFAULT_STROKE = COLORS.RED + (200,)
DEFAULT_STROKE_WIDTH = 1
# Default rendering resolution; `get_page_image` renders at scale resolution / 72.
DEFAULT_RESOLUTION = 72
# A color: a 3-tuple or 4-tuple of ints, or a string.
T_color = Union[Tuple[int, int, int], Tuple[int, int, int, int], str]
# Anything `draw_line` accepts: a tuple/list of points, or an object dictionary.
T_contains_points = Union[Tuple[T_point, ...], List[T_point], T_obj]
def get_page_image(
    stream: Union[BufferedReader, BytesIO],
    path: Optional[pathlib.Path],
    page_ix: int,
    resolution: Union[int, float],
    password: Optional[str],
    antialias: bool = False,
) -> PIL.Image.Image:
    """Render one page of a PDF to an RGB PIL image using pypdfium2.

    Args:
        stream: file object holding the PDF; used only when `path` is falsy.
        path: path of the PDF on disk, if there is one.
        page_ix: index of the page, as passed to `PdfDocument.get_page`.
        resolution: rendering resolution; the render scale is `resolution / 72`.
        password: password passed on to pypdfium2.
        antialias: when False, text, path and image smoothing are disabled.

    Raises:
        MalformedPDFException: if pypdfium2 cannot open the document.
    """
    src: Union[pathlib.Path, BufferedReader, BytesIO]

    # If we are working with a file object saved to disk
    if path:
        src = path

    # If we instead are working with a BytesIO stream
    else:
        stream.seek(0)
        src = stream

    try:
        pdfium_doc = pypdfium2.PdfDocument(src, password=password)
    except pypdfium2.PdfiumError as e:
        raise MalformedPDFException(e) from e

    # Close the document even when fetching or rendering the page fails,
    # so that a bad page does not leak the open pdfium handle.
    try:
        pdfium_page = pdfium_doc.get_page(page_ix)

        img: PIL.Image.Image = pdfium_page.render(
            # Modifiable arguments
            scale=resolution / 72,
            no_smoothtext=not antialias,
            no_smoothpath=not antialias,
            no_smoothimage=not antialias,
            # Non-modifiable arguments
            prefer_bgrx=True,
        ).to_pil()
    finally:
        pdfium_doc.close()

    return img.convert("RGB")
class PageImage:
def __init__(
    self,
    page: "Page",
    original: Optional[PIL.Image.Image] = None,
    resolution: Union[int, float] = DEFAULT_RESOLUTION,
    antialias: bool = False,
    force_mediabox: bool = False,
):
    """Create a drawable image of `page`.

    When `original` is not supplied, the page is rendered via
    `get_page_image` at the given `resolution`.
    """
    self.page = page
    self.root = page if page.is_original else page.root_page
    self.resolution = resolution

    if original is not None:
        self.original = original
    else:
        self.original = get_page_image(
            stream=page.pdf.stream,
            path=page.pdf.path,
            page_ix=page.page_number - 1,
            resolution=resolution,
            antialias=antialias,
            password=page.pdf.password,
        )

    # Image pixels per page unit, based on the cropbox width.
    self.scale = self.original.size[0] / (page.cropbox[2] - page.cropbox[0])

    # This value represents the coordinates of the page,
    # in page-unit values, that will be displayed.
    if page.bbox != page.mediabox:
        self.bbox = page.bbox
    elif force_mediabox:
        self.bbox = page.mediabox
    else:
        self.bbox = page.cropbox

    # If this value is different than the *Page*'s .cropbox
    # (e.g., because the mediabox differs from the cropbox
    # or because we've used Page.crop(...)), then we'll need to
    # crop the initially-converted image.
    if page.bbox != page.cropbox:
        crop_x0, crop_top, _, _ = self._reproject_bbox(page.cropbox)
        x0, top, x1, bottom = self._reproject_bbox(self.bbox)
        self.original = self.original.crop(
            (
                x0 - crop_x0,
                top - crop_top,
                x1 - crop_x0,
                bottom - crop_top,
            )
        )

    self.reset()
def _reproject_bbox(self, bbox: T_bbox) -> Tuple[int, int, int, int]:
    """Reproject both corners of an (x0, top, x1, bottom) box with
    `_reproject` and return them as one 4-tuple."""
    x0, top, x1, bottom = bbox
    return self._reproject((x0, top)) + self._reproject((x1, bottom))
def _reproject(self, coord: T_point) -> Tuple[int, int]:
    """
    Given an (x0, top) tuple from the *root* coordinate system,
    return an (x0, top) tuple in the *image* coordinate system.
    """
    x0, top = coord
    origin_x, origin_top = self.bbox[0], self.bbox[1]
    # Shift relative to the displayed bbox, scale to pixels, truncate to int.
    return (
        int((x0 - origin_x) * self.scale),
        int((top - origin_top) * self.scale),
    )
def reset(self) -> "PageImage":
    """Discard all annotations: start again from a fresh copy of
    `self.original` and a new RGBA drawing context. Returns `self`."""
    canvas = PIL.Image.new("RGB", self.original.size)
    self.annotated = canvas
    canvas.paste(self.original)
    self.draw = PIL.ImageDraw.Draw(canvas, "RGBA")
    return self
def save(
    self,
    dest: Union[str, pathlib.Path, BytesIO],
    format: str = "PNG",
    quantize: bool = True,
    colors: int = 256,
    bits: int = 8,
    **kwargs: Any,
) -> None:
    """Save the annotated image to `dest`.

    With `quantize` enabled, the image is first reduced to `colors` colors
    and converted to palette mode. Extra keyword arguments are passed on
    to PIL's `save`.
    """
    out = self.annotated
    if quantize:
        out = out.quantize(colors, method=PIL.Image.FASTOCTREE).convert("P")

    dpi = (self.resolution, self.resolution)
    out.save(dest, format=format, bits=bits, dpi=dpi, **kwargs)
def copy(self) -> "PageImage":
    """Return a new image of the same page, based on `self.original`
    (i.e. without the annotations drawn so far).

    The resolution is carried over so that `save()` on the copy writes the
    same DPI as on this image; previously the copy fell back to the
    default resolution.
    """
    # NOTE(review): the constructor crops `original` whenever the page's
    # bbox differs from its cropbox, and `self.original` may already be
    # cropped — confirm that copies of cropped pages come out as intended.
    return self.__class__(self.page, self.original, resolution=self.resolution)
def draw_line(
    self,
    points_or_obj: T_contains_points,
    stroke: T_color = DEFAULT_STROKE,
    stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
    """Draw a line through the given points. Returns `self`.

    `points_or_obj` is either a tuple/list of points, a dictionary with a
    "pts" entry, or a dictionary with "x0", "top", "x1" and "bottom".
    """
    if isinstance(points_or_obj, (tuple, list)):
        # A raw sequence of points is used as-is.
        points = points_or_obj
    elif isinstance(points_or_obj, dict) and "pts" in points_or_obj:
        # Prefer the object's own points when it has them.
        points = [(x, y) for x, y in points_or_obj["pts"]]
    else:
        # Fall back to the diagonal of the object's bounding box.
        obj = points_or_obj
        points = ((obj["x0"], obj["top"]), (obj["x1"], obj["bottom"]))

    image_points = [self._reproject(point) for point in points]
    self.draw.line(image_points, fill=stroke, width=stroke_width)
    return self
def draw_lines(
    self,
    list_of_lines: Union[T_seq[T_contains_points], "pd.DataFrame"],
    stroke: T_color = DEFAULT_STROKE,
    stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
    """Call `draw_line` for every item of `list_of_lines` (converted with
    `utils.to_list`). Returns `self`."""
    lines = utils.to_list(list_of_lines)
    for line in lines:
        self.draw_line(line, stroke=stroke, stroke_width=stroke_width)
    return self
def draw_vline(
    self,
    location: T_num,
    stroke: T_color = DEFAULT_STROKE,
    stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
    """Draw a vertical line at x = `location`, spanning the displayed
    bbox from top to bottom. Returns `self`."""
    top, bottom = self.bbox[1], self.bbox[3]
    segment = self._reproject_bbox((location, top, location, bottom))
    self.draw.line(segment, fill=stroke, width=stroke_width)
    return self
def draw_vlines(
    self,
    locations: Union[List[T_num], "pd.Series[float]"],
    stroke: T_color = DEFAULT_STROKE,
    stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
    """Call `draw_vline` for every value in `locations`. Returns `self`."""
    x_positions = list(locations)
    for x_position in x_positions:
        self.draw_vline(x_position, stroke=stroke, stroke_width=stroke_width)
    return self
def draw_hline(
    self,
    location: T_num,
    stroke: T_color = DEFAULT_STROKE,
    stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
    """Draw a horizontal line at top = `location`, spanning the displayed
    bbox from left to right. Returns `self`."""
    left, right = self.bbox[0], self.bbox[2]
    segment = self._reproject_bbox((left, location, right, location))
    self.draw.line(segment, fill=stroke, width=stroke_width)
    return self
def draw_hlines(
    self,
    locations: Union[List[T_num], "pd.Series[float]"],
    stroke: T_color = DEFAULT_STROKE,
    stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
    """Draw one horizontal line (see `draw_hline`) per entry in `locations`.
    Returns `self`."""
    y_positions = list(locations)
    for y_pos in y_positions:
        self.draw_hline(y_pos, stroke=stroke, stroke_width=stroke_width)
    return self
def draw_rect(
    self,
    bbox_or_obj: Union[T_bbox, T_obj],
    fill: T_color = DEFAULT_FILL,
    stroke: T_color = DEFAULT_STROKE,
    stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
    """Draw a filled rectangle with a stroked outline; returns `self`.

    `bbox_or_obj` is either an `(x0, top, x1, bottom)` tuple/list or an
    object indexable by "x0", "top", "x1" and "bottom".

    Each edge is moved inward by half the stroke width, but never past
    the centre of the *original* box, so a rectangle narrower than the
    stroke collapses onto its own midline instead of inverting.
    """
    if isinstance(bbox_or_obj, (tuple, list)):
        bbox = bbox_or_obj
    else:
        obj = bbox_or_obj
        bbox = (obj["x0"], obj["top"], obj["x1"], obj["bottom"])

    x0, top, x1, bottom = bbox
    half = stroke_width / 2

    # Compute both midpoints from the untouched coordinates. Deriving them
    # after x0/top have already been shifted skews the clamp for x1/bottom
    # on boxes narrower than ~1.5x the stroke width.
    mid_x = (x0 + x1) / 2
    mid_y = (top + bottom) / 2

    x0 = min(x0 + half, mid_x)
    top = min(top + half, mid_y)
    x1 = max(x1 - half, mid_x)
    bottom = max(bottom - half, mid_y)

    # The fill is drawn with a transparent third argument; the visible
    # outline is drawn separately below so that it honours `stroke_width`.
    fill_bbox = self._reproject_bbox((x0, top, x1, bottom))
    self.draw.rectangle(fill_bbox, fill, COLORS.TRANSPARENT)

    if stroke_width > 0:
        segments = [
            ((x0, top), (x1, top)),  # top
            ((x0, bottom), (x1, bottom)),  # bottom
            ((x0, top), (x0, bottom)),  # left
            ((x1, top), (x1, bottom)),  # right
        ]
        self.draw_lines(segments, stroke=stroke, stroke_width=stroke_width)
    return self
def draw_rects(
    self,
    list_of_rects: Union[List[T_bbox], T_obj_list, "pd.DataFrame"],
    fill: T_color = DEFAULT_FILL,
    stroke: T_color = DEFAULT_STROKE,
    stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
    """Draw every entry of `list_of_rects` via `draw_rect`; returns `self`.

    The input is first passed through `utils.to_list`.
    """
    for rect in utils.to_list(list_of_rects):
        self.draw_rect(rect, fill=fill, stroke=stroke, stroke_width=stroke_width)
    return self
def draw_circle(
    self,
    center_or_obj: Union[T_point, T_obj],
    radius: int = 5,
    fill: T_color = DEFAULT_FILL,
    stroke: T_color = DEFAULT_STROKE,
) -> "PageImage":
    """Draw a circle of the given `radius`; returns `self`.

    `center_or_obj` is either an `(x, y)` centre given as a tuple or list
    (lists are accepted for consistency with `draw_rect`), or an object
    indexable by "x0", "top", "x1" and "bottom", in which case the centre
    of that box is used.
    """
    if isinstance(center_or_obj, (tuple, list)):
        center = center_or_obj
    else:
        obj = center_or_obj
        center = ((obj["x0"] + obj["x1"]) / 2, (obj["top"] + obj["bottom"]) / 2)

    cx, cy = center
    bbox = (cx - radius, cy - radius, cx + radius, cy + radius)
    self.draw.ellipse(self._reproject_bbox(bbox), fill, stroke)
    return self
def draw_circles(
    self,
    list_of_circles: Union[List[T_point], T_obj_list, "pd.DataFrame"],
    radius: int = 5,
    fill: T_color = DEFAULT_FILL,
    stroke: T_color = DEFAULT_STROKE,
) -> "PageImage":
    """Draw every entry of `list_of_circles` via `draw_circle`; returns
    `self`. The input is first passed through `utils.to_list`."""
    for circle in utils.to_list(list_of_circles):
        self.draw_circle(circle, radius=radius, fill=fill, stroke=stroke)
    return self
def debug_table(
    self,
    table: Table,
    fill: T_color = DEFAULT_FILL,
    stroke: T_color = DEFAULT_STROKE,
    stroke_width: int = 1,
) -> "PageImage":
    """
    Outline the cells of a single table. Returns `self`.
    """
    cells = table.cells
    self.draw_rects(cells, fill=fill, stroke=stroke, stroke_width=stroke_width)
    return self
def debug_tablefinder(
    self,
    table_settings: Optional[
        Union[TableFinder, TableSettings, T_table_settings]
    ] = None,
) -> "PageImage":
    """Visualise a table finder's results on this image; returns `self`.

    `table_settings` may be an existing `TableFinder`, or `None` / a
    `TableSettings` / a dict, in which case the finder is obtained from
    `self.page.debug_tablefinder(...)`.

    Draws the cells of each found table, then the finder's edges, then a
    small circle on every intersection.

    Raises:
        ValueError: if `table_settings` is of any other type.
    """
    if isinstance(table_settings, TableFinder):
        finder = table_settings
    elif table_settings is None or isinstance(
        table_settings, (TableSettings, dict)
    ):
        finder = self.page.debug_tablefinder(table_settings)
    else:
        # The two literals are concatenated, so the first needs its own
        # trailing space.
        raise ValueError(
            "Argument must be instance of TableFinder "
            "or a TableFinder settings dict."
        )

    for table in finder.tables:
        self.debug_table(table)

    self.draw_lines(finder.edges, stroke_width=1)

    self.draw_circles(
        list(finder.intersections.keys()),
        fill=COLORS.TRANSPARENT,
        stroke=COLORS.BLUE + (200,),
        radius=3,
    )
    return self
def outline_words(
    self,
    stroke: T_color = DEFAULT_STROKE,
    fill: T_color = DEFAULT_FILL,
    stroke_width: int = DEFAULT_STROKE_WIDTH,
    x_tolerance: T_num = utils.DEFAULT_X_TOLERANCE,
    y_tolerance: T_num = utils.DEFAULT_Y_TOLERANCE,
) -> "PageImage":
    """Draw a rectangle around each word returned by
    `self.page.extract_words` for the given tolerances. Returns `self`."""
    tolerances = {"x_tolerance": x_tolerance, "y_tolerance": y_tolerance}
    found_words = self.page.extract_words(**tolerances)
    self.draw_rects(found_words, stroke=stroke, fill=fill, stroke_width=stroke_width)
    return self
def outline_chars(
    self,
    stroke: T_color = (255, 0, 0, 255),
    fill: T_color = (255, 0, 0, int(255 / 4)),
    stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
    """Draw a rectangle around each entry of `self.page.chars` (red outline
    and translucent red fill by default). Returns `self`."""
    page_chars = self.page.chars
    self.draw_rects(page_chars, stroke=stroke, fill=fill, stroke_width=stroke_width)
    return self
def _repr_png_(self) -> bytes:
    """Return the image as PNG bytes, by saving it into an in-memory
    buffer."""
    with BytesIO() as png_buffer:
        self.save(png_buffer, "PNG")
        return png_buffer.getvalue()
def show(self) -> None:  # pragma: no cover
    """Call `show()` on the annotated image (`self.annotated`)."""
    self.annotated.show()
================================================
FILE: pdfplumber/page.py
================================================
import numbers
import re
from functools import lru_cache
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Generator,
List,
Optional,
Pattern,
Tuple,
Union,
)
from unicodedata import normalize as normalize_unicode
from warnings import warn
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
LTChar,
LTComponent,
LTContainer,
LTCurve,
LTItem,
LTPage,
LTTextContainer,
)
from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT
from pdfminer.pdfpage import PDFPage
from pdfminer.psparser import PSLiteral
from . import utils
from ._typing import T_bbox, T_num, T_obj, T_obj_list
from .container import Container
from .structure import PDFStructTree, StructTreeMissing
from .table import T_table_settings, Table, TableFinder, TableSettings
from .utils import decode_text, resolve_all, resolve_and_decode
from .utils.exceptions import MalformedPDFException, PdfminerException
from .utils.text import TextMap
# Matches the "LT" prefix at the start of a layout class name.
lt_pat = re.compile(r"^LT")

# Attribute names that `Page.process_object` copies from a layout object.
ALL_ATTRS = {
    "adv",
    "height",
    "linewidth",
    "pts",
    "size",
    "srcsize",
    "width",
    "x0",
    "x1",
    "y0",
    "y1",
    "bits",
    "matrix",
    "upright",
    "fontname",
    "text",
    "imagemask",
    "colorspace",
    "evenodd",
    "fill",
    "non_stroking_color",
    "stroke",
    "stroking_color",
    "stream",
    "name",
    "mcid",
    "tag",
}
if TYPE_CHECKING: # pragma: nocover
from .display import PageImage
from .pdf import PDF
# via https://git.ghostscript.com/?p=mupdf.git;a=blob;f=source/pdf/pdf-font.c;h=6322cedf2c26cfb312c0c0878d7aff97b4c7470e;hb=HEAD#l774 # noqa
# Maps raw (non-ASCII) font-name bytes to readable font names.
CP936_FONTNAMES = {
    b"\xcb\xce\xcc\xe5": "SimSun,Regular",
    b"\xba\xda\xcc\xe5": "SimHei,Regular",
    b"\xbf\xac\xcc\xe5_GB2312": "SimKai,Regular",
    b"\xb7\xc2\xcb\xce_GB2312": "SimFang,Regular",
    b"\xc1\xa5\xca\xe9": "SimLi,Regular",
}


def fix_fontname_bytes(fontname: bytes) -> str:
    """Convert a byte-encoded font name to `str`.

    Everything up to and including the first "+" is treated as a prefix
    and kept; the remainder is looked up in `CP936_FONTNAMES`. Parts that
    are not translated are rendered as the inside of their bytes repr
    (i.e. `str(b"abc")` without the leading `b'` and trailing `'`).
    """
    head, plus, tail = fontname.partition(b"+")
    if plus:
        prefix, suffix = head + plus, tail
    else:
        # No "+" present: the whole name is the suffix.
        prefix, suffix = b"", fontname

    fallback = str(suffix)[2:-1]
    readable_suffix = CP936_FONTNAMES.get(suffix, fallback)
    return str(prefix)[2:-1] + readable_suffix
def tuplify_list_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of `kwargs` in which every top-level `list` value has
    been converted to a `tuple`; all other values are passed through."""
    converted: Dict[str, Any] = {}
    for name, value in kwargs.items():
        if isinstance(value, list):
            value = tuple(value)
        converted[name] = value
    return converted
class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
    """Extract layout from a specific page, adding marked-content IDs to
    objects where found."""

    # MCID and tag name of the most recently begun tag; both are reset to
    # None by `end_tag`.
    # NOTE(review): only one "current" tag is tracked, so ending an inner
    # tag also clears the values of any enclosing tag -- confirm whether
    # nested marked content needs handling.
    cur_mcid: Optional[int] = None
    cur_tag: Optional[str] = None

    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
        """Handle beginning of tag, setting current MCID if any."""
        self.cur_tag = decode_text(tag.name)
        if isinstance(props, dict) and "MCID" in props:
            self.cur_mcid = props["MCID"]
        else:
            self.cur_mcid = None

    def end_tag(self) -> None:
        """Handle end of tag, clearing current tag and MCID."""
        self.cur_tag = None
        self.cur_mcid = None

    def tag_cur_item(self) -> None:
        """Add current MCID to what we hope to be the most recent object created
        by pdfminer.six."""
        # This is somewhat hacky and would not be necessary if
        # pdfminer.six supported MCIDs.  In reading the code it's
        # clear that the `render_*` methods will only ever
        # create one object, but that is far from being guaranteed.
        # Even if pdfminer.six's API would just return the objects it
        # creates, we wouldn't have to do this.
        if self.cur_item._objs:
            cur_obj = self.cur_item._objs[-1]
            cur_obj.mcid = self.cur_mcid  # type: ignore
            cur_obj.tag = self.cur_tag  # type: ignore

    def render_char(self, *args, **kwargs) -> float:  # type: ignore
        """Hook for rendering characters, adding the `mcid` attribute."""
        adv = super().render_char(*args, **kwargs)
        self.tag_cur_item()
        return adv

    def render_image(self, *args, **kwargs) -> None:  # type: ignore
        """Hook for rendering images, adding the `mcid` attribute."""
        super().render_image(*args, **kwargs)
        self.tag_cur_item()

    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
        """Hook for rendering lines and curves, adding the `mcid` attribute."""
        super().paint_path(*args, **kwargs)
        self.tag_cur_item()
def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
    """Return `box_raw` with its corners ordered as (min, min, max, max).

    For a `rotation` of 90 or 270 the x and y axes are swapped in the
    result. Raises `MalformedPDFException` if any coordinate is not a
    number.
    """
    # Per PDF Reference 3.8.4: "Note: Although rectangles are
    # conventionally specified by their lower-left and upperright
    # corners, it is acceptable to specify any two diagonally opposite
    # corners."
    if any(not isinstance(x, numbers.Number) for x in box_raw):  # pragma: nocover
        raise MalformedPDFException(
            f"Bounding box contains non-number coordinate(s): {box_raw}"
        )

    x_low, x_high = sorted((box_raw[0], box_raw[2]))
    y_low, y_high = sorted((box_raw[1], box_raw[3]))

    if rotation in [90, 270]:
        return (y_low, x_low, y_high, x_high)
    return (x_low, y_low, x_high, y_high)
# PDF coordinate spaces place the origin at the bottom-left of the
# page; pdfplumber flips this vertically, so that the origin is at the
# top-left.
def _invert_box(box_raw: T_bbox, mb_height: T_num) -> T_bbox:
    """Flip a box vertically within a space of height `mb_height`."""
    left, lower, right, upper = box_raw
    flipped_top = mb_height - upper
    flipped_bottom = mb_height - lower
    return (left, flipped_top, right, flipped_bottom)
class Page(Container):
    """A single page of a PDF, built from a pdfminer `PDFPage`."""

    # Cached attribute names: Container's defaults plus the parsed layout.
    cached_properties: List[str] = Container.cached_properties + ["_layout"]
    # True here; `DerivedPage` overrides this with False.
    is_original: bool = True
    # NOTE(review): presumably overrides a `pages` member inherited from
    # Container, which is not visible in this file -- confirm.
    pages = None
def __init__(
    self,
    pdf: "PDF",
    page_obj: PDFPage,
    page_number: int,
    initial_doctop: T_num = 0,
):
    """Record the page's identity and derive its rotation and boxes from
    the attributes of `page_obj`."""
    self.pdf = pdf
    self.root_page = self
    self.page_obj = page_obj
    self.page_number = page_number
    self.initial_doctop = initial_doctop

    def get_attr(key: str, default: Any = None) -> Any:
        # Resolve the raw page attribute, falling back to `default`
        # when it is absent (or resolves to None).
        resolved = resolve_all(page_obj.attrs.get(key))
        if resolved is None:
            return default
        return resolved

    # Per PDF Reference Table 3.27: "The number of degrees by which the
    # page should be rotated clockwise when displayed or printed. The value
    # must be a multiple of 90. Default value: 0"
    self.rotation = get_attr("Rotate", 0) % 360

    media_raw = _normalize_box(get_attr("MediaBox"), self.rotation)
    media_height = media_raw[3] - media_raw[1]
    self.mediabox = _invert_box(media_raw, media_height)

    # Each optional box that the page declares becomes a lower-cased
    # attribute (cropbox, trimbox, bleedbox, artbox).
    for box_name in ["CropBox", "TrimBox", "BleedBox", "ArtBox"]:
        if box_name not in page_obj.attrs:
            continue
        normalized = _normalize_box(get_attr(box_name), self.rotation)
        setattr(self, box_name.lower(), _invert_box(normalized, media_height))

    if "CropBox" not in page_obj.attrs:
        self.cropbox = self.mediabox

    # Page.bbox defaults to self.mediabox, but can be altered by Page.crop(...)
    self.bbox = self.mediabox

    # Per-instance cache; see https://rednafi.com/python/lru_cache_on_methods/
    self.get_textmap = lru_cache()(self._get_textmap)
def close(self) -> None:
    """Release cached data: flush the property cache, then clear the
    cache held by `get_textmap`."""
    self.flush_cache()
    textmap_cache = self.get_textmap
    textmap_cache.cache_clear()
@property
def width(self) -> T_num:
    """Horizontal extent of `self.bbox`."""
    left, right = self.bbox[0], self.bbox[2]
    return right - left
@property
def height(self) -> T_num:
    """Vertical extent of `self.bbox`."""
    top, bottom = self.bbox[1], self.bbox[3]
    return bottom - top
@property
def structure_tree(self) -> List[Dict[str, Any]]:
    """Return the page's structure tree as a list of dicts; the list is
    empty when `StructTreeMissing` is raised."""
    elements: List[Dict[str, Any]] = []
    try:
        for elem in PDFStructTree(self.pdf, self):
            elements.append(elem.to_dict())
    except StructTreeMissing:
        return []
    return elements
@property
def layout(self) -> LTPage:
    """The pdfminer layout for this page, computed on first access and
    then cached on `self._layout`.

    Any exception raised while the page is being interpreted is re-raised
    wrapped in `PdfminerException`.
    """
    if not hasattr(self, "_layout"):
        rsrcmgr = self.pdf.rsrcmgr
        device = PDFPageAggregatorWithMarkedContent(
            rsrcmgr,
            pageno=self.page_number,
            laparams=self.pdf.laparams,
        )
        interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device)
        try:
            interpreter.process_page(self.page_obj)
        except Exception as e:
            raise PdfminerException(e)
        self._layout: LTPage = device.get_result()
    return self._layout
@property
def annots(self) -> T_obj_list:
    """Return the page's annotations as a list of dicts.

    Each dict carries the annotation's position (x0/x1/top/bottom, plus
    y0/y1, doctop, width, height), the decoded "uri", "title" and
    "contents" values (None when absent), and the raw annotation under
    "data". On a `CroppedPage` the list is passed through its crop
    function.

    Raises:
        UnicodeDecodeError: if a text value decodes as neither UTF-8 nor
            UTF-16 and `self.pdf.raise_unicode_errors` is set. Otherwise
            a warning is issued and that value is set to None.
    """

    def rotate_point(pt: Tuple[float, float], r: int) -> Tuple[float, float]:
        # Apply one quarter-turn per 90 degrees of rotation.
        # NOTE(review): which of width/height is used on each turn depends
        # on `i == turns % 2`; kept exactly as found.
        turns = r // 90
        for i in range(turns):
            x, y = pt
            comp = self.width if i == turns % 2 else self.height
            pt = (y, (comp - x))
        return pt

    def parse(annot: T_obj) -> T_obj:
        _a, _b, _c, _d = annot["Rect"]
        pt0 = rotate_point((_a, _b), self.rotation)
        pt1 = rotate_point((_c, _d), self.rotation)
        rh = self.root_page.height
        x0, top, x1, bottom = _invert_box(_normalize_box((*pt0, *pt1)), rh)

        a = annot.get("A", {})
        extras = {
            "uri": a.get("URI"),
            "title": annot.get("T"),
            "contents": annot.get("Contents"),
        }
        for k, v in extras.items():
            if v is None:
                continue
            try:
                extras[k] = v.decode("utf-8")
            except UnicodeDecodeError:
                try:
                    extras[k] = v.decode("utf-16")
                except UnicodeDecodeError:
                    if self.pdf.raise_unicode_errors:
                        raise
                    warn(
                        f"Could not decode {k} of annotation."
                        f" {k} will be missing."
                    )
                    # Honour the warning: drop the undecodable bytes
                    # rather than leaking them to callers.
                    extras[k] = None

        parsed = {
            "page_number": self.page_number,
            "object_type": "annot",
            "x0": x0,
            "y0": rh - bottom,
            "x1": x1,
            "y1": rh - top,
            "doctop": self.initial_doctop + top,
            "top": top,
            "bottom": bottom,
            "width": x1 - x0,
            "height": bottom - top,
        }
        parsed.update(extras)

        # Replace the indirect reference to the page dictionary
        # with a pointer to our actual page
        if "P" in annot:
            annot["P"] = self
        parsed["data"] = annot
        return parsed

    raw = resolve_all(self.page_obj.annots) or []
    parsed = list(map(parse, raw))
    if isinstance(self, CroppedPage):
        return self._crop_fn(parsed)
    else:
        return parsed
@property
def hyperlinks(self) -> T_obj_list:
    """The subset of `self.annots` whose "uri" value is not None."""
    links = []
    for annot in self.annots:
        if annot["uri"] is not None:
            links.append(annot)
    return links
@property
def objects(self) -> Dict[str, T_obj_list]:
    """Objects on this page grouped by object type; computed by
    `parse_objects` on first access and cached on `self._objects`."""
    if not hasattr(self, "_objects"):
        self._objects: Dict[str, T_obj_list] = self.parse_objects()
    return self._objects
def point2coord(self, pt: Tuple[T_num, T_num]) -> Tuple[T_num, T_num]:
    """Shift a point by the mediabox origin and flip its y-coordinate
    against `self.height`."""
    # Related to the mediabox adjustment discussed in #1181.
    origin_x, origin_top = self.mediabox[0], self.mediabox[1]
    shifted_x = origin_x + pt[0]
    flipped_y = origin_top + self.height - pt[1]
    return (shifted_x, flipped_y)
def process_object(self, obj: LTItem) -> T_obj:
    """Convert one pdfminer layout object into a pdfplumber object dict.

    The dict holds the object's whitelisted attributes (see `ALL_ATTRS`),
    its "object_type" and "page_number", type-specific extras (text and
    colors for characters; points, path and dash for curves), and
    top-left-origin coordinates ("top", "bottom", "doctop").
    """
    # The object type is the class name minus its "LT" prefix, lower-cased.
    kind = re.sub(lt_pat, "", obj.__class__.__name__).lower()

    def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]:
        # Keep (and resolve) only whitelisted attributes; None is filtered out.
        k, v = item
        if k in ALL_ATTRS:
            res = resolve_all(v)
            return (k, res)
        else:
            return None

    attr = dict(filter(None, map(process_attr, obj.__dict__.items())))

    attr["object_type"] = kind
    attr["page_number"] = self.page_number

    for cs in ["ncs", "scs"]:
        # Note: As of pdfminer.six v20221105, that library only
        # exposes ncs for LTChars, and neither attribute for
        # other objects. Keeping this code here, though,
        # for ease of addition if color spaces become
        # more available via pdfminer.six
        if hasattr(obj, cs):
            attr[cs] = resolve_and_decode(getattr(obj, cs).name)

    if isinstance(obj, (LTChar, LTTextContainer)):
        text = obj.get_text()
        # Apply Unicode normalization only when the PDF was opened with one.
        attr["text"] = (
            normalize_unicode(self.pdf.unicode_norm, text)
            if self.pdf.unicode_norm is not None
            else text
        )

    if isinstance(obj, LTChar):
        # pdfminer.six (at least as of v20221105) does not
        # directly expose .stroking_color and .non_stroking_color
        # for LTChar objects (unlike, e.g., LTRect objects).
        gs = obj.graphicstate
        # Colors are always stored as tuples; scalars become 1-tuples.
        attr["stroking_color"] = (
            gs.scolor if isinstance(gs.scolor, tuple) else (gs.scolor,)
        )
        attr["non_stroking_color"] = (
            gs.ncolor if isinstance(gs.ncolor, tuple) else (gs.ncolor,)
        )

        # Handle (rare) byte-encoded fontnames
        if isinstance(attr["fontname"], bytes):  # pragma: nocover
            attr["fontname"] = fix_fontname_bytes(attr["fontname"])

    elif isinstance(obj, (LTCurve,)):
        attr["pts"] = list(map(self.point2coord, attr["pts"]))

        # Ignoring typing because type signature for obj.original_path
        # appears to be incorrect
        attr["path"] = [(cmd, *map(self.point2coord, pts)) for cmd, *pts in obj.original_path]  # type: ignore  # noqa: E501

        attr["dash"] = obj.dashing_style

    # As noted in #1181, `pdfminer.six` adjusts objects'
    # coordinates relative to the MediaBox:
    # https://github.com/pdfminer/pdfminer.six/blob/1a8bd2f730295b31d6165e4d95fcb5a03793c978/pdfminer/converter.py#L79-L84
    mb_x0, mb_top = self.mediabox[:2]

    if "y0" in attr:
        # Derive top-origin coordinates from the bottom-origin y0/y1.
        attr["top"] = (self.height - attr["y1"]) + mb_top
        attr["bottom"] = (self.height - attr["y0"]) + mb_top
        attr["doctop"] = self.initial_doctop + attr["top"]

    if "x0" in attr and mb_x0 != 0:
        attr["x0"] = attr["x0"] + mb_x0
        attr["x1"] = attr["x1"] + mb_x0

    return attr
def iter_layout_objects(
    self, layout_objects: List[LTComponent]
) -> Generator[T_obj, None, None]:
    """Yield a processed object dict for each layout object, descending
    recursively into containers.

    A container itself is only yielded when the PDF was opened with
    LAParams; its children are yielded either way.
    """
    for item in layout_objects:
        if not isinstance(item, LTContainer):
            yield self.process_object(item)
            continue

        # Higher-level object (e.g. LTFigure)
        if self.pdf.laparams is not None:
            yield self.process_object(item)
        yield from self.iter_layout_objects(item._objs)
def parse_objects(self) -> Dict[str, T_obj_list]:
    """Group the page's layout objects by "object_type", skipping objects
    whose type is "anno"."""
    grouped: Dict[str, T_obj_list] = {}
    for obj in self.iter_layout_objects(self.layout._objs):
        kind = obj["object_type"]
        if kind == "anno":
            continue
        if grouped.get(kind) is None:
            grouped[kind] = []
        grouped[kind].append(obj)
    return grouped
def debug_tablefinder(
    self, table_settings: Optional[T_table_settings] = None
) -> TableFinder:
    """Return a `TableFinder` for this page, built from the resolved
    `table_settings`."""
    resolved = TableSettings.resolve(table_settings)
    finder = TableFinder(self, resolved)
    return finder
def find_tables(
    self, table_settings: Optional[T_table_settings] = None
) -> List[Table]:
    """Return the tables found on this page by a `TableFinder` using the
    resolved `table_settings`."""
    resolved = TableSettings.resolve(table_settings)
    finder = TableFinder(self, resolved)
    return finder.tables
def find_table(
    self, table_settings: Optional[T_table_settings] = None
) -> Optional[Table]:
    """Return the largest table on the page, or None if there are none.

    "Largest" means most cells; ties go to the smaller `bbox[1]`, then
    the smaller `bbox[0]`, then the table found first.
    """
    tset = TableSettings.resolve(table_settings)
    candidates = self.find_tables(tset)

    if not candidates:
        return None

    def rank(table: Table) -> Tuple[int, T_num, T_num]:
        # Negated cell count so that the biggest table ranks lowest.
        return (-len(table.cells), table.bbox[1], table.bbox[0])

    return min(candidates, key=rank)
def extract_tables(
    self, table_settings: Optional[T_table_settings] = None
) -> List[List[List[Optional[str]]]]:
    """Return the extracted contents of every table found on the page,
    using the resolved settings' `text_settings` for extraction."""
    tset = TableSettings.resolve(table_settings)
    extracted = []
    for table in self.find_tables(tset):
        text_kwargs = tset.text_settings or {}
        extracted.append(table.extract(**text_kwargs))
    return extracted
def extract_table(
    self, table_settings: Optional[T_table_settings] = None
) -> Optional[List[List[Optional[str]]]]:
    """Return the extracted contents of the table chosen by `find_table`,
    or None when the page has no table."""
    tset = TableSettings.resolve(table_settings)
    chosen = self.find_table(tset)
    if chosen is None:
        return None
    text_kwargs = tset.text_settings or {}
    return chosen.extract(**text_kwargs)
def _get_textmap(self, **kwargs: Any) -> TextMap:
    """Build a `TextMap` from the page's chars.

    `layout_bbox` defaults to the page bbox; `layout_width` /
    `layout_height` default to the page dimensions unless the caller
    passed the corresponding `*_chars` option. Explicit `kwargs` always
    win over these defaults.
    """
    settings: Dict[str, Any] = {"layout_bbox": self.bbox}
    if "layout_width_chars" not in kwargs:
        settings["layout_width"] = self.width
    if "layout_height_chars" not in kwargs:
        settings["layout_height"] = self.height
    settings.update(kwargs)
    return utils.chars_to_textmap(self.chars, **settings)
def search(
    self,
    pattern: Union[str, Pattern[str]],
    regex: bool = True,
    case: bool = True,
    main_group: int = 0,
    return_chars: bool = True,
    return_groups: bool = True,
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    """Search the page's text map for `pattern`.

    Extra `kwargs` configure the text map; the named options are
    forwarded to `TextMap.search`.
    """
    # List values become tuples before reaching the cached `get_textmap`.
    textmap_kwargs = tuplify_list_kwargs(kwargs)
    textmap = self.get_textmap(**textmap_kwargs)
    search_options = {
        "regex": regex,
        "case": case,
        "main_group": main_group,
        "return_chars": return_chars,
        "return_groups": return_groups,
    }
    return textmap.search(pattern, **search_options)
def extract_text(self, **kwargs: Any) -> str:
    """Return the page's text: the `as_string` value of the text map built
    with `kwargs` (list values are converted to tuples first)."""
    textmap_kwargs = tuplify_list_kwargs(kwargs)
    textmap = self.get_textmap(**textmap_kwargs)
    return textmap.as_string
def extract_text_simple(self, **kwargs: Any) -> str:
    """Return the result of `utils.extract_text_simple` applied to the
    page's chars, forwarding `kwargs`."""
    page_chars = self.chars
    return utils.extract_text_simple(page_chars, **kwargs)
def extract_words(self, **kwargs: Any) -> T_obj_list:
    """Return the result of `utils.extract_words` applied to the page's
    chars, forwarding `kwargs`."""
    page_chars = self.chars
    return utils.extract_words(page_chars, **kwargs)
def extract_text_lines(
    self, strip: bool = True, return_chars: bool = True, **kwargs: Any
) -> T_obj_list:
    """Return the text lines of the text map built with `kwargs` (list
    values are converted to tuples first); `strip` and `return_chars`
    are forwarded to `TextMap.extract_text_lines`."""
    textmap_kwargs = tuplify_list_kwargs(kwargs)
    textmap = self.get_textmap(**textmap_kwargs)
    return textmap.extract_text_lines(strip=strip, return_chars=return_chars)
def crop(
    self, bbox: T_bbox, relative: bool = False, strict: bool = True
) -> "CroppedPage":
    """Return a `CroppedPage` of this page for `bbox`, using
    `CroppedPage`'s default crop function.

    `relative` treats `bbox` as relative to this page's own bbox;
    `strict` validates `bbox` against it.
    """
    options = {"relative": relative, "strict": strict}
    return CroppedPage(self, bbox, **options)
def within_bbox(
    self, bbox: T_bbox, relative: bool = False, strict: bool = True
) -> "CroppedPage":
    """
    Same as .crop, except only includes objects fully within the bbox
    (objects are selected with `utils.within_bbox`).
    """
    options = {"relative": relative, "strict": strict}
    return CroppedPage(self, bbox, crop_fn=utils.within_bbox, **options)
def outside_bbox(
    self, bbox: T_bbox, relative: bool = False, strict: bool = True
) -> "CroppedPage":
    """
    Same as .crop, except objects are selected with `utils.outside_bbox`
    (presumably those lying outside the bbox). The resulting page keeps
    the parent page's bbox.
    """
    options = {"relative": relative, "strict": strict}
    return CroppedPage(self, bbox, crop_fn=utils.outside_bbox, **options)
def filter(self, test_function: Callable[[T_obj], bool]) -> "FilteredPage":
    """Return a `FilteredPage` that keeps only the objects for which
    `test_function` returns a truthy value."""
    filtered = FilteredPage(self, test_function)
    return filtered
def dedupe_chars(self, **kwargs: Any) -> "FilteredPage":
    """
    Return a page without duplicate chars -- those sharing the same text
    and positioning (within `tolerance`) as other characters in the set.
    Adjust extra_args to be more/less restrictive with the properties
    checked. All `kwargs` are forwarded to `utils.dedupe_chars`.
    """
    # A pass-through filter gives us a derived page; its object cache is
    # then pre-populated with a shallow copy of ours, with "char" replaced.
    deduped_page = FilteredPage(self, lambda x: True)
    deduped_page._objects = dict(self.objects.items())
    deduped_page._objects["char"] = utils.dedupe_chars(self.chars, **kwargs)
    return deduped_page
def to_image(
    self,
    resolution: Optional[Union[int, float]] = None,
    width: Optional[Union[int, float]] = None,
    height: Optional[Union[int, float]] = None,
    antialias: bool = False,
    force_mediabox: bool = False,
) -> "PageImage":
    """
    Render the page as a `PageImage`.

    You can pass a maximum of 1 of the following:
    - resolution: The desired number pixels per inch. Falls back to
      `DEFAULT_RESOLUTION`.
    - width: The desired image width in pixels.
    - height: The desired image height in pixels.

    Raises ValueError when more than one of them is given.
    """
    # Imported here rather than at module level.
    from .display import DEFAULT_RESOLUTION, PageImage

    size_specs = [resolution, width, height]
    num_specs = sum(x is not None for x in size_specs)
    if num_specs > 1:
        raise ValueError(
            f"Only one of these arguments can be provided: resolution, width, height. You provided {num_specs}"  # noqa: E501
        )

    # Convert a pixel dimension into a resolution, at 72 units per inch.
    if width is not None:
        resolution = 72 * width / self.width
    elif height is not None:
        resolution = 72 * height / self.height

    effective_resolution = resolution or DEFAULT_RESOLUTION
    return PageImage(
        self,
        resolution=effective_resolution,
        antialias=antialias,
        force_mediabox=force_mediabox,
    )
def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
    """Return page metadata plus one "<type>s" entry per object type.

    When `object_types` is None, every type present in `self.objects` is
    included, plus "annot". Each entry is read from the attribute named
    "<type>s".
    """
    if object_types is None:
        kinds = [*self.objects.keys(), "annot"]
    else:
        kinds = object_types

    summary = {
        "page_number": self.page_number,
        "initial_doctop": self.initial_doctop,
        "rotation": self.rotation,
        "cropbox": self.cropbox,
        "mediabox": self.mediabox,
        "bbox": self.bbox,
        "width": self.width,
        "height": self.height,
    }
    for kind in kinds:
        plural = kind + "s"
        summary[plural] = getattr(self, plural)
    return summary
def __repr__(self) -> str:
    """Return a short identifying string containing the page number."""
    return f"<Page:{self.page_number}>"
class DerivedPage(Page):
    """Base class for pages derived from another page (cropped, filtered).

    Copies the parent's identifying attributes instead of re-reading them
    from the PDF.
    """

    is_original: bool = False

    def __init__(self, parent_page: Page):
        self.parent_page = parent_page
        self.root_page = parent_page.root_page

        # Attributes taken over unchanged from the parent page.
        inherited = (
            "pdf",
            "page_obj",
            "page_number",
            "initial_doctop",
            "rotation",
            "mediabox",
            "cropbox",
        )
        for attr_name in inherited:
            setattr(self, attr_name, getattr(parent_page, attr_name))

        self.flush_cache(Container.cached_properties)
        # Each derived page gets its own text-map cache.
        self.get_textmap = lru_cache()(self._get_textmap)
def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:
    """Validate a proposed crop box against its parent page's box.

    Raises ValueError when `bbox` has zero area, does not overlap
    `parent_bbox` at all, or overlaps it only partially.
    """
    proposed_area = utils.calculate_area(bbox)
    if proposed_area == 0:
        raise ValueError(f"Bounding box {bbox} has an area of zero.")

    shared = utils.get_bbox_overlap(bbox, parent_bbox)
    if shared is None:
        raise ValueError(
            f"Bounding box {bbox} is entirely outside "
            f"parent page bounding box {parent_bbox}"
        )

    # A smaller overlap than the box itself means part of it sticks out.
    if utils.calculate_area(shared) < proposed_area:
        raise ValueError(
            f"Bounding box {bbox} is not fully within "
            f"parent page bounding box {parent_bbox}"
        )
class CroppedPage(DerivedPage):
    """A page restricted to a bounding box by a crop function."""

    def __init__(
        self,
        parent_page: Page,
        crop_bbox: T_bbox,
        crop_fn: Callable[[T_obj_list, T_bbox], T_obj_list] = utils.crop_to_bbox,
        relative: bool = False,
        strict: bool = True,
    ):
        if relative:
            # Translate a parent-relative bbox into absolute coordinates.
            parent_x0, parent_top = parent_page.bbox[0], parent_page.bbox[1]
            x0, top, x1, bottom = crop_bbox
            crop_bbox = (
                x0 + parent_x0,
                top + parent_top,
                x1 + parent_x0,
                bottom + parent_top,
            )

        if strict:
            test_proposed_bbox(crop_bbox, parent_page.bbox)

        def apply_crop(objs: T_obj_list) -> T_obj_list:
            return crop_fn(objs, crop_bbox)

        super().__init__(parent_page)

        self._crop_fn = apply_crop

        # Note: testing for original function passed, not _crop_fn
        keeps_parent_bbox = crop_fn is utils.outside_bbox
        self.bbox = parent_page.bbox if keeps_parent_bbox else crop_bbox

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """The parent page's objects with the crop function applied to each
        object-type list; cached on `self._objects`."""
        if not hasattr(self, "_objects"):
            cropped: Dict[str, T_obj_list] = {}
            for kind, objs in self.parent_page.objects.items():
                cropped[kind] = self._crop_fn(objs)
            self._objects: Dict[str, T_obj_list] = cropped
        return self._objects
class FilteredPage(DerivedPage):
    """A page whose objects are those of the parent page for which
    `filter_fn` returns a truthy value."""

    def __init__(self, parent_page: Page, filter_fn: Callable[[T_obj], bool]):
        self.bbox = parent_page.bbox
        self.filter_fn = filter_fn
        super().__init__(parent_page)

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """The parent page's objects, filtered per object type; cached on
        `self._objects`."""
        if not hasattr(self, "_objects"):
            kept: Dict[str, T_obj_list] = {}
            for kind, objs in self.parent_page.objects.items():
                kept[kind] = list(filter(self.filter_fn, objs))
            self._objects: Dict[str, T_obj_list] = kept
        return self._objects
================================================
FILE: pdfplumber/pdf.py
================================================
import itertools
import logging
import pathlib
from io import BufferedReader, BytesIO
from types import TracebackType
from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, Type, Union
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from ._typing import T_num, T_obj_list
from .container import Container
from .page import Page
from .repair import T_repair_setting, _repair
from .structure import PDFStructTree, StructTreeMissing
from .utils import resolve_and_decode
from .utils.exceptions import PdfminerException
logger = logging.getLogger(__name__)
class PDF(Container):
    """A PDF document opened through pdfminer."""

    # Cached attribute names: Container's defaults plus the page list.
    cached_properties: List[str] = Container.cached_properties + ["_pages"]
def __init__(
    self,
    stream: Union[BufferedReader, BytesIO],
    stream_is_external: bool = False,
    path: Optional[pathlib.Path] = None,
    pages: Optional[Union[List[int], Tuple[int]]] = None,
    laparams: Optional[Dict[str, Any]] = None,
    password: Optional[str] = None,
    strict_metadata: bool = False,
    unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
    raise_unicode_errors: bool = True,
):
    """Parse `stream` as a PDF document and load its metadata.

    Any exception raised while the document is being parsed is re-raised
    wrapped in `PdfminerException`. Metadata values that cannot be
    decoded are logged as warnings, or re-raised when `strict_metadata`
    is set.
    """
    self.stream = stream
    self.stream_is_external = stream_is_external
    self.path = path
    self.pages_to_parse = pages
    if laparams is None:
        self.laparams = None
    else:
        self.laparams = LAParams(**laparams)
    self.password = password
    self.unicode_norm = unicode_norm
    self.raise_unicode_errors = raise_unicode_errors

    try:
        self.doc = PDFDocument(PDFParser(stream), password=password or "")
    except Exception as e:
        raise PdfminerException(e)
    self.rsrcmgr = PDFResourceManager()

    # Merge all info dictionaries; later ones override earlier keys.
    self.metadata = {}
    for info in self.doc.info:
        self.metadata.update(info)

    for key, raw_value in self.metadata.items():
        try:
            self.metadata[key] = resolve_and_decode(raw_value)
        except Exception as exc:  # pragma: nocover
            if strict_metadata:
                # Raise an exception since unable to resolve the metadata value.
                raise
            # This metadata value could not be parsed. Instead of failing the PDF
            # read, treat it as a warning only if `strict_metadata=False`.
            logger.warning(
                f'[WARNING] Metadata key "{key}" could not be parsed due to '
                f"exception: {str(exc)}"
            )
@classmethod
def open(
    cls,
    path_or_fp: Union[str, pathlib.Path, BufferedReader, BytesIO],
    pages: Optional[Union[List[int], Tuple[int]]] = None,
    laparams: Optional[Dict[str, Any]] = None,
    password: Optional[str] = None,
    strict_metadata: bool = False,
    unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
    repair: bool = False,
    gs_path: Optional[Union[str, pathlib.Path]] = None,
    repair_setting: T_repair_setting = "default",
    raise_unicode_errors: bool = True,
) -> "PDF":
    """Open a PDF from a path or a file-like object.

    With `repair=True` the input is first passed through `_repair` and
    the repaired in-memory copy is opened instead. A stream handed in by
    the caller is marked external and never closed here; a stream this
    method creates is closed again if construction fails.
    """
    stream: Union[BufferedReader, BytesIO]

    if repair:
        stream = _repair(
            path_or_fp, password=password, gs_path=gs_path, setting=repair_setting
        )
        stream_is_external = False
        # Although the original file has a path,
        # the repaired version does not
        path = None
    elif isinstance(path_or_fp, (str, pathlib.Path)):
        stream = open(path_or_fp, "rb")
        stream_is_external = False
        path = pathlib.Path(path_or_fp)
    else:
        stream = path_or_fp
        stream_is_external = True
        path = None

    try:
        return cls(
            stream,
            path=path,
            pages=pages,
            laparams=laparams,
            password=password,
            strict_metadata=strict_metadata,
            unicode_norm=unicode_norm,
            stream_is_external=stream_is_external,
            raise_unicode_errors=raise_unicode_errors,
        )
    except Exception:
        # Any failure (not just PdfminerException, e.g. invalid
        # `laparams`) must not leak a stream that we opened ourselves.
        if not stream_is_external:
            stream.close()
        raise
def close(self) -> None:
    """Close the pages built so far, flush caches and close the stream.

    The stream is only closed when it is not external (i.e. it was not
    supplied by the caller), and it is closed even if closing a page or
    flushing the cache raises.
    """
    # Grab the already-built pages *before* flushing: "_pages" is listed
    # in `cached_properties`, so going through `self.pages` after the
    # flush would presumably re-parse every page just to close it.
    built_pages = getattr(self, "_pages", ())
    try:
        for page in built_pages:
            page.close()
        self.flush_cache()
    finally:
        if not self.stream_is_external:
            self.stream.close()
def __enter__(self) -> "PDF":
    """Context-manager entry: the PDF itself is the managed object."""
    managed = self
    return managed
def __exit__(
    self,
    t: Optional[Type[BaseException]],
    value: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None:
    """Context-manager exit: close the PDF. Returns None, so an in-flight
    exception is not suppressed."""
    self.close()
    return None
@property
def pages(self) -> List[Page]:
    """The document's pages, built on first access and cached on
    `self._pages`.

    When `pages_to_parse` is set, only the listed (1-based) page numbers
    are kept. Any exception raised while pdfminer produces a page is
    re-raised wrapped in `PdfminerException`.
    """
    if hasattr(self, "_pages"):
        return self._pages

    running_doctop: T_num = 0
    wanted = self.pages_to_parse
    self._pages: List[Page] = []

    def iter_pages() -> Generator[PDFPage, None, None]:
        source = PDFPage.create_pages(self.doc)
        while True:
            try:
                raw_page = next(source)
            except StopIteration:
                return
            except Exception as e:
                raise PdfminerException(e)
            yield raw_page

    for page_number, raw_page in enumerate(iter_pages(), start=1):
        if wanted is None or page_number in wanted:
            new_page = Page(
                self,
                raw_page,
                page_number=page_number,
                initial_doctop=running_doctop,
            )
            self._pages.append(new_page)
            # The next kept page starts where this one ends.
            running_doctop += new_page.height
    return self._pages
@property
def objects(self) -> Dict[str, T_obj_list]:
    """All objects of all pages, grouped by object type, in page order.

    Computed on first access and cached on `self._objects`. The per-page
    lists are copied into fresh lists, never mutated.
    """
    if hasattr(self, "_objects"):
        return self._objects
    all_objects: Dict[str, T_obj_list] = {}
    for p in self.pages:
        for kind, objs in p.objects.items():
            # Extend in place rather than `a = a + b`, which would copy
            # the whole accumulated list once per page.
            all_objects.setdefault(kind, []).extend(objs)
    self._objects: Dict[str, T_obj_list] = all_objects
    return self._objects
@property
def annots(self) -> List[Dict[str, Any]]:
    """The annotations of all pages, concatenated in page order."""
    per_page = [p.annots for p in self.pages]
    return list(itertools.chain.from_iterable(per_page))
@property
def hyperlinks(self) -> List[Dict[str, Any]]:
    """The hyperlinks of all pages, concatenated in page order."""
    per_page = [p.hyperlinks for p in self.pages]
    return list(itertools.chain.from_iterable(per_page))
@property
def structure_tree(self) -> List[Dict[str, Any]]:
    """Return the structure tree for the document as a list of dicts; the
    list is empty when `StructTreeMissing` is raised."""
    elements: List[Dict[str, Any]] = []
    try:
        for elem in PDFStructTree(self):
            elements.append(elem.to_dict())
    except StructTreeMissing:
        return []
    return elements
def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
    """Return the document as a dict with its "metadata" and a "pages"
    list holding each page's `to_dict(object_types)` result."""
    page_dicts = []
    for page in self.pages:
        page_dicts.append(page.to_dict(object_types))
    return {"metadata": self.metadata, "pages": page_dicts}
================================================
FILE: pdfplumber/py.typed
================================================
================================================
FILE: pdfplumber/repair.py
================================================
import pathlib
import shutil
import subprocess
from io import BufferedReader, BytesIO
from typing import Literal, Optional, Union
# Accepted values for the repair `setting`; `_repair` interpolates the value
# into Ghostscript's `-dPDFSETTINGS=/<setting>` argument.
T_repair_setting = Literal["default", "prepress", "printer", "ebook", "screen"]
def _repair(
    path_or_fp: Union[str, pathlib.Path, BufferedReader, BytesIO],
    password: Optional[str] = None,
    gs_path: Optional[Union[str, pathlib.Path]] = None,
    setting: T_repair_setting = "default",
) -> BytesIO:
    """Rewrite a PDF with Ghostscript's `pdfwrite` device and return the
    result as an in-memory stream.

    The input is either a path (passed to Ghostscript by absolute path)
    or a file-like object (its contents are piped to Ghostscript's
    stdin). `gs_path` overrides the executable otherwise looked up on
    PATH.

    Raises:
        Exception: if no Ghostscript executable is found, or if
            Ghostscript exits with a non-zero status (the message is its
            stderr output).
    """
    executable = gs_path
    if not executable:
        for candidate in ("gs", "gswin32c", "gswin64c"):
            executable = shutil.which(candidate)
            if executable:
                break

    if executable is None:  # pragma: nocover
        raise Exception(
            "Cannot find Ghostscript, which is required for repairs.\n"
            "Visit https://www.ghostscript.com/ for installation instructions."
        )

    if isinstance(path_or_fp, (str, pathlib.Path)):
        payload = None
        source_arg = str(pathlib.Path(path_or_fp).absolute())
    else:
        # "-" tells Ghostscript to read the document from stdin.
        payload = path_or_fp.read() if path_or_fp else None
        source_arg = "-"

    command = [
        executable,
        "-sstdout=%stderr",
        "-o",
        "-",
        "-sDEVICE=pdfwrite",
        f"-dPDFSETTINGS=/{setting}",
    ]
    if password:
        command.append(f"-sPDFPassword={password}")
    command.append(source_arg)

    # Passing `input` makes `subprocess.run` open a stdin pipe itself.
    completed = subprocess.run(
        command,
        input=payload,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if completed.returncode:
        raise Exception(f"{completed.stderr.decode('utf-8')}")

    return BytesIO(completed.stdout)
def repair(
    path_or_fp: Union[str, pathlib.Path, BufferedReader, BytesIO],
    outfile: Optional[Union[str, pathlib.Path]] = None,
    password: Optional[str] = None,
    gs_path: Optional[Union[str, pathlib.Path]] = None,
    setting: T_repair_setting = "default",
) -> Optional[BytesIO]:
    """Repair a PDF via `_repair`.

    With `outfile`, the repaired bytes are written to that path and None
    is returned; otherwise the repaired in-memory stream is returned.
    """
    repaired = _repair(path_or_fp, password, gs_path=gs_path, setting=setting)
    if not outfile:
        return repaired

    with open(outfile, "wb") as destination:
        destination.write(repaired.read())
    return None
================================================
FILE: pdfplumber/structure.py
================================================
import itertools
import logging
import re
from collections import deque
from dataclasses import asdict, dataclass, field
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Optional,
Pattern,
Tuple,
Union,
)
from pdfminer.data_structures import NumberTree
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjRef, resolve1
from pdfminer.psparser import PSLiteral
from ._typing import T_bbox, T_obj
from .utils import decode_text, geometry
logger = logging.getLogger(__name__)
if TYPE_CHECKING: # pragma: nocover
from .page import Page
from .pdf import PDF
# A predicate over structure elements; one of the matcher forms accepted by
# `_find_all()` (the others being an element name or a compiled regex).
MatchFunc = Callable[["PDFStructElement"], bool]
def _find_all(
    elements: Iterable["PDFStructElement"],
    matcher: Union[str, Pattern[str], MatchFunc],
) -> Iterator["PDFStructElement"]:
    """
    Shared implementation of `find_all()` for trees and elements.

    Walks `elements` and their descendants depth-first (parents before
    children, siblings in order) and yields each element that `matcher`
    accepts. A string matcher is compared for equality with the element
    type, a compiled pattern is applied with `.match()` to the element
    type, and anything else is called with the element.
    """

    def has_tag(candidate: "PDFStructElement") -> bool:
        """Accept elements whose type equals the given name."""
        return candidate.type == matcher

    def matches_pattern(candidate: "PDFStructElement") -> bool:
        """Accept elements whose type matches the given regex."""
        return matcher.match(candidate.type)  # type: ignore

    if isinstance(matcher, re.Pattern):
        accepts = matches_pattern
    elif isinstance(matcher, str):
        accepts = has_tag
    else:
        accepts = matcher  # type: ignore

    pending = deque(elements)
    while pending:
        current = pending.popleft()
        if accepts(current):
            yield current
        # Push children to the front, last child first, so that the first
        # child is the next element visited.
        for child in reversed(current.children):
            pending.appendleft(child)
class Findable:
    """Mixin providing `find()` and `find_all()` over `self.children`, so
    that trees and elements do not each have to implement them."""

    children: List["PDFStructElement"]

    def find_all(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Iterator["PDFStructElement"]:
        """Iterate depth-first over the matching elements of the subtree.

        `matcher` may be an element name, a regular expression, or a
        function that takes a `PDFStructElement` and returns `True` when
        the element matches.
        """
        return _find_all(self.children, matcher)

    def find(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Optional["PDFStructElement"]:
        """Return the first matching element of the subtree, or `None`.

        `matcher` may be an element name, a regular expression, or a
        function that takes a `PDFStructElement` and returns `True` when
        the element matches.
        """
        return next(_find_all(self.children, matcher), None)
@dataclass
class PDFStructElement(Findable):
    """One node of the parsed structure tree, with its marked-content IDs
    and child elements."""

    type: str
    revision: Optional[int]
    id: Optional[str]
    lang: Optional[str]
    alt_text: Optional[str]
    actual_text: Optional[str]
    title: Optional[str]
    page_number: Optional[int]
    attributes: Dict[str, Any] = field(default_factory=dict)
    mcids: List[int] = field(default_factory=list)
    children: List["PDFStructElement"] = field(default_factory=list)

    def __iter__(self) -> Iterator["PDFStructElement"]:
        """Iterate over the direct children of this element."""
        return iter(self.children)

    def all_mcids(self) -> Iterator[Tuple[Optional[int], int]]:
        """Yield `(page_number, mcid)` for this element and every
        descendant. `page_number` is whatever the owning element carries,
        which may be `None`.
        """
        # Depth-first, starting with this element, to preserve ordering
        pending = deque([self])
        while pending:
            node = pending.popleft()
            for mcid in node.mcids:
                yield node.page_number, mcid
            pending.extendleft(reversed(node.children))

    def to_dict(self) -> Dict[str, Any]:
        """Return a nested dict of this element with empty values removed."""
        result = asdict(self)
        # Visit every nested element dict; the order is irrelevant here.
        queue = deque([result])
        while queue:
            node = queue.popleft()
            for key in list(node.keys()):
                value = node[key]
                if value is None or value == [] or value == {}:
                    del node[key]
            if "children" in node:
                queue.extend(node["children"])
        return result
class StructTreeMissing(ValueError):
    """Raised by `PDFStructTree` when the PDF catalog has no
    `StructTreeRoot` entry."""
class PDFStructTree(Findable):
"""Parse the structure tree of a PDF.
The constructor takes a `pdfplumber.PDF` and optionally a
`pdfplumber.Page`. To avoid creating the entire tree for a large
document it is recommended to provide a page.
This class creates a representation of the portion of the
structure tree that reaches marked content sections, either for a
single page, or for the whole document. Note that this is slightly
different from the behaviour of other PDF libraries which will
also include structure elements with no content.
If the PDF has no structure, the constructor will raise
`StructTreeMissing`.
"""
page: Optional["Page"]
def __init__(self, doc: "PDF", page: Optional["Page"] = None):
    """Parse the structure tree for one page or for the whole document.

    :param doc: The `pdfplumber.PDF` whose catalog holds the tree.
    :param page: If given, only the structure reaching this page is built.
    :raises StructTreeMissing: If the catalog has no `StructTreeRoot`.
    """
    self.doc = doc.doc
    if "StructTreeRoot" not in self.doc.catalog:
        raise StructTreeMissing("PDF has no structure")
    self.root = resolve1(self.doc.catalog["StructTreeRoot"])
    self.role_map = resolve1(self.root.get("RoleMap", {}))
    self.class_map = resolve1(self.root.get("ClassMap", {}))
    self.children: List[PDFStructElement] = []

    # If we have a specific page then we will work backwards from
    # its ParentTree - this is because structure elements could
    # span multiple pages, and the "Pg" attribute is *optional*,
    # so this is the approved way to get a page's structure...
    if page is not None:
        self.page = page
        self.pages = {page.page_number: page}
        self.page_dict = None
        # ...EXCEPT that the ParentTree is sometimes missing, in which
        # case we fall back to the non-approved way.
        parent_tree_obj = self.root.get("ParentTree")
        if parent_tree_obj is None:
            self._parse_struct_tree()
        else:
            parent_tree = NumberTree(parent_tree_obj)
            # If there is no marked content in the structure tree for
            # this page (which can happen even when there is a
            # structure tree) then there is no `StructParents`.
            # Note however that if there are XObjects in a page,
            # *they* may have `StructParent` (not `StructParents`)
            if "StructParents" not in self.page.page_obj.attrs:
                return
            parent_id = self.page.page_obj.attrs["StructParents"]
            # NumberTree should have a `get` method like it does in pdf.js...
            parent_ref = next(
                (array for num, array in parent_tree.values if num == parent_id),
                None,
            )
            parent_array = resolve1(parent_ref)
            # The page points at a ParentTree entry that is absent (or
            # null): there is nothing to walk back from, so treat it like
            # a page without marked content instead of letting
            # StopIteration/TypeError escape from the constructor.
            if parent_array is None:
                return
            self._parse_parent_tree(parent_array)
    else:
        self.page = None
        # Overhead of creating pages shouldn't be too bad we hope!
        self.pages = {page.page_number: page for page in doc.pages}
        self.page_dict = {
            page.page_obj.pageid: page.page_number for page in self.pages.values()
        }
        self._parse_struct_tree()
def _make_attributes(
    self, obj: Dict[str, Any], revision: Optional[int]
) -> Dict[str, Any]:
    """Merge the /C (class) and /A (attribute) entries of a structure
    element into one flat dict.

    :param obj: The structure element dictionary.
    :param revision: The element's revision number (its /R entry), used
        to decide which revision-tagged attribute objects apply.
    :returns: Attribute names mapped to values; name objects are decoded
        to strings. Later objects override earlier ones, and /A is
        processed after /C.
    """
    attr_obj_list = []
    for key in "C", "A":
        if key not in obj:
            continue
        attr_obj = resolve1(obj[key])
        # It could be a list of attribute objects (why?)
        if isinstance(attr_obj, list):
            attr_obj_list.extend(attr_obj)
        else:
            attr_obj_list.append(attr_obj)

    attr_objs = []
    prev_obj = None
    for aref in attr_obj_list:
        # If we find a revision number, which might "follow the
        # revision object" (the spec is not clear about what this
        # should look like but it implies they are simply adjacent
        # in a flat array), then use it to decide whether to take
        # the previous object...
        if isinstance(aref, int):
            if aref == revision and prev_obj is not None:
                attr_objs.append(prev_obj)
            prev_obj = None
        else:
            if prev_obj is not None:
                attr_objs.append(prev_obj)
            prev_obj = resolve1(aref)
    if prev_obj is not None:
        attr_objs.append(prev_obj)

    # Now merge all the collected attribute objects into a single
    # dict (again, the spec doesn't really explain this but does say
    # that attributes in /A supersede those in /C)
    attr = {}
    for attr_obj in attr_objs:
        if isinstance(attr_obj, PSLiteral):
            class_name = decode_text(attr_obj.name)
            if class_name not in self.class_map:
                logger.warning("Unknown attribute class %s", class_name)
                continue
            # ClassMap values may themselves be indirect references;
            # resolve1() returns direct objects unchanged.
            attr_obj = resolve1(self.class_map[class_name])
        for k, v in attr_obj.items():
            if isinstance(v, PSLiteral):
                attr[k] = decode_text(v.name)
            else:
                attr[k] = v
    return attr
def _make_element(self, obj: Any) -> Tuple[Optional[PDFStructElement], List[Any]]:
    """Build a `PDFStructElement` from a structure element dictionary.

    Returns the new element, with no children or MCIDs attached yet,
    along with the entries of its /K value as a list. The caller is
    responsible for resolving those entries.
    """
    # We hopefully caught these earlier
    assert "MCID" not in obj, "Uncaught MCR: %s" % obj
    assert "Obj" not in obj, "Uncaught OBJR: %s" % obj

    def text_entry(key: str) -> Optional[str]:
        """Decode the text stored under `key`, or None if it is absent."""
        return decode_text(resolve1(obj[key])) if key in obj else None

    # Page numbers are only tracked when the whole document was parsed
    page_number = None
    if self.page_dict is not None and "Pg" in obj:
        page_objid = obj["Pg"].objid
        assert page_objid in self.page_dict, "Object on unparsed page: %s" % obj
        page_number = self.page_dict[page_objid]

    if "S" in obj:
        obj_tag = decode_text(obj["S"].name)
        # Translate custom tags through the role map
        if obj_tag in self.role_map:
            obj_tag = decode_text(self.role_map[obj_tag].name)
    else:
        obj_tag = ""

    if "K" not in obj:
        children = []
    else:
        children = resolve1(obj["K"])
        if isinstance(children, int):  # a bare MCID
            children = [children]
        elif isinstance(children, dict):  # a single object: keep the raw entry
            children = [obj["K"]]

    revision = obj.get("R")
    attributes = self._make_attributes(obj, revision)
    element_id = text_entry("ID")
    title = text_entry("T")
    lang = text_entry("Lang")
    alt_text = text_entry("Alt")
    actual_text = text_entry("ActualText")

    element = PDFStructElement(
        type=obj_tag,
        id=element_id,
        page_number=page_number,
        revision=revision,
        lang=lang,
        title=title,
        alt_text=alt_text,
        actual_text=actual_text,
        attributes=attributes,
    )
    return element, children
def _parse_parent_tree(self, parent_array: List[Any]) -> None:
    """Populate the structure tree using the leaves of the parent tree for
    a given page.

    `parent_array` holds the references found in the page's entry of the
    parent tree. Each one is followed upwards through its "P" (parent)
    entry until the structure tree root is reached; every element seen on
    the way is recorded and then linked together by `_resolve_children()`.
    """
    # First walk backwards from the leaves to the root, tracking references
    d = deque(parent_array)
    # Maps repr(reference) -> (element, raw children) for each visited element
    s = {}
    found_root = False
    while d:
        ref = d.popleft()
        # In the case where an MCID is not associated with any
        # structure, there will be a "null" in the parent tree.
        if ref == PDFParser.KEYWORD_NULL:
            continue
        # Already visited via another leaf
        if repr(ref) in s:
            continue
        obj = resolve1(ref)
        # This is required! It's in the spec!
        if "Type" in obj and decode_text(obj["Type"].name) == "StructTreeRoot":
            found_root = True
        else:
            # We hope that these are actual elements and not
            # references or marked-content sections...
            element, children = self._make_element(obj)
            # We have no page tree so we assume this page was parsed
            assert element is not None
            s[repr(ref)] = element, children
            # Continue the walk with this element's parent
            d.append(obj["P"])
    # If we didn't reach the root something is quite wrong!
    assert found_root
    self._resolve_children(s)
def on_parsed_page(self, obj: Dict[str, Any]) -> bool:
    """Tell whether `obj` belongs to a page covered by this tree.

    Objects without a "Pg" entry are always accepted. Otherwise the
    referenced page must be among the parsed pages (whole-document mode)
    or be the single page this tree was built for.
    """
    if "Pg" not in obj:
        return True
    page_objid = obj["Pg"].objid
    if self.page_dict is not None:
        return page_objid in self.page_dict
    # Single-page mode (or no page information at all)
    return self.page is None or page_objid == self.page.page_obj.pageid
def _parse_struct_tree(self) -> None:
    """Populate the structure tree starting from the root, skipping
    unparsed pages and empty elements.

    Works in three passes: collect every reachable element (keyed by the
    `repr` of its reference), prune elements that end up with no
    children, then hand the survivors to `_resolve_children()`.
    """
    root = resolve1(self.root["K"])
    # It could just be a single object ... it's in the spec (argh)
    if isinstance(root, dict):
        root = [self.root["K"]]
    d = deque(root)
    # Maps repr(reference) -> (element, raw children)
    s = {}
    while d:
        ref = d.popleft()
        # In case the tree is actually a DAG and not a tree...
        if repr(ref) in s:  # pragma: nocover (shouldn't happen)
            continue
        obj = resolve1(ref)
        # Deref top-level OBJR skipping refs to unparsed pages
        if isinstance(obj, dict) and "Obj" in obj:
            if not self.on_parsed_page(obj):
                continue
            ref = obj["Obj"]
            obj = resolve1(ref)
        element, children = self._make_element(obj)
        # Similar to above, delay resolving the children to avoid
        # tree-recursion.
        s[repr(ref)] = element, children
        for child in children:
            obj = resolve1(child)
            if isinstance(obj, dict):
                if not self.on_parsed_page(obj):
                    continue
                if "Obj" in obj:
                    # Object reference: queue the object it points to
                    child = obj["Obj"]
                elif "MCID" in obj:
                    # Marked-content reference: nothing further to visit
                    continue
            if isinstance(child, PDFObjRef):
                d.append(child)

    # Traverse depth-first, removing empty elements (unsure how to
    # do this non-recursively)
    def prune(elements: List[Any]) -> List[Any]:
        # Returns the entries of `elements` worth keeping: integer MCIDs,
        # MCIDs taken from on-page marked-content references, and
        # references to elements that still have children after pruning.
        next_elements = []
        for ref in elements:
            obj = resolve1(ref)
            if isinstance(ref, int):
                next_elements.append(ref)
                continue
            elif isinstance(obj, dict):
                if not self.on_parsed_page(obj):
                    continue
                if "MCID" in obj:
                    next_elements.append(obj["MCID"])
                    continue
                elif "Obj" in obj:
                    ref = obj["Obj"]
            element, children = s[repr(ref)]
            children = prune(children)
            # See assertions below
            if element is None or not children:
                del s[repr(ref)]
            else:
                s[repr(ref)] = element, children
                next_elements.append(ref)
        return next_elements

    prune(root)
    self._resolve_children(s)
def _resolve_children(self, seen: Dict[str, Any]) -> None:
    """Resolve children starting from the tree root based on references we
    saw when traversing the structure tree.

    `seen` maps `repr(reference)` to an `(element, raw children)` pair.
    On return, `self.children` holds the top-level elements and every
    recorded element has its `mcids` and `children` lists filled in.
    """
    root = resolve1(self.root["K"])
    # It could just be a single object ... it's in the spec (argh)
    if isinstance(root, dict):
        root = [self.root["K"]]
    self.children = []
    # Create top-level self.children
    parsed_root = []
    for ref in root:
        obj = resolve1(ref)
        if isinstance(obj, dict) and "Obj" in obj:
            if not self.on_parsed_page(obj):
                continue
            ref = obj["Obj"]
        # Only keep root entries that were actually recorded
        if repr(ref) in seen:
            parsed_root.append(ref)
    # Breadth-first walk attaching MCIDs and child elements
    d = deque(parsed_root)
    while d:
        ref = d.popleft()
        element, children = seen[repr(ref)]
        assert element is not None, "Unparsed element"
        for child in children:
            obj = resolve1(child)
            if isinstance(obj, int):
                element.mcids.append(obj)
            elif isinstance(obj, dict):
                # Skip out-of-page MCIDS and OBJRs
                if not self.on_parsed_page(obj):
                    continue
                if "MCID" in obj:
                    element.mcids.append(obj["MCID"])
                elif "Obj" in obj:
                    child = obj["Obj"]
            # NOTE: if, not elif, in case of OBJR above
            if isinstance(child, PDFObjRef):
                # References that were never recorded are silently dropped
                child_element, _ = seen.get(repr(child), (None, None))
                if child_element is not None:
                    element.children.append(child_element)
                    d.append(child)
    self.children = [seen[repr(ref)][0] for ref in parsed_root]
def __iter__(self) -> Iterator[PDFStructElement]:
    """Iterate over the top-level elements of the tree."""
    return iter(self.children)
def element_bbox(self, el: PDFStructElement) -> T_bbox:
    """Get the bounding box for an element for visual debugging.

    Uses the element's "BBox" attribute when both it and a page are
    available; otherwise falls back to the bounding box of all page
    objects whose "mcid" matches one of the element's MCIDs.

    Raises `IndexError` if the element has been cropped off the page or
    if no matching objects are found.
    """
    if self.page is not None:
        page = self.page
    elif el.page_number is not None:
        page = self.pages[el.page_number]
    else:
        page = None
    bbox = el.attributes.get("BBox", None)

    if page is None or bbox is None:
        # No explicit box (or no page to interpret it on): collect the
        # objects belonging to the element's marked content instead.
        mcid_objs = []
        for page_number, mcid in el.all_mcids():
            objects: Iterable[T_obj]
            if page_number is not None:
                objects = itertools.chain.from_iterable(
                    self.pages[page_number].objects.values()
                )
            elif page is not None:
                objects = itertools.chain.from_iterable(page.objects.values())
            else:
                objects = []  # pragma: nocover
            mcid_objs.extend(c for c in objects if c["mcid"] == mcid)
        if not mcid_objs:
            raise IndexError("No objects found")  # pragma: nocover
        return geometry.objects_to_bbox(mcid_objs)

    from .page import CroppedPage, _invert_box, _normalize_box

    # Use secret knowledge of CroppedPage (cannot use
    # page.height because it is the *cropped* dimension, but
    # cropping does not actually translate coordinates)
    mediabox_height = page.mediabox[3] - page.mediabox[1]
    bbox = _invert_box(_normalize_box(bbox), mediabox_height)
    if not isinstance(page, CroppedPage):
        # Not sure why mypy complains here
        return bbox  # type: ignore

    # Use more secret knowledge of CroppedPage
    rect = geometry.bbox_to_rect(bbox)
    rects = page._crop_fn([rect])
    if not rects:
        raise IndexError("Element no longer on page")
    return geometry.obj_to_bbox(rects[0])
================================================
FILE: pdfplumber/table.py
================================================
import itertools
from dataclasses import dataclass
from operator import itemgetter
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type, Union
from . import utils
from ._typing import T_bbox, T_num, T_obj, T_obj_iter, T_obj_list, T_point
# Default tolerance for `snap_edges()` and for `TableSettings.snap_tolerance`
DEFAULT_SNAP_TOLERANCE = 3
# Default tolerance for `join_edge_group()` and `TableSettings.join_tolerance`
DEFAULT_JOIN_TOLERANCE = 3
# Default `word_threshold` for `words_to_edges_v()`
DEFAULT_MIN_WORDS_VERTICAL = 3
# Default `word_threshold` for `words_to_edges_h()`
DEFAULT_MIN_WORDS_HORIZONTAL = 1
# Maps an intersection point to the edges meeting there, grouped under the
# keys "v" and "h" (see `edges_to_intersections()`)
T_intersections = Dict[T_point, Dict[str, T_obj_list]]
# Anything `TableSettings.resolve()` accepts besides `None`
T_table_settings = Union["TableSettings", Dict[str, Any]]
if TYPE_CHECKING: # pragma: nocover
from .page import Page
def snap_edges(
    edges: T_obj_list,
    x_tolerance: T_num = DEFAULT_SNAP_TOLERANCE,
    y_tolerance: T_num = DEFAULT_SNAP_TOLERANCE,
) -> T_obj_list:
    """
    Snap edges lying within `tolerance` pixels of one another to their
    positional average.

    Vertical edges are snapped on "x0" using `x_tolerance`, horizontal
    edges on "top" using `y_tolerance`. The result lists the vertical
    edges first, then the horizontal ones.
    """
    groups: Dict[str, T_obj_list] = {"v": [], "h": []}
    for edge in edges:
        groups[edge["orientation"]].append(edge)

    vertical = utils.snap_objects(groups["v"], "x0", x_tolerance)
    horizontal = utils.snap_objects(groups["h"], "top", y_tolerance)
    return vertical + horizontal
def join_edge_group(
    edges: T_obj_iter, orientation: str, tolerance: T_num = DEFAULT_JOIN_TOLERANCE
) -> T_obj_list:
    """
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.

    :param edges: Edges sharing one line; consumed once.
    :param orientation: "h" (edges span "x0".."x1") or "v" (edges span
        "top".."bottom").
    :param tolerance: Largest gap between two edges that is still bridged.
    :returns: The joined edges, ordered by their starting coordinate. An
        empty input yields an empty list.
    :raises ValueError: If `orientation` is neither "v" nor "h".
    """
    if orientation == "h":
        min_prop, max_prop = "x0", "x1"
    elif orientation == "v":
        min_prop, max_prop = "top", "bottom"
    else:
        raise ValueError("Orientation must be 'v' or 'h'")

    sorted_edges = sorted(edges, key=itemgetter(min_prop))
    # Nothing to join; avoids indexing into an empty list below.
    if not sorted_edges:
        return []

    joined = [sorted_edges[0]]
    for e in sorted_edges[1:]:
        last = joined[-1]
        if e[min_prop] <= (last[max_prop] + tolerance):
            if e[max_prop] > last[max_prop]:
                # Extend current edge to new extremity
                joined[-1] = utils.resize_object(last, max_prop, e[max_prop])
        else:
            # Edge is separate from previous edges
            joined.append(e)
    return joined
def merge_edges(
    edges: T_obj_list,
    snap_x_tolerance: T_num,
    snap_y_tolerance: T_num,
    join_x_tolerance: T_num,
    join_y_tolerance: T_num,
) -> T_obj_list:
    """
    Merge a list of edges into a more "seamless" list: optionally snap
    them with `snap_edges`, then join the edges of each line with
    `join_edge_group`.
    """

    def line_of(edge: T_obj) -> Tuple[str, T_num]:
        """Key identifying the infinite line an edge lies on."""
        if edge["orientation"] == "h":
            return ("h", edge["top"])
        return ("v", edge["x0"])

    if snap_x_tolerance > 0 or snap_y_tolerance > 0:
        edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance)

    merged: T_obj_list = []
    by_line = itertools.groupby(sorted(edges, key=line_of), key=line_of)
    for (orientation, _), members in by_line:
        tolerance = join_x_tolerance if orientation == "h" else join_y_tolerance
        merged.extend(join_edge_group(members, orientation, tolerance))
    return merged
def words_to_edges_h(
    words: T_obj_list, word_threshold: int = DEFAULT_MIN_WORDS_HORIZONTAL
) -> T_obj_list:
    """
    Derive (imaginary) horizontal lines from rows of words whose tops
    align, considering only rows with at least `word_threshold` words.

    Every qualifying row contributes two edges, one at its top and one
    at its bottom, each spanning the full horizontal extent of all
    qualifying rows.
    """
    by_top = utils.cluster_objects(words, itemgetter("top"), 1)
    rects = [
        utils.objects_to_rect(cluster)
        for cluster in by_top
        if len(cluster) >= word_threshold
    ]
    if not rects:
        return []

    min_x0 = min(r["x0"] for r in rects)
    max_x1 = max(r["x1"] for r in rects)

    def h_edge(y: T_num) -> T_obj:
        """A horizontal edge at height `y` spanning all rows."""
        return {
            "x0": min_x0,
            "x1": max_x1,
            "top": y,
            "bottom": y,
            "width": max_x1 - min_x0,
            "orientation": "h",
        }

    edges = []
    for r in rects:
        # Top of text
        edges.append(h_edge(r["top"]))
        # The bottom line is often redundant with the next row's top
        # line, but it is what catches the last row of every table.
        edges.append(h_edge(r["bottom"]))
    return edges
def words_to_edges_v(
    words: T_obj_list, word_threshold: int = DEFAULT_MIN_WORDS_VERTICAL
) -> T_obj_list:
    """
    Derive (imaginary) vertical lines from groups of at least
    `word_threshold` words sharing a left edge, right edge, or center.

    One edge is placed at the left side of each retained group, plus one
    closing edge at the right-most extent; all of them span the full
    vertical extent of the retained groups.
    """

    def get_center(word: T_obj) -> T_num:
        return float(word["x0"] + word["x1"]) / 2

    # Cluster words by left edge, right edge and horizontal center
    clusters = (
        utils.cluster_objects(words, itemgetter("x0"), 1)
        + utils.cluster_objects(words, itemgetter("x1"), 1)
        + utils.cluster_objects(words, get_center, 1)
    )

    # Largest alignments first (ties keep their original order), and
    # only those with enough words
    by_size = sorted(clusters, key=len, reverse=True)
    bboxes = [
        utils.objects_to_bbox(cluster)
        for cluster in by_size
        if len(cluster) >= word_threshold
    ]

    # Keep a bbox only if it overlaps none of those already kept
    condensed_bboxes: List[T_bbox] = []
    for bbox in bboxes:
        if not any(utils.get_bbox_overlap(bbox, kept) for kept in condensed_bboxes):
            condensed_bboxes.append(bbox)
    if not condensed_bboxes:
        return []

    sorted_rects = sorted(
        (utils.bbox_to_rect(b) for b in condensed_bboxes), key=itemgetter("x0")
    )
    max_x1 = max(r["x1"] for r in sorted_rects)
    min_top = min(r["top"] for r in sorted_rects)
    max_bottom = max(r["bottom"] for r in sorted_rects)

    # Left side of every group, then one closing line on the far right
    x_positions = [r["x0"] for r in sorted_rects] + [max_x1]
    return [
        {
            "x0": x,
            "x1": x,
            "top": min_top,
            "bottom": max_bottom,
            "height": max_bottom - min_top,
            "orientation": "v",
        }
        for x in x_positions
    ]
def edges_to_intersections(
    edges: T_obj_list, x_tolerance: T_num = 1, y_tolerance: T_num = 1
) -> T_intersections:
    """
    Given a list of edges, return the points at which they intersect
    within `tolerance` pixels.

    :param edges: Edge dicts; each is read for "orientation" ("v" or
        "h"), "x0", "x1", "top" and "bottom". Other orientations are
        ignored.
    :param x_tolerance: Horizontal slack allowed past the ends of a
        horizontal edge.
    :param y_tolerance: Vertical slack allowed past the ends of a
        vertical edge.
    :returns: A dict keyed by `(v["x0"], h["top"])`, whose values list the
        vertical ("v") and horizontal ("h") edges meeting at that point.
    """
    intersections: T_intersections = {}
    # Sort each orientation once, up front, rather than re-sorting the
    # horizontal edges for every vertical edge.
    v_edges = sorted(
        (e for e in edges if e["orientation"] == "v"), key=itemgetter("x0", "top")
    )
    h_edges = sorted(
        (e for e in edges if e["orientation"] == "h"), key=itemgetter("top", "x0")
    )
    for v in v_edges:
        for h in h_edges:
            if (
                (v["top"] <= (h["top"] + y_tolerance))
                and (v["bottom"] >= (h["top"] - y_tolerance))
                and (v["x0"] >= (h["x0"] - x_tolerance))
                and (v["x0"] <= (h["x1"] + x_tolerance))
            ):
                vertex = (v["x0"], h["top"])
                if vertex not in intersections:
                    intersections[vertex] = {"v": [], "h": []}
                intersections[vertex]["v"].append(v)
                intersections[vertex]["h"].append(h)
    return intersections
def intersections_to_cells(intersections: T_intersections) -> List[T_bbox]:
    """
    Return all rectangular "cells" described by a set of intersection
    points.

    `intersections` should be a dictionary with (x0, top) tuples as keys
    and, as values, the edge objects touching that point (grouped under
    "v" and "h"). For each point, the first connected rectangle found
    with that point as its top-left corner is reported.
    """

    def bboxes_at(point: T_point, orientation: str) -> Set[T_bbox]:
        """Bounding boxes of the edges of one orientation at `point`."""
        return {utils.obj_to_bbox(e) for e in intersections[point][orientation]}

    def shares_edge(p1: T_point, p2: T_point, orientation: str) -> bool:
        return not bboxes_at(p1, orientation).isdisjoint(bboxes_at(p2, orientation))

    def edge_connects(p1: T_point, p2: T_point) -> bool:
        """True if one edge runs through both (axis-aligned) points."""
        if p1[0] == p2[0] and shares_edge(p1, p2, "v"):
            return True
        if p1[1] == p2[1] and shares_edge(p1, p2, "h"):
            return True
        return False

    points = sorted(intersections.keys())
    last_index = len(points) - 1

    def find_smallest_cell(points: List[T_point], i: int) -> Optional[T_bbox]:
        # The final point has nothing after it to pair with
        if i == last_index:
            return None
        pt = points[i]
        later = points[i + 1 :]
        # Candidates directly below and directly to the right
        below = [p for p in later if p[0] == pt[0]]
        right = [p for p in later if p[1] == pt[1]]
        for below_pt in below:
            if not edge_connects(pt, below_pt):
                continue
            for right_pt in right:
                if not edge_connects(pt, right_pt):
                    continue
                bottom_right = (right_pt[0], below_pt[1])
                closes = (
                    bottom_right in intersections
                    and edge_connects(bottom_right, right_pt)
                    and edge_connects(bottom_right, below_pt)
                )
                if closes:
                    return (pt[0], pt[1], bottom_right[0], bottom_right[1])
        return None

    cells = []
    for i in range(len(points)):
        cell = find_smallest_cell(points, i)
        if cell:
            cells.append(cell)
    return cells
def cells_to_tables(cells: List[T_bbox]) -> List[List[T_bbox]]:
    """
    Group bounding boxes (`cells`) into tables, where a table is a set of
    cells chained together by shared corners.

    Groups consisting of a single cell are dropped. The remaining tables
    are ordered by their topmost-then-leftmost cell coordinate.
    """

    def bbox_to_corners(bbox: T_bbox) -> Tuple[T_point, T_point, T_point, T_point]:
        x0, top, x1, bottom = bbox
        return ((x0, top), (x0, bottom), (x1, top), (x1, bottom))

    unassigned = list(cells)
    group_corners: Set[T_point] = set()
    group: List[T_bbox] = []
    tables = []

    while unassigned:
        size_before = len(group)
        for cell in list(unassigned):
            corners = bbox_to_corners(cell)
            # A cell joins the group if it starts a new one, or if it
            # touches the current group on at least one corner.
            if not group or any(c in group_corners for c in corners):
                group_corners.update(corners)
                group.append(cell)
                unassigned.remove(cell)
        # A full pass that added nothing means the group is complete
        if len(group) == size_before:
            tables.append(list(group))
            group_corners.clear()
            group.clear()

    # Once the cells are exhausted, store any group still in progress
    if group:
        tables.append(list(group))

    # Sort the tables top-to-bottom-left-to-right based on the value of the
    # topmost-and-then-leftmost coordinate of a table.
    ordered = sorted(tables, key=lambda t: min((c[1], c[0]) for c in t))
    return [t for t in ordered if len(t) > 1]
class CellGroup(object):
    """A sequence of cells (some possibly missing) and their joint bbox."""

    def __init__(self, cells: List[Optional[T_bbox]]):
        self.cells = cells
        # Missing cells (None) do not contribute to the bounding box
        present = [cell for cell in cells if cell]
        self.bbox = (
            min(cell[0] for cell in present),
            min(cell[1] for cell in present),
            max(cell[2] for cell in present),
            max(cell[3] for cell in present),
        )
class Row(CellGroup):
    """The `CellGroup` type produced by `Table.rows`."""
class Column(CellGroup):
    """The `CellGroup` type produced by `Table.columns`."""
class Table(object):
    """A group of cells found on a page, with helpers to arrange them
    into rows and columns and to extract their text."""

    def __init__(self, page: "Page", cells: List[T_bbox]):
        self.page = page
        self.cells = cells

    @property
    def bbox(self) -> T_bbox:
        """The smallest bounding box containing every cell."""
        cells = self.cells
        return (
            min(cell[0] for cell in cells),
            min(cell[1] for cell in cells),
            max(cell[2] for cell in cells),
            max(cell[3] for cell in cells),
        )

    def _get_rows_or_cols(self, kind: Type[CellGroup]) -> List[CellGroup]:
        """Arrange the cells into groups of type `kind` (`Row` or `Column`).

        Every group has one slot per distinct position along the group's
        axis; slots with no cell hold `None`.
        """
        axis = 0 if kind is Row else 1
        antiaxis = int(not axis)
        # Order by top/x0 first, then by x0/top
        ordered = sorted(self.cells, key=itemgetter(antiaxis, axis))
        # All distinct x0s/tops, in ascending order
        positions = sorted({cell[axis] for cell in self.cells})

        groups = []
        # One group per distinct top/x0
        for _, members in itertools.groupby(ordered, itemgetter(antiaxis)):
            by_position = {cell[axis]: cell for cell in members}
            groups.append(kind([by_position.get(p) for p in positions]))
        return groups

    @property
    def rows(self) -> List[CellGroup]:
        """The table's cells arranged as `Row` objects."""
        return self._get_rows_or_cols(Row)

    @property
    def columns(self) -> List[CellGroup]:
        """The table's cells arranged as `Column` objects."""
        return self._get_rows_or_cols(Column)

    def extract(self, **kwargs: Any) -> List[List[Optional[str]]]:
        """Extract the text of every cell, row by row.

        Missing cells yield `None`, cells without characters yield `""`,
        and all other cells yield `utils.extract_text(...)` called with
        `kwargs`. When "layout" is among the keyword arguments, the
        layout width, height and bbox are set from each cell.
        """
        chars = self.page.chars

        def char_in_bbox(char: T_obj, bbox: T_bbox) -> bool:
            """True if the char's midpoint lies in the half-open bbox."""
            v_mid = (char["top"] + char["bottom"]) / 2
            h_mid = (char["x0"] + char["x1"]) / 2
            x0, top, x1, bottom = bbox
            return bool(x0 <= h_mid < x1 and top <= v_mid < bottom)

        table_arr = []
        for row in self.rows:
            # Narrow down to the row first so each cell scans fewer chars
            row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]
            arr: List[Optional[str]] = []
            for cell in row.cells:
                if cell is None:
                    arr.append(None)
                    continue
                cell_chars = [char for char in row_chars if char_in_bbox(char, cell)]
                if not cell_chars:
                    arr.append("")
                    continue
                if "layout" in kwargs:
                    kwargs["layout_width"] = cell[2] - cell[0]
                    kwargs["layout_height"] = cell[3] - cell[1]
                    kwargs["layout_bbox"] = cell
                arr.append(utils.extract_text(cell_chars, **kwargs))
            table_arr.append(arr)
        return table_arr
# Values accepted for `TableSettings.vertical_strategy` and
# `TableSettings.horizontal_strategy`
TABLE_STRATEGIES = ["lines", "lines_strict", "text", "explicit"]
# `TableSettings` fields rejected by `__post_init__` when negative
NON_NEGATIVE_SETTINGS = [
    "snap_tolerance",
    "snap_x_tolerance",
    "snap_y_tolerance",
    "join_tolerance",
    "join_x_tolerance",
    "join_y_tolerance",
    "edge_min_length",
    "edge_min_length_prefilter",
    "min_words_vertical",
    "min_words_horizontal",
    "intersection_tolerance",
    "intersection_x_tolerance",
    "intersection_y_tolerance",
]
class UnsetFloat(float):
    """Float subclass used to build the `UNSET` sentinel."""


# Sentinel default for settings that fall back to another setting; it is
# recognised by identity (`is UNSET`), not by value.
UNSET = UnsetFloat(0)
@dataclass
class TableSettings:
    """Settings controlling how tables are detected on a page.

    The per-axis tolerances (`*_x_tolerance` / `*_y_tolerance`) default to
    `UNSET` and are filled from their general counterpart
    (`snap_tolerance`, `join_tolerance`, `intersection_tolerance`) in
    `__post_init__`.
    """

    vertical_strategy: str = "lines"
    horizontal_strategy: str = "lines"
    explicit_vertical_lines: Optional[List[Union[T_obj, T_num]]] = None
    explicit_horizontal_lines: Optional[List[Union[T_obj, T_num]]] = None
    snap_tolerance: T_num = DEFAULT_SNAP_TOLERANCE
    snap_x_tolerance: T_num = UNSET
    snap_y_tolerance: T_num = UNSET
    join_tolerance: T_num = DEFAULT_JOIN_TOLERANCE
    join_x_tolerance: T_num = UNSET
    join_y_tolerance: T_num = UNSET
    edge_min_length: T_num = 3
    edge_min_length_prefilter: T_num = 1
    min_words_vertical: int = DEFAULT_MIN_WORDS_VERTICAL
    min_words_horizontal: int = DEFAULT_MIN_WORDS_HORIZONTAL
    intersection_tolerance: T_num = 3
    intersection_x_tolerance: T_num = UNSET
    intersection_y_tolerance: T_num = UNSET
    text_settings: Optional[Dict[str, Any]] = None

    def __post_init__(self) -> None:
        """Validate the settings and fill in derived defaults.

        Checks that no setting listed in `NON_NEGATIVE_SETTINGS` is
        negative and that both strategies are in `TABLE_STRATEGIES`.
        Then ensures `text_settings` has "x_tolerance" and "y_tolerance"
        (taken from a legacy "tolerance" key, or 3), and replaces any
        per-axis tolerance still `UNSET` with its general counterpart.

        TODO: Can be further used to validate that the values are of the
        correct type.

        :raises ValueError: When a non-negative setting is negative or a
            strategy is not recognised.
        """
        for setting in NON_NEGATIVE_SETTINGS:
            if (getattr(self, setting) or 0) < 0:
                raise ValueError(f"Table setting '{setting}' cannot be negative")

        for orientation in ["horizontal", "vertical"]:
            strategy = getattr(self, orientation + "_strategy")
            if strategy not in TABLE_STRATEGIES:
                # The trailing space keeps "one of" from running into the
                # list of strategies.
                raise ValueError(
                    f"{orientation}_strategy must be one of "
                    f'{{{",".join(TABLE_STRATEGIES)}}}'
                )

        if self.text_settings is None:
            self.text_settings = {}

        # This next section is for backwards compatibility
        for attr in ["x_tolerance", "y_tolerance"]:
            if attr not in self.text_settings:
                self.text_settings[attr] = self.text_settings.get("tolerance", 3)
        if "tolerance" in self.text_settings:
            del self.text_settings["tolerance"]
        # End of that section

        for attr, fallback in [
            ("snap_x_tolerance", "snap_tolerance"),
            ("snap_y_tolerance", "snap_tolerance"),
            ("join_x_tolerance", "join_tolerance"),
            ("join_y_tolerance", "join_tolerance"),
            ("intersection_x_tolerance", "intersection_tolerance"),
            ("intersection_y_tolerance", "intersection_tolerance"),
        ]:
            # Identity check: an explicit 0 must not trigger the fallback
            if getattr(self, attr) is UNSET:
                setattr(self, attr, getattr(self, fallback))

    @classmethod
    def resolve(cls, settings: Optional[T_table_settings]) -> "TableSettings":
        """Coerce `settings` into a `TableSettings` instance.

        `None` gives the defaults and an existing instance is returned
        unchanged. For a dict, keys starting with "text_" are collected
        (prefix removed) into `text_settings`, and the rest are passed to
        the constructor.

        :raises ValueError: If `settings` is of any other type.
        """
        if settings is None:
            return cls()
        elif isinstance(settings, cls):
            return settings
        elif isinstance(settings, dict):
            core_settings = {}
            text_settings = {}
            for k, v in settings.items():
                if k[:5] == "text_":
                    text_settings[k[5:]] = v
                else:
                    core_settings[k] = v
            core_settings["text_settings"] = text_settings
            return cls(**core_settings)
        else:
            raise ValueError(f"Cannot resolve settings: {settings}")
class TableFinder(object):
    """
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16

    Instantiating the class runs the whole pipeline and stores the result
    of each stage as an attribute: `edges`, `intersections`, `cells` and
    `tables`.
    """

    def __init__(self, page: "Page", settings: Optional[T_table_settings] = None):
        """
        `settings` may be anything accepted by `TableSettings.resolve`.
        """
        self.page = page
        self.settings = TableSettings.resolve(settings)
        self.edges = self.get_edges()
        self.intersections = edges_to_intersections(
            self.edges,
            self.settings.intersection_x_tolerance,
            self.settings.intersection_y_tolerance,
        )
        self.cells = intersections_to_cells(self.intersections)
        self.tables = [
            Table(self.page, cell_group) for cell_group in cells_to_tables(self.cells)
        ]

    def get_edges(self) -> T_obj_list:
        """
        Collect the vertical and horizontal edges implied by the settings,
        merge them (snapping and joining within the configured tolerances),
        and drop those shorter than `edge_min_length`.

        Raises ValueError if an orientation uses the "explicit" strategy
        without at least two explicit lines.
        """
        settings = self.settings
        self._check_explicit_lines()

        # Words are only needed (and only extracted) for the "text" strategy.
        strategies = (settings.vertical_strategy, settings.horizontal_strategy)
        words = (
            self.page.extract_words(**(settings.text_settings or {}))
            if "text" in strategies
            else None
        )

        v = self._base_edges("v", words) + self._explicit_edges("v")
        h = self._base_edges("h", words) + self._explicit_edges("h")

        edges = merge_edges(
            list(v) + list(h),
            snap_x_tolerance=settings.snap_x_tolerance,
            snap_y_tolerance=settings.snap_y_tolerance,
            join_x_tolerance=settings.join_x_tolerance,
            join_y_tolerance=settings.join_y_tolerance,
        )
        return utils.filter_edges(edges, min_length=settings.edge_min_length)

    def _check_explicit_lines(self) -> None:
        """Validate that "explicit" strategies come with two or more lines."""
        settings = self.settings
        for orientation in ["vertical", "horizontal"]:
            strategy = getattr(settings, orientation + "_strategy")
            if strategy != "explicit":
                continue
            lines = getattr(settings, "explicit_" + orientation + "_lines")
            # The setting may be left as None; treat that like "too few
            # lines" rather than letting len(None) raise a TypeError.
            if lines is None or len(lines) < 2:
                raise ValueError(
                    f"If {orientation}_strategy == 'explicit', "
                    f"explicit_{orientation}_lines "
                    f"must be specified as a list/tuple of two or more "
                    f"floats/ints."
                )

    def _explicit_edges(self, axis: str) -> T_obj_list:
        """
        Build the user-specified edges for `axis` ("v" or "h").

        Dict descriptions are converted with `utils.obj_to_edges`, keeping
        only edges of the requested orientation. Any other value is taken
        as a coordinate and becomes a line spanning the page's bbox.
        """
        settings = self.settings
        if axis == "v":
            descs = settings.explicit_vertical_lines
        else:
            descs = settings.explicit_horizontal_lines

        edges = []
        for desc in descs or []:
            if isinstance(desc, dict):
                for e in utils.obj_to_edges(desc):
                    if e["orientation"] == axis:
                        edges.append(e)
            elif axis == "v":
                edges.append(
                    {
                        "x0": desc,
                        "x1": desc,
                        "top": self.page.bbox[1],
                        "bottom": self.page.bbox[3],
                        "height": self.page.bbox[3] - self.page.bbox[1],
                        "orientation": "v",
                    }
                )
            else:
                edges.append(
                    {
                        "x0": self.page.bbox[0],
                        "x1": self.page.bbox[2],
                        "width": self.page.bbox[2] - self.page.bbox[0],
                        "top": desc,
                        "bottom": desc,
                        "orientation": "h",
                    }
                )
        return edges

    def _base_edges(self, axis: str, words: Optional[T_obj_list]) -> T_obj_list:
        """
        Return the edges produced by the configured strategy for `axis`
        ("v" or "h"). `words` is only consulted for the "text" strategy.
        """
        settings = self.settings
        if axis == "v":
            strategy = settings.vertical_strategy
        else:
            strategy = settings.horizontal_strategy

        if strategy == "lines":
            return utils.filter_edges(
                self.page.edges, axis, min_length=settings.edge_min_length_prefilter
            )
        if strategy == "lines_strict":
            return utils.filter_edges(
                self.page.edges,
                axis,
                edge_type="line",
                min_length=settings.edge_min_length_prefilter,
            )
        if strategy == "text":
            if axis == "v":
                return words_to_edges_v(
                    words, word_threshold=settings.min_words_vertical
                )
            return words_to_edges_h(
                words, word_threshold=settings.min_words_horizontal
            )
        if strategy == "explicit":
            # Only the explicit lines contribute for this orientation.
            return []
        raise ValueError(f"Unrecognized table strategy: {strategy}")
================================================
FILE: pdfplumber/utils/__init__.py
================================================
from .clustering import cluster_list, cluster_objects, make_cluster_dict # noqa: F401
from .generic import to_list # noqa: F401
from .geometry import ( # noqa: F401
bbox_to_rect,
calculate_area,
clip_obj,
crop_to_bbox,
curve_to_edges,
filter_edges,
get_bbox_overlap,
intersects_bbox,
line_to_edge,
merge_bboxes,
move_object,
obj_to_bbox,
obj_to_edges,
objects_to_bbox,
objects_to_rect,
outside_bbox,
rect_to_edges,
resize_object,
snap_objects,
within_bbox,
)
from .pdfinternals import ( # noqa: F401
decode_psl_list,
decode_text,
resolve,
resolve_all,
resolve_and_decode,
)
from .text import ( # noqa: F401
DEFAULT_X_DENSITY,
DEFAULT_X_TOLERANCE,
DEFAULT_Y_DENSITY,
DEFAULT_Y_TOLERANCE,
chars_to_textmap,
collate_line,
dedupe_chars,
extract_text,
extract_text_simple,
extract_words,
)
================================================
FILE: pdfplumber/utils/clustering.py
================================================
import itertools
from collections.abc import Hashable
from operator import itemgetter
from typing import Any, Callable, Dict, Iterable, List, Tuple, TypeVar, Union
from .._typing import T_num, T_obj
def cluster_list(xs: List[T_num], tolerance: T_num = 0) -> List[List[T_num]]:
    """
    Sort `xs` and split it into runs in which each value is at most
    `tolerance` greater than the value before it.

    With a tolerance of zero, or fewer than two values, every value ends up
    in a cluster of its own.
    """
    ordered = sorted(xs)
    if tolerance == 0 or len(xs) < 2:
        return [[value] for value in ordered]

    clusters = [[ordered[0]]]
    for value in ordered[1:]:
        open_cluster = clusters[-1]
        # Compare against the most recent value, not the cluster's start.
        if value <= open_cluster[-1] + tolerance:
            open_cluster.append(value)
        else:
            clusters.append([value])
    return clusters
def make_cluster_dict(values: Iterable[T_num], tolerance: T_num) -> Dict[T_num, int]:
    """
    Map each distinct value to the index of the cluster (as computed by
    `cluster_list`) that it belongs to.
    """
    distinct = list(set(values))
    clusters = cluster_list(distinct, tolerance)
    return {
        value: index
        for index, members in enumerate(clusters)
        for value in members
    }
# Type variable for the items handled by cluster_objects(); it is
# constrained to either T_obj or a tuple of arbitrary length.
Clusterable = TypeVar("Clusterable", T_obj, Tuple[Any, ...])
def cluster_objects(
    xs: List[Clusterable],
    key_fn: Union[Hashable, Callable[[Clusterable], T_num]],
    tolerance: T_num,
    preserve_order: bool = False,
) -> List[List[Clusterable]]:
    """
    Group the items of `xs` whose key values fall into the same cluster.

    `key_fn` is either a callable returning an item's numeric key, or a
    non-callable value that is used to index each item. Key values are
    clustered with `make_cluster_dict` using `tolerance`.

    By default, items are ordered by cluster before grouping. With
    `preserve_order=True` the input order is kept and only *consecutive*
    items sharing a cluster are grouped together.
    """
    key = key_fn if callable(key_fn) else itemgetter(key_fn)
    cluster_index = make_cluster_dict((key(item) for item in xs), tolerance)

    labelled = [(item, cluster_index.get(key(item))) for item in xs]
    if not preserve_order:
        # Stable sort: items keep their relative order within a cluster.
        labelled.sort(key=itemgetter(1))

    return [
        [item for item, _ in members]
        for _, members in itertools.groupby(labelled, key=itemgetter(1))
    ]
================================================
FILE: pdfplumber/utils/exceptions.py
================================================
class MalformedPDFException(Exception):
    """Signals a PDF whose internal structure could not be processed."""
class PdfminerException(Exception):
    """Exception type for pdfminer-related failures (usage is outside this module)."""
================================================
FILE: pdfplumber/utils/generic.py
================================================
from collections.abc import Sequence
from typing import TYPE_CHECKING, Any, Dict, Hashable, List, Union
from .._typing import T_seq
if TYPE_CHECKING: # pragma: nocover
from pandas.core.frame import DataFrame
def to_list(collection: Union[T_seq[Any], "DataFrame"]) -> List[Any]:
    """
    Return `collection` as a list.

    A list is returned as-is (no copy). A non-sequence exposing a
    `to_dict` method is converted via `to_dict("records")`. Everything
    else is passed to `list()`.
    """
    if isinstance(collection, list):
        return collection
    if not isinstance(collection, Sequence) and hasattr(collection, "to_dict"):
        records: List[Dict[Hashable, Any]] = collection.to_dict(
            "records"
        )  # pragma: nocover
        return records
    return list(collection)
================================================
FILE: pdfplumber/utils/geometry.py
================================================
import itertools
from operator import itemgetter
from typing import Dict, Iterable, Optional
from .._typing import T_bbox, T_num, T_obj, T_obj_list
from .clustering import cluster_objects
def objects_to_rect(objects: Iterable[T_obj]) -> Dict[str, T_num]:
    """
    Compute the smallest rectangle enclosing every object in `objects`,
    expressed as a dict with "x0", "top", "x1", and "bottom" keys.
    """
    enclosing_bbox = objects_to_bbox(objects)
    return bbox_to_rect(enclosing_bbox)
def objects_to_bbox(objects: Iterable[T_obj]) -> T_bbox:
    """
    Compute the smallest bounding box enclosing every object in `objects`.
    """
    individual_bboxes = (bbox_getter(obj) for obj in objects)
    return merge_bboxes(individual_bboxes)
# Reusable getter returning an object's "x0", "top", "x1" and "bottom"
# values, in that order, as a tuple (i.e. its bounding box).
bbox_getter = itemgetter("x0", "top", "x1", "bottom")
def obj_to_bbox(obj: T_obj) -> T_bbox:
    """
    Extract an object's bounding box as an (x0, top, x1, bottom) tuple.
    """
    extracted: T_bbox = bbox_getter(obj)
    return extracted
def bbox_to_rect(bbox: T_bbox) -> Dict[str, T_num]:
    """
    Convert a bounding box into a rectangle, i.e. a dict whose "x0",
    "top", "x1" and "bottom" keys hold the box's four values in order.
    """
    keys = ("x0", "top", "x1", "bottom")
    return {key: bbox[position] for position, key in enumerate(keys)}
def merge_bboxes(bboxes: Iterable[T_bbox]) -> T_bbox:
    """
    Given an iterable of bounding boxes, return the smallest bounding box
    that contains them all.

    Raises ValueError if `bboxes` is empty, since no enclosing box exists.
    """
    # Materialize first so that emptiness can be detected for any iterable
    # (including generators) and reported clearly, instead of surfacing as
    # an opaque tuple-unpacking error.
    bbox_list = list(bboxes)
    if not bbox_list:
        raise ValueError("Cannot merge an empty collection of bounding boxes.")
    x0, top, x1, bottom = zip(*bbox_list)
    return (min(x0), min(top), max(x1), max(bottom))
def get_bbox_overlap(a: T_bbox, b: T_bbox) -> Optional[T_bbox]:
    """
    Return the bounding box shared by `a` and `b`, or None if they do not
    overlap.

    A shared region with zero width *or* zero height still counts as an
    overlap; one with both zero width and zero height (a single point)
    does not.
    """
    a_x0, a_top, a_x1, a_bottom = a
    b_x0, b_top, b_x1, b_bottom = b

    left, right = max(a_x0, b_x0), min(a_x1, b_x1)
    top, bottom = max(a_top, b_top), min(a_bottom, b_bottom)

    width = right - left
    height = bottom - top

    overlapping = height >= 0 and width >= 0 and height + width > 0
    return (left, top, right, bottom) if overlapping else None
def calculate_area(bbox: T_bbox) -> T_num:
    """
    Return the area (width times height) of a bounding box.

    Raises ValueError if the box's left edge lies beyond its right edge,
    or its top beyond its bottom.
    """
    x0, top, x1, bottom = bbox
    inverted = x0 > x1 or top > bottom
    if inverted:
        raise ValueError(f"{bbox} has a negative width or height.")
    width = x1 - x0
    height = bottom - top
    return width * height
def clip_obj(obj: T_obj, bbox: T_bbox) -> Optional[T_obj]:
    """
    Return a copy of `obj` trimmed to the part that overlaps `bbox`, or
    None if there is no overlap.

    The copy's "x0", "top", "x1", "bottom", "width" and "height" reflect
    the trimmed extent; "doctop", when present, is shifted by the same
    amount as "top".
    """
    overlap = get_bbox_overlap(obj_to_bbox(obj), bbox)
    if overlap is None:
        return None

    clipped = dict(obj)
    clipped.update(bbox_to_rect(overlap))

    if "doctop" in clipped:
        top_shift = clipped["top"] - obj["top"]
        clipped["doctop"] = obj["doctop"] + top_shift

    clipped["width"] = clipped["x1"] - clipped["x0"]
    clipped["height"] = clipped["bottom"] - clipped["top"]
    return clipped
def intersects_bbox(objs: Iterable[T_obj], bbox: T_bbox) -> T_obj_list:
    """
    Keep only the objects whose bounding box overlaps `bbox`.
    """
    touching = []
    for obj in objs:
        if get_bbox_overlap(obj_to_bbox(obj), bbox) is not None:
            touching.append(obj)
    return touching
def within_bbox(objs: Iterable[T_obj], bbox: T_bbox) -> T_obj_list:
    """
    Keep only the objects that lie entirely inside `bbox`, i.e. those
    whose overlap with `bbox` equals their own bounding box.
    """
    contained = []
    for obj in objs:
        overlap = get_bbox_overlap(obj_to_bbox(obj), bbox)
        if overlap == obj_to_bbox(obj):
            contained.append(obj)
    return contained
def outside_bbox(objs: Iterable[T_obj], bbox: T_bbox) -> T_obj_list:
    """
    Keep only the objects that have no overlap with `bbox` at all.
    """
    excluded = []
    for obj in objs:
        if get_bbox_overlap(obj_to_bbox(obj), bbox) is None:
            excluded.append(obj)
    return excluded
def crop_to_bbox(objs: Iterable[T_obj], bbox: T_bbox) -> T_obj_list:
    """
    Clip every object to `bbox` (see `clip_obj`), discarding the objects
    that do not overlap it.
    """
    clipped_objs = (clip_obj(obj, bbox) for obj in objs)
    return [clipped for clipped in clipped_objs if clipped]
def move_object(obj: T_obj, axis: str, value: T_num) -> T_obj:
    """
    Return a copy of `obj` shifted by `value` along `axis`.

    With axis "h", "x0" and "x1" are shifted. With axis "v", "top" and
    "bottom" are shifted, along with "doctop" when present; if "y0" is
    present, "y0" and "y1" are shifted in the opposite direction.
    The input object is left untouched.
    """
    assert axis in ("h", "v")
    if axis == "h":
        shifted = {"x0": obj["x0"] + value, "x1": obj["x1"] + value}
    elif axis == "v":
        shifted = {"top": obj["top"] + value, "bottom": obj["bottom"] + value}
        if "doctop" in obj:
            shifted["doctop"] = obj["doctop"] + value
        if "y0" in obj:
            shifted["y0"] = obj["y0"] - value
            shifted["y1"] = obj["y1"] - value
    # Later pairs override earlier ones when the new object is built.
    return obj.__class__(tuple(obj.items()) + tuple(shifted.items()))
def snap_objects(objs: Iterable[T_obj], attr: str, tolerance: T_num) -> T_obj_list:
    """
    Cluster the objects by `attr` (one of "x0", "x1", "top", "bottom")
    within `tolerance`, then move every object so that its `attr` equals
    the mean `attr` of its cluster.

    Returns the moved copies, cluster by cluster.
    """
    axis = {"x0": "h", "x1": "h", "top": "v", "bottom": "v"}[attr]
    read_attr = itemgetter(attr)

    snapped = []
    for cluster in cluster_objects(list(objs), read_attr, tolerance):
        mean = sum(read_attr(member) for member in cluster) / len(cluster)
        snapped.extend(
            move_object(member, axis, mean - member[attr]) for member in cluster
        )
    return snapped
def resize_object(obj: T_obj, key: str, value: T_num) -> T_obj:
    """
    Return a copy of `obj` with the edge named by `key` ("x0", "x1",
    "top" or "bottom") moved to `value`, keeping the opposite edge fixed.

    "width" or "height" is updated to match. When "top" changes, "doctop"
    (if present) moves by the same amount and "y1" (if present) by the
    opposite amount; when "bottom" changes, "y0" (if present) is adjusted
    likewise. The new value must not cross the opposite edge.
    """
    assert key in ("x0", "x1", "top", "bottom")
    old_value = obj[key]
    diff = value - old_value
    new_items = [
        (key, value),
    ]
    if key == "x0":
        assert value <= obj["x1"]
        new_items.append(("width", obj["x1"] - value))
    elif key == "x1":
        assert value >= obj["x0"]
        new_items.append(("width", value - obj["x0"]))
    elif key == "top":
        assert value <= obj["bottom"]
        # "doctop" is optional, as it is in move_object() and clip_obj();
        # only adjust it when the object actually carries it.
        if "doctop" in obj:
            new_items.append(("doctop", obj["doctop"] + diff))
        new_items.append(("height", obj["height"] - diff))
        if "y1" in obj:
            new_items.append(("y1", obj["y1"] - diff))
    elif key == "bottom":
        assert value >= obj["top"]
        new_items.append(("height", obj["height"] + diff))
        if "y0" in obj:
            new_items.append(("y0", obj["y0"] - diff))
    # Later pairs override earlier ones when the new object is built.
    return obj.__class__(tuple(obj.items()) + tuple(new_items))
def curve_to_edges(curve: T_obj) -> T_obj_list:
    """
    Turn each pair of consecutive points in `curve["pts"]` into a
    "curve_edge" object.

    An edge's orientation is "v" when both points share an x value, "h"
    when they share a y value (and not an x value), and None otherwise.
    """
    points = curve["pts"]
    edges = []
    for start, end in zip(points, points[1:]):
        xs = (start[0], end[0])
        ys = (start[1], end[1])
        upper = min(ys)

        if xs[0] == xs[1]:
            orientation = "v"
        elif ys[0] == ys[1]:
            orientation = "h"
        else:
            orientation = None

        edges.append(
            {
                "object_type": "curve_edge",
                "x0": min(xs),
                "x1": max(xs),
                "top": upper,
                # Carry over the curve's own offset between doctop and top
                "doctop": upper + (curve["doctop"] - curve["top"]),
                "bottom": max(ys),
                "width": abs(xs[0] - xs[1]),
                "height": abs(ys[0] - ys[1]),
                "orientation": orientation,
            }
        )
    return edges
def rect_to_edges(rect: T_obj) -> T_obj_list:
    """
    Split a rectangle into its four edges, returned in the order
    [top, bottom, left, right].

    Each edge is a copy of `rect` with "object_type" set to "rect_edge",
    an "orientation" ("h" for top/bottom, "v" for left/right), and the
    coordinates collapsed onto the corresponding side.
    """
    top_edge = {
        **rect,
        "object_type": "rect_edge",
        "height": 0,
        "y0": rect["y1"],
        "bottom": rect["top"],
        "orientation": "h",
    }
    bottom_edge = {
        **rect,
        "object_type": "rect_edge",
        "height": 0,
        "y1": rect["y0"],
        "top": rect["top"] + rect["height"],
        "doctop": rect["doctop"] + rect["height"],
        "orientation": "h",
    }
    left_edge = {
        **rect,
        "object_type": "rect_edge",
        "width": 0,
        "x1": rect["x0"],
        "orientation": "v",
    }
    right_edge = {
        **rect,
        "object_type": "rect_edge",
        "width": 0,
        "x0": rect["x1"],
        "orientation": "v",
    }
    return [top_edge, bottom_edge, left_edge, right_edge]
def line_to_edge(line: T_obj) -> T_obj:
    """
    Return a copy of `line` with an added "orientation": "h" when its
    "top" equals its "bottom", otherwise "v".
    """
    orientation = "h" if line["top"] == line["bottom"] else "v"
    return {**line, "orientation": orientation}
def obj_to_edges(obj: T_obj) -> T_obj_list:
    """
    Convert an object into a list of edges, based on its "object_type".

    Objects that already are edges (type containing "_edge") are returned
    as a one-item list; lines become a single edge; rects and curves are
    broken up by `rect_to_edges` / `curve_to_edges`. Any other type raises
    KeyError.
    """
    kind = obj["object_type"]
    if "_edge" in kind:
        return [obj]
    if kind == "line":
        return [line_to_edge(obj)]
    converters = {"rect": rect_to_edges, "curve": curve_to_edges}
    return converters[kind](obj)
def filter_edges(
    edges: Iterable[T_obj],
    orientation: Optional[str] = None,
    edge_type: Optional[str] = None,
    min_length: T_num = 1,
) -> T_obj_list:
    """
    Return the edges that satisfy all of the given criteria:

    - `orientation`: if given ("v" or "h"), the edge's orientation.
    - `edge_type`: if given, the edge's "object_type".
    - `min_length`: the minimum "height" (for vertical edges) or "width"
      (for all others).

    Raises ValueError for any other `orientation` value.
    """
    if orientation not in ("v", "h", None):
        raise ValueError("Orientation must be 'v' or 'h'")

    kept = []
    for edge in edges:
        edge_orientation = edge["orientation"]
        # "object_type" is only consulted when a type filter is requested.
        if edge_type is not None and not (edge["object_type"] == edge_type):
            continue
        if orientation is not None and not (edge_orientation == orientation):
            continue
        length = edge["height" if edge_orientation == "v" else "width"]
        if length >= min_length:
            kept.append(edge)
    return kept
================================================
FILE: pdfplumber/utils/pdfinternals.py
================================================
from typing import Any, List, Optional, Union
from pdfminer.pdftypes import PDFObjRef
from pdfminer.psparser import PSLiteral
from pdfminer.utils import PDFDocEncoding
from .exceptions import MalformedPDFException
def decode_text(s: Union[bytes, str]) -> str:
    """
    Decode a PDF string to Unicode.

    Bytes beginning with the UTF-16 big-endian byte-order mark are decoded
    as UTF-16BE (ignoring undecodable data). Anything else is mapped item
    by item through pdfminer's PDFDocEncoding table; if an item falls
    outside the table, `str(s)` is returned instead.
    """
    bom = b"\xfe\xff"
    if isinstance(s, bytes) and s[: len(bom)] == bom:
        return s[len(bom) :].decode("utf-16be", "ignore")

    decoded = []
    try:
        for unit in s:
            # Iterating bytes yields ints; iterating str yields characters
            code = ord(unit) if isinstance(unit, str) else unit
            decoded.append(PDFDocEncoding[code])
    except IndexError:
        return str(s)
    return "".join(decoded)
def resolve_and_decode(obj: Any) -> Any:
    """
    Recursively resolve and decode a metadata value.

    Anything with a `resolve` method is resolved first. Lists are processed
    item by item into a new list; PSLiteral names, strings and bytes are
    decoded with `decode_text`; dicts have their values replaced in place.
    Any other value is returned unchanged.
    """
    if hasattr(obj, "resolve"):
        obj = obj.resolve()

    if isinstance(obj, list):
        return [resolve_and_decode(item) for item in obj]
    if isinstance(obj, PSLiteral):
        return decode_text(obj.name)
    if isinstance(obj, (str, bytes)):
        return decode_text(obj)
    if isinstance(obj, dict):
        # Note: the dict is updated in place, not copied.
        for key in obj:
            obj[key] = resolve_and_decode(obj[key])
    return obj
def decode_psl_list(_list: List[Union[PSLiteral, str]]) -> List[str]:
    """
    Return a new list in which each PSLiteral is replaced by its decoded
    name; all other items are kept as they are.
    """
    decoded: List[str] = []
    for value in _list:
        if isinstance(value, PSLiteral):
            decoded.append(decode_text(value.name))
        else:
            decoded.append(value)
    return decoded
def resolve(x: Any) -> Any:
    """
    Resolve `x` if it is a PDFObjRef; return any other value unchanged.
    """
    if not isinstance(x, PDFObjRef):
        return x
    return x.resolve()
def get_dict_type(d: Any) -> Optional[str]:
    """
    Return the "Type" entry of a dict, decoding it when it is a PSLiteral.

    Returns None when `d` is not a dict (or has no "Type" entry).
    """
    if not isinstance(d, dict):
        return None
    type_entry = d.get("Type")
    if not isinstance(type_entry, PSLiteral):
        return type_entry
    return decode_text(type_entry.name)
def resolve_all(x: Any) -> Any:
    """
    Recursively resolve `x` and everything nested inside it.

    - A PDFObjRef is resolved and the result processed in turn, except
      that a reference to a "Page" dict is returned unresolved.
    - Lists and tuples are rebuilt (same type) from their resolved items.
    - Dicts are rebuilt with resolved values; for "Annot" dicts the
      "Parent" entry is left as-is.
    - Anything else is returned unchanged.

    Raises MalformedPDFException if resolving a reference exhausts the
    recursion limit.
    """
    if isinstance(x, PDFObjRef):
        target = x.resolve()
        # Avoid infinite recursion
        if get_dict_type(target) == "Page":
            return x
        try:
            return resolve_all(target)
        except RecursionError as e:
            raise MalformedPDFException(e)

    if isinstance(x, (list, tuple)):
        return type(x)(resolve_all(item) for item in x)

    if isinstance(x, dict):
        untouched_keys = ("Parent",) if get_dict_type(x) == "Annot" else ()
        resolved = {}
        for key, value in x.items():
            resolved[key] = value if key in untouched_keys else resolve_all(value)
        return resolved

    return x
================================================
FILE: pdfplumber/utils/text.py
================================================
import inspect
import itertools
import logging
import re
import string
from operator import itemgetter
from typing import (
Any,
Callable,
Dict,
Generator,
List,
Match,
Optional,
Pattern,
Tuple,
Union,
)
from .._typing import T_bbox, T_dir, T_num, T_obj, T_obj_iter, T_obj_list
from .clustering import cluster_objects
from .generic import to_list
from .geometry import objects_to_bbox
# Module-level logger; WordExtractor uses it to emit deprecation warnings.
logger = logging.getLogger(__name__)

# Defaults for the `x_tolerance` / `y_tolerance` parameters in this module.
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3

# Defaults for the `x_density` / `y_density` parameters of
# WordMap.to_textmap, which divides layout distances by them to obtain
# character and line counts.
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13

# Default directions for successive lines and for characters within a line.
DEFAULT_LINE_DIR: T_dir = "ttb"
DEFAULT_CHAR_DIR: T_dir = "ltr"
# Maps single-code-point typographic ligatures to the plain letters they
# stand for. The keys are written as escapes so they cannot be silently
# normalized into their ASCII expansions (which would make every lookup
# a no-op, since lookups are made with an individual char's text).
LIGATURES = {
    "\ufb00": "ff",  # LATIN SMALL LIGATURE FF
    "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
    "\ufb01": "fi",  # LATIN SMALL LIGATURE FI
    "\ufb02": "fl",  # LATIN SMALL LIGATURE FL
    "\ufb06": "st",  # LATIN SMALL LIGATURE ST
    "\ufb05": "st",  # LATIN SMALL LIGATURE LONG S T
}
def get_line_cluster_key(line_dir: T_dir) -> Callable[[T_obj], T_num]:
    """
    Return a function giving the value by which objects are clustered into
    lines for the given line direction: "top" (ttb), negated "bottom"
    (btt), "x0" (ltr), or negated "x1" (rtl).

    Raises KeyError for an unknown direction.
    """
    attr, negate = {
        "ttb": ("top", False),
        "btt": ("bottom", True),
        "ltr": ("x0", False),
        "rtl": ("x1", True),
    }[line_dir]

    def cluster_key(obj: T_obj) -> T_num:
        position = obj[attr]
        return -position if negate else position

    return cluster_key
def get_char_sort_key(char_dir: T_dir) -> Callable[[T_obj], Tuple[T_num, T_num]]:
    """
    Return the two-part sort key used to order characters within a line
    for the given character direction.

    Raises KeyError for an unknown direction.
    """

    def top_to_bottom(char: T_obj) -> Tuple[T_num, T_num]:
        return (char["top"], char["bottom"])

    def bottom_to_top(char: T_obj) -> Tuple[T_num, T_num]:
        return (-(char["top"] + char["height"]), -char["top"])

    def left_to_right(char: T_obj) -> Tuple[T_num, T_num]:
        return (char["x0"], char["x0"])

    def right_to_left(char: T_obj) -> Tuple[T_num, T_num]:
        return (-char["x1"], -char["x0"])

    sort_keys = {
        "ttb": top_to_bottom,
        "btt": bottom_to_top,
        "ltr": left_to_right,
        "rtl": right_to_left,
    }
    return sort_keys[char_dir]
# For each direction, a getter that picks the matching coordinate out of a
# bounding-box tuple by index. WordMap.to_textmap applies these to
# `layout_bbox` to obtain the layout origin.
BBOX_ORIGIN_KEYS = {
    "ttb": itemgetter(1),
    "btt": itemgetter(3),
    "ltr": itemgetter(0),
    "rtl": itemgetter(2),
}

# For each direction, a getter that picks the matching position attribute
# out of an object dict. Its keys also define the set of valid directions
# (see validate_directions).
POSITION_KEYS = {
    "ttb": itemgetter("top"),
    "btt": itemgetter("bottom"),
    "ltr": itemgetter("x0"),
    "rtl": itemgetter("x1"),
}
def validate_directions(line_dir: T_dir, char_dir: T_dir, suffix: str = "") -> None:
    """
    Check that `line_dir` and `char_dir` are both known directions and are
    not on the same axis (e.g. "ltr" with "rtl").

    `suffix` is appended to the parameter names in error messages.
    Raises ValueError on the first problem found.
    """
    valid_dirs = set(POSITION_KEYS.keys())

    problem = None
    if line_dir not in valid_dirs:
        problem = f"line_dir{suffix} must be one of {valid_dirs}, not {line_dir}"
    elif char_dir not in valid_dirs:
        problem = f"char_dir{suffix} must be one of {valid_dirs}, not {char_dir}"
    elif set(line_dir) == set(char_dir):
        # Directions along the same axis are spelled with the same letters
        problem = (
            f"line_dir{suffix}={line_dir} is incompatible "
            f"with char_dir{suffix}={char_dir}"
        )

    if problem is not None:
        raise ValueError(problem)
class TextMap:
    """
    Pairs every character of a text with the `char` object it came from
    (or with `None` for whitespace implied by the layout).

    The rendered text is available as `as_string`.
    """

    def __init__(
        self,
        tuples: List[Tuple[str, Optional[T_obj]]],
        line_dir_render: T_dir,
        char_dir_render: T_dir,
    ) -> None:
        validate_directions(line_dir_render, char_dir_render, "_render")
        self.tuples = tuples
        self.line_dir_render = line_dir_render
        self.char_dir_render = char_dir_render
        self.as_string = self.to_string()

    def to_string(self) -> str:
        """
        Render the characters as a string, arranged according to the
        render directions.
        """
        char_dir = self.char_dir_render
        line_dir = self.line_dir_render
        text = "".join(entry[0] for entry in self.tuples)

        # Common case: the text is already laid out as required.
        if char_dir == "ltr" and line_dir == "ttb":
            return text

        lines = text.split("\n")
        if line_dir in ("btt", "rtl"):
            lines.reverse()
        if char_dir == "rtl":
            lines = [line[::-1] for line in lines]

        if line_dir not in ("rtl", "ltr"):
            return "\n".join(lines)

        # Lines run horizontally across the page: pad them to a common
        # length, then emit the i-th character of every line as row i.
        width = max(len(line) for line in lines)
        if char_dir == "btt":
            padded = [line.rjust(width) for line in lines]
        else:
            padded = [line.ljust(width) for line in lines]
        rows = ("".join(line[i] for line in padded) for i in range(width))
        return "\n".join(rows)

    def match_to_dict(
        self,
        m: Match[str],
        main_group: int = 0,
        return_groups: bool = True,
        return_chars: bool = True,
    ) -> Dict[str, Any]:
        """
        Describe a regex match as a dict holding the matched text and the
        bounding box of its chars, plus (optionally) the match's groups
        and the char objects themselves.
        """
        start, end = m.start(main_group), m.end(main_group)
        chars = [char for _, char in self.tuples[start:end] if char is not None]
        x0, top, x1, bottom = objects_to_bbox(chars)

        result = {
            "text": m.group(main_group),
            "x0": x0,
            "top": top,
            "x1": x1,
            "bottom": bottom,
        }
        if return_groups:
            result["groups"] = m.groups()
        if return_chars:
            result["chars"] = chars
        return result

    def search(
        self,
        pattern: Union[str, Pattern[str]],
        regex: bool = True,
        case: bool = True,
        return_groups: bool = True,
        return_chars: bool = True,
        main_group: int = 0,
    ) -> List[Dict[str, Any]]:
        """
        Find all matches of `pattern` in the text, each described as by
        `match_to_dict`.

        `pattern` is a string or a compiled pattern. For strings,
        `regex=False` searches for the literal text and `case=False`
        ignores case; neither may be combined with a compiled pattern
        (ValueError). Matches whose main group is empty or whitespace-only
        are skipped.
        """
        if not isinstance(pattern, Pattern):
            source = pattern if regex is not False else re.escape(pattern)
            compiled = re.compile(source, 0 if case is not False else re.I)
        elif regex is False:
            raise ValueError(
                "Cannot pass a compiled search pattern *and* regex=False together."
            )
        elif case is False:
            raise ValueError(
                "Cannot pass a compiled search pattern *and* case=False together."
            )
        else:
            compiled = pattern

        # Zero-length matches (possible with optional patterns) and
        # whitespace-only matches are dropped by the `if` clause.
        return [
            self.match_to_dict(
                m,
                return_groups=return_groups,
                return_chars=return_chars,
                main_group=main_group,
            )
            for m in compiled.finditer(self.as_string)
            if m.group(main_group).strip()
        ]

    def extract_text_lines(
        self, strip: bool = True, return_chars: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Return one dict per line of text (see `search` for their shape).

        With `strip` (analogous to Python's `str.strip()`), each line's
        surrounding spaces are excluded from its `text`; this is only
        relevant when the TextMap was created with `layout=True`.

        With `return_chars=False`, the individual character objects are
        left out of the returned dicts.
        """
        line_pattern = r" *([^\n]+?) *(\n|$)" if strip else r"([^\n]+)"
        return self.search(
            line_pattern,
            main_group=1,
            return_chars=return_chars,
            return_groups=False,
        )
class WordMap:
    """
    A WordMap maps words->chars.

    `tuples` is a list of (word, chars) pairs, where `chars` are the
    character objects that make up `word`.
    """

    def __init__(self, tuples: List[Tuple[T_obj, T_obj_list]]) -> None:
        self.tuples = tuples

    def to_textmap(
        self,
        layout: bool = False,
        layout_width: T_num = 0,
        layout_height: T_num = 0,
        layout_width_chars: int = 0,
        layout_height_chars: int = 0,
        layout_bbox: T_bbox = (0, 0, 0, 0),
        x_density: T_num = DEFAULT_X_DENSITY,
        y_density: T_num = DEFAULT_Y_DENSITY,
        x_shift: T_num = 0,
        y_shift: T_num = 0,
        y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
        line_dir: T_dir = DEFAULT_LINE_DIR,
        char_dir: T_dir = DEFAULT_CHAR_DIR,
        line_dir_rotated: Optional[T_dir] = None,
        char_dir_rotated: Optional[T_dir] = None,
        char_dir_render: Optional[T_dir] = None,
        line_dir_render: Optional[T_dir] = None,
        use_text_flow: bool = False,
        presorted: bool = False,
        expand_ligatures: bool = True,
    ) -> TextMap:
        """
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic
        the structural layout of the text on the page(s), using the following
        approach for top-to-bottom, left-to-right text:

        - Sort the words by (top, x0) if not already sorted.

        - Cluster the words by top (taking `y_tolerance` into account), and
          iterate through them.

        - For each cluster, divide (top - y_shift) by `y_density` to calculate
          the minimum number of newlines that should come before this cluster.
          Append that number of newlines *minus* the number of newlines already
          appended, with a minimum of one.

        - Then for each cluster, iterate through each word in it. Divide each
          word's x0, minus `x_shift`, by `x_density` to calculate the minimum
          number of characters that should come before this cluster. Append that
          number of spaces *minus* the number of characters and spaces already
          appended, with a minimum of one. Then append the word's text.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic to
          `layout_height`.

        For other line/character directions (e.g., bottom-to-top,
        right-to-left), these steps are adjusted.

        Notes on parameters:

        - The positional (density/shift/width/height) calculations above are
          only applied when `layout` is true; otherwise words are separated
          by single spaces and lines by single newlines.
        - `layout_width` / `layout_height` are converted to character counts
          via the densities; alternatively, pass `layout_width_chars` /
          `layout_height_chars` directly. Setting both members of a pair
          raises ValueError.
        - `presorted` or `use_text_flow` skips sorting, keeping the words in
          their existing order.
        - `line_dir_render` / `char_dir_render` default to `line_dir` /
          `char_dir`, and are passed on to the resulting TextMap.
        - `line_dir_rotated` / `char_dir_rotated` are accepted but not
          referenced in this method's body.
        """
        _textmap: List[Tuple[str, Optional[T_obj]]] = []

        # No words: return an empty TextMap right away.
        if not len(self.tuples):
            return TextMap(
                _textmap,
                line_dir_render=line_dir_render or line_dir,
                char_dir_render=char_dir_render or char_dir,
            )

        expansions = LIGATURES if expand_ligatures else {}

        if layout:
            if layout_width_chars:
                if layout_width:
                    raise ValueError(
                        "`layout_width` and `layout_width_chars` cannot both be set."
                    )
            else:
                layout_width_chars = int(round(layout_width / x_density))

            if layout_height_chars:
                if layout_height:
                    raise ValueError(
                        "`layout_height` and `layout_height_chars` cannot both be set."
                    )
            else:
                layout_height_chars = int(round(layout_height / y_density))

            # A full-width row of layout-implied spaces
            blank_line = [(" ", None)] * layout_width_chars
        else:
            blank_line = []

        # Running count of newlines emitted so far
        num_newlines = 0

        line_cluster_key = get_line_cluster_key(line_dir)
        char_sort_key = get_char_sort_key(char_dir)

        line_position_key = POSITION_KEYS[line_dir]
        char_position_key = POSITION_KEYS[char_dir]

        # Coordinates of the layout bbox from which distances are measured
        y_origin = BBOX_ORIGIN_KEYS[line_dir](layout_bbox)
        x_origin = BBOX_ORIGIN_KEYS[char_dir](layout_bbox)

        words_sorted_line_dir = (
            self.tuples
            if presorted or use_text_flow
            else sorted(self.tuples, key=lambda x: line_cluster_key(x[0]))
        )

        # Each tuple is (word, chars); x[0] is the word object.
        tuples_by_line = cluster_objects(
            words_sorted_line_dir,
            lambda x: line_cluster_key(x[0]),
            y_tolerance,
            preserve_order=presorted or use_text_flow,
        )

        for i, line_tuples in enumerate(tuples_by_line):
            if layout:
                # Distance of this line from the origin, in units of lines
                line_position = line_position_key(line_tuples[0][0])
                y_dist_raw = line_position - (y_origin + y_shift)
                adj = -1 if line_dir in ["btt", "rtl"] else 1
                y_dist = y_dist_raw * adj / y_density
            else:
                y_dist = 0
            num_newlines_prepend = max(
                # At least one newline, unless this is the first line
                int(i > 0),
                # ... or as many as needed to get the imputed "distance" from the top
                round(y_dist) - num_newlines,
            )

            # Note: this loop reuses the name `i`; the outer index has
            # already been consumed above for this iteration.
            for i in range(num_newlines_prepend):
                # Fill otherwise-empty lines with blanks before ending them
                if not len(_textmap) or _textmap[-1][0] == "\n":
                    _textmap += blank_line
                _textmap.append(("\n", None))

            num_newlines += num_newlines_prepend

            # Number of characters emitted so far on the current line
            line_len = 0

            line_tuples_sorted = (
                line_tuples
                if presorted or use_text_flow
                else sorted(line_tuples, key=lambda x: char_sort_key(x[0]))
            )

            for word, chars in line_tuples_sorted:
                if layout:
                    # Distance of this word from the origin, in characters
                    char_position = char_position_key(word)
                    x_dist_raw = char_position - (x_origin + x_shift)
                    adj = -1 if char_dir in ["btt", "rtl"] else 1
                    x_dist = x_dist_raw * adj / x_density
                else:
                    x_dist = 0

                # At least one space between words (none before the first)
                num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
                _textmap += [(" ", None)] * num_spaces_prepend
                line_len += num_spaces_prepend

                for c in chars:
                    # An expanded ligature yields several letters, each
                    # mapped back to the same char object.
                    letters = expansions.get(c["text"], c["text"])
                    for letter in letters:
                        _textmap.append((letter, c))
                        line_len += 1

            # Append spaces at end of line
            if layout:
                _textmap += [(" ", None)] * (layout_width_chars - line_len)

        # Append blank lines at end of text
        if layout:
            num_newlines_append = layout_height_chars - (num_newlines + 1)
            for i in range(num_newlines_append):
                if i > 0:
                    _textmap += blank_line
                _textmap.append(("\n", None))

            # Remove terminal newline
            if _textmap[-1] == ("\n", None):
                _textmap = _textmap[:-1]

        return TextMap(
            _textmap,
            line_dir_render=line_dir_render or line_dir,
            char_dir_render=char_dir_render or char_dir,
        )
class WordExtractor:
def __init__(
self,
x_tolerance: T_num = DEFAULT_X_TOLERANCE,
y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
x_tolerance_ratio: Union[int, float, None] = None,
y_tolerance_ratio: Union[int, float, None] = None,
keep_blank_chars: bool = False,
use_text_flow: bool = False,
vertical_ttb: bool = True, # Should vertical words be read top-to-bottom?
horizontal_ltr: bool = True, # Should words be read left-to-right?
line_dir: T_dir = DEFAULT_LINE_DIR,
char_dir: T_dir = DEFAULT_CHAR_DIR,
line_dir_rotated: Optional[T_dir] = None,
char_dir_rotated: Optional[T_dir] = None,
extra_attrs: Optional[List[str]] = None,
split_at_punctuation: Union[bool, str] = False,
expand_ligatures: bool = True,
):
self.x_tolerance = x_tolerance
self.y_tolerance = y_tolerance
self.x_tolerance_ratio = x_tolerance_ratio
self.y_tolerance_ratio = y_tolerance_ratio
self.keep_blank_chars = keep_blank_chars
self.use_text_flow = use_text_flow
self.horizontal_ltr = horizontal_ltr
self.vertical_ttb = vertical_ttb
if vertical_ttb is False:
logger.warning(
"vertical_ttb is deprecated and will be removed;"
" use line_dir/char_dir instead."
)
if horizontal_ltr is False:
logger.warning(
"horizontal_ltr is deprecated and will be removed;"
" use line_dir/char_dir instead."
)
self.line_dir = line_dir
self.char_dir = char_dir
# Default is to "flip" the directions for rotated text
self.line_dir_rotated = line_dir_rotated or char_dir
self.char_dir_rotated = char_dir_rotated or line_dir
validate_directions(self.line_dir, self.char_dir)
validate_directions(self.line_dir_rotated, self.char_dir_rotated, "_rotated")
self.extra_attrs = [] if extra_attrs is None else extra_attrs
# Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
self.split_at_punctuation = (
string.punctuation
if split_at_punctuation is True
else (split_at_punctuation or "")
)
self.expansions = LIGATURES if expand_ligatures else {}
def get_char_dir(self, upright: int) -> T_dir:
# Note: This can be simplified and reincorporated into .merge_chars and
# .iter_chars_to_lines once .vertical_ttb and .horizontal_ltr
# deprecation is complete.
if not upright and not self.vertical_ttb:
return "btt"
elif upright and not self.horizontal_ltr:
return "rtl"
return self.char_dir if upright else self.char_dir_rotated
def merge_chars(self, ordered_chars: T_obj_list) -> T_obj:
x0, top, x1, bottom = objects_to_bbox(ordered_chars)
doctop_adj = ordered_chars[0]["doctop"] - ordered_chars[0]["top"]
upright = ordered_chars[0]["upright"]
char_dir = self.get_char_dir(upright)
word = {
"text": "".join(
self.expansions.get(c["text"], c["text"]) for c in ordered_chars
),
"x0": x0,
"x1": x1,
"top": top,
"doctop": top + doctop_adj,
"bottom": bottom,
"upright": upright,
"height": bottom - top,
"width": x1 - x0,
"direction": char_dir,
}
for key in self.extra_attrs:
word[key] = ordered_chars[0][key]
return word
def char_begins_new_word(
self,
prev_char: T_obj,
curr_char: T_obj,
direction: T_dir,
x_tolerance: T_num,
y_tolerance: T_num,
) -> bool:
"""This method takes several factors into account to determine if
`curr_char` represents the beginning of a new word:
- Whether the text is "upright" (i.e., non-rotated)
- Whether the user has specified that horizontal text runs
left-to-right (default) or right-to-left, as represented by
self.horizontal_ltr
- Whether the user has specified that vertical text the text runs
top-to-bottom (default) or bottom-to-top, as represented by
self.vertical_ttb
- The x0, top, x1, and bottom attributes of prev_char and
curr_char
- The self.x_tolerance and self.y_tolerance settings. Note: In
this case, x/y refer to those directions for non-rotated text.
For vertical text, they are flipped. A more accurate terminology
might be "*intra*line character distance tolerance" and
"*inter*line character distance tolerance"
An important note: The *intra*line distance is measured from the
*end* of the previous character to the *beginning* of the current
character, while the *inter*line distance is measured from the
*top* of the previous character to the *top* of the next
character. The reasons for this are partly repository-historical,
and partly logical, as successive text lines' bounding boxes often
overlap slightly (and we don't want that overlap to be interpreted
as the two lines being the same line).
The upright-ness of the character determines the attributes to
compare, while horizontal_ltr/vertical_ttb determine the direction
of the comparison.
"""
# Note: Due to the grouping step earlier in the process,
# curr_char["upright"] will always equal prev_char["upright"].
if direction in ("ltr", "rtl"):
x = x_tolerance
y = y_tolerance
ay = prev_char["top"]
cy = curr_char["top"]
if direction == "ltr":
ax = prev_char["x0"]
bx = prev_char["x1"]
cx = curr_char["x0"]
else:
ax = -prev_char["x1"]
bx = -prev_char["x0"]
cx = -curr_char["x1"]
else:
x = y_tolerance
y = x_tolerance
ay = prev_char["x0"]
cy = curr_char["x0"]
if direction == "ttb":
ax = prev_char["top"]
bx = prev_char["bottom"]
cx = curr_char["top"]
else:
ax = -prev_char["bottom"]
bx = -prev_char["top"]
cx = -curr_char["bottom"]
return bool(
# Intraline test
(cx < ax)
or (cx > bx + x)
# Interline test
or abs(cy - ay) > y
)
def iter_chars_to_words(
self,
ordered_chars: T_obj_iter,
direction: T_dir,
) -> Generator[T_obj_list, None, None]:
current_word: T_obj_list = []
def start_next_word(
new_char: Optional[T_obj],
) -> Generator[T_obj_list, None, None]:
nonlocal current_word
if current_word:
yield current_word
current_word = [] if new_char is None else [new_char]
xt = self.x_tolerance
xtr = self.x_tolerance_ratio
yt = self.y_tolerance
ytr = self.y_tolerance_ratio
for char in ordered_chars:
text = char["text"]
if not self.keep_blank_chars and text.isspace():
yield from start_next_word(None)
elif text in self.split_at_punctuation:
yield from start_next_word(char)
yield from start_next_word(None)
elif current_word and self.char_begins_new_word(
current_word[-1],
char,
direction,
x_tolerance=(xt if xtr is None else xtr * current_word[-1]["size"]),
y_tolerance=(yt if ytr is None else ytr * current_word[-1]["size"]),
):
yield from start_next_word(char)
else:
current_word.append(char)
# Finally, after all chars processed
if current_word:
yield current_word
def iter_chars_to_lines(
    self, chars: T_obj_iter
) -> Generator[Tuple[T_obj_list, T_dir], None, None]:
    """
    Cluster chars into lines and yield `(sorted_chars, char_dir)` for each
    line.

    The "upright" value of the first char selects the line direction
    (`line_dir` or `line_dir_rotated`) and the char direction, so `chars`
    must be non-empty.
    """
    char_list = list(chars)
    is_upright = char_list[0]["upright"]
    if is_upright:
        line_dir = self.line_dir
    else:
        line_dir = self.line_dir_rotated
    char_dir = self.get_char_dir(is_upright)

    cluster_key = get_line_cluster_key(line_dir)
    sort_key = get_char_sort_key(char_dir)

    # Lines stacked vertically are clustered with the y tolerance;
    # otherwise the x tolerance applies.
    if line_dir in ("ttb", "btt"):
        tolerance = self.y_tolerance
    else:
        tolerance = self.x_tolerance

    for line_chars in cluster_objects(char_list, cluster_key, tolerance):
        # Order the chars within the line
        yield (sorted(line_chars, key=sort_key), char_dir)
def iter_extract_tuples(
    self, chars: T_obj_iter
) -> Generator[Tuple[T_obj, T_obj_list], None, None]:
    """
    Yield `(word, word_chars)` pairs for `chars`.

    Consecutive chars are grouped by "upright" plus `extra_attrs`. Each
    group is either taken as a single line in `char_dir` (when
    `use_text_flow` is set) or clustered into lines, then split into
    words; each word's chars are merged with `merge_chars`.
    """
    group_key = itemgetter("upright", *self.extra_attrs)

    for _, run in itertools.groupby(chars, group_key):
        if self.use_text_flow:
            lines = [(run, self.char_dir)]
        else:
            lines = self.iter_chars_to_lines(run)

        for line_chars, direction in lines:
            for word_chars in self.iter_chars_to_words(line_chars, direction):
                yield (self.merge_chars(word_chars), word_chars)
def extract_wordmap(self, chars: T_obj_iter) -> WordMap:
    """
    Collect every `(word, word_chars)` pair produced for `chars` into a
    WordMap.
    """
    tuples = list(self.iter_extract_tuples(chars))
    return WordMap(tuples)
def extract_words(
    self, chars: T_obj_list, return_chars: bool = False
) -> T_obj_list:
    """
    Return the words extracted from `chars` as a list of dicts.

    With `return_chars` set, each word dict also carries a "chars" entry
    holding the list of chars the word was built from.
    """
    tuples = self.iter_extract_tuples(chars)
    if not return_chars:
        return [word for word, _ in tuples]
    return [{**word, "chars": word_chars} for word, word_chars in tuples]
def extract_words(
    chars: T_obj_list, return_chars: bool = False, **kwargs: Any
) -> T_obj_list:
    """
    Convenience wrapper: build a WordExtractor from `kwargs` and return its
    `extract_words(chars, return_chars)` result.
    """
    extractor = WordExtractor(**kwargs)
    return extractor.extract_words(chars, return_chars)
# Parameter names of WordMap.to_textmap, used below to pick the matching
# entries out of a caller's **kwargs.
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
# Parameter names of the WordExtractor signature, used below to pick the
# entries of **kwargs that are forwarded to the WordExtractor constructor.
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()
def chars_to_textmap(chars: T_obj_list, **kwargs: Any) -> TextMap:
    """
    Extract words from `chars` and lay them out as a TextMap.

    `kwargs` is split by name: entries matching WordExtractor parameters
    configure the extractor, entries matching `WordMap.to_textmap`
    parameters configure the layout. "presorted" is always forced to True,
    and "layout_bbox" falls back to the bounding box of `chars` when it is
    missing or falsy.
    """
    kwargs["presorted"] = True
    kwargs["layout_bbox"] = kwargs.get("layout_bbox") or objects_to_bbox(chars)

    extractor_args = {
        name: kwargs[name] for name in WORD_EXTRACTOR_KWARGS if name in kwargs
    }
    textmap_args = {name: kwargs[name] for name in TEXTMAP_KWARGS if name in kwargs}

    wordmap = WordExtractor(**extractor_args).extract_wordmap(chars)
    return wordmap.to_textmap(**textmap_args)
def extract_text(
    chars: T_obj_list,
    line_dir_render: Optional[T_dir] = None,
    char_dir_render: Optional[T_dir] = None,
    **kwargs: Any,
) -> str:
    """
    Return the text of `chars` as a single string.

    An empty input gives "". When `layout` is truthy in `kwargs`, the work
    is delegated to `chars_to_textmap`. Otherwise words are extracted,
    clustered into lines, joined with single spaces within a line and
    newlines between lines, and rendered through a TextMap.
    """
    chars = to_list(chars)
    if len(chars) == 0:
        return ""

    if kwargs.get("layout"):
        return chars_to_textmap(
            chars,
            **kwargs,
            line_dir_render=line_dir_render,
            char_dir_render=char_dir_render,
        ).as_string

    extractor_args = {
        name: kwargs[name] for name in WORD_EXTRACTOR_KWARGS if name in kwargs
    }
    extractor = WordExtractor(**extractor_args)
    words = extractor.extract_words(chars)

    # Rendering directions default to the extractor's own directions
    line_dir_render = line_dir_render or extractor.line_dir
    char_dir_render = char_dir_render or extractor.char_dir

    cluster_key = get_line_cluster_key(extractor.line_dir)
    if line_dir_render in ("ttb", "btt"):
        tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)
    else:
        tolerance = kwargs.get("x_tolerance", DEFAULT_X_TOLERANCE)

    lines = cluster_objects(words, cluster_key, tolerance)
    rendered = "\n".join(" ".join(word["text"] for word in line) for line in lines)

    # Each character of the rendered string becomes a TextMap entry with
    # no source char attached.
    return TextMap(
        [(char, None) for char in rendered],
        line_dir_render=line_dir_render,
        char_dir_render=char_dir_render,
    ).as_string
def collate_line(
    line_chars: T_obj_list,
    tolerance: T_num = DEFAULT_X_TOLERANCE,
) -> str:
    """
    Join the text of `line_chars`, ordered by "x0", into one string.

    A single space is inserted before a char whose "x0" lies more than
    `tolerance` beyond the "x1" of the char before it.
    """
    pieces = []
    prev_x1 = None
    for char in sorted(line_chars, key=itemgetter("x0")):
        if prev_x1 is not None and char["x0"] > prev_x1 + tolerance:
            pieces.append(" ")
        prev_x1 = char["x1"]
        pieces.append(char["text"])
    return "".join(pieces)
def extract_text_simple(
    chars: T_obj_list,
    x_tolerance: T_num = DEFAULT_X_TOLERANCE,
    y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
) -> str:
    """
    Cluster `chars` into lines by "doctop" (within `y_tolerance`), collate
    each line with `collate_line` (using `x_tolerance`), and join the lines
    with newlines.
    """
    rows = cluster_objects(chars, itemgetter("doctop"), y_tolerance)
    rendered = [collate_line(row, x_tolerance) for row in rows]
    return "\n".join(rendered)
def dedupe_chars(
    chars: T_obj_list,
    tolerance: T_num = 1,
    extra_attrs: Optional[Tuple[str, ...]] = ("fontname", "size"),
) -> T_obj_list:
    """
    Removes duplicate chars — those sharing the same text and positioning
    (within `tolerance`) as other characters in the set. Use extra_attrs to
    be more restrictive with the properties shared by the matching chars.

    Chars are grouped by "upright", "text" and `extra_attrs`; within each
    group they are clustered by "doctop" and then by "x0", and one char is
    kept per cluster. The kept chars are returned in the order in which
    they appear in `chars`.
    """
    key = itemgetter(*("upright", "text"), *(extra_attrs or tuple()))
    pos_key = itemgetter("doctop", "x0")

    def yield_unique_chars(chars: T_obj_list) -> Generator[T_obj, None, None]:
        sorted_chars = sorted(chars, key=key)
        for grp, grp_chars in itertools.groupby(sorted_chars, key=key):
            for y_cluster in cluster_objects(
                list(grp_chars), itemgetter("doctop"), tolerance
            ):
                for x_cluster in cluster_objects(
                    y_cluster, itemgetter("x0"), tolerance
                ):
                    yield sorted(x_cluster, key=pos_key)[0]

    # Restoring the input order with `chars.index` costs a linear scan (with
    # dict comparisons) per kept char. Index the input once by object
    # identity instead, keeping the first position of each object.
    position_by_id = {}
    for position, char in enumerate(chars):
        position_by_id.setdefault(id(char), position)

    def original_position(char: T_obj) -> int:
        try:
            return position_by_id[id(char)]
        except KeyError:
            # Not one of the input objects (e.g. a copy): fall back to an
            # equality-based lookup.
            return chars.index(char)

    deduped = yield_unique_chars(chars)
    return sorted(deduped, key=original_position)
================================================
FILE: requirements-dev.txt
================================================
black==24.8.0
flake8==7.1.1
isort==5.13.2
jupyterlab>=4.4.8
mypy==1.11.1
nbexec==0.2.0
pandas-stubs>=2.2.2.240805
pandas>=2.2.2
py==1.11.0
pytest-cov==5.0.0
pytest-xdist==3.8.0
pytest==8.3.2
setuptools>=78.1.1
types-Pillow==10.2.0.20240520
================================================
FILE: requirements.txt
================================================
pdfminer.six==20260107
Pillow>=9.1
pypdfium2>=4.18.0
================================================
FILE: setup.cfg
================================================
[flake8]
# max-complexity = 10
max-line-length = 88
ignore =
# https://black.readthedocs.io/en/stable/the_black_code_style.html#slices
E203
# Impossible to obey both W503 and W504
W503
# https://github.com/psf/black/issues/3887
E704
[tool:pytest]
addopts=--cov=pdfplumber --cov-report xml:coverage.xml --cov-report term
[tool.isort]
profile = "black"
[testenv]
deps=
-r requirements.txt
-r requirements-dev.txt
commands=python -m pytest
================================================
FILE: setup.py
================================================
import os
from setuptools import setup, find_packages
# Distribution name; also the directory that holds _version.py.
NAME = "pdfplumber"
# Absolute path of the directory containing this setup.py.
HERE = os.path.abspath(os.path.dirname(__file__))
# Namespace populated by exec()-ing _version.py below; read for "__version__".
version_ns = {}
def _open(subpath):
    """Open `subpath`, resolved against this file's directory, as UTF-8 text."""
    return open(os.path.join(HERE, subpath), encoding="utf-8")
# Run _version.py with `version_ns` as its local namespace so that
# __version__ can be read without importing the package.
with _open(NAME + "/_version.py") as version_file:
    exec(version_file.read(), {}, version_ns)

# Runtime requirements, one entry per line of requirements.txt
with _open("requirements.txt") as base_reqs_file:
    base_reqs = base_reqs_file.read().strip().split("\n")

# Development/test requirements, one entry per line of requirements-dev.txt
with _open("requirements-dev.txt") as dev_reqs_file:
    dev_reqs = dev_reqs_file.read().strip().split("\n")

with _open("README.md") as readme_file:
    long_description = readme_file.read()
# Package metadata and build configuration.
setup(
    name=NAME,
    url="https://github.com/jsvine/pdfplumber",
    author="Jeremy Singer-Vine",
    author_email="jsvine@gmail.com",
    description="Plumb a PDF for detailed information about each char, rectangle, and line.",
    # Full text of README.md, declared as Markdown via the content type below.
    long_description=long_description,
    long_description_content_type="text/markdown",
    # __version__ as defined by executing pdfplumber/_version.py above.
    version=version_ns["__version__"],
    # NOTE(review): this excludes a package named "test", while the
    # repository's test directory is "tests" — confirm the intended name.
    packages=find_packages(
        exclude=[
            "test",
        ]
    ),
    include_package_data=True,
    # Ship the py.typed marker file with the package.
    package_data={"pdfplumber": ["py.typed"]},
    zip_safe=False,
    tests_require=base_reqs + dev_reqs,
    # NOTE(review): the classifiers below list only Python 3.10–3.14 —
    # confirm that ">=3.8" is still the intended minimum.
    python_requires=">=3.8",
    install_requires=base_reqs,
    # Installs a `pdfplumber` command that calls pdfplumber.cli:main.
    entry_points={"console_scripts": ["pdfplumber = pdfplumber.cli:main"]},
    classifiers=[
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
        "Programming Language :: Python :: 3.14",
    ],
)
================================================
FILE: tests/comparisons/scotus-transcript-p1-cropped.txt
================================================
1 IN THE SUPREME COURT OF THE UNITED STATES
2 - - - - - - - - - - - - - - - - - x
3 MICHAEL A. KNOWLES, :
4 WARDEN, :
5 Petitioner :
6 v. : No. 07-1315
7 ALEXANDRE MIRZAYANCE. :
8 - - - - - - - - - - - - - - - - - x
9 Washington, D.C.
10 Tuesday, January 13, 2009
================================================
FILE: tests/comparisons/scotus-transcript-p1.txt
================================================
Official - Subject to Final Review
1 IN THE SUPREME COURT OF THE UNITED STATES
2 - - - - - - - - - - - - - - - - - x
3 MICHAEL A. KNOWLES, :
4 WARDEN, :
5 Petitioner :
6 v. : No. 07-1315
7 ALEXANDRE MIRZAYANCE. :
8 - - - - - - - - - - - - - - - - - x
9 Washington, D.C.
10 Tuesday, January 13, 2009
11
12 The above-entitled matter came on for oral
13 argument before the Supreme Court of the United States
14 at 1:01 p.m.
15 APPEARANCES:
16 STEVEN E. MERCER, ESQ., Deputy Attorney General, Los
17 Angeles, Cal.; on behalf of the Petitioner.
18 CHARLES M. SEVILLA, ESQ., San Diego, Cal.; on behalf
19 of the Respondent.
20
21
22
23
24
25
1
Alderson Reporting Company
================================================
FILE: tests/pdfs/make_xref.py
================================================
#!/usr/bin/env python
"""Create an xref section for a simple handmade PDF.
Not a general purpose tool!!!"""
import re
import sys
def make_xref(path):
    """Rewrite the xref table, trailer and startxref of the PDF at `path`.

    The file is scanned line by line. Every line that begins an object
    ("<num> <gen> obj") contributes an in-use xref entry at its byte
    offset, and the last line consisting solely of "xref" marks where the
    new cross-reference section is written. Everything from that marker
    to the end of the file is replaced.

    Raises ValueError if the file contains no "xref" line (previously this
    surfaced as a NameError on the unset offset).
    """
    with open(path, "r+b") as infh:
        pos = 0
        # Entry 0 is the head of the free list.
        xref = [(0, 65535, "f")]
        startxref = None
        for raw_line in infh:
            text = raw_line.decode("ascii")
            if re.match(r"\s*(\d+)\s+(\d+)\s+obj", text):
                xref.append((pos, 0, "n"))
            elif text.strip() == "xref":
                startxref = pos
            # Byte offset at which the next line starts
            pos = infh.tell()

        if startxref is None:
            raise ValueError("no 'xref' line found in %s" % path)

        infh.seek(startxref)
        infh.write(b"xref\n")
        infh.write(("0 %d\n" % len(xref)).encode("ascii"))
        for x in xref:
            infh.write(("%010d %05d %s \n" % x).encode("ascii"))
        infh.write(("trailer << /Size %d /Root 1 0 R >>\n" % len(xref)).encode("ascii"))
        infh.write(b"startxref\n")
        infh.write(("%d\n" % startxref).encode("ascii"))
        infh.write(b"%%EOF\n")
        # Drop any leftover bytes of a previous, longer tail so that the
        # file ends at the freshly written %%EOF marker.
        infh.truncate()


if __name__ == "__main__":
    make_xref(sys.argv[1])
================================================
FILE: tests/test_basics.py
================================================
#!/usr/bin/env python
import logging
import os
import unittest
import pytest
import pdfplumber
logging.disable(logging.ERROR)
HERE = os.path.abspath(os.path.dirname(__file__))
class Test(unittest.TestCase):
@classmethod
def setup_class(self):
path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
self.pdf = pdfplumber.open(path)
# via http://www.pdfill.com/example/pdf_drawing_new.pdf
path_2 = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
self.pdf_2 = pdfplumber.open(path_2)
@classmethod
def teardown_class(self):
self.pdf.close()
self.pdf_2.close()
def test_metadata(self):
metadata = self.pdf.metadata
assert isinstance(metadata["Producer"], str)
def test_pagecount(self):
assert len(self.pdf.pages) == 1
def test_page_number(self):
assert self.pdf.pages[0].page_number == 1
assert str(self.pdf.pages[0]) == ""
def test_objects(self):
assert len(self.pdf.chars)
assert len(self.pdf.rects)
assert len(self.pdf.lines)
assert len(self.pdf.rect_edges)
assert len(self.pdf_2.curve_edges)
# Ensure that caching is working:
assert id(self.pdf._rect_edges) == id(self.pdf.rect_edges)
assert id(self.pdf_2._curve_edges) == id(self.pdf_2.curve_edges)
assert id(self.pdf.pages[0]._layout) == id(self.pdf.pages[0].layout)
def test_annots(self):
pdf = self.pdf_2
assert len(pdf.annots)
assert len(pdf.hyperlinks) == 17
uri = "http://www.pdfill.com/pdf_drawing.html"
assert pdf.hyperlinks[0]["uri"] == uri
path = os.path.join(HERE, "pdfs/annotations.pdf")
with pdfplumber.open(path) as pdf:
assert len(pdf.annots)
def test_annots_cropped(self):
pdf = self.pdf_2
page = pdf.pages[0]
assert len(page.annots) == 13
assert len(page.hyperlinks) == 1
cropped = page.crop(page.bbox)
assert len(cropped.annots) == 13
assert len(cropped.hyperlinks) == 1
h0_bbox = pdfplumber.utils.obj_to_bbox(page.hyperlinks[0])
cropped = page.crop(h0_bbox)
assert len(cropped.annots) == len(cropped.hyperlinks) == 1
def test_annots_rotated(self):
def get_annot(filename, n=0):
path = os.path.join(HERE, "pdfs", filename)
with pdfplumber.open(path) as pdf:
return pdf.pages[0].annots[n]
a = get_annot("annotations.pdf", 3)
b = get_annot("annotations-rotated-180.pdf", 3)
c = get_annot("annotations-rotated-90.pdf", 3)
d = get_annot("annotations-rotated-270.pdf", 3)
assert (
int(a["width"]) == int(b["width"]) == int(c["height"]) == int(d["height"])
)
assert (
int(a["height"]) == int(b["height"]) == int(c["width"]) == int(d["width"])
)
assert int(a["x0"]) == int(c["top"]) == int(d["y0"])
assert int(a["x1"]) == int(c["bottom"]) == int(d["y1"])
assert int(a["top"]) == int(b["y0"]) == int(d["x0"])
assert int(a["bottom"]) == int(b["y1"]) == int(d["x1"])
def test_crop_and_filter(self):
def test(obj):
return obj["object_type"] == "char"
bbox = (0, 0, 200, 200)
original = self.pdf.pages[0]
cropped = original.crop(bbox)
assert id(cropped.chars) == id(cropped._objects["char"])
assert cropped.width == 200
assert len(cropped.rects) > 0
assert len(cropped.chars) < len(original.chars)
within_bbox = original.within_bbox(bbox)
assert len(within_bbox.chars) < len(cropped.chars)
assert len(within_bbox.chars) > 0
filtered = cropped.filter(test)
assert id(filtered.chars) == id(filtered._objects["char"])
assert len(filtered.rects) == 0
def test_outside_bbox(self):
original = self.pdf.pages[0]
outside_bbox = original.outside_bbox(original.find_tables()[0].bbox)
assert outside_bbox.extract_text() == "Page 1 of 205"
assert outside_bbox.bbox == original.bbox
def test_relative_crop(self):
page = self.pdf.pages[0]
cropped = page.crop((10, 10, 40, 40))
recropped = cropped.crop((10, 15, 20, 25), relative=True)
target_bbox = (20, 25, 30, 35)
assert recropped.bbox == target_bbox
recropped_wi = cropped.within_bbox((10, 15, 20, 25), relative=True)
assert recropped_wi.bbox == target_bbox
# via issue #245, should not throw error when using `relative=True`
bottom = page.crop((0, 0.8 * float(page.height), page.width, page.height))
bottom.crop((0, 0, 0.5 * float(bottom.width), bottom.height), relative=True)
bottom.crop(
(0.5 * float(bottom.width), 0, bottom.width, bottom.height), relative=True
)
# An extra test for issue #914, in which relative crops were
# using the wrong bboxes for cropping, leading to empty object-lists
crop_right = page.crop((page.width / 2, 0, page.width, page.height))
crop_right_again_rel = crop_right.crop(
(0, 0, crop_right.width / 2, page.height), relative=True
)
assert len(crop_right_again_rel.chars)
def test_invalid_crops(self):
page = self.pdf.pages[0]
with pytest.raises(ValueError):
page.crop((0, 0, 0, 0))
with pytest.raises(ValueError):
page.crop((0, 0, 10000, 10))
with pytest.raises(ValueError):
page.crop((-10, 0, 10, 10))
with pytest.raises(ValueError):
page.crop((100, 0, 0, 100))
with pytest.raises(ValueError):
page.crop((0, 100, 100, 0))
# via issue #245
bottom = page.crop((0, 0.8 * float(page.height), page.width, page.height))
with pytest.raises(ValueError):
bottom.crop((0, 0, 0.5 * float(bottom.width), bottom.height))
with pytest.raises(ValueError):
bottom.crop((0.5 * float(bottom.width), 0, bottom.width, bottom.height))
# via issue #421, testing strict=True/False
with pytest.raises(ValueError):
page.crop((0, 0, page.width + 10, page.height + 10))
page.crop((0, 0, page.width + 10, page.height + 10), strict=False)
def test_rotation(self):
assert self.pdf.pages[0].width == 1008
assert self.pdf.pages[0].height == 612
path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11-rotated.pdf")
with pdfplumber.open(path) as rotated:
assert rotated.pages[0].width == 612
assert rotated.pages[0].height == 1008
assert rotated.pages[0].cropbox != self.pdf.pages[0].cropbox
assert rotated.pages[0].bbox != self.pdf.pages[0].bbox
def test_password(self):
path = os.path.join(HERE, "pdfs/password-example.pdf")
with pdfplumber.open(path, password="test") as pdf:
assert len(pdf.chars) > 0
def test_unicode_normalization(self):
path = os.path.join(HERE, "pdfs/issue-905.pdf")
with pdfplumber.open(path) as pdf:
page = pdf.pages[0]
print(page.extract_text())
assert ord(page.chars[0]["text"]) == 894
with pdfplumber.open(path, unicode_norm="NFC") as pdf:
page = pdf.pages[0]
assert ord(page.chars[0]["text"]) == 59
assert page.extract_text() == ";;"
def test_colors(self):
rect = self.pdf.pages[0].rects[0]
assert rect["non_stroking_color"] == (0.8, 1, 1)
def test_text_colors(self):
char = self.pdf.pages[0].chars[3358]
assert char["non_stroking_color"] == (1, 0, 0)
def test_load_with_custom_laparams(self):
# See https://github.com/jsvine/pdfplumber/issues/168
path = os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf")
laparams = dict(line_margin=0.2)
with pdfplumber.open(path, laparams=laparams) as pdf:
assert round(pdf.pages[0].chars[0]["top"], 3) == 66.384
def test_loading_pathobj(self):
from pathlib import Path
path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
path_obj = Path(path)
with pdfplumber.open(path_obj) as pdf:
assert len(pdf.metadata)
def test_loading_fileobj(self):
path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
with open(path, "rb") as f:
with pdfplumber.open(f) as pdf:
assert len(pdf.metadata)
assert not f.closed
def test_bad_fileobj(self):
    """
    Opening an empty PDF, by path or by file object, raises
    PdfminerException, and a caller-supplied file object is left open.
    """
    path = os.path.join(HERE, "pdfs/empty.pdf")
    with pytest.raises(pdfplumber.utils.exceptions.PdfminerException):
        pdfplumber.open(path)

    # The `with` block releases the handle even if an assertion below fails.
    with open(path) as f:
        with pytest.raises(pdfplumber.utils.exceptions.PdfminerException):
            pdfplumber.open(f)
        # File objects passed to pdfplumber should not be auto-closed
        assert not f.closed
def test_uncommon_boxes(self):
path = os.path.join(HERE, "pdfs/page-boxes-example.pdf")
with pdfplumber.open(path) as pdf:
page = pdf.pages[0]
assert page.artbox == (42.51969, 70.86613999999997, 552.75591, 827.71653)
assert page.bleedbox == (0, 0.0, 623.62205, 870.23622)
assert page.trimbox == (28.34646, 56.69290999999998, 566.92913, 841.88976)
================================================
FILE: tests/test_ca_warn_report.py
================================================
#!/usr/bin/env python
import logging
import os
import unittest
import pdfplumber
from pdfplumber import table, utils
logging.disable(logging.ERROR)
HERE = os.path.abspath(os.path.dirname(__file__))
def fix_row_spaces(row):
    """
    Return a copy of `row` whose first three cells have all spaces removed
    (falsy cells become ""); the remaining cells are kept as they are.
    """
    cleaned = []
    for cell in row[:3]:
        text = cell if cell else ""
        cleaned.append(text.replace(" ", ""))
    return cleaned + row[3:]
class Test(unittest.TestCase):
@classmethod
def setup_class(self):
self.path = os.path.join(
HERE, "pdfs/WARN-Report-for-7-1-2015-to-03-25-2016.pdf"
)
self.pdf = pdfplumber.open(self.path)
self.PDF_WIDTH = self.pdf.pages[0].width
@classmethod
def teardown_class(self):
self.pdf.close()
def test_page_limiting(self):
with pdfplumber.open(self.path, pages=[1, 3]) as pdf:
assert len(pdf.pages) == 2
assert pdf.pages[1].page_number == 3
def test_objects(self):
p = self.pdf.pages[0]
assert len(p.chars)
assert len(p.rects)
assert len(p.images)
def test_parse(self):
rect_x0_clusters = utils.cluster_list(
[r["x0"] for r in self.pdf.pages[1].rects], tolerance=3
)
v_lines = [x[0] for x in rect_x0_clusters]
def parse_page(page):
data = page.extract_table(
{"vertical_strategy": "explicit", "explicit_vertical_lines": v_lines}
)
without_spaces = [fix_row_spaces(row) for row in data]
return without_spaces
parsed = parse_page(self.pdf.pages[0])
assert parsed[0] == [
"NoticeDate",
"Effective",
"Received",
"Company",
"City",
"No. Of",
"Layoff/Closure",
]
assert parsed[1] == [
"06/22/2015",
"03/25/2016",
"07/01/2015",
"Maxim Integrated Product",
"San Jose",
"150",
"Closure Permanent",
]
def test_edge_merging(self):
p0 = self.pdf.pages[0]
assert len(p0.edges) == 364
assert (
len(
table.merge_edges(
p0.edges,
snap_x_tolerance=3,
snap_y_tolerance=3,
join_x_tolerance=3,
join_y_tolerance=3,
)
)
== 46
)
assert (
len(
table.merge_edges(
p0.edges,
snap_x_tolerance=3,
snap_y_tolerance=3,
join_x_tolerance=3,
join_y_tolerance=0,
)
)
== 52
)
assert (
len(
table.merge_edges(
p0.edges,
snap_x_tolerance=0,
snap_y_tolerance=3,
join_x_tolerance=3,
join_y_tolerance=3,
)
)
== 94
)
assert (
len(
table.merge_edges(
p0.edges,
snap_x_tolerance=3,
snap_y_tolerance=0,
join_x_tolerance=3,
join_y_tolerance=3,
)
)
== 174
)
def test_vertices(self):
p0 = self.pdf.pages[0]
edges = table.merge_edges(
p0.edges,
snap_x_tolerance=3,
snap_y_tolerance=3,
join_x_tolerance=3,
join_y_tolerance=3,
)
ixs = table.edges_to_intersections(edges)
assert len(ixs.keys()) == 304 # 38x8
================================================
FILE: tests/test_convert.py
================================================
#!/usr/bin/env python
import json
import logging
import os
import sys
import unittest
from io import StringIO
from subprocess import PIPE, Popen
import pytest
import pdfplumber
logging.disable(logging.ERROR)
HERE = os.path.abspath(os.path.dirname(__file__))
SCOTUS_TEXT = [
{
"type": "Div",
"children": [
{
"type": "P",
"page_number": 1,
"attributes": {
"LineHeight": 25.75,
"TextIndent": 21.625,
"O": "Layout",
},
"mcids": [1],
"text": [
"IN THE SUPREME COURT OF THE UNITED STATES - - - - - - - - - - - - "
"- - - - - x MICHAEL A. KNOWLES, : WARDEN, :"
],
},
{
"type": "P",
"page_number": 1,
"attributes": {
"LineHeight": 25.75,
"StartIndent": 86.375,
"O": "Layout",
},
"mcids": [2],
"text": [" Petitioner :"],
},
{
"type": "P",
"page_number": 1,
"attributes": {
"LineHeight": 25.75,
"TextIndent": 50.375,
"O": "Layout",
},
"mcids": [3, 4],
"text": [
" v. ",
": No. 07-1315 ALEXANDRE MIRZAYANCE. : - - - - - - - - - - - - - -"
" - - - x",
],
},
{
"type": "P",
"page_number": 1,
"attributes": {
"O": "Layout",
"SpaceAfter": 24.5,
"LineHeight": 25.75,
"StartIndent": 165.625,
"EndIndent": 57.625,
},
"mcids": [5],
"text": [" Washington, D.C. Tuesday, January 13, 2009"],
},
{
"type": "P",
"page_number": 1,
"attributes": {
"LineHeight": 25.75,
"TextIndent": 100.75,
"O": "Layout",
},
"mcids": [6],
"text": [
" The above-entitled matter came on for oral argument before the "
"Supreme Court of the United States at 1:01 p.m. APPEARANCES: "
"STEVEN E. MERCER, ESQ., Deputy Attorney General, Los"
],
},
{
"type": "P",
"page_number": 1,
"attributes": {
"O": "Layout",
"SpaceAfter": 179.125,
"LineHeight": 25.75,
"TextIndent": 21.625,
"EndIndent": 50.375,
"TextAlign": "None",
},
"mcids": [7],
"text": [
" Angeles, Cal.; on behalf of the Petitioner. CHARLES M. SEVILLA, "
"ESQ., San Diego, Cal.; on behalf of the Respondent. "
],
},
{
"type": "P",
"page_number": 1,
"attributes": {"O": "Layout", "TextAlign": "Center", "SpaceAfter": 8.5},
"mcids": [8],
"text": ["1\n"],
},
{
"type": "P",
"page_number": 1,
"attributes": {"O": "Layout", "TextAlign": "Center"},
"mcids": [9],
"text": ["Alderson Reporting Company "],
},
],
}
]
def run(cmd):
    """Run `cmd` as a subprocess and return everything it wrote to stdout."""
    process = Popen(cmd, stdout=PIPE)
    captured = process.communicate()
    return captured[0]
class Test(unittest.TestCase):
@classmethod
def setup_class(self):
self.path = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
self.pdf = pdfplumber.open(self.path, pages=[1, 2, 5])
@classmethod
def teardown_class(self):
self.pdf.close()
def test_json(self):
c = json.loads(self.pdf.to_json())
assert (
c["pages"][0]["rects"][0]["bottom"] == self.pdf.pages[0].rects[0]["bottom"]
)
def test_json_attr_filter(self):
c = json.loads(self.pdf.to_json(include_attrs=["page_number"]))
assert list(c["pages"][0]["rects"][0].keys()) == ["object_type", "page_number"]
with pytest.raises(ValueError):
self.pdf.to_json(include_attrs=["page_number"], exclude_attrs=["bottom"])
with pytest.raises(ValueError):
self.pdf.to_json(exclude_attrs=["object_type"])
def test_json_all_types(self):
c = json.loads(self.pdf.to_json(object_types=None))
found_types = c["pages"][0].keys()
assert "chars" in found_types
assert "lines" in found_types
assert "rects" in found_types
assert "images" in found_types
assert "curves" in c["pages"][2].keys()
def test_single_pages(self):
c = json.loads(self.pdf.pages[0].to_json())
assert c["rects"][0]["bottom"] == self.pdf.pages[0].rects[0]["bottom"]
def test_additional_attr_types(self):
path = os.path.join(HERE, "pdfs/issue-67-example.pdf")
with pdfplumber.open(path, pages=[1]) as pdf:
c = json.loads(pdf.to_json())
assert len(c["pages"][0]["images"])
def test_csv(self):
c = self.pdf.to_csv(precision=3)
assert c.split("\r\n")[9] == (
"char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
"18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"
'"(1.0, 0.0, 0.0, 1.0, 45.83, 660.69)"'
',,,DeviceRGB,"(0.0, 0.0, 0.0)",,,18.0,,,,"(0,)",,Y,,1,'
)
io = StringIO()
self.pdf.to_csv(io, precision=3)
io.seek(0)
c_from_io = io.read()
assert c == c_from_io
def test_csv_all_types(self):
c = self.pdf.to_csv(object_types=None)
assert c.split("\r\n")[1].split(",")[0] == "line"
def test_cli_help(self):
res = run([sys.executable, "-m", "pdfplumber.cli"])
assert b"usage:" in res
def test_cli_structure(self):
res = run([sys.executable, "-m", "pdfplumber.cli", self.path, "--structure"])
c = json.loads(res)
# lol no structure
assert c == []
def test_cli_structure_text(self):
path = os.path.join(HERE, "pdfs/scotus-transcript-p1.pdf")
res = run([sys.executable, "-m", "pdfplumber.cli", path, "--structure-text"])
c = json.loads(res)
assert c == SCOTUS_TEXT
def test_cli_json(self):
res = run(
[
sys.executable,
"-m",
"pdfplumber.cli",
self.path,
"--format",
"json",
"--pages",
"1-2",
"5",
"--indent",
"2",
]
)
c = json.loads(res)
assert c["pages"][0]["page_number"] == 1
assert c["pages"][1]["page_number"] == 2
assert c["pages"][2]["page_number"] == 5
assert c["pages"][0]["rects"][0]["bottom"] == float(
self.pdf.pages[0].rects[0]["bottom"]
)
def test_cli_csv(self):
res = run(
[
sys.executable,
"-m",
"pdfplumber.cli",
self.path,
"--format",
"csv",
"--precision",
"3",
]
)
assert res.decode("utf-8").split("\r\n")[9] == (
"char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
"18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"
'"(1.0, 0.0, 0.0, 1.0, 45.83, 660.69)"'
',,,DeviceRGB,"(0.0, 0.0, 0.0)",,,18.0,,,,"(0,)",,Y,,1,'
)
def test_cli_csv_exclude(self):
res = run(
[
sys.executable,
"-m",
"pdfplumber.cli",
self.path,
"--format",
"csv",
"--precision",
"3",
"--exclude-attrs",
"matrix",
"mcid",
"ncs",
]
)
assert res.decode("utf-8").split("\r\n")[9] == (
"char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
"18.0,12.996,,,,,,,TimesNewRomanPSMT,"
',,,"(0.0, 0.0, 0.0)",,,18.0,,,,"(0,)",,Y,,1,'
)
def test_cli_csv_include(self):
res = run(
[
sys.executable,
"-m",
"pdfplumber.cli",
self.path,
"--format",
"csv",
"--precision",
"3",
"--include-attrs",
"page_number",
]
)
assert res.decode("utf-8").split("\r\n")[9] == ("char,1")
def test_cli_text(self):
    """The CLI's `--format text` output matches the stored comparison file."""
    path = os.path.join(HERE, "pdfs/scotus-transcript-p1.pdf")
    res = run(
        [
            sys.executable,
            "-m",
            "pdfplumber.cli",
            path,
            "--format",
            "text",
        ]
    )
    target_path = os.path.join(HERE, "comparisons/scotus-transcript-p1.txt")
    # Read through a context manager so the handle is closed promptly.
    with open(target_path) as f:
        target = f.read()
    assert res.decode("utf-8") == target
def test_page_to_dict(self):
x = self.pdf.pages[0].to_dict(object_types=["char"])
assert len(x["chars"]) == len(self.pdf.pages[0].chars)
================================================
FILE: tests/test_ctm.py
================================================
#!/usr/bin/env python
import os
import unittest
import pdfplumber
from pdfplumber.ctm import CTM
HERE = os.path.abspath(os.path.dirname(__file__))
class Test(unittest.TestCase):
    """Checks the values CTM derives from char matrices in pdffill-demo.pdf."""

    def test_pdffill_demo(self):
        path = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
        # Open through the context manager so the PDF is closed even when
        # an assertion fails.
        with pdfplumber.open(path) as pdf:
            left_r = pdf.pages[3].chars[97]
            right_r = pdf.pages[3].chars[105]

            left_ctm = CTM(*left_r["matrix"])
            right_ctm = CTM(*right_r["matrix"])

            assert round(left_ctm.translation_x) == 126
            assert round(right_ctm.translation_x) == 372

            assert round(left_ctm.translation_y) == 519
            assert round(right_ctm.translation_y) == 562

            assert left_ctm.skew_x == 45
            assert right_ctm.skew_x == -45

            assert left_ctm.skew_y == 45
            assert right_ctm.skew_y == -45

            assert round(left_ctm.scale_x, 3) == 1
            assert round(right_ctm.scale_x, 3) == 1

            assert round(left_ctm.scale_y, 3) == 1
            assert round(right_ctm.scale_y, 3) == 1
================================================
FILE: tests/test_dedupe_chars.py
================================================
#!/usr/bin/env python
import logging
import os
import unittest
import pdfplumber
logging.disable(logging.ERROR)
HERE = os.path.abspath(os.path.dirname(__file__))
class Test(unittest.TestCase):
@classmethod
def setup_class(self):
path = os.path.join(HERE, "pdfs/issue-71-duplicate-chars.pdf")
self.pdf = pdfplumber.open(path)
@classmethod
def teardown_class(self):
self.pdf.close()
def test_extract_table(self):
page = self.pdf.pages[0]
table_without_drop_duplicates = page.extract_table()
table_with_drop_duplicates = page.dedupe_chars().extract_table()
last_line_without_drop = table_without_drop_duplicates[1][1].split("\n")[-1]
last_line_with_drop = table_with_drop_duplicates[1][1].split("\n")[-1]
assert (
last_line_without_drop
== "微微软软 培培训训课课程程:: 名名模模意意义义一一些些有有意意义义一一些些"
)
assert last_line_with_drop == "微软 培训课程: 名模意义一些有意义一些"
def test_extract_words(self):
page = self.pdf.pages[0]
x0 = 440.143
x1_without_drop = 534.992
x1_with_drop = 534.719
top_windows = 791.849
top_linux = 794.357
bottom = 802.961
last_words_without_drop = page.extract_words()[-1]
last_words_with_drop = page.dedupe_chars().extract_words()[-1]
assert round(last_words_without_drop["x0"], 3) == x0
assert round(last_words_without_drop["x1"], 3) == x1_without_drop
assert round(last_words_without_drop["top"], 3) in (top_windows, top_linux)
assert round(last_words_without_drop["bottom"], 3) == bottom
assert last_words_without_drop["upright"] == 1
assert (
last_words_without_drop["text"]
== "名名模模意意义义一一些些有有意意义义一一些些"
)
assert round(last_words_with_drop["x0"], 3) == x0
assert round(last_words_with_drop["x1"], 3) == x1_with_drop
assert round(last_words_with_drop["top"], 3) in (top_windows, top_linux)
assert round(last_words_with_drop["bottom"], 3) == bottom
assert last_words_with_drop["upright"] == 1
assert last_words_with_drop["text"] == "名模意义一些有意义一些"
def test_extract_text(self):
page = self.pdf.pages[0]
last_line_without_drop = page.extract_text().split("\n")[-1]
last_line_with_drop = page.dedupe_chars().extract_text().split("\n")[-1]
assert (
last_line_without_drop
== "微微软软 培培训训课课程程:: 名名模模意意义义一一些些有有意意义义一一些些"
)
assert last_line_with_drop == "微软 培训课程: 名模意义一些有意义一些"
def test_extract_text2(self):
    """Deduped text of the second issue-71 sample has the expected fifth line."""
    path = os.path.join(HERE, "pdfs/issue-71-duplicate-chars-2.pdf")
    # Use the context manager so the PDF is closed after the check.
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]

        assert (
            page.dedupe_chars().extract_text(y_tolerance=6).splitlines()[4]
            == "UE 8. Circulation - Métabolismes"
        )
def test_extra_attrs(self):
    """
    Checks `dedupe_chars(extra_attrs=...)` line by line against a table of
    expectations.

    Each ground-truth row is `(text, should_dedup, dup_text)`:
    - `should_dedup` is True/False when the outcome does not depend on
      `extra_attrs`, or the name of the attribute that differs between
      the overlapping chars;
    - `dup_text` is the line as extracted when nothing is removed (True
      means "every non-space char doubled", False means "same as text").
    """
    path = os.path.join(HERE, "pdfs/issue-1114-dedupe-chars.pdf")

    def dup_chars(s: str) -> str:
        return "".join((char if char == " " else char + char) for char in s)

    ground_truth = (
        ("Simple", False, False),
        ("Duplicated", True, True),
        ("Font", "fontname", True),
        ("Size", "size", True),
        ("Italic", "fontname", True),
        ("Weight", "fontname", True),
        ("Horizontal shift", False, "HHoorrizizoonntatal ls shhifitft"),
        ("Vertical shift", False, True),
    )
    gt = []
    for text, should_dedup, dup_text in ground_truth:
        if isinstance(dup_text, bool):
            if dup_text:
                dup_text = dup_chars(text)
            else:
                dup_text = text
        gt.append((text, should_dedup, dup_text))

    keys_list = ["no_dedupe", (), ("size",), ("fontname",), ("size", "fontname")]

    # Use the context manager so the PDF is closed even when a check fails.
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        for keys in keys_list:
            if keys != "no_dedupe":
                filtered_page = page.dedupe_chars(tolerance=2, extra_attrs=keys)
            else:
                filtered_page = page

            for i, line in enumerate(
                filtered_page.extract_text(y_tolerance=5).splitlines()
            ):
                text, should_dedup, dup_text = gt[i]
                if keys == "no_dedupe":
                    should_dedup = False
                if isinstance(should_dedup, str):
                    if should_dedup in keys:
                        # The differing attribute must match for chars to
                        # count as duplicates, so nothing is removed.
                        fail_msg = (
                            f"{should_dedup} is required to match, "
                            "so the line should stay duplicated"
                        )
                        assert line == dup_text, fail_msg
                    else:
                        fail_msg = (
                            "Should be deduplicated "
                            f"when {should_dedup} is not required to match"
                        )
                        assert line == text, fail_msg
                elif should_dedup:
                    assert line == text
                else:
                    assert line == dup_text
================================================
FILE: tests/test_display.py
================================================
#!/usr/bin/env python
import io
import logging
import os
import unittest
from zipfile import ZipFile
import PIL.Image
import pytest
import pdfplumber
from pdfplumber.table import TableFinder
logging.disable(logging.ERROR)
HERE = os.path.abspath(os.path.dirname(__file__))
class Test(unittest.TestCase):
    """Tests for `.to_image()` and the drawing helpers of the returned image."""

    @classmethod
    def setup_class(cls):
        # Shared fixtures: one open PDF and a rendering of its first page.
        path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
        cls.pdf = pdfplumber.open(path)
        cls.im = cls.pdf.pages[0].to_image()

    @classmethod
    def teardown_class(cls):
        cls.pdf.close()

    def test_basic_conversion(self):
        self.im.reset()
        self.im.draw_rects(self.im.page.rects)
        self.im.draw_circle(self.im.page.chars[0])
        self.im.draw_line(self.im.page.edges[0])
        self.im.draw_vlines([10])
        self.im.draw_hlines([10])

    def test_width_height(self):
        p = self.pdf.pages[0]
        # `resolution` combined with `height` is rejected.
        with pytest.raises(ValueError):
            p.to_image(resolution=72, height=100)

        im = p.to_image(width=503)
        assert im.original.width == 503

        im = p.to_image(height=805)
        assert im.original.height == 805

    def test_debug_tablefinder(self):
        self.im.reset()
        settings = {"horizontal_strategy": "text", "intersection_tolerance": 5}
        self.im.debug_tablefinder(settings)
        finder = TableFinder(self.im.page, settings)
        self.im.debug_tablefinder(finder)

        self.im.debug_tablefinder(None)

        # https://github.com/jsvine/pdfplumber/issues/1237
        self.im.debug_tablefinder(table_settings={})

        with pytest.raises(ValueError):
            self.im.debug_tablefinder(0)

    def test_bytes_stream_to_image(self):
        path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
        # Read the bytes inside a `with` so the file handle is not leaked.
        with open(path, "rb") as f:
            stream = io.BytesIO(f.read())
        with pdfplumber.PDF(stream) as pdf:
            pdf.pages[0].to_image()

    def test_curves(self):
        path = os.path.join(HERE, "../examples/pdfs/ag-energy-round-up-2017-02-24.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            im = page.to_image()
            im.draw_lines(page.curves)

    def test_cropped(self):
        im = self.pdf.pages[0].crop((10, 20, 30, 50)).to_image()
        assert im.original.size == (20, 30)

    def test_cropbox(self):
        path = os.path.join(HERE, "pdfs/issue-1054-example.pdf")
        with pdfplumber.open(path) as pdf:
            im = pdf.pages[0].to_image()
            assert im.original.size == (596, 842)
            im = pdf.pages[0].to_image(force_mediabox=True)
            assert im.original.size == (2227, 2923)

    def test_copy(self):
        assert self.im.copy().original == self.im.original

    def test_outline_words(self):
        self.im.outline_words(
            stroke="blue",
            fill=(0, 200, 10),
            stroke_width=2,
            x_tolerance=5,
            y_tolerance=5,
        )

    def test_outline_chars(self):
        self.im.outline_chars(stroke="blue", fill=(0, 200, 10), stroke_width=2)

    def test__repr_png_(self):
        png = self.im._repr_png_()
        assert isinstance(png, bytes)
        assert 20000 < len(png) < 80000

    def test_no_quantize(self):
        b = io.BytesIO()
        self.im.save(b, "PNG", quantize=False)
        assert len(b.getvalue()) > len(self.im._repr_png_())

    def test_antialias(self):
        aa = self.pdf.pages[0].to_image(antialias=True)
        assert len(aa._repr_png_()) > len(self.im._repr_png_())

    def test_decompression_bomb(self):
        original_max = PIL.Image.MAX_IMAGE_PIXELS
        PIL.Image.MAX_IMAGE_PIXELS = 10
        try:
            # Previously, this raised PIL.Image.DecompressionBombError
            self.pdf.pages[0].to_image()
        finally:
            # Restore the process-wide limit even if rendering fails, so a
            # failure here cannot affect later tests.
            PIL.Image.MAX_IMAGE_PIXELS = original_max

    def test_password(self):
        path = os.path.join(HERE, "pdfs/password-example.pdf")
        with pdfplumber.open(path, password="test") as pdf:
            pdf.pages[0].to_image()

    def test_zip(self):
        # See https://github.com/jsvine/pdfplumber/issues/948
        # reproducer.py
        path = os.path.join(HERE, "pdfs/issue-948.zip")
        with ZipFile(path) as zip_file:
            with zip_file.open("dummy.pdf") as pdf_file:
                with pdfplumber.open(pdf_file) as pdf:
                    page = pdf.pages[0]
                    page.to_image()
================================================
FILE: tests/test_issues.py
================================================
#!/usr/bin/env python
import logging
import os
import re
try:
import resource
except ModuleNotFoundError:
resource = None
import unittest
import pytest
import pdfplumber
# Suppress log records of severity ERROR and below while the tests run.
logging.disable(logging.ERROR)
# Absolute path of the directory containing this test module; the tests
# join fixture paths such as "pdfs/..." onto it.
HERE = os.path.abspath(os.path.dirname(__file__))
class Test(unittest.TestCase):
    """Regression tests, mostly named after the GitHub issue or PR they cover."""

    def test_issue_13(self):
        """
        Test slightly simplified from gist here:
        https://github.com/jsvine/pdfplumber/issues/13
        """
        # Only find checkboxes this size
        RECT_WIDTH = 9.3
        RECT_HEIGHT = 9.3
        RECT_TOLERANCE = 2

        def is_checkbox_sized(rect):
            # Both dimensions must fall strictly inside the tolerance band.
            # (The width used to be tested against its upper bound twice,
            # leaving the lower bound unchecked.)
            return (
                RECT_HEIGHT - RECT_TOLERANCE
                < rect["height"]
                < RECT_HEIGHT + RECT_TOLERANCE
            ) and (
                RECT_WIDTH - RECT_TOLERANCE
                < rect["width"]
                < RECT_WIDTH + RECT_TOLERANCE
            )

        def filter_rects(rects):
            # Just get the rects that are the right size to be checkboxes
            return [rect for rect in rects if is_checkbox_sized(rect)]

        def determine_if_checked(checkbox, checklines):
            """
            This figures out if the bounding box of (either) line used to make
            one half of the 'x' is the right size and overlaps with a rectangle.
            This isn't foolproof, but works for this case.
            It's not totally clear (to me) how common this style of checkboxes
            are used, and whether this is useful approach to them.
            Also note there should be *two* matching LTCurves for each checkbox.
            But here we only test there's at least one.
            """
            if not is_checkbox_sized(checkbox):
                return False
            for cl in checklines:
                xmatch = max(checkbox["x0"], cl["x0"]) <= min(
                    checkbox["x1"], cl["x1"]
                )
                ymatch = max(checkbox["y0"], cl["y0"]) <= min(
                    checkbox["y1"], cl["y1"]
                )
                if xmatch and ymatch:
                    return True
            return False

        path = os.path.join(HERE, "pdfs/issue-13-151201DSP-Fond-581-90D.pdf")
        with pdfplumber.open(path) as pdf:
            p0 = pdf.pages[0]
            checklines = [
                line
                for line in p0.lines
                if round(line["height"], 2) == round(line["width"], 2)
            ]  # These are diagonals
            rects = filter_rects(p0.objects["rect"])
            n_checked = sum(determine_if_checked(rect, checklines) for rect in rects)
            assert n_checked == 5

    def test_issue_14(self):
        with pdfplumber.open(os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf")) as pdf:
            assert len(pdf.objects)

    def test_issue_21(self):
        with pdfplumber.open(
            os.path.join(HERE, "pdfs/150109DSP-Milw-505-90D.pdf")
        ) as pdf:
            assert len(pdf.objects)

    def test_issue_33(self):
        with pdfplumber.open(
            os.path.join(HERE, "pdfs/issue-33-lorem-ipsum.pdf")
        ) as pdf:
            assert len(pdf.metadata.keys())

    def test_issue_53(self):
        with pdfplumber.open(os.path.join(HERE, "pdfs/issue-53-example.pdf")) as pdf:
            assert len(pdf.objects)

    def test_issue_67(self):
        with pdfplumber.open(os.path.join(HERE, "pdfs/issue-67-example.pdf")) as pdf:
            assert len(pdf.metadata.keys())

    def test_pr_88(self):
        # via https://github.com/jsvine/pdfplumber/pull/88
        path = os.path.join(HERE, "pdfs/pr-88-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            words = page.extract_words()
            assert len(words) == 25

    def test_issue_90(self):
        path = os.path.join(HERE, "pdfs/issue-90-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            page.extract_words()

    def test_pr_136(self):
        path = os.path.join(HERE, "pdfs/pr-136-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            page.extract_words()

    def test_pr_138(self):
        path = os.path.join(HERE, "pdfs/pr-138-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            assert len(page.chars) == 5140
            page.extract_tables(
                {
                    "vertical_strategy": "explicit",
                    "horizontal_strategy": "lines",
                    "explicit_vertical_lines": page.curves + page.edges,
                }
            )

    def test_issue_140(self):
        path = os.path.join(HERE, "pdfs/issue-140-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            cropped_page = page.crop((0, 0, page.width, 122))
            assert len(cropped_page.extract_table()) == 5

    def test_issue_203(self):
        path = os.path.join(HERE, "pdfs/issue-203-decimalize.pdf")
        with pdfplumber.open(path) as pdf:
            assert len(pdf.objects)

    def test_issue_216(self):
        """
        .extract_table() should return None if there's no table,
        instead of crashing
        """
        path = os.path.join(HERE, "pdfs/issue-140-example.pdf")
        with pdfplumber.open(path) as pdf:
            cropped = pdf.pages[0].crop((0, 0, 1, 1))
            assert cropped.extract_table() is None

    def test_issue_297(self):
        """
        Handle integer type metadata
        """
        path = os.path.join(HERE, "pdfs/issue-297-example.pdf")
        with pdfplumber.open(path) as pdf:
            assert isinstance(pdf.metadata["Copies"], int)

    def test_issue_316(self):
        """
        Handle invalid metadata
        """
        path = os.path.join(HERE, "pdfs/issue-316-example.pdf")
        with pdfplumber.open(path) as pdf:
            assert (
                pdf.metadata["Changes"][0]["CreationDate"] == "D:20061207105020Z00'00'"
            )

    def test_issue_386(self):
        """
        util.extract_text() should not raise exception if given pure iterator
        """
        path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
        with pdfplumber.open(path) as pdf:
            chars = (char for char in pdf.chars)
            pdfplumber.utils.extract_text(chars)

    def test_issue_461_and_842(self):
        """
        pdfplumber should gracefully handle characters with byte-encoded
        font names.
        """
        before = b"RGJSAP+\xcb\xce\xcc\xe5"
        after = pdfplumber.page.fix_fontname_bytes(before)
        assert after == "RGJSAP+SimSun,Regular"

        before = b"\xcb\xce\xcc\xe5"
        after = pdfplumber.page.fix_fontname_bytes(before)
        assert after == "SimSun,Regular"

        path = os.path.join(HERE, "pdfs/issue-461-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            assert all(isinstance(c["fontname"], str) for c in page.chars)
            page.dedupe_chars()

        path = os.path.join(HERE, "pdfs/issue-842-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            assert all(isinstance(c["fontname"], str) for c in page.chars)
            page.dedupe_chars()

    def test_issue_463(self):
        """
        Extracting annotations should not raise UnicodeDecodeError on utf-16 text
        """
        path = os.path.join(HERE, "pdfs/issue-463-example.pdf")
        with pdfplumber.open(path) as pdf:
            annots = pdf.annots
            # This comparison used to be a bare expression and so was
            # never actually checked.
            assert annots[0]["contents"] == "日本語"

    def test_issue_598(self):
        """
        Ligatures should be translated by default.
        """
        # U+FB01 (LATIN SMALL LIGATURE FI) is written as an escape so that
        # editors or tools that normalise text cannot silently turn it into
        # the two plain letters "fi", which would make the checks below
        # contradict each other.
        ligature = "\ufb01ction"
        expanded = "fiction"

        path = os.path.join(HERE, "pdfs/issue-598-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            a = page.extract_text()
            assert expanded in a
            assert ligature not in a

            b = page.extract_text(expand_ligatures=False)
            assert ligature in b
            assert expanded not in b

            assert page.extract_words()[53]["text"] == expanded
            assert page.extract_words(expand_ligatures=False)[53]["text"] == ligature

    def test_issue_683(self):
        """
        Page.search ValueError: min() arg is an empty sequence

        This ultimately stemmed from a mistaken assumption in
        LayoutEngine.calculate(...) that len(char["text"]) would always equal
        1, which is not true for ligatures. Issue 683 does not provide a PDF,
        but the test PDF triggers the same error, which should now be fixed.

        Thank you to @samkit-jain for identifying and writing this test.
        """
        path = os.path.join(HERE, "pdfs/issue-71-duplicate-chars-2.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            page.search(r"\d+", regex=True)

    def test_issue_982(self):
        """
        extract_text(use_text_flow=True) apparently does nothing

        This is because, while we took care not to sort the words by
        `doctop` in `WordExtractor` and `WordMap`, no such precaution
        was taken in `cluster_objects`. We thus add an option to
        `cluster_objects` to preserve the ordering (which could come
        from `use_text_flow` or from `presorted`) of the input objects.
        """
        path = os.path.join(HERE, "pdfs/issue-982-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            text = re.sub(r"\s+", " ", page.extract_text(use_text_flow=True))
            words = " ".join(w["text"] for w in page.extract_words(use_text_flow=True))
            assert text[0:100] == words[0:100]

    def test_issue_1147(self):
        """
        Edge-case for when decode_text is passed a string
        that is out of bounds of PDFDocEncoding
        """
        path = os.path.join(HERE, "pdfs/issue-1147-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            # Should not error:
            assert page.extract_text()

    def test_issue_1181(self):
        """
        Correctly re-calculate coordinates when MediaBox does not start at (0,0)
        """
        path = os.path.join(HERE, "pdfs/issue-1181.pdf")
        with pdfplumber.open(path) as pdf:
            p0, p1 = pdf.pages
            assert p0.crop(p0.bbox).extract_table() == [
                ["FooCol1", "FooCol2", "FooCol3"],
                ["Foo4", "Foo5", "Foo6"],
                ["Foo7", "Foo8", "Foo9"],
                ["Foo10", "Foo11", "Foo12"],
                ["", "", ""],
            ]
            assert p1.crop(p1.bbox).extract_table() == [
                ["BarCol1", "BarCol2", "BarCol3"],
                ["Bar4", "Bar5", "Bar6"],
                ["Bar7", "Bar8", "Bar9"],
                ["Bar10", "Bar11", "Bar12"],
                ["", "", ""],
            ]

    def test_pr_1195(self):
        """
        In certain scenarios, annotations may include invalid or extraneous
        data that can obstruct the annotation processing workflow. To mitigate
        this, the raise_unicode_errors parameter in the PDF initializer and the
        .open() method provides a configurable option to bypass these errors
        and generate warnings instead, ensuring smoother handling of such
        anomalies.

        The following test verifies the functionality of the
        raise_unicode_errors parameter.
        """
        path = os.path.join(HERE, "pdfs/annotations-unicode-issues.pdf")
        # Default behaviour: the decoding error propagates.
        with pdfplumber.open(path) as pdf, pytest.raises(UnicodeDecodeError):
            for _ in pdf.annots:
                pass

        # With raise_unicode_errors=False a UserWarning is emitted instead.
        with pdfplumber.open(path, raise_unicode_errors=False) as pdf, pytest.warns(
            UserWarning
        ):
            for _ in pdf.annots:
                pass
================================================
FILE: tests/test_laparams.py
================================================
#!/usr/bin/env python
import logging
import os
import unittest
import pdfplumber
# Suppress log records of severity ERROR and below while the tests run.
logging.disable(logging.ERROR)
# Absolute path of the directory containing this test module; the tests
# join fixture paths such as "pdfs/..." onto it.
HERE = os.path.abspath(os.path.dirname(__file__))
class Test(unittest.TestCase):
    """Checks on the objects exposed with and without the `laparams` option."""

    @classmethod
    def setup_class(cls):
        # Most tests in this class open the same fixture.
        cls.path = os.path.join(HERE, "pdfs/issue-13-151201DSP-Fond-581-90D.pdf")

    def test_without_laparams(self):
        with pdfplumber.open(self.path, laparams=None) as doc:
            page_objects = doc.pages[0].objects
            assert "textboxhorizontal" not in page_objects.keys()
            n_chars = len(page_objects["char"])
            assert n_chars == 4408

    def test_with_laparams(self):
        with pdfplumber.open(self.path, laparams={}) as doc:
            first = doc.pages[0]
            assert len(first.textboxhorizontals) == 27
            assert len(first.textlinehorizontals) == 79
            # Layout objects expose their text.
            assert "text" in first.textboxhorizontals[0]
            assert "text" in first.textlinehorizontals[0]
            assert len(first.chars) == 4408
            assert "anno" not in first.objects.keys()

    def test_vertical_texts(self):
        fixture = os.path.join(HERE, "pdfs/issue-192-example.pdf")
        with pdfplumber.open(fixture, laparams={"detect_vertical": True}) as doc:
            first = doc.pages[0]
            assert len(first.textlinehorizontals) == 142
            assert len(first.textboxhorizontals) == 74
            assert len(first.textlineverticals) == 11
            assert len(first.textboxverticals) == 6
            assert "text" in first.textboxverticals[0]
            assert "text" in first.textlineverticals[0]

    def test_issue_383(self):
        with pdfplumber.open(self.path, laparams={}) as doc:
            first = doc.pages[0]
            assert "anno" not in first.objects.keys()
            corner = first.crop((0, 0, 100, 100))
            assert len(corner.objects)
================================================
FILE: tests/test_list_metadata.py
================================================
#!/usr/bin/env python
import logging
import os
import unittest
import pdfplumber
# Suppress log records of severity ERROR and below while the tests run.
logging.disable(logging.ERROR)
# Absolute path of the directory containing this test module; the tests
# join fixture paths such as "pdfs/..." onto it.
HERE = os.path.abspath(os.path.dirname(__file__))
class Test(unittest.TestCase):
    """Metadata loading check for the Cupertino USD fixture."""

    def test_load(self):
        # The document must expose a non-empty metadata mapping.
        fixture = os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf")
        with pdfplumber.open(fixture) as doc:
            n_entries = len(doc.metadata)
            assert n_entries
================================================
FILE: tests/test_mcids.py
================================================
#!/usr/bin/env python3
import os
import unittest
import pdfplumber
# Absolute path of the directory containing this test module; the tests
# join fixture paths such as "pdfs/..." onto it.
HERE = os.path.abspath(os.path.dirname(__file__))
class TestMCIDs(unittest.TestCase):
    """Test MCID extraction."""

    def test_mcids(self):
        path = os.path.join(HERE, "pdfs/mcid_example.pdf")
        # The context manager closes the PDF even when an assertion fails.
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]

            # Check text of MCIDS
            # mcids[i] accumulates "<tag>: <text>" for the chars carrying
            # MCID i; indexes with no chars stay "".
            mcids = []
            for c in page.chars:
                if "mcid" in c:
                    while len(mcids) <= c["mcid"]:
                        mcids.append("")
                    if not mcids[c["mcid"]]:
                        mcids[c["mcid"]] = c["tag"] + ": "
                    mcids[c["mcid"]] += c["text"]
            assert mcids == [
                "Standard: Test of figures",
                "",
                "P: 1 ligne",
                "P: 2 ligne",
                "P: 3 ligne",
                "P: 4 ligne",
                "P: 0",
                "P: 2",
                "P: 4",
                "P: 6",
                "P: 8",
                "P: 10",
                "P: 12",
                "P: Figure 1: Chart",
                "",
                "P: 1 colonne",
                "P: 2 colonne",
                "P: 3 colonne",
            ]

            # Check line and curve MCIDs
            line_mcids = {x["mcid"] for x in page.lines}
            curve_mcids = {x["mcid"] for x in page.curves}
            assert all(x["tag"] == "Figure" for x in page.lines)
            assert all(x["tag"] == "Figure" for x in page.curves)
            # At least one of MCIDs 1 and 14 must occur among each kind.
            assert line_mcids & {1, 14}
            assert curve_mcids & {1, 14}
            # No rects to test unfortunately!
================================================
FILE: tests/test_nics_report.py
================================================
#!/usr/bin/env python
import logging
import os
import unittest
from operator import itemgetter
import pdfplumber
from pdfplumber.utils import extract_text, within_bbox
# Suppress log records of severity ERROR and below while the tests run.
logging.disable(logging.ERROR)
# Absolute path of the directory containing this test module; the tests
# join fixture paths such as "pdfs/..." onto it.
HERE = os.path.abspath(os.path.dirname(__file__))
# Column names in table order. Test.test_plain pairs them by index with the
# cells of each extracted row; index 0 ("state") is kept as text and every
# other column is parsed as an integer.
COLUMNS = [
"state",
"permit",
"handgun",
"long_gun",
"other",
"multiple",
"admin",
"prepawn_handgun",
"prepawn_long_gun",
"prepawn_other",
"redemption_handgun",
"redemption_long_gun",
"redemption_other",
"returned_handgun",
"returned_long_gun",
"returned_other",
"rentals_handgun",
"rentals_long_gun",
"private_sale_handgun",
"private_sale_long_gun",
"private_sale_other",
"return_to_seller_handgun",
"return_to_seller_long_gun",
"return_to_seller_other",
"totals",
]
class Test(unittest.TestCase):
    """Table and text extraction checks on the NICS background-check fixture."""

    @classmethod
    def setup_class(cls):
        # One shared PDF; its first page's width is reused for crop boxes.
        cls.pdf = pdfplumber.open(
            os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
        )
        cls.PDF_WIDTH = cls.pdf.pages[0].width

    @classmethod
    def teardown_class(cls):
        cls.pdf.close()

    def test_edges(self):
        assert len(self.pdf.vertical_edges) == 700
        assert len(self.pdf.horizontal_edges) == 508

    def test_plain(self):
        first_page = self.pdf.pages[0]
        body = first_page.crop((0, 80, self.PDF_WIDTH, 485))

        # Add an explicit vertical line at the left edge of the leftmost char.
        leftmost_x0 = min(map(itemgetter("x0"), body.chars))
        settings = {
            "horizontal_strategy": "text",
            "explicit_vertical_lines": [leftmost_x0],
            "intersection_tolerance": 5,
        }
        table = body.extract_table(settings)

        def parse_value(k, x):
            # Column 0 is returned untouched; empty cells become None;
            # everything else is an integer with thousands separators.
            if k != 0 and x not in (None, ""):
                return int(x.replace(",", ""))
            return x if k == 0 else None

        def parse_row(row):
            return {COLUMNS[i]: parse_value(i, v) for i, v in enumerate(row)}

        parsed_table = list(map(parse_row, table))

        # [1:] because first column is state name
        for column in COLUMNS[1:]:
            total = parsed_table[-1][column]
            colsum = sum(record[column] or 0 for record in parsed_table)
            # The last row holds the totals, so the column sum counts it twice.
            assert colsum == (total * 2)

        heading_chars = within_bbox(first_page.chars, (0, 35, self.PDF_WIDTH, 65))
        assert extract_text(heading_chars) == "November - 2015"

    def test_filter(self):
        first_page = self.pdf.pages[0]

        def keep(obj):
            # Drop chars smaller than 15; keep every other object.
            is_small_char = obj["object_type"] == "char" and obj["size"] < 15
            return not is_small_char

        remaining_text = first_page.filter(keep).extract_text()
        assert remaining_text == "NICS Firearm Background Checks\nNovember - 2015"

    def test_text_only_strategy(self):
        body = self.pdf.pages[0].crop((0, 80, self.PDF_WIDTH, 475))
        settings = {"horizontal_strategy": "text", "vertical_strategy": "text"}
        table = body.extract_table(settings)
        first_row, last_row = table[0], table[-1]
        assert first_row[0] == "Alabama"
        assert first_row[22] == "71,137"
        assert last_row[0] == "Wyoming"
        assert last_row[22] == "5,017"

    def test_explicit_horizontal(self):
        body = self.pdf.pages[0].crop((0, 80, self.PDF_WIDTH, 475))

        text_table = body.find_tables(
            {"horizontal_strategy": "text", "vertical_strategy": "text"}
        )[0]

        # Second coordinate of each row's first cell, plus the fourth
        # coordinate of the last row's first cell.
        h_positions = [row.cells[0][1] for row in text_table.rows]
        h_positions.append(text_table.rows[-1].cells[0][3])

        # Explicit lines given as plain positions.
        from_positions = body.find_tables(
            {
                "horizontal_strategy": "explicit",
                "vertical_strategy": "text",
                "explicit_horizontal_lines": h_positions,
            }
        )[0]
        assert text_table.extract() == from_positions.extract()

        # Explicit lines given as line-like objects.
        h_objs = []
        for h in h_positions:
            h_objs.append(
                {
                    "x0": 0,
                    "x1": self.PDF_WIDTH,
                    "width": self.PDF_WIDTH,
                    "top": h,
                    "bottom": h,
                    "object_type": "line",
                }
            )
        from_objects = body.find_tables(
            {
                "horizontal_strategy": "explicit",
                "vertical_strategy": "text",
                "explicit_horizontal_lines": h_objs,
            }
        )[0]
        assert text_table.extract() == from_objects.extract()
================================================
FILE: tests/test_oss_fuzz.py
================================================
#!/usr/bin/env python
import logging
import os
import unittest
from pathlib import Path
import pdfplumber
from pdfplumber.utils.exceptions import MalformedPDFException, PdfminerException
# Suppress log records of severity ERROR and below while the tests run.
logging.disable(logging.ERROR)
# Directory containing this test module, as a Path so fixtures can be globbed.
HERE = Path(os.path.abspath(os.path.dirname(__file__)))
# Exception types the test below treats as an acceptable outcome; anything
# else raised while loading or converting a file fails the test.
ACCEPTABLE_EXCEPTIONS = (MalformedPDFException, PdfminerException)
class Test(unittest.TestCase):
    """Load every PDF under pdfs/from-oss-fuzz/load/ and run the conversions."""

    def test_load(self):
        def test_conversions(pdf):
            # `path` is the loop variable of the enclosing function; it is
            # looked up when this helper runs, i.e. the file being processed.
            conversions = (pdf.to_dict, pdf.to_json, pdf.to_csv, pdf.pages[0].to_image)
            for convert in conversions:
                try:
                    convert()
                except ACCEPTABLE_EXCEPTIONS:
                    continue
                except Exception as e:
                    print(f"Failed on: {path.name}")
                    raise e

        fuzz_dir = HERE / "pdfs/from-oss-fuzz/load/"
        for path in sorted(fuzz_dir.glob("*.pdf")):
            try:
                with pdfplumber.open(path) as pdf:
                    assert pdf.pages
                    test_conversions(pdf)
            except ACCEPTABLE_EXCEPTIONS:
                continue
            except Exception as e:
                print(f"Failed on: {path.name}")
                raise e
================================================
FILE: tests/test_repair.py
================================================
#!/usr/bin/env python
import os
import shutil
import tempfile
import unittest
import pytest
import pdfplumber
# Absolute path of the directory containing this test module; the tests
# join fixture paths such as "pdfs/..." onto it.
HERE = os.path.abspath(os.path.dirname(__file__))
class Test(unittest.TestCase):
    """Tests for `pdfplumber.repair(...)` and `pdfplumber.open(..., repair=True)`."""

    def test_from_issue_932(self):
        path = os.path.join(HERE, "pdfs/malformed-from-issue-932.pdf")
        # Unrepaired, the first char extends below the page ...
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            char = page.chars[0]
            assert char["bottom"] > page.height

        # ... repaired on open, it lies within the page ...
        with pdfplumber.open(path, repair=True) as pdf:
            page = pdf.pages[0]
            char = page.chars[0]
            assert char["bottom"] < page.height

        # ... and likewise when repaired explicitly first.
        with pdfplumber.repair(path) as repaired:
            with pdfplumber.open(repaired) as pdf:
                page = pdf.pages[0]
                char = page.chars[0]
                assert char["bottom"] < page.height

    def test_other_repair_inputs(self):
        path = os.path.join(HERE, "pdfs/malformed-from-issue-932.pdf")
        # Open the file in a `with` so the handle passed to pdfplumber is
        # closed by this test rather than leaked.
        with open(path, "rb") as stream, pdfplumber.open(stream, repair=True) as pdf:
            page = pdf.pages[0]
            char = page.chars[0]
            assert char["bottom"] < page.height

    def test_bad_repair_path(self):
        path = os.path.join(HERE, "pdfs/abc.xyz")
        with pytest.raises(Exception):
            with pdfplumber.open(path, repair=True):
                pass

    def test_repair_to_file(self):
        path = os.path.join(HERE, "pdfs/malformed-from-issue-932.pdf")
        with tempfile.NamedTemporaryFile("wb") as out:
            pdfplumber.repair(path, outfile=out.name)
            with pdfplumber.open(out.name) as pdf:
                page = pdf.pages[0]
                char = page.chars[0]
                assert char["bottom"] < page.height

    def test_repair_setting(self):
        path = os.path.join(HERE, "pdfs/malformed-from-issue-932.pdf")
        # Default setting.
        with tempfile.NamedTemporaryFile("wb") as out:
            pdfplumber.repair(path, outfile=out.name)
        # Explicit setting.
        with tempfile.NamedTemporaryFile("wb") as out:
            pdfplumber.repair(path, outfile=out.name, setting="prepress")

    def test_repair_password(self):
        path = os.path.join(HERE, "pdfs/password-example.pdf")
        with pdfplumber.open(path, repair=True, password="test") as pdf:
            assert len(pdf.pages[0].chars)

    def test_repair_custom_path(self):
        path = os.path.join(HERE, "pdfs/malformed-from-issue-932.pdf")
        with pdfplumber.open(path, repair=True, gs_path=shutil.which("gs")) as pdf:
            assert len(pdf.pages[0].chars)
================================================
FILE: tests/test_structure.py
================================================
#!/usr/bin/env python3
import os
import re
import unittest
from collections import deque
from pdfminer.pdftypes import resolve1
import pdfplumber
from pdfplumber.structure import PDFStructTree
# Absolute path of the directory containing this test module; the tests
# join fixture paths such as "pdfs/..." onto it.
HERE = os.path.abspath(os.path.dirname(__file__))
# Expected structure tree for the first page of pdfs/pdf_structure.pdf
# (compared against `pages[0].structure_tree` in Test.test_structure_tree).
# Each element is a dict with a "type" and, where present, "attributes",
# "mcids" and nested "children".
TREE = [
{
"type": "Document",
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.24,
"TextAlign": "Center",
},
"mcids": [0],
},
{
"type": "H1",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.36,
},
"mcids": [1],
},
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.12,
},
"mcids": [2],
},
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.181,
},
"mcids": [3, 4, 5, 6, 7],
},
{
"type": "H2",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.381,
},
"mcids": [8],
},
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.12,
},
"mcids": [9],
},
{
"type": "L",
"children": [
{
"type": "LI",
"children": [
{
"type": "LBody",
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.181,
"StartIndent": 0.36,
},
"mcids": [10, 11],
}
],
}
],
},
{
"type": "LI",
"children": [
{
"type": "LBody",
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.181,
"StartIndent": 0.36,
},
"mcids": [12, 13],
},
{
"type": "L",
"children": [
{
"type": "LI",
"children": [
{
"type": "LBody",
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block", # noqa: E501
"SpaceBefore": 0.181, # noqa: E501
"StartIndent": 0.72, # noqa: E501
},
"mcids": [14, 15],
}
],
}
],
}
],
},
],
}
],
},
{
"type": "LI",
"children": [
{
"type": "LBody",
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.181,
"StartIndent": 0.36,
},
"mcids": [16, 17, 18, 19, 20, 21, 22, 23],
}
],
}
],
},
],
},
{
"type": "H3",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.321,
},
"mcids": [24],
},
{
"type": "Table",
"attributes": {
"O": "Layout",
"Placement": "Block",
"SpaceBefore": 0.12,
"SpaceAfter": 0.015,
"Width": 9.972,
"Height": 1.047,
"BBox": [56.7, 249.75, 555.3, 302.1],
},
"children": [
{
"type": "TR",
"attributes": {"O": "Layout", "Placement": "Block"},
"children": [
{
"type": "TH",
"attributes": {
"O": "Layout",
"Placement": "Inline",
"Width": 4.985,
"Height": 0.291,
},
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
},
"mcids": [25],
}
],
},
{
"type": "TH",
"attributes": {
"O": "Layout",
"Placement": "Inline",
"Width": 4.987,
"Height": 0.291,
},
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
},
"mcids": [26],
}
],
},
],
},
{
"type": "TR",
"attributes": {"O": "Layout", "Placement": "Block"},
"children": [
{
"type": "TD",
"attributes": {
"O": "Layout",
"Placement": "Inline",
"Width": 4.985,
"Height": 0.291,
},
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
},
"mcids": [27],
}
],
},
{
"type": "TD",
"attributes": {
"O": "Layout",
"Placement": "Inline",
"Width": 4.987,
"Height": 0.291,
},
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
},
"mcids": [28],
}
],
},
],
},
{
"type": "TR",
"attributes": {"O": "Layout", "Placement": "Block"},
"children": [
{
"type": "TD",
"attributes": {
"O": "Layout",
"Placement": "Inline",
"Width": 4.985,
"Height": 0.33,
},
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
},
"mcids": [29],
}
],
},
{
"type": "TD",
"attributes": {
"O": "Layout",
"Placement": "Inline",
"Width": 4.987,
"Height": 0.33,
},
"children": [
{
"type": "P",
"attributes": {
"O": "Layout",
"Placement": "Block",
},
"mcids": [30],
}
],
},
],
},
],
},
],
}
]
class Test(unittest.TestCase):
    """Test a PDF specifically created to show structure."""

    @classmethod
    def setup_class(cls):
        path = os.path.join(HERE, "pdfs/pdf_structure.pdf")
        cls.pdf = pdfplumber.open(path)

    @classmethod
    def teardown_class(cls):
        cls.pdf.close()

    def test_structure_tree(self):
        """
        The page-level tree must equal TREE; the document-level tree must
        equal TREE with "page_number": 1 added to every element.
        """
        # Local import: the module's import block is left untouched.
        from copy import deepcopy

        assert self.pdf.pages[0].structure_tree == TREE

        # Add page numbers to a private copy, so the module-level TREE
        # fixture is not modified and this test stays re-runnable and
        # independent of test order.
        expected = deepcopy(TREE)
        d = deque(expected)
        while d:
            el = d.popleft()
            el["page_number"] = 1
            if "children" in el:
                d.extend(el["children"])
        assert self.pdf.structure_tree == expected
# Expected structure-tree data: a single top-level "Sect" element. Every
# element that lists "mcids" also carries a "lang" and a "page_number"
# (1 or 2); the "L" and "LI" containers carry only "type" and "children".
# NOTE(review): the test that consumes this fixture is outside this excerpt;
# presumably it is compared with a whole-document structure tree. Confirm.
PVSTRUCT = [
{
"type": "Sect",
"children": [
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [0]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [1]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [2]},
{"type": "P", "lang": "FR-FR", "page_number": 1, "mcids": [3]},
{"type": "P", "lang": "FR-FR", "page_number": 1, "mcids": [4]},
{"type": "P", "lang": "FR-FR", "page_number": 1, "mcids": [5]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [6]},
{"type": "P", "lang": "FR-FR", "page_number": 1, "mcids": [7]},
{
"type": "P",
"lang": "FR-FR",
"page_number": 1,
"mcids": [8],
"children": [
{"type": "Span", "lang": "FR-CA", "page_number": 1, "mcids": [9]}
],
},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [11]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [12]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [13]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [14]},
{"type": "P", "lang": "FR-FR", "page_number": 1, "mcids": [15]},
{"type": "P", "lang": "FR-FR", "page_number": 1, "mcids": [16]},
{
"type": "L",
"children": [
{
"type": "LI",
"children": [
{
"type": "LBody",
"lang": "FR-CA",
"page_number": 1,
"mcids": [19],
}
],
}
],
},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [22]},
{"type": "P", "lang": "FR-FR", "page_number": 1, "mcids": [23]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [24]},
{
"type": "L",
"children": [
{
"type": "LI",
"children": [
{
"type": "LBody",
"lang": "FR-CA",
"page_number": 1,
"mcids": [27],
}
],
}
],
},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [30]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [31]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [32]},
{
"type": "L",
"children": [
{
"type": "LI",
"children": [
{
"type": "LBody",
"lang": "FR-CA",
"page_number": 1,
"mcids": [35],
}
],
}
],
},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [38]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [39]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [40]},
{
"type": "L",
"children": [
{
"type": "LI",
"children": [
{
"type": "LBody",
"lang": "FR-CA",
"page_number": 1,
"mcids": [43, 45],
"children": [
{
"type": "Span",
"lang": "FR-FR",
"page_number": 1,
"mcids": [44],
}
],
}
],
}
],
},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [48]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [49]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [50]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [51]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [52]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [53]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [54]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [55]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [56]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [57]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [58]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [59]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [60]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [61]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [62]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [63]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [64]},
{"type": "P", "lang": "FR-CA", "page_number": 1, "mcids": [65]},
{"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [0]},
{"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [1]},
{"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [2]},
{"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [3]},
{"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [4]},
{"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [5]},
{"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [6]},
{
"type": "L",
"children": [
{
"type": "LI",
"children": [
{
"type": "LBody",
"lang": "FR-CA",
"page_number": 2,
"mcids": [9, 11],
"children": [
{
"type": "Span",
"lang": "FR-FR",
"page_number": 2,
"mcids": [10],
}
],
}
],
}
],
},
{"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [14]},
{"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [15]},
{"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [16]},
{"type": "P", "lang": "FR-FR", "page_number": 2, "mcids": [17]},
{"type": "P", "lang": "FR-FR", "page_number": 2, "mcids": [18]},
{"type": "P", "lang": "FR-FR", "page_number": 2, "mcids": [19]},
],
}
]
# Expected structure-tree data: a single top-level "Sect" element whose
# elements carry "lang", "type" and "mcids" but no "page_number".
# NOTE(review): the test that consumes this fixture is outside this excerpt;
# presumably it is compared with a single page's structure tree. Confirm.
PVSTRUCT1 = [
{
"type": "Sect",
"children": [
{"lang": "FR-CA", "type": "P", "mcids": [0]},
{"lang": "FR-CA", "type": "P", "mcids": [1]},
{"lang": "FR-CA", "type": "P", "mcids": [2]},
{"lang": "FR-CA", "type": "P", "mcids": [3]},
{"lang": "FR-CA", "type": "P", "mcids": [4]},
{"lang": "FR-CA", "type": "P", "mcids": [5]},
{"lang": "FR-CA", "type": "P", "mcids": [6]},
{
"type": "L",
"children": [
{
"type": "LI",
"children": [
{
"lang": "FR-CA",
"type": "LBody",
"mcids": [9, 11],
"children": [
{"lang": "FR-FR", "type": "Span", "mcids": [10]}
],
}
],
}
],
},
{"lang": "FR-CA", "type": "P", "mcids": [14]},
{"lang": "FR-CA", "type": "P", "mcids": [15]},
{"lang": "FR-CA", "type": "P", "mcids": [16]},
{"lang": "FR-FR", "type": "P", "mcids": [17]},
{"lang": "FR-FR", "type": "P", "mcids": [18]},
{"lang": "FR-FR", "type": "P", "mcids": [19]},
],
}
]
# Expected value of ``pdf.structure_tree`` when 2023-06-20-PV.pdf is opened
# with ``pages=[2]`` (see TestUnparsed); every content element is on page 2.
PVSTRUCT2 = [
    {
        "type": "Sect",
        "children": [
            # Seven leading FR-CA paragraphs, one MCID each.
            *(
                {"type": "P", "lang": "FR-CA", "page_number": 2, "mcids": [mcid]}
                for mcid in range(7)
            ),
            # A one-item list whose body wraps an FR-FR span.
            {
                "type": "L",
                "children": [
                    {
                        "type": "LI",
                        "children": [
                            {
                                "type": "LBody",
                                "lang": "FR-CA",
                                "page_number": 2,
                                "mcids": [9, 11],
                                "children": [
                                    {
                                        "type": "Span",
                                        "lang": "FR-FR",
                                        "page_number": 2,
                                        "mcids": [10],
                                    }
                                ],
                            }
                        ],
                    }
                ],
            },
            # Trailing paragraphs: three FR-CA, then three FR-FR.
            *(
                {"type": "P", "lang": lang, "page_number": 2, "mcids": [mcid]}
                for lang, mcid in (
                    ("FR-CA", 14),
                    ("FR-CA", 15),
                    ("FR-CA", 16),
                    ("FR-FR", 17),
                    ("FR-FR", 18),
                    ("FR-FR", 19),
                )
            ),
        ],
    }
]
# Expected page-level structure tree for image_structure.pdf
# (see test_image_structure).
IMAGESTRUCT = [
    dict(
        type="Document",
        children=[
            dict(type="P", mcids=[0]),
            dict(type="P", mcids=[1]),
            dict(
                type="Figure",
                alt_text=(
                    "pdfplumber on github\n\n"
                    "a screen capture of the github page for pdfplumber"
                ),
                mcids=[2],
            ),
        ],
    )
]
# Expected page-level structure tree for word365_structure.pdf
# (see test_word365): a heading, paragraphs, two lists and a 3x3 table.
WORD365 = [
    {
        "type": "Document",
        "children": [
            {
                "type": "H1",
                "children": [
                    {"type": "Span", "mcids": [0]},
                    {"type": "Span", "actual_text": " ", "mcids": [1]},
                ],
            },
            {"type": "P", "mcids": [2]},
            {
                "type": "L",
                "attributes": {"O": "List", "ListNumbering": "Disc"},
                "children": [
                    {"type": "LI", "children": [{"type": "LBody", "mcids": [mcid]}]}
                    for mcid in (3, 4, 5)
                ],
            },
            {"type": "P", "mcids": [6]},
            {
                "type": "L",
                "attributes": {"O": "List", "ListNumbering": "Decimal"},
                "children": [
                    {"type": "LI", "children": [{"type": "LBody", "mcids": [mcid]}]}
                    for mcid in (7, 8)
                ],
            },
            {
                "type": "Table",
                "children": [
                    {
                        "type": "THead",
                        "children": [
                            {
                                "type": "TR",
                                # Each header cell holds one P with two
                                # consecutive MCIDs.
                                "children": [
                                    {
                                        "type": "TH",
                                        "children": [
                                            {"type": "P", "mcids": [first, first + 1]}
                                        ],
                                    }
                                    for first in (9, 11, 13)
                                ],
                            }
                        ],
                    },
                    {
                        "type": "TBody",
                        # Two body rows of three cells, MCIDs continuing in
                        # pairs from 15 and from 21 respectively.
                        "children": [
                            {
                                "type": "TR",
                                "children": [
                                    {
                                        "type": "TD",
                                        "children": [
                                            {"type": "P", "mcids": [first, first + 1]}
                                        ],
                                    }
                                    for first in range(row_start, row_start + 6, 2)
                                ],
                            }
                            for row_start in (15, 21)
                        ],
                    },
                ],
            },
            {"type": "P", "mcids": [27]},
        ],
    }
]
# Expected document-level structure tree for scotus-transcript-p1.pdf
# (see test_scotus). Every child is a page-1 "P" element, so only the
# attributes and MCIDs are spelled out per entry.
SCOTUS = [
    {
        "type": "Div",
        "children": [
            {"type": "P", "page_number": 1, "attributes": attributes, "mcids": mcids}
            for attributes, mcids in (
                ({"LineHeight": 25.75, "TextIndent": 21.625, "O": "Layout"}, [1]),
                ({"LineHeight": 25.75, "StartIndent": 86.375, "O": "Layout"}, [2]),
                ({"LineHeight": 25.75, "TextIndent": 50.375, "O": "Layout"}, [3, 4]),
                # This is important, it has attributes and a class
                (
                    {
                        "LineHeight": 25.75,
                        "StartIndent": 165.625,
                        "EndIndent": 57.625,
                        "SpaceAfter": 24.5,
                        "O": "Layout",
                    },
                    [5],
                ),
                ({"LineHeight": 25.75, "TextIndent": 100.75, "O": "Layout"}, [6]),
                # This is important, it has attributes and a class
                (
                    {
                        "LineHeight": 25.75,
                        "TextIndent": 21.625,
                        "EndIndent": 50.375,
                        "O": "Layout",
                        "TextAlign": "None",
                        "SpaceAfter": 179.125,
                    },
                    [7],
                ),
                # This is important, it has two attribute classes
                ({"O": "Layout", "TextAlign": "Center", "SpaceAfter": 8.5}, [8]),
                ({"O": "Layout", "TextAlign": "Center"}, [9]),
            )
        ],
    }
]
# Expected document-level structure tree for hello_structure.pdf
# (see test_hello_structure).
HELLO = [
    dict(
        type="Section",
        page_number=1,
        children=[
            dict(
                type="P",
                page_number=1,
                attributes=dict(O="Foo", A1=1),
                mcids=[1],
            ),
            dict(
                type="P",
                page_number=2,
                attributes=dict(O="Foo", A1=2, A2=2),
                mcids=[1],
            ),
        ],
    ),
    dict(
        type="P",
        revision=1,
        page_number=2,
        attributes=dict(O="Foo", A1=3, A2=3),
        mcids=[2],
    ),
]
# Expected ``pdf.structure_tree`` for hello_structure.pdf opened with
# ``pages=[1]`` (see test_hello_structure).
HELLO1 = [
    dict(
        type="Section",
        page_number=1,
        children=[
            dict(
                type="P",
                page_number=1,
                attributes=dict(O="Foo", A1=1),
                mcids=[1],
            ),
        ],
    )
]
# Expected ``pdf.pages[0].structure_tree`` for hello_structure.pdf
# (see test_hello_structure); no "page_number" keys at page level.
HELLO1P = [
    dict(
        type="Section",
        children=[
            dict(type="P", attributes=dict(O="Foo", A1=1), mcids=[1]),
        ],
    )
]
class TestClass(unittest.TestCase):
    """Test the underlying Structure tree class"""

    def test_structure_tree_class(self):
        """The first root element of a page tree iterates over its children."""
        path = os.path.join(HERE, "pdfs/image_structure.pdf")
        # Open through the context manager so the PDF is closed even when an
        # assertion fails.
        with pdfplumber.open(path) as pdf:
            stree = PDFStructTree(pdf, pdf.pages[0])
            doc_elem = next(iter(stree))
            assert [k.type for k in doc_elem] == ["P", "P", "Figure"]

    def test_find_all_tree(self):
        """
        Test find_all() and find() on trees
        """
        path = os.path.join(HERE, "pdfs/image_structure.pdf")
        with pdfplumber.open(path) as pdf:
            stree = PDFStructTree(pdf, pdf.pages[0])

            # Matching by exact type name
            figs = list(stree.find_all("Figure"))
            assert len(figs) == 1
            fig = stree.find("Figure")
            assert fig == figs[0]
            assert stree.find("Fogure") is None

            # Matching by compiled pattern and by predicate
            figs = list(stree.find_all(re.compile(r"Fig.*")))
            assert len(figs) == 1
            figs = list(stree.find_all(lambda x: x.type == "Figure"))
            assert len(figs) == 1

            # Non-matching name, pattern and predicate all yield nothing
            figs = list(stree.find_all("Foogure"))
            assert len(figs) == 0
            figs = list(stree.find_all(re.compile(r"Fog.*")))
            assert len(figs) == 0
            figs = list(stree.find_all(lambda x: x.type == "Flogger"))
            assert len(figs) == 0

    def test_find_all_element(self):
        """
        Test find_all() and find() on elements
        """
        path = os.path.join(HERE, "pdfs/pdf_structure.pdf")
        with pdfplumber.open(path) as pdf:
            stree = PDFStructTree(pdf)
            for list_elem in stree.find_all("L"):
                items = list(list_elem.find_all("LI"))
                assert items
                for item in items:
                    body = list(item.find_all("LBody"))
                    assert body
                    body1 = item.find("LBody")
                    assert body1 == body[0]
                    assert item.find("Loonie") is None

    def test_all_mcids(self):
        """
        Test all_mcids()
        """
        path = os.path.join(HERE, "pdfs/2023-06-20-PV.pdf")
        with pdfplumber.open(path) as pdf:
            # Make sure we can get them with page numbers
            stree = PDFStructTree(pdf)
            sect = next(stree.find_all("Sect"))
            mcids = list(sect.all_mcids())
            pages = {page_number for page_number, _ in mcids}
            assert 1 in pages
            assert 2 in pages

            # If we take only a single page there are no page numbers
            # (FIXME: may wish to reconsider this API decision...)
            page = pdf.pages[1]
            stree = PDFStructTree(pdf, page)
            sect = next(stree.find_all("Sect"))
            mcids = list(sect.all_mcids())
            pages = {page_number for page_number, _ in mcids}
            assert None in pages
            assert 1 not in pages
            assert 2 not in pages

            # Assure that we get the MCIDs for a content element
            for p in sect.find_all("P"):
                assert {mcid for _, mcid in p.all_mcids()} == set(p.mcids)

    def test_element_bbox(self):
        """
        Test various ways of getting element bboxes
        """
        path = os.path.join(HERE, "pdfs/pdf_structure.pdf")
        with pdfplumber.open(path) as pdf:
            stree = PDFStructTree(pdf)
            # As BBox attribute
            table = next(stree.find_all("Table"))
            assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 555.3, 542.25)
            # With child elements
            tr = next(table.find_all("TR"))
            assert tuple(stree.element_bbox(tr)) == (56.8, 495.9, 328.312, 507.9)

            # From a specific page it should also work
            stree = PDFStructTree(pdf, pdf.pages[0])
            table = next(stree.find_all("Table"))
            assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 555.3, 542.25)
            tr = next(table.find_all("TR"))
            assert tuple(stree.element_bbox(tr)) == (56.8, 495.9, 328.312, 507.9)

            # Yeah but what happens if you crop the page?
            page = pdf.pages[0].crop((10, 400, 500, 500))
            stree = PDFStructTree(pdf, page)
            table = next(stree.find_all("Table"))
            # The element gets cropped too
            assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 500, 500)

            # And if you crop it out of the page?
            page = pdf.pages[0].crop((0, 0, 560, 400))
            stree = PDFStructTree(pdf, page)
            table = next(stree.find_all("Table"))
            with self.assertRaises(IndexError):
                _ = stree.element_bbox(table)
class TestUnparsed(unittest.TestCase):
    """Test handling of PDFs with unparsed pages."""

    def test_unparsed_pages(self):
        """Opening only page 2 yields the structure tree for that page."""
        path = os.path.join(HERE, "pdfs/2023-06-20-PV.pdf")
        # Context manager ensures the PDF is closed even if the assertion
        # fails.
        with pdfplumber.open(path, pages=[2]) as pdf:
            assert pdf.structure_tree == PVSTRUCT2
class TestMany(unittest.TestCase):
    """Test various PDFs."""

    # Every test opens its PDF through a context manager so the underlying
    # file is closed even when an assertion fails.

    def test_no_stucture(self):
        """A PDF without structure gives empty document and page trees."""
        path = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
        with pdfplumber.open(path) as pdf:
            assert pdf.structure_tree == []
            assert pdf.pages[0].structure_tree == []

    def test_word365(self):
        """The first page tree of word365_structure.pdf equals WORD365."""
        path = os.path.join(HERE, "pdfs/word365_structure.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            assert page.structure_tree == WORD365

    def test_proces_verbal(self):
        """Document-level and page-level trees of 2023-06-20-PV.pdf."""
        path = os.path.join(HERE, "pdfs/2023-06-20-PV.pdf")
        with pdfplumber.open(path) as pdf:
            assert pdf.structure_tree == PVSTRUCT
            page = pdf.pages[1]
            assert page.structure_tree == PVSTRUCT1

    def test_missing_parenttree(self):
        """Verify we can get structure without a ParentTree."""
        path = os.path.join(HERE, "pdfs/2023-06-20-PV.pdf")
        with pdfplumber.open(path) as pdf:
            root = resolve1(pdf.doc.catalog["StructTreeRoot"])
            # Drop the ParentTree from the in-memory structure root.
            del root["ParentTree"]
            assert pdf.pages[1].structure_tree == PVSTRUCT1

    def test_image_structure(self):
        """The first page tree of image_structure.pdf equals IMAGESTRUCT."""
        path = os.path.join(HERE, "pdfs/image_structure.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            assert page.structure_tree == IMAGESTRUCT

    def test_figure_mcids(self):
        """The first Figure found breadth-first has the expected MCIDs."""
        path = os.path.join(HERE, "pdfs/figure_structure.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            d = deque(page.structure_tree)
            while d:
                el = d.popleft()
                if el["type"] == "Figure":
                    break
                if "children" in el:
                    d.extend(el["children"])
            # We found a Figure
            assert el["type"] == "Figure"
            # It has these MCIDS
            assert el["mcids"] == [1, 14]

    def test_scotus(self):
        """The document tree of scotus-transcript-p1.pdf equals SCOTUS."""
        # This one actually has attribute classes!
        path = os.path.join(HERE, "pdfs/scotus-transcript-p1.pdf")
        with pdfplumber.open(path) as pdf:
            assert pdf.structure_tree == SCOTUS

    def test_chelsea_pdta(self):
        """Page-level and single-page document trees of chelsea_pdta.pdf."""
        # This one has structure elements for marked content sections
        path = os.path.join(HERE, "pdfs/chelsea_pdta.pdf")
        with pdfplumber.open(path) as pdf:
            # This page has no structure tree (really!)
            tree8 = pdf.pages[7].structure_tree
            assert tree8 == []
            # We should also have no structure tree here
            with pdfplumber.open(path, pages=[8]) as pdf8:
                assert pdf8.structure_tree == []
            # This page is empty
            tree3 = pdf.pages[3].structure_tree
            assert tree3 == []
            # This page in particular has OBJR and MCR elements
            tree1 = pdf.pages[2].structure_tree
            assert tree1  # Should contain a tree!
            # A second, independent handle restricted to page 3; kept in its
            # own context manager so both handles get closed.
            with pdfplumber.open(path, pages=[3]) as pdf3:
                tree2 = pdf3.structure_tree
                assert tree2
                # Compare modulo page_number
                d = deque(zip(tree1, tree2))
                while d:
                    el1, el2 = d.popleft()
                    if "page_number" in el1:
                        assert el1["page_number"] == 3
                        assert el1 == el2
                    if "children" in el1:
                        assert len(el1["children"]) == len(el2["children"])
                        d.extend(zip(el1["children"], el2["children"]))

    def test_hello_structure(self):
        """Document, page and single-page trees of hello_structure.pdf."""
        # Synthetic PDF to test some corner cases
        path = os.path.join(HERE, "pdfs/hello_structure.pdf")
        with pdfplumber.open(path) as pdf:
            assert pdf.structure_tree == HELLO
            assert pdf.pages[0].structure_tree == HELLO1P
        with pdfplumber.open(path, pages=[1]) as pdf:
            assert pdf.structure_tree == HELLO1
================================================
FILE: tests/test_table.py
================================================
#!/usr/bin/env python
import logging
import os
import unittest
import pytest
import pdfplumber
from pdfplumber import table
# Suppress log records of severity ERROR and below while the tests run.
logging.disable(logging.ERROR)

# Absolute path of the directory containing this file; fixture paths are
# joined onto it.
HERE = os.path.dirname(os.path.abspath(__file__))
class Test(unittest.TestCase):
    """Tests for table finding and table extraction."""

    @classmethod
    def setup_class(self):
        """Open the PDF shared by the tests that use ``self.pdf``."""
        demo_path = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
        self.pdf = pdfplumber.open(demo_path)

    @classmethod
    def teardown_class(self):
        """Close the shared PDF."""
        self.pdf.close()

    def test_orientation_errors(self):
        """``join_edge_group`` with orientation "x" raises ValueError."""
        with pytest.raises(ValueError):
            table.join_edge_group([], "x")

    def test_table_settings_errors(self):
        """Invalid table settings raise ValueError or TypeError."""
        first_page = self.pdf.pages[0]

        # A tuple instead of a settings mapping
        with pytest.raises(ValueError):
            table.TableFinder(first_page, ())

        # An unknown settings key
        with pytest.raises(TypeError):
            finder = table.TableFinder(first_page, {"strategy": "x"})
            finder.get_edges()

        # An unknown strategy name
        with pytest.raises(ValueError):
            table.TableFinder(first_page, {"vertical_strategy": "x"})

        # "explicit" strategy with an empty list of lines
        explicit_without_lines = {
            "vertical_strategy": "explicit",
            "explicit_vertical_lines": [],
        }
        with pytest.raises(ValueError):
            table.TableFinder(first_page, explicit_without_lines)

        # A negative tolerance
        with pytest.raises(ValueError):
            finder = table.TableFinder(first_page, {"join_tolerance": -1})
            finder.get_edges()

    def test_edges_strict(self):
        """Check the last row extracted with the "lines_strict" strategies."""
        strict_settings = {
            "vertical_strategy": "lines_strict",
            "horizontal_strategy": "lines_strict",
        }
        expected_last_row = [
            "",
            "0085648100300",
            "CENTRAL KMA",
            "LILYS 55% DARK CHOC BAR",
            "415",
            "$ 0.61",
            "$ 253.15",
            "0.0000",
            "",
        ]
        path = os.path.join(HERE, "pdfs/issue-140-example.pdf")
        with pdfplumber.open(path) as pdf:
            rows = pdf.pages[0].extract_table(strict_settings)

        assert rows[-1] == expected_last_row

    def test_rows_and_columns(self):
        """Cell bboxes of a table's first row and second column hold the expected text."""
        path = os.path.join(HERE, "pdfs/issue-140-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            found = page.find_table()

            def cell_texts(cells):
                # Text of each cell, obtained by cropping the page to it.
                return [page.crop(bbox).extract_text() for bbox in cells]

            header = cell_texts(found.rows[0].cells)
            assert header == [
                "Line no",
                "UPC code",
                "Location",
                "Item Description",
                "Item Quantity",
                "Bill Amount",
                "Accrued Amount",
                "Handling Rate",
                "PO number",
            ]

            second_column = cell_texts(found.columns[1].cells)
            assert second_column == [
                "UPC code",
                "0085648100305",
                "0085648100380",
                "0085648100303",
                "0085648100300",
            ]

    def test_explicit_desc_decimalization(self):
        """
        See issue #290
        """
        settings = {
            "vertical_strategy": "explicit",
            "explicit_vertical_lines": [100, 200, 300],
            "horizontal_strategy": "explicit",
            "explicit_horizontal_lines": [100, 200, 300],
        }
        finder = table.TableFinder(self.pdf.pages[0], settings)
        assert finder.tables[0].extract()

    def test_text_tolerance(self):
        """``text_x_tolerance`` changes how words in a cell are spaced."""

        def text_settings(**extra):
            # Fresh settings dict for each call, text strategy on both axes.
            settings = {
                "horizontal_strategy": "text",
                "vertical_strategy": "text",
                "min_words_vertical": 20,
            }
            settings.update(extra)
            return settings

        path = os.path.join(HERE, "pdfs/senate-expenditures.pdf")
        with pdfplumber.open(path) as pdf:
            region = pdf.pages[0].crop((70.332, 130.986, 420, 509.106))
            t = region.extract_table(text_settings())
            t_tol = region.extract_table(text_settings(text_x_tolerance=1))
            all_with_tol = region.extract_tables(text_settings(text_x_tolerance=1))
            t_tol_from_tables = all_with_tol[0]

        assert t[-1] == [
            "DHAW20190070",
            "09/09/2019",
            "CITIBANK-TRAVELCBACARD",
            "08/12/2019",
            "08/14/2019",
        ]

        assert t_tol[-1] == [
            "DHAW20190070",
            "09/09/2019",
            "CITIBANK - TRAVEL CBA CARD",
            "08/12/2019",
            "08/14/2019",
        ]

        assert t_tol[-1] == t_tol_from_tables[-1]

    def test_text_layout(self):
        """Check one cell extracted with ``text_layout`` enabled."""
        path = os.path.join(HERE, "pdfs/issue-53-example.pdf")
        with pdfplumber.open(path) as pdf:
            rows = pdf.pages[0].extract_table({"text_layout": True})
            assert rows[3][0] == " FY2013 \n FY2014 "

    def test_text_without_words(self):
        """Both word-to-edge helpers return an empty list for no words."""
        for to_edges in (table.words_to_edges_h, table.words_to_edges_v):
            assert to_edges([]) == []

    def test_order(self):
        """
        See issue #336
        """
        path = os.path.join(HERE, "pdfs/issue-336-example.pdf")
        with pdfplumber.open(path) as pdf:
            tables = pdf.pages[0].extract_tables()
            # Three tables, with this many rows each, in this order.
            assert [len(rows) for rows in tables] == [8, 11, 2]

    def test_issue_466_mixed_strategy(self):
        """
        See issue #466
        """
        mixed_settings = {
            "vertical_strategy": "lines",
            "horizontal_strategy": "text",
            "snap_tolerance": 8,
            "intersection_tolerance": 4,
        }
        path = os.path.join(HERE, "pdfs/issue-466-example.pdf")
        with pdfplumber.open(path) as pdf:
            tables = pdf.pages[0].extract_tables(mixed_settings)

            # The engine only extracts the tables which have drawn horizontal
            # lines.
            # For the 3 extracted tables, some common properties are expected:
            # - 4 rows
            # - 3 columns
            # - Data in last row contains the string 'last'
            for rows in tables:
                assert len(rows) == 4
                assert len(rows[0]) == 3

                # Verify that all cell contain real data
                assert all("last" in cell for cell in rows[3])

    def test_discussion_539_null_value(self):
        """
        See discussion #539
        """
        table_settings = dict(
            vertical_strategy="lines",
            horizontal_strategy="lines",
            explicit_vertical_lines=[],
            explicit_horizontal_lines=[],
            snap_tolerance=3,
            join_tolerance=3,
            edge_min_length=3,
            min_words_vertical=3,
            min_words_horizontal=1,
            text_keep_blank_chars=False,
            text_tolerance=3,
            intersection_tolerance=3,
        )
        path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
        with pdfplumber.open(path) as pdf:
            first_page = pdf.pages[0]
            assert first_page.extract_table(table_settings)
            assert first_page.extract_tables(table_settings)

    def test_table_curves(self):
        """Table extraction on a page that has curve objects."""
        # See https://github.com/jsvine/pdfplumber/discussions/808
        path = os.path.join(HERE, "pdfs/table-curves-example.pdf")
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            assert len(page.curves)

            tables = page.extract_tables()
            assert len(tables) == 1
            only_table = tables[0]
            assert only_table[-2][-2] == "Uncommon"

            strict_tables = page.extract_tables({"vertical_strategy": "lines_strict"})
            assert len(strict_tables) == 0
================================================
FILE: tests/test_utils.py
================================================
#!/usr/bin/env python
import logging
import os
import re
import unittest
from itertools import groupby
from operator import itemgetter
import pandas as pd
import pytest
from pdfminer.pdfparser import PDFObjRef
from pdfminer.psparser import PSLiteral
import pdfplumber
from pdfplumber import utils
# Suppress log records of severity ERROR and below while the tests run.
logging.disable(logging.ERROR)

# Absolute path of the directory containing this file; fixture paths are
# joined onto it.
HERE = os.path.dirname(os.path.abspath(__file__))
class Test(unittest.TestCase):
@classmethod
def setup_class(self):
    """Open the two PDFs shared by the tests in this class."""
    demo_path = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
    self.pdf = pdfplumber.open(demo_path)

    scotus_path = os.path.join(HERE, "pdfs/scotus-transcript-p1.pdf")
    self.pdf_scotus = pdfplumber.open(scotus_path)
@classmethod
def teardown_class(self):
    """Close both PDFs opened by ``setup_class``."""
    try:
        self.pdf.close()
    finally:
        # Close the second PDF even if closing the first one fails.
        self.pdf_scotus.close()
def test_cluster_list(self):
    """``cluster_list`` groups values according to ``tolerance``."""
    consecutive = [1, 2, 3, 4]
    # Without a tolerance argument every value ends up in its own cluster.
    singletons = [[value] for value in consecutive]
    assert utils.cluster_list(consecutive) == singletons
    # With tolerance=1 the whole run collapses into one cluster.
    assert utils.cluster_list(consecutive, tolerance=1) == [consecutive]

    gapped = [1, 2, 5, 6]
    assert utils.cluster_list(gapped, tolerance=1) == [[1, 2], [5, 6]]
def test_cluster_objects(self):
    """``cluster_objects`` accepts a callable, a string key or another key."""
    # Clustering by a callable (string length).
    words = ["a", "ab", "abc", "b"]
    by_length = utils.cluster_objects(words, len, 0)
    assert by_length == [["a", "b"], ["ab"], ["abc"]]

    records = [
        {"x": 1, 7: "a"},
        {"x": 1, 7: "b"},
        {"x": 2, 7: "b"},
        {"x": 2, 7: "b"},
    ]
    first, second, third, fourth = records
    # Clustering by a string key, then by a non-string key.
    assert utils.cluster_objects(records, "x", 0) == [[first, second], [third, fourth]]
    assert utils.cluster_objects(records, 7, 0) == [[first], [second, third, fourth]]
def test_resolve(self):
    """``resolve`` resolves an annotation's destination entry and passes 1 through."""
    first_annot = self.pdf.annots[0]
    destination = first_annot["data"]["A"]["D"][0]
    resolved = utils.resolve(destination)
    assert resolved["MediaBox"] == [0, 0, 612, 792]
    # A plain integer comes back unchanged.
    assert utils.resolve(1) == 1
def test_resolve_all(self):
    """``resolve_all`` resolves a reference nested inside a list of dicts."""
    trailer = self.pdf.doc.xrefs[0].trailer
    info = trailer["Info"]
    assert type(info) is PDFObjRef

    resolved = utils.resolve_all([{"info": info}])
    expected_producer = self.pdf.doc.info[0]["Producer"]
    assert resolved[0]["info"]["Producer"] == expected_producer
def test_decode_psl_list(self):
    """``decode_psl_list`` turns a PSLiteral into its name and keeps strings."""
    mixed = [PSLiteral("test"), "test_2"]
    decoded = utils.decode_psl_list(mixed)
    assert decoded == ["test", "test_2"]
def test_x_tolerance_ratio(self):
    """``x_tolerance_ratio`` changes how characters are grouped into words."""
    path = os.path.join(HERE, "pdfs/issue-987-test.pdf")
    # Context manager so the PDF is closed even when an assertion fails.
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        assert page.extract_text() == "Big Te xt\nSmall Text"
        assert page.extract_text(x_tolerance=4) == "Big Te xt\nSmallText"
        assert page.extract_text(x_tolerance_ratio=0.15) == "Big Text\nSmall Text"

        words = page.extract_words(x_tolerance_ratio=0.15)
        assert "|".join(w["text"] for w in words) == "Big|Text|Small|Text"
def test_extract_words(self):
    """Word extraction with different direction, attribute and blank options."""
    path = os.path.join(HERE, "pdfs/issue-192-example.pdf")
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        words = page.extract_words(vertical_ttb=False)
        words_attr = page.extract_words(vertical_ttb=False, extra_attrs=["size"])
        words_w_spaces = page.extract_words(vertical_ttb=False, keep_blank_chars=True)
        words_rtl = page.extract_words(horizontal_ltr=False)

    first_word = words[0]
    assert first_word["text"] == "Agaaaaa:"
    assert first_word["direction"] == "ltr"

    # "size" only shows up when requested through extra_attrs.
    assert "size" not in first_word
    assert round(words_attr[0]["size"], 2) == 9.96

    assert words_w_spaces[0]["text"] == "Agaaaaa: AAAA"

    non_upright = [w for w in words if w["upright"] == 0]
    first_vertical = non_upright[0]
    assert first_vertical["text"] == "Aaaaaabag8"
    assert first_vertical["direction"] == "btt"

    second_rtl = words_rtl[1]
    assert second_rtl["text"] == "baaabaaA/AAA"
    assert second_rtl["direction"] == "rtl"
def test_extract_words_return_chars(self):
    """``return_chars=True`` adds a "chars" entry that spells out the word."""
    path = os.path.join(HERE, "pdfs/extra-attrs-example.pdf")
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]

        plain_first = page.extract_words()[0]
        assert "chars" not in plain_first

        detailed_first = page.extract_words(return_chars=True)[0]
        assert "chars" in detailed_first
        spelled = "".join(char["text"] for char in detailed_first["chars"])
        assert spelled == detailed_first["text"]
def test_text_rotation(self):
    """Later pages of issue-848.pdf extract to the same text as the first page."""
    # Rotation label -> (char_dir, line_dir); entry N describes page N.
    rotations = {
        "0": ("ltr", "ttb"),
        "-0": ("rtl", "ttb"),
        "180": ("rtl", "btt"),
        "-180": ("ltr", "btt"),
        "90": ("ttb", "rtl"),
        "-90": ("btt", "rtl"),
        "270": ("btt", "ltr"),
        "-270": ("ttb", "ltr"),
    }

    path = os.path.join(HERE, "pdfs/issue-848.pdf")
    with pdfplumber.open(path) as pdf:
        expected = utils.text.extract_text(pdf.pages[0].chars)
        indexed = list(enumerate(rotations.items()))
        # The first entry is the reference page itself, so start at index 1.
        for page_index, (rotation, directions) in indexed[1:]:
            char_dir, line_dir = directions
            print(f"--- {rotation} ---")
            non_blank = pdf.pages[page_index].filter(
                lambda obj: obj.get("text") != " "
            )
            output = utils.text.extract_text(
                chars=non_blank.chars,
                x_tolerance=2,
                y_tolerance=2,
                char_dir=char_dir,
                line_dir=line_dir,
                char_dir_rotated=char_dir,
                line_dir_rotated=line_dir,
                char_dir_render="ltr",
                line_dir_render="ttb",
            )
            assert output == expected
def test_text_rotation_layout(self):
    """Layout extraction keeps two known phrases in order on every page."""
    # Rotation label -> (char_dir, line_dir); entry N describes page N.
    rotations = {
        "0": ("ltr", "ttb"),
        "-0": ("rtl", "ttb"),
        "180": ("rtl", "btt"),
        "-180": ("ltr", "btt"),
        "90": ("ttb", "rtl"),
        "-90": ("btt", "rtl"),
        "270": ("btt", "ltr"),
        "-270": ("ttb", "ltr"),
    }

    def meets_expectations(text):
        # Both texts should be found, and the first should appear before the second
        first = re.search("opens with a news report", text)
        second = re.search("having been transferred", text)
        if not (first and second):
            return False
        return first.start() < second.start()

    path = os.path.join(HERE, "pdfs/issue-848.pdf")
    with pdfplumber.open(path) as pdf:
        for page_index, (rotation, directions) in enumerate(rotations.items()):
            char_dir, line_dir = directions
            print(f"--- {rotation} ---")
            non_blank = pdf.pages[page_index].filter(
                lambda obj: obj.get("text") != " "
            )
            output = non_blank.extract_text(
                layout=True,
                x_tolerance=2,
                y_tolerance=2,
                char_dir=char_dir,
                line_dir=line_dir,
                char_dir_rotated=char_dir,
                line_dir_rotated=line_dir,
                char_dir_render="ltr",
                line_dir_render="ttb",
                y_density=14,
            )
            assert meets_expectations(output)
def test_text_render_directions(self):
    """Each render-direction pair produces its expected text."""
    # (line_dir_render, char_dir_render) -> expected extract_text() output
    targets = {
        ("ttb", "ltr"): "first line\nsecond line\nthird line",
        ("ttb", "rtl"): "enil tsrif\nenil dnoces\nenil driht",
        ("btt", "ltr"): "third line\nsecond line\nfirst line",
        ("btt", "rtl"): "enil driht\nenil dnoces\nenil tsrif",
        ("ltr", "ttb"): "fst\nieh\nrci\nsor\ntnd\n d \nl l\nili\nnin\nene\n e ",
        ("ltr", "btt"): " s \nfet\nich\nroi\nsnr\ntdd\n \nlll\niii\nnnn\neee",
        ("rtl", "ttb"): "tsf\nhei\nicr\nros\ndnt\n d \nl l\nili\nnin\nene\n e ",
        ("rtl", "btt"): " s \ntef\nhci\nior\nrns\nddt\n \nlll\niii\nnnn\neee",
    }
    path = os.path.join(HERE, "pdfs/line-char-render-example.pdf")
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        for directions, target in targets.items():
            line_dir, char_dir = directions
            rendered = page.extract_text(
                line_dir_render=line_dir,
                char_dir_render=char_dir,
            )
            assert rendered == target
def test_invalid_directions(self):
    """Invalid direction arguments make ``extract_text`` raise ValueError."""
    invalid_kwargs = [
        {"line_dir": "xxx", "char_dir": "ltr"},
        {"line_dir": "ttb", "char_dir": "a"},
        {"line_dir": "rtl", "char_dir": "ltr"},
        {"line_dir": "ttb", "char_dir": "btt"},
        {"line_dir_rotated": "ttb", "char_dir": "btt"},
        {"line_dir_render": "ttb", "char_dir_render": "btt"},
    ]
    path = os.path.join(HERE, "pdfs/line-char-render-example.pdf")
    # Context manager instead of a trailing pdf.close(): the PDF is closed
    # even when one of the expectations fails.
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        for kwargs in invalid_kwargs:
            with pytest.raises(ValueError):
                page.extract_text(**kwargs)
def test_extra_attrs(self):
    """``extra_attrs`` changes where text is split into words."""
    both_attrs = ["non_stroking_color", "fontname"]
    path = os.path.join(HERE, "pdfs/extra-attrs-example.pdf")
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]

        assert page.extract_text() == "BlackRedArial"

        by_color = page.extract_text(extra_attrs=["non_stroking_color"])
        assert by_color == "Black RedArial"

        by_font = page.extract_text(extra_attrs=["fontname"])
        assert by_font == "BlackRed Arial"

        by_both = page.extract_text(extra_attrs=["non_stroking_color", "fontname"])
        assert by_both == "Black Red Arial"

        # Should not error
        assert page.extract_text(
            layout=True,
            use_text_flow=True,
            extra_attrs=both_attrs,
        )
def test_extract_words_punctuation(self):
    """``split_at_punctuation`` as True, False and a custom character set."""
    custom_punctuation = r"!\"&'()*+,.:;<=>?@[]^`{|}~"

    def word_variants(page):
        # Words with default splitting, no splitting and custom splitting,
        # in that order.
        return (
            page.extract_words(split_at_punctuation=True),
            page.extract_words(split_at_punctuation=False),
            page.extract_words(split_at_punctuation=custom_punctuation),
        )

    path = os.path.join(HERE, "pdfs/test-punkt.pdf")
    with pdfplumber.open(path) as pdf:
        wordsA, wordsB, wordsC = word_variants(pdf.pages[0])
        assert wordsA[0]["text"] == "https"
        assert (
            wordsB[0]["text"]
            == "https://dell-research-harvard.github.io/HJDataset/"
        )
        assert wordsC[2]["text"] == "//dell-research-harvard"

        wordsA, wordsB, wordsC = word_variants(pdf.pages[1])
        assert len(wordsA) == 4
        assert len(wordsB) == 2
        assert len(wordsC) == 2

        wordsA, wordsB, wordsC = word_variants(pdf.pages[2])
        assert wordsA[1]["text"] == "["
        assert wordsB[1]["text"] == "[2,"
        assert wordsC[1]["text"] == "["

        wordsA, wordsB, wordsC = word_variants(pdf.pages[3])
        assert wordsA[2]["text"] == "al"
        assert wordsB[2]["text"] == "al."
        assert wordsC[2]["text"] == "al"
def test_extract_text_punctuation(self):
    """Layout text split at punctuation contains "https " followed by a space."""
    path = os.path.join(HERE, "pdfs/test-punkt.pdf")
    with pdfplumber.open(path) as pdf:
        first_page = pdf.pages[0]
        text = first_page.extract_text(layout=True, split_at_punctuation=True)
        assert "https " in text
def test_text_flow(self):
    """A known passage is contiguous only when ``use_text_flow`` is set."""
    path = os.path.join(HERE, "pdfs/federal-register-2020-17221.pdf")

    def words_to_text(words):
        # Join consecutive words sharing a "top" value into one line each.
        lines = []
        for _top, same_line in groupby(words, key=itemgetter("top")):
            lines.append(" ".join(word["text"] for word in same_line))
        return "\n".join(lines)

    with pdfplumber.open(path) as pdf:
        first_page = pdf.pages[0]
        using_flow = first_page.extract_words(use_text_flow=True)
        not_using_flow = first_page.extract_words()

    target_text = (
        "The FAA proposes to\n"
        "supersede Airworthiness Directive (AD)\n"
        "2018–23–51, which applies to all The\n"
        "Boeing Company Model 737–8 and 737–\n"
        "9 (737 MAX) airplanes. Since AD 2018–\n"
    )

    flow_text = words_to_text(using_flow)
    plain_text = words_to_text(not_using_flow)
    assert target_text in flow_text
    assert target_text not in plain_text
def test_text_flow_overlapping(self):
    """Text flow decides which of two overlapping readings is extracted."""
    in_flow_order = "2015 RICE PAYMENT 26406576 0 1207631 Cr"
    interleaved = "124644,06155766"

    path = os.path.join(HERE, "pdfs/issue-912.pdf")
    with pdfplumber.open(path) as pdf:
        first_page = pdf.pages[0]
        using_flow = first_page.extract_text(
            use_text_flow=True, layout=True, x_tolerance=1
        )
        not_using_flow = first_page.extract_text(layout=True, x_tolerance=1)

    # With text flow the row reads correctly and the interleaved run is gone.
    assert re.search(in_flow_order, using_flow)
    assert re.search(interleaved, using_flow) is None

    # Without text flow it is the other way around.
    assert re.search(interleaved, not_using_flow)
    assert re.search(in_flow_order, not_using_flow) is None
def test_text_flow_words_mixed_lines(self):
    """With text flow, "claim" and "lence" stay separate words."""
    path = os.path.join(HERE, "pdfs/issue-1279-example.pdf")
    with pdfplumber.open(path) as pdf:
        flow_words = pdf.pages[0].extract_words(use_text_flow=True)

    texts = {word["text"] for word in flow_words}
    assert "claim" in texts
    assert "lence" in texts
    assert "claimlence" not in texts
def test_extract_text(self):
    """``extract_text`` and ``extract_text_simple`` agree on the first page."""
    first_page = self.pdf.pages[0]
    goal = "\n".join(
        (
            "First Page Previous Page Next Page Last Page",
            "Print",
            "PDFill: PDF Drawing",
            "You can open a PDF or create a blank PDF by PDFill.",
            "Online Help",
            "Here are the PDF drawings created by PDFill",
            "Please save into a new PDF to see the effect!",
            "Goto Page 2: Line Tool",
            "Goto Page 3: Arrow Tool",
            "Goto Page 4: Tool for Rectangle, Square and Rounded Corner",
            "Goto Page 5: Tool for Circle, Ellipse, Arc, Pie",
            "Goto Page 6: Tool for Basic Shapes",
            "Goto Page 7: Tool for Curves",
            "Here are the tools to change line width, style, arrow style and colors",
        )
    )

    assert first_page.extract_text() == goal
    assert first_page.extract_text_simple() == goal

    # A 1x1 crop in the page corner yields an empty string.
    corner = first_page.crop((0, 0, 1, 1))
    assert corner.extract_text() == ""
def test_extract_text_blank(self):
    """An empty list of chars extracts to the empty string."""
    no_chars = []
    extracted = utils.extract_text(no_chars)
    assert extracted == ""
def test_extract_text_layout(self):
    """Layout extraction matches the stored comparison text."""
    target_path = os.path.join(HERE, "comparisons/scotus-transcript-p1.txt")
    # Read through a context manager so the comparison file is closed
    # deterministically.
    with open(target_path) as target_file:
        target = target_file.read().strip("\n")

    page = self.pdf_scotus.pages[0]
    text = page.extract_text(layout=True)
    # The module-level helper, given the page's dimensions, must produce
    # the same output as the page method.
    utils_text = utils.extract_text(
        page.chars,
        layout=True,
        layout_width=page.width,
        layout_height=page.height,
        layout_bbox=page.bbox,
    )
    assert text == utils_text
    assert text == target
def test_extract_text_layout_cropped(self):
    """Layout extraction of a cropped page matches the stored comparison text."""
    target_path = os.path.join(
        HERE, "comparisons/scotus-transcript-p1-cropped.txt"
    )
    # Read through a context manager so the comparison file is closed
    # deterministically.
    with open(target_path) as target_file:
        target = target_file.read().strip("\n")

    p = self.pdf_scotus.pages[0]
    cropped = p.crop((90, 70, p.width, 300))
    text = cropped.extract_text(layout=True)
    assert text == target
def test_extract_text_layout_widths(self):
    """``layout_width_chars`` fixes line length; mixing size options raises."""
    page = self.pdf_scotus.pages[0]

    fixed_width = page.extract_text(layout=True, layout_width_chars=75)
    line_lengths = [len(line) for line in fixed_width.splitlines()]
    assert all(length == 75 for length in line_lengths)

    # Giving both the point-based and the character-based size is an error.
    conflicting_options = (
        {"layout_width": 300, "layout_width_chars": 50},
        {"layout_height": 300, "layout_height_chars": 50},
    )
    for options in conflicting_options:
        with pytest.raises(ValueError):
            page.extract_text(layout=True, **options)
def test_extract_text_nochars(self):
    """A page filtered down to non-char objects extracts to empty text."""
    first_page = self.pdf.pages[0]
    charless = first_page.filter(lambda df: df["object_type"] != "char")
    for kwargs in ({}, {"layout": True}):
        assert charless.extract_text(**kwargs) == ""
def test_search_regex_compiled(self):
    """Searching with a compiled pattern; extra flags are rejected."""
    page = self.pdf_scotus.pages[0]
    compiled = re.compile(r"supreme\s+(\w+)", re.I)

    results = page.search(compiled)
    first_two = [(hit["text"], hit["groups"]) for hit in results[:2]]
    assert first_two == [
        ("SUPREME COURT", ("COURT",)),
        ("Supreme Court", ("Court",)),
    ]

    # Passing regex= or case= together with a compiled pattern raises.
    for conflicting in ({"regex": False}, {"case": False}):
        with pytest.raises(ValueError):
            page.search(re.compile(r"x"), **conflicting)
def test_search_regex_uncompiled(self):
    """Searching with a pattern string and ``case=False``."""
    page = self.pdf_scotus.pages[0]
    results = page.search(r"supreme\s+(\w+)", case=False)

    first_two = [(hit["text"], hit["groups"]) for hit in results[:2]]
    assert first_two == [
        ("SUPREME COURT", ("COURT",)),
        ("Supreme Court", ("Court",)),
    ]
def test_search_string(self):
    """Searches with the ``regex``, ``case`` and ``layout`` options."""
    page = self.pdf_scotus.pages[0]

    exact = page.search("SUPREME COURT", regex=False)
    assert exact[0]["text"] == "SUPREME COURT"
    assert exact[0]["groups"] == tuple()

    # (pattern, keyword options, expected number of results)
    expectations = [
        ("supreme court", {"regex": False}, 0),
        ("supreme court", {"regex": False, "case": False}, 2),
        ("supreme court", {"regex": True, "case": False}, 2),
        (r"supreme\s+(\w+)", {"regex": False}, 0),
        (r"10 Tuesday", {"layout": False}, 1),
        (r"10 Tuesday", {"layout": True}, 0),
    ]
    for pattern, options, expected_count in expectations:
        results = page.search(pattern, **options)
        assert len(results) == expected_count
def test_extract_text_lines(self):
    """Compare default line extraction with layout / unstripped variants."""
    first_page = self.pdf_scotus.pages[0]

    default_lines = first_page.extract_text_lines()
    assert len(default_lines) == 28
    assert "chars" in default_lines[0]
    assert default_lines[0]["text"] == "Official - Subject to Final Review"

    unstripped_lines = first_page.extract_text_lines(
        layout=True, strip=False, return_chars=False
    )
    assert "chars" not in unstripped_lines[0]
    # NOTE(review): the two padded expectations below are whitespace-sensitive;
    # confirm the literals against the actual layout output.
    padded_header = " Official - Subject to Final Review "  # noqa: E501
    assert unstripped_lines[0]["text"] == padded_header

    stripped_date = "10 Tuesday, January 13, 2009"
    assert default_lines[10]["text"] == stripped_date
    padded_date = " 10 Tuesday, January 13, 2009 "  # noqa: E501
    assert unstripped_lines[10]["text"] == padded_date

    # With layout=True alone, the line text equals the default result.
    layout_lines = first_page.extract_text_lines(layout=True)
    assert layout_lines[10]["text"] == stripped_date
def test_handle_empty_and_whitespace_search_results(self):
    """Whitespace-only and empty-match searches return no results.

    via https://github.com/jsvine/pdfplumber/discussions/853
    These searches should not raise errors; each should yield an empty
    result set.
    """
    first_page = self.pdf_scotus.pages[0]

    for use_regex in (True, False):
        newline_hits = first_page.search("\n", regex=use_regex)
        assert len(newline_hits) == 0

    for empty_matching_pattern in ("(sdfsd)?", ""):
        assert len(first_page.search(empty_matching_pattern)) == 0
def test_intersects_bbox(self):
    """Only the first four candidates are reported as intersecting the bbox."""
    keys = ("x0", "top", "x1", "bottom")
    candidates = [
        dict(zip(keys, coords))
        for coords in (
            (0, 0, 20, 20),  # Is same as bbox
            (10, 10, 15, 15),  # Inside bbox
            (10, 10, 30, 30),  # Overlaps bbox
            (20, 0, 40, 20),  # Touching on one side
            (20, 20, 40, 40),  # Touching on one corner
            (21, 21, 40, 40),  # Fully outside
        )
    ]
    # The reference bbox is taken from the first candidate.
    reference_bbox = utils.obj_to_bbox(candidates[0])

    # Both a list and a one-shot iterator must be accepted as input.
    from_list = utils.intersects_bbox(candidates, reference_bbox)
    assert from_list == candidates[:4]
    from_iterator = utils.intersects_bbox(iter(candidates), reference_bbox)
    assert from_iterator == candidates[:4]
def test_merge_bboxes(self):
    """Merging two bboxes gives (0, 5, 20, 30) for list and iterator input."""
    boxes = [(0, 10, 20, 20), (10, 5, 10, 30)]
    expected = (0, 5, 20, 30)

    assert utils.merge_bboxes(boxes) == expected
    assert utils.merge_bboxes(iter(boxes)) == expected
def test_resize_object(self):
    """Resizing one edge updates only the related measurements."""
    obj = {
        "x0": 5,
        "x1": 10,
        "top": 20,
        "bottom": 30,
        "width": 5,
        "height": 10,
        "doctop": 120,
        "y0": 40,
        "y1": 50,
    }
    # (edge, new value, keys expected to differ from the starting object).
    # Every expectation is materialized before the first call, so each one
    # is a fixed dict independent of what resize_object does to ``obj``.
    cases = [
        ("x0", 0, {"x0": 0, "width": 10}),
        ("x1", 50, {"x1": 50, "width": 45}),
        ("top", 0, {"top": 0, "doctop": 100, "height": 30, "y1": 70}),
        ("bottom", 40, {"bottom": 40, "height": 20, "y0": 30}),
    ]
    expectations = [
        (edge, value, {**obj, **changed}) for edge, value, changed in cases
    ]

    for edge, value, expected in expectations:
        assert utils.resize_object(obj, edge, value) == expected
def test_move_object(self):
    """Moving along "h" by 10 changes only ``x0`` and ``x1``."""
    original = {
        "x0": 5,
        "x1": 10,
        "top": 20,
        "bottom": 30,
        "width": 5,
        "height": 10,
        "doctop": 120,
        "y0": 40,
        "y1": 50,
    }
    # Built before the call: same object with both x-coordinates raised by 10.
    expected = {**original, "x0": 15, "x1": 20}

    moved = utils.move_object(original, "h", 10)
    assert moved == expected
def test_snap_objects(self):
    """Three objects whose ``x0`` differ by 1 snap to identical objects."""
    first = {
        "x0": 5,
        "x1": 10,
        "top": 20,
        "bottom": 30,
        "width": 5,
        "height": 10,
        "doctop": 120,
        "y0": 40,
        "y1": 50,
    }
    second = {**first, "x0": 6, "x1": 11}
    third = {**first, "x0": 7, "x1": 12}
    neighbours = [first, second, third]

    # Feed the same objects once as a list and once as a one-shot iterator.
    for as_input in (list, iter):
        snapped = utils.snap_objects(as_input(neighbours), "x0", 1)
        # Unpacking also checks that exactly three objects come back.
        snapped_first, snapped_second, snapped_third = snapped
        assert snapped_first == snapped_second == snapped_third
def test_filter_edges(self):
    """``filter_edges`` raises ``ValueError`` when called with ``[]`` and "x"."""
    no_edges = []
    # Presumably "x" is not a supported orientation value; verify in utils.
    rejected_value = "x"
    with pytest.raises(ValueError):
        utils.filter_edges(no_edges, rejected_value)
def test_to_list(self):
    """``to_list`` returns the same list of dicts for several container types."""
    keys = ("x0", "top", "x1", "bottom")
    objs = [dict(zip(keys, coords)) for coords in ((0, 0, 20, 20), (10, 10, 15, 15))]

    # Each wrapper presents the same records in a different container.
    wrappers = (
        lambda records: records,  # the list itself
        iter,
        tuple,
        lambda records: (o for o in records),  # generator expression
        pd.DataFrame,
    )
    for wrap in wrappers:
        assert utils.to_list(wrap(objs)) == objs