" + line.decode().strip() + "")
if proc.returncode != 0:
st.error(f"ocrmypdf failed with exit code {proc.returncode}")
st.session_state['running'] = False
st.stop()
if Path(output_file.name).stat().st_size == 0:
st.error("No output PDF file was generated")
st.stop()
st.download_button(
label="Download output PDF",
data=output_file.read(),
file_name=uploaded.name,
mime="application/pdf",
)
st.session_state['running'] = False
================================================
FILE: misc/batch.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2016 findingorder # pre-release
[-_\.]?
(?P(a|b|c|rc|alpha|beta|pre|preview))
[-_\.]?
(?P[0-9]+)?
)?
(?P # post release
(?:-(?P[0-9]+))
|
(?:
[-_\.]?
(?Ppost|rev|r)
[-_\.]?
(?P[0-9]+)?
)
)?
(?P # dev release
[-_\.]?
(?Pdev)
[-_\.]?
(?P[0-9]+)?
)?
(?P
[-_\.]
(?:20[0-9][0-9] [0-1][0-9] [0-3][0-9]) # yyyy mm dd
)?
(?P
[-_\.]?
[0-9]+
)?
(?P
[-_\.]?
g[0-9a-f]{2,10}
)?
)
(?:\+(?P[a-z0-9]+(?:[-_\.][a-z0-9]+)*))? # local version
"""
class TesseractVersion(Version):
    """Version subclass whose parser tolerates Tesseract's version strings.

    The parsing regex is replaced by ``TESSERACT_VERSION_PATTERN``, anchored at
    both ends with optional surrounding whitespace, and compiled in verbose,
    case-insensitive mode.
    """

    _regex = re.compile(
        rf"^\s*{TESSERACT_VERSION_PATTERN}\s*$",
        flags=re.IGNORECASE | re.VERBOSE,
    )
def version() -> Version:
    """Return the Tesseract version reported by the ``tesseract`` executable."""
    reported = get_version('tesseract', regex=r'tesseract\s(.+)')
    return TesseractVersion(reported)
def has_thresholding() -> bool:
    """Report whether Tesseract accepts ``-c thresholding_method=...``.

    Returns:
        True when the detected Tesseract version is at least 5.0.
    """
    detected = version()
    return detected >= Version('5.0')
def get_languages() -> set[str]:
    """Return the set of language names reported by ``tesseract --list-langs``.

    The first line of Tesseract's output is treated as a header and discarded;
    every remaining line, stripped of whitespace, is one language.

    Returns:
        The language names listed by Tesseract.

    Raises:
        MissingDependencyError: If Tesseract exits with an error, prints a
            line starting with ``Error``, or prints nothing at all.
    """
    error_preamble = (
        "Tesseract failed to report available languages.\n"
        "Output from Tesseract:\n"
        "-----------\n"
    )

    def lang_error(output):
        # Tolerate missing output so the error itself cannot fail to build.
        return error_preamble + (output or '')

    try:
        proc = run(
            ['tesseract', '--list-langs'],
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            logs_errors_to_stdout=True,
            check=True,
        )
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e

    output = proc.stdout
    lines = (output or '').splitlines()
    # Empty output has no header line to discard; report it as a failure
    # rather than crashing on the unpacking below.
    if not lines or any(line.startswith('Error') for line in lines):
        raise MissingDependencyError(lang_error(output))

    _header, *languages = lines
    return {lang.strip() for lang in languages}
def tess_base_args(langs: list[str], engine_mode: int | None) -> list[str]:
args = ['tesseract']
if langs:
args.extend(['-l', '+'.join(langs)])
if engine_mode is not None:
args.extend(['--oem', str(engine_mode)])
return args
def _parse_tesseract_output(binary_output: bytes) -> dict[str, str]:
    """Parse ``key: value`` lines from Tesseract's output into a dict.

    Only lines containing exactly one colon are kept; keys and values are
    stripped of surrounding whitespace. Later duplicates of a key overwrite
    earlier ones.

    Args:
        binary_output: Raw bytes captured from Tesseract.

    Returns:
        Mapping of field name to field value, both as strings.
    """
    # Tesseract may echo arbitrary bytes (e.g. file names); decode leniently
    # so one undecodable byte does not abort parsing of the whole output.
    text = binary_output.decode('utf-8', errors='replace')

    fields: dict[str, str] = {}
    for raw_line in text.splitlines():
        parts = raw_line.strip().split(':', maxsplit=2)
        if len(parts) == 2:
            key, value = parts
            fields[key.strip()] = value.strip()
    return fields
def get_orientation(
    input_file: Path,
    engine_mode: int | None,
    timeout: float,
    omp_thread_limit: int | None = None,
) -> OrientationConfidence:
    """Ask Tesseract for the orientation of a page image.

    Runs Tesseract with the ``osd`` language and ``--psm 0`` and reads the
    ``Orientation in degrees`` and ``Orientation confidence`` fields.

    Args:
        input_file: Image to analyze.
        engine_mode: Passed through to ``tess_base_args``.
        timeout: Timeout handed to the subprocess runner.
        omp_thread_limit: Passed to ``_tesseract_env`` to build the environment.

    Returns:
        The detected angle and confidence. Angle 0 with confidence 0 is
        returned on timeout and for the known non-fatal Tesseract messages.

    Raises:
        SubprocessOutputError: If Tesseract fails for any other reason.
    """
    osd_command = [
        *tess_base_args(['osd'], engine_mode),
        '--psm',
        '0',
        fspath(input_file),
        'stdout',
    ]
    try:
        completed = run(
            osd_command,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=_tesseract_env(omp_thread_limit),
        )
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        # Look in both captured streams for messages that merely mean the
        # page could not be analyzed.
        combined = (e.output or b'') + (e.stderr or b'')
        nonfatal_markers = (
            b'Too few characters. Skipping this page',
            b'Image too large',
        )
        if any(marker in combined for marker in nonfatal_markers):
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e

    fields = _parse_tesseract_output(completed.stdout)
    return OrientationConfidence(
        angle=int(fields.get('Orientation in degrees', 0)),
        confidence=float(fields.get('Orientation confidence', 0)),
    )
def _is_empty_page_error(exc):
    """Decide whether a failed Tesseract run just means the page was empty.

    Args:
        exc: The ``CalledProcessError`` raised for the failed run; its
            ``output`` is expected to be bytes.

    Returns:
        True if the captured output or exit code matches one of the known
        empty-page signatures, otherwise False.
    """
    captured = exc.output
    if b'Empty page!!' in captured:  # Tesseract 4.x
        return True
    if exc.returncode != 1:
        return False
    # Tesseract 5.0-5.4 or so exits with status 1 and prints nothing;
    # Tesseract 5.5+ prints a boxClipToRectangle error instead.
    return captured == b'' or captured.startswith(
        b"Error in boxClipToRectangle: box outside rectangle"
    )
def get_deskew(
    input_file: Path,
    languages: list[str],
    engine_mode: int | None,
    timeout: float,
    omp_thread_limit: int | None = None,
) -> float:
    """Estimate the skew angle of a page image, in degrees.

    Runs Tesseract with ``--psm 2`` and reads the ``Deskew angle`` field of
    its output, which is treated as radians and converted to degrees.

    Args:
        input_file: Image to analyze.
        languages: Language codes passed through to ``tess_base_args``.
        engine_mode: Passed through to ``tess_base_args``.
        timeout: Timeout handed to the subprocess runner.
        omp_thread_limit: Passed to ``_tesseract_env`` to build the environment.

    Returns:
        The deskew angle in degrees; 0.0 on timeout, on an empty-page error,
        or when Tesseract reports no ``Deskew angle``.

    Raises:
        SubprocessOutputError: If Tesseract fails for any other reason.
    """
    command = [
        *tess_base_args(languages, engine_mode),
        '--psm',
        '2',
        fspath(input_file),
        'stdout',
    ]
    try:
        completed = run(
            command,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=_tesseract_env(omp_thread_limit),
        )
    except TimeoutExpired:
        return 0.0
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        if not _is_empty_page_error(e):
            raise SubprocessOutputError() from e
        # An empty page carries no information about skew.
        return 0.0

    reported = _parse_tesseract_output(completed.stdout)
    deskew_degrees = 180 / pi * float(reported.get('Deskew angle', 0))
    log.debug(f"Deskew angle: {deskew_degrees:.3f}")
    return deskew_degrees
def tesseract_log_output(stream: bytes) -> None:
    """Forward Tesseract's captured output to the log, line by line.

    Each line is classified by its content: known noise is dropped, a few
    known messages are rewritten as friendlier warnings, and the rest are
    logged at error, warning or info level depending on the words they
    contain.

    Args:
        stream: Raw bytes captured from Tesseract; empty or None is a no-op.

    Raises:
        TesseractConfigError: If Tesseract reports an unknown parameter; the
            exception carries the text following "parameter not found: ".
    """
    if not stream:
        return

    tlog = TesseractLoggerAdapter(
        log,
        extra=log.extra if hasattr(log, 'extra') else None,  # type: ignore
    )

    try:
        text = stream.decode()
    except UnicodeDecodeError:
        text = stream.decode('utf-8', 'ignore')

    # Detection and extraction use the same case-insensitive pattern, so a
    # differently-capitalized message cannot be detected yet fail to extract.
    param_not_found = re.compile(r'parameter not found: ', re.IGNORECASE)

    for line in text.splitlines():
        lowered = line.lower()
        if line.startswith(("Tesseract Open Source", "Warning in pixReadMem")):
            continue
        elif 'diacritics' in line:
            tlog.warning("lots of diacritics - possibly poor OCR")
        elif line.startswith('OSD: Weak margin'):
            tlog.warning("unsure about page orientation")
        elif 'Error in pixScanForForeground' in line:
            pass  # Appears to be spurious/problem with nonwhite borders
        elif 'Error in boxClipToRectangle' in line:
            pass  # Always appears with pixScanForForeground message
        elif (found := param_not_found.search(line)) is not None:
            tlog.error(line.strip())
            raise TesseractConfigError(line[found.end():])
        elif 'error' in lowered or 'exception' in lowered:
            tlog.error(line.strip())
        elif 'warning' in lowered:
            tlog.warning(line.strip())
        elif 'read_params_file' in lowered:
            tlog.error(line.strip())
        else:
            tlog.info(line.strip())
def page_timedout(timeout: float) -> None:
    """Log a warning that OCR of a page timed out.

    Nothing is logged when ``timeout`` is 0.
    """
    if timeout != 0:
        log.warning("[tesseract] took too long to OCR - skipping")
def _generate_null_hocr(output_hocr: Path, output_text: Path, image: Path) -> None:
    """Write placeholder OCR outputs for a page that was skipped.

    The hOCR file is written empty and the sidecar text file is written as
    ``[skipped page]``, both UTF-8 encoded. ``image`` is accepted but not
    used here.
    """
    placeholders = (
        (output_hocr, ''),
        (output_text, '[skipped page]'),
    )
    for destination, content in placeholders:
        destination.write_text(content, encoding='utf-8')
def generate_hocr(
    *,
    input_file: Path,
    output_hocr: Path,
    output_text: Path,
    languages: list[str],
    engine_mode: int,
    tessconfig: list[str],
    timeout: float,
    pagesegmode: int,
    thresholding: ThresholdingMethod,
    user_words,
    user_patterns,
    omp_thread_limit: int | None = None,
) -> None:
    """Run Tesseract to produce an hOCR file and a sidecar text file.

    Tesseract is asked for ``hocr`` and ``txt`` outputs using ``output_hocr``
    without its suffix as the output prefix; on success the ``.txt`` output
    is renamed to ``output_text``.

    On timeout, or when Tesseract reports ``Image too large`` or
    ``Empty page!!``, placeholder outputs are written instead.

    Raises:
        SubprocessOutputError: If Tesseract fails for any other reason.
    """
    prefix = output_hocr.with_suffix('')

    command = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        command += ['--psm', str(pagesegmode)]
    if thresholding != ThresholdingMethod.AUTO and has_thresholding():
        command += ['-c', f'thresholding_method={thresholding}']
    if user_words:
        command += ['--user-words', user_words]
    if user_patterns:
        command += ['--user-patterns', user_patterns]

    # Reminder: the test suite's tesseract test plugins will break after any
    # change to the number or order of parameters here
    command += [fspath(input_file), fspath(prefix), 'hocr', 'txt']
    command += tessconfig

    try:
        completed = run(
            command,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=_tesseract_env(omp_thread_limit),
        )
    except TimeoutExpired:
        # Write an hOCR file with no recognized text so that later stages
        # still find the file they expect.
        page_timedout(timeout)
        _generate_null_hocr(output_hocr, output_text, input_file)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        skippable = b'Image too large' in e.output or b'Empty page!!' in e.output
        if not skippable:
            raise SubprocessOutputError() from e
        _generate_null_hocr(output_hocr, output_text, input_file)
        return
    else:
        tesseract_log_output(completed.stdout)
        # Tesseract names the sidecar <prefix>.txt; move it to the name the
        # caller asked for.
        with suppress(FileNotFoundError):
            prefix.with_suffix('.txt').replace(output_text)
def use_skip_page(output_pdf: Path, output_text: Path) -> None:
    """Write the outputs that mark a page as skipped.

    The sidecar text becomes ``[skipped page]`` and the PDF is written as a
    zero-byte file, which indicates a skip.
    """
    skip_marker = '[skipped page]'
    output_text.write_text(skip_marker, encoding='utf-8')
    output_pdf.write_bytes(bytes())
def generate_pdf(
    *,
    input_file: Path,
    output_pdf: Path,
    output_text: Path,
    languages: list[str],
    engine_mode: int,
    tessconfig: list[str],
    timeout: float,
    pagesegmode: int,
    thresholding: ThresholdingMethod,
    user_words,
    user_patterns,
    omp_thread_limit: int | None = None,
) -> None:
    """Generate a PDF using Tesseract's internal PDF generator.

    A text-only PDF is requested (``textonly_pdf=1``), which is more suitable
    for combining with the input page. Tesseract also writes a ``.txt``
    sidecar, which is renamed to ``output_text`` on success.

    On timeout, or when Tesseract reports ``Image too large`` or
    ``Empty page!!``, skip-page outputs are written instead.

    Raises:
        SubprocessOutputError: If Tesseract fails for any other reason.
    """
    command = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        command += ['--psm', str(pagesegmode)]
    command += ['-c', 'textonly_pdf=1']
    if thresholding != ThresholdingMethod.AUTO and has_thresholding():
        command += ['-c', f'thresholding_method={thresholding}']
    if user_words:
        command += ['--user-words', user_words]
    if user_patterns:
        command += ['--user-patterns', user_patterns]

    prefix = output_pdf.parent / output_pdf.stem

    # Reminder: the test suite's tesseract test plugins might break after any
    # change to the number or order of parameters here
    command += [fspath(input_file), fspath(prefix), 'pdf', 'txt']
    command += tessconfig

    try:
        completed = run(
            command,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=_tesseract_env(omp_thread_limit),
        )
    except TimeoutExpired:
        page_timedout(timeout)
        use_skip_page(output_pdf, output_text)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        skippable = b'Image too large' in e.output or b'Empty page!!' in e.output
        if not skippable:
            raise SubprocessOutputError() from e
        use_skip_page(output_pdf, output_text)
        return
    else:
        # Move the sidecar to the caller's chosen name before logging.
        with suppress(FileNotFoundError):
            prefix.with_suffix('.txt').replace(output_text)
        tesseract_log_output(completed.stdout)
================================================
FILE: src/ocrmypdf/_exec/unpaper.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Interface to unpaper executable."""
from __future__ import annotations
import logging
import os
from collections.abc import Iterator
from contextlib import contextmanager
from decimal import Decimal
from pathlib import Path
from subprocess import PIPE, STDOUT
from tempfile import TemporaryDirectory
from packaging.version import Version
from PIL import Image
from ocrmypdf.exceptions import SubprocessOutputError
from ocrmypdf.subprocess import get_version, run
# unpaper documentation:
# https://github.com/Flameeyes/unpaper/blob/main/doc/basic-concepts.md
UNPAPER_IMAGE_PIXEL_LIMIT = 256 * 1024 * 1024
DecFloat = Decimal | float
log = logging.getLogger(__name__)
class UnpaperImageTooLargeError(Exception):
    """Raised when an image is too large to be cleaned with unpaper.

    The image dimensions are kept on ``w`` and ``h``; ``message`` holds the
    formatted text, which is also the exception's string form.
    """

    def __init__(
        self,
        w,
        h,
        message="Image with size {}x{} is too large for cleaning with 'unpaper'.",
    ):
        self.w, self.h = w, h
        formatted = message.format(w, h)
        self.message = formatted
        super().__init__(formatted)
def version() -> Version:
    """Return the version reported by the ``unpaper`` executable."""
    reported = get_version('unpaper', regex=r'(?m).*?(\d+(\.\d+)(\.\d+)?)')
    return Version(reported)
@contextmanager
def _setup_unpaper_io(input_file: Path) -> Iterator[tuple[Path, Path, Path]]:
    """Prepare the input path, output path and working directory for unpaper.

    Args:
        input_file: Image to be cleaned; it is opened to check its size and
            then used as unpaper's input as-is.

    Yields:
        ``(input_png, output_pnm, tmppath)``: the input file, the path of
        ``output.pnm`` inside a fresh temporary directory, and that directory.

    Raises:
        UnpaperImageTooLargeError: If the pixel count reaches
            ``UNPAPER_IMAGE_PIXEL_LIMIT``.
    """
    with Image.open(input_file) as im:
        width, height = im.width, im.height
    if width * height >= UNPAPER_IMAGE_PIXEL_LIMIT:
        raise UnpaperImageTooLargeError(w=width, h=height)

    with TemporaryDirectory(ignore_cleanup_errors=True) as workdir:
        workpath = Path(workdir)
        # unpaper can write .png too, but it seems to write them slowly -
        # adds a few seconds to the test suite - so just use pnm.
        # The input needs no conversion, so the existing file is handed over.
        yield input_file, workpath / 'output.pnm', workpath
def run_unpaper(
    input_file: Path, output_file: Path, *, dpi: DecFloat, mode_args: list[str]
) -> None:
    """Run unpaper on an image and save the result to ``output_file``.

    Args:
        input_file: Image to clean.
        output_file: Where the cleaned image is saved, tagged with ``dpi``.
        dpi: Resolution passed to unpaper (rounded to 6 decimal places) and
            stored in the saved image.
        mode_args: Extra unpaper arguments placed before the file paths.

    Raises:
        UnpaperImageTooLargeError: Propagated from ``_setup_unpaper_io``.
        SubprocessOutputError: If unpaper's output file cannot be opened
            or saved.
    """
    command = ['unpaper', '-v', '--dpi', str(round(dpi, 6))] + mode_args

    with _setup_unpaper_io(input_file) as (source_png, result_pnm, workdir):
        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        command += [os.fspath(source_png), os.fspath(result_pnm)]
        run(
            command,
            close_fds=True,
            check=True,
            stderr=STDOUT,  # unpaper writes logging output to stdout and stderr
            stdout=PIPE,  # and cannot send file output to stdout
            cwd=workdir,
            logs_errors_to_stdout=True,
        )
        try:
            with Image.open(result_pnm) as cleaned:
                cleaned.save(output_file, dpi=(dpi, dpi))
        except OSError as e:
            raise SubprocessOutputError(
                "unpaper: failed to produce the expected output file. "
                + " Called with: "
                + str(command)
            ) from e
def clean(
    input_file: Path,
    output_file: Path,
    *,
    dpi: DecFloat,
    unpaper_args: list[str] | None = None,
) -> Path:
    """Clean a page image with unpaper.

    Args:
        input_file: Image to clean.
        output_file: Destination for the cleaned image.
        dpi: Resolution of the image, passed to ``run_unpaper``.
        unpaper_args: unpaper arguments to use; when None or empty, a
            conservative default set is used.

    Returns:
        ``output_file`` on success, or ``input_file`` unchanged if the image
        was too large for unpaper (a warning is logged).
    """
    mode_args = unpaper_args or [
        '--layout',
        'none',
        '--mask-scan-size',
        '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',  # don't center visible content within page
        '--no-grayfilter',  # don't remove light gray areas
        '--no-blackfilter',  # don't remove solid black areas
        '--no-deskew',  # don't deskew
    ]
    try:
        run_unpaper(input_file, output_file, dpi=dpi, mode_args=mode_args)
    except UnpaperImageTooLargeError as e:
        log.warning(str(e))
        return input_file
    else:
        return output_file
================================================
FILE: src/ocrmypdf/_exec/verapdf.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Interface to verapdf executable."""
from __future__ import annotations
import json
import logging
from pathlib import Path
from subprocess import PIPE
from typing import NamedTuple
from packaging.version import Version
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.subprocess import get_version, run
log = logging.getLogger(__name__)
class ValidationResult(NamedTuple):
    """Result of PDF/A validation."""

    # True only when validation completed and reported no failed rules.
    valid: bool
    # Number of failed rules; -1 when the result could not be determined.
    failed_rules: int
    # Human-readable summary of the outcome.
    message: str
def version() -> Version:
    """Return the version reported by the ``verapdf`` executable."""
    reported = get_version('verapdf', regex=r'veraPDF (\d+(\.\d+)*)')
    return Version(reported)
def available() -> bool:
    """Report whether verapdf can be run.

    Returns:
        False if querying the version raises ``MissingDependencyError`` or
        ``OSError``; True otherwise.
    """
    try:
        version()
    except (MissingDependencyError, OSError):
        usable = False
    else:
        usable = True
    return usable
def output_type_to_flavour(output_type: str) -> str:
    """Translate an OCRmyPDF output type into a verapdf flavour.

    Args:
        output_type: One of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'.

    Returns:
        The verapdf flavour string ('1b', '2b' or '3b'); any unrecognized
        output type maps to '2b'.
    """
    flavours = {
        'pdfa': '2b',
        'pdfa-1': '1b',
        'pdfa-2': '2b',
        'pdfa-3': '3b',
    }
    try:
        return flavours[output_type]
    except KeyError:
        return '2b'
def validate(input_file: Path, flavour: str) -> ValidationResult:
    """Validate a PDF against a PDF/A profile.

    Runs ``verapdf --format json --flavour <flavour> <input_file>`` and reads
    the first validation result of the first job in the JSON report.

    Args:
        input_file: Path to PDF file to validate
        flavour: verapdf flavour (1a, 1b, 2a, 2b, 2u, 3a, 3b, 3u)

    Returns:
        ValidationResult with validation status. ``failed_rules`` is -1 when
        the report is missing expected parts or cannot be parsed.

    Raises:
        MissingDependencyError: If the verapdf executable cannot be found.
    """
    args = [
        'verapdf',
        '--format',
        'json',
        '--flavour',
        flavour,
        str(input_file),
    ]
    try:
        # check=False: the report is parsed regardless of the exit status.
        proc = run(args, stdout=PIPE, stderr=PIPE, check=False)
    except FileNotFoundError as e:
        raise MissingDependencyError('verapdf') from e

    try:
        result = json.loads(proc.stdout)
        jobs = result.get('report', {}).get('jobs', [])
        if not jobs:
            return ValidationResult(False, -1, 'No validation jobs in result')

        validation_results = jobs[0].get('validationResult', [])
        if not validation_results:
            return ValidationResult(False, -1, 'No validation result in output')

        details = validation_results[0].get('details', {})
        failed_rules = details.get('failedRules', 0)
    except (json.JSONDecodeError, AttributeError, KeyError, TypeError) as e:
        # AttributeError covers well-formed JSON of an unexpected shape
        # (e.g. null or a list where an object was expected), which would
        # otherwise escape as an unhandled exception from `.get`.
        log.debug('Failed to parse verapdf output: %s', e)
        return ValidationResult(False, -1, f'Failed to parse verapdf output: {e}')

    if failed_rules == 0:
        return ValidationResult(True, 0, 'PDF/A validation passed')
    return ValidationResult(
        False,
        failed_rules,
        f'PDF/A validation failed with {failed_rules} rule violations',
    )
================================================
FILE: src/ocrmypdf/_graft.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""For grafting text-only PDF pages onto freeform PDF pages."""
from __future__ import annotations
import logging
from contextlib import suppress
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from ocrmypdf.hocrtransform import OcrElement
from pikepdf import (
Dictionary,
Name,
Operator,
Page,
Pdf,
Stream,
parse_content_stream,
unparse_content_stream,
)
from ocrmypdf._jobcontext import PdfContext
from ocrmypdf._options import ProcessingMode
from ocrmypdf._pipeline import VECTOR_PAGE_DPI
class RenderMode(Enum):
    """Controls where the OCR text layer is placed relative to page content.

    ON_TOP: Text layer renders above page content (reserved for future use).
    UNDERNEATH: Text layer renders below page content (current default behavior).
    """

    # Text layer is added after the existing page content.
    ON_TOP = 0
    # Text layer is prepended, so existing page content draws over it.
    UNDERNEATH = 1
@dataclass
class Fpdf2PageInfo:
    """Information needed to render and graft an fpdf2 page."""

    # Zero-based page number in the base PDF.
    pageno: int
    # Path to the hOCR file holding this page's OCR output.
    hocr_path: Path
    # Page resolution, taken from the page info when the page is queued.
    dpi: float
    # Orientation correction in degrees (0, 90, 180, 270).
    autorotate_correction: int
    # True if the page content was replaced with a supplied image.
    emplaced_page: bool
@dataclass
class Fpdf2ParsedPage:
    """Parsed page data ready for fpdf2 rendering."""

    # Zero-based page number in the base PDF.
    pageno: int
    # Parsed OCR element tree for the page.
    ocr_tree: OcrElement
    # Resolution handed to the renderer for this page.
    dpi: float
    # Orientation correction in degrees (0, 90, 180, 270).
    autorotate_correction: int
    # True if the page content was replaced with a supplied image.
    emplaced_page: bool
# Alias for backward compatibility with plan documentation
Fpdf2DirectPage = Fpdf2ParsedPage
def _compute_text_misalignment(
    content_rotation: int, autorotate_correction: int, emplaced_page: bool
) -> int:
    """Return how far the text layer must be rotated to match the content.

    Args:
        content_rotation: Original page /Rotate value (degrees).
        autorotate_correction: Rotation applied during rasterization (degrees).
        emplaced_page: Whether the page content was replaced with rasterized image.

    Returns:
        Rotation in degrees, in the range 0-359, to apply to the text layer.
        Always 0 for an emplaced page, because the replacement image already
        carries the autorotation.
    """
    effective_content = autorotate_correction if emplaced_page else content_rotation
    return (autorotate_correction - effective_content) % 360
def _compute_page_rotation(
    content_rotation: int, autorotate_correction: int, emplaced_page: bool
) -> int:
    """Return the /Rotate value the page should carry after grafting.

    Args:
        content_rotation: Original page /Rotate value (degrees).
        autorotate_correction: Rotation applied during rasterization (degrees).
        emplaced_page: Whether the page content was replaced with rasterized image.

    Returns:
        Final /Rotate value in the range 0-359. Always 0 for an emplaced
        page, whose content rotation equals the autorotate correction.
    """
    effective_content = autorotate_correction if emplaced_page else content_rotation
    return (effective_content - autorotate_correction) % 360
def _build_text_layer_ctm(
    text_width: float,
    text_height: float,
    page_width: float,
    page_height: float,
    page_origin_x: float,
    page_origin_y: float,
    text_rotation: int,
):
    """Build the matrix that maps the text layer onto the target page.

    The full transform is always computed so that pages with a non-zero
    origin (e.g., JSTOR PDFs with MediaBox like [0, 100, 595, 982]) and
    small scale differences from DPI rounding are handled.

    Args:
        text_width: Width of text layer mediabox.
        text_height: Height of text layer mediabox.
        page_width: Width of target page mediabox.
        page_height: Height of target page mediabox.
        page_origin_x: X origin of target page mediabox.
        page_origin_y: Y origin of target page mediabox.
        text_rotation: Rotation in degrees (clockwise) to apply to text layer.

    Returns:
        pikepdf.Matrix transformation matrix, or None if identity.
    """
    from pikepdf import Matrix

    # Move the text layer's center to the origin so rotation is about it.
    center_text = Matrix().translated(-text_width / 2, -text_height / 2)
    # The input angle is clockwise, hence the negation.
    rotation = Matrix().rotated(-text_rotation % 360)

    # A quarter turn swaps which text dimension lines up with which page side.
    if text_rotation in (90, 270):
        fit_width, fit_height = text_height, text_width
    else:
        fit_width, fit_height = text_width, text_height

    # Guard against zero-sized text boxes by leaving that axis unscaled.
    scaling = Matrix().scaled(
        page_width / fit_width if fit_width else 1.0,
        page_height / fit_height if fit_height else 1.0,
    )
    center_on_page = Matrix().translated(page_width / 2, page_height / 2)
    shift_to_origin = Matrix().translated(page_origin_x, page_origin_y)

    ctm = center_text @ rotation @ scaling @ center_on_page @ shift_to_origin
    return None if ctm == Matrix() else ctm
log = logging.getLogger(__name__)
MAX_REPLACE_PAGES = 100
def _ensure_dictionary(obj: Dictionary | Stream, name: Name):
    """Return ``obj[name]``, first creating an empty Dictionary if absent."""
    if name in obj:
        return obj[name]
    obj[name] = Dictionary({})
    return obj[name]
def strip_invisible_text(pdf: Pdf, page: Page):
    """Rewrite a page's content stream without its invisible text objects.

    Walks the page's content stream, tracking the text render mode through
    ``Tr``, ``q`` and ``Q`` operators. Each ``BT`` ... ``ET`` text object is
    buffered and kept only if the render mode at its ``ET`` is not 3; all
    other instructions are kept unchanged. The page's Contents is replaced
    with a new stream built from the surviving instructions.

    Args:
        pdf: The Pdf that will own the replacement content stream.
        page: The page to modify in place.
    """
    # Instructions that survive into the rewritten content stream.
    stream = []
    in_text_obj = False
    render_mode = 0
    # Render modes saved by 'q' so that 'Q' can restore them.
    render_mode_stack = []
    # Instructions of the text object currently being read.
    text_objects = []

    for operands, operator in parse_content_stream(page, ''):
        if operator == Operator('Tr'):
            render_mode = operands[0]

        if operator == Operator('q'):
            render_mode_stack.append(render_mode)

        if operator == Operator('Q'):
            # IndexError is raised if stack is empty; try to carry on
            with suppress(IndexError):
                render_mode = render_mode_stack.pop()

        if not in_text_obj:
            if operator == Operator('BT'):
                in_text_obj = True
                text_objects.append((operands, operator))
            else:
                stream.append((operands, operator))
        else:
            text_objects.append((operands, operator))
            if operator == Operator('ET'):
                in_text_obj = False
                # The decision uses the render mode in effect at ET.
                if render_mode != 3:
                    stream.extend(text_objects)
                text_objects.clear()

    content_stream = unparse_content_stream(stream)
    page.Contents = Stream(pdf, content_stream)
class OcrGrafter:
"""Manages grafting text-only PDFs onto regular PDFs."""
def __init__(self, context: PdfContext):
    """Open the base PDF and set up the state used while grafting.

    Args:
        context: Job context supplying the origin PDF path, page info,
            options and temporary file paths.
    """
    self.context = context
    self.path_base = context.origin
    self.pdf_base = Pdf.open(self.path_base)
    self.pdfinfo = context.pdfinfo
    self.output_file = context.get_path('graft_layers.pdf')
    self.emplacements = 1
    self.render_mode = RenderMode.UNDERNEATH

    # The sandwich renderer is grafted page by page; any other renderer is
    # treated as fpdf2, whose pages are accumulated and rendered together.
    self.use_sandwich_renderer = context.options.pdf_renderer == 'sandwich'

    # For fpdf2: accumulate pages before rendering
    self.fpdf2_hocr_pages: list[Fpdf2PageInfo] = []
    self.fpdf2_parsed_pages: list[Fpdf2ParsedPage] = []
def graft_page(
    self,
    *,
    pageno: int,
    image: Path | None,
    ocr_output: Path | None,
    ocr_tree: OcrElement | None,
    autorotate_correction: int,
):
    """Graft OCR output onto a page of the base PDF.

    If ``image`` names a file other than the base PDF, its first page first
    replaces the content of page ``pageno``. With the sandwich renderer the
    text layer is then grafted immediately and the page's /Rotate is set;
    otherwise the page is queued for batch rendering in ``finalize``.

    Args:
        pageno: Zero-based page number.
        image: Path to the visible page image PDF, or None if not replacing.
        ocr_output: Path to OCR output file. For fpdf2 renderer this is an
            hOCR file; for sandwich renderer this is a text-only PDF.
        ocr_tree: OCR tree for fpdf2 renderer.
        autorotate_correction: Orientation correction in degrees (0, 90, 180, 270).

    Raises:
        ValueError: If both ``ocr_output`` and ``ocr_tree`` are given.
    """
    if ocr_output and ocr_tree:
        raise ValueError(
            'Cannot specify both ocr_output and ocr_tree for fpdf2 renderer'
        )

    # Handle image emplacement first
    emplaced_page = False
    content_rotation = self.pdfinfo[pageno].rotation
    path_image = Path(image).resolve() if image else None
    if path_image is not None and path_image != self.path_base:
        # We are updating the old page with a rasterized PDF of the new
        # page (without changing objgen, to preserve references)
        log.debug("Emplacement update")
        with Pdf.open(path_image) as pdf_image:
            self.emplacements += 1
            foreign_image_page = pdf_image.pages[0]
            # Append to obtain a copy owned by the base PDF, emplace it over
            # the target page, then drop the temporary appended page.
            self.pdf_base.pages.append(foreign_image_page)
            local_image_page = self.pdf_base.pages[-1]
            self.pdf_base.pages[pageno].emplace(
                local_image_page, retain=(Name.Parent,)
            )
            del self.pdf_base.pages[-1]
        emplaced_page = True

    if self.use_sandwich_renderer:
        # Sandwich renderer: graft pre-rendered PDF immediately
        if ocr_output:
            text_misaligned = _compute_text_misalignment(
                content_rotation, autorotate_correction, emplaced_page
            )
            self._graft_sandwich_text_layer(
                pageno=pageno,
                textpdf=ocr_output,
                text_rotation=text_misaligned,
            )
        page_rotation = _compute_page_rotation(
            content_rotation, autorotate_correction, emplaced_page
        )
        self.pdf_base.pages[pageno].Rotate = page_rotation
    else:
        # fpdf2 renderer: accumulate page info for batch rendering.
        # The hOCR coordinates are in the corrected (upright) coordinate system.
        # We store autorotate_correction and emplaced_page to set the final
        # page /Rotate tag after grafting.
        if ocr_tree:
            self.fpdf2_parsed_pages.append(
                Fpdf2ParsedPage(
                    ocr_tree=ocr_tree,
                    pageno=pageno,
                    autorotate_correction=autorotate_correction,
                    emplaced_page=emplaced_page,
                    dpi=self.pdfinfo[pageno].dpi.to_scalar(),
                )
            )
        if ocr_output:
            self.fpdf2_hocr_pages.append(
                Fpdf2PageInfo(
                    hocr_path=ocr_output,
                    pageno=pageno,
                    autorotate_correction=autorotate_correction,
                    emplaced_page=emplaced_page,
                    dpi=self.pdfinfo[pageno].dpi.to_scalar(),
                )
            )
def finalize(self):
    """Render any queued fpdf2 pages, then save and close the base PDF.

    Queued hOCR pages are parsed first; whatever parsed pages result are
    rendered and grafted. Pages may have been queued as hOCR files or as
    parsed trees (or not at all), but never as a mix of both.

    Returns:
        Path of the saved output file.
    """
    # Can have hocr OR parsed pages OR neither (no OCR), but not both
    assert not (
        self.fpdf2_hocr_pages and self.fpdf2_parsed_pages
    ), "Can't have both hocr and ocrtree pages"

    if self.fpdf2_hocr_pages:
        self.fpdf2_parsed_pages = self._parse_hocr_pages()
    if self.fpdf2_parsed_pages:
        self._render_and_graft_fpdf2_pages()

    saved_path = self.output_file
    self.pdf_base.save(saved_path)
    self.pdf_base.close()
    return saved_path
def _parse_hocr_pages(self):
    """Parse every queued hOCR file into an ``Fpdf2ParsedPage``.

    Zero-byte hOCR files are skipped, so the result may be shorter than the
    queue.

    Returns:
        The parsed pages, in queue order.
    """
    from ocrmypdf.hocrtransform.hocr_parser import HocrParser

    log.info(
        "Parsing %d pages with HocrParser",
        len(self.fpdf2_hocr_pages),
    )

    # Evaluated lazily, so each file is checked just before it is parsed.
    non_empty = (
        info for info in self.fpdf2_hocr_pages if info.hocr_path.stat().st_size != 0
    )

    parsed_pages: list[Fpdf2ParsedPage] = []
    for info in non_empty:
        tree = HocrParser(info.hocr_path).parse()
        # Prefer the DPI recorded in the hOCR, then the DPI stored with the
        # queued page, and finally VECTOR_PAGE_DPI.
        dpi = tree.dpi or info.dpi or float(VECTOR_PAGE_DPI)
        parsed_pages.append(
            Fpdf2ParsedPage(
                pageno=info.pageno,
                ocr_tree=tree,
                dpi=dpi,
                autorotate_correction=info.autorotate_correction,
                emplaced_page=info.emplaced_page,
            )
        )
    return parsed_pages
def _render_and_graft_fpdf2_pages(self):
    """Render all parsed pages into one PDF and graft each onto the base PDF.

    The parsed pages are rendered to a single multi-page PDF as invisible
    text. Page ``i`` of that PDF is grafted onto the base page named by the
    ``i``-th parsed page, whose /Rotate is then updated. The multi-page PDF
    is deleted afterwards unless temporary files are being kept.
    """
    font_dir = Path(__file__).parent / "data"
    rendered_path = self.context.get_path('fpdf2_multipage.pdf')

    from ocrmypdf.font import MultiFontManager
    from ocrmypdf.fpdf_renderer import Fpdf2MultiPageRenderer

    font_manager = MultiFontManager(font_dir)

    # The renderer takes (pageno, ocr_tree, dpi) tuples.
    renderer = Fpdf2MultiPageRenderer(
        pages_data=[
            (page.pageno, page.ocr_tree, page.dpi) for page in self.fpdf2_parsed_pages
        ],
        multi_font_manager=font_manager,
        invisible_text=True,
    )
    renderer.render(rendered_path)

    with Pdf.open(rendered_path) as pdf_text:
        for index, page in enumerate(self.fpdf2_parsed_pages):
            text_page = pdf_text.pages[index]
            rotation_inputs = (
                self.pdfinfo[page.pageno].rotation,
                page.autorotate_correction,
                page.emplaced_page,
            )
            self._graft_fpdf2_text_layer(
                page.pageno,
                text_page,
                _compute_text_misalignment(*rotation_inputs),
            )
            self.pdf_base.pages[page.pageno].Rotate = _compute_page_rotation(
                *rotation_inputs
            )

    # Clean up multi-page PDF if not keeping temp files
    if not self.context.options.keep_temporary_files:
        with suppress(FileNotFoundError):
            rendered_path.unlink()
def _graft_fpdf2_text_layer(self, pageno: int, text_page: Page, text_rotation: int):
    """Graft a single text page onto the base PDF.

    Similar to _graft_sandwich_text_layer but works with an already-rendered
    pikepdf Page instead of a file path. The text page's content becomes a
    Form XObject in the base page's resources, drawn through a small new
    content stream that is added to the base page.

    Args:
        pageno: Zero-based page number.
        text_page: The text-only PDF page to graft.
        text_rotation: Rotation to apply to align text with content (degrees).
    """
    from pikepdf import Array

    base_page = self.pdf_base.pages[pageno]

    # Extract content stream from text_page
    text_contents = text_page.Contents.read_bytes()

    # Get the mediabox from the text page
    mediabox = Array([float(x) for x in text_page.mediabox])  # type: ignore[misc]
    wt = float(mediabox[2]) - float(mediabox[0])
    ht = float(mediabox[3]) - float(mediabox[1])

    # Get base page mediabox
    base_mediabox = base_page.mediabox
    wp = float(base_mediabox[2]) - float(base_mediabox[0])
    hp = float(base_mediabox[3]) - float(base_mediabox[1])

    # Create Form XObject from text page content
    base_resources = _ensure_dictionary(base_page.obj, Name.Resources)
    base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
    # Random name prefixed "OCR-" for the new XObject entry.
    text_xobj_name = Name.random(prefix="OCR-")
    xobj = self.pdf_base.make_stream(text_contents)
    base_xobjs[text_xobj_name] = xobj
    xobj.Type = Name.XObject
    xobj.Subtype = Name.Form
    xobj.FormType = 1
    xobj.BBox = base_mediabox

    # Copy resources from text page's Resources to xobj
    # We need to handle this carefully since text_page is from a foreign PDF
    if hasattr(text_page, 'Resources') and text_page.Resources:
        # Create empty Resources dictionary for xobj
        xobj_resources = _ensure_dictionary(xobj, Name.Resources)

        # Copy fonts if they exist
        if Name.Font in text_page.Resources:
            xobj_fonts = _ensure_dictionary(xobj_resources, Name.Font)
            text_fonts = text_page.Resources[Name.Font]
            # Copy each font from the foreign PDF
            for font_name, font_obj in text_fonts.items():
                xobj_fonts[font_name] = self.pdf_base.copy_foreign(font_obj)

        # Copy ExtGState (graphics state) if it exists - needed for transparency
        if Name.ExtGState in text_page.Resources:
            xobj_extstates = _ensure_dictionary(xobj_resources, Name.ExtGState)
            text_extstates = text_page.Resources[Name.ExtGState]
            # Copy each graphics state from the foreign PDF
            for gs_name, gs_obj in text_extstates.items():
                xobj_extstates[gs_name] = self.pdf_base.copy_foreign(gs_obj)

    # Build transformation matrix for rotation and scaling
    ctm = _build_text_layer_ctm(
        wt,
        ht,
        wp,
        hp,
        float(base_mediabox[0]),
        float(base_mediabox[1]),
        text_rotation,
    )

    # Draw the XObject inside q/Q; the cm operator is emitted only when a
    # non-identity matrix was returned.
    if ctm is not None:
        pdf_draw_xobj = (
            (b'q %s cm\n' % ctm.encode()) + (b'%s Do\n' % text_xobj_name) + b'Q\n'
        )
    else:
        pdf_draw_xobj = b'q\n' + (b'%s Do\n' % text_xobj_name) + b'\nQ\n'

    new_text_layer = Stream(self.pdf_base, pdf_draw_xobj)

    # Strip old invisible text if redo mode is enabled
    if self.context.options.mode == ProcessingMode.redo:
        strip_invisible_text(self.pdf_base, base_page)

    # Add text layer to base page
    base_page.contents_coalesce()
    # Prepending puts the text layer first in the content stream when the
    # render mode is UNDERNEATH.
    base_page.contents_add(
        new_text_layer, prepend=self.render_mode == RenderMode.UNDERNEATH
    )
    base_page.contents_coalesce()
def _graft_sandwich_text_layer(
    self,
    *,
    pageno: int,
    textpdf: Path,
    text_rotation: int,
):
    """Graft a pre-rendered text-only PDF onto the base PDF.

    This is used by the sandwich renderer which generates PDFs directly
    from Tesseract rather than going through hOCR.

    The content stream of the text PDF's first page is wrapped in a Form
    XObject, registered in the base page's resources under a random
    ``OCR-`` name, and drawn by a small content stream added to the page.

    Args:
        pageno: Index into ``self.pdf_base.pages`` of the page to modify.
        textpdf: Path to the text-only PDF; only its first page is used.
        text_rotation: Rotation value forwarded to ``_build_text_layer_ctm``.

    Returns:
        None. Returns without changes if ``textpdf`` is zero bytes long or
        if opening/reading it raises ``FileNotFoundError`` or ``PdfError``.
    """
    from pikepdf import PdfError

    log.debug("Grafting sandwich text layer")
    # NOTE(review): this stat() is outside the try block, so a missing file
    # raises FileNotFoundError here rather than being swallowed below.
    if Path(textpdf).stat().st_size == 0:
        return
    try:
        with Pdf.open(textpdf) as pdf_text:
            pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()
            base_page = self.pdf_base.pages[pageno]
            # Get font from the text PDF
            pdf_text_fonts = pdf_text.pages[0].Resources.get(
                Name.Font, Dictionary()
            )
            font = None
            font_key = None
            # Only these two resource names are probed; the first one found
            # is copied into the base PDF and the search stops.
            for f in ('/f-0-0', '/F1'):
                pdf_text_font = pdf_text_fonts.get(f, None)
                if pdf_text_font is not None:
                    font_key = Name(f)
                    font = self.pdf_base.copy_foreign(pdf_text_font)
                    break
            # Get mediabox dimensions for rotation calculations
            mediabox = pdf_text.pages[0].mediabox
            wt = float(mediabox[2]) - float(mediabox[0])
            ht = float(mediabox[3]) - float(mediabox[1])
            base_mediabox = base_page.mediabox
            wp = float(base_mediabox[2]) - float(base_mediabox[0])
            hp = float(base_mediabox[3]) - float(base_mediabox[1])
            # Build transformation matrix for rotation and scaling
            ctm = _build_text_layer_ctm(
                wt,
                ht,
                wp,
                hp,
                float(base_mediabox[0]),
                float(base_mediabox[1]),
                text_rotation,
            )
            log.debug("Grafting with ctm %r", ctm)
            # Create Form XObject
            base_resources = _ensure_dictionary(base_page.obj, Name.Resources)
            base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
            text_xobj_name = Name.random(prefix="OCR-")
            xobj = self.pdf_base.make_stream(pdf_text_contents)
            base_xobjs[text_xobj_name] = xobj
            xobj.Type = Name.XObject
            xobj.Subtype = Name.Form
            xobj.FormType = 1
            xobj.BBox = base_mediabox
            # Add font to xobj resources
            if font_key is not None and font is not None:
                xobj_resources = _ensure_dictionary(xobj, Name.Resources)
                xobj_fonts = _ensure_dictionary(xobj_resources, Name.Font)
                if font_key not in xobj_fonts:
                    xobj_fonts[font_key] = font
            # Content stream that draws the XObject inside a q/Q pair,
            # applying the CTM first when one was produced.
            if ctm is not None:
                pdf_draw_xobj = (
                    (b'q %s cm\n' % ctm.encode())
                    + (b'%s Do\n' % text_xobj_name)
                    + b'\nQ\n'
                )
            else:
                pdf_draw_xobj = b'q\n' + (b'%s Do\n' % text_xobj_name) + b'\nQ\n'
            new_text_layer = Stream(self.pdf_base, pdf_draw_xobj)
            # In redo mode, strip the existing invisible text before adding
            # the new layer.
            if self.context.options.mode == ProcessingMode.redo:
                strip_invisible_text(self.pdf_base, base_page)
            base_page.contents_coalesce()
            # Prepended when the render mode is UNDERNEATH, appended otherwise.
            base_page.contents_add(
                new_text_layer, prepend=self.render_mode == RenderMode.UNDERNEATH
            )
            base_page.contents_coalesce()
            # Add font to page resources
            if font_key is not None and font is not None:
                page_resources = _ensure_dictionary(base_page.obj, Name.Resources)
                page_fonts = _ensure_dictionary(page_resources, Name.Font)
                if font_key not in page_fonts:
                    page_fonts[font_key] = font
    except (FileNotFoundError, PdfError):
        # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
        pass
================================================
FILE: src/ocrmypdf/_jobcontext.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Defines context objects that are passed to child processes/threads."""
from __future__ import annotations
from collections.abc import Iterator
from pathlib import Path
from typing import TYPE_CHECKING
from ocrmypdf._options import OcrOptions
from ocrmypdf.pdfinfo import PdfInfo
from ocrmypdf.pdfinfo.info import PageInfo
if TYPE_CHECKING:
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
class PdfContext:
    """Holds the context for a particular run of the pipeline.

    Bundles the options, working folder, input origin, PDF information and
    plugin manager shared by every page of one document.
    """

    options: OcrOptions  #: The specified options for processing this PDF.
    origin: Path  #: The filename of the original input file.
    pdfinfo: PdfInfo  #: Detailed data for this PDF.
    plugin_manager: OcrmypdfPluginManager  #: PluginManager for processing the current PDF.

    def __init__(
        self,
        options: OcrOptions,
        work_folder: Path,
        origin: Path,
        pdfinfo: PdfInfo,
        plugin_manager,
    ):
        self.options, self.work_folder = options, work_folder
        self.origin, self.pdfinfo = origin, pdfinfo
        self.plugin_manager = plugin_manager

    def get_path(self, name: str) -> Path:
        """Generate a ``Path`` for an intermediate file involved in processing.

        The returned path is ``name`` joined onto the work folder, which is
        common to all processing of this particular PDF.
        """
        intermediate = self.work_folder / name
        return intermediate

    def get_page_contexts(self) -> Iterator[PageContext]:
        """Get all ``PageContext`` for this PDF, one per entry in ``pdfinfo``."""
        yield from (PageContext(self, index) for index in range(len(self.pdfinfo)))

    def get_page_context_args(self) -> Iterator[tuple[PageContext]]:
        """Get all ``PageContext`` for this PDF packaged in tuple for args-splatting."""
        yield from (
            (PageContext(self, index),) for index in range(len(self.pdfinfo))
        )
class PageContext:
    """Holds our context for a page.

    Must be pickle-able, so stores only intrinsic/simple data elements or those
    capable of their serializing themselves via ``__getstate__``.

    Note: The options object is carried through pickling as a JSON string
    (``options_json``) and rebuilt on the receiving side.
    """

    origin: Path  #: The filename of the original input file.
    pageno: int  #: This page number (zero-based).
    pageinfo: PageInfo  #: Information on this page.
    plugin_manager: OcrmypdfPluginManager  #: PluginManager for processing the current PDF.

    def __init__(self, pdf_context: PdfContext, pageno):
        self.work_folder = pdf_context.work_folder
        self.origin = pdf_context.origin
        # The options object itself is kept (not an argparse Namespace)
        self.options = pdf_context.options
        self.pageno = pageno
        self.pageinfo = pdf_context.pdfinfo[pageno]
        self.plugin_manager = pdf_context.plugin_manager
        # Deliberately hold no reference back to the PdfContext
        self._pdf_context = None

    def get_path(self, name: str) -> Path:
        """Generate a ``Path`` for a file that is part of processing this page.

        The file lives in the common work folder and is prefixed with the
        one-based page number, zero-padded to six digits.
        """
        one_based = self.pageno + 1
        return self.work_folder / f"{one_based:06d}_{name}"

    def __getstate__(self):
        """Return picklable state with ``options`` replaced by its JSON form."""
        dropped = ('options', '_pdf_context')
        state = {
            key: value for key, value in self.__dict__.items() if key not in dropped
        }
        state['options_json'] = self.options.model_dump_json_safe()
        return state

    def __setstate__(self, state):
        """Restore attributes and rebuild ``options`` from ``options_json``."""
        vars(self).update(state)
        if 'options_json' not in state:
            # No serialized options present: leave whatever was restored
            # as-is for compatibility (shouldn't happen in normal operation).
            return
        from ocrmypdf._options import OcrOptions

        self.options = OcrOptions.model_validate_json_safe(state['options_json'])
================================================
FILE: src/ocrmypdf/_logging.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Logging support classes."""
from __future__ import annotations
import logging
from rich.console import Console
from rich.logging import RichHandler
class PageNumberFilter(logging.Filter):
    """Insert PDF page number that emitted log message to log record.

    Integer page numbers are rewritten as a right-aligned, five-wide string
    followed by a space; a missing page number becomes the empty string.
    Any other value is left untouched. Records are never rejected.
    """

    def filter(self, record):
        current = getattr(record, 'pageno', None)
        if current is None:
            record.pageno = ''
        elif isinstance(current, int):
            record.pageno = f'{current:5d} '
        return True
class RichLoggingHandler(RichHandler):
    """``RichHandler`` preconfigured with level, time and markup turned off.

    Args:
        console: The rich ``Console`` the handler writes to.
        **kwargs: Forwarded unchanged to ``RichHandler.__init__``.
    """

    def __init__(self, console: Console, **kwargs):
        # show_level/show_time/markup are fixed here; passing any of them in
        # kwargs as well would be a duplicate keyword argument.
        super().__init__(
            console=console, show_level=False, show_time=False, markup=False, **kwargs
        )
================================================
FILE: src/ocrmypdf/_metadata.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""OCRmyPDF page processing pipeline functions."""
from __future__ import annotations
import datetime as dt
import logging
import os
from pathlib import Path
from typing import Any
from pikepdf import Dictionary, Name, Pdf
from pikepdf import __version__ as PIKEPDF_VERSION
from pikepdf.models.metadata import PdfMetadata, encode_pdf_date
from ocrmypdf._defaults import PROGRAM_NAME
from ocrmypdf._jobcontext import PdfContext
from ocrmypdf._version import __version__ as OCRMYPF_VERSION
from ocrmypdf.languages import iso_639_2_from_3
log = logging.getLogger(__name__)
def get_docinfo(base_pdf: Pdf, context: PdfContext) -> dict[str, str]:
    """Read the document info and store it in a dictionary.

    Title, Author, Keywords, Subject and CreationDate are taken from
    ``base_pdf.docinfo`` (empty string when unavailable). Truthy values of
    the matching options replace the first four. Creator, Producer and
    ModDate are always generated here.
    """
    options = context.options

    def _existing(entry):
        # A missing key or an unusable DocumentInfo degrades to ''
        try:
            return str(base_pdf.docinfo[entry])
        except (KeyError, TypeError):
            return ''

    pdfmark = {}
    for entry in ('/Title', '/Author', '/Keywords', '/Subject', '/CreationDate'):
        pdfmark[entry] = _existing(entry)

    option_overrides = (
        ('/Title', 'title'),
        ('/Author', 'author'),
        ('/Keywords', 'keywords'),
        ('/Subject', 'subject'),
    )
    for entry, option_name in option_overrides:
        override = getattr(options, option_name)
        if override:
            pdfmark[entry] = override

    engine = context.plugin_manager.get_ocr_engine(options=options)
    creator_tag = engine.creator_tag(options)
    pdfmark.update(
        {
            '/Creator': f'{PROGRAM_NAME} {OCRMYPF_VERSION} / {creator_tag}',
            '/Producer': f'pikepdf {PIKEPDF_VERSION}',
            '/ModDate': encode_pdf_date(dt.datetime.now(dt.UTC)),
        }
    )
    return pdfmark
def report_on_metadata(options, missing):
    """Log which input metadata fields were not carried over to the output.

    Args:
        options: Object whose ``output_type`` string selects the wording and
            log levels (values starting with ``pdfa`` are treated as PDF/A).
        missing: Collection of metadata field names that were not copied.
            Nothing is logged when it is empty.
    """
    if not missing:
        return
    if options.output_type.startswith('pdfa'):
        log.warning(
            "Some input metadata could not be copied because it is not "
            "permitted in PDF/A. You may wish to examine the output "
            "PDF's XMP metadata."
        )
        log.debug("The following metadata fields were not copied: %r", missing)
    else:
        # The two literals are concatenated by the parser, so the separating
        # space must be part of the first one.
        log.error(
            "Some input metadata could not be copied. "
            "You may wish to examine the output PDF's XMP metadata."
        )
        log.info("The following metadata fields were not copied: %r", missing)
def repair_docinfo_nuls(pdf):
    """If the DocumentInfo block contains NUL characters, remove them.

    If the DocumentInfo block is malformed, log an error and continue.

    Args:
        pdf: Object exposing a ``docinfo`` mapping; entries are rewritten
            in place.

    Returns:
        True if at least one entry was rewritten, otherwise False.
    """
    modified = False
    try:
        # Raised deliberately so the non-dictionary case shares the handler
        # below with type errors from the loop.
        if not isinstance(pdf.docinfo, Dictionary):
            raise TypeError("DocumentInfo is not a dictionary")
        for k, v in pdf.docinfo.items():
            # NOTE(review): relies on the docinfo values passing
            # isinstance(..., str) and supporting bytes(v) - confirm against
            # pikepdf's string objects.
            if isinstance(v, str) and b'\x00' in bytes(v):
                pdf.docinfo[k] = bytes(v).replace(b'\x00', b'')
                modified = True
    except TypeError:
        # TypeError can also be raised if dictionary items are unexpected types
        log.error("File contains a malformed DocumentInfo block - continuing anyway.")
    return modified
def should_linearize(working_file: Path, context: PdfContext) -> bool:
    """Determine whether the PDF should be linearized.

    For smaller files, linearization is not worth the effort: the file must
    be strictly larger than ``fast_web_view`` multiplied by 1,000,000 bytes.
    """
    threshold_bytes = context.options.fast_web_view * 1_000_000
    return os.path.getsize(working_file) > threshold_bytes
def _fix_metadata(meta_original: PdfMetadata, meta_pdf: PdfMetadata):
# If xmp:CreateDate is missing, set it to the modify date to
# ensure consistency with Ghostscript.
if 'xmp:CreateDate' not in meta_pdf:
meta_pdf['xmp:CreateDate'] = meta_pdf.get('xmp:ModifyDate', '')
if meta_pdf.get('dc:title') == 'Untitled' and ('dc:title' not in meta_original):
# Ghostscript likes to set title to Untitled if omitted from input.
# Reverse this, because PDF/A TechNote 0003:Metadata in PDF/A-1
# and the XMP Spec do not make this recommendation.
del meta_pdf['dc:title']
def _unset_empty_metadata(meta: PdfMetadata, options):
"""Unset metadata fields that were explicitly set to empty strings.
If the user explicitly specified an empty string for any of the
following, they should be unset and not reported as missing in
the output pdf. Note that some metadata fields use differing names
between PDF/A and PDF.
"""
if options.title == '' and 'dc:title' in meta:
del meta['dc:title'] # PDF/A and PDF
if options.author == '':
if 'dc:creator' in meta:
del meta['dc:creator'] # PDF/A (Not xmp:CreatorTool)
if 'pdf:Author' in meta:
del meta['pdf:Author'] # PDF
if options.subject == '':
if 'dc:description' in meta:
del meta['dc:description'] # PDF/A
if 'dc:subject' in meta:
del meta['dc:subject'] # PDF
if options.keywords == '' and 'pdf:Keywords' in meta:
del meta['pdf:Keywords'] # PDF/A and PDF
def _set_language(pdf: Pdf, languages: list[str]):
    """Set the language of the PDF.

    Only the first entry of ``languages`` is considered. Nothing is changed
    when the document root already has a ``/Lang`` entry, when no language
    is given, or when the language cannot be converted.
    """
    already_tagged = Name.Lang in pdf.Root
    if already_tagged or not languages:
        return  # Already set or can't change

    primary_language_iso639_3 = languages[0]
    converted = (
        iso_639_2_from_3(primary_language_iso639_3)
        if primary_language_iso639_3
        else None
    )
    if converted:
        pdf.Root.Lang = converted
class MetadataProgress:
    """Context manager and callable wrapper around a progress bar.

    Builds a 0-100 "Linearizing" progress bar from ``progressbar_class`` and
    forwards each call with a percentage to the bar's ``update`` method.
    """

    def __init__(self, progressbar_class, enable: bool = True):
        self.progressbar_class = progressbar_class
        bar_settings = {
            'total': 100,
            'desc': "Linearizing",
            'unit': '%',
            'disable': not enable,
        }
        self.progressbar = self.progressbar_class(**bar_settings)

    def __enter__(self):
        self.progressbar.__enter__()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Propagate the bar's own verdict on exception suppression
        outcome = self.progressbar.__exit__(exc_type, exc_value, traceback)
        return outcome

    def __call__(self, percent: int):
        if self.progressbar_class:
            self.progressbar.update(completed=percent)
def metadata_fixup(
    working_file: Path, context: PdfContext, pdf_save_settings: dict[str, Any]
) -> Path:
    """Fix certain metadata fields whether PDF or PDF/A.

    Override some of Ghostscript's metadata choices.

    Also report on metadata in the input file that was not retained during
    conversion.

    Args:
        working_file: The PDF whose metadata is to be fixed; it is opened
            and re-saved under a new name, not modified in place.
        context: Supplies the options, the path of the original input
            (``context.origin``), the plugin manager and the output path.
        pdf_save_settings: Extra keyword arguments passed to ``pdf.save``.

    Returns:
        Path of the newly written ``metafix.pdf`` in the work folder.
    """
    output_file = context.get_path('metafix.pdf')
    options = context.options
    pbar_class = context.plugin_manager.get_progressbar_class()
    with (
        Pdf.open(context.origin) as original,
        Pdf.open(working_file) as pdf,
        MetadataProgress(pbar_class, options.progress_bar) as pbar,
    ):
        # Document info comes from the original input, with option overrides
        docinfo = get_docinfo(original, context)
        with (
            original.open_metadata(
                set_pikepdf_as_editor=False, update_docinfo=False, strict=False
            ) as meta_original,
            pdf.open_metadata() as meta_pdf,
        ):
            meta_pdf.load_from_docinfo(
                docinfo, delete_missing=False, raise_failure=False
            )
            _fix_metadata(meta_original, meta_pdf)
            # Applied to both sides so that fields the user blanked on
            # purpose do not show up in the "missing" difference below.
            _unset_empty_metadata(meta_original, options)
            _unset_empty_metadata(meta_pdf, options)
            meta_missing = set(meta_original.keys()) - set(meta_pdf.keys())
            report_on_metadata(options, meta_missing)
        # Saved after the metadata editing block has exited, while ``pdf``
        # and the progress bar are still open.
        _set_language(pdf, options.languages)
        pdf.save(output_file, progress=pbar, **pdf_save_settings)
    return output_file
================================================
FILE: src/ocrmypdf/_options.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Internal options model for OCRmyPDF."""
from __future__ import annotations
import json
import logging
import os
import shlex
import unicodedata
from collections.abc import Sequence
from enum import StrEnum
from io import IOBase
from pathlib import Path
from typing import Any, BinaryIO
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from ocrmypdf._defaults import DEFAULT_LANGUAGE, DEFAULT_ROTATE_PAGES_THRESHOLD
from ocrmypdf.exceptions import BadArgsError
from ocrmypdf.helpers import monotonic
# Import plugin option models - these will be available after plugins are loaded
# We'll use forward references and handle imports dynamically
log = logging.getLogger(__name__)
# Module-level registry for plugin option models
# This is populated by setup_plugin_infrastructure() after plugins are loaded
_plugin_option_models: dict[str, type] = {}
PathOrIO = BinaryIO | IOBase | Path | str | bytes
class ProcessingMode(StrEnum):
    """OCR processing mode for handling pages with existing text.

    This enum controls how OCRmyPDF handles pages that already contain text:

    - ``default``: Error if text is found (standard OCR behavior)
    - ``force``: Rasterize all content and run OCR regardless of existing text
    - ``skip``: Skip OCR on pages that already have text
    - ``redo``: Re-OCR pages, stripping old invisible text layer

    Each member's value is the same string as its name.
    """

    default = 'default'
    force = 'force'
    skip = 'skip'
    redo = 'redo'
class TaggedPdfMode(StrEnum):
    """Control behavior when encountering a Tagged PDF.

    Tagged PDFs often indicate documents generated from office applications
    that may not need OCR. This enum controls how OCRmyPDF handles them:

    - ``default``: Error if ProcessingMode is default, otherwise warn
    - ``ignore``: Always warn but continue processing (never error)

    Each member's value is the same string as its name.
    """

    default = 'default'
    ignore = 'ignore'
def _pages_from_ranges(ranges: str) -> set[int]:
    """Convert page range string to set of page numbers.

    The input is a comma-separated list of one-based page numbers and
    ``start-end`` ranges (spaces ignored, empty items skipped). The result
    holds zero-based page numbers.

    Raises:
        BadArgsError: For an empty or malformed ``start-end`` range, when no
            pages were recognized at all, or when a page number below 1 is
            referenced.
        ValueError: When an item that is not of the ``start-end`` form is
            not an integer.
    """
    collected: list[int] = []
    for item in ranges.replace(' ', '').split(','):
        if not item:
            continue
        bounds = item.split('-')
        if len(bounds) != 2:
            # Not exactly one dash: treat the whole item as a single page
            collected.append(int(item) - 1)
            continue
        first, last = bounds
        try:
            span = list(range(int(first) - 1, int(last)))
            if not span:
                raise BadArgsError(
                    f"invalid page subrange '{first}-{last}'"
                ) from None
            collected.extend(span)
        except ValueError:
            raise BadArgsError(f"invalid page subrange '{item}'") from None

    if not collected:
        raise BadArgsError(
            f"The string of page ranges '{ranges}' did not contain any recognizable "
            f"page ranges."
        )
    if not monotonic(collected):
        log.warning(
            "List of pages to process contains duplicate pages, or pages that are "
            "out of order"
        )
    if any(number < 0 for number in collected):
        raise BadArgsError("pages refers to a page number less than 1")
    log.debug("OCRing only these pages: %s", collected)
    return set(collected)
class OcrOptions(BaseModel):
    """Internal options model that can masquerade as argparse.Namespace.

    This model provides proper typing and validation while maintaining
    compatibility with existing code that expects argparse.Namespace behavior.

    Besides the declared fields it offers:

    - read-only ``force_ocr``/``skip_text``/``redo_ocr`` aliases derived
      from ``mode``, and a ``jpeg_quality`` alias for ``jpg_quality``;
    - ``extra_attrs`` for values that are not declared fields;
    - attribute access to registered plugin namespaces through
      ``__getattr__`` (for example ``options.tesseract``);
    - a JSON round trip (``model_dump_json_safe`` /
      ``model_validate_json_safe``) that tolerates paths, streams and sets.
    """

    # I/O options
    input_file: PathOrIO
    output_file: PathOrIO
    sidecar: PathOrIO | None = None
    output_folder: Path | None = None
    work_folder: Path | None = None

    # Core OCR options
    languages: list[str] = Field(default_factory=lambda: [DEFAULT_LANGUAGE])
    output_type: str = 'auto'
    mode: ProcessingMode = ProcessingMode.default

    # Backward compatibility properties for force_ocr, skip_text, redo_ocr
    @property
    def force_ocr(self) -> bool:
        """Backward compatibility alias for mode == ProcessingMode.force."""
        return self.mode == ProcessingMode.force

    @property
    def skip_text(self) -> bool:
        """Backward compatibility alias for mode == ProcessingMode.skip."""
        return self.mode == ProcessingMode.skip

    @property
    def redo_ocr(self) -> bool:
        """Backward compatibility alias for mode == ProcessingMode.redo."""
        return self.mode == ProcessingMode.redo

    # Job control
    jobs: int | None = None
    use_threads: bool = True
    progress_bar: bool = True
    quiet: bool = False
    verbose: int = 0
    keep_temporary_files: bool = False

    # Image processing
    image_dpi: int | None = None
    deskew: bool = False
    clean: bool = False
    clean_final: bool = False
    rotate_pages: bool = False
    remove_background: bool = False
    remove_vectors: bool = False
    oversample: int = 0
    unpaper_args: list[str] | None = None

    # OCR behavior
    skip_big: float | None = None
    pages: str | set[int] | None = None  # Can be string or set after validation
    invalidate_digital_signatures: bool = False
    tagged_pdf_mode: TaggedPdfMode = TaggedPdfMode.default

    # Metadata
    title: str | None = None
    author: str | None = None
    subject: str | None = None
    keywords: str | None = None

    # Optimization (jbig2_threshold is also accessible via options.optimize)
    optimize: int = 1
    jpg_quality: int | None = None
    png_quality: int | None = None
    jbig2_threshold: float = 0.85

    # Compatibility alias for plugins that expect jpeg_quality
    @property
    def jpeg_quality(self):
        """Compatibility alias for jpg_quality."""
        return self.jpg_quality

    @jpeg_quality.setter
    def jpeg_quality(self, value):
        """Compatibility alias for jpg_quality."""
        self.jpg_quality = value

    # Output behavior
    no_overwrite: bool = False

    # Advanced options
    max_image_mpixels: float = 250.0
    pdf_renderer: str = 'auto'
    ocr_engine: str = 'auto'
    rasterizer: str = 'auto'
    rotate_pages_threshold: float = DEFAULT_ROTATE_PAGES_THRESHOLD
    user_words: os.PathLike | None = None
    user_patterns: os.PathLike | None = None
    fast_web_view: float = 1.0
    continue_on_soft_render_error: bool | None = None

    # Tesseract options - also accessible via options.tesseract.
    tesseract_config: list[str] = []
    tesseract_pagesegmode: int | None = None
    tesseract_oem: int | None = None
    tesseract_thresholding: int | None = None
    tesseract_timeout: float | None = None
    tesseract_non_ocr_timeout: float | None = None
    tesseract_downsample_above: int = 32767
    tesseract_downsample_large_images: bool | None = None

    # Ghostscript options - also accessible via options.ghostscript.
    pdfa_image_compression: str | None = None
    color_conversion_strategy: str = "LeaveColorUnchanged"

    # Plugin system
    plugins: Sequence[Path | str] | None = None

    # Store any extra attributes (for plugins and dynamic options)
    extra_attrs: dict[str, Any] = Field(
        default_factory=dict, exclude=True, alias='_extra_attrs'
    )

    @field_validator('languages')
    @classmethod
    def validate_languages(cls, v):
        """Ensure languages list is not empty."""
        if not v:
            return [DEFAULT_LANGUAGE]
        return v

    @field_validator('output_type')
    @classmethod
    def validate_output_type(cls, v):
        """Validate output type is one of the allowed values."""
        valid_types = {'auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}
        if v not in valid_types:
            raise ValueError(f"output_type must be one of {valid_types}")
        return v

    @field_validator('pdf_renderer')
    @classmethod
    def validate_pdf_renderer(cls, v):
        """Validate PDF renderer is one of the allowed values."""
        valid_renderers = {'auto', 'sandwich', 'fpdf2'}
        # Legacy hocr/hocrdebug are accepted but redirected to fpdf2
        legacy_renderers = {'hocr', 'hocrdebug'}
        all_accepted = valid_renderers | legacy_renderers
        if v not in all_accepted:
            raise ValueError(f"pdf_renderer must be one of {all_accepted}")
        return v

    @field_validator('rasterizer')
    @classmethod
    def validate_rasterizer(cls, v):
        """Validate rasterizer is one of the allowed values."""
        valid_rasterizers = {'auto', 'ghostscript', 'pypdfium'}
        if v not in valid_rasterizers:
            raise ValueError(f"rasterizer must be one of {valid_rasterizers}")
        return v

    @field_validator('clean_final')
    @classmethod
    def validate_clean_final(cls, v, info):
        """If clean_final is True, also set clean to True."""
        if v and hasattr(info, 'data') and 'clean' in info.data:
            info.data['clean'] = True
        return v

    @field_validator('jobs')
    @classmethod
    def validate_jobs(cls, v):
        """Validate jobs is a reasonable number."""
        if v is not None and (v < 0 or v > 256):
            raise ValueError("jobs must be between 0 and 256")
        return v

    @field_validator('verbose')
    @classmethod
    def validate_verbose(cls, v):
        """Validate verbose level."""
        if v < 0 or v > 2:
            raise ValueError("verbose must be between 0 and 2")
        return v

    @field_validator('oversample')
    @classmethod
    def validate_oversample(cls, v):
        """Validate oversample DPI."""
        if v < 0 or v > 5000:
            raise ValueError("oversample must be between 0 and 5000")
        return v

    @field_validator('max_image_mpixels')
    @classmethod
    def validate_max_image_mpixels(cls, v):
        """Validate max image megapixels."""
        if v < 0:
            raise ValueError("max_image_mpixels must be non-negative")
        return v

    @field_validator('rotate_pages_threshold')
    @classmethod
    def validate_rotate_pages_threshold(cls, v):
        """Validate rotate pages threshold."""
        if v < 0 or v > 1000:
            raise ValueError("rotate_pages_threshold must be between 0 and 1000")
        return v

    @field_validator('title', 'author', 'keywords', 'subject')
    @classmethod
    def validate_metadata_unicode(cls, v):
        """Validate metadata strings don't contain unsupported Unicode characters.

        Private-use characters (category ``Co``) and any code point at or
        above U+10000 are rejected.
        """
        if v is None:
            return v
        for char in v:
            if unicodedata.category(char) == 'Co' or ord(char) >= 0x10000:
                hexchar = hex(ord(char))[2:].upper()
                raise ValueError(
                    f"Metadata string contains unsupported Unicode character: "
                    f"{char} (U+{hexchar})"
                )
        return v

    @field_validator('pages')
    @classmethod
    def validate_pages_format(cls, v):
        """Convert page ranges string to set of page numbers."""
        if v is None:
            return v
        if isinstance(v, set):
            return v  # Already processed
        # Convert string ranges to set of page numbers
        return _pages_from_ranges(v)

    @field_validator('unpaper_args', mode='before')
    @classmethod
    def validate_unpaper_args(cls, v):
        """Normalize unpaper_args from string to list and validate security.

        A string is split with shell-like rules. Any argument containing
        ``/`` or equal to ``.`` or ``..`` is rejected as a filename.
        """
        if v is None:
            return v
        if isinstance(v, str):
            v = shlex.split(v)
        if isinstance(v, list):
            if any(('/' in arg or arg == '.' or arg == '..') for arg in v):
                raise ValueError('No filenames allowed in --unpaper-args')
            return v
        raise ValueError(f'unpaper_args must be a string or list, got {type(v)}')

    @model_validator(mode='before')
    @classmethod
    def handle_special_cases(cls, data):
        """Handle special cases for API compatibility and legacy options.

        When ``data`` is a dict it is updated in place: a placeholder
        ``output_file`` is supplied if only ``output_folder`` is given, and
        the legacy ``force_ocr``/``skip_text``/``redo_ocr`` keys are removed
        and translated into ``mode``.

        Raises:
            ValueError: If more than one legacy flag is truthy, or if a
                legacy flag contradicts an explicitly given non-default
                ``mode``.
        """
        if isinstance(data, dict):
            # For hOCR API, output_file might not be present
            if 'output_folder' in data and 'output_file' not in data:
                data['output_file'] = '/dev/null'  # Placeholder

            # Convert legacy boolean options (force_ocr, skip_text, redo_ocr) to mode
            force = data.pop('force_ocr', None)
            skip = data.pop('skip_text', None)
            redo = data.pop('redo_ocr', None)

            # Each legacy option with its mode and its real command line
            # flag; the flag cannot be derived from the mode name because
            # the skip mode's flag is --skip-text.
            legacy_set = [
                (force, ProcessingMode.force, '--force-ocr'),
                (skip, ProcessingMode.skip, '--skip-text'),
                (redo, ProcessingMode.redo, '--redo-ocr'),
            ]
            legacy_true = [(mode, flag) for val, mode, flag in legacy_set if val]
            legacy_count = len(legacy_true)

            # Get current mode value (may be string or enum)
            current_mode = data.get('mode', ProcessingMode.default)
            if isinstance(current_mode, str):
                current_mode = ProcessingMode(current_mode)
            mode_is_set = current_mode != ProcessingMode.default

            if legacy_count > 1:
                raise ValueError(
                    "Choose only one of --force-ocr, --skip-text, --redo-ocr."
                )
            if legacy_count == 1:
                expected_mode, legacy_flag = legacy_true[0]
                if mode_is_set and current_mode != expected_mode:
                    raise ValueError(
                        f"Conflicting options: --mode {current_mode.value} "
                        f"cannot be used with {legacy_flag}."
                    )
                # Set mode from legacy option
                data['mode'] = expected_mode
        return data

    @model_validator(mode='after')
    def validate_redo_ocr_options(self):
        """Validate options compatible with redo mode."""
        if self.mode == ProcessingMode.redo and (
            self.deskew or self.clean_final or self.remove_background
        ):
            raise ValueError(
                "--redo-ocr (or --mode redo) is not currently compatible with "
                "--deskew, --clean-final, and --remove-background"
            )
        return self

    @model_validator(mode='after')
    def validate_output_type_compatibility(self):
        """Validate output type is compatible with output file."""
        if self.output_type == 'none' and str(self.output_file) not in (
            os.devnull,
            '-',
        ):
            raise ValueError(
                "Since you specified `--output-type none`, the output file "
                f"{self.output_file} cannot be produced. Set the output file to "
                f"`-` to suppress this message."
            )
        return self

    @property
    def lossless_reconstruction(self):
        """Determine lossless_reconstruction based on other options.

        True only when none of deskew, clean_final, force mode or
        remove_background is in effect.
        """
        lossless = not any(
            [
                self.deskew,
                self.clean_final,
                self.mode == ProcessingMode.force,
                self.remove_background,
            ]
        )
        return lossless

    def model_dump_json_safe(self) -> str:
        """Serialize to JSON with special handling for non-serializable types.

        Paths and sets are written as tagged objects so they can be rebuilt
        by ``model_validate_json_safe``; stream-like objects are replaced by
        a placeholder. Fields whose value is ``None`` are omitted, as are
        ``extra_attrs`` entries whose key starts with ``_plugin_cache_``.
        """
        # Create a copy of the model data for serialization
        data = self.model_dump()

        # Handle special types that don't serialize to JSON directly
        def _serialize_value(value):
            if isinstance(value, Path):
                return {'__type__': 'Path', 'value': str(value)}
            elif (
                isinstance(value, BinaryIO | IOBase)
                or hasattr(value, 'read')
                or hasattr(value, 'write')
            ):
                # Stream object - replace with placeholder
                return {'__type__': 'Stream', 'value': 'stream'}
            elif hasattr(value, '__class__') and 'Iterator' in value.__class__.__name__:
                # Handle Pydantic serialization iterators
                return {'__type__': 'Stream', 'value': 'stream'}
            elif isinstance(value, property):
                # Handle property objects that shouldn't be serialized
                return None
            elif isinstance(value, set | frozenset):
                # json cannot encode sets (e.g. the validated ``pages``
                # field); tag them so they are restored as a set.
                return {
                    '__type__': 'Set',
                    'value': [_serialize_value(item) for item in value],
                }
            elif isinstance(value, list | tuple):
                return [_serialize_value(item) for item in value]
            elif isinstance(value, dict):
                return {k: _serialize_value(v) for k, v in value.items()}
            else:
                return value

        # Process all fields
        serializable_data = {}
        for key, value in data.items():
            serialized_value = _serialize_value(value)
            if serialized_value is not None:  # Skip None values from properties
                serializable_data[key] = serialized_value

        # Add extra_attrs, excluding plugin cache entries (they'll be recreated lazily)
        if self.extra_attrs:
            filtered_extra = {
                k: v
                for k, v in self.extra_attrs.items()
                if not k.startswith('_plugin_cache_')
            }
            if filtered_extra:
                serializable_data['_extra_attrs'] = _serialize_value(filtered_extra)
        return json.dumps(serializable_data)

    @classmethod
    def model_validate_json_safe(cls, json_str: str) -> OcrOptions:
        """Reconstruct from JSON with special handling for non-serializable types.

        Inverse of ``model_dump_json_safe``: tagged paths and sets are
        rebuilt, stream placeholders stay plain strings, and the
        ``_extra_attrs`` entry is assigned to ``extra_attrs`` after the
        instance has been created.
        """
        data = json.loads(json_str)

        # Handle special types during deserialization
        def _deserialize_value(value):
            if isinstance(value, dict) and '__type__' in value:
                if value['__type__'] == 'Path':
                    return Path(value['value'])
                elif value['__type__'] == 'Stream':
                    # For streams, we'll use a placeholder string
                    return value['value']
                elif value['__type__'] == 'Set':
                    return {_deserialize_value(item) for item in value['value']}
                else:
                    return value['value']
            elif isinstance(value, list):
                return [_deserialize_value(item) for item in value]
            elif isinstance(value, dict):
                return {k: _deserialize_value(v) for k, v in value.items()}
            else:
                return value

        # Process all fields
        deserialized_data = {}
        extra_attrs = {}
        for key, value in data.items():
            if key == '_extra_attrs':
                extra_attrs = _deserialize_value(value)
            else:
                deserialized_data[key] = _deserialize_value(value)

        # Create instance
        instance = cls(**deserialized_data)
        instance.extra_attrs = extra_attrs
        return instance

    model_config = ConfigDict(
        extra="forbid",  # Force use of extra_attrs for unknown fields
        arbitrary_types_allowed=True,  # Allow BinaryIO, Path, etc.
        validate_assignment=True,  # Validate on attribute assignment
    )

    @classmethod
    def register_plugin_models(cls, models: dict[str, type]) -> None:
        """Register plugin option model classes for nested access.

        Args:
            models: Dictionary mapping namespace to model class
        """
        # The module-level registry is mutated in place, so no ``global``
        # declaration is required.
        _plugin_option_models.update(models)

    def _get_plugin_options(self, namespace: str) -> Any:
        """Get or create a plugin options instance for the given namespace.

        This method creates plugin option instances lazily from flat field
        values and caches them in ``extra_attrs``.

        Args:
            namespace: The plugin namespace (e.g., 'tesseract', 'optimize')

        Returns:
            An instance of the plugin's option model.

        Raises:
            AttributeError: If the namespace has not been registered.
        """
        # Use extra_attrs to cache plugin option instances
        cache_key = f'_plugin_cache_{namespace}'
        if cache_key in self.extra_attrs:
            return self.extra_attrs[cache_key]
        if namespace not in _plugin_option_models:
            raise AttributeError(
                f"Plugin namespace '{namespace}' is not registered. "
                f"Ensure setup_plugin_infrastructure() was called."
            )
        model_class = _plugin_option_models[namespace]

        def _convert_value(value):
            """Convert value to be compatible with plugin model fields."""
            if isinstance(value, os.PathLike):
                return os.fspath(value)
            return value

        # Build kwargs from flat fields; None values are left out so the
        # plugin model's own defaults apply.
        kwargs = {}
        for field_name in model_class.model_fields:
            # Try namespace_field pattern first (e.g., tesseract_timeout)
            flat_name = f"{namespace}_{field_name}"
            if flat_name in OcrOptions.model_fields:
                value = getattr(self, flat_name)
                if value is not None:
                    kwargs[field_name] = _convert_value(value)
            # Also check direct field name (for fields like jbig2_lossy)
            elif field_name in OcrOptions.model_fields:
                value = getattr(self, field_name)
                if value is not None:
                    kwargs[field_name] = _convert_value(value)
            # Check for special mappings
            elif namespace == 'optimize' and field_name == 'level':
                # 'optimize' field maps to 'level' in OptimizeOptions
                if 'optimize' in OcrOptions.model_fields:
                    value = self.optimize
                    if value is not None:
                        kwargs[field_name] = _convert_value(value)
            elif namespace == 'optimize' and field_name == 'jpeg_quality':
                # jpg_quality maps to jpeg_quality
                if 'jpg_quality' in OcrOptions.model_fields:
                    value = self.jpg_quality
                    if value is not None:
                        kwargs[field_name] = _convert_value(value)

        # Create and cache the plugin options instance
        instance = model_class(**kwargs)
        self.extra_attrs[cache_key] = instance
        return instance

    def __getattr__(self, name: str) -> Any:
        """Support dynamic access to plugin option namespaces.

        This allows accessing plugin options like:
            options.tesseract.timeout
            options.optimize.level

        Plugin models must be registered via register_plugin_models() for
        namespace access to work. Built-in plugins register their models
        during initialization.

        Args:
            name: Attribute name

        Returns:
            Plugin options instance if name is a registered namespace, or
            the matching ``extra_attrs`` entry if there is one.

        Raises:
            AttributeError: For names starting with an underscore and for
                names that are neither a namespace nor in ``extra_attrs``.
        """
        # Check if this is a plugin namespace
        if name.startswith('_'):
            # Private attributes should not trigger plugin lookup
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}'"
            )
        # Try to get plugin options for this namespace
        if name in _plugin_option_models:
            return self._get_plugin_options(name)
        # Check extra_attrs (guarded via __dict__ to avoid re-entering
        # __getattr__ before the field exists)
        if 'extra_attrs' in self.__dict__ and name in self.extra_attrs:
            return self.extra_attrs[name]
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{name}'"
        )
================================================
FILE: src/ocrmypdf/_pipeline.py
================================================
# SPDX-FileCopyrightText: 2018-2022 James R. Barlow
# SPDX-FileCopyrightText: 2019 Martin Wind
# SPDX-License-Identifier: MPL-2.0
"""OCRmyPDF page processing pipeline functions."""
from __future__ import annotations
import logging
import os
import re
import sys
from collections.abc import Iterable, Iterator, Sequence
from contextlib import suppress
from io import BytesIO
from pathlib import Path
from shutil import copyfileobj
from typing import TYPE_CHECKING, Any, BinaryIO, TypeVar, cast
if TYPE_CHECKING:
from ocrmypdf.hocrtransform import OcrElement
import img2pdf
import pikepdf
from PIL import Image, ImageColor, ImageDraw
from ocrmypdf._concurrent import Executor
from ocrmypdf._exec import unpaper
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._metadata import repair_docinfo_nuls
from ocrmypdf._options import OcrOptions, ProcessingMode, TaggedPdfMode
from ocrmypdf.exceptions import (
DigitalSignatureError,
DpiError,
EncryptedPdfError,
InputFileError,
PriorOcrFoundError,
TaggedPDFError,
UnsupportedImageFormatError,
)
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution, safe_symlink
from ocrmypdf.pdfa import (
file_claims_pdfa,
generate_pdfa_ps,
speculative_pdfa_conversion,
)
from ocrmypdf.pdfinfo import Colorspace, Encoding, FloatRect, PageInfo, PdfInfo
from ocrmypdf.pluginspec import GhostscriptRasterDevice, OrientationConfidence
try:
    from pi_heif import register_heif_opener
except ImportError:

    def register_heif_opener():
        """Do nothing; stand-in used when pi_heif cannot be imported."""


# Generic type variable for this module.
T = TypeVar("T")
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# DPI used by the helpers below when a page has vector/text content or when
# no image resolution is available.
VECTOR_PAGE_DPI = 400
# Either the pi_heif implementation or the no-op fallback defined above.
register_heif_opener()
def triage_image_file(input_file: Path, output_file: Path, options: OcrOptions) -> None:
    """Check that the input is a usable image and convert it to a PDF.

    The image is opened with Pillow, its resolution, mode and ICC profile are
    checked, and it is then converted to ``output_file`` with img2pdf.

    Args:
        input_file: The path to the input file.
        output_file: The path the converted PDF is written to.
        options: The options passed to OCRmyPDF; ``input_file`` and
            ``image_dpi`` are read here.

    Raises:
        UnsupportedImageFormatError: If the file cannot be opened as an image,
            has an alpha channel (modes ``RGBA``/``LA``), is CMYK without an
            ICC profile, or img2pdf cannot open it.
        DpiError: If the image has no resolution (DPI) in its metadata or the
            resolution is not credible, and ``--image-dpi`` was not given.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except OSError as e:
        # Recover the original filename
        log.error(str(e).replace(str(input_file), str(options.input_file)))
        if not input_file.exists():
            log.error("Input file does not exist: %s", input_file)
        if input_file.is_dir():
            log.error("Input file is a directory: %s", input_file)
        if input_file.is_file():
            log.error("Input file is a file: %s", input_file)
            # Only stat() a path known to be a regular file, so that the
            # diagnostics cannot raise and mask the error reported below.
            if input_file.stat().st_size == 0:
                log.error("Input file is empty: %s", input_file)
        raise UnsupportedImageFormatError() from e

    with im:
        log.info("Input file is an image")
        if 'dpi' in im.info:
            # NOTE(review): this is a sequence comparison, so it is decided
            # by the x resolution first - confirm that is intended.
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)", *im.size)
                log.info("Image resolution: (%d, %d)", *im.info['dpi'])
                raise DpiError(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible. Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi."
                )
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)", *im.size)
            raise DpiError(
                "Input file is an image, but has no resolution (DPI) "
                "in its metadata. Estimate the resolution at which "
                "image was scanned and specify it using --image-dpi."
            )

        if im.mode in ('RGBA', 'LA'):
            raise UnsupportedImageFormatError(
                "The input image has an alpha channel. Remove the alpha "
                "channel first."
            )

        # Pillow publishes an embedded profile as info['icc_profile']; an
        # absent or empty value means the image carries no profile.
        if not im.info.get('icc_profile'):
            if im.mode == 'RGB':
                log.info("Input image has no ICC profile, assuming sRGB")
            elif im.mode == 'CMYK':
                raise UnsupportedImageFormatError(
                    "Input CMYK image has no ICC profile, not usable"
                )

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            # A user-supplied DPI overrides whatever the image metadata says.
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                Resolution(options.image_dpi, options.image_dpi)
            )
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                os.fspath(input_file),
                layout_fun=layout_fun,
                outputstream=outf,
                **IMG2PDF_KWARGS,
            )
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        raise UnsupportedImageFormatError() from e
def _pdf_guess_version(input_file: Path, search_window=1024) -> str:
    """Look for a ``%PDF-x.y`` signature near the start of a file.

    Only the first ``search_window`` bytes are examined, so this is not
    robust enough to deal with appended files.

    Args:
        input_file: File to inspect.
        search_window: Number of leading bytes to read and search.

    Returns:
        The version text (for example ``'1.7'``), or an empty string when no
        signature is found, indicating the file is probably not a PDF.
    """
    with open(input_file, 'rb') as stream:
        head = stream.read(search_window)
    found = re.search(rb'%PDF-(\d\.\d)', head)
    return found.group(1).decode('ascii') if found else ''
def triage(
    original_filename: str, input_file: Path, output_file: Path, options: OcrOptions
) -> Path:
    """Turn the input file into a PDF at ``output_file``.

    A file carrying a PDF signature is opened with pikepdf and re-saved; any
    other file is handed to ``triage_image_file``.

    Args:
        original_filename: Name to show in error messages in place of the
            path of ``input_file``.
        input_file: The file to examine.
        output_file: Where the resulting PDF is written.
        options: OCRmyPDF options; ``image_dpi`` is read here.

    Returns:
        ``output_file``.

    Raises:
        InputFileError: If pikepdf reports a ``PdfError``, or an ``OSError``
            occurs while probing or copying the PDF.
        EncryptedPdfError: If pikepdf reports a ``PasswordError``.
    """
    try:
        looks_like_pdf = bool(_pdf_guess_version(input_file))
        if looks_like_pdf:
            if options.image_dpi:
                log.warning(
                    "Argument --image-dpi is being ignored because the "
                    "input file is a PDF, not an image."
                )
            try:
                with pikepdf.open(input_file) as pdf:
                    pdf.save(output_file)
            except pikepdf.PdfError as e:
                raise InputFileError() from e
            except pikepdf.PasswordError as e:
                raise EncryptedPdfError() from e
    except OSError as e:
        log.debug(f"Temporary file was at: {input_file}")
        # Report the problem against the name the user supplied.
        msg = str(e).replace(str(input_file), original_filename)
        raise InputFileError(msg) from e

    if not looks_like_pdf:
        triage_image_file(input_file, output_file, options)
    return output_file
def get_pdfinfo(
    input_file,
    *,
    executor: Executor,
    detailed_analysis: bool = False,
    progbar: bool = False,
    max_workers: int | None = None,
    use_threads: bool = True,
    check_pages=None,
) -> PdfInfo:
    """Build a ``PdfInfo`` for ``input_file``.

    All keyword arguments are forwarded unchanged to ``PdfInfo``.

    Returns:
        The constructed ``PdfInfo``.

    Raises:
        EncryptedPdfError: If pikepdf raises ``PasswordError``.
        InputFileError: If pikepdf raises ``PdfError``.
    """
    pdfinfo_kwargs = {
        'detailed_analysis': detailed_analysis,
        'progbar': progbar,
        'max_workers': max_workers,
        'use_threads': use_threads,
        'check_pages': check_pages,
        'executor': executor,
    }
    try:
        return PdfInfo(input_file, **pdfinfo_kwargs)
    except pikepdf.PasswordError as e:
        raise EncryptedPdfError() from e
    except pikepdf.PdfError as e:
        raise InputFileError() from e
def validate_pdfinfo_options(context: PdfContext) -> None:
    """Check the analysed PDF against the selected options.

    Warns about or rejects XFA forms, digital signatures, fillable forms and
    Tagged PDFs, then lets the plugins run their own validation.

    Args:
        context: PDF context providing ``pdfinfo``, ``options`` and
            ``plugin_manager``.

    Raises:
        InputFileError: For dynamic XFA forms, or for a fillable form combined
            with redo mode.
        DigitalSignatureError: If the PDF is signed and invalidating
            signatures was not permitted.
        TaggedPDFError: If the PDF is tagged while both the tagged-PDF mode
            and the processing mode are at their defaults.
    """
    pdfinfo = context.pdfinfo
    options = context.options

    if pdfinfo.needs_rendering:
        raise InputFileError(
            "This PDF contains dynamic XFA forms created by Adobe LiveCycle "
            "Designer and can only be read by Adobe Acrobat or Adobe Reader."
        )

    if pdfinfo.has_signature:
        if not options.invalidate_digital_signatures:
            raise DigitalSignatureError()
        log.warning("All digital signatures will be invalidated")

    if pdfinfo.has_acroform:
        if options.mode == ProcessingMode.redo:
            raise InputFileError(
                "This PDF has a user fillable form. --redo-ocr (or --mode redo) "
                "is not currently possible on such files."
            )
        log.warning(
            "This PDF has a fillable form. "
            "Chances are it is a pure digital "
            "document that does not need OCR."
        )
        if options.mode != ProcessingMode.force:
            log.info(
                "Use the option --force-ocr (or --mode force) to produce an "
                "image of the form and all filled form fields. The output PDF "
                "will be 'flattened' and will no longer be fillable."
            )

    if pdfinfo.is_tagged:
        log.warning(
            "This PDF is marked as a Tagged PDF. This often indicates "
            "that the PDF was generated from an office document and does "
            "not need OCR. PDF pages processed by OCRmyPDF may not be "
            "tagged correctly."
        )
        # Only refuse when the user changed neither relevant setting.
        untouched_defaults = (
            options.tagged_pdf_mode == TaggedPdfMode.default
            and options.mode == ProcessingMode.default
        )
        if untouched_defaults:
            log.info("Use --tagged-pdf-mode ignore to ignore Tagged PDFs.")
            raise TaggedPDFError()

    context.plugin_manager.validate(pdfinfo=pdfinfo, options=options)
def _vector_page_dpi(pageinfo: PageInfo) -> int:
    """Return ``VECTOR_PAGE_DPI`` for pages with vector or text content, else 0."""
    if pageinfo.has_vector or pageinfo.has_text:
        return VECTOR_PAGE_DPI
    return 0
def get_page_square_dpi(
    page_context: PageContext, image_dpi: Resolution | None = None
) -> Resolution:
    """Get a square (xres == yres) DPI for the page, scaled by UserUnit.

    The result is the largest of: the x and y image resolutions multiplied by
    the page's UserUnit (each replaced by ``VECTOR_PAGE_DPI`` when zero), the
    vector-page DPI, and ``options.oversample``.

    Args:
        page_context: Page context providing ``pageinfo`` and ``options``.
        image_dpi: Resolution to use; ``pageinfo.dpi`` when falsy.

    Returns:
        A ``Resolution`` with identical x and y values.
    """
    pageinfo = page_context.pageinfo
    options = page_context.options
    source_dpi = image_dpi if image_dpi else pageinfo.dpi

    # A UserUnit of zero is treated as 1.0.
    userunit = float(pageinfo.userunit) or 1.0
    scaled_x = (source_dpi.x or 0.0) * userunit
    scaled_y = (source_dpi.y or 0.0) * userunit

    candidates = (
        scaled_x or VECTOR_PAGE_DPI,
        scaled_y or VECTOR_PAGE_DPI,
        _vector_page_dpi(pageinfo),
        options.oversample or 0.0,
    )
    units = float(max(candidates))
    return Resolution(units, units)
def get_canvas_square_dpi(
    page_context: PageContext, image_dpi: Resolution | None = None
) -> Resolution:
    """Get a square (xres == yres) DPI for the page, in PostScript units.

    Unlike the page DPI, the canvas DPI does not apply the PDF UserUnit,
    which describes cases where PDF user space is not 1:1 with the physical
    units of the page.

    Args:
        page_context: Page context providing ``pageinfo`` and ``options``.
        image_dpi: Resolution to use; ``pageinfo.dpi`` when falsy.

    Returns:
        A ``Resolution`` with identical x and y values.
    """
    pageinfo = page_context.pageinfo
    options = page_context.options
    source_dpi = image_dpi if image_dpi else pageinfo.dpi

    candidates = (
        source_dpi.x or VECTOR_PAGE_DPI,
        source_dpi.y or VECTOR_PAGE_DPI,
        _vector_page_dpi(pageinfo),
        options.oversample or 0.0,
    )
    units = float(max(candidates))
    return Resolution(units, units)
def _ocr_required_for_text_page(pageinfo, options) -> bool:
    """Decide whether a page that already has text should be OCR'd."""
    mode = options.mode
    if mode == ProcessingMode.default:
        raise PriorOcrFoundError(
            "page already has text! - aborting (use --force-ocr or --mode force "
            "to force OCR; see also help for --skip-text, --redo-ocr, and --mode)"
        )
    if mode == ProcessingMode.force:
        log.info("page already has text! - rasterizing text and running OCR anyway")
        return True
    if mode == ProcessingMode.redo:
        if pageinfo.has_corrupt_text:
            log.warning(
                "some text on this page cannot be mapped to characters: "
                "consider using --force-ocr (or --mode force) instead"
            )
        else:
            log.info("redoing OCR")
        return True
    if mode == ProcessingMode.skip:
        log.info("skipping all processing on this page")
        return False
    return True


def _ocr_required_for_imageless_page(options) -> bool:
    """Decide whether a page with no images and no text should be OCR'd."""
    forced = options.mode == ProcessingMode.force
    if forced and options.oversample:
        # The user really wants to reprocess this file
        log.info(
            "page has no images - "
            f"rasterizing at {options.oversample} DPI because "
            "--force-ocr --oversample (or --mode force --oversample) was specified"
        )
        return True
    if forced:
        # Warn the user they might not want to do this
        log.warning(
            "page has no images - "
            "all vector content will be "
            f"rasterized at {VECTOR_PAGE_DPI} DPI, losing some resolution and "
            "likely increasing file size. Use --oversample to adjust the "
            "DPI."
        )
        return True
    log.info(
        "page has no images - "
        "skipping all processing on this page to avoid losing detail. "
        "Use --force-ocr (or --mode force) if you wish to perform OCR on "
        "pages that have vector content."
    )
    return False


def is_ocr_required(page_context: PageContext) -> bool:
    """Check if the page needs to be OCR'd.

    Args:
        page_context: Page context providing ``pageinfo`` and ``options``.

    Returns:
        True if OCR should run on this page, False if it should be skipped.

    Raises:
        PriorOcrFoundError: If the page already has text and the processing
            mode is the default.
    """
    pageinfo = page_context.pageinfo
    options = page_context.options

    if options.pages and pageinfo.pageno not in options.pages:
        log.debug(f"skipped {pageinfo.pageno} as requested by --pages {options.pages}")
        ocr_required = False
    elif pageinfo.has_text:
        ocr_required = _ocr_required_for_text_page(pageinfo, options)
    elif not pageinfo.images and not options.lossless_reconstruction:
        # A page with no images and no text may hold vector art. Without
        # lossless reconstruction it would have to be rasterized, so that is
        # only done when OCR is being forced.
        ocr_required = _ocr_required_for_imageless_page(options)
    else:
        ocr_required = True

    if ocr_required and options.skip_big and pageinfo.images:
        pixel_count = pageinfo.width_pixels * pageinfo.height_pixels
        if pixel_count > (options.skip_big * 1_000_000):
            ocr_required = False
            log.warning(
                "page too big, skipping OCR "
                f"({(pixel_count / 1_000_000):.1f} MPixels > "
                f"{options.skip_big:.1f} MPixels --skip-big)"
            )
    return ocr_required
def rasterize_preview(input_file: Path, page_context: PageContext) -> Path:
    """Render a grayscale JPEG preview of the page.

    Both the canvas and page resolutions are passed through ``take_min``
    together with a 300 DPI resolution.

    Args:
        input_file: The PDF to render from.
        page_context: Context of the page being rendered.

    Returns:
        Path of the preview image.
    """
    output_file = page_context.get_path('rasterize_preview.jpg')
    canvas_dpi = Resolution(300.0, 300.0).take_min(
        [get_canvas_square_dpi(page_context)]
    )
    page_dpi = Resolution(300.0, 300.0).take_min([get_page_square_dpi(page_context)])

    options = page_context.options
    render_args = {
        'input_file': input_file,
        'output_file': output_file,
        'raster_device': GhostscriptRasterDevice.JPEGGRAY,
        'raster_dpi': canvas_dpi,
        # pageinfo.pageno is offset by one for the rasterizer
        'pageno': page_context.pageinfo.pageno + 1,
        'page_dpi': page_dpi,
        'rotation': 0,
        'filter_vector': False,
        'stop_on_soft_error': not options.continue_on_soft_render_error,
        'options': options,
        'use_cropbox': False,
    }
    page_context.plugin_manager.rasterize_pdf_page(**render_args)
    return output_file
def describe_rotation(
    page_context: PageContext, orient_conf: OrientationConfidence, correction: int
) -> str:
    """Describe the page rotation we are going to perform (or not perform).

    Args:
        page_context: Supplies the page's existing rotation and the
            ``rotate_pages_threshold`` option.
        orient_conf: Detected orientation angle and its confidence.
        correction: The correction angle under consideration.

    Returns:
        A one-line summary of the detected facing, the confidence and the
        action that will be taken.
    """
    direction = {0: '⇧', 90: '⇨', 180: '⇩', 270: '⇦'}
    turns = {0: ' ', 90: '⬏', 180: '↻', 270: '⬑'}
    existing_rotation = page_context.pageinfo.rotation

    confident = orient_conf.confidence >= page_context.options.rotate_pages_threshold
    needs_turn = correction != 0
    if confident and needs_turn:
        action = 'will rotate ' + turns[correction]
    elif confident:
        action = 'rotation appears correct'
    elif needs_turn:
        action = "confidence too low to rotate"
    else:
        action = "no change"

    prefix = (
        f"with existing rotation {direction.get(existing_rotation, '?')}, "
        if existing_rotation != 0
        else ''
    )
    facing = prefix + f"page is facing {direction.get(orient_conf.angle, '?')}"
    return f"{facing}, confidence {orient_conf.confidence:.2f} - {action}"
def get_orientation_correction(preview: Path, page_context: PageContext) -> int:
    """Work out the orientation correction for a page.

    Ghostscript draws a preview page with the current /Rotate applied, and
    the OCR engine is then asked which way that preview is oriented. If
    /Rotate is already correct (e.g. the user fixed rotation manually), OCR
    reports the page as upright and the correction is zero. Otherwise the
    orientation found by OCR is the clockwise rotation, i.e. the
    counterclockwise correction to apply.

    When the real page is drawn for OCR it is rotated by this correction so
    that it is (hopefully) upright. _graft.py takes care of orienting the
    image and text layers.

    Args:
        preview: Preview image of the page.
        page_context: Context of the page being processed.

    Returns:
        The correction angle when the confidence reaches
        ``rotate_pages_threshold`` and the angle is non-zero, otherwise 0.
    """
    options = page_context.options
    ocr_engine = page_context.plugin_manager.get_ocr_engine(options=options)
    orient_conf = ocr_engine.get_orientation(preview, options)
    correction = orient_conf.angle % 360
    log.info(describe_rotation(page_context, orient_conf, correction))

    confident = orient_conf.confidence >= options.rotate_pages_threshold
    return correction if confident and correction != 0 else 0
def calculate_image_dpi(page_context: PageContext) -> Resolution:
    """Calculate the DPI for the page image.

    When the page has a DPI profile whose average-to-max ratio is below 0.8,
    the weighted DPI is used for both axes; otherwise ``pageinfo.dpi`` is
    returned unchanged.
    """
    pageinfo = page_context.pageinfo
    dpi_profile = pageinfo.page_dpi_profile()
    if dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8:
        weighted = dpi_profile.weighted_dpi
        return Resolution(weighted, weighted)
    return pageinfo.dpi
def calculate_raster_dpi(page_context: PageContext):
    """Calculate the DPI for rasterization.

    The page image is produced with square resolution, or else deskew and
    OCR will not work properly.

    Returns:
        A ``(canvas_dpi, page_dpi)`` pair of square resolutions.
    """
    image_dpi = calculate_image_dpi(page_context)
    dpi_profile = page_context.pageinfo.page_dpi_profile()
    canvas_dpi = get_canvas_square_dpi(page_context, image_dpi)
    page_dpi = get_page_square_dpi(page_context, image_dpi)

    uneven_dpi = dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8
    if uneven_dpi:
        log.warning(
            "Weighted average image DPI is %0.1f, max DPI is %0.1f. "
            "The discrepancy may indicate a high detail region on this page, "
            "but could also indicate a problem with the input PDF file. "
            "Page image will be rendered at %0.1f DPI.",
            dpi_profile.weighted_dpi,
            dpi_profile.max_dpi,
            canvas_dpi.to_scalar(),
        )
    return canvas_dpi, page_dpi
def rasterize(
    input_file: Path,
    page_context: PageContext,
    correction: int = 0,
    output_tag: str = '',
    remove_vectors: bool | None = None,
) -> Path:
    """Rasterize a PDF page to a PNG image.

    The raster device is picked from the images on the page: the device list
    is ordered, and the highest-ranked device demanded by any image (or by
    vector content) is used.

    Args:
        input_file: The input PDF file path.
        page_context: The page context object.
        correction: The orientation correction angle. Defaults to 0.
        output_tag: Text inserted into the output file name. Defaults to ''.
        remove_vectors: Whether to remove vectors. ``None`` (the default)
            means use the value from the page context options; True or False
            overrides those options.

    Returns:
        Path: The output PNG file path.
    """
    colorspaces = [
        GhostscriptRasterDevice.PNGMONO,
        GhostscriptRasterDevice.PNGGRAY,
        GhostscriptRasterDevice.PNG256,
        GhostscriptRasterDevice.PNG16M,
    ]
    if remove_vectors is None:
        remove_vectors = page_context.options.remove_vectors
    output_file = page_context.get_path(f'rasterize{output_tag}.png')
    pageinfo = page_context.pageinfo

    device_idx = 0
    for image in pageinfo.images:
        # Masks are ignored, and 1-bit images never raise the device.
        if image.type_ != 'image' or image.bpc <= 1:
            continue
        if image.color == Colorspace.index:
            needed = GhostscriptRasterDevice.PNG256
        elif image.color == Colorspace.gray:
            needed = GhostscriptRasterDevice.PNGGRAY
        else:
            needed = GhostscriptRasterDevice.PNG16M
        device_idx = max(device_idx, colorspaces.index(needed))

    if pageinfo.has_vector:
        log.debug(f"Page has vector content, using {GhostscriptRasterDevice.PNG16M}")
        device_idx = max(
            device_idx, colorspaces.index(GhostscriptRasterDevice.PNG16M)
        )

    device = colorspaces[device_idx]
    log.debug(
        f"Rasterize with {device}, rotation {correction}, mediabox {pageinfo.mediabox}"
    )

    canvas_dpi, page_dpi = calculate_raster_dpi(page_context)
    options = page_context.options
    page_context.plugin_manager.rasterize_pdf_page(
        input_file=input_file,
        output_file=output_file,
        raster_device=device,
        raster_dpi=canvas_dpi,
        page_dpi=page_dpi,
        # pageinfo.pageno is offset by one for the rasterizer
        pageno=pageinfo.pageno + 1,
        rotation=correction,
        filter_vector=remove_vectors,
        stop_on_soft_error=not options.continue_on_soft_render_error,
        options=options,
        use_cropbox=False,
    )
    return output_file
def preprocess_remove_background(input_file: Path, page_context: PageContext) -> Path:
    """Remove the background from the input image (temporarily disabled).

    Args:
        input_file: The page image.
        page_context: Context of the page being processed.

    Returns:
        ``input_file`` unchanged, when every image on the page has a bit
        depth of 1 or less.

    Raises:
        NotImplementedError: If any image on the page has more than one bit
            per component.
    """
    has_multibit_image = any(
        image.bpc > 1 for image in page_context.pageinfo.images
    )
    if has_multibit_image:
        raise NotImplementedError("--remove-background is temporarily not implemented")
    log.info("background removal skipped on mono page")
    return input_file
def preprocess_deskew(input_file: Path, page_context: PageContext) -> Path:
    """Deskew the input image by the angle the OCR engine reports.

    Args:
        input_file: The input image file to deskew.
        page_context: The context of the page being processed.

    Returns:
        Path: The path to the deskewed image file.
    """
    output_file = page_context.get_path('pp_deskew.png')
    dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context))

    options = page_context.options
    ocr_engine = page_context.plugin_manager.get_ocr_engine(options=options)
    deskew_angle_degrees = ocr_engine.get_deskew(input_file, options)

    with Image.open(input_file) as im:
        # Corners exposed by the rotation are filled with white in the
        # image's own mode.
        white = ImageColor.getcolor('white', mode=im.mode)  # type: ignore
        # According to Pillow docs, .rotate() will automatically use
        # Image.NEAREST resampling if image is mode '1' or 'P'
        deskewed = im.rotate(
            deskew_angle_degrees,
            resample=Image.Resampling.BICUBIC,
            fillcolor=white,
        )
        deskewed.save(output_file, dpi=dpi)
    return output_file
def preprocess_clean(input_file: Path, page_context: PageContext) -> Path:
    """Clean the input image using unpaper.

    Args:
        input_file: The page image to clean.
        page_context: Context of the page being processed; supplies the
            output path, the DPI and ``options.unpaper_args``.

    Returns:
        Whatever ``unpaper.clean`` returns for the cleaned image.
    """
    cleaned_path = page_context.get_path('pp_clean.png')
    square_dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context))
    extra_args = page_context.options.unpaper_args
    return unpaper.clean(
        input_file,
        cleaned_path,
        dpi=square_dpi.to_scalar(),
        unpaper_args=extra_args,
    )
def create_ocr_image(image: Path, page_context: PageContext) -> Path:
    """Create the image we send for OCR.

    Might not be the same as the display image depending on preprocessing.
    This image will never be shown to the user.

    Unless OCR is being forced, existing text areas are painted white so they
    are not OCR'd again. Plugins may then replace the image through the
    ``filter_ocr_image`` hook.

    Args:
        image: The preprocessed page image.
        page_context: Context of the page being processed.

    Returns:
        Path of the image to hand to the OCR engine.
    """
    output_file = page_context.get_path('ocr.png')
    options = page_context.options
    with Image.open(image) as im:
        log.debug('resolution %r', im.info['dpi'])

        # Do not mask text areas when forcing OCR, because we need to OCR
        # all text areas
        if options.mode != ProcessingMode.force:
            # None: exclude both visible and invisible text from OCR.
            # True: mask visible text, but not invisible text.
            visible_only = True if options.mode == ProcessingMode.redo else None
            painter = ImageDraw.ImageDraw(im)
            textareas = page_context.pageinfo.get_textareas(
                visible=visible_only, corrupt=None
            )
            for textarea in textareas:
                # Scale from the DPI recorded in the image, without regard to
                # whatever resolution is in pageinfo (may differ or be None)
                corners = [float(v) for v in textarea]
                scale = tuple(float(coord) / 72.0 for coord in im.info['dpi'])
                # Image rows count down from the top, hence the flip in y.
                pixcoords = (
                    corners[0] * scale[0],
                    im.height - corners[3] * scale[1],
                    corners[2] * scale[0],
                    im.height - corners[1] * scale[1],
                )
                log.debug('blanking %r', pixcoords)
                painter.rectangle(pixcoords, fill='white')

        filter_im = page_context.plugin_manager.filter_ocr_image(
            page=page_context, image=im
        )
        if filter_im is not None:
            im = filter_im

        # Pillow requires integer DPI
        dpi = tuple(round(coord) for coord in im.info['dpi'])
        im.save(output_file, dpi=dpi)
    return output_file
def ocr_engine_hocr(input_file: Path, page_context: PageContext) -> tuple[Path, Path]:
    """Run the OCR engine and generate hOCR output.

    Args:
        input_file: The image file to OCR.
        page_context: Context of the page being processed.

    Returns:
        A tuple of (hOCR file path, text file path).
    """
    options = page_context.options
    outputs = (
        page_context.get_path('ocr_hocr.hocr'),
        page_context.get_path('ocr_hocr.txt'),
    )
    engine = page_context.plugin_manager.get_ocr_engine(options=options)
    engine.generate_hocr(
        input_file=input_file,
        output_hocr=outputs[0],
        output_text=outputs[1],
        options=options,
    )
    return outputs
def ocr_engine_direct(
    input_file: Path, page_context: PageContext
) -> tuple[OcrElement, Path]:
    """Run the OCR engine and return its OcrElement tree directly.

    This path is for OCR engines that support the generate_ocr() API; no
    hOCR file is produced.

    Args:
        input_file: The image file to OCR.
        page_context: The page context with options and path utilities.

    Returns:
        A tuple of (OcrElement tree, path to text sidecar file).
    """
    sidecar_path = page_context.get_path('ocr_direct.txt')
    options = page_context.options
    engine = page_context.plugin_manager.get_ocr_engine(options=options)
    ocr_tree, recognized_text = engine.generate_ocr(
        input_file=input_file,
        options=options,
        page_number=page_context.pageno,
    )
    # The recognized text is persisted as a UTF-8 sidecar file.
    sidecar_path.write_text(recognized_text, encoding='utf-8')
    return ocr_tree, sidecar_path
def should_visible_page_image_use_jpg(pageinfo: PageInfo) -> bool:
    """Decide whether the visible page image should be saved as a JPEG.

    A JPEG is permitted only when the page has at least one image and every
    image was originally a JPEG (including FlateDecode+DCTDecode).

    Args:
        pageinfo: The PageInfo object containing information about the page.

    Returns:
        True if the visible page image should be saved as a JPEG.
    """
    if not pageinfo.images:
        return False
    return all(
        im.enc in (Encoding.jpeg, Encoding.flate_jpeg) for im in pageinfo.images
    )
def create_visible_page_jpg(image: Path, page_context: PageContext) -> Path:
    """Create a visible page image in JPEG format.

    This is intended to be used when all images on the page were originally
    JPEGs.

    Args:
        image: The page image to convert.
        page_context: Context of the page being processed.

    Returns:
        Path of the JPEG file.
    """
    output_file = page_context.get_path('visible.jpg')
    with Image.open(image) as im:
        # At this point the image should be a .png, but deskew or unpaper
        # might have removed the DPI information. In that case fall back to
        # the square DPI implied by the page, keeping the resolution square
        # as the OCR engine expects.
        recorded_dpi = im.info['dpi'] if 'dpi' in im.info else None
        if recorded_dpi is not None:
            dpi = Resolution(*recorded_dpi)
        else:
            dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context))
        # Pillow requires integer DPI
        im.save(output_file, format='JPEG', dpi=dpi.to_int())
    return output_file
def create_pdf_page_from_image(
    image: Path, page_context: PageContext, orientation_correction: int
) -> Path:
    """Create a PDF page from a page image.

    We rasterize a square DPI version of each page because most image
    processing tools don't support rectangular DPI, and the square DPI
    accurately describes the image. Resampling back to non-square DPI would
    resemble the input more closely, but the hocr renderer does not
    understand non-square DPI (the sandwich renderer would be fine).

    Args:
        image: The page image.
        page_context: Context of the page being processed.
        orientation_correction: Correction angle being applied to the page.

    Returns:
        Path of the single-page PDF, as returned by the ``filter_pdf_page``
        plugin hook.
    """
    output_file = page_context.get_path('visible.pdf')
    pageinfo = page_context.pageinfo

    width_pt = 72.0 * float(pageinfo.width_inches)
    height_pt = 72.0 * float(pageinfo.height_inches)
    effective_rotation = (pageinfo.rotation - orientation_correction) % 360
    # A net quarter turn exchanges the page's width and height.
    swap_axis = effective_rotation % 180 == 90
    pagesize = (height_pt, width_pt) if swap_axis else (width_pt, height_pt)

    # Build the new single page PDF in memory first
    bio = BytesIO()
    with open(image, 'rb') as imfile:
        log.debug('convert')
        img2pdf.convert(
            imfile,
            layout_fun=img2pdf.get_layout_fun(pagesize),
            outputstream=bio,
            engine=img2pdf.Engine.pikepdf,
            rotation=img2pdf.Rotation.ifvalid,
        )
        log.debug('convert done')

    # img2pdf does not generate boxes correctly, so we fix them
    bio.seek(0)
    fix_pagepdf_boxes(bio, output_file, page_context, swap_axis=swap_axis)

    return page_context.plugin_manager.filter_pdf_page(
        page=page_context, image_filename=image, output_pdf=output_file
    )
def ocr_engine_textonly_pdf(
    input_image: Path, page_context: PageContext
) -> tuple[Path, Path]:
    """Run the OCR engine and generate a text-only PDF (will look blank).

    Args:
        input_image: The image file to OCR.
        page_context: Context of the page being processed.

    Returns:
        A tuple of (text-only PDF path, text file path).
    """
    options = page_context.options
    outputs = (
        page_context.get_path('ocr_tess.pdf'),
        page_context.get_path('ocr_tess.txt'),
    )
    engine = page_context.plugin_manager.get_ocr_engine(options=options)
    engine.generate_pdf(
        input_file=input_image,
        output_pdf=outputs[0],
        output_text=outputs[1],
        options=options,
    )
    return outputs
def _offset_rect(rect: tuple[float, float, float, float], offset: tuple[float, float]):
    """Translate a rectangle by an ``(x, y)`` offset.

    Args:
        rect: Four coordinates; the x offset is added to elements 0 and 2,
            the y offset to elements 1 and 3.
        offset: The ``(x, y)`` amounts to add.

    Returns:
        The translated rectangle as a 4-tuple.
    """
    dx, dy = offset[0], offset[1]
    return (rect[0] + dx, rect[1] + dy, rect[2] + dx, rect[3] + dy)
def _adjust_pagebox(
    page: pikepdf.Page,
    media_box: FloatRect,
    name: pikepdf.Name,
    target_box: FloatRect,
    offset: tuple[float, float],
    swap_axis: bool,
):
    """Write one page box onto ``page``, shifted and optionally axis-swapped.

    Nothing is written when ``target_box`` equals ``media_box``.

    Args:
        page: The page to modify.
        media_box: The media box to compare ``target_box`` against.
        name: The page key to assign, e.g. ``/CropBox``.
        target_box: The box to translate and store.
        offset: The ``(x, y)`` translation to apply.
        swap_axis: If True, exchange the x and y coordinates of the result.
    """
    if media_box == target_box:
        return
    shifted = _offset_rect(target_box, offset)
    page[name] = (
        (shifted[1], shifted[0], shifted[3], shifted[2]) if swap_axis else shifted
    )
    # Note: the untransformed target box is what gets logged.
    log.debug(f"{str(name)} = {target_box}")
def fix_pagepdf_boxes(
    infile: Path | BinaryIO,
    out_file: Path,
    page_context: PageContext,
    swap_axis: bool = False,
) -> Path:
    """Fix the bounding boxes in a single page PDF.

    The single page PDF (``infile``) is created with a normal MediaBox whose
    lower left corner is at (0, 0). ``page_context.pageinfo.mediabox`` holds
    the original file's mediabox, which may have a different origin. The
    CropBox, TrimBox, ArtBox and BleedBox of the single page PDF are adjusted
    to match the effect they had on the original page.

    When correcting page rotation, we create a single page PDF that is
    already correctly rotated rather than setting page.Rotate on an
    incorrectly rotated one. If the rotation is 90 or 270 degrees, call this
    function with ``swap_axis`` to swap the X and Y coordinates of all boxes.

    We are not concerned with solving degenerate cases where the boxes
    overlap or express invalid rectangles. We merely pass the boxes,
    producing a transformation equivalent to the change made by constructing
    a new page image.

    Args:
        infile: The single page PDF, as a path or a binary stream.
        out_file: Where the corrected PDF is saved.
        page_context: Context providing the original page's boxes.
        swap_axis: Whether to swap X and Y coordinates of all boxes.

    Returns:
        ``out_file``.
    """
    box_names = ('CropBox', 'TrimBox', 'ArtBox', 'BleedBox')
    with pikepdf.open(infile) as pdf:
        for page in pdf.pages:
            log.debug(
                f"initial mediabox={page.MediaBox} and pageinfo "
                f"mediabox={page_context.pageinfo.mediabox}"
            )
            original = page_context.pageinfo.mediabox
            # Translation that moves the original lower-left corner to (0, 0).
            offset = -original[0], -original[1]
            mediabox = (
                (original[1], original[0], original[3], original[2])
                if swap_axis
                else original
            )
            for box_name in box_names:
                _adjust_pagebox(
                    page,
                    mediabox,
                    pikepdf.Name(f"/{box_name}"),
                    getattr(page_context.pageinfo, box_name.lower()),
                    offset,
                    swap_axis,
                )
        pdf.save(out_file)
    return out_file
def generate_postscript_stub(context: PdfContext) -> Path:
    """Write the PostScript stub file used for PDF/A generation.

    Args:
        context: The PDF context that supplies the path for the stub.

    Returns:
        Path: The path to the generated PostScript file stub.
    """
    stub_path = context.get_path('pdfa.ps')
    generate_pdfa_ps(stub_path)
    return stub_path
def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext) -> Path:
    """Convert the given PDF to PDF/A.

    Args:
        input_pdf: The input PDF file path (presumably not PDF/A).
        input_ps_stub: The input PostScript file path, containing instructions
            for the PDF/A generator to use.
        context: The PDF context.

    Returns:
        Path of the generated PDF/A file.
    """
    options = context.options
    input_pdfinfo = context.pdfinfo
    fix_docinfo_file = context.get_path('fix_docinfo.pdf')
    output_file = context.get_path('pdfa.pdf')

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities. NULs in
    # DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we stamp them out as soon as possible.
    with pikepdf.open(input_pdf) as pdf_file:
        if repair_docinfo_nuls(pdf_file):
            pdf_file.save(fix_docinfo_file)
        else:
            # Nothing was repaired, so a link to the input is sufficient.
            safe_symlink(input_pdf, fix_docinfo_file)

    # 'pdfa-N' selects part N; plain 'pdfa' and any other output type use
    # part 2.
    output_type = options.output_type
    if output_type.startswith('pdfa') and output_type != 'pdfa':
        pdfa_part = output_type.split('-')[-1]
    else:
        pdfa_part = '2'

    progressbar_class = (
        context.plugin_manager.get_progressbar_class()
        if options.progress_bar
        else None
    )
    context.plugin_manager.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[fix_docinfo_file],
        pdfmark=input_ps_stub,
        output_file=output_file,
        context=context,
        pdfa_part=pdfa_part,
        progressbar_class=progressbar_class,
        stop_on_soft_error=not options.continue_on_soft_render_error,
    )
    return output_file
def try_speculative_pdfa(input_pdf: Path, context: PdfContext) -> Path | None:
    """Try speculative PDF/A conversion with verapdf validation.

    PDF/A structures are added by ``speculative_pdfa_conversion`` and the
    result is validated with verapdf. A ``None`` result signals that
    Ghostscript should be used instead.

    Args:
        input_pdf: Path to the PDF to convert
        context: The PDF context

    Returns:
        Path to the validated PDF/A file, or None if a specific PDF/A image
        compression was requested, verapdf is unavailable, validation failed,
        or any exception occurred during conversion or validation.
    """
    from ocrmypdf._exec import verapdf

    options = context.options

    # A user-requested image compression can only be applied by Ghostscript,
    # so the speculative route is skipped in that case.
    gs_opts = getattr(options, 'ghostscript', None)
    compression = (
        'auto'
        if gs_opts is None
        else getattr(gs_opts, 'pdfa_image_compression', 'auto')
    )
    if compression != 'auto':
        log.debug(
            'Skipping speculative PDF/A: --pdfa-image-compression=%s requires '
            'Ghostscript',
            compression,
        )
        return None

    if not verapdf.available():
        log.debug('verapdf not available, skipping speculative PDF/A conversion')
        return None

    output_file = context.get_path('speculative_pdfa.pdf')
    try:
        speculative_pdfa_conversion(input_pdf, output_file, options.output_type)
        flavour = verapdf.output_type_to_flavour(options.output_type)
        result = verapdf.validate(output_file, flavour)
        if not result.valid:
            log.debug(
                'Speculative PDF/A validation failed (%d rule violations), '
                'falling back to Ghostscript',
                result.failed_rules,
            )
            return None
        log.info('Speculative PDF/A conversion succeeded - skipping Ghostscript')
        return output_file
    except Exception as e:
        # Best effort: any failure simply means the caller falls back.
        log.debug('Speculative PDF/A conversion failed: %s', e)
        return None
def try_auto_pdfa(input_pdf: Path, context: PdfContext) -> tuple[Path, str]:
    """Produce PDF/A on a best-effort basis for the 'auto' output type.

    Strategy:
        1. With verapdf available, try the speculative conversion; use its
           result if validation passed, otherwise output the input as a
           regular PDF.
        2. Without verapdf, label the input as PDF/A only when
           ``_is_safe_pdfa`` says so.
        3. Otherwise output the input as a regular PDF.

    Args:
        input_pdf: Path to the PDF to convert.
        context: The PDF context.

    Returns:
        Tuple of (output_path, actual_output_type) where actual_output_type
        is 'pdfa' if PDF/A was achieved, 'pdf' otherwise.
    """
    from ocrmypdf._exec import verapdf

    if not verapdf.available():
        # No validator: only trust cases known not to break compliance.
        if _is_safe_pdfa(input_pdf, context.options):
            log.info('Auto mode: passing through as PDF/A (input already compliant)')
            return (input_pdf, 'pdfa')
        log.info(
            'Auto mode: no verapdf available and input is not PDF/A, outputting PDF'
        )
        return (input_pdf, 'pdf')

    converted = try_speculative_pdfa(input_pdf, context)
    if converted is None:
        log.info(
            'Auto mode: speculative PDF/A validation failed, outputting regular PDF'
        )
        return (input_pdf, 'pdf')
    return (converted, 'pdfa')
def _is_safe_pdfa(input_pdf: Path, options) -> bool:
    """Decide whether a file may be labelled PDF/A without validating it.

    Two situations are treated as safe:
        1. The input already claims PDF/A conformance.
        2. The processing mode is ``ProcessingMode.force``.

    Args:
        input_pdf: Path to the PDF to check.
        options: OCR options; only ``options.mode`` is consulted.

    Returns:
        True if the file can be considered PDF/A, False otherwise.
    """
    already_claims_pdfa = file_claims_pdfa(input_pdf)['pass']
    if already_claims_pdfa:
        return True
    # Otherwise it is only safe when force mode was used.
    return options.mode == ProcessingMode.force
def should_linearize(working_file: Path, context: PdfContext) -> bool:
    """Tell whether the PDF is large enough to be worth linearizing.

    The file size in bytes is compared against
    ``context.options.fast_web_view`` multiplied by 1,000,000; only files
    strictly larger than that threshold are linearized.
    """
    size_in_bytes = os.stat(working_file).st_size
    threshold_in_bytes = context.options.fast_web_view * 1_000_000
    return size_in_bytes > threshold_in_bytes
def get_pdf_save_settings(output_type: str) -> dict[str, Any]:
    """Build keyword arguments for ``pikepdf.Pdf.save`` for an output type.

    The goal is to avoid save features that are incompatible with a given
    PDF/A specification. A new dict is returned on every call.
    """
    settings: dict[str, Any] = {
        'preserve_pdfa': True,
        'compress_streams': True,
    }
    if output_type == 'pdfa-1':
        # Trigger recompression to ensure object streams are removed, because
        # Acrobat complains about them in PDF/A-1b validation.
        settings['stream_decode_level'] = pikepdf.StreamDecodeLevel.generalized
        settings['object_stream_mode'] = pikepdf.ObjectStreamMode.disable
    else:
        settings['object_stream_mode'] = pikepdf.ObjectStreamMode.generate
    return settings
def _file_size_ratio(
    input_file: Path, output_file: Path
) -> tuple[float | None, float | None]:
    """Calculate ratio of input to output file sizes and percentage savings.

    Args:
        input_file (Path): The path to the input file.
        output_file (Path): The path to the output file.

    Returns:
        tuple[float | None, float | None]: ``(ratio, savings)`` where
            ``ratio`` is ``input_size / output_size`` and ``savings`` is
            ``1 - output_size / input_size``. ``(None, None)`` is returned
            when either file is empty, because one of the two quotients
            would then divide by zero.
    """
    input_size = input_file.stat().st_size
    output_size = output_file.stat().st_size
    # Both sizes are used as a divisor below, so both must be non-zero.
    if input_size == 0 or output_size == 0:
        return None, None
    ratio = input_size / output_size
    savings = 1 - output_size / input_size
    return ratio, savings
def optimize_pdf(
    input_file: Path, context: PdfContext, executor: Executor
) -> tuple[Path, Sequence[str]]:
    """Run the plugin optimizer on the PDF and log the size changes.

    Two ratios are logged when available: the optimizer's input against
    'optimize.pdf', and the original input (``context.origin``) against
    'optimize.pdf'.

    Returns:
        The path and messages returned by the plugin manager's
        ``optimize_pdf`` hook.
    """
    output_file = context.get_path('optimize.pdf')
    linearize = should_linearize(input_file, context)
    output_pdf, messages = context.plugin_manager.optimize_pdf(
        input_pdf=input_file,
        output_pdf=output_file,
        context=context,
        executor=executor,
        linearize=linearize,
    )

    # Effect of this optimization step alone.
    ratio, savings = _file_size_ratio(input_file, output_file)
    if ratio:
        log.info(f"Image optimization ratio: {ratio:.2f} savings: {(savings):.1%}")

    # Effect of the whole pipeline relative to the original input.
    ratio, savings = _file_size_ratio(context.origin, output_file)
    if ratio:
        log.info(f"Total file size ratio: {ratio:.2f} savings: {(savings):.1%}")

    return output_pdf, messages
def enumerate_compress_ranges(
    iterable: Iterable[T],
) -> Iterator[tuple[tuple[int, int], T | None]]:
    """Enumerate elements with 1-based index ranges, merging runs of gaps.

    Every truthy element is yielded on its own as ``((i, i), element)``.
    Each maximal run of consecutive falsy elements is merged into a single
    ``((first, last), None)`` entry.

    Args:
        iterable: An iterable of elements to enumerate.

    Yields:
        A tuple containing an inclusive, 1-based range of indices and the
        corresponding element. If the element is None, the range represents
        a skipped range of indices.
    """
    first_skipped = None
    position = None
    for position, item in enumerate(iterable, start=1):
        if not item:
            # Open a gap if one is not already in progress.
            if first_skipped is None:
                first_skipped = position
            continue
        if first_skipped is not None:
            # A gap ended on the previous position; report it first.
            yield (first_skipped, position - 1), None
            first_skipped = None
        yield (position, position), item
    if first_skipped is not None:
        # The iterable ended while a gap was still open.
        yield (first_skipped, position), None
def merge_sidecars(txt_files: Iterable[Path | None], context: PdfContext) -> Path:
    """Concatenate the per-page sidecar text files into 'sidecar.txt'.

    Entries are separated by a form feed. Pages without a text file are
    represented by an ``[OCR skipped on page(s) ...]`` placeholder, with
    consecutive skipped pages collapsed into one range.

    Returns:
        The path of the merged file.
    """
    output_file = context.get_path('sidecar.txt')
    with open(output_file, 'w', encoding="utf-8") as stream:
        for (from_, to_), txt_file in enumerate_compress_ranges(txt_files):
            if from_ != 1:
                stream.write('\f')  # Form feed before every entry but the first
            if not txt_file:
                pages = f"{from_}-{to_}" if from_ != to_ else f"{from_}"
                stream.write(f'[OCR skipped on page(s) {pages}]')
                continue
            page_text = txt_file.read_text(encoding="utf-8")
            # Some versions of Tesseract add a form feed at the end and
            # others don't. Remove it if it exists, since we add one manually.
            stream.write(page_text.removesuffix('\f'))
    return output_file
def copy_final(
    input_file: Path, output_file: str | Path | BinaryIO, original_file: Path | None
) -> None:
    """Copy the final temporary file to the output destination.

    Three kinds of destination are handled: the string '-' (standard
    output), an object exposing a ``writable`` attribute (treated as a binary
    stream), and anything else (treated as a filesystem path).

    Args:
        input_file (Path): The intermediate input file to copy.
        output_file (str | Path | BinaryIO): The output file to copy to.
        original_file: The original file. Not used by this function.

    Returns:
        None
    """
    log.debug('%s -> %s', input_file, output_file)
    with input_file.open('rb') as source:
        if output_file == '-':
            copyfileobj(source, sys.stdout.buffer)  # type: ignore[misc]
            sys.stdout.flush()
            return

        if hasattr(output_file, 'writable'):
            sink = cast(BinaryIO, output_file)
            copyfileobj(source, sink)  # type: ignore[misc]
            # Not every stream-like object offers flush().
            with suppress(AttributeError):
                sink.flush()
            return

        # At this point we overwrite the output_file specified by the user
        # use copyfileobj because then we use open() to create the file and
        # get the appropriate umask, ownership, etc.
        with open(output_file, 'w+b') as sink:
            copyfileobj(source, sink)
================================================
FILE: src/ocrmypdf/_pipelines/__init__.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
================================================
FILE: src/ocrmypdf/_pipelines/_common.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import json
import logging
import logging.handlers
import os
import shutil
import sys
import threading
from collections.abc import Callable, Sequence
from concurrent.futures.process import BrokenProcessPool
from concurrent.futures.thread import BrokenThreadPool
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, NamedTuple, cast
if TYPE_CHECKING:
from ocrmypdf.hocrtransform import OcrElement
import PIL
import PIL.Image
from pikepdf import Pdf
from ocrmypdf._annots import remove_broken_goto_annotations
from ocrmypdf._concurrent import Executor, setup_executor
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._logging import PageNumberFilter
from ocrmypdf._metadata import metadata_fixup
from ocrmypdf._options import OcrOptions
from ocrmypdf._pipeline import (
convert_to_pdfa,
create_ocr_image,
create_pdf_page_from_image,
create_visible_page_jpg,
generate_postscript_stub,
get_orientation_correction,
get_pdf_save_settings,
get_pdfinfo,
optimize_pdf,
preprocess_clean,
preprocess_deskew,
preprocess_remove_background,
rasterize,
rasterize_preview,
should_linearize,
should_visible_page_image_use_jpg,
try_auto_pdfa,
try_speculative_pdfa,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._validation import (
report_output_file_size,
)
from ocrmypdf.exceptions import ExitCode, ExitCodeException
from ocrmypdf.helpers import (
check_pdf,
pikepdf_enable_mmap,
running_in_docker,
running_in_snap,
samefile,
)
from ocrmypdf.pdfa import file_claims_pdfa
from ocrmypdf.pdfinfo import PdfInfo
log = logging.getLogger(__name__)
# Thread-local storage holding the page number the current thread works on.
tls = threading.local()
tls.pageno = None


def _set_logging_tls(tls):
    """Install a log record factory that stamps records with the page number.

    The previously installed factory is wrapped, so records are created as
    before and then receive a ``pageno`` attribute whenever the calling
    thread has one stored in ``tls``.
    """
    previous_factory = logging.getLogRecordFactory()

    def record_with_pageno(*args, **kwargs):
        log_record = previous_factory(*args, **kwargs)
        # Threads that never stored a page number leave the record untouched.
        if hasattr(tls, 'pageno'):
            log_record.pageno = tls.pageno
        return log_record

    logging.setLogRecordFactory(record_with_pageno)


_set_logging_tls(tls)
def set_thread_pageno(pageno: int | None):
    """Record the 1-based page number the current thread is working on.

    Pass None to clear it.
    """
    tls.pageno = pageno
class PageResult(NamedTuple):
    """Outcome of processing a single page.

    Only ``pageno`` is required; every other field has a default.
    """

    pageno: int
    """Zero-based page number."""

    pdf_page_from_image: Path | None = None
    """Single-page PDF built from the page image, if any."""

    ocr: Path | None = None
    """Single-page OCR PDF, if any."""

    text: Path | None = None
    """Single-page text file, if any."""

    orientation_correction: int = 0
    """Orientation correction, in degrees."""

    ocr_tree: OcrElement | None = None
    """Direct OcrElement tree (when using generate_ocr() API)."""
class HOCRResultEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``Path`` objects as ``{'Path': '<str>'}``."""

    def default(self, obj):
        """Encode ``Path`` values; defer everything else to the base class."""
        if not isinstance(obj, Path):
            return super().default(obj)
        return {'Path': str(obj)}
class HOCRResultDecoder(json.JSONDecoder):
    """JSON decoder that turns objects with a 'Path' key into ``Path`` values."""

    def __init__(self, *args, **kwargs):
        """Create the decoder, always using ``dict_to_object`` as object hook."""
        kwargs['object_hook'] = self.dict_to_object
        super().__init__(*args, **kwargs)

    def dict_to_object(self, d):
        """Return a ``Path`` for dicts with a 'Path' key, else the dict itself."""
        return Path(d['Path']) if 'Path' in d else d
@dataclass
class HOCRResult:
    """Outcome of the hOCR stage for a single page.

    Instances can be converted to and from JSON strings with ``to_json`` and
    ``from_json``.
    """

    pageno: int
    """Zero-based page number."""

    pdf_page_from_image: Path | None = None
    """Single-page PDF built from the page image, if any."""

    hocr: Path | None = None
    """Single-page hOCR file, if any."""

    textpdf: Path | None = None
    """hOCR file after conversion to PDF."""

    orientation_correction: int = 0
    """Orientation correction, in degrees."""

    ocr_tree: OcrElement | None = None
    """Direct OcrElement tree (when using generate_ocr() API)."""

    @classmethod
    def from_json(cls, json_str: str) -> HOCRResult:
        """Create an instance from a JSON string."""
        fields = json.loads(json_str, cls=HOCRResultDecoder)
        return cls(**fields)

    def to_json(self) -> str:
        """Serialize the instance attributes to a JSON string."""
        return json.dumps(vars(self), cls=HOCRResultEncoder)
def configure_debug_logging(
    log_filename: Path, prefix: str = ''
) -> tuple[logging.FileHandler, Callable[[], None]]:
    """Attach a DEBUG-level file handler to the logger named ``prefix``.

    The handler is created with ``delay=True``.

    Args:
        log_filename: Where to put the log file.
        prefix: The logging domain prefix that should be sent to the log.

    Returns:
        The log handler, and a function that detaches and closes it.
    """
    handler = logging.FileHandler(log_filename, delay=True)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter(
            '[%(asctime)s] - %(name)s - %(levelname)7s -%(pageno)s %(message)s'
        )
    )
    handler.addFilter(PageNumberFilter())
    logging.getLogger(prefix).addHandler(handler)

    def remover():
        # Failing to close the log must not abort the program; report only.
        try:
            logging.getLogger(prefix).removeHandler(handler)
            handler.close()
        except OSError as e:
            print(e, file=sys.stderr)

    return handler, remover
def worker_init(max_pixels: int | None) -> None:
    """Prepare a worker thread or process before it runs tasks.

    Args:
        max_pixels: Value to assign to ``PIL.Image.MAX_IMAGE_PIXELS``.
    """
    # On Windows a child process does not inherit the parent's change to this
    # value, so every worker sets it again. When running threaded this is
    # redundant but harmless.
    PIL.Image.MAX_IMAGE_PIXELS = max_pixels
    pikepdf_enable_mmap()
@contextmanager
def manage_debug_log_handler(
    *,
    options: OcrOptions,
    work_folder: Path,
):
    """Context manager that keeps a 'debug.log' handler attached while active.

    The handler is installed only when temporary files are kept or verbosity
    is at least 1, and never while pytest is running. It is removed again on
    exit, including when the body raises.
    """
    wants_debug_log = options.keep_temporary_files or options.verbose >= 1
    remover = None
    if wants_debug_log and not os.environ.get('PYTEST_CURRENT_TEST', ''):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        _, remover = configure_debug_logging(
            work_folder / "debug.log", prefix=""
        )  # pragma: no cover
    try:
        yield
    finally:
        if remover:
            remover()
def _print_temp_folder_location(work_folder: Path):
    """Tell the user on stderr where the retained working files are.

    An extra hint is added when running inside a Docker or Snap container,
    because the files are then inside that container.
    """
    container_note = None
    if running_in_docker():  # pragma: no cover
        container_note = (
            "OCRmyPDF is running in a Docker container, "
            "so the files will be inside the container."
        )
    elif running_in_snap():  # pragma: no cover
        container_note = (
            "OCRmyPDF is running in a Snap container, "
            "so the files will be inside the container."
        )

    msgs = [f"Temporary working files retained at:\n{work_folder}"]
    if container_note is not None:
        msgs.append(container_note)
    print('\n'.join(msgs), file=sys.stderr)
@contextmanager
def manage_work_folder(*, work_folder: Path, retain: bool, print_location: bool):
    """Context manager that yields the work folder and cleans it up on exit.

    Args:
        work_folder: The folder to yield to the ``with`` body.
        retain: Keep the folder on exit instead of deleting it.
        print_location: When retaining, also print where the folder is.
    """
    try:
        yield work_folder
    finally:
        if not retain:
            # Best-effort removal; errors while deleting are ignored.
            shutil.rmtree(work_folder, ignore_errors=True)
        elif print_location:
            _print_temp_folder_location(work_folder)
def cli_exception_handler(
    fn: Callable[[OcrOptions, OcrmypdfPluginManager], ExitCode],
    options: OcrOptions,
    plugin_manager: OcrmypdfPluginManager,
) -> ExitCode:
    """Convert exceptions into command line error messages and exit codes.

    When known exceptions are raised, the exception message is logged and a
    specific non-zero exit code is returned. When unknown exceptions are
    raised, the exception traceback is logged and ``ExitCode.other_error`` is
    returned.

    Args:
        fn: The pipeline function to run; called as
            ``fn(options, plugin_manager)``.
        options: The parsed OCR options. ``options.verbose`` decides whether
            tracebacks are logged for the known exception types.
        plugin_manager: The plugin manager passed through to ``fn``.

    Returns:
        The exit code returned by ``fn``, or the exit code matching the
        exception it raised.
    """
    try:
        # We cannot use a generator and yield here, as would be the usual pattern
        # for exception handling context managers, because we need to return an exit
        # code.
        return fn(options, plugin_manager)
    except KeyboardInterrupt:
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except ExitCodeException as e:
        e = cast(ExitCodeException, e)
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        elif str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except ValueError as e:
        # Convert Pydantic validation errors to BadArgsError for proper exit code
        message = str(e).lower()
        if "validation error" in message or "value error" in message:
            if options.verbose >= 1:
                log.exception("Validation error")
            else:
                log.error("Invalid argument: %s", str(e))
            return ExitCode.bad_args
        # Any other ValueError is an unknown error. It has to be handled here:
        # an exception re-raised inside an ``except`` clause is not passed to
        # the sibling ``except Exception`` clause of the same ``try``
        # statement, so a bare ``raise`` would escape this function.
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    except PIL.Image.DecompressionBombError:
        log.exception(
            "A decompression bomb error was encountered while executing the "
            "pipeline. Use the argument --max-image-mpixels to raise the maximum "
            "image pixel limit."
        )
        return ExitCode.other_error
    except (
        BrokenProcessPool,
        BrokenThreadPool,
    ):
        log.exception(
            "A worker process was terminated unexpectedly. This is known to occur if "
            "processing your file takes all available swap space and RAM. It may "
            "help to try again with a smaller number of jobs, using the --jobs "
            "argument."
        )
        return ExitCode.child_process_error
    except Exception:  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
def setup_pipeline(
    options: OcrOptions,
    plugin_manager: OcrmypdfPluginManager,
) -> Executor:
    """Apply process-wide settings and create the executor for the pipeline.

    Sets ``PIL.Image.MAX_IMAGE_PIXELS`` from ``options.max_image_mpixels``
    (a value of zero becomes None) and enables pikepdf mmap.

    Returns:
        The executor obtained from ``setup_executor``.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    # Note: OcrOptions is immutable, so we can't modify options.jobs directly
    # The jobs field should already be set correctly during OcrOptions creation

    # Apply PIL max image pixels side effect
    pixel_limit = int(options.max_image_mpixels * 1_000_000)
    PIL.Image.MAX_IMAGE_PIXELS = (
        pixel_limit if pixel_limit != 0 else None  # type: ignore
    )
    pikepdf_enable_mmap()
    return setup_executor(plugin_manager)
def do_get_pdfinfo(pdf_path: Path, executor: Executor, options) -> PdfInfo:
    """Collect ``PdfInfo`` for a PDF using settings taken from ``options``.

    ``options.pages`` may be a string of page ranges; in that case it is
    converted with ``_pages_from_ranges`` before being passed on.
    """
    pages_to_check = options.pages
    if isinstance(pages_to_check, str):
        from ocrmypdf._options import _pages_from_ranges

        pages_to_check = _pages_from_ranges(pages_to_check)

    pdfinfo_kwargs = dict(
        executor=executor,
        detailed_analysis=options.redo_ocr,
        progbar=options.progress_bar,
        max_workers=options.jobs,
        use_threads=options.use_threads,
        check_pages=pages_to_check,
    )
    return get_pdfinfo(pdf_path, **pdfinfo_kwargs)
def preprocess(
    page_context: PageContext,
    image: Path,
    remove_background: bool,
    deskew: bool,
    clean: bool,
) -> Path:
    """Run the enabled preprocessing steps on an image, in a fixed order.

    The order is: background removal, deskew, clean. Each enabled step
    receives the output of the previous one.

    Returns:
        The path of the last image produced, or ``image`` itself when no
        step is enabled.
    """
    steps = (
        (remove_background, preprocess_remove_background),
        (deskew, preprocess_deskew),
        (clean, preprocess_clean),
    )
    for enabled, run_step in steps:
        if enabled:
            image = run_step(image, page_context)
    return image
def make_intermediate_images(
    page_context: PageContext, orientation_correction: int
) -> tuple[Path, Path | None]:
    """Create intermediate and preprocessed images for OCR.

    Args:
        page_context: The page being processed.
        orientation_correction: Correction passed to ``rasterize``.

    Returns:
        Tuple ``(ocr_image, preprocess_out)``. ``ocr_image`` is the image to
        run OCR on. ``preprocess_out`` is the presentation image; it stays
        None when lossless reconstruction is requested together with any of
        the clean, clean_final or remove_vectors options.
    """
    options = page_context.options
    ocr_image = preprocess_out = None
    # Rasterization with vectors kept; this is the basis of the presentation
    # image and, unless remove_vectors is set, of the OCR image too.
    rasterize_out = rasterize(
        page_context.origin,
        page_context,
        correction=orientation_correction,
        remove_vectors=False,
    )
    if not any([options.clean, options.clean_final, options.remove_vectors]):
        # Simple case: one preprocessed image serves both purposes.
        ocr_image = preprocess_out = preprocess(
            page_context,
            rasterize_out,
            options.remove_background,
            options.deskew,
            clean=False,
        )
    else:
        if not options.lossless_reconstruction:
            # Presentation image: cleaned only if clean_final is requested.
            preprocess_out = preprocess(
                page_context,
                rasterize_out,
                options.remove_background,
                options.deskew,
                clean=options.clean_final,
            )
        if options.remove_vectors:
            # OCR gets its own rasterization with vectors removed.
            rasterize_ocr_out = rasterize(
                page_context.origin,
                page_context,
                correction=orientation_correction,
                remove_vectors=True,
                output_tag='_ocr',
            )
        else:
            rasterize_ocr_out = rasterize_out
        if (
            preprocess_out
            and rasterize_ocr_out == rasterize_out
            and options.clean == options.clean_final
        ):
            # Optimization: image for OCR is identical to presentation image
            ocr_image = preprocess_out
        else:
            ocr_image = preprocess(
                page_context,
                rasterize_ocr_out,
                options.remove_background,
                options.deskew,
                clean=options.clean,
            )
    return ocr_image, preprocess_out
def process_page(page_context: PageContext) -> tuple[Path, Path | None, int]:
    """Build the OCR image and, if needed, the visible page PDF for a page.

    Returns:
        Tuple ``(ocr_image, pdf_page_from_image, orientation_correction)``.
        ``pdf_page_from_image`` is None when lossless reconstruction is
        requested. ``orientation_correction`` is 0 unless page rotation is
        enabled.
    """
    options = page_context.options

    orientation_correction = 0
    if options.rotate_pages:
        # Rasterize a preview and derive the rotation from it.
        preview = rasterize_preview(page_context.origin, page_context)
        orientation_correction = get_orientation_correction(preview, page_context)

    ocr_image, preprocess_out = make_intermediate_images(
        page_context, orientation_correction
    )
    ocr_image_out = create_ocr_image(ocr_image, page_context)

    if options.lossless_reconstruction:
        # No replacement page image is produced in this mode.
        return ocr_image_out, None, orientation_correction

    assert preprocess_out
    visible_image = preprocess_out
    if should_visible_page_image_use_jpg(page_context.pageinfo):
        visible_image = create_visible_page_jpg(visible_image, page_context)

    filtered_image = page_context.plugin_manager.filter_page_image(
        page=page_context, image_filename=visible_image
    )
    if filtered_image is not None:  # None if no hook is present
        visible_image = filtered_image

    pdf_page_from_image_out = create_pdf_page_from_image(
        visible_image, page_context, orientation_correction
    )
    return ocr_image_out, pdf_page_from_image_out, orientation_correction
def postprocess(
    pdf_file: Path, context: PdfContext, executor: Executor
) -> tuple[Path, Sequence[str]]:
    """Postprocess the PDF file.

    Steps, in order: remove broken GoTo annotations, convert to PDF/A when
    the output type asks for it, fix up metadata, and optimize.

    Args:
        pdf_file: The PDF to postprocess.
        context: The PDF context.
        executor: Executor handed to the optimizer.

    Returns:
        The result of ``optimize_pdf``: the output path and its messages.
    """
    with Pdf.open(pdf_file) as pdf:
        fix_annots = context.get_path('fix_annots.pdf')
        # Only write a new file if annotations were actually removed.
        if remove_broken_goto_annotations(pdf):
            pdf.save(fix_annots)
            pdf_out = fix_annots
        else:
            pdf_out = pdf_file
    if context.options.output_type == 'auto':
        # Best effort PDF/A - never uses Ghostscript
        pdf_out, actual_type = try_auto_pdfa(pdf_out, context)
        # Store actual output type for reporting
        context.options.extra_attrs['_actual_output_type'] = actual_type
    elif context.options.output_type.startswith('pdfa'):
        # Required PDF/A - uses Ghostscript as fallback
        speculative_result = try_speculative_pdfa(pdf_out, context)
        if speculative_result is not None:
            pdf_out = speculative_result
        else:
            # Fall back to Ghostscript conversion
            ps_stub_out = generate_postscript_stub(context)
            pdf_out = convert_to_pdfa(pdf_out, ps_stub_out, context)
    optimizing = context.plugin_manager.is_optimization_enabled(context=context)
    save_settings = get_pdf_save_settings(context.options.output_type)
    # Linearize at this save only when no optimization pass follows;
    # optimize_pdf passes its own linearize flag to the optimizer.
    save_settings['linearize'] = not optimizing and should_linearize(pdf_out, context)
    pdf_out = metadata_fixup(pdf_out, context, pdf_save_settings=save_settings)
    return optimize_pdf(pdf_out, context, executor)
def report_output_pdf(options, start_input_file, optimize_messages) -> ExitCode:
    """Log what kind of output was produced and return the final exit code.

    Output sent to stdout, to a writable stream or to the null device is
    only announced (or not mentioned at all) and is not inspected. A file
    on disk is checked for its PDF/A claim when relevant, validated with
    ``check_pdf`` and then passed to ``report_output_file_size``.

    Returns:
        ``ExitCode.pdfa_conversion_failed`` if PDF/A was required but the
        output does not claim it, ``ExitCode.invalid_output_pdf`` if the
        output fails ``check_pdf``, otherwise ``ExitCode.ok``.
    """
    if options.output_file == '-':
        log.info("Output sent to stdout")
        return ExitCode.ok
    if hasattr(options.output_file, 'writable') and options.output_file.writable():
        log.info("Output written to stream")
        return ExitCode.ok
    if samefile(options.output_file, Path(os.devnull)):
        # Say nothing when sending to dev null
        return ExitCode.ok

    if options.output_type == 'auto':
        # For 'auto' mode, check what we actually produced
        actual_type = options.extra_attrs.get('_actual_output_type', 'pdf')
        pdfa_info = file_claims_pdfa(options.output_file)
        if not pdfa_info['pass']:
            # Regular PDF - this is expected for auto mode fallback
            log.info("Output file is a PDF (auto mode)")
        elif actual_type == 'pdfa':
            log.info(
                "Output file is a %s (auto mode achieved PDF/A)",
                pdfa_info['conformance'],
            )
        else:
            # Unexpectedly got PDF/A
            log.info("Output file is a %s", pdfa_info['conformance'])
    elif options.output_type.startswith('pdfa'):
        pdfa_info = file_claims_pdfa(options.output_file)
        if not pdfa_info['pass']:
            log.warning(
                "Output file is a valid PDF, but conversion to PDF/A did not "
                "succeed (issue: %s)",
                pdfa_info['conformance'],
            )
            return ExitCode.pdfa_conversion_failed
        log.info("Output file is a %s (as expected)", pdfa_info['conformance'])

    if not check_pdf(options.output_file):
        log.warning('Output file: The generated PDF is INVALID')
        return ExitCode.invalid_output_pdf

    report_output_file_size(
        options, start_input_file, options.output_file, optimize_messages
    )
    return ExitCode.ok
================================================
FILE: src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py
================================================
# SPDX-FileCopyrightText: 2019-2023 James R. Barlow
# SPDX-FileCopyrightText: 2019 Martin Wind
# SPDX-License-Identifier: MPL-2.0
"""Implements the concurrent and page synchronous parts of the pipeline."""
from __future__ import annotations
import logging
import logging.handlers
from collections.abc import Sequence
from functools import partial
import PIL
from ocrmypdf._concurrent import Executor
from ocrmypdf._graft import OcrGrafter
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._options import OcrOptions
from ocrmypdf._pipeline import copy_final
from ocrmypdf._pipelines._common import (
HOCRResult,
do_get_pdfinfo,
manage_work_folder,
postprocess,
report_output_pdf,
set_thread_pageno,
setup_pipeline,
worker_init,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.helpers import available_cpu_count
log = logging.getLogger(__name__)
def _exec_hocrtransform_sync(page_context: PageContext) -> HOCRResult:
    """Load the stored hOCR result for one page.

    Returns:
        The result read from the page's 'hocr.json', with ``textpdf`` set to
        the page's 'ocr_hocr.hocr' path. If 'hocr.json' does not exist, an
        otherwise empty result carrying only the page number.
    """
    hocr_json = page_context.get_path('hocr.json')
    if hocr_json.exists():
        loaded = HOCRResult.from_json(hocr_json.read_text())
        # hOCR path is passed directly to the grafting phase where fpdf2
        # renders it
        loaded.textpdf = page_context.get_path('ocr_hocr.hocr')
        return loaded
    # No hOCR file, so no OCR was performed on this page.
    return HOCRResult(pageno=page_context.pageno)
def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[str]:
    """Convert hOCR files to OCR PDF.

    Every page's stored hOCR result is loaded by a worker and grafted onto
    the PDF as it completes; the grafted PDF is then postprocessed and
    copied to the output destination unless the output type is 'none'.

    Args:
        context: The PDF context.
        executor: Executor used to run the per-page tasks.

    Returns:
        The messages from postprocessing, or an empty list when the output
        type is 'none'.
    """
    # Run _exec_hocrtransform_sync on every page
    options = context.options
    # Never use more workers than there are pages.
    jobs = options.jobs or available_cpu_count()
    max_workers = min(len(context.pdfinfo), jobs)
    if max_workers > 1:
        log.info("Continue processing %d pages concurrently", max_workers)
    ocrgraft = OcrGrafter(context)

    def graft_page(result: HOCRResult, pbar: ProgressBar):
        """Graft text only PDF on to main PDF's page."""
        try:
            set_thread_pageno(result.pageno + 1)
            pbar.update()
            ocrgraft.graft_page(
                pageno=result.pageno,
                image=result.pdf_page_from_image,
                ocr_output=result.textpdf,
                ocr_tree=result.ocr_tree,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update()
        finally:
            # Always clear the page number, even if grafting failed.
            set_thread_pageno(None)

    executor(
        use_threads=options.use_threads,
        max_workers=max_workers,
        progress_kwargs=dict(
            # Two progress updates are issued per page (see graft_page).
            total=(2 * len(context.pdfinfo)),
            desc='Grafting hOCR to PDF',
            unit='page',
            unit_scale=0.5,
            disable=not options.progress_bar,
        ),
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=_exec_hocrtransform_sync,
        task_arguments=context.get_page_context_args(),
        task_finished=graft_page,
    )
    pdf = ocrgraft.finalize()
    messages: Sequence[str] = []
    if options.output_type != 'none':
        # PDF/A and metadata
        log.info("Postprocessing...")
        pdf, messages = postprocess(pdf, context, executor)
        # Copy PDF file to destination (we don't know the input PDF file name)
        copy_final(pdf, options.output_file, None)
    return messages
def run_hocr_to_ocr_pdf_pipeline(
    options: OcrOptions,
    *,
    plugin_manager: OcrmypdfPluginManager,
) -> ExitCode:
    """Run the pipeline that turns stored hOCR results into the output PDF.

    The work folder given by ``options.work_folder`` is used and is always
    retained. The source PDF is expected at 'origin.pdf' inside it.

    Returns:
        The exit code reported by ``report_output_pdf``.
    """
    with manage_work_folder(
        work_folder=options.work_folder, retain=True, print_location=False
    ) as folder:
        executor = setup_pipeline(options, plugin_manager)
        origin_pdf = folder / 'origin.pdf'

        # Gather pdfinfo and create context
        pdfinfo = do_get_pdfinfo(origin_pdf, executor, options)
        context = PdfContext(options, folder, origin_pdf, pdfinfo, plugin_manager)

        plugin_manager.check_options(options=options)
        messages = exec_hocr_to_ocr_pdf(context, executor)
        return report_output_pdf(options, origin_pdf, messages)
================================================
FILE: src/ocrmypdf/_pipelines/ocr.py
================================================
# SPDX-FileCopyrightText: 2019-2023 James R. Barlow
# SPDX-FileCopyrightText: 2019 Martin Wind
# SPDX-License-Identifier: MPL-2.0
"""Implements the concurrent and page synchronous parts of the pipeline."""
from __future__ import annotations
import logging
import logging.handlers
from collections.abc import Sequence
from functools import partial
from pathlib import Path
from tempfile import mkdtemp
import PIL
from ocrmypdf._concurrent import Executor
from ocrmypdf._graft import OcrGrafter
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._options import OcrOptions
from ocrmypdf._pipeline import (
copy_final,
is_ocr_required,
merge_sidecars,
ocr_engine_direct,
ocr_engine_hocr,
ocr_engine_textonly_pdf,
triage,
validate_pdfinfo_options,
)
from ocrmypdf._pipelines._common import (
PageResult,
cli_exception_handler,
do_get_pdfinfo,
manage_debug_log_handler,
manage_work_folder,
postprocess,
process_page,
report_output_pdf,
set_thread_pageno,
setup_pipeline,
worker_init,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf._validation import (
check_requested_output_file,
create_input_file,
)
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.helpers import available_cpu_count
from ocrmypdf.models.ocr_element import OcrElement
log = logging.getLogger(__name__)
def _image_to_ocr_text(
    page_context: PageContext, ocr_image_out: Path
) -> tuple[Path | None, Path, OcrElement | None]:
    """Run the OCR engine on an image using the configured PDF renderer.

    Returns:
        Tuple ``(ocr_out, text_out, ocr_tree)``. Exactly one of ``ocr_out``
        and ``ocr_tree`` is set: the tree when the engine supports
        ``generate_ocr()`` under the 'auto'/'fpdf2' renderer, the file
        otherwise.

    Raises:
        NotImplementedError: If the renderer is not 'auto', 'fpdf2' or
            'sandwich'.
    """
    options = page_context.options
    pdf_renderer = options.pdf_renderer

    if pdf_renderer == 'sandwich':
        ocr_out, text_out = ocr_engine_textonly_pdf(ocr_image_out, page_context)
        return ocr_out, text_out, None

    # fpdf2 is the default renderer (auto resolves to fpdf2)
    if pdf_renderer not in ('auto', 'fpdf2'):
        raise NotImplementedError(f"pdf_renderer {pdf_renderer}")

    # Use generate_ocr() if the engine supports it, otherwise use hOCR path
    ocr_engine = page_context.plugin_manager.get_ocr_engine(options=options)
    if ocr_engine and ocr_engine.supports_generate_ocr():
        ocr_tree, text_out = ocr_engine_direct(ocr_image_out, page_context)
        return None, text_out, ocr_tree

    ocr_out, text_out = ocr_engine_hocr(ocr_image_out, page_context)
    return ocr_out, text_out, None
def _exec_page_sync(page_context: PageContext) -> PageResult:
    """Run the per-page pipeline for one page, synchronously.

    Returns:
        A result carrying only the page number when OCR is not required for
        the page, otherwise the full set of per-page outputs.
    """
    pageno = page_context.pageno
    set_thread_pageno(pageno + 1)

    if not is_ocr_required(page_context):
        return PageResult(pageno=pageno)

    ocr_image, page_pdf_from_image, rotation = process_page(page_context)
    ocr_pdf, sidecar_text, tree = _image_to_ocr_text(page_context, ocr_image)
    return PageResult(
        pageno=pageno,
        pdf_page_from_image=page_pdf_from_image,
        ocr=ocr_pdf,
        text=sidecar_text,
        orientation_correction=rotation,
        ocr_tree=tree,
    )
def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:
    """Execute the OCR pipeline concurrently.

    Pages are processed by workers and grafted onto the PDF as each one
    completes. Afterwards the sidecar text is written if requested, and the
    PDF is postprocessed and copied to the output destination unless the
    output type is 'none'.

    Args:
        context: The PDF context.
        executor: Executor used to run the per-page tasks.

    Returns:
        The messages from postprocessing, or an empty list when the output
        type is 'none'.
    """
    options = context.options
    # Never use more workers than there are pages.
    jobs = options.jobs or available_cpu_count()
    max_workers = min(len(context.pdfinfo), jobs)
    if max_workers > 1:
        log.info("Starting processing with %d workers concurrently", max_workers)
    # One slot per page; filled by update_page using the 0-based page number.
    sidecars: list[Path | None] = [None] * len(context.pdfinfo)
    ocrgraft = OcrGrafter(context)

    def update_page(result: PageResult, pbar: ProgressBar):
        """After OCR is complete for a page, update the PDF."""
        try:
            set_thread_pageno(result.pageno + 1)
            sidecars[result.pageno] = result.text
            # Half a page of progress before grafting, half after.
            pbar.update(0.5)
            ocrgraft.graft_page(
                pageno=result.pageno,
                image=result.pdf_page_from_image,
                ocr_output=result.ocr,
                ocr_tree=result.ocr_tree,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update(0.5)
        finally:
            # Always clear the page number, even if grafting failed.
            set_thread_pageno(None)

    executor(
        use_threads=options.use_threads,
        max_workers=max_workers,
        progress_kwargs=dict(
            total=len(context.pdfinfo),
            desc='OCR' if options.ocr_engine != 'none' else 'Image processing',
            unit='page',
            disable=not options.progress_bar,
        ),
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=_exec_page_sync,
        task_arguments=context.get_page_context_args(),
        task_finished=update_page,
    )
    # Output sidecar text
    if options.sidecar:
        text = merge_sidecars(sidecars, context)
        # Copy text file to destination
        copy_final(text, options.sidecar, options.input_file)
    # Merge layers to one single pdf
    pdf = ocrgraft.finalize()
    messages: Sequence[str] = []
    if options.output_type != 'none':
        # PDF/A and metadata
        log.info("Postprocessing...")
        pdf, messages = postprocess(pdf, context, executor)
        # Copy PDF file to destination
        copy_final(pdf, options.output_file, options.input_file)
    return messages
def _run_pipeline(
    options: OcrOptions,
    plugin_manager: OcrmypdfPluginManager,
) -> ExitCode:
    """Run the whole OCR pipeline inside a newly created temporary work folder.

    Sets up the executor, checks the requested output file, stages and
    triages the input, gathers page information, validates the options
    against it, runs the concurrent page pipeline and finally reports on
    the output PDF.

    Args:
        options: The parsed OCR options.
        plugin_manager: The plugin manager to use.

    Returns:
        The exit code returned by ``report_output_pdf``.
    """
    keep_temp = options.keep_temporary_files
    with manage_work_folder(
        work_folder=Path(mkdtemp(prefix="ocrmypdf.io.")),
        retain=keep_temp,
        print_location=keep_temp,
    ) as work_folder:
        with manage_debug_log_handler(options=options, work_folder=work_folder):
            executor = setup_pipeline(options, plugin_manager)
            check_requested_output_file(options)
            start_input_file, original_filename = create_input_file(
                options, work_folder
            )

            # Triage image or pdf
            origin_pdf = triage(
                original_filename,
                start_input_file,
                work_folder / 'origin.pdf',
                options,
            )

            # Gather pdfinfo and create context
            pdfinfo = do_get_pdfinfo(origin_pdf, executor, options)
            context = PdfContext(
                options, work_folder, origin_pdf, pdfinfo, plugin_manager
            )

            # Validate options are okay for this pdf
            validate_pdfinfo_options(context)

            # Execute the pipeline
            optimize_messages = exec_concurrent(context, executor)

            return report_output_pdf(options, start_input_file, optimize_messages)
def run_pipeline_cli(
    options: OcrOptions,
    *,
    plugin_manager: OcrmypdfPluginManager,
) -> ExitCode:
    """Run the OCR pipeline with command line exception handling.

    Delegates to ``cli_exception_handler``, handing it ``_run_pipeline``
    together with the given arguments.

    Args:
        options: The parsed OCR options.
        plugin_manager: The plugin manager to use. This keyword-only argument
            is required; this function does not create one.

    Returns:
        The value returned by ``cli_exception_handler``.
    """
    return cli_exception_handler(_run_pipeline, options, plugin_manager)
def run_pipeline(
    options: OcrOptions,
    *,
    plugin_manager: OcrmypdfPluginManager,
) -> ExitCode:
    """Run the OCR pipeline without command line exception handling.

    Exceptions raised by the pipeline propagate to the caller unchanged.

    Args:
        options: The parsed OCR options.
        plugin_manager: The plugin manager to use. This keyword-only argument
            is required; this function does not create one.

    Returns:
        The exit code returned by ``_run_pipeline``.
    """
    return _run_pipeline(options, plugin_manager)
================================================
FILE: src/ocrmypdf/_pipelines/pdf_to_hocr.py
================================================
# SPDX-FileCopyrightText: 2019-2023 James R. Barlow
# SPDX-FileCopyrightText: 2019 Martin Wind
# SPDX-License-Identifier: MPL-2.0
"""Implements the concurrent and page synchronous parts of the pipeline."""
from __future__ import annotations
import logging
import logging.handlers
import shutil
from functools import partial
import PIL
from ocrmypdf._concurrent import Executor
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._options import OcrOptions
from ocrmypdf._pipeline import (
is_ocr_required,
ocr_engine_hocr,
validate_pdfinfo_options,
)
from ocrmypdf._pipelines._common import (
HOCRResult,
do_get_pdfinfo,
manage_work_folder,
process_page,
set_thread_pageno,
setup_pipeline,
worker_init,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf.helpers import available_cpu_count
log = logging.getLogger(__name__)
def _exec_page_hocr_sync(page_context: PageContext) -> HOCRResult:
    """Produce the hOCR result for one page.

    Pages for which ``is_ocr_required`` is false yield a result carrying only
    the page number. Otherwise the page is processed, run through the hOCR
    engine, and the result is also written as JSON to the page's
    ``hocr.json`` path.

    Args:
        page_context: The context of the page to process.

    Returns:
        The ``HOCRResult`` for the page.
    """
    pageno = page_context.pageno
    set_thread_pageno(pageno + 1)

    if not is_ocr_required(page_context):
        return HOCRResult(pageno=pageno)

    ocr_image, page_pdf, rotation = process_page(page_context)
    # Only the first element of the engine's return value is used here.
    hocr_file, _unused = ocr_engine_hocr(ocr_image, page_context)

    page_result = HOCRResult(
        pageno=pageno,
        pdf_page_from_image=page_pdf,
        hocr=hocr_file,
        orientation_correction=rotation,
    )
    json_path = page_context.get_path('hocr.json')
    json_path.write_text(page_result.to_json())
    return page_result
def exec_pdf_to_hocr(context: PdfContext, executor: Executor) -> None:
    """Execute the OCR pipeline concurrently and output hOCR.

    Args:
        context: The PDF-wide job context; supplies the options, the page
            info and the per-page task arguments.
        executor: The executor callable that schedules the page tasks.
    """
    options = context.options
    worker_limit = options.jobs or available_cpu_count()
    page_count = len(context.pdfinfo)
    # Never start more workers than there are pages.
    max_workers = min(page_count, worker_limit)
    if max_workers > 1:
        log.info("Starting processing with %d workers concurrently", max_workers)

    # NOTE(review): the total is twice the page count and is paired with
    # unit_scale=0.5; presumably the progress bar scales it back to whole
    # pages -- confirm against the progress bar class in use.
    progress_settings = {
        'total': 2 * page_count,
        'desc': 'hOCR',
        'unit': 'page',
        'unit_scale': 0.5,
        'disable': not options.progress_bar,
    }

    # Run _exec_page_hocr_sync on every page
    executor(
        use_threads=options.use_threads,
        max_workers=max_workers,
        progress_kwargs=progress_settings,
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=_exec_page_hocr_sync,
        task_arguments=context.get_page_context_args(),
    )
def run_hocr_pipeline(
    options: OcrOptions,
    *,
    plugin_manager: OcrmypdfPluginManager,
) -> None:
    """Run pipeline to output hOCR.

    The output folder doubles as the work folder. The input file is copied
    there as ``origin.pdf`` before page information is gathered.

    Args:
        options: The parsed OCR options; ``output_folder`` must be set.
        plugin_manager: The plugin manager to use.

    Raises:
        ValueError: If ``options.output_folder`` is ``None``.
    """
    output_folder = options.output_folder
    if output_folder is None:
        raise ValueError("output_folder must be specified for hOCR pipeline")

    with manage_work_folder(
        work_folder=output_folder, retain=True, print_location=False
    ) as work_folder:
        executor = setup_pipeline(options, plugin_manager)
        staged_pdf = work_folder / 'origin.pdf'
        shutil.copy2(options.input_file, staged_pdf)

        # Gather pdfinfo and create context
        pdfinfo = do_get_pdfinfo(staged_pdf, executor, options)
        # The context is created with the original input file, not the copy.
        context = PdfContext(
            options, work_folder, options.input_file, pdfinfo, plugin_manager
        )

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)
        exec_pdf_to_hocr(context, executor)
================================================
FILE: src/ocrmypdf/_plugin_manager.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Plugin manager using pluggy with type-safe interface."""
from __future__ import annotations
import importlib
import importlib.util
import pkgutil
import sys
from argparse import ArgumentParser
from collections.abc import Sequence
from logging import Handler
from pathlib import Path
from typing import TYPE_CHECKING
import pluggy
from pydantic import BaseModel
import ocrmypdf.builtin_plugins
from ocrmypdf import Executor, PdfContext, pluginspec
from ocrmypdf._options import OcrOptions
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.helpers import Resolution
from ocrmypdf.pluginspec import OcrEngine
if TYPE_CHECKING:
from PIL import Image
from ocrmypdf._jobcontext import PageContext
from ocrmypdf.pdfinfo import PdfInfo
class OcrmypdfPluginManager:
    """Type-safe wrapper around pluggy.PluginManager.

    Capable of reconstructing itself in child workers via pickle.

    This class provides type-safe methods for all hooks defined in pluginspec.py,
    removing the need for unsafe `hook.method_name()` calls.
    """

    def __init__(
        self,
        *args,
        plugins: Sequence[str | Path],
        builtins: bool = True,
        **kwargs,
    ):
        """Create the underlying pluggy manager and register all plugins.

        Args:
            *args: Positional arguments forwarded to ``pluggy.PluginManager``.
            plugins: Plugins to register, each given either as a dotted
                module name or as a file (a ``Path``, or a string ending in
                ``.py``).
            builtins: Whether to register the modules found in
                ``ocrmypdf.builtin_plugins``.
            **kwargs: Keyword arguments forwarded to ``pluggy.PluginManager``.
        """
        # The constructor arguments are kept so that __getstate__ can
        # serialize them and __setstate__ can rebuild an equivalent manager.
        self._init_args = args
        self._init_kwargs = kwargs
        self._plugins = plugins
        self._builtins = builtins
        self._pm = pluggy.PluginManager(*args, **kwargs)
        self._setup_plugins()

    @property
    def pluggy(self) -> pluggy.PluginManager:
        """Access the underlying pluggy.PluginManager for advanced use cases.

        This is useful for plugins that need to call methods like set_blocked()
        in their initialize hook.
        """
        return self._pm

    def __getstate__(self):
        """Return the constructor arguments as the pickled state."""
        state = dict(
            init_args=self._init_args,
            plugins=self._plugins,
            builtins=self._builtins,
            init_kwargs=self._init_kwargs,
        )
        return state

    def __setstate__(self, state):
        """Rebuild the manager by re-running ``__init__`` with the saved arguments."""
        self.__init__(
            *state['init_args'],
            plugins=state['plugins'],
            builtins=state['builtins'],
            **state['init_kwargs'],
        )

    def _setup_plugins(self):
        """Register hook specs, builtin plugins, entry points and user plugins.

        Raises:
            ImportError: If a plugin given as a file cannot be loaded because
                no import spec or loader could be determined for it.
        """
        self._pm.add_hookspecs(pluginspec)

        # 1. Register builtins
        if self._builtins:
            # Sort on the module name explicitly. The ModuleInfo tuples start
            # with the finder object, which does not define an ordering.
            for module_info in sorted(
                pkgutil.iter_modules(ocrmypdf.builtin_plugins.__path__),
                key=lambda info: info.name,
            ):
                name = f'ocrmypdf.builtin_plugins.{module_info.name}'
                module = importlib.import_module(name)
                self._pm.register(module)

        # 2. Register setuptools plugins
        self._pm.load_setuptools_entrypoints('ocrmypdf')

        # 3. Register plugins specified on command line
        for name in self._plugins:
            if isinstance(name, Path) or name.endswith('.py'):
                # Import by filename
                module_name = Path(name).stem
                spec = importlib.util.spec_from_file_location(module_name, name)
                # spec_from_file_location() returns None when it cannot
                # determine how to load the file; report that clearly
                # instead of failing on an attribute access of None.
                if spec is None or spec.loader is None:
                    raise ImportError(f"Could not load plugin from file: {name}")
                module = importlib.util.module_from_spec(spec)
                sys.modules[module_name] = module
                spec.loader.exec_module(module)
            else:
                # Import by dotted module name
                module = importlib.import_module(name)
            self._pm.register(module)

    # =========================================================================
    # Type-safe hook methods
    # =========================================================================

    # --- firstresult hooks ---

    def get_logging_console(self) -> Handler | None:
        """Returns a custom logging handler for progress bar compatibility."""
        return self._pm.hook.get_logging_console()

    def get_executor(self, *, progressbar_class: type[ProgressBar]) -> Executor | None:
        """Returns an executor for parallel processing."""
        return self._pm.hook.get_executor(progressbar_class=progressbar_class)

    def get_progressbar_class(self) -> type[ProgressBar] | None:
        """Returns a progress bar class."""
        return self._pm.hook.get_progressbar_class()

    def rasterize_pdf_page(
        self,
        *,
        input_file: Path,
        output_file: Path,
        raster_device: str,
        raster_dpi: Resolution,
        pageno: int,
        page_dpi: Resolution | None,
        rotation: int | None,
        filter_vector: bool,
        stop_on_soft_error: bool,
        options: OcrOptions | None,
        use_cropbox: bool,
    ) -> Path | None:
        """Rasterize one page of a PDF at specified resolution."""
        return self._pm.hook.rasterize_pdf_page(
            input_file=input_file,
            output_file=output_file,
            raster_device=raster_device,
            raster_dpi=raster_dpi,
            pageno=pageno,
            page_dpi=page_dpi,
            rotation=rotation,
            filter_vector=filter_vector,
            stop_on_soft_error=stop_on_soft_error,
            options=options,
            use_cropbox=use_cropbox,
        )

    def filter_ocr_image(
        self, *, page: PageContext, image: Image.Image
    ) -> Image.Image | None:
        """Filter the image before it is sent to OCR."""
        return self._pm.hook.filter_ocr_image(page=page, image=image)

    def filter_page_image(
        self, *, page: PageContext, image_filename: Path
    ) -> Path | None:
        """Filter the whole page image before it is inserted into the PDF."""
        return self._pm.hook.filter_page_image(page=page, image_filename=image_filename)

    def filter_pdf_page(
        self, *, page: PageContext, image_filename: Path, output_pdf: Path
    ) -> Path:
        """Convert a filtered whole page image into a PDF.

        Raises:
            ValueError: If the hook returns ``None`` or anything other than
                ``output_pdf``.
        """
        result = self._pm.hook.filter_pdf_page(
            page=page, image_filename=image_filename, output_pdf=output_pdf
        )
        if result is None:
            raise ValueError('No PDF produced')
        if result != output_pdf:
            raise ValueError('filter_pdf_page must return output_pdf')
        return result

    def get_ocr_engine(self, *, options: OcrOptions | None = None) -> OcrEngine:
        """Returns an OcrEngine to use for processing.

        Args:
            options: OcrOptions to pass to the hook for engine selection.

        Raises:
            ValueError: If the hook does not return an engine.
        """
        result = self._pm.hook.get_ocr_engine(options=options)
        if result is None:
            raise ValueError('No OCR engine selected')
        return result

    def generate_pdfa(
        self,
        *,
        pdf_pages: list[Path],
        pdfmark: Path,
        output_file: Path,
        context: PdfContext,
        pdf_version: str,
        pdfa_part: str,
        progressbar_class: type[ProgressBar] | None,
        stop_on_soft_error: bool,
    ) -> Path | None:
        """Generate a PDF/A file."""
        return self._pm.hook.generate_pdfa(
            pdf_pages=pdf_pages,
            pdfmark=pdfmark,
            output_file=output_file,
            context=context,
            pdf_version=pdf_version,
            pdfa_part=pdfa_part,
            progressbar_class=progressbar_class,
            stop_on_soft_error=stop_on_soft_error,
        )

    def optimize_pdf(
        self,
        *,
        input_pdf: Path,
        output_pdf: Path,
        context: PdfContext,
        executor: Executor,
        linearize: bool,
    ) -> tuple[Path, Sequence[str]]:
        """Optimize a PDF after OCR processing.

        Returns:
            The hook's result, or ``(input_pdf, [])`` when the hook returns
            ``None``.
        """
        result = self._pm.hook.optimize_pdf(
            input_pdf=input_pdf,
            output_pdf=output_pdf,
            context=context,
            executor=executor,
            linearize=linearize,
        )
        if result is None:
            return input_pdf, []
        return result

    def is_optimization_enabled(self, *, context: PdfContext) -> bool | None:
        """Returns whether optimization is enabled for given context."""
        return self._pm.hook.is_optimization_enabled(context=context)

    # --- non-firstresult hooks ---

    def initialize(self, *, plugin_manager: pluggy.PluginManager) -> list[None]:
        """Called when plugins are first loaded.

        Args:
            plugin_manager: The underlying pluggy.PluginManager, allowing
                plugins to call methods like set_blocked().
        """
        return self._pm.hook.initialize(plugin_manager=plugin_manager)

    def add_options(self, *, parser: ArgumentParser) -> list[None]:
        """Allows plugins to add command line and API arguments."""
        return self._pm.hook.add_options(parser=parser)

    def register_options(self) -> list[dict[str, type[BaseModel]]]:
        """Returns plugin option models keyed by namespace."""
        return self._pm.hook.register_options()

    def check_options(self, *, options: OcrOptions) -> list[None]:
        """Called to validate options after parsing."""
        return self._pm.hook.check_options(options=options)

    def validate(self, *, pdfinfo: PdfInfo, options: OcrOptions) -> list[None]:
        """Called to validate options and pdfinfo after PDF is loaded."""
        return self._pm.hook.validate(pdfinfo=pdfinfo, options=options)
def get_plugin_manager(
    plugins: Sequence[str | Path] | None = None, builtins=True
) -> OcrmypdfPluginManager:
    """Create an ``OcrmypdfPluginManager`` for the ``ocrmypdf`` project.

    Args:
        plugins: Plugins to register in addition to the defaults; ``None``
            is treated as an empty list.
        builtins: Passed through to the manager's ``builtins`` argument.

    Returns:
        The newly constructed plugin manager.
    """
    if plugins is None:
        plugins = []
    return OcrmypdfPluginManager(
        project_name='ocrmypdf', plugins=plugins, builtins=builtins
    )
# Names exported by a star-import of this module.
__all__ = ['OcrmypdfPluginManager', 'get_plugin_manager']
================================================
FILE: src/ocrmypdf/_plugin_registry.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Plugin option registry for dynamic model composition."""
from __future__ import annotations
import logging
from pydantic import BaseModel
log = logging.getLogger(__name__)
class PluginOptionRegistry:
    """Registry for plugin option models.

    This registry collects option models from plugins during initialization.
    Plugin options can be accessed via nested namespaces on OcrOptions
    (e.g., options.tesseract.timeout) or via flat field names for backward
    compatibility (e.g., options.tesseract_timeout).
    """

    def __init__(self):
        """Start with no registered option models."""
        self._option_models: dict[str, type[BaseModel]] = {}

    def register_option_model(
        self, namespace: str, model_class: type[BaseModel]
    ) -> None:
        """Store ``model_class`` under ``namespace``.

        Registering a namespace again replaces the earlier model and logs a
        warning.

        Args:
            namespace: The namespace for the plugin options (e.g., 'tesseract')
            model_class: The Pydantic model class for the plugin options
        """
        models = self._option_models
        if namespace in models:
            log.warning(
                f"Plugin option namespace '{namespace}' already registered, overriding"
            )
        models[namespace] = model_class
        log.debug(
            f"Registered plugin option model for namespace '{namespace}': "
            f"{model_class.__name__}"
        )

    def get_registered_models(self) -> dict[str, type[BaseModel]]:
        """Return a shallow copy of the namespace-to-model mapping."""
        return dict(self._option_models)
================================================
FILE: src/ocrmypdf/_progressbar.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Defines progress bar API."""
from __future__ import annotations
from typing import Protocol
from rich.console import Console
from rich.progress import (
BarColumn,
MofNCompleteColumn,
Progress,
TaskProgressColumn,
TextColumn,
TimeRemainingColumn,
)
from rich.table import Column
class ProgressBar(Protocol):
    """The protocol that OCRmyPDF expects progress bar classes to be compatible with.

    In practice this could be used for any type of monitoring, not just a
    progress bar.

    Calling the class should return a new progress bar object, which is activated
    with ``__enter__`` and terminated with ``__exit__``. An update method is called
    whenever the progress bar is updated. Progress bar objects will not be reused;
    a new one will be created for each group of tasks.

    The progress bar is held in the main process/thread and not updated by child
    process/threads. When a child notifies the parent of completed work, the
    parent updates the progress bar.

    Progress bars should never write to ``sys.stdout``, or they will corrupt the
    output if OCRmyPDF writes a PDF to standard output.

    Note:
        The type of events that OCRmyPDF reports to a progress bar may change in
        minor releases.

    Args:
        total (int | float | None):
            The total number of work units expected. If ``None``, the total is unknown.
            For example, if you are processing pages, this might be the number of pages,
            or if you are measuring overall progress in percent, this might be 100.
        desc (str | None):
            A brief description of the current step (e.g. "Scanning contents",
            "OCR", "PDF/A conversion"). OCRmyPDF updates this before each major step.
        unit (str | None):
            A short label for the type of work being tracked
            (e.g. "page", "%", "image").
        disable (bool):
            If ``True``, progress updates are suppressed (no output).
            Defaults to ``False``.
        **kwargs:
            Future or extra parameters that OCRmyPDF might pass. Implementations
            should accept and ignore unrecognized keywords gracefully.

    Example:
        A simple plugin implementation could look like this:

        .. code-block:: python

            from ocrmypdf.pluginspec import ProgressBar
            from ocrmypdf import hookimpl

            class ConsoleProgressBar(ProgressBar):
                def __init__(self, *, total=None, desc=None, unit=None, disable=False,
                             **kwargs):
                    self.total = total
                    self.desc = desc
                    self.unit = unit
                    self.disable = disable
                    self.current = 0

                def __enter__(self):
                    if not self.disable:
                        print(f"Starting {self.desc or 'an OCR task'} "
                              f"(total={self.total} {self.unit})"
                        )
                    return self

                def __exit__(self, exc_type, exc_value, traceback):
                    if not self.disable:
                        if exc_type is None:
                            print("Completed successfully.")
                        else:
                            print(f"Task ended with error: {exc_value}")
                    return False  # Let OCRmyPDF raise any exceptions

                def update(self, n=1, *, completed=None):
                    if completed is not None:
                        # If 'completed' is given, set self.current
                        # but let's just read it to show usage
                        print(f"Absolute completion reported: {completed}")
                    # Otherwise, we increment by 'n'
                    self.current += n
                    if not self.disable:
                        if self.total:
                            percent = (self.current / self.total) * 100
                            print(
                                f"{self.desc}: {self.current}"
                                f"/{self.total} ({percent:.1f}%)"
                            )
                        else:
                            print(f"{self.desc}: {self.current} units done")

            @hookimpl
            def get_progressbar_class():
                return ConsoleProgressBar
    """

    def __init__(
        self,
        *,
        total: int | float | None,
        desc: str | None,
        unit: str | None,
        disable: bool = False,
        **kwargs,
    ):
        """Initialize a progress bar.

        This is called once before any work is done. OCRmyPDF supplies the total
        number of units (or None if unknown), a description of the work, and the
        type of units. The ``disable`` parameter can be used to turn off progress
        reporting. Unrecognized keyword arguments should be ignored.

        Args:
            total (int | float | None):
                The total amount of work. If ``None``, the total is unknown.
            desc (str | None):
                A description of the current task. May change for different stages.
            unit (str | None):
                A short label for the unit of work.
            disable (bool):
                If ``True``, no output or logging should be displayed.
            **kwargs:
                Extra parameters that may be passed by OCRmyPDF in future versions.
        """

    def __enter__(self):
        """Enter a progress bar context."""

    def __exit__(self, *args):
        """Exit a progress bar context."""

    def update(self, n: float = 1, *, completed: float | None = None):
        """Increment the progress bar by ``n`` units, or set an absolute completion.

        OCRmyPDF calls this method repeatedly while processing pages or other tasks.
        If your total is known and you track it, you might do something like:

        .. code-block:: python

            self.current += n
            percent = (self.current / total) * 100

        The ``completed`` argument can indicate an absolute position, which is
        particularly helpful if you're tracking a percentage of work (e.g., 0 to 100)
        and want precise updates. In contrast, the incremental parameter ``n`` is
        often more useful for page-based increments.

        Args:
            n (float, optional):
                The amount to increment the progress by. Defaults to 1. May be
                fractional if OCRmyPDF performs partial steps. If you are tracking
                pages, this is typically how many pages have been processed in the
                most recent step.
            completed (float | None, optional):
                The absolute amount of work completed so far. This can override or
                supplement the simple increment logic. It's particularly useful
                for percentage-based tracking (e.g., when ``total`` is 100).
        """
class NullProgressBar:
    """Progress bar API that takes no actions."""

    def __init__(self, **kwargs):
        """Accept and discard any keyword arguments."""

    def __enter__(self):
        """Return this object as the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the context without suppressing exceptions."""
        return False

    def update(self, _arg=None, *, completed=None):
        """Ignore the progress update."""
        return None
class RichProgressBar:
    """Display progress bar using rich."""

    def __init__(
        self,
        *,
        console: Console,
        desc: str,
        total: float | None = None,
        unit: str | None = None,
        unit_scale: float | None = 1.0,
        disable: bool = False,
        **kwargs,
    ):
        """Build the rich ``Progress`` display and add its single task.

        Args:
            console: The rich console to draw on.
            desc: The task description.
            total: The total amount of work, or ``None`` if unknown. It is
                multiplied by ``unit_scale`` when both are given.
            unit: Passed to the task as its ``unit`` field.
            unit_scale: Scale applied to ``total``; also the amount advanced
                when ``update`` is called with ``n=None``.
            disable: Passed to ``Progress`` as ``disable``.
            **kwargs: Extra keyword arguments forwarded to ``Progress``.
        """
        self._entered = False
        columns = (
            TextColumn(
                "[progress.description]{task.description}",
                table_column=Column(min_width=20),
            ),
            BarColumn(),
            TaskProgressColumn(),
            MofNCompleteColumn(),
            TimeRemainingColumn(),
        )
        self.progress = Progress(
            *columns,
            console=console,
            auto_refresh=True,
            redirect_stderr=True,
            redirect_stdout=False,
            disable=disable,
            **kwargs,
        )
        self.unit_scale = unit_scale
        if total is None or self.unit_scale is None:
            scaled_total = None
        else:
            scaled_total = total * self.unit_scale
        self.progress_bar = self.progress.add_task(
            desc, total=scaled_total, unit=unit
        )

    def __enter__(self):
        """Start the display and mark the bar as entered."""
        self.progress.start()
        self._entered = True
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Refresh once more, stop the display, and do not suppress exceptions."""
        self.progress.refresh()
        self.progress.stop()
        return False

    def update(self, n=1, *, completed=None):
        """Advance the task, or set its absolute completion.

        When ``completed`` is given it takes precedence over ``n``. Otherwise
        the task advances by ``n``, or by ``unit_scale`` if ``n`` is ``None``.
        """
        assert self._entered, "Progress bar must be entered before updating"
        if completed is not None:
            self.progress.update(self.progress_bar, completed=completed)
            return
        step = n if n is not None else self.unit_scale
        self.progress.update(self.progress_bar, advance=step)
================================================
FILE: src/ocrmypdf/_validation.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Validate a work order from API or command line."""
from __future__ import annotations
import logging
import os
import sys
from collections.abc import Sequence
from pathlib import Path
from shutil import copyfileobj
import pikepdf
from ocrmypdf._defaults import DEFAULT_ROTATE_PAGES_THRESHOLD
from ocrmypdf._exec import unpaper
from ocrmypdf._options import OcrOptions
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf.exceptions import (
BadArgsError,
InputFileError,
MissingDependencyError,
OutputFileAccessError,
)
from ocrmypdf.helpers import (
is_file_writable,
running_in_docker,
running_in_snap,
safe_symlink,
)
from ocrmypdf.subprocess import check_external_program
log = logging.getLogger(__name__)
def check_platform() -> None:
    """Log a warning when ``sys.maxsize`` indicates a 32-bit interpreter."""
    if sys.maxsize > 2**32:
        return
    log.warning(  # pragma: no cover
        "You are running OCRmyPDF in a 32-bit (x86) Python interpreter. "
        "This is not supported. 32-bit does not have enough address space "
        "to process large files. "
        "Please use a 64-bit (x86-64) version of Python."
    )
def check_options_languages(
    options: OcrOptions, ocr_engine_languages: list[str]
) -> None:
    """Check the requested languages against the OCR engine's language list.

    Args:
        options: The options; ``options.languages`` holds the requested
            languages.
        ocr_engine_languages: The languages the OCR engine reports as
            available. If empty, the availability check is skipped.

    Raises:
        BadArgsError: If a language reserved for Tesseract's internal use
            ('equ' or 'osd') was requested.
        MissingDependencyError: If a requested language is not among
            ``ocr_engine_languages``.
    """
    requested = set(options.languages)

    # Check for blocked languages first, before checking if they're installed
    denied_languages = {'equ', 'osd'}
    blocked = denied_languages & requested
    if blocked:
        # Sorted so the message does not depend on set iteration order.
        raise BadArgsError(
            "The following languages are for Tesseract's internal use and "
            "should not be issued explicitly: "
            f"{', '.join(sorted(blocked))}\n"
            "Remove them from the -l/--language argument."
        )

    if not ocr_engine_languages:
        return

    missing_languages = requested - set(ocr_engine_languages)
    if missing_languages:
        # Sorted so the message does not depend on set iteration order.
        lang_text = '\n'.join(sorted(missing_languages))
        msg = (
            "OCR engine does not have language data for the following "
            "requested languages: \n"
            f"{lang_text}\n"
            "Please install the appropriate language data for your OCR engine.\n"
            "\n"
            "See the online documentation for instructions:\n"
            "  https://ocrmypdf.readthedocs.io/en/latest/languages.html\n"
            "\n"
            "Note: most languages are identified by a 3-letter ISO 639-2 Code.\n"
            "For example, English is 'eng', German is 'deu', and Spanish is 'spa'.\n"
            "Simplified Chinese is 'chi_sim' and Traditional Chinese is 'chi_tra'."
            "\n"
        )
        raise MissingDependencyError(msg)
def check_options_sidecar(options: OcrOptions) -> None:
    """Resolve and validate the sidecar text file option.

    When ``options.sidecar`` is the placeholder ``'\\0'``, it is replaced by
    the output file name with ``.txt`` appended. ``options`` is modified in
    place.

    Args:
        options: The options to check and update.

    Raises:
        BadArgsError: If the sidecar name has to be derived but the output
            is stdout or the null device, or if the sidecar equals the input
            or output file.
    """
    if options.sidecar == '\0':
        output_file = options.output_file
        # A path-like output (e.g. pathlib.Path) neither compares equal to
        # the string sentinels below nor supports ``+ '.txt'``, so convert
        # it to its string form first. Other values are left untouched.
        if isinstance(output_file, os.PathLike):
            output_file = os.fspath(output_file)
        if output_file == '-':
            raise BadArgsError("--sidecar filename needed when output file is stdout.")
        elif output_file == os.devnull:
            raise BadArgsError(
                "--sidecar filename needed when output file is /dev/null or NUL."
            )
        options.sidecar = output_file + '.txt'
    if options.sidecar == options.input_file or options.sidecar == options.output_file:
        raise BadArgsError(
            "--sidecar file must be different from the input and output files"
        )
def check_options_preprocessing(options: OcrOptions) -> None:
    """Validate the image preprocessing options.

    ``options.clean`` is switched on in place when ``options.clean_final`` is
    set. If cleaning is enabled, the ``unpaper`` program is checked via
    ``check_external_program``.

    Args:
        options: The options to check and update.

    Raises:
        BadArgsError: If ``unpaper_args`` is given without cleaning, or a
            non-default ``rotate_pages_threshold`` is given without
            ``rotate_pages``.
    """
    # --clean-final implies --clean.
    if options.clean_final:
        options.clean = True

    if options.unpaper_args and not options.clean:
        raise BadArgsError("--clean is required for --unpaper-args")

    threshold_changed = (
        options.rotate_pages_threshold != DEFAULT_ROTATE_PAGES_THRESHOLD
    )
    if threshold_changed and not options.rotate_pages:
        raise BadArgsError("--rotate-pages is required for --rotate-pages-threshold")

    if not options.clean:
        return
    check_external_program(
        program='unpaper',
        package='unpaper',
        version_checker=unpaper.version,
        need_version='6.1',
        required_for="--clean, --clean-final",
    )
def _check_plugin_invariant_options(options: OcrOptions) -> None:
    """Run the option checks that do not involve the plugin manager.

    Args:
        options: The options to check. The sidecar and preprocessing checks
            may modify them in place.
    """
    check_platform()
    check_options_sidecar(options)
    check_options_preprocessing(options)
def _check_plugin_options(
    options: OcrOptions, plugin_manager: OcrmypdfPluginManager
) -> None:
    """Run the option checks that involve the plugin manager.

    Args:
        options: The options to check.
        plugin_manager: The plugin manager providing the ``check_options``
            hook and the OCR engine.
    """
    # First, let plugins check their external dependencies
    plugin_manager.check_options(options=options)

    # Then check OCR engine language support
    engine = plugin_manager.get_ocr_engine(options=options)
    available_languages = engine.languages(options)
    check_options_languages(options, available_languages)

    # Finally, run comprehensive validation using the coordinator
    from ocrmypdf._validation_coordinator import ValidationCoordinator

    ValidationCoordinator(plugin_manager).validate_all_options(options)
def check_options(options: OcrOptions, plugin_manager: OcrmypdfPluginManager) -> None:
    """Check options for validity and consistency.

    This function coordinates validation across the entire system:

    1. Core validation (platform, files, preprocessing)
    2. Plugin external dependency validation
    3. Plugin-specific validation (handled by plugin models)
    4. Cross-cutting validation (handled by validation coordinator)

    Args:
        options: The options to check.
        plugin_manager: The plugin manager used for the plugin-dependent
            checks.
    """
    # Checks that do not need the plugin manager run first.
    _check_plugin_invariant_options(options)
    _check_plugin_options(options, plugin_manager)
def create_input_file(options: OcrOptions, work_folder: Path) -> tuple[Path, str]:
    """Stage the input inside the work folder.

    Standard input (``'-'``) and readable stream objects are copied to a
    file in ``work_folder``. Any other input is linked into the work folder
    through ``safe_symlink``.

    Args:
        options: The options; ``options.input_file`` is ``'-'``, a stream
            object, or a file path.
        work_folder: The folder in which the staged input is created.

    Returns:
        A tuple of the staged file's path and a name for the original input:
        ``"stdin"``, ``"stream"``, or the input file's path as a string.

    Raises:
        InputFileError: If the input stream is not readable or the input
            file is not found.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"
    elif hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError as e:
            msg = f"File not found - {options.input_file}"
            if running_in_docker():  # pragma: no cover
                msg += (
                    "\nDocker cannot access your working directory unless you "
                    "explicitly share it with the Docker container and set up "
                    "permissions correctly.\n"
                    "You may find it easier to use stdin/stdout:"
                    "\n"
                    "\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf"
                    "\n"
                )
            elif running_in_snap():  # pragma: no cover
                msg += (
                    "\nSnap applications cannot access files outside of "
                    "your home directory unless you explicitly allow it. "
                    "You may find it easier to use stdin/stdout:"
                    "\n"
                    "\tsnap run ocrmypdf - - <input.pdf >output.pdf"
                    "\n"
                )
            raise InputFileError(msg) from e
def check_requested_output_file(options: OcrOptions) -> None:
    """Verify that the requested output destination can be written.

    Args:
        options: The options; ``options.output_file`` is ``'-'`` for stdout,
            an object with a ``writable`` method, or a file location.

    Raises:
        BadArgsError: If output goes to stdout and stdout is a terminal.
        OutputFileAccessError: If the output stream or file is not writable,
            or if ``options.no_overwrite`` is set and the output file
            already exists.
    """
    destination = options.output_file
    to_stdout = destination == '-'
    to_stream = hasattr(destination, 'writable')

    if to_stdout:
        if sys.stdout.isatty():
            raise BadArgsError(
                "Output was set to stdout '-' but it looks like stdout "
                "is connected to a terminal.  Please redirect stdout to a "
                "file."
            )
    elif to_stream:
        if not destination.writable():
            raise OutputFileAccessError("Output stream is not writable")
    elif not is_file_writable(destination):
        raise OutputFileAccessError(
            f"Output file location ({options.output_file}) is not a writable file."
        )

    # The overwrite check only applies to real files, not streams or stdout.
    if not options.no_overwrite or to_stream or to_stdout:
        return
    if Path(str(destination)).exists():
        raise OutputFileAccessError(
            f"Output file already exists: {options.output_file}\n"
            "To overwrite it, omit the --no-overwrite / -n option."
        )
def report_output_file_size(
    options: OcrOptions,
    input_file: Path,
    output_file: Path,
    optimize_messages: Sequence[str] | None = None,
    file_overhead: int = 4000,
    page_overhead: int = 3000,
) -> None:
    """Log a warning when the output file is much larger than the input file.

    Nothing is reported when either file is missing, when the input is
    smaller than 25000 bytes (including empty), or when the output is less
    than 1.35 times the input size plus the allowed overhead.

    Args:
        options: The options, inspected to list likely reasons for growth.
        input_file: Path of the input file.
        output_file: Path of the output PDF.
        optimize_messages: Extra reasons to include in the warning.
        file_overhead: Bytes of growth allowed per file.
        page_overhead: Bytes of growth allowed per page of the output PDF.
    """
    if optimize_messages is None:
        optimize_messages = []
    try:
        output_size = Path(output_file).stat().st_size
        input_size = Path(input_file).stat().st_size
    except FileNotFoundError:
        return  # Outputting to stream or something

    # Small inputs are never reported on. Checking this before anything else
    # avoids dividing by zero for an empty input and avoids opening the
    # output PDF when the answer is already known.
    if input_size < 25000:
        return

    with pikepdf.open(output_file) as p:
        # Overhead constants obtained by estimating amount of data added by OCR
        # PDF/A conversion, and possible XMP metadata addition, with compression
        reasonable_overhead = file_overhead + page_overhead * len(p.pages)

    ratio = output_size / input_size
    reasonable_ratio = output_size / (input_size + reasonable_overhead)
    if reasonable_ratio < 1.35:
        return  # Seems fine

    reasons = []
    # A tuple rather than a set, so the reasons appear in a stable order.
    image_preproc = (
        'deskew',
        'clean_final',
        'remove_background',
        'oversample',
    )
    for arg in image_preproc:
        if getattr(options, arg, False):
            reasons.append(
                f"--{arg.replace('_', '-')} was issued, causing transcoding."
            )
    # Check force_ocr via the backward-compatible property
    if options.force_ocr:
        reasons.append("--force-ocr (or --mode force) was issued, causing transcoding.")
    reasons.extend(optimize_messages)
    if options.output_type.startswith('pdfa'):
        reasons.append("PDF/A conversion was enabled. (Try `--output-type pdf`.)")
    if options.plugins:
        reasons.append("Plugins were used.")

    if reasons:
        explanation = "Possible reasons for this include:\n" + '\n'.join(reasons) + "\n"
    else:
        explanation = "No reason for this increase is known.  Please report this issue."
    log.warning(
        f"The output file size is {ratio:.2f}× larger than the input file.\n"
        f"{explanation}"
    )
================================================
FILE: src/ocrmypdf/_validation_coordinator.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Validation coordinator for plugin options and cross-cutting concerns."""
from __future__ import annotations
import logging
import os
from typing import TYPE_CHECKING
if TYPE_CHECKING:
import pluggy
from ocrmypdf._options import OcrOptions
log = logging.getLogger(__name__)
class ValidationCoordinator:
"""Coordinates validation across plugin models and core options."""
def __init__(self, plugin_manager: pluggy.PluginManager):
    """Store the plugin manager and look up its option registry.

    Args:
        plugin_manager: The plugin manager whose options are validated.
    """
    self.plugin_manager = plugin_manager
    # The registry is optional: None is stored when the plugin manager has
    # no ``_option_registry`` attribute.
    self.registry = getattr(plugin_manager, '_option_registry', None)
def validate_all_options(self, options: OcrOptions) -> None:
    """Run comprehensive validation on all options.

    This runs validation in the correct order:

    1. Plugin self-validation (already done by Pydantic)
    2. Plugin context validation (requires external context)
    3. Cross-cutting validation (between plugins and core)

    Args:
        options: The options to validate
    """
    # Step 1 (plugin self-validation) is not performed here.
    # Step 2: Plugin context validation
    self._validate_plugin_contexts(options)
    # Step 3: Cross-cutting validation
    self._validate_cross_cutting_concerns(options)
def _validate_plugin_contexts(self, options: OcrOptions) -> None:
    """Validate plugin options that require external context.

    Args:
        options: The options to validate.
    """
    # For now, we'll run the plugin validation directly since the models
    # are still being integrated. This ensures the validation warnings
    # and checks still work as expected.

    # Run Tesseract validation
    self._validate_tesseract_options(options)
    # Run Optimize validation
    self._validate_optimize_options(options)
def _validate_tesseract_options(self, options: OcrOptions) -> None:
"""Validate Tesseract options."""
# Check pagesegmode warning
if options.tesseract.pagesegmode in (0, 2):
log.warning(
"The tesseract-pagesegmode you selected will disable OCR. "
"This may cause processing to fail."
)
# Check downsample consistency
if (
options.tesseract.downsample_above != 32767
and not options.tesseract.downsample_large_images
):
log.warning(
"The --tesseract-downsample-above argument will have no effect unless "
"--tesseract-downsample-large-images is also given."
)
# Note: blocked languages (equ, osd) are checked earlier in
# check_options_languages() to ensure the check runs before
# the missing language check.
def _validate_optimize_options(self, options: OcrOptions) -> None:
"""Validate optimization options."""
# Check optimization consistency
if options.optimize == 0 and any(
[
options.png_quality and options.png_quality > 0,
options.jpeg_quality and options.jpeg_quality > 0,
]
):
log.warning(
"The arguments --png-quality and --jpeg-quality "
"will be ignored because --optimize=0."
)
def _validate_cross_cutting_concerns(self, options: OcrOptions) -> None:
"""Validate cross-cutting concerns that span multiple plugins."""
from ocrmypdf._options import ProcessingMode
# Handle deprecated pdf_renderer values
self._handle_deprecated_pdf_renderer(options)
# Note: Mutual exclusivity of force_ocr/skip_text/redo_ocr is now enforced
# by the ProcessingMode enum - only one mode can be active at a time.
# Validate redo mode compatibility
if options.mode == ProcessingMode.redo and (
options.deskew or options.clean_final or options.remove_background
):
raise ValueError(
"--redo-ocr (or --mode redo) is not currently compatible with "
"--deskew, --clean-final, and --remove-background"
)
# Validate output type compatibility
if options.output_type == 'none' and str(options.output_file) not in (
os.devnull,
'-',
):
raise ValueError(
"Since you specified `--output-type none`, the output file "
f"{options.output_file} cannot be produced. Set the output file to "
"`-` to suppress this message."
)
# Validate PDF/A image compression compatibility
if (
options.ghostscript.pdfa_image_compression
and options.ghostscript.pdfa_image_compression != 'auto'
and not options.output_type.startswith('pdfa')
):
log.warning(
"--pdfa-image-compression argument only applies when "
"--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
)
def _handle_deprecated_pdf_renderer(self, options: OcrOptions) -> None:
"""Handle deprecated pdf_renderer values by redirecting to fpdf2."""
if options.pdf_renderer in ('hocr', 'hocrdebug'):
log.info(
"The '%s' PDF renderer has been removed. Using 'fpdf2' instead, "
"which provides full international language support, proper RTL "
"rendering, and improved text positioning.",
options.pdf_renderer,
)
# Modify the options object to use fpdf2
object.__setattr__(options, 'pdf_renderer', 'fpdf2')
================================================
FILE: src/ocrmypdf/_version.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
# Version string of the package.
__version__ = "17.3.0"
================================================
FILE: src/ocrmypdf/api.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Python API for OCRmyPDF.
This module provides the main Python API for OCRmyPDF, allowing you to perform
OCR operations programmatically without using the command line interface.
Main Functions:
ocr(): The primary function for OCR processing. Takes an input PDF or image
file and produces an OCR'd PDF with searchable text.
configure_logging(): Set up logging to match the command line interface
behavior, with support for progress bars and colored output.
Experimental Functions:
_pdf_to_hocr(): Extract text from PDF pages and save as hOCR files for
manual editing before final PDF generation.
_hocr_to_ocr_pdf(): Convert hOCR files back to a searchable PDF after
manual text corrections.
The API maintains thread safety through internal locking since OCRmyPDF uses
global state for plugins. Only one OCR operation can run per Python process
at a time. For parallel processing, use multiple Python processes.
Example:
import ocrmypdf
# Configure logging (optional)
ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)
# Perform OCR
ocrmypdf.ocr('input.pdf', 'output.pdf', language='eng')
For detailed parameter documentation, see the ocr() function docstring and
the equivalent command line parameters in the OCRmyPDF documentation.
"""
from __future__ import annotations
import logging
import os
import sys
import threading
from collections.abc import Iterable, Sequence
from enum import IntEnum
from io import IOBase
from pathlib import Path
from typing import BinaryIO, overload
from warnings import warn
from ocrmypdf._logging import PageNumberFilter
from ocrmypdf._options import OcrOptions
from ocrmypdf._pipelines.hocr_to_ocr_pdf import run_hocr_to_ocr_pdf_pipeline
from ocrmypdf._pipelines.ocr import run_pipeline, run_pipeline_cli
from ocrmypdf._pipelines.pdf_to_hocr import run_hocr_pipeline
from ocrmypdf._plugin_manager import OcrmypdfPluginManager, get_plugin_manager
from ocrmypdf._validation import check_options
from ocrmypdf.cli import ArgumentParser, get_parser
from ocrmypdf.exceptions import ExitCode
# A filesystem path in any of the spellings this API accepts.
StrPath = Path | str | bytes
# Either a binary stream or a filesystem path.
PathOrIO = BinaryIO | StrPath
# Installing plugins affects the global state of the Python interpreter,
# so we need to use a lock to prevent multiple threads from installing
# plugins at the same time.
_api_lock = threading.Lock()
def setup_plugin_infrastructure(
    plugins: Sequence[Path | str] | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
) -> OcrmypdfPluginManager:
    """Return a plugin manager that is initialized and has its options registered.

    The steps are:
        1. Use the supplied plugin manager, or create one from ``plugins``
        2. Call the plugin initialization hook
        3. Collect the option models plugins register and publish them

    Args:
        plugins: Plugin paths/names to load. A single ``str`` or ``Path`` is
            treated as a one-element list.
        plugin_manager: An existing plugin manager to use instead of creating one.

    Returns:
        The plugin manager, with a fresh option registry stored on it as
        ``_option_registry``.

    Raises:
        ValueError: If both plugins and plugin_manager are provided
    """
    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    # Normalize to a list; the falsy check comes first so '' becomes [].
    if not plugins:
        plugin_list = []
    elif isinstance(plugins, (str, Path)):
        plugin_list = [plugins]
    else:
        plugin_list = list(plugins)

    manager = plugin_manager or get_plugin_manager(plugin_list)

    # The initialization hook receives the underlying pluggy manager.
    manager.initialize(plugin_manager=manager.pluggy)

    from ocrmypdf._plugin_registry import PluginOptionRegistry

    registry = PluginOptionRegistry()
    models_by_namespace: dict[str, type] = {}
    for contribution in manager.register_options():
        if not contribution:
            # Hook implementations may return None / nothing to register.
            continue
        for namespace, model_class in contribution.items():
            registry.register_option_model(namespace, model_class)
            models_by_namespace[namespace] = model_class

    # Make the plugin models reachable as nested attributes of OcrOptions.
    OcrOptions.register_plugin_models(models_by_namespace)

    # Keep the registry on the manager so later stages can find it.
    manager._option_registry = registry
    return manager
class Verbosity(IntEnum):
    """Levels of logging detail accepted by configure_logging."""

    # pylint: disable=invalid-name

    #: Suppress most messages
    quiet = -1
    #: Default level of logging
    default = 0
    #: Output ocrmypdf debug messages
    debug = 1
    #: More detailed debugging from ocrmypdf and dependent modules
    debug_all = 2
def configure_logging(
    verbosity: Verbosity,
    *,
    progress_bar_friendly: bool = True,
    manage_root_logger: bool = False,
    plugin_manager: OcrmypdfPluginManager | None = None,
) -> logging.Logger:
    """Set up logging.

    Before calling :func:`ocrmypdf.ocr()`, you can use this function to
    configure logging if you want ocrmypdf's output to look like the ocrmypdf
    command line interface. It will register log handlers, log filters, and
    formatters, configure color logging to standard error, and adjust the log
    levels of third party libraries. Details of this are fine-tuned and subject
    to change. The ``verbosity`` argument is equivalent to the argument
    ``--verbose`` and applies those settings. If you have a wrapper
    script for ocrmypdf and you want it to be very similar to ocrmypdf, use this
    function; if you are using ocrmypdf as part of an application that manages
    its own logging, you probably do not want this function.

    If this function is not called, ocrmypdf will not configure logging, and it
    is up to the caller of ``ocrmypdf.ocr()`` to set up logging as it wishes using
    the Python standard library's logging module. If this function is called,
    the caller may of course make further adjustments to logging.

    Regardless of whether this function is called, ocrmypdf will perform all of
    its logging under the ``"ocrmypdf"`` logging namespace. In addition,
    ocrmypdf imports pdfminer, which logs under ``"pdfminer"``. A library user
    may wish to configure both; note that pdfminer is extremely chatty at the
    log level ``logging.INFO``.

    This function does not set up the ``debug.log`` log file that the command
    line interface does at certain verbosity levels. Applications should configure
    their own debug logging.

    Each call attaches a console handler to the logger; when no plugin-provided
    console is used, a new ``StreamHandler`` is created per call.

    Args:
        verbosity: Verbosity level.
        progress_bar_friendly: If True (the default), install a custom log handler
            that is compatible with progress bars and colored output. The custom
            handler is only requested when ``plugin_manager`` is also given.
        manage_root_logger: Configure the process's root logger.
        plugin_manager: The plugin manager, used for obtaining the custom log handler.

    Returns:
        The toplevel logger for ocrmypdf (or the root logger, if we are managing it).
    """
    # An empty name selects the root logger.
    prefix = '' if manage_root_logger else 'ocrmypdf'
    log = logging.getLogger(prefix)
    # The logger passes everything through; filtering happens on the handler.
    log.setLevel(logging.DEBUG)

    console = None
    if plugin_manager and progress_bar_friendly:
        console = plugin_manager.get_logging_console()
    if not console:
        # No plugin-provided console: fall back to plain stderr output.
        console = logging.StreamHandler(stream=sys.stderr)

    if verbosity < 0:
        console.setLevel(logging.ERROR)
    elif verbosity >= 1:
        console.setLevel(logging.DEBUG)
    else:
        console.setLevel(logging.INFO)

    # Both format strings below reference %(pageno)s, so the page number
    # filter is installed on the handler before the formatter is set.
    console.addFilter(PageNumberFilter())

    if verbosity >= 2:
        fmt = '%(levelname)7s %(name)s -%(pageno)s %(message)s'
    else:
        fmt = '%(pageno)s%(message)s'
    console.setFormatter(logging.Formatter(fmt=fmt))
    log.addHandler(console)

    # Below the most detailed level, quieten chatty third-party libraries.
    if verbosity <= 1:
        logging.getLogger('pdfminer').setLevel(logging.ERROR)
        logging.getLogger('PIL').setLevel(logging.INFO)
        logging.getLogger('fontTools').setLevel(logging.ERROR)

    if manage_root_logger:
        # Route warnings.warn() messages through logging as well.
        logging.captureWarnings(True)

    return log
def _check_no_conflicting_ocr_params(
locals_dict: dict,
kwargs: dict,
excluded: set[str] | None = None,
) -> None:
"""Check that no individual OCR parameters conflict with OcrOptions.
When a user passes an OcrOptions object, they should not also pass
individual OCR parameters (except plugins/plugin_manager which are
handled separately).
Args:
locals_dict: The locals() dict from the calling function.
kwargs: The **kwargs dict from the calling function.
excluded: Parameter names to exclude from conflict checking.
Raises:
ValueError: If conflicting parameters are found.
"""
if excluded is None:
excluded = set()
# Parameters that are allowed alongside OcrOptions
allowed_with_options = {
'input_file_or_options',
'options', # The OcrOptions object itself after assignment
'plugins',
'plugin_manager',
'kwargs',
} | excluded
# Check all locals that are OCR parameters (not None and not allowed)
conflicts = [
name
for name, value in locals_dict.items()
if value is not None and name not in allowed_with_options
]
# Check kwargs
conflicts.extend(kwargs.keys())
if conflicts:
raise ValueError(
f"When passing OcrOptions as the first argument, do not pass "
f"additional OCR parameters. Conflicting parameters: "
f"{', '.join(sorted(conflicts))}. "
f"Set these values in OcrOptions instead."
)
def _remap_language_to_languages(options_kwargs: dict) -> None:
"""Map the public API 'language' parameter to OcrOptions 'languages' field.
The public API uses 'language' (matching CLI --language) but OcrOptions
uses 'languages' (plural). This also coerces a bare string to a list
and splits '+'-separated language codes (e.g. 'eng+deu' -> ['eng', 'deu'])
to match the CLI behavior.
"""
if 'language' in options_kwargs and 'languages' not in options_kwargs:
lang = options_kwargs.pop('language')
if lang is None:
return
if isinstance(lang, str):
lang = lang.split('+')
else:
# Flatten any '+'-separated entries in the list
expanded: list[str] = []
for item in lang:
if isinstance(item, str) and '+' in item:
expanded.extend(item.split('+'))
else:
expanded.append(item)
lang = expanded
options_kwargs['languages'] = lang
elif 'language' in options_kwargs:
del options_kwargs['language']
def create_options(
    *, input_file: PathOrIO, output_file: PathOrIO, parser: ArgumentParser, **kwargs
) -> OcrOptions:
    """Construct an options object from the input/output files and keyword arguments.

    Keyword arguments whose value is None are dropped so that OcrOptions
    applies its own defaults. Keywords that are neither OcrOptions fields,
    legacy mode flags, nor known extras are not passed to the constructor;
    they are stored in ``options.extra_attrs`` instead.

    Args:
        input_file: Input file path or file object.
        output_file: Output file path or file object.
        parser: ArgumentParser object (kept for compatibility,
            may be used for plugin validation). Not used by this function.
        **kwargs: Keyword arguments. ``language`` is mapped to ``languages``.
            A stream passed as ``sidecar`` is forwarded unchanged.

    Returns:
        OcrOptions: An options object containing the parsed arguments.

    Raises:
        TypeError: If OcrOptions cannot be constructed from the arguments,
            for example because a keyword argument has an unsupported type.
    """
    # Work on a copy so the caller's dict is never modified.
    options_kwargs = kwargs.copy()

    # Map API parameter 'language' to OcrOptions field 'languages'
    _remap_language_to_languages(options_kwargs)

    options_kwargs['input_file'] = input_file
    options_kwargs['output_file'] = output_file

    # Remove None values to let OcrOptions use its defaults
    options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}

    # Keys the OcrOptions constructor receives: its declared fields, the
    # legacy mode flags (handled by the OcrOptions model validator), and the
    # known extras that should be preserved.
    constructor_keys = (
        set(OcrOptions.model_fields.keys())
        | {'force_ocr', 'skip_text', 'redo_ocr'}
        | {'progress_bar', 'plugins'}
    )
    extra_attrs = {
        k: v for k, v in options_kwargs.items() if k not in constructor_keys
    }
    options_kwargs = {
        k: v for k, v in options_kwargs.items() if k in constructor_keys
    }

    try:
        options = OcrOptions(**options_kwargs)
        if extra_attrs:
            options.extra_attrs.update(extra_attrs)
        return options
    except Exception as e:
        # Report any construction failure as the TypeError callers expect,
        # keeping the original exception as the cause.
        raise TypeError(f"Failed to create OcrOptions: {e}") from e
# Typing overload for the new-style call: a pre-built OcrOptions object is
# passed positionally, and only plugins/plugin_manager may accompany it.
@overload
def ocr(
    options: OcrOptions,
    /,
    *,
    plugins: Iterable[Path | str] | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
) -> ExitCode: ...
# Typing overload for the old-style call: input and output files followed by
# individual keyword settings. The keyword list mirrors the implementation.
@overload
def ocr(
    input_file_or_options: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] | None = None,
    image_dpi: int | None = None,
    output_type: str | None = None,
    sidecar: PathOrIO | None = None,
    jobs: int | None = None,
    use_threads: bool | None = None,
    title: str | None = None,
    author: str | None = None,
    subject: str | None = None,
    keywords: str | None = None,
    rotate_pages: bool | None = None,
    remove_background: bool | None = None,
    deskew: bool | None = None,
    clean: bool | None = None,
    clean_final: bool | None = None,
    unpaper_args: str | None = None,
    oversample: int | None = None,
    remove_vectors: bool | None = None,
    mode: str | None = None,
    force_ocr: bool | None = None,
    skip_text: bool | None = None,
    redo_ocr: bool | None = None,
    skip_big: float | None = None,
    optimize: int | None = None,
    jpg_quality: int | None = None,
    png_quality: int | None = None,
    jbig2_lossy: bool | None = None,
    jbig2_page_group_size: int | None = None,
    jbig2_threshold: float | None = None,
    pages: str | None = None,
    max_image_mpixels: float | None = None,
    tesseract_config: Iterable[str] | None = None,
    tesseract_pagesegmode: int | None = None,
    tesseract_oem: int | None = None,
    tesseract_thresholding: int | None = None,
    pdf_renderer: str | None = None,
    rasterizer: str | None = None,
    tesseract_timeout: float | None = None,
    tesseract_non_ocr_timeout: float | None = None,
    tesseract_downsample_above: int | None = None,
    tesseract_downsample_large_images: bool | None = None,
    rotate_pages_threshold: float | None = None,
    pdfa_image_compression: str | None = None,
    color_conversion_strategy: str | None = None,
    user_words: os.PathLike | None = None,
    user_patterns: os.PathLike | None = None,
    fast_web_view: float | None = None,
    continue_on_soft_render_error: bool | None = None,
    invalidate_digital_signatures: bool | None = None,
    tagged_pdf_mode: str | None = None,
    no_overwrite: bool | None = None,
    plugins: Iterable[Path | str] | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
    keep_temporary_files: bool | None = None,
    progress_bar: bool | None = None,
    **kwargs,
) -> ExitCode: ...
def ocr(  # noqa: D417
    input_file_or_options: PathOrIO | OcrOptions,
    output_file: PathOrIO | None = None,
    *,
    language: Iterable[str] | None = None,
    image_dpi: int | None = None,
    output_type: str | None = None,
    sidecar: PathOrIO | None = None,
    jobs: int | None = None,
    use_threads: bool | None = None,
    title: str | None = None,
    author: str | None = None,
    subject: str | None = None,
    keywords: str | None = None,
    rotate_pages: bool | None = None,
    remove_background: bool | None = None,
    deskew: bool | None = None,
    clean: bool | None = None,
    clean_final: bool | None = None,
    unpaper_args: str | None = None,
    oversample: int | None = None,
    remove_vectors: bool | None = None,
    mode: str | None = None,
    force_ocr: bool | None = None,  # Legacy, use mode='force' instead
    skip_text: bool | None = None,  # Legacy, use mode='skip' instead
    redo_ocr: bool | None = None,  # Legacy, use mode='redo' instead
    skip_big: float | None = None,
    optimize: int | None = None,
    jpg_quality: int | None = None,
    png_quality: int | None = None,
    jbig2_lossy: bool | None = None,  # Deprecated, ignored
    jbig2_page_group_size: int | None = None,  # Deprecated, ignored
    jbig2_threshold: float | None = None,
    pages: str | None = None,
    max_image_mpixels: float | None = None,
    tesseract_config: Iterable[str] | None = None,
    tesseract_pagesegmode: int | None = None,
    tesseract_oem: int | None = None,
    tesseract_thresholding: int | None = None,
    pdf_renderer: str | None = None,
    rasterizer: str | None = None,
    tesseract_timeout: float | None = None,
    tesseract_non_ocr_timeout: float | None = None,
    tesseract_downsample_above: int | None = None,
    tesseract_downsample_large_images: bool | None = None,
    rotate_pages_threshold: float | None = None,
    pdfa_image_compression: str | None = None,
    color_conversion_strategy: str | None = None,
    user_words: os.PathLike | None = None,
    user_patterns: os.PathLike | None = None,
    fast_web_view: float | None = None,
    continue_on_soft_render_error: bool | None = None,
    invalidate_digital_signatures: bool | None = None,
    tagged_pdf_mode: str | None = None,
    no_overwrite: bool | None = None,
    plugins: Iterable[Path | str] | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
    keep_temporary_files: bool | None = None,
    progress_bar: bool | None = None,
    **kwargs,
) -> ExitCode:
    """Run OCRmyPDF on one PDF or image.

    This function supports two calling conventions:

    **New style (recommended):**

        >>> from ocrmypdf import ocr
        >>> from ocrmypdf._options import OcrOptions
        >>> options = OcrOptions(
        ...     input_file="input.pdf",
        ...     output_file="output.pdf",
        ...     languages=["eng"],
        ... )
        >>> ocr(options)

    **Old style:**

        >>> ocr("input.pdf", "output.pdf", language=["eng"])

    For most arguments, see documentation for the equivalent command line parameter.

    This API takes a threading lock, because OCRmyPDF uses global state in particular
    for the plugin system. The jobs parameter will be used to create a pool of
    worker threads or processes at different times, subject to change. A Python
    process can only run one OCRmyPDF task at a time.

    To run parallel instances of OCRmyPDF, use separate Python processes to scale
    horizontally. Generally speaking you should set jobs=sqrt(cpu_count) and run
    sqrt(cpu_count) processes as a starting point. If you have files with a high page
    count, run fewer processes and more jobs per process. If you have a lot of short
    files, run more processes and fewer jobs per process.

    A few specific arguments are discussed here:

    Args:
        input_file_or_options: Either an OcrOptions object containing all settings,
            or a path/stream for the input file (old-style API).
        output_file: Output file path or stream. Required when using old-style API
            with input_file as first argument. Must be None when passing OcrOptions.
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        plugins: List of plugin paths to load. Can be passed alongside OcrOptions.
        plugin_manager: Pre-configured plugin manager. Can be passed alongside
            OcrOptions.

    For input_file (old-style API): If a :class:`pathlib.Path`, ``str`` or
    ``bytes``, this is interpreted as file system path to the input file.
    If the object appears to be a readable stream (with methods such as
    ``.read()`` and ``.seek()``), the object will be read in its entirety
    and saved to a temporary file. If ``input_file`` is ``"-"``, standard
    input will be read.

    For output_file (old-style API): If a :class:`pathlib.Path`, ``str`` or
    ``bytes``, this is interpreted as file system path to the output file.
    If the object appears to be a writable stream (with methods such as
    ``.write()`` and ``.seek()``), the output will be written to this
    stream. If ``output_file`` is ``"-"``, the output will be written to
    ``sys.stdout`` (provided that standard output does not seem to be a
    terminal device). When a stream is used as output, whether via a
    writable object or ``"-"``, some final validation steps are not
    performed (we do not read back the stream after it is written).

    Raises:
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.
        ValueError: If OcrOptions is passed along with other OCR parameters, or if
            both plugins and plugin_manager are provided.
        TypeError: If output_file is missing when using the old-style API.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Detect calling convention: OcrOptions object vs individual parameters
    if isinstance(input_file_or_options, OcrOptions):
        # New-style API: OcrOptions passed directly
        options = input_file_or_options

        # Check for conflicting parameters
        # (all should be None except plugins/plugin_manager).
        # locals() is inspected here, so no other local may be bound before
        # this call.
        _check_no_conflicting_ocr_params(locals(), kwargs)

        # plugins and plugin_manager can still be passed alongside OcrOptions
        if plugins and plugin_manager:
            raise ValueError("plugins= and plugin_manager are mutually exclusive")

        # Use plugins from OcrOptions if not explicitly passed
        if plugins is None:
            plugins = options.plugins or []
        if isinstance(plugins, str | Path):
            plugins = [plugins]
        else:
            plugins = list(plugins) if plugins else []

        # Run the pipeline with the OcrOptions. Plugin setup and the pipeline
        # itself both happen while the API lock is held.
        with _api_lock:
            plugin_manager = setup_plugin_infrastructure(
                plugins=plugins, plugin_manager=plugin_manager
            )
            parser = get_parser()
            plugin_manager.add_options(parser=parser)
            check_options(options, plugin_manager)
            return run_pipeline(options=options, plugin_manager=plugin_manager)
    else:
        # Old-style API: positional arguments
        input_file = input_file_or_options
        if output_file is None:
            raise TypeError(
                "ocr() missing required argument: 'output_file'. "
                "Either pass output_file as the second argument, or pass "
                "an OcrOptions object as the first argument."
            )

        if plugins and plugin_manager:
            raise ValueError("plugins= and plugin_manager are mutually exclusive")
        if not plugins:
            plugins = []
        elif isinstance(plugins, str | Path):
            plugins = [plugins]
        else:
            plugins = list(plugins)

        # No new variable names should be assigned until these two steps are run:
        # the snapshot below is taken from locals(), so any extra local bound
        # before it would be forwarded as if it were an OCR parameter.
        create_options_kwargs = {
            k: v
            for k, v in locals().items()
            if k
            not in {
                'input_file_or_options',
                'input_file',
                'output_file',
                'kwargs',
                'plugin_manager',
            }
        }
        create_options_kwargs.update(kwargs)

        with _api_lock:
            # Set up plugin infrastructure with proper initialization
            plugin_manager = setup_plugin_infrastructure(
                plugins=plugins, plugin_manager=plugin_manager
            )

            # Get parser and let plugins add their options
            parser = get_parser()
            plugin_manager.add_options(parser=parser)

            if 'verbose' in kwargs:
                warn(
                    "ocrmypdf.ocr(verbose=) is ignored. "
                    "Use ocrmypdf.configure_logging()."
                )

            # Warn about deprecated jbig2 options and remove from kwargs
            if jbig2_lossy:
                warn(
                    "jbig2_lossy is deprecated and will be ignored. "
                    "Lossy JBIG2 has been removed due to character substitution risks."
                )
            create_options_kwargs.pop('jbig2_lossy', None)
            if jbig2_page_group_size:
                warn("jbig2_page_group_size is deprecated and will be ignored.")
            create_options_kwargs.pop('jbig2_page_group_size', None)

            options = create_options(
                input_file=input_file,
                output_file=output_file,
                parser=parser,
                **create_options_kwargs,
            )
            check_options(options, plugin_manager)
            return run_pipeline(options=options, plugin_manager=plugin_manager)
def _pdf_to_hocr( # noqa: D417
input_pdf: Path,
output_folder: Path,
*,
language: Iterable[str] | None = None,
image_dpi: int | None = None,
jobs: int | None = None,
use_threads: bool | None = None,
title: str | None = None,
author: str | None = None,
subject: str | None = None,
keywords: str | None = None,
rotate_pages: bool | None = None,
remove_background: bool | None = None,
deskew: bool | None = None,
clean: bool | None = None,
clean_final: bool | None = None,
unpaper_args: str | None = None,
oversample: int | None = None,
remove_vectors: bool | None = None,
mode: str | None = None,
force_ocr: bool | None = None, # Legacy, use mode='force' instead
skip_text: bool | None = None, # Legacy, use mode='skip' instead
redo_ocr: bool | None = None, # Legacy, use mode='redo' instead
skip_big: float | None = None,
pages: str | None = None,
max_image_mpixels: float | None = None,
tesseract_config: Iterable[str] | None = None,
tesseract_pagesegmode: int | None = None,
tesseract_oem: int | None = None,
tesseract_thresholding: int | None = None,
tesseract_timeout: float | None = None,
tesseract_non_ocr_timeout: float | None = None,
tesseract_downsample_above: int | None = None,
tesseract_downsample_large_images: bool | None = None,
rotate_pages_threshold: float | None = None,
rasterizer: str | None = None,
user_words: os.PathLike | None = None,
user_patterns: os.PathLike | None = None,
continue_on_soft_render_error: bool | None = None,
invalidate_digital_signatures: bool | None = None,
plugin_manager=None,
plugins: Sequence[Path | str] | None = None,
keep_temporary_files: bool | None = None,
**kwargs,
):
"""Partially run OCRmyPDF and produces an output folder containing hOCR files.
Given a PDF file, this function will run OCRmyPDF up to the point where
the PDF is rasterized to images, OCRed, and the hOCR files are produced,
all of which are saved to the output folder. This is useful for applications
that want to provide an interface for users to edit the text before
rendering the final PDF.
Use :func:`hocr_to_ocr_pdf` to produce the final PDF.
For arguments not explicitly documented here, see documentation for the
equivalent command line parameter.
This API is **experimental** and subject to change.
Args:
input_pdf: Input PDF file path.
output_folder: Output folder path.
**kwargs: Keyword arguments.
"""
if plugins and plugin_manager:
raise ValueError("plugins= and plugin_manager are mutually exclusive")
if not plugins:
plugins = []
elif isinstance(plugins, str | Path):
plugins = [plugins]
else:
plugins = list(plugins)
# Prepare kwargs for direct OcrOptions construction
options_kwargs = kwargs.copy()
# Set input file and handle special output_folder case
options_kwargs['input_file'] = input_pdf
options_kwargs['output_file'] = '/dev/null' # Placeholder for hOCR pipeline
# Add all the function parameters
for param_name, param_value in locals().items():
if (
param_name
not in {'input_pdf', 'output_folder', 'kwargs', 'plugin_manager', 'plugins'}
and param_value is not None
):
options_kwargs[param_name] = param_value
# Map API parameter 'language' to OcrOptions field 'languages'
_remap_language_to_languages(options_kwargs)
# Handle plugins
if plugins:
options_kwargs['plugins'] = plugins
# Remove None values to let OcrOptions use its defaults
options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}
# Add output_folder to options_kwargs since it's now a proper field
options_kwargs['output_folder'] = output_folder
# Remove any kwargs that aren't OcrOptions fields and store in extra_attrs
extra_attrs = {}
ocr_fields = set(OcrOptions.model_fields.keys())
# Legacy mode flags are handled by OcrOptions model validator
legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}
known_extra = {'progress_bar', 'plugins'}
for key in list(options_kwargs.keys()):
if key in ocr_fields or key in legacy_mode_flags or key in known_extra:
continue
extra_attrs[key] = options_kwargs.pop(key)
with _api_lock:
# Set up plugin infrastructure with proper initialization
plugin_manager = setup_plugin_infrastructure(
plugins=plugins, plugin_manager=plugin_manager
)
plugin_manager.add_options(parser=get_parser())
# Create OcrOptions directly
try:
options = OcrOptions(**options_kwargs)
# Add any extra attributes
if extra_attrs:
options.extra_attrs.update(extra_attrs)
except Exception as e:
raise TypeError(
f"Failed to create OcrOptions for hOCR pipeline: {e}"
) from e
return run_hocr_pipeline(options=options, plugin_manager=plugin_manager)
def _hocr_to_ocr_pdf(  # noqa: D417
    work_folder: Path,
    output_file: Path,
    *,
    jobs: int | None = None,
    use_threads: bool | None = None,
    optimize: int | None = None,
    jpg_quality: int | None = None,
    png_quality: int | None = None,
    jbig2_lossy: bool | None = None,  # Deprecated, ignored
    jbig2_page_group_size: int | None = None,  # Deprecated, ignored
    jbig2_threshold: float | None = None,
    pdfa_image_compression: str | None = None,
    color_conversion_strategy: str | None = None,
    fast_web_view: float | None = None,
    plugin_manager=None,
    plugins: Sequence[Path | str] | None = None,
    **kwargs,
):
    """Run OCRmyPDF on a work folder and produce an output PDF.

    After running :func:`pdf_to_hocr`, this function will run OCRmyPDF on the work
    folder to produce an output PDF. This function consolidates any changes made
    to the hOCR files in the work folder and produces a final PDF.

    For arguments not explicitly documented here, see documentation for the
    equivalent command line parameter.

    This API is **experimental** and subject to change.

    Args:
        work_folder: Work folder path, as generated by :func:`pdf_to_hocr`.
        output_file: Output PDF file path.
        **kwargs: Keyword arguments.

    Raises:
        ValueError: If both ``plugins`` and ``plugin_manager`` are supplied.
        TypeError: If ``OcrOptions`` cannot be constructed from the arguments.
    """
    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    # Normalize plugins to a list; a lone str/Path is treated as one plugin.
    if not plugins:
        plugins = []
    elif isinstance(plugins, str | Path):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    # Prepare kwargs for direct OcrOptions construction
    options_kwargs = kwargs.copy()

    # Set output file and handle special work_folder case
    options_kwargs['input_file'] = '/dev/null'  # Placeholder for hOCR to PDF pipeline
    options_kwargs['output_file'] = output_file

    # Enumerate the named parameters explicitly instead of walking locals():
    # locals() also contains this function's own working variables (notably
    # options_kwargs itself, which would be stored inside itself and then leak
    # into extra_attrs), and iterating it is fragile when a debugger syncs
    # frame locals mid-loop. The deprecated jbig2_lossy and
    # jbig2_page_group_size parameters are deliberately not forwarded.
    named_parameters = {
        'jobs': jobs,
        'use_threads': use_threads,
        'optimize': optimize,
        'jpg_quality': jpg_quality,
        'png_quality': png_quality,
        'jbig2_threshold': jbig2_threshold,
        'pdfa_image_compression': pdfa_image_compression,
        'color_conversion_strategy': color_conversion_strategy,
        'fast_web_view': fast_web_view,
    }
    for param_name, param_value in named_parameters.items():
        if param_value is not None:
            options_kwargs[param_name] = param_value

    # Handle plugins
    if plugins:
        options_kwargs['plugins'] = plugins

    # Remove None values to let OcrOptions use its defaults
    options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}

    # Warn about deprecated jbig2 options; they are never passed on.
    if jbig2_lossy:
        warn(
            "jbig2_lossy is deprecated and will be ignored. "
            "Lossy JBIG2 has been removed due to character substitution risks."
        )
    if jbig2_page_group_size:
        warn("jbig2_page_group_size is deprecated and will be ignored.")

    # Add work_folder to options_kwargs since it's now a proper field
    options_kwargs['work_folder'] = work_folder

    # Remove any kwargs that aren't OcrOptions fields and store in extra_attrs
    extra_attrs = {}
    ocr_fields = set(OcrOptions.model_fields.keys())
    # Legacy mode flags are handled by OcrOptions model validator
    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}
    known_extra = {'progress_bar', 'plugins'}
    for key in list(options_kwargs.keys()):
        if key in ocr_fields or key in legacy_mode_flags or key in known_extra:
            continue
        extra_attrs[key] = options_kwargs.pop(key)

    with _api_lock:
        # Set up plugin infrastructure with proper initialization
        plugin_manager = setup_plugin_infrastructure(
            plugins=plugins, plugin_manager=plugin_manager
        )
        plugin_manager.add_options(parser=get_parser())

        # Create OcrOptions directly
        try:
            options = OcrOptions(**options_kwargs)
            # Add any extra attributes
            if extra_attrs:
                options.extra_attrs.update(extra_attrs)
        except Exception as e:
            raise TypeError(
                f"Failed to create OcrOptions for hOCR to PDF pipeline: {e}"
            ) from e

        return run_hocr_to_ocr_pdf_pipeline(
            options=options, plugin_manager=plugin_manager
        )
# Names exported when a caller does a star-import of this module.
__all__ = [
    'PageNumberFilter',
    'Verbosity',
    'check_options',
    'configure_logging',
    'create_options',
    'get_parser',
    'get_plugin_manager',
    'ocr',
    'run_pipeline',
    'run_pipeline_cli',
    'setup_plugin_infrastructure',
]
================================================
FILE: src/ocrmypdf/builtin_plugins/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Plugins in this package are automatically loaded by ocrmypdf."""
from __future__ import annotations
================================================
FILE: src/ocrmypdf/builtin_plugins/concurrency.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""OCRmyPDF's multiprocessing/multithreading abstraction layer."""
from __future__ import annotations
import logging
import logging.handlers
import multiprocessing
import multiprocessing.queues
import os
import queue
import signal
import sys
import threading
from collections.abc import Callable, Iterable
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from contextlib import suppress
from typing import TYPE_CHECKING
from rich.console import Console as RichConsole
from ocrmypdf import Executor, hookimpl
from ocrmypdf._logging import RichLoggingHandler
from ocrmypdf._progressbar import RichProgressBar
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import remove_all_log_handlers
if TYPE_CHECKING:
from typing import TypeAlias
Queue: TypeAlias = multiprocessing.queues.Queue | queue.Queue
UserInit: TypeAlias = Callable[[], None]
WorkerInit: TypeAlias = Callable[[Queue, UserInit, int], None]
FuturesExecutorClass = type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
def log_listener(q: Queue):
    """Drain log records from ``q`` and replay them through ``logging``.

    Runs until a ``None`` sentinel is received. Each record is dispatched to
    the logger named by ``record.name``. Any exception raised while fetching
    or handling a record is reported on stderr and the loop keeps going.

    This is meant to run in a thread of the main process, so that only one
    process writes to the real log destination.

    See:
    https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    """
    draining = True
    while draining:
        try:
            record = q.get()
            if record is None:
                # Sentinel: the producer side is finished.
                draining = False
            else:
                logging.getLogger(record.name).handle(record)
        except Exception:  # pylint: disable=broad-except
            import traceback  # pylint: disable=import-outside-toplevel

            print("Logging problem", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
def process_sigbus(*args):
    """Signal handler for SIGBUS in a worker: always raises InputFileError.

    The signal-handler arguments are accepted and ignored.
    """
    message = "A worker process lost access to an input file"
    raise InputFileError(message)
def process_init(q: Queue, user_init: UserInit, loglevel) -> None:
    """Prepare a freshly started process-pool worker.

    Sets signal dispositions, replaces the root logger's handlers with a single
    ``QueueHandler`` that forwards records to ``q`` at ``loglevel``, and finally
    calls ``user_init``.
    """
    # SIGINT is ignored here; the parent process is responsible for stopping us.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    # Route SIGBUS to process_sigbus so the parent can abort somewhat
    # gracefully. Platforms without signal.SIGBUS (Windows, Cygwin) raise
    # AttributeError, which is suppressed.
    with suppress(AttributeError):
        signal.signal(signal.SIGBUS, process_sigbus)

    # Drop handlers inherited from the parent, then forward everything to it.
    root_logger = logging.getLogger()
    remove_all_log_handlers(root_logger)
    root_logger.setLevel(loglevel)
    forwarder = logging.handlers.QueueHandler(q)
    root_logger.addHandler(forwarder)

    user_init()
def thread_init(q: Queue, user_init: UserInit, loglevel) -> None:
    """Prepare a thread-pool worker.

    ``q`` and ``loglevel`` exist only so the signature matches
    :func:`process_init`; they are discarded. SIGBUS is blocked in this thread
    so that the main thread deals with it, then ``user_init`` is called.
    """
    del q, loglevel  # unused but required arguments

    # Platforms lacking pthread_sigmask/SIGBUS raise AttributeError; ignore it.
    with suppress(AttributeError):
        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGBUS})

    user_init()
def setup_executor(
    use_threads: bool,
) -> tuple[Queue, FuturesExecutorClass, WorkerInit]:
    """Pick the log queue, executor class and worker initializer to use.

    Args:
        use_threads: True to use a thread pool. When False, a process pool is
            used unless ``multiprocessing.synchronize.SemLock`` cannot be
            imported, in which case threads are used instead.

    Returns:
        A ``(log_queue, executor_class, initializer)`` tuple. Note that the
        second element is the executor *class* (``ThreadPoolExecutor`` or
        ``ProcessPoolExecutor``), not an instance.
    """
    if not use_threads:
        # Some execution environments like AWS Lambda and Termux do not support
        # semaphores. Check if semaphore support is available, and if not, fall
        # back to using threads.
        try:
            # pylint: disable=import-outside-toplevel
            from multiprocessing.synchronize import SemLock

            del SemLock
        except ImportError:
            use_threads = True

    if use_threads:
        log_queue = queue.Queue(-1)
        executor_class = ThreadPoolExecutor
        initializer = thread_init
    else:
        log_queue = multiprocessing.Queue(-1)
        executor_class = ProcessPoolExecutor
        initializer = process_init
    return log_queue, executor_class, initializer
class StandardExecutor(Executor):
    """Standard OCRmyPDF concurrent task executor."""

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        progress_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Run ``task`` over ``task_arguments`` in a worker pool.

        Args:
            use_threads: Passed to :func:`setup_executor` to choose between a
                thread pool and a process pool.
            max_workers: Pool size handed to the executor class.
            progress_kwargs: Keyword arguments for ``self.pbar_class``.
            worker_initializer: Callable handed to each worker's initializer
                as its ``user_init`` argument.
            task: Callable submitted once per item as ``task(*args)``.
            task_arguments: Iterable of argument tuples for ``task``.
            task_finished: Called as ``task_finished(result, pbar)`` for each
                future, in completion order.
        """
        log_queue, executor_class, initializer = setup_executor(use_threads)
        # Regardless of whether we use_threads for worker processes, the log_listener
        # must be a thread. Make sure we create the listener after the worker pool,
        # so that it does not get forked into the workers.
        # If use_threads is False, we are currently guilty of creating a thread before
        # forking on Linux, which is not recommended. However, we take a big
        # performance hit in pdfinfo if we can't fork. Long term solution is to
        # replace most of this with an asyncio implementation, and probably to
        # migrate some of pdfinfo into C++ or Rust.
        listener = threading.Thread(target=log_listener, args=(log_queue,))
        listener.start()
        # NOTE(review): the None sentinel is only sent from the finally clause
        # inside the with-body below; if constructing the progress bar or the
        # executor raises, the listener appears to be left waiting — confirm.
        with (
            self.pbar_class(**progress_kwargs) as pbar,
            executor_class(
                max_workers=max_workers,
                initializer=initializer,
                # Workers log at the current level of the root logger.
                initargs=(log_queue, worker_initializer, logging.getLogger("").level),
            ) as executor,
        ):
            futures = [executor.submit(task, *args) for args in task_arguments]
            try:
                for future in as_completed(futures):
                    # future.result() re-raises any exception from the task.
                    result = future.result()
                    task_finished(result, pbar)
            except KeyboardInterrupt:
                # Terminate pool so we exit instantly
                executor.shutdown(wait=False, cancel_futures=True)
                raise
            except Exception:
                if not os.environ.get("PYTEST_CURRENT_TEST", ""):
                    # Normally we shut down without waiting for other child
                    # workers on error, because there is no point in waiting
                    # for them. Their results will be discarded. When
                    # PYTEST_CURRENT_TEST is set we are running under pytest
                    # and skip this early shutdown, so that everything exits
                    # as cleanly as possible and we get good error messages.
                    executor.shutdown(wait=False, cancel_futures=True)
                raise
            finally:
                # Terminate log listener
                log_queue.put_nowait(None)
        # When the above succeeds, wait for the listener thread to exit. (If
        # an exception occurs, we don't try to join, in case it deadlocks.)
        listener.join()
@hookimpl
def get_executor(progressbar_class):
    """Hook: build the default executor, wired to ``progressbar_class``."""
    executor = StandardExecutor(pbar_class=progressbar_class)
    return executor
# Single Rich console writing to stderr, shared by the default progress bar
# and the default logging handler defined below.
RICH_CONSOLE = RichConsole(stderr=True)
@hookimpl
def get_progressbar_class():
    """Hook: provide the default progress bar factory.

    The returned callable forwards its arguments to ``RichProgressBar`` and
    always supplies ``console=RICH_CONSOLE``.
    """

    def partial_RichProgressBar(*args, **kwargs):
        progress_bar = RichProgressBar(*args, **kwargs, console=RICH_CONSOLE)
        return progress_bar

    return partial_RichProgressBar
@hookimpl
def get_logging_console():
    """Hook: provide the default console log handler, bound to RICH_CONSOLE."""
    handler = RichLoggingHandler(console=RICH_CONSOLE)
    return handler
================================================
FILE: src/ocrmypdf/builtin_plugins/default_filters.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""OCRmyPDF automatically installs these filters as plugins."""
from __future__ import annotations
from ocrmypdf import hookimpl
@hookimpl
def filter_pdf_page(page, image_filename, output_pdf):  # pylint: disable=unused-argument
    """Default page filter: hand back ``output_pdf`` unchanged."""
    unchanged_pdf = output_pdf
    return unchanged_pdf
================================================
FILE: src/ocrmypdf/builtin_plugins/ghostscript.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Built-in plugin to implement PDF page rasterization and PDF/A production."""
from __future__ import annotations
import logging
from enum import StrEnum
from pathlib import Path
from typing import Annotated
from packaging.version import Version
from pikepdf import Name, Pdf, Stream
from pydantic import BaseModel, Field
from ocrmypdf import hookimpl
from ocrmypdf._exec import ghostscript
from ocrmypdf._options import ProcessingMode
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.subprocess import check_external_program
# Module-level logger for the Ghostscript plugin.
log = logging.getLogger(__name__)

# Currently all blacklisted versions are lower than 9.55, so none need to
# be added here. If a future version is blacklisted, add it here.
BLACKLISTED_GS_VERSIONS: frozenset[Version] = frozenset()
class ColorConversionStrategy(StrEnum):
    """Ghostscript color conversion strategies."""

    # The member values are offered verbatim as the choices of the
    # --color-conversion-strategy command line argument.
    CMYK = 'CMYK'
    GRAY = 'Gray'
    LEAVE_COLOR_UNCHANGED = 'LeaveColorUnchanged'  # default for the option
    RGB = 'RGB'
    USE_DEVICE_INDEPENDENT_COLOR = 'UseDeviceIndependentColor'
class PdfaImageCompression(StrEnum):
    """PDF/A image compression methods."""

    # The member values are offered verbatim as the choices of the
    # --pdfa-image-compression command line argument.
    AUTO = 'auto'  # default for the option
    JPEG = 'jpeg'
    LOSSLESS = 'lossless'
class GhostscriptOptions(BaseModel):
    """Settings model for the Ghostscript plugin."""

    color_conversion_strategy: Annotated[
        ColorConversionStrategy,
        Field(description="Ghostscript color conversion strategy"),
    ] = ColorConversionStrategy.LEAVE_COLOR_UNCHANGED
    pdfa_image_compression: Annotated[
        PdfaImageCompression, Field(description="PDF/A image compression method")
    ] = PdfaImageCompression.AUTO

    @classmethod
    def add_arguments_to_parser(cls, parser, namespace: str = 'ghostscript'):
        """Register the Ghostscript command line arguments on ``parser``.

        Args:
            parser: The argument parser that receives a new "Ghostscript"
                argument group.
            namespace: Accepted but not used here; argument names carry no
                namespace prefix, for backward compatibility.
        """
        group = parser.add_argument_group(
            "Ghostscript", "Advanced control of Ghostscript"
        )

        strategy_values = [member.value for member in ColorConversionStrategy]
        group.add_argument(
            '--color-conversion-strategy',
            action='store',
            type=str,
            choices=strategy_values,
            default=ColorConversionStrategy.LEAVE_COLOR_UNCHANGED.value,
            help="Set Ghostscript color conversion strategy",
        )

        compression_values = [member.value for member in PdfaImageCompression]
        compression_help = (
            "Specify how to compress images in the output PDF/A. 'auto' lets "
            "OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
            "JPEG compression. 'lossless' uses PNG-style lossless compression "
            "for all images. Monochrome images are always compressed using a "
            "lossless codec. Compression settings "
            "are applied to all pages, including those for which OCR was "
            "skipped. Not supported for --output-type=pdf ; that setting "
            "preserves the original compression of all images."
        )
        group.add_argument(
            '--pdfa-image-compression',
            choices=compression_values,
            default=PdfaImageCompression.AUTO.value,
            help=compression_help,
        )
@hookimpl
def register_options():
    """Hook: expose ``GhostscriptOptions`` under the ``ghostscript`` key."""
    option_models = {'ghostscript': GhostscriptOptions}
    return option_models
@hookimpl
def add_options(parser):
    """Hook: add the Ghostscript arguments to ``parser``."""
    # The option model owns the CLI definition; delegate to it.
    GhostscriptOptions.add_arguments_to_parser(parser)
@hookimpl
def check_options(options):
    """Validate Ghostscript-related options and the installed Ghostscript.

    For ``pdfa*`` output types, Ghostscript must be installed at version 9.54
    or newer, must not be blacklisted, and must not be in the 10.0.0-10.02.0
    range when the processing mode is skip or redo. A warning is logged for
    10.6.0 and newer. The output type ``'pdfa'`` is rewritten to ``'pdfa-2'``.

    Raises:
        MissingDependencyError: If the installed Ghostscript is unsupported.
        ValueError: If the color conversion strategy is not a known one.
    """
    # Only require Ghostscript for pdfa* output types (not 'auto' or 'pdf')
    # 'auto' mode uses best-effort PDF/A without Ghostscript fallback
    if options.output_type.startswith('pdfa'):
        check_external_program(
            program='gs',
            package='ghostscript',
            version_checker=ghostscript.version,
            need_version='9.54',  # RHEL 9's version; Ubuntu 22.04 has 9.55
        )
        installed = ghostscript.version()

        if installed in BLACKLISTED_GS_VERSIONS:
            raise MissingDependencyError(
                f"Ghostscript {installed} contains serious regressions and is not "
                "supported. Please upgrade to a newer version."
            )

        in_regressed_range = Version('10.0.0') <= installed < Version('10.02.1')
        if in_regressed_range and (
            options.mode in (ProcessingMode.skip, ProcessingMode.redo)
        ):
            raise MissingDependencyError(
                f"Ghostscript 10.0.0 through 10.02.0 (your version: {installed}) "
                "contain serious regressions that corrupt PDFs with existing text, "
                "such as those processed using --skip-text or --redo-ocr "
                "(or --mode skip/redo). Please upgrade to a newer version, or use "
                "--output-type pdf to avoid Ghostscript, or use --force-ocr "
                "(or --mode force) to discard existing text."
            )

        if installed >= Version('10.6.0'):
            log.warning(
                "Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
                "images. OCRmyPDF will attempt to mitigate, but this version is "
                "strongly not recommended. Please upgrade to a newer version. "
                "As of 2025-12, 10.6.0 is the latest version of Ghostscript."
            )

    # Plain 'pdfa' is shorthand for PDF/A-2.
    if options.output_type == 'pdfa':
        options.output_type = 'pdfa-2'

    strategy_is_known = (
        options.ghostscript.color_conversion_strategy
        in ghostscript.COLOR_CONVERSION_STRATEGIES
    )
    if not strategy_is_known:
        raise ValueError(
            f"Invalid color conversion strategy: "
            f"{options.ghostscript.color_conversion_strategy}"
        )

    pdfa_capable_types = ('auto', 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3')
    if (
        options.ghostscript.pdfa_image_compression != 'auto'
        and options.output_type not in pdfa_capable_types
    ):
        log.warning(
            "--pdfa-image-compression argument only applies when "
            "--output-type is 'auto' or one of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'"
        )
@hookimpl
def rasterize_pdf_page(
    input_file,
    output_file,
    raster_device,
    raster_dpi,
    pageno,
    page_dpi,
    rotation,
    filter_vector,
    stop_on_soft_error,
    options,
    use_cropbox,
):
    """Hook: render one PDF page to an image with Ghostscript.

    Returns:
        ``None`` when ``options.rasterizer`` is ``'pypdfium'`` (this hook
        declines), otherwise ``output_file`` after Ghostscript has rendered
        the page.
    """
    # Step aside if the user explicitly asked for pypdfium; availability of
    # that rasterizer is validated in its own check_options.
    pypdfium_requested = options is not None and options.rasterizer == 'pypdfium'
    if pypdfium_requested:
        return None

    render_settings = dict(
        raster_device=raster_device,
        raster_dpi=raster_dpi,
        pageno=pageno,
        page_dpi=page_dpi,
        rotation=rotation,
        filter_vector=filter_vector,
        stop_on_error=stop_on_soft_error,
        use_cropbox=use_cropbox,
    )
    ghostscript.rasterize_pdf(input_file, output_file, **render_settings)
    return output_file
def _collect_dctdecode_images(pdf: Pdf) -> dict[tuple, list[tuple[Stream, bytes]]]:
    """Gather every DCTDecode (JPEG) image XObject reachable from the pages.

    Page-level XObject dictionaries are scanned, and Form XObjects are followed
    recursively (up to a nesting depth of 10).

    Returns:
        A dict keyed by image signature
        ``(Width, Height, Filter, BitsPerComponent, ColorSpace)`` whose values
        are lists of ``(stream, raw_bytes)`` tuples.
    """
    found: dict[tuple, list[tuple[Stream, bytes]]] = {}

    def colorspace_token(image):
        """Reduce the image's /ColorSpace to something hashable."""
        colorspace = image.get(Name.ColorSpace)
        if colorspace is None:
            return None
        if isinstance(colorspace, Name):
            return str(colorspace)
        # Array colorspaces such as [/ICCBased ...] are keyed by first element.
        try:
            if len(colorspace) > 0:
                return str(colorspace[0])
            return str(colorspace)
        except (TypeError, KeyError):
            return str(colorspace)

    def scan(xobjects, depth=0):
        """Record JPEG images in ``xobjects`` and descend into forms."""
        if xobjects is None:
            return
        if depth > 10:
            log.warning("Recursion depth exceeded in _collect_dctdecode_images")
            return

        for key in xobjects.keys():
            candidate = xobjects[key]
            if candidate is None:
                continue

            subtype = candidate.get(Name.Subtype)
            if subtype == Name.Image:
                image_filter = candidate.get(Name.Filter)
                if image_filter == Name.DCTDecode:
                    signature = (
                        int(candidate.get(Name.Width, 0)),
                        int(candidate.get(Name.Height, 0)),
                        str(image_filter),
                        int(candidate.get(Name.BitsPerComponent, 0)),
                        colorspace_token(candidate),
                    )
                    payload = candidate.read_raw_bytes()
                    found.setdefault(signature, []).append((candidate, payload))
            elif subtype == Name.Form and Name.Resources in candidate:
                # Form XObjects carry their own resources; look inside them.
                form_resources = candidate[Name.Resources]
                if Name.XObject in form_resources:
                    scan(form_resources[Name.XObject], depth=depth + 1)

    for page in pdf.pages:
        if Name.Resources in page:
            page_resources = page[Name.Resources]
            if Name.XObject in page_resources:
                scan(page_resources[Name.XObject])

    return found
def _repair_gs106_jpeg_corruption(
    input_pdf_path: Path,
    output_pdf_path: Path,
) -> bool:
    """Undo JPEG truncation introduced by Ghostscript 10.6.

    Ghostscript 10.6 has a bug that truncates JPEG data by 1-15 bytes. An
    output image counts as damaged when an input image with the same signature
    is 1-15 bytes longer and starts with exactly the output's bytes; its stream
    is then overwritten with the input image's bytes. The output PDF is saved
    in place only when at least one image was repaired.

    Returns:
        True if any repairs were made.
    """
    repairs = 0
    announced = False

    with (
        Pdf.open(input_pdf_path) as source_pdf,
        Pdf.open(output_pdf_path, allow_overwriting_input=True) as target_pdf,
    ):
        originals = _collect_dctdecode_images(source_pdf)
        produced = _collect_dctdecode_images(target_pdf)

        for sig, produced_entries in produced.items():
            # Only images with an identically-shaped original can be matched.
            if sig not in originals:
                continue
            original_entries = originals[sig]

            for target_stream, truncated in produced_entries:
                for _source_stream, pristine in original_entries:
                    kept = len(truncated)
                    missing = len(pristine) - kept

                    # Must be 1-15 bytes short and a strict prefix match.
                    if missing < 1 or missing > 15:
                        continue
                    if truncated != pristine[:kept]:
                        continue

                    if not announced:
                        log.error(
                            "Ghostscript 10.6 JPEG corruption detected. "
                            "Repairing damaged images from original PDF."
                        )
                        announced = True
                    log.warning(
                        f"Replacing corrupt JPEG image "
                        f"({sig[0]}x{sig[1]}, {missing} bytes truncated)"
                    )

                    # Put the original bytes back into the output stream.
                    target_stream.write(
                        pristine,
                        filter=Name.DCTDecode,
                    )
                    repairs += 1
                    break  # first match wins; go on to the next output image

        if repairs > 0:
            target_pdf.save(output_pdf_path)
            log.info(f"Repaired {repairs} JPEG image(s) corrupted by Ghostscript")

    return repairs > 0
@hookimpl
def generate_pdfa(
    pdf_pages,
    pdfmark,
    output_file,
    context,
    pdf_version,
    pdfa_part,
    progressbar_class,
    stop_on_soft_error,
):
    """Generate a PDF/A from the list of PDF pages and PDF/A metadata.

    Ghostscript is run over ``pdfmark`` followed by ``pdf_pages``, using the
    compression and color conversion settings from
    ``context.options.ghostscript``. When the installed Ghostscript is 10.6.0
    or newer and exactly one input PDF was given, the output is then checked
    for truncated JPEGs and repaired from that input.

    Returns:
        ``output_file``.
    """
    # The PDF/A flavour is conveyed by ``pdfa_part``; a previously computed
    # local copy of the normalized output type was never used and is gone.
    ghostscript.generate_pdfa(
        pdf_pages=[pdfmark, *pdf_pages],
        output_file=output_file,
        compression=context.options.ghostscript.pdfa_image_compression,
        color_conversion_strategy=context.options.ghostscript.color_conversion_strategy,
        pdf_version=pdf_version,
        pdfa_part=pdfa_part,
        progressbar_class=progressbar_class,
        stop_on_error=stop_on_soft_error,
    )

    # Repair JPEG corruption caused by Ghostscript 10.6.x
    gs_version = ghostscript.version()
    if gs_version >= Version('10.6.0') and len(pdf_pages) == 1:
        input_pdf = Path(pdf_pages[0])
        _repair_gs106_jpeg_corruption(input_pdf, Path(output_file))

    return output_file
================================================
FILE: src/ocrmypdf/builtin_plugins/null_ocr.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Built-in plugin implementing a null OCR engine (no OCR).
This plugin provides an OCR engine that produces no text output. It is useful
when users want OCRmyPDF's image processing, PDF/A conversion, or optimization
features without performing actual OCR.
Usage:
ocrmypdf --ocr-engine none input.pdf output.pdf
"""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from PIL import Image
from ocrmypdf import hookimpl
from ocrmypdf.hocrtransform import BoundingBox, OcrClass, OcrElement
from ocrmypdf.pluginspec import OcrEngine, OrientationConfidence
if TYPE_CHECKING:
from ocrmypdf._options import OcrOptions
class NullOcrEngine(OcrEngine):
    """A no-op OCR engine that produces no text output.

    Use this when you want OCRmyPDF's image processing, PDF/A conversion,
    or optimization features without performing actual OCR.
    """

    @staticmethod
    def version() -> str:
        """Return version string."""
        return "none"

    @staticmethod
    def creator_tag(options: OcrOptions) -> str:
        """Return creator tag for PDF metadata."""
        return "OCRmyPDF (no OCR)"

    def __str__(self) -> str:
        """Return human-readable engine name."""
        return "No OCR engine"

    @staticmethod
    def languages(options: OcrOptions) -> set[str]:
        """Return supported languages (empty set for null engine)."""
        return set()

    @staticmethod
    def get_orientation(input_file: Path, options: OcrOptions) -> OrientationConfidence:
        """Return neutral orientation (no rotation detected)."""
        return OrientationConfidence(angle=0, confidence=0.0)

    @staticmethod
    def get_deskew(input_file: Path, options: OcrOptions) -> float:
        """Return zero deskew angle."""
        return 0.0

    @staticmethod
    def supports_generate_ocr() -> bool:
        """Return True - this engine supports the generate_ocr() API."""
        return True

    @staticmethod
    def generate_ocr(
        input_file: Path,
        options: OcrOptions,
        page_number: int = 0,
    ) -> tuple[OcrElement, str]:
        """Generate empty OCR results.

        Args:
            input_file: The image file (used to get dimensions).
            options: OCR options (ignored).
            page_number: Page number (stored in result).

        Returns:
            A tuple of (empty OcrElement page, empty string).
        """
        # Get image dimensions
        with Image.open(input_file) as img:
            width, height = img.size
            # Fall back to 72 DPI when the image carries no DPI metadata.
            dpi_info = img.info.get('dpi', (72, 72))
            dpi = dpi_info[0] if isinstance(dpi_info, tuple) else dpi_info

        # Create empty page element with correct dimensions
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=width, bottom=height),
            dpi=float(dpi),
            page_number=page_number,
        )
        return page, ""

    @staticmethod
    def generate_hocr(
        input_file: Path,
        output_hocr: Path,
        output_text: Path,
        options: OcrOptions,
    ) -> None:
        """Generate empty hOCR file.

        Creates minimal valid hOCR output with no text content: a single
        ``ocr_page`` element whose bounding box covers the whole input image.
        An empty text file is written to ``output_text``.
        """
        # Get image dimensions for hOCR bbox
        with Image.open(input_file) as img:
            width, height = img.size

        # The page element must carry the bbox so that consumers of the hOCR
        # know the page size even though there are no words on it.
        hocr_content = f'''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>OCRmyPDF - No OCR</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
<meta name="ocr-system" content="OCRmyPDF null engine"/>
<meta name="ocr-capabilities" content="ocr_page"/>
</head>
<body>
<div class="ocr_page" id="page_1" title="bbox 0 0 {width} {height}">
</div>
</body>
</html>
'''
        output_hocr.write_text(hocr_content, encoding='utf-8')
        output_text.write_text('', encoding='utf-8')

    @staticmethod
    def generate_pdf(
        input_file: Path,
        output_pdf: Path,
        output_text: Path,
        options: OcrOptions,
    ) -> None:
        """NullOcrEngine cannot generate PDFs directly.

        Use pdf_renderer='fpdf2' instead of 'sandwich'.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "NullOcrEngine cannot generate PDFs directly. "
            "Use --pdf-renderer fpdf2 instead of sandwich mode."
        )
@hookimpl
def get_ocr_engine(options):
    """Return NullOcrEngine when --ocr-engine none is selected.

    When ``options`` is given and its ``ocr_engine`` attribute (treated as
    ``'auto'`` if absent) is anything other than ``'none'``, this hook returns
    ``None``.
    """
    # NOTE(review): when options is None the null engine appears to be
    # returned unconditionally — confirm this is intended.
    if options is not None:
        ocr_engine = getattr(options, 'ocr_engine', 'auto')
        if ocr_engine != 'none':
            return None
    return NullOcrEngine()
================================================
FILE: src/ocrmypdf/builtin_plugins/optimize.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Built-in plugin to implement PDF page optimization."""
from __future__ import annotations
import argparse
import logging
from collections.abc import Sequence
from pathlib import Path
from typing import Annotated
from pydantic import BaseModel, Field, model_validator
from ocrmypdf import Executor, PdfContext, hookimpl
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._pipeline import get_pdf_save_settings
from ocrmypdf.cli import numeric
from ocrmypdf.optimize import optimize
from ocrmypdf.subprocess import check_external_program
# Module-level logger for the optimization plugin.
log = logging.getLogger(__name__)
class OptimizeOptions(BaseModel):
    """Options specific to PDF optimization."""

    level: Annotated[
        int,
        Field(
            ge=0,
            le=3,
            description="Optimization level (0=none, 1=safe, 2=lossy, 3=aggressive)",
        ),
    ] = 1
    jpeg_quality: Annotated[
        int, Field(ge=0, le=100, description="JPEG quality level for optimization")
    ] = 0
    png_quality: Annotated[
        int, Field(ge=0, le=100, description="PNG quality level for optimization")
    ] = 0
    jbig2_threshold: Annotated[
        float,
        Field(ge=0.4, le=0.9, description="JBIG2 symbol classification threshold"),
    ] = 0.85

    @classmethod
    def add_arguments_to_parser(cls, parser, namespace: str = 'optimize'):
        """Add optimization-specific arguments to the argument parser.

        Args:
            parser: The argument parser to add arguments to
            namespace: The namespace prefix for argument names
                (not used for optimize for backward compatibility)
        """
        optimizing = parser.add_argument_group(
            "Optimization options", "Control how the PDF is optimized after OCR"
        )
        optimizing.add_argument(
            '-O',
            '--optimize',
            type=int,
            choices=range(0, 4),
            default=1,
            # The help no longer points at --jbig2-lossy: that flag is
            # deprecated and ignored (see the deprecated arguments below).
            help=(
                "Control how PDF is optimized after processing: "
                "0 - do not optimize; "
                "1 - do safe, lossless optimizations (default); "
                "2 - do lossy JPEG and JPEG2000 optimizations; "
                "3 - do more aggressive lossy JPEG and JPEG2000 optimizations."
            ),
        )
        optimizing.add_argument(
            '--jpeg-quality',
            type=numeric(int, 0, 100),
            default=0,
            metavar='Q',
            help=(
                "Adjust JPEG quality level for JPEG optimization. "
                "100 is best quality and largest output size; "
                "1 is lowest quality and smallest output; "
                "0 uses the default."
            ),
        )
        optimizing.add_argument(
            '--jpg-quality',
            type=numeric(int, 0, 100),
            default=0,
            metavar='Q',
            dest='jpeg_quality',
            help=argparse.SUPPRESS,  # Alias for --jpeg-quality
        )
        optimizing.add_argument(
            '--png-quality',
            type=numeric(int, 0, 100),
            default=0,
            metavar='Q',
            help=(
                "Adjust PNG quality level to use when quantizing PNGs. "
                "Values have same meaning as with --jpeg-quality"
            ),
        )
        # Deprecated arguments - kept for backward compatibility, emit warnings
        optimizing.add_argument(
            '--jbig2-lossy',
            action='store_true',
            help=argparse.SUPPRESS,  # Deprecated, hidden from help
        )
        optimizing.add_argument(
            '--jbig2-page-group-size',
            type=numeric(int, 1, 10000),
            default=0,
            metavar='N',
            help=argparse.SUPPRESS,  # Deprecated, hidden from help
        )
        optimizing.add_argument(
            '--jbig2-threshold',
            type=numeric(float, 0.4, 0.9),
            default=0.85,
            metavar='T',
            help=(
                "Adjust JBIG2 symbol code classification threshold "
                "(default 0.85), range 0.4 to 0.9."
            ),
        )

    @model_validator(mode='after')
    def validate_optimization_consistency(self):
        """Validate optimization options are consistent.

        Logs a warning (but does not fail) when a quality setting is given
        while optimization is disabled.
        """
        if self.level == 0 and (self.png_quality > 0 or self.jpeg_quality > 0):
            log.warning(
                "The arguments --png-quality and --jpeg-quality "
                "will be ignored because --optimize=0."
            )
        return self

    def validate_with_context(
        self, external_programs_available: dict[str, bool]
    ) -> None:
        """Validate options that require external context.

        Only logs warnings; missing programs are treated as unavailable.

        Args:
            external_programs_available: Dict of program name -> availability
        """
        if self.level >= 2:
            if not external_programs_available.get('pngquant', False):
                log.warning(
                    "pngquant is not available, so PNG optimization will be limited"
                )
            if not external_programs_available.get('jbig2enc', False):
                log.warning(
                    "jbig2enc is not available, so JBIG2 optimization will be limited"
                )
@hookimpl
def register_options():
    """Hook: expose ``OptimizeOptions`` under the ``optimize`` key."""
    option_models = {'optimize': OptimizeOptions}
    return option_models
@hookimpl
def add_options(parser):
    """Hook: add the optimization arguments to ``parser``."""
    # The option model owns the CLI definition; delegate to it.
    OptimizeOptions.add_arguments_to_parser(parser)
@hookimpl
def check_options(options):
    """Warn about deprecated flags and verify optimizer programs.

    Deprecated JBIG2 options only produce log warnings. For ``--optimize`` 2
    or 3, pngquant (>= 2.12.2) is checked, followed by jbig2enc (>= 0.28),
    which is checked as a recommended program.
    """
    lossy_requested = getattr(options, 'jbig2_lossy', False)
    if lossy_requested:
        log.warning(
            "The --jbig2-lossy option is deprecated and will be ignored. "
            "Lossy JBIG2 compression has been removed due to risks of "
            "character substitution errors."
        )

    group_size = getattr(options, 'jbig2_page_group_size', 0)
    if group_size not in (0, None):
        log.warning(
            "The --jbig2-page-group-size option is deprecated and will be ignored."
        )

    if options.optimize < 2:
        return

    check_external_program(
        program='pngquant',
        package='pngquant',
        version_checker=pngquant.version,
        need_version='2.12.2',
        required_for='--optimize {2,3}',
    )
    # Although we use JBIG2 for optimize=1, don't nag about it unless the
    # user is asking for more optimization
    check_external_program(
        program='jbig2',
        package='jbig2enc',
        version_checker=jbig2enc.version,
        need_version='0.28',
        required_for='--optimize {2,3}',
        recommended=True,
    )
@hookimpl
def optimize_pdf(
    input_pdf: Path,
    output_pdf: Path,
    context: PdfContext,
    executor: Executor,
    linearize: bool,
) -> tuple[Path, Sequence[str]]:
    """Hook: optimize ``input_pdf`` into ``output_pdf``.

    Returns:
        The path returned by ``optimize`` together with a list of messages:
        a single notice when optimization is disabled, otherwise one message
        per optional image optimizer (jbig2, pngquant) that is unavailable.
    """
    save_settings = dict(
        linearize=linearize,
        **get_pdf_save_settings(context.options.output_type),
    )
    result_path = optimize(input_pdf, output_pdf, context, save_settings, executor)

    if context.options.optimize == 0:
        return result_path, ["Optimization was disabled."]

    optimizer_available = {
        'jbig2': jbig2enc.available(),
        'pngquant': pngquant.available(),
    }
    messages = [
        f"The optional dependency '{name}' was not found, so some image "
        f"optimizations could not be attempted."
        for name, available in optimizer_available.items()
        if not available
    ]
    return result_path, messages
@hookimpl
def is_optimization_enabled(context: PdfContext) -> bool:
    """Hook: report whether the ``optimize`` option is anything other than 0."""
    requested_level = context.options.optimize
    return requested_level != 0
================================================
FILE: src/ocrmypdf/builtin_plugins/pypdfium.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Built-in plugin to implement PDF page rasterization using pypdfium2."""
from __future__ import annotations
import logging
import threading
from contextlib import closing
from pathlib import Path
from typing import TYPE_CHECKING, Literal
if TYPE_CHECKING:
import pypdfium2 as pdfium
else:
try:
import pypdfium2 as pdfium
except ImportError:
pdfium = None
from PIL import Image
from ocrmypdf import hookimpl
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.helpers import Resolution
log = logging.getLogger(__name__)
# pypdfium2/PDFium is not thread-safe. All calls to the library must be serialized.
# See: https://pypdfium2.readthedocs.io/en/stable/python_api.html#incompatibility-with-threading
# When using process-based parallelism (use_threads=False), each process has its own
# pdfium instance, so locking is not needed across processes.
_pdfium_lock = threading.Lock()
@hookimpl
def check_options(options):
    """Fail early when the pypdfium rasterizer is requested but not importable.

    Raises:
        MissingDependencyError: If ``options.rasterizer`` is ``'pypdfium'`` and
            the pypdfium2 import at module load did not succeed.
    """
    explicitly_requested = options.rasterizer == 'pypdfium'
    if not explicitly_requested or pdfium is not None:
        return
    raise MissingDependencyError(
        "The --rasterizer pypdfium option requires the pypdfium2 package. "
        "Install it with: pip install pypdfium2"
    )
def _open_pdf_document(input_file: Path):
    """Return a ``pdfium.PdfDocument`` opened on *input_file*.

    Callers are expected to have verified that pypdfium2 was imported.
    """
    assert pdfium is not None, "pypdfium2 must be available to call this function"
    document = pdfium.PdfDocument(input_file)
    return document
def _calculate_mediabox_crop(page) -> tuple[float, float, float, float]:
    """Compute the crop tuple that widens a render from the CropBox to the MediaBox.

    The values are the per-edge differences between the page's MediaBox and
    CropBox, signed so that an edge where the MediaBox extends beyond the
    CropBox yields a negative number (expand) and the reverse yields a
    positive number (shrink).

    Returns:
        Tuple of (left, bottom, right, top) crop values.
    """
    # Both boxes are indexed as (left, bottom, right, top).
    media = page.get_mediabox()
    crop = page.get_cropbox()

    left = media[0] - crop[0]
    bottom = media[1] - crop[1]
    # Right/top edges grow in the opposite direction, hence the flipped operands.
    right = crop[2] - media[2]
    top = crop[3] - media[3]
    return (left, bottom, right, top)
def _render_page_to_bitmap(
    page: pdfium.PdfPage,
    raster_device: str,
    raster_dpi: Resolution,
    rotation: int | None,
    use_cropbox: bool,
) -> tuple[pdfium.PdfBitmap, int, int]:
    """Render *page* and report the pixel dimensions the caller should expect.

    Args:
        page: The page to render; its rotation is modified when *rotation* is set.
        raster_device: Ghostscript-style device name; selects grayscale rendering.
        raster_dpi: Requested resolution, rounded to 6 digits before use.
        rotation: Counter-clockwise rotation in degrees, or a falsy value for none.
        use_cropbox: Render the CropBox as-is when true, otherwise widen the
            render area to the MediaBox.

    Returns:
        Tuple of (bitmap, expected_width, expected_height).
    """
    # Same precision Ghostscript is given
    dpi = raster_dpi.round(6)

    # Page size must be read before the rotation below is applied
    width_pts, height_pts = page.get_size()
    expected_width = int(round(width_pts * dpi.x / 72.0))
    expected_height = int(round(height_pts * dpi.y / 72.0))

    # pypdfium2 scales relative to 72 units per inch
    scale = dpi.to_scalar() / 72.0

    if rotation:
        # Our rotation is counter-clockwise, pypdfium2 wants clockwise: negate
        page.set_rotation(-rotation % 360)
        if rotation % 180 == 90:
            # Quarter turns exchange width and height in the output
            expected_width, expected_height = expected_height, expected_width

    # Mono and gray devices render in grayscale (better input for 1-bit conversion)
    grayscale = raster_device.lower() in ('pngmono', 'pnggray', 'jpeggray')

    # MediaBox is the default so results are consistent with Ghostscript
    if use_cropbox:
        crop = (0, 0, 0, 0)
    else:
        crop = _calculate_mediabox_crop(page)

    bitmap = page.render(
        scale=scale,
        rotation=0,  # rotation was already applied to the page itself
        crop=crop,
        may_draw_forms=True,
        draw_annots=True,
        grayscale=grayscale,
        # Note: there is no direct pypdfium2 equivalent of filter_vector
    )
    return bitmap, expected_width, expected_height
def _process_image_for_output(
    pil_image: Image.Image,
    raster_device: str,
    raster_dpi: Resolution,
    page_dpi: Resolution | None,
    stop_on_soft_error: bool,
    expected_width: int | None = None,
    expected_height: int | None = None,
) -> tuple[Image.Image, Literal['PNG', 'TIFF', 'JPEG']]:
    """Process PIL image for output format and set DPI metadata.

    Args:
        pil_image: The rendered page image.
        raster_device: Ghostscript-style device name (case-insensitive) that
            selects the pixel mode and the output file format.
        raster_dpi: Resolution used for rendering; recorded as DPI metadata
            when *page_dpi* is not given.
        page_dpi: Resolution to record as DPI metadata, if any.
        stop_on_soft_error: Raise instead of falling back to PNG when the
            device is not recognized.
        expected_width: Pixel width the caller expects, if known.
        expected_height: Pixel height the caller expects, if known.

    Returns:
        Tuple of (processed image, format name to save with).

    Raises:
        ValueError: If the device is not recognized and *stop_on_soft_error*
            is true.
    """
    # Correct dimensions if slightly off (within 2 pixels tolerance).
    if expected_width and expected_height:
        actual_width, actual_height = pil_image.width, pil_image.height
        width_diff = abs(actual_width - expected_width)
        height_diff = abs(actual_height - expected_height)
        # Both axes must be within tolerance. Testing them with "or" would let
        # an image that is far off on one axis be squashed to the expected size.
        if 0 < max(width_diff, height_diff) <= 2:
            log.debug(
                "Adjusting rendered dimensions from %dx%d to expected %dx%d",
                actual_width,
                actual_height,
                expected_width,
                expected_height,
            )
            pil_image = pil_image.resize(
                (expected_width, expected_height), Image.Resampling.LANCZOS
            )

    # Record DPI metadata: prefer the page DPI, fall back to the raster DPI.
    # PIL expects DPI as a tuple.
    dpi_source = page_dpi if page_dpi else raster_dpi
    pil_image.info['dpi'] = (float(dpi_source.x), float(dpi_source.y))

    # Convert image mode to match raster_device
    # This ensures pypdfium output matches Ghostscript's native device output
    raster_device_lower = raster_device.lower()
    if raster_device_lower == 'pngmono':
        # Convert to 1-bit black and white (matches Ghostscript pngmono device)
        if pil_image.mode != '1':
            if pil_image.mode != 'L':
                pil_image = pil_image.convert('L')
            pil_image = pil_image.convert('1')
    elif raster_device_lower in ('pnggray', 'jpeggray'):
        # Convert to 8-bit grayscale
        if pil_image.mode not in ('L', '1'):
            pil_image = pil_image.convert('L')
    elif raster_device_lower == 'png256':
        # Convert to 8-bit indexed color (256 colors)
        if pil_image.mode != 'P':
            if pil_image.mode not in ('RGB', 'RGBA'):
                pil_image = pil_image.convert('RGB')
            pil_image = pil_image.quantize(colors=256)
    elif raster_device_lower in ('png16m', 'jpeg'):
        # Convert to RGB
        if pil_image.mode == 'RGBA':
            # Flatten transparency onto a white background
            background = Image.new('RGB', pil_image.size, (255, 255, 255))
            background.paste(pil_image, mask=pil_image.split()[-1])
            pil_image = background
        elif pil_image.mode != 'RGB':
            pil_image = pil_image.convert('RGB')
    # pngalpha: keep RGBA as-is

    # Determine output format based on raster_device
    png_devices = ('png', 'pngmono', 'pnggray', 'png256', 'png16m', 'pngalpha')
    if raster_device_lower in png_devices:
        format_name = 'PNG'
    elif raster_device_lower in ('jpeg', 'jpeggray', 'jpg'):
        format_name = 'JPEG'
    elif raster_device_lower in ('tiff', 'tif'):
        format_name = 'TIFF'
    else:
        # Default to PNG for unknown formats
        format_name = 'PNG'
        if stop_on_soft_error:
            raise ValueError(f"Unsupported raster device: {raster_device}")
        log.warning(f"Unsupported raster device {raster_device}, using PNG")

    return pil_image, format_name
def _save_image(pil_image: Image.Image, output_file: Path, format_name: str) -> None:
    """Write *pil_image* to *output_file*, forwarding DPI metadata when present.

    The ``dpi`` entry of ``pil_image.info`` is passed to ``save`` for PNG, TIFF
    and JPEG output; for any other format name it is not forwarded.
    """
    dpi_capable_formats = ('PNG', 'TIFF', 'JPEG')
    save_kwargs = {}
    if format_name in dpi_capable_formats and 'dpi' in pil_image.info:
        save_kwargs['dpi'] = pil_image.info['dpi']
    pil_image.save(output_file, format=format_name, **save_kwargs)
@hookimpl
def rasterize_pdf_page(
    input_file: Path,
    output_file: Path,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int,
    page_dpi: Resolution | None,
    rotation: int | None,
    filter_vector: bool,
    stop_on_soft_error: bool,
    options,
    use_cropbox: bool,
) -> Path | None:
    """Rasterize one page of a PDF to an image file with pypdfium2.

    Returns:
        *output_file* once the image has been written, or None when the user
        selected the Ghostscript rasterizer or pypdfium2 is not importable, so
        that another rasterizer can take over.
    """
    ghostscript_requested = (
        options is not None and options.rasterizer == 'ghostscript'
    )
    if ghostscript_requested or pdfium is None:
        # Decline the page; Ghostscript handles it instead
        return None

    # All pypdfium2 calls happen while holding the module-wide lock
    with _pdfium_lock:
        with closing(_open_pdf_document(input_file)) as pdf:
            # pageno is 1-based, the document is indexed from 0
            with closing(pdf[pageno - 1]) as page:
                bitmap, expected_width, expected_height = _render_page_to_bitmap(
                    page, raster_device, raster_dpi, rotation, use_cropbox
                )
                with closing(bitmap):
                    pil_image = bitmap.to_pil()

    # Post-processing and saving only involve PIL, so the lock is released first
    pil_image, format_name = _process_image_for_output(
        pil_image,
        raster_device,
        raster_dpi,
        page_dpi,
        stop_on_soft_error,
        expected_width,
        expected_height,
    )
    _save_image(pil_image, output_file, format_name)
    return output_file
================================================
FILE: src/ocrmypdf/builtin_plugins/tesseract_ocr.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Built-in plugin to implement OCR using Tesseract."""
from __future__ import annotations
import argparse
import logging
import os
from typing import Annotated
from PIL import Image
from pydantic import BaseModel, Field, field_validator, model_validator
from ocrmypdf import hookimpl
from ocrmypdf._exec import tesseract
from ocrmypdf._exec.tesseract import ThresholdingMethod
from ocrmypdf._jobcontext import PageContext
from ocrmypdf.cli import numeric
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
from ocrmypdf.helpers import available_cpu_count, clamp
from ocrmypdf.imageops import calculate_downsample, downsample_image
from ocrmypdf.pluginspec import OcrEngine
from ocrmypdf.subprocess import check_external_program
log = logging.getLogger(__name__)
def _thresholding_method_converter(value: str) -> ThresholdingMethod:
    """Translate a command line string into a ThresholdingMethod member.

    The lookup is case-insensitive.

    Args:
        value: Name of the thresholding method (auto, otsu, adaptive-otsu, sauvola)

    Returns:
        The matching ThresholdingMethod enum value

    Raises:
        argparse.ArgumentTypeError: If value does not name a known method
    """
    method_map = {
        'auto': ThresholdingMethod.AUTO,
        'otsu': ThresholdingMethod.OTSU,
        'adaptive-otsu': ThresholdingMethod.ADAPTIVE_OTSU,
        'sauvola': ThresholdingMethod.SAUVOLA,
    }
    method = method_map.get(value.lower())
    if method is None:
        valid = ', '.join(method_map.keys())
        raise argparse.ArgumentTypeError(
            f"Invalid thresholding method '{value}'. Must be one of: {valid}"
        )
    return method
class TesseractOptions(BaseModel):
    """Options specific to Tesseract OCR engine.

    The fields mirror the command line arguments registered by
    :meth:`add_arguments_to_parser`. Validators on this model only log
    warnings; the one check that raises is :meth:`validate_with_context`.
    """

    config: Annotated[
        list[str], Field(description="Additional Tesseract configuration files")
    ] = []
    pagesegmode: Annotated[
        int | None,
        Field(ge=0, le=13, description="Set Tesseract page segmentation mode"),
    ] = None
    oem: Annotated[
        int | None, Field(ge=0, le=3, description="Set Tesseract OCR engine mode")
    ] = None
    thresholding: Annotated[
        ThresholdingMethod,
        Field(description="Set Tesseract input image thresholding mode"),
    ] = ThresholdingMethod.AUTO
    timeout: Annotated[
        float, Field(ge=0, description="Timeout for OCR operations in seconds")
    ] = 180.0
    non_ocr_timeout: Annotated[
        float, Field(ge=0, description="Timeout for non-OCR operations in seconds")
    ] = 180.0
    downsample_large_images: Annotated[
        bool, Field(description="Downsample large images before OCR")
    ] = True
    downsample_above: Annotated[
        int,
        Field(
            ge=100,
            le=32767,
            description="Downsample images larger than this pixel size",
        ),
    ] = 32767
    user_words: Annotated[
        str | None, Field(description="Path to Tesseract user words file")
    ] = None
    user_patterns: Annotated[
        str | None, Field(description="Path to Tesseract user patterns file")
    ] = None
    # Not a command line option: declared with exclude=True and filled in later
    # by the plugin's validate hook.
    omp_thread_limit: Annotated[
        int | None,
        Field(
            description="Calculated OMP_THREAD_LIMIT for Tesseract subprocesses",
            exclude=True,
        ),
    ] = None

    @classmethod
    def add_arguments_to_parser(cls, parser, namespace: str = 'tesseract'):
        """Add Tesseract-specific arguments to the argument parser.

        Args:
            parser: The argument parser to add arguments to
            namespace: The namespace prefix for argument names
        """
        tess = parser.add_argument_group(
            "Tesseract", "Advanced control of Tesseract OCR"
        )
        tess.add_argument(
            f'--{namespace}-config',
            action='append',
            metavar='CFG',
            default=[],
            dest=f'{namespace}_config',
            help="Additional Tesseract configuration files -- see documentation.",
        )
        tess.add_argument(
            f'--{namespace}-pagesegmode',
            action='store',
            type=int,
            metavar='PSM',
            choices=range(0, 14),
            dest=f'{namespace}_pagesegmode',
            help="Set Tesseract page segmentation mode (see tesseract --help).",
        )
        tess.add_argument(
            f'--{namespace}-oem',
            action='store',
            type=int,
            metavar='MODE',
            choices=range(0, 4),
            dest=f'{namespace}_oem',
            help=(
                "Set Tesseract 4+ OCR engine mode: "
                "0 - original Tesseract only; "
                "1 - neural nets LSTM only; "
                "2 - Tesseract + LSTM; "
                "3 - default."
            ),
        )
        tess.add_argument(
            f'--{namespace}-thresholding',
            action='store',
            type=_thresholding_method_converter,
            default='auto',
            dest=f'{namespace}_thresholding',
            help=(
                "Set Tesseract 5.0+ input image thresholding mode. This may improve "
                "OCR results on low quality images or those that contain high "
                "contrast color. Options: auto, otsu, adaptive-otsu, sauvola. "
                "auto/otsu is the Tesseract default (legacy Otsu); adaptive-otsu "
                "is an improved Otsu algorithm with improved sort for background "
                "color changes; sauvola is based on local standard deviation."
            ),
        )
        tess.add_argument(
            f'--{namespace}-timeout',
            default=180.0,
            type=numeric(float, 0),
            metavar='SECONDS',
            dest=f'{namespace}_timeout',
            help=(
                "Give up on OCR after the timeout, but copy the preprocessed page "
                "into the final output. This timeout is only used when using Tesseract "
                "for OCR. When Tesseract is used for other operations such as "
                "deskewing and orientation, the timeout is controlled by "
                f"--{namespace}-non-ocr-timeout."
            ),
        )
        tess.add_argument(
            f'--{namespace}-non-ocr-timeout',
            default=180.0,
            type=numeric(float, 0),
            metavar='SECONDS',
            dest=f'{namespace}_non_ocr_timeout',
            help=(
                "Give up on non-OCR operations such as deskewing and orientation "
                f"after timeout. This is a separate timeout from --{namespace}-timeout "
                "because these operations are not as expensive as OCR."
            ),
        )
        tess.add_argument(
            f'--{namespace}-downsample-large-images',
            action=argparse.BooleanOptionalAction,
            default=True,
            dest=f'{namespace}_downsample_large_images',
            help=(
                "Downsample large images before OCR. Tesseract has "
                "an upper limit on the size images it will support."
                " If this argument is given, OCRmyPDF will "
                "downsample large images to fit Tesseract. This "
                "may reduce OCR quality, on large images the most"
                " desirable text is usually larger. If this "
                "parameter is not supplied, Tesseract will error "
                "out and produce no OCR on the page in question. "
                "This argument should be used with a high value "
                f"of --{namespace}-timeout to ensure Tesseract "
                "has enough to time."
            ),
        )
        tess.add_argument(
            f'--{namespace}-downsample-above',
            action='store',
            type=numeric(int, 100, 32767),
            default=32767,
            dest=f'{namespace}_downsample_above',
            help=(
                "Downsample images larger than this size pixel size (either dimension) "
                f"before OCR. --{namespace}-downsample-large-images downsamples when "
                "an image exceeds Tesseract's internal limits. This argument causes "
                "downsampling to occur when an image exceeds the given size. This may "
                "reduce OCR quality, but on large images the most desirable text is "
                "usually larger."
            ),
        )
        # The two user-file options are not prefixed with the namespace.
        tess.add_argument(
            '--user-words',
            metavar='FILE',
            dest='user_words',
            help="Specify the location of the Tesseract user words file. This is a "
            "list of words Tesseract should consider while performing OCR in "
            "addition to its standard language dictionaries. This can improve "
            "OCR quality especially for specialized and technical documents.",
        )
        tess.add_argument(
            '--user-patterns',
            metavar='FILE',
            dest='user_patterns',
            help="Specify the location of the Tesseract user patterns file.",
        )

    @field_validator('timeout', 'non_ocr_timeout')
    @classmethod
    def validate_timeout_reasonable(cls, v):
        """Warn (without rejecting) when a timeout exceeds one hour."""
        if v > 3600:  # 1 hour
            log.warning(f"Timeout of {v} seconds is very long and may cause issues")
        return v

    @field_validator('pagesegmode')
    @classmethod
    def validate_pagesegmode_warning(cls, v):
        """Warn (without rejecting) about page segmentation modes 0 and 2."""
        if v in (0, 2):
            log.warning(
                "The tesseract-pagesegmode you selected will disable OCR. "
                "This may cause processing to fail."
            )
        return v

    @model_validator(mode='after')
    def validate_downsample_consistency(self):
        """Warn when downsample_above is customized but downsampling is off."""
        if self.downsample_above != 32767 and not self.downsample_large_images:
            log.warning(
                "The --tesseract-downsample-above argument will have no effect unless "
                "--tesseract-downsample-large-images is also given."
            )
        return self

    def validate_with_context(self, languages: list[str]) -> None:
        """Validate options that require external context.

        Args:
            languages: List of languages being used for OCR

        Raises:
            BadArgsError: If any requested language is reserved for Tesseract's
                internal use.
        """
        # Validate languages are not internal Tesseract languages
        DENIED_LANGUAGES = {'equ', 'osd'}
        # Sorted so the error message is stable; set iteration order is not.
        denied = sorted(DENIED_LANGUAGES & set(languages))
        if denied:
            raise BadArgsError(
                "The following languages are for Tesseract's internal use "
                "and should not be issued explicitly: "
                f"{', '.join(denied)}\n"
                "Remove them from the -l/--language argument."
            )
@hookimpl
def register_options():
    """Expose the Tesseract option model under the ``tesseract`` namespace."""
    return dict(tesseract=TesseractOptions)
@hookimpl
def add_options(parser):
    """Register every Tesseract command line argument on *parser*."""
    # The option model owns the CLI definitions for all Tesseract options
    TesseractOptions.add_arguments_to_parser(parser, namespace='tesseract')
@hookimpl
def check_options(options):
    """Check external dependencies and version compatibility for Tesseract.

    Raises:
        MissingDependencyError: If the installed Tesseract is version 5.4.0.
            ``check_external_program`` performs the presence and minimum
            version check before that.
    """
    check_external_program(
        program='tesseract',
        package={'linux': 'tesseract-ocr'},
        version_checker=tesseract.version,
        need_version='4.1.1',  # Ubuntu 22.04 version (also 20.04)
        version_parser=tesseract.TesseractVersion,
    )
    tess_version = tesseract.version()
    if tess_version == tesseract.TesseractVersion('5.4.0'):
        raise MissingDependencyError(
            "Tesseract 5.4.0 is not supported due to regressions in this version. "
            "Please upgrade to a newer or supported older version."
        )
    # Check version-specific feature compatibility
    if (
        not tesseract.has_thresholding()
        and options.tesseract.thresholding != ThresholdingMethod.AUTO
    ):
        # Name the flag exactly as it is registered (--tesseract-thresholding)
        log.warning(
            "The installed version of Tesseract does not support changes to its "
            "thresholding method. The --tesseract-thresholding argument will be "
            "ignored."
        )
@hookimpl
def validate(pdfinfo, options):
    """Choose the OpenMP thread limit for Tesseract and warn on odd settings.

    The chosen limit is stored in ``options.tesseract.omp_thread_limit``.
    """
    # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
    # to manage how many threads it uses to avoid creating more total threads than
    # cores. Performance testing shows we're better off
    # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
    # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
    # input file is small, then we allow Tesseract to use threads, subject to the
    # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
    # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
    env_limit = os.environ.get('OMP_THREAD_LIMIT', '')
    # isdecimal(), not isnumeric(): the latter accepts characters such as
    # superscripts and fractions that int() cannot parse.
    if env_limit.isdecimal():
        tess_threads = int(env_limit)
    else:
        jobs = options.jobs or available_cpu_count()
        # Guard the division in case pdfinfo reports no pages
        page_count = max(len(pdfinfo), 1)
        tess_threads = clamp(jobs // page_count, 1, 3)

    # Store the thread limit in options - it will be passed to subprocess env
    options.tesseract.omp_thread_limit = tess_threads
    log.debug("Using Tesseract OpenMP thread limit %d", tess_threads)

    if (
        options.tesseract.downsample_above != 32767
        and not options.tesseract.downsample_large_images
    ):
        log.warning(
            "The --tesseract-downsample-above argument will have no effect unless "
            "--tesseract-downsample-large-images is also given."
        )
@hookimpl
def filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image:
    """Shrink the image, if allowed, so that Tesseract can accept it.

    Tesseract cannot handle images with more than 32767 pixels in either axis,
    or more than 2**31 bytes. When downsampling of large images is enabled the
    image is resized to fit within those limits (or the lower user-supplied
    pixel limit); otherwise it is returned unchanged.
    """
    tess_options = getattr(page.options, 'tesseract', None)
    if tess_options is None:
        return image

    # The user may lower the pixel limit but never raise it above 32767
    pixel_limit = min(tess_options.downsample_above, 32767)
    if not tess_options.downsample_large_images:
        return image

    target_size = calculate_downsample(
        image, max_size=(pixel_limit, pixel_limit), max_bytes=(2**31) - 1
    )
    return downsample_image(image, target_size)
class TesseractOcrEngine(OcrEngine):
    """OCR engine that delegates to the ``ocrmypdf._exec.tesseract`` wrappers."""

    @staticmethod
    def version():
        """Return the Tesseract version as a string."""
        return str(tesseract.version())

    @staticmethod
    def _determine_renderer(options):
        """Resolve the configured PDF renderer, mapping 'auto' to 'fpdf2'."""
        requested = options.pdf_renderer
        return 'fpdf2' if requested == 'auto' else requested

    @staticmethod
    def _ocr_call_kwargs(options):
        """Collect the keyword arguments shared by hOCR and PDF generation."""
        tess_options = options.tesseract
        return dict(
            languages=options.languages,
            engine_mode=tess_options.oem,
            tessconfig=tess_options.config,
            timeout=tess_options.timeout,
            pagesegmode=tess_options.pagesegmode,
            thresholding=tess_options.thresholding,
            user_words=tess_options.user_words,
            user_patterns=tess_options.user_patterns,
            omp_thread_limit=tess_options.omp_thread_limit,
        )

    @staticmethod
    def creator_tag(options):
        """Build the creator string for the renderer that will be used."""
        renderer = TesseractOcrEngine._determine_renderer(options)
        if renderer == 'hocr':
            return f"OCRmyPDF hOCR + Tesseract OCR {TesseractOcrEngine.version()}"
        if renderer == 'fpdf2':
            return f"OCRmyPDF fpdf2 + Tesseract OCR {TesseractOcrEngine.version()}"
        if renderer == "sandwich":
            return f"Tesseract OCR + PDF {TesseractOcrEngine.version()}"
        return f"Tesseract OCR {TesseractOcrEngine.version()}"

    def __str__(self):
        """Return the engine name together with its version."""
        return f"Tesseract OCR {TesseractOcrEngine.version()}"

    @staticmethod
    def languages(options):
        """Return the languages reported by the Tesseract installation."""
        return tesseract.get_languages()

    @staticmethod
    def get_orientation(input_file, options):
        """Ask Tesseract for the page orientation, using the non-OCR timeout."""
        tess_options = options.tesseract
        return tesseract.get_orientation(
            input_file,
            engine_mode=tess_options.oem,
            timeout=tess_options.non_ocr_timeout,
            omp_thread_limit=tess_options.omp_thread_limit,
        )

    @staticmethod
    def get_deskew(input_file, options) -> float:
        """Ask Tesseract for the deskew angle, using the non-OCR timeout."""
        tess_options = options.tesseract
        return tesseract.get_deskew(
            input_file,
            languages=options.languages,
            engine_mode=tess_options.oem,
            timeout=tess_options.non_ocr_timeout,
            omp_thread_limit=tess_options.omp_thread_limit,
        )

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run Tesseract to produce hOCR and text output for one image."""
        tesseract.generate_hocr(
            input_file=input_file,
            output_hocr=output_hocr,
            output_text=output_text,
            **TesseractOcrEngine._ocr_call_kwargs(options),
        )

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run Tesseract to produce PDF and text output for one image."""
        tesseract.generate_pdf(
            input_file=input_file,
            output_pdf=output_pdf,
            output_text=output_text,
            **TesseractOcrEngine._ocr_call_kwargs(options),
        )
@hookimpl
def get_ocr_engine(options):
    """Return a TesseractOcrEngine when Tesseract is selected or implied.

    Tesseract is chosen when *options* is None, when it has no ``ocr_engine``
    attribute, or when that attribute is 'auto' or 'tesseract'. For any other
    value None is returned.
    """
    if options is None:
        selected = 'auto'
    else:
        selected = getattr(options, 'ocr_engine', 'auto')
    return TesseractOcrEngine() if selected in ('auto', 'tesseract') else None
================================================
FILE: src/ocrmypdf/cli.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Command line interface customization and validation."""
from __future__ import annotations
import argparse
from argparse import ArgumentParser
from collections.abc import Callable, Mapping
from typing import Any, TypeVar
from ocrmypdf._defaults import DEFAULT_ROTATE_PAGES_THRESHOLD
from ocrmypdf._defaults import PROGRAM_NAME as _PROGRAM_NAME
from ocrmypdf._options import OcrOptions, ProcessingMode, TaggedPdfMode
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._version import __version__ as _VERSION
T = TypeVar('T', int, float)
def numeric(basetype: Callable[[Any], T], min_: T | None = None, max_: T | None = None):
    """Validator for numeric command line parameters.

    Stipulates that the value must be of type basetype (typically int or float), and
    optionally, within the range [min_, max_].

    Args:
        basetype: Callable that converts the argument string to a number.
        min_: Inclusive lower bound, or None for no lower bound.
        max_: Inclusive upper bound, or None for no upper bound.

    Returns:
        A converter for use as an argparse ``type=``. It raises
        ``argparse.ArgumentTypeError`` when the converted value is outside the
        range. Its ``__name__`` is set to that of *basetype*.
    """
    min_ = basetype(min_) if min_ is not None else None
    max_ = basetype(max_) if max_ is not None else None

    def _numeric(s: str) -> T:
        value = basetype(s)
        # Phrased as "not within bound" rather than "beyond bound" so that NaN,
        # for which every comparison is false, is rejected when a bound exists.
        below_min = min_ is not None and not value >= min_
        above_max = max_ is not None and not value <= max_
        if below_min or above_max:
            raise argparse.ArgumentTypeError(
                f"{s!r} not in valid range {(min_, max_)!r}"
            )
        return value

    _numeric.__name__ = basetype.__name__
    return _numeric
def str_to_int(mapping: Mapping[str, int]):
    """Build a converter that maps command line text to an integer.

    The returned callable looks its argument up in *mapping* and raises
    ``argparse.ArgumentTypeError`` listing the accepted keys when the text is
    not one of them.
    """

    def _str_to_int(s: str) -> int:
        try:
            number = mapping[s]
        except KeyError:
            choices = ', '.join(mapping.keys())
            raise argparse.ArgumentTypeError(
                f"{s!r} must be one of: {choices}"
            ) from None
        return number

    return _str_to_int
class LanguageSetAction(argparse.Action):
    """Manages a list of languages.

    Each use of the option appends to the destination list. A value of the
    form ``a+b+c`` contributes every ``+``-separated part.
    """

    def __init__(self, option_strings, dest, default=None, **kwargs):
        """Initialize the action, defaulting the destination to an empty list."""
        if default is None:
            default = list()
        super().__init__(option_strings, dest, default=default, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        """Add one or more languages to the destination list."""
        # Work on a copy: argparse stores the action's default object itself on
        # the namespace, so appending in place would leak languages into every
        # later parse_args() call made with the same parser.
        languages = list(getattr(namespace, self.dest, None) or [])
        if isinstance(values, str) and '+' in values:
            languages.extend(values.split('+'))
        else:
            languages.append(values)
        setattr(namespace, self.dest, languages)
def get_parser() -> ArgumentParser:
    """Get the main CLI parser.

    Builds a new ArgumentParser on every call and registers OCRmyPDF's
    built-in options: the positional input/output files, general options,
    and the "Job control", "Metadata", "Image preprocessing", "OCR",
    "Advanced" and "Debugging" option groups.

    Returns:
        The configured ArgumentParser. Arguments may also be read from a file
        by prefixing its name with '@' (fromfile_prefix_chars).
    """
    # RawDescriptionHelpFormatter keeps the line breaks of the description and
    # epilog below exactly as written.
    parser = ArgumentParser(
        prog=_PROGRAM_NAME,
        allow_abbrev=True,
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""\
Generates a searchable PDF or PDF/A from a regular PDF.
OCRmyPDF rasterizes each page of the input PDF, optionally corrects page
rotation and performs image processing, runs the Tesseract OCR engine on the
image, and then creates a PDF from the OCR information.
""",
        epilog="""\
OCRmyPDF attempts to keep the output file at about the same size. If a file
contains losslessly compressed images, and images in the output file will be
losslessly compressed as well.
PDF is a page description file that attempts to preserve a layout exactly.
A PDF can contain vector objects (such as text or lines) and raster objects
(images). A page might have multiple images. OCRmyPDF is prepared to deal
with the wide variety of PDFs that exist in the wild.
When a PDF page contains text, OCRmyPDF assumes that the page has already
been OCRed or is a "born digital" page that should not be OCRed. The default
behavior is to exit in this case without producing a file. You can use the
option --skip-text to ignore pages with text, or --force-ocr to rasterize
all objects on the page and produce an image-only PDF as output.
ocrmypdf --skip-text file_with_some_text_pages.pdf output.pdf
ocrmypdf --force-ocr word_document.pdf output.pdf
If you are concerned about long-term archiving of PDFs, use the default option
--output-type pdfa which converts the PDF to a standardized PDF/A-2b. This
removes some features from the PDF such as Javascript or forms. If you want to
minimize the number of changes made to your PDF, use --output-type pdf.
If OCRmyPDF is given an image file as input, it will attempt to convert the
image to a PDF before processing. For more control over the conversion of
images to PDF, use the Python package img2pdf or other image to PDF software.
For example, this command uses img2pdf to convert all .png files beginning
with the 'page' prefix to a PDF, fitting each image on A4-sized paper, and
sending the result to OCRmyPDF through a pipe.
img2pdf --pagesize A4 page*.png | ocrmypdf - myfile.pdf
Online documentation is located at:
https://ocrmypdf.readthedocs.io/en/latest/introduction.html
""",
    )

    # --- Positional arguments and general options ---
    parser.add_argument(
        'input_file',
        metavar="input_pdf_or_image",
        help="PDF file containing the images to be OCRed (or '-' to read from "
        "standard input)",
    )
    parser.add_argument(
        'output_file',
        metavar="output_pdf",
        help="Output searchable PDF file (or '-' to write to standard output). "
        "Existing files will be overwritten (use --no-overwrite to prevent this). "
        "If same as input file, the input file will be updated only if "
        "processing is successful.",
    )
    # LanguageSetAction splits "eng+deu" and accumulates repeated -l options
    # into the 'languages' list.
    parser.add_argument(
        '-l',
        '--language',
        dest='languages',
        action=LanguageSetAction,
        help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
        "all language packs installed in your system). Use -l eng+deu for "
        "multiple languages.",
    )
    parser.add_argument(
        '--image-dpi',
        metavar='DPI',
        type=int,
        help="When the input file is an image, not a PDF, use this DPI instead "
        "of the DPI claimed by the input file. If the input does not claim a "
        "sensible DPI, this option will be required.",
    )
    parser.add_argument(
        '--output-type',
        choices=['auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],
        default='auto',
        help="Choose output type. 'auto' (default) produces best-effort PDF/A "
        "without requiring Ghostscript - uses verapdf validation when available, "
        "otherwise passes through as PDF/A if safe (input already PDF/A or "
        "force-ocr was used), or falls back to regular PDF. 'pdfa' creates a "
        "PDF/A-2b compliant file for long term archiving (requires Ghostscript "
        "as fallback). 'pdf' minimizes changes to the input file. 'pdfa-1' "
        "creates a PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' "
        "creates a PDF/A-3b file. 'none' will produce no output, which may be "
        "helpful if only the --sidecar is desired.",
    )
    # Use null string '\0' as sentinel to indicate the user supplied no argument,
    # since that is the only invalid character for filepaths on all platforms
    # bool('\0') is True in Python
    # Resulting values: None when --sidecar is absent, '\0' when given without
    # FILE, otherwise the FILE string.
    parser.add_argument(
        '--sidecar',
        nargs='?',
        const='\0',
        default=None,
        metavar='FILE',
        help="Generate sidecar text files that contain the same text recognized "
        "by Tesseract. This may be useful for building a OCR text database. "
        "If FILE is omitted, the sidecar file be named {output_file}.txt; the next "
        "argument must NOT be the name of the input PDF. "
        "If FILE is set to '-', the sidecar is written to stdout (a "
        "convenient way to preview OCR quality). The output file and sidecar "
        "may not both use stdout at the same time.",
    )
    parser.add_argument(
        '-n',
        '--no-overwrite',
        action='store_true',
        default=False,
        help="If the output file already exists, exit with an error instead of "
        "overwriting it.",
    )
    parser.add_argument(
        '--version',
        action='version',
        version=_VERSION,
        help="Print program version and exit",
    )

    # --- Job control ---
    jobcontrol = parser.add_argument_group("Job control options")
    jobcontrol.add_argument(
        '-j',
        '--jobs',
        metavar='N',
        type=numeric(int, 0, 256),
        help="Use up to N CPU cores simultaneously (default: use all).",
    )
    jobcontrol.add_argument(
        '-q', '--quiet', action='store_true', help="Suppress INFO messages"
    )
    # nargs='?' with const=1: a bare "-v" means level 1; omitting it means 0.
    jobcontrol.add_argument(
        '-v',
        '--verbose',
        type=numeric(int, 0, 2),
        default=0,
        const=1,
        nargs='?',
        help="Print more verbose messages for each additional verbose level. Use "
        "`-v 1` typically for much more detailed logging. Higher numbers "
        "are probably only useful in debugging.",
    )
    # Hidden from --help (argparse.SUPPRESS); stores False into 'progress_bar'.
    jobcontrol.add_argument(
        '--no-progress-bar',
        action='store_false',
        dest='progress_bar',
        help=argparse.SUPPRESS,
    )
    # Hidden pair of switches sharing dest 'use_threads'. The default is
    # already True, so only --no-use-threads changes the value.
    jobcontrol.add_argument(
        '--use-threads', action='store_true', default=True, help=argparse.SUPPRESS
    )
    jobcontrol.add_argument(
        '--no-use-threads',
        action='store_false',
        dest='use_threads',
        help=argparse.SUPPRESS,
    )

    # --- Document metadata ---
    metadata = parser.add_argument_group(
        "Metadata options",
        "Set output PDF/A metadata (default: copy input document's metadata)",
    )
    metadata.add_argument(
        '--title', type=str, help="Set document title (place multiple words in quotes)"
    )
    metadata.add_argument('--author', type=str, help="Set document author")
    metadata.add_argument(
        '--subject', type=str, help="Set document subject description"
    )
    metadata.add_argument('--keywords', type=str, help="Set document keywords")

    # --- Image preprocessing ---
    preprocessing = parser.add_argument_group(
        "Image preprocessing options",
        "Options to improve the quality of the final PDF and OCR",
    )
    preprocessing.add_argument(
        '-r',
        '--rotate-pages',
        action='store_true',
        help="Automatically rotate pages based on detected text orientation",
    )
    preprocessing.add_argument(
        '--remove-background',
        action='store_true',
        help="Attempt to remove background from gray or color pages, setting it "
        "to white ",
    )
    preprocessing.add_argument(
        '-d',
        '--deskew',
        action='store_true',
        help="Deskew each page before performing OCR",
    )
    preprocessing.add_argument(
        '-c',
        '--clean',
        action='store_true',
        help="Clean pages from scanning artifacts before performing OCR, and send "
        "the cleaned page to OCR, but do not include the cleaned page in "
        "the output",
    )
    preprocessing.add_argument(
        '-i',
        '--clean-final',
        action='store_true',
        help="Clean page as above, and incorporate the cleaned image in the final "
        "PDF. Might remove desired content.",
    )
    preprocessing.add_argument(
        '--unpaper-args',
        type=str,
        default=None,
        help="A quoted string of arguments to pass to unpaper. Requires --clean. "
        "Example: --unpaper-args '--layout double'.",
    )
    preprocessing.add_argument(
        '--oversample',
        metavar='DPI',
        type=numeric(int, 0, 5000),
        default=0,
        help="Oversample images to at least the specified DPI, to improve OCR "
        "results slightly",
    )
    preprocessing.add_argument(
        '--remove-vectors',
        action='store_true',
        help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they "
        "will not be included in OCR. This can eliminate false characters.",
    )

    # --- OCR behavior ---
    ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied")
    ocrsettings.add_argument(
        '-m',
        '--mode',
        choices=[mode.value for mode in ProcessingMode],
        default=ProcessingMode.default.value,
        help="Processing mode for pages with existing text. "
        "'default' errors if text is found. "
        "'force' rasterizes all content and runs OCR (same as --force-ocr). "
        "'skip' skips pages with existing text (same as --skip-text). "
        "'redo' re-OCRs pages, replacing old invisible text (same as --redo-ocr).",
    )
    # Legacy flags for backward compatibility - these set the mode internally
    ocrsettings.add_argument(
        '-f',
        '--force-ocr',
        action='store_true',
        help="Rasterize any text or vector objects on each page, apply OCR, and "
        "save the rastered output (this rewrites the PDF). "
        "Equivalent to --mode force.",
    )
    ocrsettings.add_argument(
        '-s',
        '--skip-text',
        action='store_true',
        help="Skip OCR on any pages that already contain text, but include the "
        "page in final output; useful for PDFs that contain a mix of "
        "images, text pages, and/or previously OCRed pages. "
        "Equivalent to --mode skip.",
    )
    ocrsettings.add_argument(
        '--redo-ocr',
        action='store_true',
        help="Attempt to detect and remove the hidden OCR layer from files that "
        "were previously OCRed with OCRmyPDF or another program. Apply OCR "
        "to text found in raster images. Existing visible text objects will "
        "not be changed. If there is no existing OCR, OCR will be added. "
        "Equivalent to --mode redo.",
    )
    ocrsettings.add_argument(
        '--skip-big',
        type=numeric(float, 0, 5000),
        metavar='MPixels',
        help="Skip OCR on pages larger than the specified amount of megapixels, "
        "but include skipped pages in final output",
    )
    ocrsettings.add_argument(
        '--invalidate-digital-signatures',
        action='store_true',
        help="Normally, OCRmyPDF will refuse to OCR a PDF that has a digital "
        "signature. This option allows OCR to proceed, but the digital signature "
        "will be invalidated.",
    )
    ocrsettings.add_argument(
        '--tagged-pdf-mode',
        choices=[mode.value for mode in TaggedPdfMode],
        default=TaggedPdfMode.default.value,
        help="Control behavior when a Tagged PDF is encountered. "
        "'default' errors if --mode is default, otherwise warns. "
        "'ignore' always warns but continues processing.",
    )

    # --- Advanced options ---
    advanced = parser.add_argument_group(
        "Advanced", "Advanced options to control OCRmyPDF"
    )
    advanced.add_argument(
        '--pages',
        type=str,
        help=(
            "Limit OCR to the specified pages (ranges or comma separated), "
            "skipping others"
        ),
    )
    advanced.add_argument(
        '--max-image-mpixels',
        action='store',
        type=numeric(float, 0),
        metavar='MPixels',
        help="Set maximum number of megapixels to unpack before treating an image as a "
        "decompression bomb",
        default=250.0,
    )
    advanced.add_argument(
        '--pdf-renderer',
        choices=['auto', 'hocr', 'sandwich', 'hocrdebug', 'fpdf2'],
        default='auto',
        help="Choose OCR PDF renderer. 'auto' (recommended) uses fpdf2, which "
        "provides full international language support including RTL scripts, "
        "proper text positioning, and invisible text that becomes visible when "
        "selected. 'sandwich' renders text as a background layer. Legacy 'hocr' "
        "and 'hocrdebug' options are deprecated and will use fpdf2.",
    )
    advanced.add_argument(
        '--ocr-engine',
        choices=['auto', 'tesseract', 'none'],
        default='auto',
        help="OCR engine to use. 'auto' (default) selects the best available engine. "
        "'tesseract' uses Tesseract OCR. "
        "'none' skips OCR entirely, useful for PDF/A conversion or image processing "
        "without text recognition.",
    )
    advanced.add_argument(
        '--rasterizer',
        choices=['auto', 'ghostscript', 'pypdfium'],
        default='auto',
        help="Choose PDF page rasterizer. 'auto' prefers pypdfium when available, "
        "falling back to Ghostscript. 'pypdfium' is faster but requires the "
        "pypdfium2 package. 'ghostscript' uses the traditional Ghostscript rasterizer.",
    )
    advanced.add_argument(
        '--rotate-pages-threshold',
        default=DEFAULT_ROTATE_PAGES_THRESHOLD,
        type=numeric(float, 0, 1000),
        metavar='CONFIDENCE',
        help="Only rotate pages when confidence is above this value (arbitrary "
        "units reported by tesseract)",
    )
    advanced.add_argument(
        '--fast-web-view',
        type=numeric(float, 0),
        default=1.0,
        metavar="MEGABYTES",
        help="If the size of file is more than this threshold (in MB), then "
        "linearize the PDF for fast web viewing. This allows the PDF to be "
        "displayed before it is fully downloaded in web browsers, but increases "
        "the space required slightly. By default we skip this for small files "
        "which do not benefit. If the threshold is 0 it will be apply to all files. "
        "Set the threshold very high to disable.",
    )
    advanced.add_argument(
        '--continue-on-soft-render-error',
        action='store_true',
        help="Continue processing pages after a recoverable PDF rendering error. "
        "A recoverable error is one that does not prevent the page from being "
        "rendered, but may result in visual differences compared to the input "
        "file. Missing fonts are a typical source of these errors.",
    )
    # Same option as registered on plugins_only_parser, repeated here so the
    # full parser accepts it and documents it in --help.
    advanced.add_argument(
        '--plugin',
        dest='plugins',
        action='append',
        default=[],
        help="Name of plugin to import. Argument may be issued multiple times to "
        "import multiple plugins. Plugins may be specified as module names in "
        "Python syntax, provided they are installed in the same Python (virtual) "
        "environment as ocrmypdf; or you may give the path to the Python file that "
        "contains the plugin. Plugins must conform to the specification in the "
        "OCRmyPDF documentation.",
    )

    # --- Debugging ---
    debugging = parser.add_argument_group(
        "Debugging", "Arguments to help with troubleshooting and debugging"
    )
    debugging.add_argument(
        '-k',
        '--keep-temporary-files',
        action='store_true',
        help="Keep temporary files (helpful for debugging)",
    )
    return parser
# Minimal pre-parser that recognizes only --plugin. get_options_and_plugins()
# runs it with parse_known_args() before the full parser exists, so plugins can
# be loaded and contribute their own options. Help is disabled so -h/--help is
# left to the full parser, and abbreviations are disabled so a prefix of
# another option is never mistaken for --plugin.
plugins_only_parser = ArgumentParser(
    prog=_PROGRAM_NAME,
    add_help=False,
    allow_abbrev=False,
    fromfile_prefix_chars='@',
)
plugins_only_parser.add_argument(
    '--plugin',
    action='append',
    dest='plugins',
    default=[],
    help="Name of plugin to import.",
)
def namespace_to_options(ns) -> OcrOptions:
    """Build an OcrOptions model from a parsed argparse namespace.

    Attributes of ``ns`` that are OcrOptions model fields, plus the legacy
    ``force_ocr``/``skip_text``/``redo_ocr`` switches, are passed to the
    OcrOptions constructor. Every other attribute is collected into a dict and
    attached to the result as ``extra_attrs``.

    Args:
        ns: Namespace (or any object supporting ``vars()``) holding the
            parsed command line values.

    Returns:
        The constructed OcrOptions instance.
    """
    # Legacy boolean flags are not model fields themselves; they are handed to
    # OcrOptions, whose validator converts them to a mode.
    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}

    known_fields = {}
    extra_attrs = {}
    for key, value in vars(ns).items():
        for_model = key in OcrOptions.model_fields or key in legacy_mode_flags
        bucket = known_fields if for_model else extra_attrs
        bucket[key] = value

    # hOCR API: an output folder is given instead of an output file, so fill
    # in a placeholder path.
    if 'output_folder' in extra_attrs:
        known_fields.setdefault('output_file', '/dev/null')

    # Same for a missing input file (e.g., in _hocr_to_ocr_pdf), signalled by
    # the presence of a work folder.
    if 'work_folder' in extra_attrs:
        known_fields.setdefault('input_file', '/dev/null')

    instance = OcrOptions(**known_fields)
    instance.extra_attrs = extra_attrs
    return instance
def get_options_and_plugins(
    args=None,
) -> tuple[OcrOptions, OcrmypdfPluginManager]:
    """Turn command line arguments into OcrOptions and a plugin manager.

    Parsing happens in two passes: a first pass extracts only the ``--plugin``
    values so the plugins can be loaded, then the full parser, extended with
    whatever options the plugins register, parses everything.

    Args:
        args: Command line arguments. If None, uses sys.argv.

    Returns:
        Tuple of (OcrOptions, PluginManager)
    """
    # Imported locally to avoid a circular import with ocrmypdf.api.
    from ocrmypdf.api import setup_plugin_infrastructure

    # Pass 1: only --plugin is recognized; everything else is ignored for now.
    plugin_ns, _ = plugins_only_parser.parse_known_args(args=args)
    plugin_manager = setup_plugin_infrastructure(plugins=plugin_ns.plugins)

    # Pass 2: full parser, including options contributed by the plugins.
    full_parser = get_parser()
    plugin_manager.add_options(parser=full_parser)
    parsed = full_parser.parse_args(args=args)

    return namespace_to_options(parsed), plugin_manager
================================================
FILE: src/ocrmypdf/data/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Data files used to generate certain PDFs."""
from __future__ import annotations
================================================
FILE: src/ocrmypdf/exceptions.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""OCRmyPDF's exceptions."""
from __future__ import annotations
from enum import IntEnum
from textwrap import dedent
class ExitCode(IntEnum):
    """OCRmyPDF's exit codes.

    Each ExitCodeException subclass in this module carries one of these values
    in its ``exit_code`` attribute.
    """

    # pylint: disable=invalid-name
    ok = 0
    bad_args = 1  # BadArgsError
    input_file = 2  # InputFileError, DpiError, UnsupportedImageFormatError
    missing_dependency = 3  # MissingDependencyError
    invalid_output_pdf = 4
    file_access_error = 5  # OutputFileAccessError
    already_done_ocr = 6  # PriorOcrFoundError
    child_process_error = 7  # SubprocessOutputError
    encrypted_pdf = 8  # EncryptedPdfError
    invalid_config = 9  # TesseractConfigError
    pdfa_conversion_failed = 10
    other_error = 15  # default for ExitCodeException
    ctrl_c = 130  # presumably 128 + SIGINT (2), the usual shell convention
class ExitCodeException(Exception):
    """Base class for errors that map to a process exit code.

    Subclasses override ``exit_code`` with the value to pass to sys.exit() and
    may set ``message`` to a template that replaces the default text.
    """

    exit_code = ExitCode.other_error
    message = ""

    def __str__(self):
        """Return the text shown for this exception.

        With an empty ``message`` this is the normal Exception text. Otherwise
        it is ``message`` formatted with that text as its single positional
        argument.
        """
        # Call the base __str__ explicitly; str(super()) would describe the
        # super proxy object instead.
        base_text = super().__str__()
        return self.message.format(base_text) if self.message else base_text
class BadArgsError(ExitCodeException):
    """Invalid arguments on the command line or API.

    Carries ``ExitCode.bad_args``.
    """

    exit_code = ExitCode.bad_args
class MissingDependencyError(ExitCodeException):
    """A third-party dependency is missing.

    Carries ``ExitCode.missing_dependency``.
    """

    exit_code = ExitCode.missing_dependency
class UnsupportedImageFormatError(ExitCodeException):
    """The image format is not supported.

    Carries ``ExitCode.input_file``.
    """

    exit_code = ExitCode.input_file
class DpiError(ExitCodeException):
    """Missing information about input image DPI.

    Carries ``ExitCode.input_file``.
    """

    exit_code = ExitCode.input_file
class OutputFileAccessError(ExitCodeException):
    """Cannot access the intended output file path.

    Carries ``ExitCode.file_access_error``.
    """

    exit_code = ExitCode.file_access_error
class PriorOcrFoundError(ExitCodeException):
    """This file already has OCR.

    Carries ``ExitCode.already_done_ocr``.
    """

    exit_code = ExitCode.already_done_ocr
class InputFileError(ExitCodeException):
    """Something is wrong with the input file.

    Carries ``ExitCode.input_file``. Base class for DigitalSignatureError and
    TaggedPDFError.
    """

    exit_code = ExitCode.input_file
class SubprocessOutputError(ExitCodeException):
    """A subprocess returned an unexpected error.

    Carries ``ExitCode.child_process_error``.
    """

    exit_code = ExitCode.child_process_error
class EncryptedPdfError(ExitCodeException):
    """Input PDF is encrypted.

    Carries ``ExitCode.encrypted_pdf``. The fixed ``message`` below is what
    str() returns; it tells the user how to inspect and remove the encryption
    with qpdf.
    """

    exit_code = ExitCode.encrypted_pdf
    message = dedent(
        """\
Input PDF is encrypted. The encryption must be removed to
perform OCR.
For information about this PDF's security use
qpdf --show-encryption infilename
You can remove the encryption using
qpdf --decrypt [--password=[password]] infilename
"""
    )
class TesseractConfigError(ExitCodeException):
    """Tesseract config can't be parsed.

    Carries ``ExitCode.invalid_config``. The message has no ``{}``
    placeholder, so str() yields the fixed text regardless of the arguments
    the exception was raised with.
    """

    exit_code = ExitCode.invalid_config
    message = "Error occurred while parsing a Tesseract configuration file"
class DigitalSignatureError(InputFileError):
    """PDF has a digital signature.

    Inherits ``ExitCode.input_file`` from InputFileError.
    """

    message = dedent(
        """\
Input PDF has a digital signature. OCR would alter the document,
invalidating the signature.
"""
    )
class TaggedPDFError(InputFileError):
    """PDF is tagged.

    Inherits ``ExitCode.input_file`` from InputFileError. The message names
    the command line options that override the error.
    """

    message = dedent(
        """\
This PDF is marked as a Tagged PDF. This often indicates
that the PDF was generated from an office document and does
not need OCR. Use --force-ocr, --skip-text or --redo-ocr to
override this error.
"""
    )
class ColorConversionNeededError(BadArgsError):
    """PDF needs color conversion.

    Inherits ``ExitCode.bad_args`` from BadArgsError. The message suggests
    either converting the color space or skipping PDF/A conversion.
    """

    message = dedent(
        """\
The input PDF has an unusual color space. Use
--color-conversion-strategy to convert to a common color space
such as RGB, or use --output-type pdf to skip PDF/A conversion
and retain the original color space.
"""
    )
================================================
FILE: src/ocrmypdf/extra_plugins/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
#
# SPDX-License-Identifier: MPL-2.0
"""Extra plugins. These are not automatically inserted when ocrmypdf is run."""
================================================
FILE: src/ocrmypdf/extra_plugins/semfree.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Semaphore-free alternate executor.
There are two popular environments that do not fully support the standard Python
multiprocessing module: AWS Lambda, and Termux (a terminal emulator for Android).
This alternate executor divvies up work among worker processes before processing,
rather than having each worker consume work from a shared queue when they finish
their task. This means workers have no need to coordinate with each other. Each
worker communicates only with the main process.
This is not without drawbacks. If the tasks are not "even" in size, which cannot
be guaranteed, some workers may end up with too much work while others are idle.
It is less efficient than the standard implementation, so not the default.
This module is deprecated and will be removed in a future release. The standard
executor will fall back to threads in these environments.
"""
from __future__ import annotations
import logging
import logging.handlers
import signal
import warnings
from collections.abc import Callable, Iterable, Iterator
from contextlib import suppress
from enum import Enum, auto
from itertools import islice, repeat, takewhile, zip_longest
from multiprocessing import Pipe, Process
from multiprocessing.connection import Connection, wait
from ocrmypdf import Executor, hookimpl
from ocrmypdf._concurrent import NullProgressBar
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import remove_all_log_handlers
# Issued when this module is imported, i.e. whenever the plugin is loaded.
warnings.warn(
    "semfree.py is deprecated and will be removed in a future release.",
    DeprecationWarning,
)
class MessageType(Enum):
    """Implement basic IPC messaging.

    Tags the first element of the ``(type, payload)`` tuples that
    process_loop() sends to the parent over its connection.
    """

    # Payload is the exception raised by a task.
    exception = auto()  # pylint: disable=invalid-name
    # Payload is the return value of one task call.
    result = auto()  # pylint: disable=invalid-name
    # Payload is None; the worker has stopped processing its tasks.
    complete = auto()  # pylint: disable=invalid-name
def split_every(n: int, iterable: Iterable) -> Iterator:
    """Split iterable into consecutive lists of at most n items.

    Every list has exactly ``n`` items except possibly the last, which holds
    the remainder. The input is consumed lazily, one group at a time.

    >>> list(split_every(4, range(10)))
    [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
    >>> list(split_every(3, []))
    []

    https://stackoverflow.com/a/22919323

    Args:
        n: Maximum number of items per group.
        iterable: Source of items.

    Returns:
        An iterator over lists of items, in input order.
    """
    iterator = iter(iterable)
    # Keep slicing n items off the shared iterator; the first empty slice
    # (falsy) means the input is exhausted and ends the sequence.
    chunks = (list(islice(iterator, n)) for _ in repeat(None))
    return takewhile(bool, chunks)
def process_sigbus(*args):
    """Handle SIGBUS signal at the worker level.

    Installed as the SIGBUS handler by process_loop().

    Args:
        *args: Arguments passed to the handler; ignored.

    Raises:
        InputFileError: Always.
    """
    raise InputFileError("A worker process lost access to an input file")
class ConnectionLogHandler(logging.handlers.QueueHandler):
    """Logging handler that ships records to the parent over a Connection.

    Used by child processes: each record is sent as a ``('log', record)``
    tuple on the connection given at construction.
    """

    def __init__(self, conn: Connection) -> None:
        """Create a handler that writes to ``conn``.

        Args:
            conn: Connection on which log records are sent.
        """
        # QueueHandler is given no queue (None): it only uses its queue
        # inside enqueue(), and enqueue() is overridden below.
        super().__init__(None)  # type: ignore
        self.conn = conn

    def enqueue(self, record):
        """Send ``record`` over the connection, tagged as a log message."""
        tagged = ('log', record)
        self.conn.send(tagged)
def process_loop(
    conn: Connection, user_init: Callable[[], None], loglevel, task, task_args
):
    """Run a worker: execute its tasks and report back over ``conn``.

    Each successful call sends ``(MessageType.result, value)``. The first
    exception sends ``(MessageType.exception, exc)`` and stops processing.
    Either way ``(MessageType.complete, None)`` is sent last and the
    connection is closed.

    Args:
        conn: Connection used to send results and log records.
        user_init: Callable run once, after logging is set up and before any
            task.
        loglevel: Level applied to this process's root logger.
        task: Callable invoked as ``task(*args)`` for each entry of task_args.
        task_args: Iterable of argument sequences, one per task call.
    """
    # Install SIGBUS handler (so our parent process can abort somewhat
    # gracefully). Windows and Cygwin do not have SIGBUS; the resulting
    # AttributeError is ignored.
    with suppress(AttributeError):
        signal.signal(signal.SIGBUS, process_sigbus)

    # Replace whatever handlers the root logger has with one that forwards
    # every record over the connection.
    root_logger = logging.getLogger()
    remove_all_log_handlers(root_logger)
    root_logger.setLevel(loglevel)
    root_logger.addHandler(ConnectionLogHandler(conn))

    user_init()

    for call_args in task_args:
        try:
            outcome = task(*call_args)
        except Exception as exc:  # pylint: disable=broad-except
            conn.send((MessageType.exception, exc))
            break
        conn.send((MessageType.result, outcome))

    conn.send((MessageType.complete, None))
    conn.close()
class LambdaExecutor(Executor):
    """Executor for AWS Lambda or similar environments that lack semaphores."""

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        progress_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Run ``task`` over all argument sets, reporting each result.

        With threads requested and a single worker, tasks run sequentially in
        the calling process. Otherwise the argument sets are divided among
        worker processes up front; each worker runs process_loop() and sends
        its results back over its own pipe.

        Args:
            use_threads: Together with ``max_workers == 1`` selects the
                in-process path.
            max_workers: Number of worker processes to spread the work over.
            progress_kwargs: Keyword arguments for the progress bar class.
            worker_initializer: Callable run once in each worker process.
            task: Callable invoked as ``task(*args)``.
            task_arguments: Iterable of argument sequences for ``task``.
            task_finished: Called as ``task_finished(result, pbar)`` for each
                result.

        Raises:
            Exception: Whatever exception a worker reported; all workers are
                terminated first.
        """
        if use_threads and max_workers == 1:
            with self.pbar_class(**progress_kwargs) as pbar:
                for call_args in task_arguments:
                    task_finished(task(*call_args), pbar)
            return

        # Slice the work into rows of max_workers items and transpose, so each
        # column becomes one worker's share. zip_longest pads short columns
        # with None, which is filtered out again below.
        rows = list(split_every(max_workers, list(task_arguments)))
        shares = list(zip_longest(*rows))
        if not shares:
            return

        processes: list[Process] = []
        connections: list[Connection] = []
        for share in shares:
            parent_conn, child_conn = Pipe()
            own_args = [item for item in share if item is not None]
            worker = Process(
                target=process_loop,
                args=(
                    child_conn,
                    worker_initializer,
                    logging.getLogger("").level,
                    task,
                    own_args,
                ),
            )
            worker.daemon = True
            processes.append(worker)
            connections.append(parent_conn)

        for worker in processes:
            worker.start()

        with self.pbar_class(**progress_kwargs) as pbar:
            # Service the pipes until every worker has signed off.
            while connections:
                for ready in wait(connections):
                    if not isinstance(ready, Connection):
                        raise NotImplementedError("We only support Connection()")
                    try:
                        msg_type, msg = ready.recv()
                    except EOFError:
                        # Other end closed without a 'complete' message.
                        connections.remove(ready)
                        continue

                    if msg_type == MessageType.result:
                        task_finished(msg, pbar)
                    elif msg_type == 'log':
                        # Re-emit the worker's log record in this process.
                        logging.getLogger(msg.name).handle(msg)
                    elif msg_type == MessageType.complete:
                        connections.remove(ready)
                    elif msg_type == MessageType.exception:
                        for worker in processes:
                            worker.terminate()
                        raise msg

        for worker in processes:
            worker.join()
@hookimpl
def get_executor(progressbar_class):
    """Return a LambdaExecutor instance.

    Args:
        progressbar_class: Progress bar class handed to the executor as
            ``pbar_class``.
    """
    return LambdaExecutor(pbar_class=progressbar_class)
@hookimpl
def get_logging_console():
    """Return a logging.StreamHandler instance.

    The handler is created with no arguments, so it uses StreamHandler's
    default stream.
    """
    return logging.StreamHandler()
@hookimpl
def get_progressbar_class():
    """Return the NullProgressBar class (the class itself, not an instance).

    This executor cannot use a progress bar.
    """
    return NullProgressBar
================================================
FILE: src/ocrmypdf/font/__init__.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Font management for OCRmyPDF PDF rendering.
This module provides font infrastructure for the fpdf2 PDF renderer. It includes:
- FontManager: Base class for font loading and glyph checking
- FontProvider: Protocol and implementations for font discovery
- MultiFontManager: Automatic font selection for multilingual documents
- SystemFontProvider: System font discovery
"""
from __future__ import annotations
from ocrmypdf.font.font_manager import FontManager
from ocrmypdf.font.font_provider import (
BuiltinFontProvider,
ChainedFontProvider,
FontProvider,
)
from ocrmypdf.font.multi_font_manager import MultiFontManager
from ocrmypdf.font.system_font_provider import SystemFontProvider
# Public API of the font package: the names re-exported by the imports above.
__all__ = [
    "FontManager",
    "FontProvider",
    "BuiltinFontProvider",
    "ChainedFontProvider",
    "MultiFontManager",
    "SystemFontProvider",
]
================================================
FILE: src/ocrmypdf/font/font_manager.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Base font management for PDF rendering.
This module provides the base FontManager class that handles font loading
and glyph checking using uharfbuzz.
"""
from __future__ import annotations
from pathlib import Path
import uharfbuzz as hb
class FontManager:
    """Load one font file and answer glyph and metric queries about it.

    The font is opened with uharfbuzz, which is used for glyph lookup and text
    shaping. Renderer-specific subclasses can build on this class to attach
    their own font objects.

    Attributes:
        font_path: Path to the font file
        font_data: Raw font file bytes
        font_index: Index within TTC collection (0 for single-font files)
        hb_face: uharfbuzz Face object
        hb_font: uharfbuzz Font object
    """

    def __init__(self, font_path: Path, font_index: int = 0):
        """Read the font file and create the uharfbuzz objects.

        Args:
            font_path: Path to TrueType/OpenType font file
            font_index: Index of font within a TTC collection (default 0).
                For single-font files (.ttf, .otf), use 0.
        """
        self.font_path = font_path
        self.font_index = font_index

        # The raw bytes are kept on the instance as well as handed to uharfbuzz.
        self.font_data = font_path.read_bytes()

        # The face selects the font within the file (relevant for TTC
        # collections); the font object built on it serves the queries below.
        self.hb_face = hb.Face(self.font_data, font_index)
        self.hb_font = hb.Font(self.hb_face)

    def get_hb_font(self) -> hb.Font:
        """Return the uharfbuzz Font object used for text measurement."""
        return self.hb_font

    def has_glyph(self, codepoint: int) -> bool:
        """Tell whether the font maps ``codepoint`` to a real glyph.

        Args:
            codepoint: Unicode codepoint

        Returns:
            True if the lookup yields a glyph id other than 0 (.notdef);
            False if it yields 0 or None.
        """
        glyph_id = self.hb_font.get_nominal_glyph(codepoint)
        return glyph_id not in (None, 0)

    def get_font_metrics(self) -> tuple[float, float, float]:
        """Return the font's vertical metrics and its em size.

        Returns:
            Tuple of (ascent, descent, units_per_em). Ascent and descent are
            the ascender and descender of the font's left-to-right extents,
            in font units; descent is typically negative (below baseline).
        """
        ltr_extents = self.hb_font.get_font_extents('ltr')
        return (ltr_extents.ascender, ltr_extents.descender, self.hb_face.upem)

    def get_left_side_bearing(self, char: str, font_size: float) -> float:
        """Return the left side bearing of ``char`` scaled to ``font_size``.

        The left side bearing is the horizontal distance from the glyph
        origin (x=0) to the left edge of the glyph; a positive value means
        there is whitespace before the glyph starts.

        Args:
            char: Single character to get the bearing for
            font_size: Font size in points

        Returns:
            Left side bearing in points. 0.0 when ``char`` is empty, the font
            has no glyph for it, or no extents are available for the glyph.
        """
        if not char:
            return 0.0

        glyph_id = self.hb_font.get_nominal_glyph(ord(char))
        if glyph_id is None or glyph_id == 0:
            return 0.0

        glyph_extents = self.hb_font.get_glyph_extents(glyph_id)
        if glyph_extents is None:
            return 0.0

        # x_bearing is in font units; scale by font_size / units-per-em.
        return glyph_extents.x_bearing * font_size / self.hb_face.upem
================================================
FILE: src/ocrmypdf/font/font_provider.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Font provider protocol and implementations for PDF rendering."""
from __future__ import annotations
import logging
from pathlib import Path
from typing import Protocol
from ocrmypdf.font.font_manager import FontManager
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
class FontProvider(Protocol):
    """Protocol for providing fonts to MultiFontManager.

    Implementations are responsible for knowing where fonts are located
    and loading them. MultiFontManager asks for fonts by name and uses
    them for glyph coverage checking.

    This is a structural (typing.Protocol) interface: a class satisfies it by
    defining these three methods and need not inherit from it.
    """

    def get_font(self, font_name: str) -> FontManager | None:
        """Get a FontManager for the named font.

        Args:
            font_name: Logical font name (e.g., 'NotoSans-Regular')

        Returns:
            FontManager if font is available, None otherwise
        """
        ...

    def get_available_fonts(self) -> list[str]:
        """Get list of available font names.

        Returns:
            List of font names that can be retrieved with get_font()
        """
        ...

    def get_fallback_font(self) -> FontManager:
        """Get the glyphless fallback font.

        This font must always be available and handles any codepoint.

        Returns:
            FontManager for the glyphless fallback font (Occulta.ttf)
        """
        ...
class BuiltinFontProvider:
    """Font provider backed by the font files bundled in ocrmypdf/data.

    All fonts listed in FONT_FILES are loaded when the provider is created.
    The 'Occulta' fallback font is mandatory; any other font that is missing
    or fails to load is skipped with a warning.
    """

    # Mapping of logical font names to filenames
    # Only Latin (NotoSans) and the glyphless fallback (Occulta.ttf) are bundled.
    # All other scripts (Arabic, Devanagari, CJK, etc.) are discovered from
    # system fonts by SystemFontProvider to reduce package size.
    FONT_FILES = {
        'NotoSans-Regular': 'NotoSans-Regular.ttf',
        'Occulta': 'Occulta.ttf',
    }

    def __init__(self, font_dir: Path | None = None):
        """Create the provider and load its fonts.

        Args:
            font_dir: Directory containing font files. If None, uses
                the default ocrmypdf/data directory.

        Raises:
            FileNotFoundError: The fallback font file does not exist.
            ValueError: The fallback font file could not be loaded.
        """
        self.font_dir = (
            font_dir if font_dir is not None else Path(__file__).parent.parent / "data"
        )
        self._fonts: dict[str, FontManager] = {}
        self._load_fonts()

    def _load_fonts(self) -> None:
        """Populate ``_fonts`` from FONT_FILES.

        Problems with the fallback font raise; problems with any other font
        are logged as warnings and that font is left out.
        """
        for font_name, font_file in self.FONT_FILES.items():
            mandatory = font_name == 'Occulta'
            font_path = self.font_dir / font_file

            if not font_path.exists():
                if mandatory:
                    raise FileNotFoundError(
                        f"Required fallback font not found: {font_path}"
                    )
                log.warning(
                    "Font %s not found at %s - OCR output quality for some "
                    "scripts may be affected",
                    font_name,
                    font_path,
                )
                continue

            try:
                manager = FontManager(font_path)
            except Exception as e:
                if mandatory:
                    raise ValueError(
                        f"Failed to load required fallback font {font_file}: {e}"
                    ) from e
                log.warning(
                    "Failed to load font %s: %s - OCR output quality may be affected",
                    font_name,
                    e,
                )
            else:
                self._fonts[font_name] = manager

    def get_font(self, font_name: str) -> FontManager | None:
        """Return the loaded font with this name, or None if it is not loaded."""
        return self._fonts.get(font_name)

    def get_available_fonts(self) -> list[str]:
        """Return the names of all fonts that were loaded successfully."""
        return list(self._fonts)

    def get_fallback_font(self) -> FontManager:
        """Return the glyphless fallback font ('Occulta')."""
        return self._fonts['Occulta']
class ChainedFontProvider:
    """Font provider that tries multiple providers in order.

    This allows combining builtin fonts with system fonts, trying
    the builtin provider first and falling back to system fonts
    for fonts not bundled with the package.
    """

    def __init__(self, providers: list[FontProvider]):
        """Initialize chained font provider.

        Args:
            providers: Font providers to try in order. The first provider
                that returns a font wins. Any iterable is accepted; it is
                copied, so later changes to the caller's list do not
                affect the chain.

        Raises:
            ValueError: If no provider is supplied
        """
        # Snapshot the providers: this decouples the chain from the caller's
        # list and lets a one-shot iterable be consumed exactly once.
        chain = list(providers) if providers else []
        if not chain:
            raise ValueError("At least one provider is required")
        self.providers = chain

    def get_font(self, font_name: str) -> FontManager | None:
        """Get a FontManager for the named font.

        Tries each provider in order until one returns a font.

        Args:
            font_name: Logical font name (e.g., 'NotoSans-Regular')

        Returns:
            FontManager if any provider has the font, None otherwise
        """
        for provider in self.providers:
            font = provider.get_font(font_name)
            # "Not available" is signalled by None only; a font object that
            # happens to be falsy must still be returned.
            if font is not None:
                return font
        return None

    def get_available_fonts(self) -> list[str]:
        """Get list of available font names from all providers.

        Returns:
            Combined list of font names (deduplicated, order preserved)
        """
        seen: set[str] = set()
        result: list[str] = []
        for provider in self.providers:
            for name in provider.get_available_fonts():
                if name not in seen:
                    seen.add(name)
                    result.append(name)
        return result

    def get_fallback_font(self) -> FontManager:
        """Get the glyphless fallback font.

        Tries each provider until one provides a fallback font.

        Returns:
            FontManager for the fallback font

        Raises:
            RuntimeError: If no provider can provide a fallback font
        """
        for provider in self.providers:
            try:
                return provider.get_fallback_font()
            except (NotImplementedError, AttributeError, KeyError):
                # This provider has no fallback font; ask the next one.
                continue
        raise RuntimeError("No fallback font available from any provider")
================================================
FILE: src/ocrmypdf/font/multi_font_manager.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Multi-font management for PDF rendering.
Provides automatic font selection for multilingual documents based on
language hints and glyph coverage analysis.
"""
from __future__ import annotations
import logging
from pathlib import Path
from ocrmypdf.font.font_manager import FontManager
from ocrmypdf.font.font_provider import (
BuiltinFontProvider,
ChainedFontProvider,
FontProvider,
)
from ocrmypdf.font.system_font_provider import SystemFontProvider
log = logging.getLogger(__name__)
class MultiFontManager:
    """Manages multiple fonts with automatic selection and fallback.

    This class orchestrates multiple FontManager instances to provide
    word-level font selection for multilingual documents. It uses a hybrid
    approach combining language hints from hOCR with glyph coverage analysis.

    Font selection strategy:
    1. Try language-preferred font (if language hint available)
    2. Try fallback fonts in order by glyph coverage
    3. Fall back to Occulta.ttf (glyphless fallback)
    """

    # Language to font mapping
    # Keys are ISO 639-2/3 codes or Tesseract language codes
    LANGUAGE_FONT_MAP = {
        # Arabic script
        'ara': 'NotoSansArabic-Regular',  # Arabic
        'per': 'NotoSansArabic-Regular',  # Persian (uses Arabic script)
        'fas': 'NotoSansArabic-Regular',  # Farsi (alternative code for Persian)
        'urd': 'NotoSansArabic-Regular',  # Urdu (uses Arabic script)
        'pus': 'NotoSansArabic-Regular',  # Pashto
        'kur': 'NotoSansArabic-Regular',  # Kurdish (Arabic script variant)
        'pnb': 'NotoSansArabic-Regular',  # Western Punjabi (Shahmukhi script)
        # Devanagari script
        'hin': 'NotoSansDevanagari-Regular',  # Hindi
        'san': 'NotoSansDevanagari-Regular',  # Sanskrit
        'mar': 'NotoSansDevanagari-Regular',  # Marathi
        'nep': 'NotoSansDevanagari-Regular',  # Nepali
        'kok': 'NotoSansDevanagari-Regular',  # Konkani
        'bho': 'NotoSansDevanagari-Regular',  # Bhojpuri
        'mai': 'NotoSansDevanagari-Regular',  # Maithili
        # CJK
        'chi': 'NotoSansCJK-Regular',  # Chinese (generic)
        'zho': 'NotoSansCJK-Regular',  # Chinese (ISO 639-3)
        'chi_sim': 'NotoSansCJK-Regular',  # Chinese Simplified (Tesseract)
        'chi_tra': 'NotoSansCJK-Regular',  # Chinese Traditional (Tesseract)
        'jpn': 'NotoSansCJK-Regular',  # Japanese
        'kor': 'NotoSansCJK-Regular',  # Korean
        # Thai
        'tha': 'NotoSansThai-Regular',  # Thai
        # Hebrew
        'heb': 'NotoSansHebrew-Regular',  # Hebrew
        'yid': 'NotoSansHebrew-Regular',  # Yiddish (uses Hebrew script)
        # Bengali script
        'ben': 'NotoSansBengali-Regular',  # Bengali
        'asm': 'NotoSansBengali-Regular',  # Assamese (uses Bengali script)
        # Tamil
        'tam': 'NotoSansTamil-Regular',  # Tamil
        # Gujarati
        'guj': 'NotoSansGujarati-Regular',  # Gujarati
        # Telugu
        'tel': 'NotoSansTelugu-Regular',  # Telugu
        # Kannada
        'kan': 'NotoSansKannada-Regular',  # Kannada
        # Malayalam
        'mal': 'NotoSansMalayalam-Regular',  # Malayalam
        # Myanmar (Burmese)
        'mya': 'NotoSansMyanmar-Regular',  # Myanmar
        # Khmer (Cambodian)
        'khm': 'NotoSansKhmer-Regular',  # Khmer
        # Lao
        'lao': 'NotoSansLao-Regular',  # Lao
        # Georgian
        'kat': 'NotoSansGeorgian-Regular',  # Georgian
        'geo': 'NotoSansGeorgian-Regular',  # Georgian (alternative)
        # Armenian
        'hye': 'NotoSansArmenian-Regular',  # Armenian
        'arm': 'NotoSansArmenian-Regular',  # Armenian (alternative)
        # Ethiopic
        'amh': 'NotoSansEthiopic-Regular',  # Amharic
        'tir': 'NotoSansEthiopic-Regular',  # Tigrinya
        # Sinhala
        'sin': 'NotoSansSinhala-Regular',  # Sinhala
        # Gurmukhi (Punjabi)
        'pan': 'NotoSansGurmukhi-Regular',  # Punjabi
        # Oriya
        'ori': 'NotoSansOriya-Regular',  # Oriya
        'ory': 'NotoSansOriya-Regular',  # Oriya (alternative)
        # Tibetan
        'bod': 'NotoSansTibetan-Regular',  # Tibetan
        'tib': 'NotoSansTibetan-Regular',  # Tibetan (alternative)
    }

    # Ordered fallback chain for fonts (after language-preferred font)
    # Order matters: most common scripts first for faster matching
    FALLBACK_FONTS = [
        'NotoSans-Regular',  # Latin, Greek, Cyrillic
        'NotoSansArabic-Regular',
        'NotoSansDevanagari-Regular',
        'NotoSansCJK-Regular',
        'NotoSansThai-Regular',
        'NotoSansHebrew-Regular',
        'NotoSansBengali-Regular',
        'NotoSansTamil-Regular',
        'NotoSansGujarati-Regular',
        'NotoSansTelugu-Regular',
        'NotoSansKannada-Regular',
        'NotoSansMalayalam-Regular',
        'NotoSansMyanmar-Regular',
        'NotoSansKhmer-Regular',
        'NotoSansLao-Regular',
        'NotoSansGeorgian-Regular',
        'NotoSansArmenian-Regular',
        'NotoSansEthiopic-Regular',
        'NotoSansSinhala-Regular',
        'NotoSansGurmukhi-Regular',
        'NotoSansOriya-Regular',
        'NotoSansTibetan-Regular',
    ]

    # Name stored in the selection cache when a word resolved to the
    # glyphless fallback font.
    _FALLBACK_CACHE_NAME = 'Occulta'

    def __init__(
        self,
        font_dir: Path | None = None,
        *,
        font_provider: FontProvider | None = None,
    ):
        """Initialize multi-font manager.

        Args:
            font_dir: Directory containing font files. If font_provider is
                not specified, this is passed to BuiltinFontProvider.
            font_provider: Provider for loading fonts. If None, uses a
                ChainedFontProvider that tries builtin fonts first,
                then searches system fonts.
        """
        if font_provider is not None:
            self.font_provider = font_provider
        else:
            # Use chained provider: try builtin fonts first, then system fonts
            self.font_provider = ChainedFontProvider(
                [
                    BuiltinFontProvider(font_dir),
                    SystemFontProvider(),
                ]
            )
        # Font selection cache: (word_text, language) -> font_name
        self._selection_cache: dict[tuple[str, str | None], str] = {}
        # Track whether we've warned about missing fonts (warn once per script)
        self._warned_scripts: set[str] = set()

    @property
    def fonts(self) -> dict[str, FontManager]:
        """Get all loaded fonts (backward compatibility)."""
        return self.get_all_fonts()

    def _try_font(
        self, font_name: str, word_text: str, cache_key: tuple[str, str | None]
    ) -> FontManager | None:
        """Try to use a font for the given word.

        On success the choice is recorded in the selection cache.

        Args:
            font_name: Name of font to try
            word_text: Text content to check
            cache_key: Cache key for storing successful result

        Returns:
            FontManager if font exists and has all glyphs, None otherwise
        """
        font = self.font_provider.get_font(font_name)
        if font is None:
            return None
        if self._has_all_glyphs(font, word_text):
            self._selection_cache[cache_key] = font_name
            return font
        return None

    def select_font_for_word(
        self, word_text: str, line_language: str | None
    ) -> FontManager:
        """Select appropriate font for a word.

        Uses a hybrid approach:
        1. Language-based selection (if language hint available)
        2. Ordered fallback through available fonts by glyph coverage
        3. Final fallback to Occulta.ttf (glyphless)

        Args:
            word_text: The text content of the word
            line_language: Language code from hOCR (e.g., 'ara', 'eng')

        Returns:
            FontManager instance to use for rendering this word
        """
        cache_key = (word_text, line_language)
        cached_name = self._selection_cache.get(cache_key)
        if cached_name is not None:
            if cached_name == self._FALLBACK_CACHE_NAME:
                # A previous search already ended at the glyphless fallback.
                # Ask for it directly instead of relying on the provider
                # exposing the fallback under that name via get_font().
                return self.font_provider.get_fallback_font()
            font = self.font_provider.get_font(cached_name)
            if font is not None:
                return font

        tried_fonts: set[str] = set()

        # Phase 1: Try language-preferred font
        if line_language and line_language in self.LANGUAGE_FONT_MAP:
            preferred = self.LANGUAGE_FONT_MAP[line_language]
            tried_fonts.add(preferred)
            if result := self._try_font(preferred, word_text, cache_key):
                return result

        # Phase 2: Try fallback fonts in order
        for font_name in self.FALLBACK_FONTS:
            if font_name in tried_fonts:
                continue
            if result := self._try_font(font_name, word_text, cache_key):
                return result

        # Phase 3: Glyphless fallback (always succeeds)
        # Warn if we're falling back for non-ASCII text (likely missing font)
        self._warn_missing_font(word_text, line_language)
        self._selection_cache[cache_key] = self._FALLBACK_CACHE_NAME
        return self.font_provider.get_fallback_font()

    def _warn_missing_font(self, word_text: str, line_language: str | None) -> None:
        """Warn user about missing font for non-Latin text.

        Only warns once per language hint (or once for text without a hint)
        to avoid log spam. Pure-ASCII text never triggers a warning.

        Args:
            word_text: The word that could not be covered by any font
            line_language: Language code from hOCR, if any
        """
        # Determine a key for deduplication (language or 'unknown')
        warn_key = line_language if line_language else 'unknown'
        # Only warn for non-ASCII text and only once per key
        if warn_key in self._warned_scripts:
            return
        # Check if text contains non-ASCII characters
        if not any(ord(c) > 127 for c in word_text):
            return
        self._warned_scripts.add(warn_key)
        if line_language and line_language in self.LANGUAGE_FONT_MAP:
            font_name = self.LANGUAGE_FONT_MAP[line_language]
            log.warning(
                "No font found with glyphs for '%s' text. "
                "Install %s for better rendering. "
                "See https://fonts.google.com/noto",
                line_language,
                font_name,
            )
        else:
            log.warning(
                "No font found with glyphs for some text. "
                "Install Noto fonts for better rendering. "
                "See https://fonts.google.com/noto"
            )

    def _has_all_glyphs(self, font: FontManager, text: str) -> bool:
        """Check if a font has glyphs for all characters in text.

        Args:
            font: FontManager instance to check
            text: Text to verify coverage for

        Returns:
            True if font has real glyphs for all characters (not .notdef)
        """
        if not text:
            return True
        hb_font = font.get_hb_font()
        for char in text:
            codepoint = ord(char)
            glyph_id = hb_font.get_nominal_glyph(codepoint)
            if glyph_id is None or glyph_id == 0:  # 0 = .notdef glyph
                return False
        return True

    def has_font(self, font_name: str) -> bool:
        """Check if a named font is available.

        Args:
            font_name: Name of font to check

        Returns:
            True if font is available
        """
        return self.font_provider.get_font(font_name) is not None

    def has_all_glyphs(self, font_name: str, text: str) -> bool:
        """Check if a named font has glyphs for all characters in text.

        Args:
            font_name: Name of font to check
            text: Text to verify coverage for

        Returns:
            True if font has real glyphs for all characters (not .notdef);
            False if the font is not available
        """
        font = self.font_provider.get_font(font_name)
        if font is None:
            return False
        return self._has_all_glyphs(font, text)

    def get_all_fonts(self) -> dict[str, FontManager]:
        """Get all loaded font managers.

        Every name advertised by the provider is resolved through
        get_font(); names that resolve to None are left out.

        Returns:
            Dictionary mapping font names to FontManager instances
        """
        result = {}
        for name in self.font_provider.get_available_fonts():
            font = self.font_provider.get_font(name)
            if font is not None:
                result[name] = font
        return result
================================================
FILE: src/ocrmypdf/font/system_font_provider.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""System font discovery for PDF rendering.
Provides lazy discovery of Noto fonts installed on the system across
Linux, macOS, and Windows platforms.
"""
from __future__ import annotations
import logging
import os
import sys
from pathlib import Path
from ocrmypdf.font.font_manager import FontManager
log = logging.getLogger(__name__)
class SystemFontProvider:
    """Discovers and provides system-installed Noto fonts with lazy scanning.

    This provider searches standard system font directories for Noto fonts.
    Scanning is performed lazily - only when a font is actually requested
    and not found in the builtin fonts. Results are cached for the lifetime
    of the provider instance.
    """

    # System font directories by platform.
    # NOTE: Path.home() is evaluated when this class body executes.
    SYSTEM_FONT_DIRS: dict[str, list[Path]] = {
        'linux': [
            Path('/usr/share/fonts'),
            Path('/usr/local/share/fonts'),
            Path.home() / '.fonts',
            Path.home() / '.local/share/fonts',
        ],
        'freebsd': [
            Path('/usr/local/share/fonts'),
            Path.home() / '.fonts',
        ],
        'darwin': [
            Path('/Library/Fonts'),
            Path('/System/Library/Fonts'),
            Path.home() / 'Library/Fonts',
        ],
        # Windows is handled dynamically in _get_font_dirs()
    }

    # Noto font logical names → possible filenames (priority order)
    # Within each directory the first matching filename is used
    NOTO_FONT_PATTERNS: dict[str, list[str]] = {
        'NotoSans-Regular': [
            'NotoSans-Regular.ttf',
            'NotoSans-Regular.otf',
        ],
        'NotoSansArabic-Regular': [
            'NotoSansArabic-Regular.ttf',
            'NotoSansArabic-Regular.otf',
        ],
        'NotoSansDevanagari-Regular': [
            'NotoSansDevanagari-Regular.ttf',
            'NotoSansDevanagari-Regular.otf',
        ],
        'NotoSansCJK-Regular': [
            # Language-specific variants (any will work for CJK)
            'NotoSansCJKsc-Regular.otf',  # Simplified Chinese
            'NotoSansCJKtc-Regular.otf',  # Traditional Chinese
            'NotoSansCJKjp-Regular.otf',  # Japanese
            'NotoSansCJKkr-Regular.otf',  # Korean
            # TTC collections (common on Linux distros)
            'NotoSansCJK-Regular.ttc',
            'NotoSansCJKsc-Regular.ttc',
            # Variable fonts
            'NotoSansCJKsc-VF.otf',
        ],
        'NotoSansThai-Regular': [
            'NotoSansThai-Regular.ttf',
            'NotoSansThai-Regular.otf',
        ],
        'NotoSansHebrew-Regular': [
            'NotoSansHebrew-Regular.ttf',
            'NotoSansHebrew-Regular.otf',
        ],
        'NotoSansBengali-Regular': [
            'NotoSansBengali-Regular.ttf',
            'NotoSansBengali-Regular.otf',
        ],
        'NotoSansTamil-Regular': [
            'NotoSansTamil-Regular.ttf',
            'NotoSansTamil-Regular.otf',
        ],
        'NotoSansGujarati-Regular': [
            'NotoSansGujarati-Regular.ttf',
            'NotoSansGujarati-Regular.otf',
        ],
        'NotoSansTelugu-Regular': [
            'NotoSansTelugu-Regular.ttf',
            'NotoSansTelugu-Regular.otf',
        ],
        'NotoSansKannada-Regular': [
            'NotoSansKannada-Regular.ttf',
            'NotoSansKannada-Regular.otf',
        ],
        'NotoSansMalayalam-Regular': [
            'NotoSansMalayalam-Regular.ttf',
            'NotoSansMalayalam-Regular.otf',
        ],
        'NotoSansMyanmar-Regular': [
            'NotoSansMyanmar-Regular.ttf',
            'NotoSansMyanmar-Regular.otf',
        ],
        'NotoSansKhmer-Regular': [
            'NotoSansKhmer-Regular.ttf',
            'NotoSansKhmer-Regular.otf',
        ],
        'NotoSansLao-Regular': [
            'NotoSansLao-Regular.ttf',
            'NotoSansLao-Regular.otf',
        ],
        'NotoSansGeorgian-Regular': [
            'NotoSansGeorgian-Regular.ttf',
            'NotoSansGeorgian-Regular.otf',
        ],
        'NotoSansArmenian-Regular': [
            'NotoSansArmenian-Regular.ttf',
            'NotoSansArmenian-Regular.otf',
        ],
        'NotoSansEthiopic-Regular': [
            'NotoSansEthiopic-Regular.ttf',
            'NotoSansEthiopic-Regular.otf',
        ],
        'NotoSansSinhala-Regular': [
            'NotoSansSinhala-Regular.ttf',
            'NotoSansSinhala-Regular.otf',
        ],
        'NotoSansGurmukhi-Regular': [
            'NotoSansGurmukhi-Regular.ttf',
            'NotoSansGurmukhi-Regular.otf',
        ],
        'NotoSansOriya-Regular': [
            'NotoSansOriya-Regular.ttf',
            'NotoSansOriya-Regular.otf',
        ],
        'NotoSansTibetan-Regular': [
            'NotoSansTibetan-Regular.ttf',
            'NotoSansTibetan-Regular.otf',
        ],
    }

    def __init__(self) -> None:
        """Initialize system font provider with empty caches."""
        # Cache: font_name -> FontManager (successfully loaded fonts)
        self._font_cache: dict[str, FontManager] = {}
        # Negative cache: font names we've searched for but not found
        self._not_found: set[str] = set()
        # Cached font directories (computed lazily)
        self._font_dirs: list[Path] | None = None

    def _get_platform(self) -> str:
        """Get the current platform identifier.

        Any platform that is not Windows, macOS or FreeBSD is treated as
        'linux'.

        Returns:
            Platform string: 'linux', 'darwin', 'windows', or 'freebsd'
        """
        if sys.platform == 'win32':
            return 'windows'
        elif sys.platform == 'darwin':
            return 'darwin'
        elif 'freebsd' in sys.platform:
            return 'freebsd'
        else:
            return 'linux'

    def _get_font_dirs(self) -> list[Path]:
        """Get font directories for the current platform.

        The list is computed on first use and cached on the instance.

        Returns:
            List of paths to search for fonts (may include non-existent paths)
        """
        if self._font_dirs is not None:
            return self._font_dirs
        platform = self._get_platform()
        if platform == 'windows':
            # Get Windows font directories from environment
            windir = os.environ.get('WINDIR', r'C:\Windows')
            self._font_dirs = [Path(windir) / 'Fonts']
            # User-installed fonts (Windows 10+)
            localappdata = os.environ.get('LOCALAPPDATA')
            if localappdata:
                self._font_dirs.append(
                    Path(localappdata) / 'Microsoft' / 'Windows' / 'Fonts'
                )
        else:
            self._font_dirs = list(self.SYSTEM_FONT_DIRS.get(platform, []))
        return self._font_dirs

    def _find_font_file(self, font_name: str) -> Path | None:
        """Search system directories for a font file.

        Directories are visited in order; within a directory the filename
        patterns are tried in priority order and the first hit of the
        recursive search is returned.

        Args:
            font_name: Logical font name (e.g., 'NotoSansCJK-Regular')

        Returns:
            Path to font file if found, None otherwise
        """
        patterns = self.NOTO_FONT_PATTERNS.get(font_name)
        if not patterns:
            return None
        for font_dir in self._get_font_dirs():
            try:
                if not font_dir.exists():
                    continue
            except OSError:
                # Cannot even stat the directory; treat it as absent
                continue
            for pattern in patterns:
                try:
                    # Stop at the first hit rather than walking the whole
                    # tree and collecting every match.
                    match = next(iter(font_dir.rglob(pattern)), None)
                except OSError:
                    # Skip directories we can't read
                    continue
                if match is not None:
                    log.debug("Found system font %s at %s", font_name, match)
                    return match
        return None

    def get_font(self, font_name: str) -> FontManager | None:
        """Get a FontManager for the named font (lazy loading).

        This method implements lazy scanning: fonts are only searched for
        when first requested. Results (both positive and negative) are
        cached for subsequent calls. A font file that is found but fails
        to load is logged and cached as not found.

        Args:
            font_name: Logical font name (e.g., 'NotoSansCJK-Regular')

        Returns:
            FontManager if font is found and loadable, None otherwise
        """
        # Check positive cache first
        if font_name in self._font_cache:
            return self._font_cache[font_name]
        # Check negative cache (already searched, not found)
        if font_name in self._not_found:
            return None
        # Lazy scan for this specific font
        font_path = self._find_font_file(font_name)
        if font_path is not None:
            try:
                fm = FontManager(font_path)
                self._font_cache[font_name] = fm
                return fm
            except Exception as e:
                log.warning(
                    "Found font %s at %s but failed to load: %s",
                    font_name,
                    font_path,
                    e,
                )
        # Cache negative result
        self._not_found.add(font_name)
        return None

    def get_available_fonts(self) -> list[str]:
        """Get list of font names this provider can potentially find.

        Note: This returns all font names we know patterns for, not
        necessarily fonts that are actually installed. Use get_font()
        to check if a specific font is available.

        Returns:
            List of logical font names
        """
        return list(self.NOTO_FONT_PATTERNS.keys())

    def get_fallback_font(self) -> FontManager:
        """Get the glyphless fallback font.

        Raises:
            NotImplementedError: System provider doesn't provide fallback.
                Use BuiltinFontProvider for the fallback font.
        """
        raise NotImplementedError(
            "SystemFontProvider does not provide a fallback font. "
            "Use BuiltinFontProvider for Occulta.ttf fallback."
        )
================================================
FILE: src/ocrmypdf/fpdf_renderer/__init__.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""fpdf2-based PDF renderer for OCR text layers.
This module provides the PDF renderer using fpdf2 for creating
searchable OCR text layers.
"""
from __future__ import annotations
from ocrmypdf.fpdf_renderer.renderer import (
DebugRenderOptions,
Fpdf2MultiPageRenderer,
Fpdf2PdfRenderer,
)
# Public API of this package: the names imported above from
# ocrmypdf.fpdf_renderer.renderer.
__all__ = [
"DebugRenderOptions",
"Fpdf2PdfRenderer",
"Fpdf2MultiPageRenderer",
]
================================================
FILE: src/ocrmypdf/fpdf_renderer/renderer.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""fpdf2-based PDF renderer for OCR text layers.
This module provides PDF rendering using fpdf2 for creating searchable
OCR text layers.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from math import atan, cos, degrees, radians, sin, sqrt
from pathlib import Path
from fpdf import FPDF
from fpdf.enums import PDFResourceType, TextMode
from pikepdf import Matrix, Rectangle
from ocrmypdf.font import FontManager, MultiFontManager
from ocrmypdf.models.ocr_element import OcrClass, OcrElement
log = logging.getLogger(__name__)
def transform_point(matrix: Matrix, x: float, y: float) -> tuple[float, float]:
    """Apply a matrix to a single point.

    The point is wrapped in a zero-area rectangle, the rectangle is
    transformed, and its lower-left corner is read back.

    Args:
        matrix: pikepdf Matrix to apply
        x: X coordinate
        y: Y coordinate

    Returns:
        Tuple of (transformed_x, transformed_y)
    """
    moved = matrix.transform(Rectangle(x, y, x, y))
    return moved.llx, moved.lly
def transform_box(
    matrix: Matrix, left: float, top: float, right: float, bottom: float
) -> tuple[float, float, float, float]:
    """Apply a matrix to a bounding box.

    Args:
        matrix: pikepdf Matrix to apply
        left: Left edge of box
        top: Top edge of box
        right: Right edge of box
        bottom: Bottom edge of box

    Returns:
        Tuple of (llx, lly, width, height) of the transformed box
    """
    source_box = Rectangle(left, top, right, bottom)
    moved = matrix.transform(source_box)
    llx, lly = moved.llx, moved.lly
    return (llx, lly, moved.width, moved.height)
@dataclass
class DebugRenderOptions:
    """Switches for debug overlays drawn during rendering.

    Each enabled switch draws colored lines/shapes that visualize part of
    the OCR structure. All switches are off by default.

    Attributes:
        render_baseline: Draw magenta lines along baselines.
        render_line_bbox: Draw blue rectangles around lines.
        render_word_bbox: Draw green rectangles around words.
    """

    render_baseline: bool = False
    render_line_bbox: bool = False
    render_word_bbox: bool = False
class CoordinateTransform:
    """Converts OCR pixel measurements to PDF points for fpdf2 rendering.

    OCR coordinates use a top-left origin. fpdf2 uses a top-left origin as
    well (like hOCR), so only scaling is performed and no Y-flip is needed.
    """

    def __init__(self, dpi: float, page_width_px: float, page_height_px: float):
        """Store the source resolution and the page size in pixels."""
        self.dpi = dpi
        self.page_width_px = page_width_px
        self.page_height_px = page_height_px

    def px_to_pt(self, value: float) -> float:
        """Scale a pixel measurement to PDF points (72 points per inch)."""
        return value * 72.0 / self.dpi

    @property
    def page_width_pt(self) -> float:
        """Width of the page, expressed in PDF points."""
        return self.px_to_pt(self.page_width_px)

    @property
    def page_height_pt(self) -> float:
        """Height of the page, expressed in PDF points."""
        return self.px_to_pt(self.page_height_px)

    def bbox_to_pt(self, bbox) -> tuple[float, float, float, float]:
        """Scale a bounding box to points as (left, top, right, bottom)."""
        left, top, right, bottom = (
            self.px_to_pt(edge)
            for edge in (bbox.left, bbox.top, bbox.right, bbox.bottom)
        )
        return (left, top, right, bottom)
class Fpdf2PdfRenderer:
"""Renders OcrElement trees to PDF using fpdf2.
This class provides the core rendering logic for converting OCR output
into PDF text layers using fpdf2's text drawing capabilities.
"""
def __init__(
    self,
    page: OcrElement,
    dpi: float,
    multi_font_manager: MultiFontManager,
    invisible_text: bool = True,
    image: Path | None = None,
    debug_render_options: DebugRenderOptions | None = None,
):
    """Validate the page element and set up rendering state.

    Args:
        page: Root OcrElement (must be ocr_page)
        dpi: Source image DPI
        multi_font_manager: MultiFontManager instance
        invisible_text: If True, render text as invisible (text mode 3)
        image: Optional path to image to overlay on top of the text layer,
            creating a sandwich PDF (text underneath, image on top)
        debug_render_options: Options for debug visualization; when
            omitted, a default DebugRenderOptions() is used

    Raises:
        ValueError: If page is not an ocr_page or lacks a bounding box
    """
    if page.ocr_class != OcrClass.PAGE:
        raise ValueError("Root element must be ocr_page")
    page_bbox = page.bbox
    if page_bbox is None:
        raise ValueError("Page must have bounding box")

    self.page = page
    self.dpi = dpi
    self.multi_font_manager = multi_font_manager
    self.invisible_text = invisible_text
    self.image = image
    if not debug_render_options:
        debug_render_options = DebugRenderOptions()
    self.debug_options = debug_render_options

    # Pixel -> point conversion based on the page's own bounding box
    self.coord_transform = CoordinateTransform(
        dpi=dpi,
        page_width_px=page_bbox.width,
        page_height_px=page_bbox.height,
    )

    # Registered fonts: font_path -> fpdf_family_name
    self._registered_fonts: dict[str, str] = {}

    # Track whether we've already logged the info-level suppression message
    self._logged_aspect_ratio_suppression = False
def render(self, output_path: Path) -> None:
    """Render the page into a new single-page PDF file.

    Args:
        output_path: Output PDF file path
    """
    # The document's page format is the OCR page size in points
    page_size = (
        self.coord_transform.page_width_pt,
        self.coord_transform.page_height_pt,
    )
    pdf = FPDF(unit="pt", format=page_size)
    pdf.set_auto_page_break(auto=False)

    # Enable text shaping for complex scripts
    pdf.set_text_shaping(True)

    # Disable cell margin to ensure precise text positioning
    # fpdf2's cell() adds c_margin padding by default, which shifts text
    pdf.c_margin = 0

    # Invisible text for a hidden OCR layer, filled text otherwise
    pdf.text_mode = TextMode.INVISIBLE if self.invisible_text else TextMode.FILL

    self.render_to_pdf(pdf)
    pdf.output(str(output_path))
def render_to_pdf(self, pdf: FPDF) -> None:
"""Render page content to an existing FPDF instance.
This method adds a page and renders all content. Used by both
single-page rendering and multi-page rendering.
Args:
pdf: FPDF instance to render into
"""
# Add page with correct dimensions
pdf.add_page(
format=(
self.coord_transform.page_width_pt,
self.coord_transform.page_height_pt,
)
)
# Render all paragraphs
for para in self.page.paragraphs:
self._render_paragraph(pdf, para)
# If no paragraphs, render lines directly
if not self.page.paragraphs:
for line in self.page.lines:
self._render_line(pdf, line)
# Place image on top of text layer (sandwich mode)
if self.image is not None:
pdf.image(
str(self.image),
x=0,
y=0,
w=self.coord_transform.page_width_pt,
h=self.coord_transform.page_height_pt,
)
def _register_font(self, pdf: FPDF, font_manager: FontManager) -> str:
"""Register font with fpdf2 if not already registered.
Args:
pdf: FPDF instance
font_manager: FontManager containing the font
Returns:
Font family name to use with pdf.set_font()
"""
font_path_str = str(font_manager.font_path)
if font_path_str not in self._registered_fonts:
# Use the font filename stem as the family name
family_name = font_manager.font_path.stem
pdf.add_font(family=family_name, fname=font_path_str)
self._registered_fonts[font_path_str] = family_name
return self._registered_fonts[font_path_str]
def _render_paragraph(self, pdf: FPDF, para: OcrElement) -> None:
    """Render every line-type child of a paragraph.

    Children whose class is not one of OcrClass.LINE_TYPES are ignored.

    Args:
        pdf: FPDF instance
        para: Paragraph OCR element
    """
    for child in para.children:
        if child.ocr_class not in OcrClass.LINE_TYPES:
            continue
        self._render_line(pdf, child)
def _render_line(self, pdf: FPDF, line: OcrElement) -> None:
    """Render one OCR line as a single text block positioned on its baseline.

    Strategy (following pikepdf reference implementation):
    1. Build a baseline_matrix relating a baseline-aligned coordinate
       system (origin at the start of the baseline, x along the text) to
       page coordinates
    2. For each word, transform its hOCR bbox using baseline_matrix.inverse()
       to get its position in the baseline coordinate system
    3. Render words along the baseline with horizontal scaling

    Nothing is emitted when the line has no bbox, its bbox height is not
    positive, its aspect ratio is judged implausible, or it has no words
    with both text and a bbox.

    Args:
        pdf: FPDF instance
        line: Line OCR element
    """
    if line.bbox is None:
        return
    # Validate line bbox
    if line.bbox.height <= 0:
        log.error(
            "line box is invalid so we cannot render it: box=%s text=%s",
            line.bbox,
            line.text if hasattr(line, 'text') else '',
        )
        return
    # Convert line bbox to PDF points
    line_left_pt = self.coord_transform.px_to_pt(line.bbox.left)
    line_top_pt = self.coord_transform.px_to_pt(line.bbox.top)
    line_right_pt = self.coord_transform.px_to_pt(line.bbox.right)
    line_bottom_pt = self.coord_transform.px_to_pt(line.bbox.bottom)
    # Note: line_width_pt and line_height_pt not needed since we compute
    # dimensions in the un-rotated coordinate system via matrix transform
    # Debug rendering: draw line bbox (in page coordinates)
    if self.debug_options.render_line_bbox:
        self._render_debug_line_bbox(
            pdf, line_left_pt, line_top_pt, line_right_pt, line_bottom_pt
        )
    # Get textangle (rotation of the entire line); a missing or zero value
    # is treated as no rotation
    textangle = line.textangle or 0.0
    # Read baseline early so we can detect rotation from steep slopes.
    # When Tesseract doesn't report textangle for rotated text, the
    # rotation gets encoded as a very steep baseline slope instead.
    slope = 0.0
    intercept_pt = 0.0
    has_meaningful_baseline = False
    if line.baseline is not None:
        slope = line.baseline.slope
        intercept_pt = self.coord_transform.px_to_pt(line.baseline.intercept)
        # Snap near-horizontal slopes to exactly zero
        if abs(slope) < 0.005:
            slope = 0.0
        has_meaningful_baseline = True
    # Detect text rotation from steep baseline slope.
    # A slope magnitude > 1.0 corresponds to > 45° from horizontal,
    # which indicates the line is rotated, not merely skewed.
    if textangle == 0.0 and abs(slope) > 1.0:
        textangle = degrees(atan(slope))
        # The original baseline slope and intercept are not meaningful
        # after extracting rotation; recalculate intercept from font
        # metrics below.
        slope = 0.0
        has_meaningful_baseline = False
    # Build line_size_aabb_matrix: anchored at the top-left corner of the
    # line bbox and rotated by -textangle. The hOCR bbox is the minimum
    # axis-aligned bounding box enclosing the rotated text. Its inverse is
    # applied below to the page-space bbox to measure the line in the
    # un-rotated frame.
    line_size_aabb_matrix = (
        Matrix()
        .translated(line_left_pt, line_top_pt)
        .rotated(-textangle)  # textangle is counter-clockwise per hOCR spec
    )
    # Get the line dimensions in the un-rotated coordinate system
    # Transform line bbox corners to get the un-rotated dimensions
    inv_line_matrix = line_size_aabb_matrix.inverse()
    # Only the last two values returned by transform_box are used: they are
    # taken as the line's width and height in the un-rotated frame
    _, _, line_size_width, line_size_height = transform_box(
        inv_line_matrix, line_left_pt, line_top_pt, line_right_pt, line_bottom_pt
    )
    # Get baseline intercept
    if not has_meaningful_baseline:
        # No baseline provided or baseline was used for rotation detection:
        # calculate intercept from font metrics
        default_font_manager = self.multi_font_manager.fonts['NotoSans-Regular']
        ascent, descent, units_per_em = default_font_manager.get_font_metrics()
        ascent_norm = ascent / units_per_em
        descent_norm = descent / units_per_em
        # Baseline intercept based on font metrics: the descender's share
        # of (ascent + |descent|), scaled to the line height, as a
        # non-positive offset from the bottom of the line
        intercept_pt = (
            -abs(descent_norm)
            * line_size_height
            / (ascent_norm + abs(descent_norm))
        )
    slope_angle_deg = degrees(atan(slope)) if slope != 0.0 else 0.0
    # Build baseline_matrix on top of line_size_aabb_matrix:
    # 1. Start with line_size_aabb_matrix (translates to line corner, rotates)
    # 2. Translate down to bottom of un-rotated line (line_size_height)
    # 3. Apply baseline intercept offset
    # 4. Rotate by baseline slope
    # As used below, transform_point(baseline_matrix, x, 0) gives the page
    # position of a point on the baseline, and baseline_matrix.inverse()
    # takes page-space word boxes into baseline coordinates.
    baseline_matrix = (
        line_size_aabb_matrix.translated(
            0, line_size_height
        )  # Move to bottom of line
        .translated(0, intercept_pt)  # Apply baseline intercept
        .rotated(slope_angle_deg)  # Rotate by baseline slope
    )
    # Calculate font size: height from baseline to top of line
    font_size = line_size_height + intercept_pt
    # Fall back to 80% of the line height when the result is under 1pt
    if font_size < 1.0:
        font_size = line_size_height * 0.8
    # Total rotation for rendering (-textangle + baseline slope angle)
    total_rotation_deg = -textangle + slope_angle_deg
    # Debug rendering: draw baseline
    if self.debug_options.render_baseline:
        # Baseline starts at origin in baseline coords, extends line width
        baseline_start = transform_point(baseline_matrix, 0, 0)
        baseline_end = transform_point(baseline_matrix, line_size_width, 0)
        pdf.set_draw_color(255, 0, 255)  # Magenta
        pdf.set_line_width(0.75)
        pdf.line(
            baseline_start[0], baseline_start[1], baseline_end[0], baseline_end[1]
        )
    # Extract line language for font selection
    line_language = line.language
    # Get inverse of baseline_matrix for transforming word bboxes
    inv_baseline_matrix = baseline_matrix.inverse()
    # Collect words to render: word-class children with non-empty text
    words: list[OcrElement | None] = [
        w for w in line.children if w.ocr_class == OcrClass.WORD and w.text
    ]
    # Suppress lines where the text aspect ratio is implausible.
    # This catches cases where Tesseract failed to detect rotation
    # entirely (slope=0, no textangle) and produced garbage text in a
    # bounding box whose shape doesn't match the text content at all.
    if not self._check_aspect_ratio_plausible(
        pdf, words, font_size, slope_angle_deg,
        line_size_width, line_size_height, line_language,
    ):
        return
    # Collect word rendering data: (text, x_baseline, font_family, word_tz)
    word_render_data: list[tuple[str, float, str, float]] = []
    for word in words:
        if word is None or not word.text or word.bbox is None:
            continue
        word_left_pt = self.coord_transform.px_to_pt(word.bbox.left)
        word_top_pt = self.coord_transform.px_to_pt(word.bbox.top)
        word_right_pt = self.coord_transform.px_to_pt(word.bbox.right)
        word_bottom_pt = self.coord_transform.px_to_pt(word.bbox.bottom)
        word_width_pt = word_right_pt - word_left_pt
        # Debug rendering: draw word bbox (in page coordinates)
        if self.debug_options.render_word_bbox:
            self._render_debug_word_bbox(
                pdf, word_left_pt, word_top_pt, word_right_pt, word_bottom_pt
            )
        # Get x position in baseline coordinate system (only the first
        # value returned by transform_box is needed)
        box_llx, _, _, _ = transform_box(
            inv_baseline_matrix,
            word_left_pt,
            word_top_pt,
            word_right_pt,
            word_bottom_pt,
        )
        # Select font and compute word-only Tz
        font_manager = self.multi_font_manager.select_font_for_word(
            word.text, line_language
        )
        font_family = self._register_font(pdf, font_manager)
        # The font must be current on pdf so get_string_width measures
        # with the right face and size
        pdf.set_font(font_family, size=font_size)
        natural_width = pdf.get_string_width(word.text)
        # Tz is a percentage: scale the word's natural width to the width
        # of its hOCR bbox; leave unscaled when either width is unusable
        if natural_width > 0 and word_width_pt > 0:
            word_tz = (word_width_pt / natural_width) * 100
        else:
            word_tz = 100.0
        word_render_data.append((word.text, box_llx, font_family, word_tz))
    if not word_render_data:
        return
    # Emit single BT block for the entire line using raw PDF operators.
    # This avoids a poppler bug where Tz (horizontal scaling) is not
    # carried across BT/ET boundaries, affecting all poppler-based tools
    # and viewers (Evince, pdftotext, etc.). By keeping all words in a
    # single BT block with relative Td positioning and per-word Tz, we
    # ensure correct inter-word spacing.
    self._emit_line_bt_block(
        pdf,
        word_render_data,
        baseline_matrix,
        font_size,
        total_rotation_deg,
    )
def _check_aspect_ratio_plausible(
self,
pdf: FPDF,
words: list[OcrElement | None],
font_size: float,
slope_angle_deg: float,
line_size_width: float,
line_size_height: float,
line_language: str | None,
) -> bool:
"""Check whether the line's aspect ratio is plausible for its text.
Compares the aspect ratio of the OCR bounding box to the aspect ratio
the text would have if rendered normally (accounting for baseline
slope). A large mismatch indicates Tesseract misread rotated text
without detecting the rotation.
Returns:
True if plausible (rendering should proceed), False to suppress.
"""
if line_size_width <= 0 or line_size_height <= 0 or font_size <= 0:
return True
# Fast path: most lines are wider than they are tall, which is
# the normal shape for horizontal text. Only tall-narrow boxes
# (height > width) need the expensive font measurement check.
if line_size_width >= line_size_height:
return True
line_text = ' '.join(
w.text for w in words if w is not None and w.text
)
if not line_text:
return True
# Measure the natural rendered width of the line text
font_manager = self.multi_font_manager.select_font_for_word(
line_text, line_language
)
font_family = self._register_font(pdf, font_manager)
pdf.set_font(font_family, size=round(font_size))
natural_width = pdf.get_string_width(line_text)
if natural_width <= 0:
return True
# Compute the AABB the text would occupy considering baseline slope
theta = radians(abs(slope_angle_deg))
expected_w = natural_width * cos(theta) + font_size * sin(theta)
expected_h = natural_width * sin(theta) + font_size * cos(theta)
if expected_h <= 0:
return True
actual_aspect = line_size_width / line_size_height
expected_aspect = expected_w / expected_h
ratio = actual_aspect / expected_aspect
if ratio >= 0.1:
return True
# Implausible aspect ratio — suppress this line
log.debug(
"Suppressing text with improbable aspect ratio: "
"actual=%.3f expected=%.3f ratio=%.4f text=%r",
actual_aspect,
expected_aspect,
ratio,
line_text[:80],
)
if not self._logged_aspect_ratio_suppression:
log.info(
"Suppressing OCR output text with improbable aspect ratio"
)
self._logged_aspect_ratio_suppression = True
return False
def _emit_line_bt_block(
    self,
    pdf: FPDF,
    word_render_data: list[tuple[str, float, str, float]],
    baseline_matrix: Matrix,
    font_size: float,
    total_rotation_deg: float,
) -> None:
    """Emit a single BT block for the entire line using raw PDF operators.

    Writes all words in a single BT..ET block with relative Td positioning
    and per-word Tz. Each word is scaled by its own Tz (fitting it to its
    hOCR bbox) and the gaps between words are covered by the Td moves. A
    trailing space is appended to a non-last word when the next word starts
    further along the baseline, unless both words are CJK-only. Keeping the
    line in one BT block works around a poppler bug where Tz is not carried
    across BT/ET boundaries, which affects all poppler-based viewers and
    tools (Evince, pdftotext, etc.).

    Args:
        pdf: FPDF instance
        word_render_data: List of (text, x_baseline, font_family, word_tz)
            tuples, one per word on this line. Must be non-empty: the
            first entry is read unconditionally.
        baseline_matrix: Transform from baseline coords to page coords
        font_size: Font size in points
        total_rotation_deg: Total rotation angle (textangle + slope)
    """
    page_height = self.coord_transform.page_height_pt
    # Rotations of 0.01 degrees or less are treated as no rotation
    has_rotation = abs(total_rotation_deg) > 0.01
    # Baseline origin in page coordinates; the y value from
    # transform_point is flipped against the page height to get the
    # y-up coordinate written into the PDF stream
    bx0, by0_fpdf = transform_point(baseline_matrix, 0, 0)
    by0_pdf = page_height - by0_fpdf
    ops: list[str] = []
    if has_rotation:
        # Compute direction vector along the baseline in PDF coordinates
        # (from the origin to a point 100 units along the baseline)
        bx1, by1_fpdf = transform_point(baseline_matrix, 100, 0)
        by1_pdf = page_height - by1_fpdf
        dx = bx1 - bx0
        dy = by1_pdf - by0_pdf
        length = sqrt(dx * dx + dy * dy)
        if length > 0:
            cos_a = dx / length
            sin_a = dy / length
        else:
            # Degenerate direction vector: fall back to no rotation
            cos_a = 1.0
            sin_a = 0.0
        # Save graphics state, apply rotation+translation via cm.
        # The cm maps local coordinates (baseline-aligned, x along text)
        # to PDF page coordinates.
        ops.append('q')
        ops.append(
            f'{cos_a:.6f} {sin_a:.6f} {-sin_a:.6f} {cos_a:.6f} '
            f'{bx0:.2f} {by0_pdf:.2f} cm'
        )
    # Begin text object
    ops.append('BT')
    # Text render mode: 3 = invisible, 0 = fill
    tr = 3 if self.invisible_text else 0
    ops.append(f'{tr} Tr')
    # Initial text position
    first_x_baseline = word_render_data[0][1]
    if has_rotation:
        # In the cm-transformed space, origin is at the baseline start
        ops.append(f'{first_x_baseline:.2f} 0 Td')
    else:
        # Direct PDF coordinates
        page_x, page_y_fpdf = transform_point(
            baseline_matrix, first_x_baseline, 0
        )
        page_y_pdf = page_height - page_y_fpdf
        ops.append(f'{page_x:.2f} {page_y_pdf:.2f} Td')
    prev_font_family: str | None = None
    prev_x_baseline = first_x_baseline
    for i, (text, x_baseline, font_family, word_tz) in enumerate(
        word_render_data
    ):
        is_last = i == len(word_render_data) - 1
        # Set font if changed (always true for the first word)
        if font_family != prev_font_family:
            pdf.set_font(font_family, size=font_size)
            # Register font resource on this page. This and pdf._out below
            # are private fpdf2 members, needed because the operators are
            # written raw instead of through fpdf2's text API.
            pdf._resource_catalog.add(
                PDFResourceType.FONT, pdf.current_font.i, pdf.page
            )
            ops.append(
                f'/F{pdf.current_font.i} {pdf.font_size_pt:.2f} Tf'
            )
            prev_font_family = font_family
        # Relative positioning (for words after the first)
        if i > 0:
            if has_rotation:
                # In rotated space, advance is purely along x-axis
                dx_baseline = x_baseline - prev_x_baseline
                ops.append(f'{dx_baseline:.2f} 0 Td')
            else:
                # Non-rotated: compute delta in PDF coordinates
                px_prev, py_prev_f = transform_point(
                    baseline_matrix, prev_x_baseline, 0
                )
                px_curr, py_curr_f = transform_point(
                    baseline_matrix, x_baseline, 0
                )
                dx_pdf = px_curr - px_prev
                # Flip y delta for PDF coordinates (y-up)
                dy_pdf = -(py_curr_f - py_prev_f)
                ops.append(f'{dx_pdf:.2f} {dy_pdf:.2f} Td')
        # Determine text to render
        if not is_last:
            next_text, next_x_baseline, _, _ = word_render_data[i + 1]
            advance = next_x_baseline - x_baseline
            # Add trailing space for text extraction unless both are CJK
            # or the next word does not start further along the baseline
            if (
                advance > 0
                and not (
                    self._is_cjk_only(text)
                    and self._is_cjk_only(next_text)
                )
            ):
                text_to_render = text + ' '
            else:
                text_to_render = text
        else:
            text_to_render = text
        # Use word_tz (fits word into its hOCR bbox) — Td handles
        # inter-word gaps, so Tz should not stretch to fill them.
        render_tz = word_tz
        ops.append(f'{render_tz:.2f} Tz')
        ops.append(self._encode_shaped_text(pdf, text_to_render))
        prev_x_baseline = x_baseline
    # End text object
    ops.append('ET')
    if has_rotation:
        # Restore the graphics state saved by 'q' above
        ops.append('Q')
    pdf._out('\n'.join(ops))
    # Reset fpdf2's internal stretching tracking so subsequent API calls
    # don't think Tz is still set from our raw operators
    pdf.font_stretching = 100
def _encode_shaped_text(self, pdf: FPDF, text: str) -> str:
"""Encode text using HarfBuzz text shaping for complex script support.
Unlike font.encode_text() which maps unicode characters one-by-one to
glyph IDs, this uses HarfBuzz to handle BiDi reordering, Arabic joining
forms, Devanagari conjuncts, and other complex script shaping. Falls
back to encode_text() when text shaping is not enabled.
"""
font = pdf.current_font
if pdf.text_shaping and pdf.text_shaping.get("use_shaping_engine"):
shaped = font.shape_text(text, pdf.font_size_pt, pdf.text_shaping)
if shaped:
mapped = "".join(
chr(ti["mapped_char"])
for ti in shaped
if ti["mapped_char"] is not None
)
if mapped:
return f"({font.escape_text(mapped)}) Tj"
return font.encode_text(text)
def _is_cjk_only(self, text: str) -> bool:
"""Check if text contains only CJK characters.
CJK scripts don't use spaces between words, so we should not insert
spaces between adjacent CJK words.
Args:
text: Text to check
Returns:
True if text contains only CJK characters
"""
for char in text:
cp = ord(char)
# Check if character is in CJK ranges
if not (
0x4E00 <= cp <= 0x9FFF # CJK Unified Ideographs
or 0x3400 <= cp <= 0x4DBF # CJK Extension A
or 0x20000 <= cp <= 0x2A6DF # CJK Extension B
or 0x2A700 <= cp <= 0x2B73F # CJK Extension C
or 0x2B740 <= cp <= 0x2B81F # CJK Extension D
or 0x2B820 <= cp <= 0x2CEAF # CJK Extension E
or 0x2CEB0 <= cp <= 0x2EBEF # CJK Extension F
or 0x30000 <= cp <= 0x3134F # CJK Extension G
or 0x3040 <= cp <= 0x309F # Hiragana
or 0x30A0 <= cp <= 0x30FF # Katakana
or 0x31F0 <= cp <= 0x31FF # Katakana Phonetic Extensions
or 0xAC00 <= cp <= 0xD7AF # Hangul Syllables
or 0x1100 <= cp <= 0x11FF # Hangul Jamo
or 0x3130 <= cp <= 0x318F # Hangul Compatibility Jamo
or 0xA960 <= cp <= 0xA97F # Hangul Jamo Extended-A
or 0xD7B0 <= cp <= 0xD7FF # Hangul Jamo Extended-B
or 0x3000 <= cp <= 0x303F # CJK Symbols and Punctuation
or 0xFF00 <= cp <= 0xFFEF # Halfwidth and Fullwidth Forms
):
return False
return True
def _render_debug_line_bbox(
self,
pdf: FPDF,
left: float,
top: float,
right: float,
bottom: float,
) -> None:
"""Draw a blue box around the line bbox."""
pdf.set_draw_color(0, 0, 255) # Blue
pdf.set_line_width(0.5)
pdf.rect(left, top, right - left, bottom - top)
def _render_debug_baseline(
self,
pdf: FPDF,
x: float,
y: float,
width: float,
rotation_deg: float,
) -> None:
"""Draw a magenta line along the baseline."""
pdf.set_draw_color(255, 0, 255) # Magenta
pdf.set_line_width(0.75)
if abs(rotation_deg) > 0.1:
with pdf.rotation(rotation_deg, x=x, y=y):
pdf.line(x, y, x + width, y)
else:
pdf.line(x, y, x + width, y)
def _render_debug_word_bbox(
self,
pdf: FPDF,
left: float,
top: float,
right: float,
bottom: float,
) -> None:
"""Draw a green box around the word bbox."""
pdf.set_draw_color(0, 255, 0) # Green
pdf.set_line_width(0.3)
pdf.rect(left, top, right - left, bottom - top)
class Fpdf2MultiPageRenderer:
    """Render several OcrElement pages into one PDF file.

    Each page is drawn by its own Fpdf2PdfRenderer, but all of them write
    into the same FPDF document and share one font-registration table.
    """

    def __init__(
        self,
        pages_data: list[tuple[int, OcrElement, float]],
        multi_font_manager: MultiFontManager,
        invisible_text: bool = True,
        debug_render_options: DebugRenderOptions | None = None,
    ):
        """Store the pages and rendering settings.

        Args:
            pages_data: List of (pageno, ocr_tree, dpi) tuples
            multi_font_manager: Shared multi-font manager for all pages
            invisible_text: Whether to render invisible text
            debug_render_options: Options for debug visualization; default
                options are used when none (or a falsy value) is given
        """
        self.pages_data = pages_data
        self.multi_font_manager = multi_font_manager
        self.invisible_text = invisible_text
        self.debug_options = debug_render_options or DebugRenderOptions()

    def render(self, output_path: Path) -> None:
        """Write every page with a bounding box to one multi-page PDF.

        Pages whose ``bbox`` is None are skipped.

        Args:
            output_path: Output PDF file path

        Raises:
            ValueError: If there are no pages to render
        """
        if not self.pages_data:
            raise ValueError("No pages to render")

        # One document for all pages; each page sets its own size
        pdf = FPDF(unit="pt")
        pdf.set_auto_page_break(auto=False)
        pdf.set_text_shaping(True)
        # cell() would otherwise pad text by c_margin and shift it
        pdf.c_margin = 0
        pdf.text_mode = TextMode.INVISIBLE if self.invisible_text else TextMode.FILL

        # Font table handed to every page renderer so that a font file is
        # registered with the document only once
        font_registry: dict[str, str] = {}
        for _pageno, ocr_page, page_dpi in self.pages_data:
            if ocr_page.bbox is None:
                continue
            renderer = Fpdf2PdfRenderer(
                page=ocr_page,
                dpi=page_dpi,
                multi_font_manager=self.multi_font_manager,
                invisible_text=self.invisible_text,
                debug_render_options=self.debug_options,
            )
            renderer._registered_fonts = font_registry
            renderer.render_to_pdf(pdf)

        pdf.output(str(output_path))
================================================
FILE: src/ocrmypdf/helpers.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Support functions."""
from __future__ import annotations
import logging
import multiprocessing
import os
import shutil
import warnings
from collections.abc import Callable, Iterable, Sequence
from contextlib import suppress
from decimal import Decimal
from io import StringIO
from math import isclose, isfinite
from pathlib import Path
from statistics import harmonic_mean
from typing import (
Any,
Generic,
TypeVar,
)
import img2pdf
import pikepdf
# Module-level logger, named after this module
log = logging.getLogger(__name__)
# Shared keyword arguments for img2pdf: selects the pikepdf engine and the
# ``ifvalid`` rotation policy.
# NOTE(review): presumably unpacked into img2pdf calls elsewhere in the
# package -- confirm at the call sites.
IMG2PDF_KWARGS = dict(engine=img2pdf.Engine.pikepdf, rotation=img2pdf.Rotation.ifvalid)
T = TypeVar('T', float, int, Decimal)


class Resolution(Generic[T]):
    """Pixels per inch along the x and y axes.

    Two resolutions compare equal with ``==`` when both axes agree within
    the relative tolerance ``CONVERSION_ERROR``; a 2-tuple on the other
    side of the comparison is treated as a Resolution.
    """

    __slots__ = ('x', 'y')

    x: T
    y: T

    # Relative tolerance for comparisons. It allows for the error introduced
    # when a dpi value is converted to pixels per meter and stored as a
    # rounded integer, as many file formats do.
    CONVERSION_ERROR = 0.002

    def __init__(self, x: T, y: T):
        """Construct a Resolution object."""
        self.x = x
        self.y = y

    def round(self, ndigits: int) -> Resolution:
        """Round to ndigits after the decimal point."""
        rounded_x = round(self.x, ndigits)
        rounded_y = round(self.y, ndigits)
        return Resolution(rounded_x, rounded_y)

    def to_int(self) -> Resolution[int]:
        """Round to nearest integer."""
        whole_x = int(round(self.x))
        whole_y = int(round(self.y))
        return Resolution(whole_x, whole_y)

    @classmethod
    def _isclose(cls, a, b):
        """Compare two scalars using the class's relative tolerance."""
        return isclose(a, b, rel_tol=cls.CONVERSION_ERROR)

    @property
    def is_square(self) -> bool:
        """True if the resolution is square (x == y within tolerance)."""
        return self._isclose(self.x, self.y)

    @property
    def is_finite(self) -> bool:
        """True if both x and y are finite numbers."""
        return isfinite(self.x) and isfinite(self.y)

    def to_scalar(self) -> float:
        """Return the harmonic mean of x and y as a 1D approximation.

        A Resolution is two-dimensional, but it is usually "square"
        (x == y) and can then be summarized by one number. When it is not
        square, the harmonic mean of the two axes is used as the summary.
        """
        axes = [float(self.x), float(self.y)]
        return harmonic_mean(axes)

    def _take_minmax(
        self, vals: Iterable[Any], yvals: Iterable[Any] | None, cmp: Callable
    ) -> Resolution:
        """Combine this resolution with other values, axis by axis, using ``cmp``.

        ``cmp`` is ``max`` or ``min``. When ``yvals`` is given, ``vals``
        holds x values and ``yvals`` holds y values; otherwise ``vals``
        holds (x, y) pairs.
        """
        if yvals is None:
            best_x, best_y = self.x, self.y
            for x, y in vals:
                best_x = cmp(x, best_x)
                best_y = cmp(y, best_y)
            return Resolution(best_x, best_y)
        return Resolution(cmp(self.x, *vals), cmp(self.y, *yvals))

    def take_max(
        self, vals: Iterable[Any], yvals: Iterable[Any] | None = None
    ) -> Resolution:
        """Return a new Resolution object with the maximum resolution of inputs."""
        return self._take_minmax(vals, yvals, max)

    def take_min(
        self, vals: Iterable[Any], yvals: Iterable[Any] | None = None
    ) -> Resolution:
        """Return a new Resolution object with the minimum resolution of inputs."""
        return self._take_minmax(vals, yvals, min)

    def flip_axis(self) -> Resolution[T]:
        """Return a new Resolution object with x and y swapped."""
        return Resolution(self.y, self.x)

    def __getitem__(self, idx: int | slice) -> T:
        """Support [0] and [1] indexing."""
        as_pair = (self.x, self.y)
        return as_pair[idx]

    def __str__(self):
        """Return a string representation of the resolution."""
        return f"{self.x:f}×{self.y:f}"

    def __repr__(self):  # pragma: no cover
        """Return a repr() of the resolution."""
        return f"Resolution({self.x!r}, {self.y!r})"

    def __eq__(self, other):
        """Return True if the resolution is equal to another resolution."""
        is_pair = isinstance(other, tuple) and len(other) == 2
        candidate = Resolution(*other) if is_pair else other
        if isinstance(candidate, Resolution):
            return self._isclose(self.x, candidate.x) and self._isclose(
                self.y, candidate.y
            )
        return NotImplemented
def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike) -> None:
    """Make ``soft_link_name`` a symbolic link that points at ``input_file``.

    This is a cheaper stand-in for copying ``input_file`` to
    ``soft_link_name``. Linking a path to itself is refused with a warning.
    An existing symlink at the destination is replaced, while an existing
    regular file there is never overwritten. On Windows the file is copied
    instead, since symlinks may require administrator privileges.

    Raises:
        FileExistsError: If the destination exists and is not a symlink
        FileNotFoundError: If ``input_file`` does not exist
    """
    source = os.fspath(input_file)
    link = os.fspath(soft_link_name)

    # Refuse to link a path to itself
    if source == link:
        log.warning(
            "No symbolic link created. You are using the original data directory "
            "as the working directory."
        )
        return

    # A stale link may be replaced; anything else at the destination is kept
    if os.path.lexists(link):
        if not os.path.islink(link):
            raise FileExistsError(f"{link} exists and is not a link")
        os.unlink(link)

    if not os.path.exists(source):
        raise FileNotFoundError(f"trying to create a broken symlink to {source}")

    if os.name == 'nt':
        # Don't actually use symlinks on Windows due to permission issues
        shutil.copyfile(source, link)
    else:
        log.debug("os.symlink(%s, %s)", source, link)
        # The link target is always stored as an absolute path
        os.symlink(os.path.abspath(source), link)
def samefile(file1: os.PathLike, file2: os.PathLike) -> bool:
    """Return True if two paths refer to the same file.

    Outside Windows this defers to ``os.path.samefile``, so different
    relative paths to one file are recognized. On Windows the two
    arguments are only compared for equality.
    """
    if os.name != 'nt':
        return os.path.samefile(file1, file2)
    return file1 == file2
def is_iterable_notstr(thing: Any) -> bool:
    """Return True for any iterable object except a ``str``."""
    if isinstance(thing, str):
        return False
    return isinstance(thing, Iterable)
def monotonic(seq: Sequence) -> bool:
"""Does this sequence increase monotonically?"""
return all(b > a for a, b in zip(seq, seq[1:], strict=False))
def page_number(input_file: os.PathLike) -> int:
    """Return the one-based page number encoded in a filename.

    The first six characters of the base name are parsed as an integer,
    e.g. ``000002.pdf`` -> 2.
    """
    filename = os.path.basename(os.fspath(input_file))
    return int(filename[:6])
def available_cpu_count() -> int:
    """Return the number of CPUs reported by ``multiprocessing``.

    If the count cannot be determined, a warning is issued and 1 is
    returned.
    """
    with suppress(NotImplementedError):
        return multiprocessing.cpu_count()
    # Only reached when cpu_count() raised NotImplementedError
    warnings.warn(
        "Could not get CPU count. Assuming one (1) CPU. Use -j N to set manually."
    )
    return 1
def is_file_writable(test_file: os.PathLike) -> bool:
    """Check, in an intentionally racy way, whether the target can be written.

    The output file is only written if processing succeeds and it can be
    replaced atomically, so this check runs before the OCR work to confirm
    that the location is writable. A symlink is resolved first. An existing
    regular file (or the null device) is tested with ``os.access``; in any
    other case the file is created for writing as a probe and then removed
    again.
    """
    try:
        target = Path(test_file)
        if target.is_symlink():
            target = target.resolve(strict=False)

        # is_file() throws an exception in some cases
        if target.exists() and (target.is_file() or target.samefile(os.devnull)):
            use_effective_ids = os.access in os.supports_effective_ids
            return os.access(
                os.fspath(target),
                os.W_OK,
                effective_ids=use_effective_ids,
            )

        # Probe by creating the file, then clean up after ourselves
        try:
            handle = target.open('wb')
        except OSError:
            return False
        handle.close()
        with suppress(OSError):
            target.unlink()
        return True
    except (OSError, RuntimeError) as e:
        log.debug(e)
        log.error(str(e))
        return False
def check_pdf(input_file: Path) -> bool:
    """Check whether a PDF passes pikepdf's syntax and linearization checks.

    The file is opened with pikepdf (which in turn uses QPDF). Every syntax
    message other than one known spurious ``/DecodeParms`` warning is logged
    and makes the check fail, as does any linearization message. A file
    that has no linearization at all is not penalized.

    Returns:
        True if the file opened and no problems were reported.
    """
    try:
        pdf = pikepdf.open(input_file)
    except pikepdf.PdfError as e:
        log.error(e)
        return False

    spurious_warning = (
        "/DecodeParms: operation for dictionary attempted on object "
        "of type null"
    )
    with pdf:
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', message=r'pikepdf.*JBIG2.*')
            syntax_messages = pdf.check_pdf_syntax()

        syntax_ok = True
        for message in syntax_messages:
            if 'error' in message.lower():
                log.error(message)
            elif spurious_warning in message:
                continue  # Ignore/spurious warning
            else:
                log.warning(message)
            syntax_ok = False

        report = StringIO()
        try:
            # If linearization is missing entirely, we do not complain. We do
            # complain if linearization is present but incorrect.
            pdf.check_linearization(report)
        except (RuntimeError, pikepdf.ForeignObjectError):
            linearize_msgs = ''
        else:
            linearize_msgs = report.getvalue()
            if linearize_msgs:
                log.warning(linearize_msgs)
        return bool(syntax_ok and not linearize_msgs)
def clamp(n: T, smallest: T, largest: T) -> T:
    """Limit ``n`` to the range from ``smallest`` to ``largest``."""
    capped = min(n, largest)
    return max(smallest, capped)
def remove_all_log_handlers(logger: logging.Logger) -> None:
    """Detach and close every handler attached to ``logger``.

    Usually used in a child process. After a fork the child inherits the
    parent's log handlers; removing them lets the child install a single
    queue handler that forwards its log messages to the parent process.
    """
    # Iterate over a snapshot because removeHandler mutates logger.handlers
    for attached in list(logger.handlers):
        logger.removeHandler(attached)
        # Release any resources the handler holds open
        attached.close()
def pikepdf_enable_mmap() -> None:
    """Turn on pikepdf's memory-mapped file access, if it is available.

    The resulting state is logged at debug level. When pikepdf lacks the
    setting, that fact is logged instead.
    """
    try:
        pikepdf._core.set_access_default_mmap(True)
        if pikepdf._core.get_access_default_mmap():  # type: ignore[attr-defined]
            state = 'enabled'
        else:
            state = 'disabled'
        log.debug("pikepdf mmap " + state)
    except AttributeError:
        log.debug("pikepdf mmap not available")
def running_in_docker() -> bool:
    """Return True if we seem to be running in a Docker container.

    The check is the existence of the ``/.dockerenv`` marker file.
    """
    marker = Path('/.dockerenv')
    return marker.exists()
def running_in_snap() -> bool:
    """Return True if we seem to be running in a Snap container.

    The check looks for ``snap.ocrmypdf`` in ``/proc/self/cgroup``. This is
    a best-effort probe: if the file cannot be read for any OS-level reason
    (missing, not permitted, not a regular file, ...) the answer is False
    rather than an exception.
    """
    try:
        cgroup_text = Path('/proc/self/cgroup').read_text()
    except OSError:
        # FileNotFoundError is a subclass of OSError, so systems without
        # /proc are still handled; unreadable files no longer crash.
        return False
    return 'snap.ocrmypdf' in cgroup_text
================================================
FILE: src/ocrmypdf/hocrtransform/__init__.py
================================================
# SPDX-FileCopyrightText: 2023-2025 James R. Barlow
# SPDX-License-Identifier: MIT
"""Transform OCR output to text-only PDFs.
This package provides tools for:
1. Parsing OCR output (hOCR format) into generic OcrElement structures
2. Rendering OcrElement structures to searchable PDF text layers
The architecture separates parsing from rendering, allowing:
- Support for multiple OCR input formats (hOCR, ALTO, custom engines)
- Independent improvements to text rendering
- Reuse of the OcrElement data model for other purposes
Main components:
- OcrElement: Generic dataclass representing OCR output structure
- HocrParser: Parses hOCR files into OcrElement trees
- Fpdf2PdfRenderer: Renders OcrElement trees to PDF text layers (via fpdf2)
For PDF rendering, use the ocrmypdf.fpdf_renderer module:
from ocrmypdf.fpdf_renderer import Fpdf2PdfRenderer, DebugRenderOptions
"""
from __future__ import annotations
from ocrmypdf.hocrtransform.hocr_parser import (
HocrParseError,
HocrParser,
)
from ocrmypdf.models.ocr_element import (
Baseline,
BoundingBox,
FontInfo,
OcrClass,
OcrElement,
)
# Names re-exported as this package's public API: the hOCR parser (and its
# error type) plus the OcrElement data model classes imported above.
__all__ = (
    # hOCR parsing
    'HocrParser',
    'HocrParseError',
    # OCR element data model
    'OcrElement',
    'OcrClass',
    'BoundingBox',
    'Baseline',
    'FontInfo',
)
================================================
FILE: src/ocrmypdf/hocrtransform/__main__.py
================================================
# SPDX-FileCopyrightText: 2023-2025 James R. Barlow
# SPDX-License-Identifier: MIT
"""Simple CLI for testing HOCR to PDF conversion using fpdf2 renderer."""
from __future__ import annotations
import argparse
from pathlib import Path
from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import DebugRenderOptions, Fpdf2PdfRenderer
from ocrmypdf.hocrtransform.hocr_parser import HocrParser
if __name__ == "__main__":
    # Command-line interface
    cli = argparse.ArgumentParser(description='Convert hocr file to PDF')
    cli.add_argument(
        '-b',
        '--boundingboxes',
        action="store_true",
        default=False,
        help='Show bounding boxes borders (debug mode)',
    )
    cli.add_argument(
        '-r',
        '--resolution',
        type=int,
        default=300,
        help='Resolution of the image that was OCRed',
    )
    cli.add_argument(
        '-i',
        '--image',
        default=None,
        help='Path to the image to overlay on top of the text layer',
    )
    cli.add_argument('hocrfile', help='Path to the hocr file to be parsed')
    cli.add_argument('outputfile', help='Path to the PDF file to be generated')
    options = cli.parse_args()

    # Read the hOCR input into an OcrElement tree
    page_tree = HocrParser(options.hocrfile).parse()

    # The DPI recorded in the hOCR wins; --resolution is the fallback
    page_dpi = page_tree.dpi or options.resolution

    # -b turns on every debug overlay
    if options.boundingboxes:
        debug_overlays = DebugRenderOptions(
            render_line_bbox=True,
            render_word_bbox=True,
            render_baseline=True,
        )
    else:
        debug_overlays = None

    # Fonts are loaded from the "data" directory one level above this package
    fonts = MultiFontManager(Path(__file__).parent.parent / "data")

    # Text is invisible only when an image is supplied to cover it
    overlay_image = Path(options.image) if options.image else None
    Fpdf2PdfRenderer(
        page=page_tree,
        dpi=page_dpi,
        multi_font_manager=fonts,
        invisible_text=bool(options.image),
        image=overlay_image,
        debug_render_options=debug_overlays,
    ).render(Path(options.outputfile))
================================================
FILE: src/ocrmypdf/hocrtransform/hocr_parser.py
================================================
# SPDX-FileCopyrightText: 2010 Jonathan Brinley
# SPDX-FileCopyrightText: 2013-2014 Julien Pfefferkorn
# SPDX-FileCopyrightText: 2023-2025 James R. Barlow
# SPDX-License-Identifier: MIT
"""Parser for hOCR format files.
This module provides functionality to parse hOCR files (HTML-based OCR format)
and convert them to the engine-agnostic OcrElement tree structure.
For details of the hOCR format, see:
http://kba.github.io/hocr-spec/1.2/
"""
from __future__ import annotations
import logging
import os
import re
import unicodedata
from pathlib import Path
from typing import Literal, cast
from xml.etree import ElementTree as ET
from ocrmypdf.models.ocr_element import (
Baseline,
BoundingBox,
FontInfo,
OcrClass,
OcrElement,
)
TextDirection = Literal["ltr", "rtl"]
log = logging.getLogger(__name__)
Element = ET.Element
class HocrParseError(Exception):
    """Raised when an hOCR document cannot be read or lacks required content."""
class HocrParser:
    """Reader that turns an hOCR document into an OcrElement tree.

    hOCR is HTML/XML markup in which the ``class`` attribute names the kind of
    OCR element (ocr_page, ocr_par, ocr_line, ocrx_word, ...) and the ``title``
    attribute carries properties such as bbox, baseline and x_wconf. Each
    ``_parse_*`` helper below extracts one such property from a title string.
    """

    # Compiled patterns, one per hOCR title property. All are written in
    # verbose mode, so whitespace and ``#`` comments inside them are ignored.
    _bbox_pattern = re.compile(
        r'''
        bbox \s+
        (\d+) \s+ # left: uint
        (\d+) \s+ # top: uint
        (\d+) \s+ # right: uint
        (\d+) # bottom: uint
        ''',
        re.VERBOSE,
    )
    _baseline_pattern = re.compile(
        r'''
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+ # slope: +/- decimal float
        ([\-\+]?\d+) # intercept: +/- int
        ''',
        re.VERBOSE,
    )
    _textangle_pattern = re.compile(
        r'''
        textangle \s+
        ([\-\+]?\d*\.?\d*) # angle: +/- decimal float
        ''',
        re.VERBOSE,
    )
    _x_wconf_pattern = re.compile(
        r'''
        x_wconf \s+
        (\d+) # confidence: uint (0-100)
        ''',
        re.VERBOSE,
    )
    _x_fsize_pattern = re.compile(
        r'''
        x_fsize \s+
        (\d*\.?\d+) # font size: float
        ''',
        re.VERBOSE,
    )
    _x_font_pattern = re.compile(
        r'''
        x_font \s+
        ([^\s;]+) # font name: non-whitespace, non-semicolon string
        ''',
        re.VERBOSE,
    )
    _ppageno_pattern = re.compile(
        r'''
        ppageno \s+
        (\d+) # page number: uint
        ''',
        re.VERBOSE,
    )
    _scan_res_pattern = re.compile(
        r'''
        scan_res \s+
        (\d+) \s+ # x resolution
        (\d+) # y resolution
        ''',
        re.VERBOSE,
    )

    def __init__(self, hocr_file: str | Path):
        """Load an hOCR file and detect its XML namespace.

        Args:
            hocr_file: Location of the hOCR file to read

        Raises:
            HocrParseError: If the file is not well-formed XML
        """
        self._hocr_path = Path(hocr_file)
        source = os.fspath(hocr_file)
        try:
            self._tree = ET.parse(source)
        except ET.ParseError as e:
            raise HocrParseError(f"Failed to parse hOCR file: {e}") from e
        # A namespaced root is tagged "{uri}html"; remember the "{uri}" prefix
        # so that every later element lookup can be qualified with it.
        namespace = re.match(r'({.*})html', self._tree.getroot().tag)
        self._xmlns = '' if namespace is None else namespace.group(1)

    def parse(self) -> OcrElement:
        """Convert the loaded document into an OcrElement tree.

        Only the first ocr_page element of the document is converted.

        Returns:
            The page-level OcrElement holding the rest of the structure

        Raises:
            HocrParseError: If the document has no ocr_page element
        """
        page_div = self._tree.find(self._xpath('div', 'ocr_page'))
        if page_div is not None:
            return self._parse_page(page_div)
        raise HocrParseError("No ocr_page element found in hOCR file")

    def _xpath(self, html_tag: str, html_class: str | None = None) -> str:
        """Compose a descendant-search path for a tag and optional class.

        Args:
            html_tag: Tag name to look for (e.g., 'div', 'span', 'p')
            html_class: If given, require this exact class attribute value

        Returns:
            Path expression usable with ElementTree find/iterfind
        """
        base = f".//{self._xmlns}{html_tag}"
        return f"{base}[@class='{html_class}']" if html_class else base

    def _parse_page(self, page_elem: Element) -> OcrElement:
        """Convert an ocr_page element and everything beneath it.

        Args:
            page_elem: Element whose class is "ocr_page"

        Returns:
            OcrElement for the page

        Raises:
            HocrParseError: If the page title carries no bbox
        """
        title = page_elem.attrib.get('title', '')
        bbox = self._parse_bbox(title)
        if bbox is None:
            raise HocrParseError("ocr_page missing bbox")

        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=bbox,
            page_number=self._parse_ppageno(title),
            dpi=self._parse_scan_res(title),
        )

        for par_elem in page_elem.iterfind(self._xpath('p', 'ocr_par')):
            if (paragraph := self._parse_paragraph(par_elem)) is not None:
                page.children.append(paragraph)

        if page.children:
            return page

        # No usable paragraphs: fall back to attaching words straight to the
        # page (some Tesseract output structures)
        for word_elem in page_elem.iterfind(self._xpath('span', 'ocrx_word')):
            if (word := self._parse_word(word_elem)) is not None:
                page.children.append(word)
        return page

    def _parse_paragraph(self, par_elem: Element) -> OcrElement | None:
        """Convert an ocr_par element and the lines it contains.

        Args:
            par_elem: Element whose class is "ocr_par"

        Returns:
            OcrElement for the paragraph, or None if it yields no lines
        """
        attrs = par_elem.attrib
        bbox = self._parse_bbox(attrs.get('title', ''))

        # Only the two recognised direction values are kept; anything else
        # is treated as unspecified.
        dir_attr = attrs.get('dir')
        direction: TextDirection | None = None
        if dir_attr in ('ltr', 'rtl'):
            direction = cast(TextDirection, dir_attr)
        language = attrs.get('lang')

        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=bbox,
            direction=direction,
            language=language,
        )

        # Span classes that are treated as a line of text
        line_classes = {
            'ocr_line',
            'ocr_header',
            'ocr_footer',
            'ocr_caption',
            'ocr_textfloat',
        }
        for span_elem in par_elem.iterfind(self._xpath('span')):
            elem_class = span_elem.attrib.get('class', '')
            if elem_class not in line_classes:
                continue
            line = self._parse_line(span_elem, elem_class, direction, language)
            if line is not None:
                paragraph.children.append(line)

        return paragraph if paragraph.children else None

    def _parse_line(
        self,
        line_elem: Element,
        ocr_class: str,
        parent_direction: TextDirection | None,
        parent_language: str | None,
    ) -> OcrElement | None:
        """Convert a line-like element (ocr_line, ocr_header, etc.).

        Args:
            line_elem: Element holding the line
            ocr_class: hOCR class name of that element
            parent_direction: Direction to fall back on when the line has none
            parent_language: Language to fall back on when the line has none

        Returns:
            OcrElement for the line, or None if it has no bbox or no words
        """
        attrs = line_elem.attrib
        title = attrs.get('title', '')
        bbox = self._parse_bbox(title)
        if bbox is None:
            return None

        # The line's own dir/lang attributes win over the inherited values
        own_dir = attrs.get('dir')
        direction: TextDirection | None = (
            cast(TextDirection, own_dir)
            if own_dir in ('ltr', 'rtl')
            else parent_direction
        )

        line = OcrElement(
            ocr_class=ocr_class,
            bbox=bbox,
            baseline=self._parse_baseline(title),
            textangle=self._parse_textangle(title),
            direction=direction,
            language=attrs.get('lang') or parent_language,
        )

        for word_elem in line_elem.iterfind(self._xpath('span', 'ocrx_word')):
            if (word := self._parse_word(word_elem)) is not None:
                line.children.append(word)

        return line if line.children else None

    def _parse_word(self, word_elem: Element) -> OcrElement | None:
        """Convert an ocrx_word element.

        Args:
            word_elem: Element whose class is "ocrx_word"

        Returns:
            OcrElement for the word, or None if its text is empty after
            normalization
        """
        text = self._normalize_text(self._get_element_text(word_elem))
        if not text:
            return None

        title = word_elem.attrib.get('title', '')

        # x_wconf is 0-100 in the file; rescale it to 0.0-1.0
        confidence = self._parse_x_wconf(title)
        if confidence is not None:
            confidence = confidence / 100.0

        return OcrElement(
            ocr_class=OcrClass.WORD,
            bbox=self._parse_bbox(title),
            text=text,
            confidence=confidence,
            font=self._parse_font_info(title),
        )

    def _get_element_text(self, element: Element) -> str:
        """Collect the text of an element, its descendants, and its tail.

        Args:
            element: Element to read

        Returns:
            The element's text, then each child's collected text in document
            order, then the element's own tail text
        """
        pieces = [element.text or '']
        pieces.extend(self._get_element_text(child) for child in element)
        pieces.append(element.tail or '')
        return ''.join(pieces)

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Apply NFKC normalization and trim surrounding whitespace.

        NFKC splits ligatures and combines diacritics.

        Args:
            text: Text as read from the document

        Returns:
            The normalized, stripped text
        """
        normalized = unicodedata.normalize("NFKC", text)
        return normalized.strip()

    def _parse_bbox(self, title: str) -> BoundingBox | None:
        """Extract the bbox property from a title attribute.

        Args:
            title: Value of the title attribute

        Returns:
            BoundingBox, or None if absent or if constructing it raises
            ValueError
        """
        found = self._bbox_pattern.search(title)
        if found is None:
            return None
        try:
            left, top, right, bottom = map(float, found.groups())
            # Construction stays inside the try so that a ValueError raised
            # while building the box also results in None.
            return BoundingBox(left=left, top=top, right=right, bottom=bottom)
        except ValueError:
            return None

    def _parse_baseline(self, title: str) -> Baseline | None:
        """Extract the baseline property from a title attribute.

        Args:
            title: Value of the title attribute

        Returns:
            Baseline, or None if absent or not convertible
        """
        found = self._baseline_pattern.search(title)
        if found is None:
            return None
        slope_text, intercept_text = found.groups()
        try:
            # The slope group may match an empty string; treat that as flat
            slope = float(slope_text) if slope_text else 0.0
            return Baseline(slope=slope, intercept=float(intercept_text))
        except ValueError:
            return None

    def _parse_textangle(self, title: str) -> float | None:
        """Extract the textangle property from a title attribute.

        Args:
            title: Value of the title attribute

        Returns:
            The angle as a float, or None if absent or not a number
        """
        found = self._textangle_pattern.search(title)
        if found is None:
            return None
        try:
            angle = float(found[1])
        except ValueError:
            return None
        return angle

    def _parse_x_wconf(self, title: str) -> float | None:
        """Extract the x_wconf (word confidence) property from a title.

        Args:
            title: Value of the title attribute

        Returns:
            Confidence on the file's 0-100 scale, or None if absent
        """
        found = self._x_wconf_pattern.search(title)
        if found is None:
            return None
        try:
            confidence = float(found[1])
        except ValueError:
            return None
        return confidence

    def _parse_ppageno(self, title: str) -> int | None:
        """Extract the ppageno (physical page number) property from a title.

        Args:
            title: Value of the title attribute

        Returns:
            The page number, or None if absent
        """
        found = self._ppageno_pattern.search(title)
        if found is None:
            return None
        try:
            page_number = int(found[1])
        except ValueError:
            return None
        return page_number

    def _parse_scan_res(self, title: str) -> float | None:
        """Extract the scan_res (scan resolution) property from a title.

        Args:
            title: Value of the title attribute

        Returns:
            The first (x) resolution value, or None if absent; the second
            (y) value is matched but not used
        """
        found = self._scan_res_pattern.search(title)
        if found is None:
            return None
        try:
            x_resolution = float(found[1])
        except ValueError:
            return None
        return x_resolution

    def _parse_font_info(self, title: str) -> FontInfo | None:
        """Extract the x_font and x_fsize properties from a title attribute.

        Args:
            title: Value of the title attribute

        Returns:
            FontInfo with whichever of name/size was present (the other is
            None), or None if neither property appears
        """
        font_match = self._x_font_pattern.search(title)
        size_match = self._x_fsize_pattern.search(title)
        if font_match is None and size_match is None:
            return None
        name = font_match[1] if font_match else None
        size = float(size_match[1]) if size_match else None
        return FontInfo(name=name, size=size)
================================================
FILE: src/ocrmypdf/imageops.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""OCR-related image manipulation."""
from __future__ import annotations
import logging
from math import floor, sqrt
from PIL import Image
log = logging.getLogger(__name__)
def bytes_per_pixel(mode: str) -> int:
    """Report how many padded bytes one pixel occupies in a PIL image mode.

    Modes not listed below (RGB among them) are assumed to take 4 bytes per
    pixel, which is the case for most consumers.
    """
    one_byte_modes = ('1', 'L', 'P')
    two_byte_modes = ('LA', 'PA', 'La')

    if mode in one_byte_modes:
        return 1
    is_two_byte = mode in two_byte_modes or mode.startswith('I;16')
    return 2 if is_two_byte else 4
def _calculate_downsample(
image_size: tuple[int, int],
bytes_per_pixel: int,
*,
max_size: tuple[int, int] | None = None,
max_pixels: int | None = None,
max_bytes: int | None = None,
) -> tuple[int, int]:
"""Calculate image size required to downsample an image to fit limits.
If no limit is exceeded, the input image's size is returned.
Args:
image_size: Dimensions of image.
bytes_per_pixel: Number of bytes per pixel.
max_size: The maximum width and height of the image.
max_pixels: The maximum number of pixels in the image. Some image consumers
limit the total number of pixels as some value other than width*height.
max_bytes: The maximum number of bytes in the image. RGB is counted as 4
bytes; all other modes are counted as 1 byte.
"""
size = image_size
if max_size is not None:
overage = max_size[0] / size[0], max_size[1] / size[1]
size_factor = min(overage)
if size_factor < 1.0:
log.debug("Resizing image to fit image dimensions limit")
size = floor(size[0] * size_factor), floor(size[1] * size_factor)
if size[0] == 0:
size = 1, min(size[1], max_size[1])
elif size[1] == 0:
size = min(size[0], max_size[0]), 1
if max_pixels is not None and size[0] * size[1] > max_pixels:
log.debug("Resizing image to fit image pixel limit")
pixels_factor = sqrt(max_pixels / (size[0] * size[1]))
size = floor(size[0] * pixels_factor), floor(size[1] * pixels_factor)
if max_bytes is not None:
bpp = bytes_per_pixel
# stride = bytes per line
stride = size[0] * bpp
height = size[1]
if stride * height > max_bytes:
log.debug("Resizing image to fit image byte size limit")
bytes_factor = sqrt(max_bytes / (stride * height))
scaled_stride = floor(stride * bytes_factor)
scaled_height = floor(height * bytes_factor)
if scaled_stride == 0:
scaled_stride = bpp
scaled_height = min(max_bytes // bpp, scaled_height)
if scaled_height == 0:
scaled_height = 1
scaled_stride = min(max_bytes // scaled_height, scaled_stride)
size = floor(scaled_stride / bpp), scaled_height
return size
def calculate_downsample(
    image: Image.Image,
    *,
    max_size: tuple[int, int] | None = None,
    max_pixels: int | None = None,
    max_bytes: int | None = None,
) -> tuple[int, int]:
    """Work out the size an image must be reduced to in order to fit limits.

    This reads the image's size and mode and delegates the arithmetic to
    ``_calculate_downsample``. When no limit is exceeded, the image's
    current size is returned.

    Args:
        image: The image to be downsampled.
        max_size: Largest permitted width and height.
        max_pixels: Largest permitted pixel count. Some image consumers
            limit the total number of pixels as some value other than
            width*height.
        max_bytes: Largest permitted byte count, where the per-pixel byte
            width is taken from ``bytes_per_pixel(image.mode)``.

    Returns:
        The (width, height) the image should be resized to.
    """
    dimensions = image.size
    pixel_width = bytes_per_pixel(image.mode)
    limits = {
        'max_size': max_size,
        'max_pixels': max_pixels,
        'max_bytes': max_bytes,
    }
    return _calculate_downsample(dimensions, pixel_width, **limits)
def downsample_image(
    image: Image.Image,
    new_size: tuple[int, int],
    *,
    resample_mode: Image.Resampling = Image.Resampling.BICUBIC,
    reducing_gap: int = 3,
) -> Image.Image:
    """Resize an image and rescale its recorded DPI to match.

    Keeping the DPI proportional to the new pixel dimensions is how we can
    ensure the OCR is positioned correctly afterwards.

    Args:
        image: Image to resize. Its ``info`` must contain a ``'dpi'`` pair
            unless ``new_size`` already equals its size.
        new_size: Target (width, height).
        resample_mode: Resampling filter passed to ``Image.resize``.
        reducing_gap: Reducing gap passed to ``Image.resize`` (for larger
            reductions).

    Returns:
        The same image object when no resize is needed, otherwise the
        resized image with its ``info['dpi']`` updated.
    """
    old_size = image.size
    if new_size == old_size:
        return image

    old_dpi = image.info['dpi']
    resized = image.resize(
        new_size,
        resample=resample_mode,
        reducing_gap=reducing_gap,
    )
    # Scale each axis' DPI by that axis' size ratio, rounded to whole dots
    dpi_x = round(old_dpi[0] * new_size[0] / old_size[0])
    dpi_y = round(old_dpi[1] * new_size[1] / old_size[1])
    resized.info['dpi'] = (dpi_x, dpi_y)
    log.debug(f"Rescaled image to {resized.size} pixels and {resized.info['dpi']} dpi")
    return resized
================================================
FILE: src/ocrmypdf/languages.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Language codes and names from ISO 639.
Derived from
https://www.loc.gov/standards/iso639-2/ascii_8bits.html
"""
from __future__ import annotations
from typing import NamedTuple
class ISOCodeData(NamedTuple):
    """One row of the ISO 639 language table.

    Fields that do not apply to a language hold an empty string.
    """

    # Alternate three-letter code for the same language (e.g. 'sqi' on the
    # row keyed 'alb'), or '' when the row has none.
    alt: str
    # Two-letter code (e.g. 'sq'), or '' when the row has none.
    alpha_2: str
    # English name; rows with several names separate them with '; '.
    english: str
    # French name; rows with several names separate them with '; '.
    french: str
ISO_639_3 = {
'aar': ISOCodeData('', 'aa', 'Afar', 'afar'),
'abk': ISOCodeData('', 'ab', 'Abkhazian', 'abkhaze'),
'ace': ISOCodeData('', '', 'Achinese', 'aceh'),
'ach': ISOCodeData('', '', 'Acoli', 'acoli'),
'ada': ISOCodeData('', '', 'Adangme', 'adangme'),
'ady': ISOCodeData('', '', 'Adyghe; Adygei', 'adyghé'),
'afa': ISOCodeData(
'',
'',
'Afro-Asiatic languages',
'afro-asiatiques, langues',
),
'afh': ISOCodeData('', '', 'Afrihili', 'afrihili'),
'afr': ISOCodeData('', 'af', 'Afrikaans', 'afrikaans'),
'ain': ISOCodeData('', '', 'Ainu', 'aïnou'),
'aka': ISOCodeData('', 'ak', 'Akan', 'akan'),
'akk': ISOCodeData('', '', 'Akkadian', 'akkadien'),
'alb': ISOCodeData('sqi', 'sq', 'Albanian', 'albanais'),
'ale': ISOCodeData('', '', 'Aleut', 'aléoute'),
'alg': ISOCodeData(
'',
'',
'Algonquian languages',
'algonquines, langues',
),
'alt': ISOCodeData('', '', 'Southern Altai', 'altai du Sud'),
'amh': ISOCodeData('', 'am', 'Amharic', 'amharique'),
'ang': ISOCodeData(
'',
'',
'English, Old (ca.450-1100)',
'anglo-saxon (ca.450-1100)',
),
'anp': ISOCodeData('', '', 'Angika', 'angika'),
'apa': ISOCodeData('', '', 'Apache languages', 'apaches, langues'),
'ara': ISOCodeData('', 'ar', 'Arabic', 'arabe'),
'arc': ISOCodeData(
'',
'',
'Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)',
"araméen d'empire (700-300 BCE)",
),
'arg': ISOCodeData('', 'an', 'Aragonese', 'aragonais'),
'arm': ISOCodeData('hye', 'hy', 'Armenian', 'arménien'),
'arn': ISOCodeData(
'',
'',
'Mapudungun; Mapuche',
'mapudungun; mapuche; mapuce',
),
'arp': ISOCodeData('', '', 'Arapaho', 'arapaho'),
'art': ISOCodeData(
'',
'',
'Artificial languages',
'artificielles, langues',
),
'arw': ISOCodeData('', '', 'Arawak', 'arawak'),
'asm': ISOCodeData('', 'as', 'Assamese', 'assamais'),
'ast': ISOCodeData(
'',
'',
'Asturian; Bable; Leonese; Asturleonese',
'asturien; bable; léonais; asturoléonais',
),
'ath': ISOCodeData(
'',
'',
'Athapascan languages',
'athapascanes, langues',
),
'aus': ISOCodeData(
'',
'',
'Australian languages',
'australiennes, langues',
),
'ava': ISOCodeData('', 'av', 'Avaric', 'avar'),
'ave': ISOCodeData('', 'ae', 'Avestan', 'avestique'),
'awa': ISOCodeData('', '', 'Awadhi', 'awadhi'),
'aym': ISOCodeData('', 'ay', 'Aymara', 'aymara'),
'aze': ISOCodeData('', 'az', 'Azerbaijani', 'azéri'),
'bad': ISOCodeData('', '', 'Banda languages', 'banda, langues'),
'bai': ISOCodeData('', '', 'Bamileke languages', 'bamiléké, langues'),
'bak': ISOCodeData('', 'ba', 'Bashkir', 'bachkir'),
'bal': ISOCodeData('', '', 'Baluchi', 'baloutchi'),
'bam': ISOCodeData('', 'bm', 'Bambara', 'bambara'),
'ban': ISOCodeData('', '', 'Balinese', 'balinais'),
'baq': ISOCodeData('eus', 'eu', 'Basque', 'basque'),
'bas': ISOCodeData('', '', 'Basa', 'basa'),
'bat': ISOCodeData('', '', 'Baltic languages', 'baltes, langues'),
'bej': ISOCodeData('', '', 'Beja; Bedawiyet', 'bedja'),
'bel': ISOCodeData('', 'be', 'Belarusian', 'biélorusse'),
'bem': ISOCodeData('', '', 'Bemba', 'bemba'),
'ben': ISOCodeData('', 'bn', 'Bengali', 'bengali'),
'ber': ISOCodeData('', '', 'Berber languages', 'berbères, langues'),
'bho': ISOCodeData('', '', 'Bhojpuri', 'bhojpuri'),
'bih': ISOCodeData('', 'bh', 'Bihari languages', 'langues biharis'),
'bik': ISOCodeData('', '', 'Bikol', 'bikol'),
'bin': ISOCodeData('', '', 'Bini; Edo', 'bini; edo'),
'bis': ISOCodeData('', 'bi', 'Bislama', 'bichlamar'),
'bla': ISOCodeData('', '', 'Siksika', 'blackfoot'),
'bnt': ISOCodeData('', '', 'Bantu languages', 'bantou, langues'),
'bos': ISOCodeData('', 'bs', 'Bosnian', 'bosniaque'),
'bra': ISOCodeData('', '', 'Braj', 'braj'),
'bre': ISOCodeData('', 'br', 'Breton', 'breton'),
'btk': ISOCodeData('', '', 'Batak languages', 'batak, langues'),
'bua': ISOCodeData('', '', 'Buriat', 'bouriate'),
'bug': ISOCodeData('', '', 'Buginese', 'bugi'),
'bul': ISOCodeData('', 'bg', 'Bulgarian', 'bulgare'),
'bur': ISOCodeData('mya', 'my', 'Burmese', 'birman'),
'byn': ISOCodeData('', '', 'Blin; Bilin', 'blin; bilen'),
'cad': ISOCodeData('', '', 'Caddo', 'caddo'),
'cai': ISOCodeData(
'',
'',
'Central American Indian languages',
"amérindiennes de L'Amérique centrale, langues",
),
'car': ISOCodeData('', '', 'Galibi Carib', 'karib; galibi; carib'),
'cat': ISOCodeData('', 'ca', 'Catalan; Valencian', 'catalan; valencien'),
'cau': ISOCodeData(
'',
'',
'Caucasian languages',
'caucasiennes, langues',
),
'ceb': ISOCodeData('', '', 'Cebuano', 'cebuano'),
'cel': ISOCodeData(
'',
'',
'Celtic languages',
'celtiques, langues; celtes, langues',
),
'cha': ISOCodeData('', 'ch', 'Chamorro', 'chamorro'),
'chb': ISOCodeData('', '', 'Chibcha', 'chibcha'),
'che': ISOCodeData('', 'ce', 'Chechen', 'tchétchène'),
'chg': ISOCodeData('', '', 'Chagatai', 'djaghataï'),
'chi': ISOCodeData('zho', 'zh', 'Chinese', 'chinois'),
'chk': ISOCodeData('', '', 'Chuukese', 'chuuk'),
'chm': ISOCodeData('', '', 'Mari', 'mari'),
'chn': ISOCodeData('', '', 'Chinook jargon', 'chinook, jargon'),
'cho': ISOCodeData('', '', 'Choctaw', 'choctaw'),
'chp': ISOCodeData('', '', 'Chipewyan; Dene Suline', 'chipewyan'),
'chr': ISOCodeData('', '', 'Cherokee', 'cherokee'),
'chu': ISOCodeData(
'',
'cu',
(
'Church Slavic; Old Slavonic; Church Slavonic;'
' Old Bulgarian; Old Church Slavonic'
),
"slavon d'église; vieux slave; slavon liturgique; vieux bulgare",
),
'chv': ISOCodeData('', 'cv', 'Chuvash', 'tchouvache'),
'chy': ISOCodeData('', '', 'Cheyenne', 'cheyenne'),
'cmc': ISOCodeData('', '', 'Chamic languages', 'chames, langues'),
'cnr': ISOCodeData('', '', 'Montenegrin', 'monténégrin'),
'cop': ISOCodeData('', '', 'Coptic', 'copte'),
'cor': ISOCodeData('', 'kw', 'Cornish', 'cornique'),
'cos': ISOCodeData('', 'co', 'Corsican', 'corse'),
'cpe': ISOCodeData(
'',
'',
'Creoles and pidgins, English based',
"créoles et pidgins basés sur l'anglais",
),
'cpf': ISOCodeData(
'',
'',
'Creoles and pidgins, French-based',
'créoles et pidgins basés sur le français',
),
'cpp': ISOCodeData(
'',
'',
'Creoles and pidgins, Portuguese-based',
'créoles et pidgins basés sur le portugais',
),
'cre': ISOCodeData('', 'cr', 'Cree', 'cree'),
'crh': ISOCodeData(
'',
'',
'Crimean Tatar; Crimean Turkish',
'tatar de Crimé',
),
'crp': ISOCodeData('', '', 'Creoles and pidgins', 'créoles et pidgins'),
'csb': ISOCodeData('', '', 'Kashubian', 'kachoube'),
'cus': ISOCodeData('', '', 'Cushitic languages', 'couchitiques, langues'),
'cze': ISOCodeData('ces', 'cs', 'Czech', 'tchèque'),
'dak': ISOCodeData('', '', 'Dakota', 'dakota'),
'dan': ISOCodeData('', 'da', 'Danish', 'danois'),
'dar': ISOCodeData('', '', 'Dargwa', 'dargwa'),
'day': ISOCodeData('', '', 'Land Dayak languages', 'dayak, langues'),
'del': ISOCodeData('', '', 'Delaware', 'delaware'),
'den': ISOCodeData('', '', 'Slave (Athapascan)', 'esclave (athapascan)'),
'dgr': ISOCodeData('', '', 'Dogrib', 'dogrib'),
'din': ISOCodeData('', '', 'Dinka', 'dinka'),
'div': ISOCodeData('', 'dv', 'Divehi; Dhivehi; Maldivian', 'maldivien'),
'doi': ISOCodeData('', '', 'Dogri', 'dogri'),
'dra': ISOCodeData(
'',
'',
'Dravidian languages',
'dravidiennes, langues',
),
'dsb': ISOCodeData('', '', 'Lower Sorbian', 'bas-sorabe'),
'dua': ISOCodeData('', '', 'Duala', 'douala'),
'dum': ISOCodeData(
'',
'',
'Dutch, Middle (ca.1050-1350)',
'néerlandais moyen (ca. 1050-1350)',
),
'dut': ISOCodeData('nld', 'nl', 'Dutch; Flemish', 'néerlandais; flamand'),
'dyu': ISOCodeData('', '', 'Dyula', 'dioula'),
'dzo': ISOCodeData('', 'dz', 'Dzongkha', 'dzongkha'),
'efi': ISOCodeData('', '', 'Efik', 'efik'),
'egy': ISOCodeData('', '', 'Egyptian (Ancient)', 'égyptien'),
'eka': ISOCodeData('', '', 'Ekajuk', 'ekajuk'),
'elx': ISOCodeData('', '', 'Elamite', 'élamite'),
'eng': ISOCodeData('', 'en', 'English', 'anglais'),
'enm': ISOCodeData(
'',
'',
'English, Middle (1100-1500)',
'anglais moyen (1100-1500)',
),
'epo': ISOCodeData('', 'eo', 'Esperanto', 'espéranto'),
'est': ISOCodeData('', 'et', 'Estonian', 'estonien'),
'ewe': ISOCodeData('', 'ee', 'Ewe', 'éwé'),
'ewo': ISOCodeData('', '', 'Ewondo', 'éwondo'),
'fan': ISOCodeData('', '', 'Fang', 'fang'),
'fao': ISOCodeData('', 'fo', 'Faroese', 'féroïen'),
'fat': ISOCodeData('', '', 'Fanti', 'fanti'),
'fij': ISOCodeData('', 'fj', 'Fijian', 'fidjien'),
'fil': ISOCodeData('', '', 'Filipino; Pilipino', 'filipino; pilipino'),
'fin': ISOCodeData('', 'fi', 'Finnish', 'finnois'),
'fiu': ISOCodeData(
'',
'',
'Finno-Ugrian languages',
'finno-ougriennes, langues',
),
'fon': ISOCodeData('', '', 'Fon', 'fon'),
'fre': ISOCodeData('fra', 'fr', 'French', 'français'),
'frm': ISOCodeData(
'',
'',
'French, Middle (ca.1400-1600)',
'français moyen (1400-1600)',
),
'fro': ISOCodeData(
'',
'',
'French, Old (842-ca.1400)',
'français ancien (842-ca.1400)',
),
'frr': ISOCodeData('', '', 'Northern Frisian', 'frison septentrional'),
'frs': ISOCodeData('', '', 'Eastern Frisian', 'frison oriental'),
'fry': ISOCodeData('', 'fy', 'Western Frisian', 'frison occidental'),
'ful': ISOCodeData('', 'ff', 'Fulah', 'peul'),
'fur': ISOCodeData('', '', 'Friulian', 'frioulan'),
'gaa': ISOCodeData('', '', 'Ga', 'ga'),
'gay': ISOCodeData('', '', 'Gayo', 'gayo'),
'gba': ISOCodeData('', '', 'Gbaya', 'gbaya'),
'gem': ISOCodeData('', '', 'Germanic languages', 'germaniques, langues'),
'geo': ISOCodeData('kat', 'ka', 'Georgian', 'géorgien'),
'ger': ISOCodeData('deu', 'de', 'German', 'allemand'),
'gez': ISOCodeData('', '', 'Geez', 'guèze'),
'gil': ISOCodeData('', '', 'Gilbertese', 'kiribati'),
'gla': ISOCodeData(
'',
'gd',
'Gaelic; Scottish Gaelic',
'gaélique; gaélique écossais',
),
'gle': ISOCodeData('', 'ga', 'Irish', 'irlandais'),
'glg': ISOCodeData('', 'gl', 'Galician', 'galicien'),
'glv': ISOCodeData('', 'gv', 'Manx', 'manx; mannois'),
'gmh': ISOCodeData(
'',
'',
'German, Middle High (ca.1050-1500)',
'allemand, moyen haut (ca. 1050-1500)',
),
'goh': ISOCodeData(
'',
'',
'German, Old High (ca.750-1050)',
'allemand, vieux haut (ca. 750-1050)',
),
'gon': ISOCodeData('', '', 'Gondi', 'gond'),
'gor': ISOCodeData('', '', 'Gorontalo', 'gorontalo'),
'got': ISOCodeData('', '', 'Gothic', 'gothique'),
'grb': ISOCodeData('', '', 'Grebo', 'grebo'),
'grc': ISOCodeData(
'',
'',
'Greek, Ancient (to 1453)',
"grec ancien (jusqu'à 1453)",
),
'gre': ISOCodeData(
'ell',
'el',
'Greek, Modern (1453-)',
'grec moderne (après 1453)',
),
'grn': ISOCodeData('', 'gn', 'Guarani', 'guarani'),
'gsw': ISOCodeData(
'',
'',
'Swiss German; Alemannic; Alsatian',
'suisse alémanique; alémanique; alsacien',
),
'guj': ISOCodeData('', 'gu', 'Gujarati', 'goudjrati'),
'gwi': ISOCodeData('', '', "Gwich'in", "gwich'in"),
'hai': ISOCodeData('', '', 'Haida', 'haida'),
'hat': ISOCodeData(
'',
'ht',
'Haitian; Haitian Creole',
'haïtien; créole haïtien',
),
'hau': ISOCodeData('', 'ha', 'Hausa', 'haoussa'),
'haw': ISOCodeData('', '', 'Hawaiian', 'hawaïen'),
'heb': ISOCodeData('', 'he', 'Hebrew', 'hébreu'),
'her': ISOCodeData('', 'hz', 'Herero', 'herero'),
'hil': ISOCodeData('', '', 'Hiligaynon', 'hiligaynon'),
'him': ISOCodeData(
'',
'',
'Himachali languages; Western Pahari languages',
'langues himachalis; langues paharis occidentales',
),
'hin': ISOCodeData('', 'hi', 'Hindi', 'hindi'),
'hit': ISOCodeData('', '', 'Hittite', 'hittite'),
'hmn': ISOCodeData('', '', 'Hmong; Mong', 'hmong'),
'hmo': ISOCodeData('', 'ho', 'Hiri Motu', 'hiri motu'),
'hrv': ISOCodeData('', 'hr', 'Croatian', 'croate'),
'hsb': ISOCodeData('', '', 'Upper Sorbian', 'haut-sorabe'),
'hun': ISOCodeData('', 'hu', 'Hungarian', 'hongrois'),
'hup': ISOCodeData('', '', 'Hupa', 'hupa'),
'iba': ISOCodeData('', '', 'Iban', 'iban'),
'ibo': ISOCodeData('', 'ig', 'Igbo', 'igbo'),
'ice': ISOCodeData('isl', 'is', 'Icelandic', 'islandais'),
'ido': ISOCodeData('', 'io', 'Ido', 'ido'),
'iii': ISOCodeData('', 'ii', 'Sichuan Yi; Nuosu', 'yi de Sichuan'),
'ijo': ISOCodeData('', '', 'Ijo languages', 'ijo, langues'),
'iku': ISOCodeData('', 'iu', 'Inuktitut', 'inuktitut'),
'ile': ISOCodeData('', 'ie', 'Interlingue; Occidental', 'interlingue'),
'ilo': ISOCodeData('', '', 'Iloko', 'ilocano'),
'ina': ISOCodeData(
'',
'ia',
'Interlingua (International Auxiliary Language Association)',
'interlingua (langue auxiliaire internationale)',
),
'inc': ISOCodeData('', '', 'Indic languages', 'indo-aryennes, langues'),
'ind': ISOCodeData('', 'id', 'Indonesian', 'indonésien'),
'ine': ISOCodeData(
'',
'',
'Indo-European languages',
'indo-européennes, langues',
),
'inh': ISOCodeData('', '', 'Ingush', 'ingouche'),
'ipk': ISOCodeData('', 'ik', 'Inupiaq', 'inupiaq'),
'ira': ISOCodeData('', '', 'Iranian languages', 'iraniennes, langues'),
'iro': ISOCodeData('', '', 'Iroquoian languages', 'iroquoises, langues'),
'ita': ISOCodeData('', 'it', 'Italian', 'italien'),
'jav': ISOCodeData('', 'jv', 'Javanese', 'javanais'),
'jbo': ISOCodeData('', '', 'Lojban', 'lojban'),
'jpn': ISOCodeData('', 'ja', 'Japanese', 'japonais'),
'jpr': ISOCodeData('', '', 'Judeo-Persian', 'judéo-persan'),
'jrb': ISOCodeData('', '', 'Judeo-Arabic', 'judéo-arabe'),
'kaa': ISOCodeData('', '', 'Kara-Kalpak', 'karakalpak'),
'kab': ISOCodeData('', '', 'Kabyle', 'kabyle'),
'kac': ISOCodeData('', '', 'Kachin; Jingpho', 'kachin; jingpho'),
'kal': ISOCodeData('', 'kl', 'Kalaallisut; Greenlandic', 'groenlandais'),
'kam': ISOCodeData('', '', 'Kamba', 'kamba'),
'kan': ISOCodeData('', 'kn', 'Kannada', 'kannada'),
'kar': ISOCodeData('', '', 'Karen languages', 'karen, langues'),
'kas': ISOCodeData('', 'ks', 'Kashmiri', 'kashmiri'),
'kau': ISOCodeData('', 'kr', 'Kanuri', 'kanouri'),
'kaw': ISOCodeData('', '', 'Kawi', 'kawi'),
'kaz': ISOCodeData('', 'kk', 'Kazakh', 'kazakh'),
'kbd': ISOCodeData('', '', 'Kabardian', 'kabardien'),
'kha': ISOCodeData('', '', 'Khasi', 'khasi'),
'khi': ISOCodeData('', '', 'Khoisan languages', 'khoïsan, langues'),
'khm': ISOCodeData('', 'km', 'Central Khmer', 'khmer central'),
'kho': ISOCodeData('', '', 'Khotanese; Sakan', 'khotanais; sakan'),
'kik': ISOCodeData('', 'ki', 'Kikuyu; Gikuyu', 'kikuyu'),
'kin': ISOCodeData('', 'rw', 'Kinyarwanda', 'rwanda'),
'kir': ISOCodeData('', 'ky', 'Kirghiz; Kyrgyz', 'kirghiz'),
'kmb': ISOCodeData('', '', 'Kimbundu', 'kimbundu'),
'kok': ISOCodeData('', '', 'Konkani', 'konkani'),
'kom': ISOCodeData('', 'kv', 'Komi', 'kom'),
'kon': ISOCodeData('', 'kg', 'Kongo', 'kongo'),
'kor': ISOCodeData('', 'ko', 'Korean', 'coréen'),
'kos': ISOCodeData('', '', 'Kosraean', 'kosrae'),
'kpe': ISOCodeData('', '', 'Kpelle', 'kpellé'),
'krc': ISOCodeData('', '', 'Karachay-Balkar', 'karatchai balkar'),
'krl': ISOCodeData('', '', 'Karelian', 'carélien'),
'kro': ISOCodeData('', '', 'Kru languages', 'krou, langues'),
'kru': ISOCodeData('', '', 'Kurukh', 'kurukh'),
'kua': ISOCodeData('', 'kj', 'Kuanyama; Kwanyama', 'kuanyama; kwanyama'),
'kum': ISOCodeData('', '', 'Kumyk', 'koumyk'),
'kur': ISOCodeData('', 'ku', 'Kurdish', 'kurde'),
'kut': ISOCodeData('', '', 'Kutenai', 'kutenai'),
'lad': ISOCodeData('', '', 'Ladino', 'judéo-espagnol'),
'lah': ISOCodeData('', '', 'Lahnda', 'lahnda'),
'lam': ISOCodeData('', '', 'Lamba', 'lamba'),
'lao': ISOCodeData('', 'lo', 'Lao', 'lao'),
'lat': ISOCodeData('', 'la', 'Latin', 'latin'),
'lav': ISOCodeData('', 'lv', 'Latvian', 'letton'),
'lez': ISOCodeData('', '', 'Lezghian', 'lezghien'),
'lim': ISOCodeData(
'',
'li',
'Limburgan; Limburger; Limburgish',
'limbourgeois',
),
'lin': ISOCodeData('', 'ln', 'Lingala', 'lingala'),
'lit': ISOCodeData('', 'lt', 'Lithuanian', 'lituanien'),
'lol': ISOCodeData('', '', 'Mongo', 'mongo'),
'loz': ISOCodeData('', '', 'Lozi', 'lozi'),
'ltz': ISOCodeData(
'',
'lb',
'Luxembourgish; Letzeburgesch',
'luxembourgeois',
),
'lua': ISOCodeData('', '', 'Luba-Lulua', 'luba-lulua'),
'lub': ISOCodeData('', 'lu', 'Luba-Katanga', 'luba-katanga'),
'lug': ISOCodeData('', 'lg', 'Ganda', 'ganda'),
'lui': ISOCodeData('', '', 'Luiseno', 'luiseno'),
'lun': ISOCodeData('', '', 'Lunda', 'lunda'),
'luo': ISOCodeData(
'',
'',
'Luo (Kenya and Tanzania)',
'luo (Kenya et Tanzanie)',
),
'lus': ISOCodeData('', '', 'Lushai', 'lushai'),
'mac': ISOCodeData('mkd', 'mk', 'Macedonian', 'macédonien'),
'mad': ISOCodeData('', '', 'Madurese', 'madourais'),
'mag': ISOCodeData('', '', 'Magahi', 'magahi'),
'mah': ISOCodeData('', 'mh', 'Marshallese', 'marshall'),
'mai': ISOCodeData('', '', 'Maithili', 'maithili'),
'mak': ISOCodeData('', '', 'Makasar', 'makassar'),
'mal': ISOCodeData('', 'ml', 'Malayalam', 'malayalam'),
'man': ISOCodeData('', '', 'Mandingo', 'mandingue'),
'mao': ISOCodeData('mri', 'mi', 'Maori', 'maori'),
'map': ISOCodeData(
'',
'',
'Austronesian languages',
'austronésiennes, langues',
),
'mar': ISOCodeData('', 'mr', 'Marathi', 'marathe'),
'mas': ISOCodeData('', '', 'Masai', 'massaï'),
'may': ISOCodeData('msa', 'ms', 'Malay', 'malais'),
'mdf': ISOCodeData('', '', 'Moksha', 'moksa'),
'mdr': ISOCodeData('', '', 'Mandar', 'mandar'),
'men': ISOCodeData('', '', 'Mende', 'mendé'),
'mga': ISOCodeData(
'',
'',
'Irish, Middle (900-1200)',
'irlandais moyen (900-1200)',
),
'mic': ISOCodeData('', '', "Mi'kmaq; Micmac", "mi'kmaq; micmac"),
'min': ISOCodeData('', '', 'Minangkabau', 'minangkabau'),
'mis': ISOCodeData('', '', 'Uncoded languages', 'langues non codées'),
'mkh': ISOCodeData('', '', 'Mon-Khmer languages', 'môn-khmer, langues'),
'mlg': ISOCodeData('', 'mg', 'Malagasy', 'malgache'),
'mlt': ISOCodeData('', 'mt', 'Maltese', 'maltais'),
'mnc': ISOCodeData('', '', 'Manchu', 'mandchou'),
'mni': ISOCodeData('', '', 'Manipuri', 'manipuri'),
'mno': ISOCodeData('', '', 'Manobo languages', 'manobo, langues'),
'moh': ISOCodeData('', '', 'Mohawk', 'mohawk'),
'mon': ISOCodeData('', 'mn', 'Mongolian', 'mongol'),
'mos': ISOCodeData('', '', 'Mossi', 'moré'),
'mul': ISOCodeData('', '', 'Multiple languages', 'multilingue'),
'mun': ISOCodeData('', '', 'Munda languages', 'mounda, langues'),
'mus': ISOCodeData('', '', 'Creek', 'muskogee'),
'mwl': ISOCodeData('', '', 'Mirandese', 'mirandais'),
'mwr': ISOCodeData('', '', 'Marwari', 'marvari'),
'myn': ISOCodeData('', '', 'Mayan languages', 'maya, langues'),
'myv': ISOCodeData('', '', 'Erzya', 'erza'),
'nah': ISOCodeData('', '', 'Nahuatl languages', 'nahuatl, langues'),
'nai': ISOCodeData(
'',
'',
'North American Indian languages',
'nord-amérindiennes, langues',
),
'nap': ISOCodeData('', '', 'Neapolitan', 'napolitain'),
'nau': ISOCodeData('', 'na', 'Nauru', 'nauruan'),
'nav': ISOCodeData('', 'nv', 'Navajo; Navaho', 'navaho'),
'nbl': ISOCodeData(
'',
'nr',
'Ndebele, South; South Ndebele',
'ndébélé du Sud',
),
'nde': ISOCodeData(
'',
'nd',
'Ndebele, North; North Ndebele',
'ndébélé du Nord',
),
'ndo': ISOCodeData('', 'ng', 'Ndonga', 'ndonga'),
'nds': ISOCodeData(
'',
'',
'Low German; Low Saxon; German, Low; Saxon, Low',
'bas allemand; bas saxon; allemand, bas; saxon, bas',
),
'nep': ISOCodeData('', 'ne', 'Nepali', 'népalais'),
'new': ISOCodeData('', '', 'Nepal Bhasa; Newari', 'nepal bhasa; newari'),
'nia': ISOCodeData('', '', 'Nias', 'nias'),
'nic': ISOCodeData(
'',
'',
'Niger-Kordofanian languages',
'nigéro-kordofaniennes, langues',
),
'niu': ISOCodeData('', '', 'Niuean', 'niué'),
'nno': ISOCodeData(
'',
'nn',
'Norwegian Nynorsk; Nynorsk, Norwegian',
'norvégien nynorsk; nynorsk, norvégien',
),
'nob': ISOCodeData(
'',
'nb',
'Bokmål, Norwegian; Norwegian Bokmål',
'norvégien bokmål',
),
'nog': ISOCodeData('', '', 'Nogai', 'nogaï; nogay'),
'non': ISOCodeData('', '', 'Norse, Old', 'norrois, vieux'),
'nor': ISOCodeData('', 'no', 'Norwegian', 'norvégien'),
'nqo': ISOCodeData('', '', "N'Ko", "n'ko"),
'nso': ISOCodeData(
'',
'',
'Pedi; Sepedi; Northern Sotho',
'pedi; sepedi; sotho du Nord',
),
'nub': ISOCodeData('', '', 'Nubian languages', 'nubiennes, langues'),
'nwc': ISOCodeData(
'',
'',
'Classical Newari; Old Newari; Classical Nepal Bhasa',
'newari classique',
),
'nya': ISOCodeData(
'',
'ny',
'Chichewa; Chewa; Nyanja',
'chichewa; chewa; nyanja',
),
'nym': ISOCodeData('', '', 'Nyamwezi', 'nyamwezi'),
'nyn': ISOCodeData('', '', 'Nyankole', 'nyankolé'),
'nyo': ISOCodeData('', '', 'Nyoro', 'nyoro'),
'nzi': ISOCodeData('', '', 'Nzima', 'nzema'),
'oci': ISOCodeData(
'',
'oc',
'Occitan (post 1500)',
'occitan (après 1500)',
),
'oji': ISOCodeData('', 'oj', 'Ojibwa', 'ojibwa'),
'ori': ISOCodeData('', 'or', 'Oriya', 'oriya'),
'orm': ISOCodeData('', 'om', 'Oromo', 'galla'),
'osa': ISOCodeData('', '', 'Osage', 'osage'),
'oss': ISOCodeData('', 'os', 'Ossetian; Ossetic', 'ossète'),
'ota': ISOCodeData(
'',
'',
'Turkish, Ottoman (1500-1928)',
'turc ottoman (1500-1928)',
),
'oto': ISOCodeData('', '', 'Otomian languages', 'otomi, langues'),
'paa': ISOCodeData('', '', 'Papuan languages', 'papoues, langues'),
'pag': ISOCodeData('', '', 'Pangasinan', 'pangasinan'),
'pal': ISOCodeData('', '', 'Pahlavi', 'pahlavi'),
'pam': ISOCodeData('', '', 'Pampanga; Kapampangan', 'pampangan'),
'pan': ISOCodeData('', 'pa', 'Panjabi; Punjabi', 'pendjabi'),
'pap': ISOCodeData('', '', 'Papiamento', 'papiamento'),
'pau': ISOCodeData('', '', 'Palauan', 'palau'),
'peo': ISOCodeData(
'',
'',
'Persian, Old (ca.600-400 B.C.)',
'perse, vieux (ca. 600-400 av. J.-C.)',
),
'per': ISOCodeData('fas', 'fa', 'Persian', 'persan'),
'phi': ISOCodeData(
'',
'',
'Philippine languages',
'philippines, langues',
),
'phn': ISOCodeData('', '', 'Phoenician', 'phénicien'),
'pli': ISOCodeData('', 'pi', 'Pali', 'pali'),
'pol': ISOCodeData('', 'pl', 'Polish', 'polonais'),
'pon': ISOCodeData('', '', 'Pohnpeian', 'pohnpei'),
'por': ISOCodeData('', 'pt', 'Portuguese', 'portugais'),
'pra': ISOCodeData('', '', 'Prakrit languages', 'prâkrit, langues'),
'pro': ISOCodeData(
'',
'',
'Provençal, Old (to 1500); Occitan, Old (to 1500)',
"provençal ancien (jusqu'à 1500); occitan ancien (jusqu'à 1500)",
),
'pus': ISOCodeData('', 'ps', 'Pushto; Pashto', 'pachto'),
'qaa': ISOCodeData(
'',
'',
'Reserved for local use',
"réservée à l'usage local",
),
'que': ISOCodeData('', 'qu', 'Quechua', 'quechua'),
'raj': ISOCodeData('', '', 'Rajasthani', 'rajasthani'),
'rap': ISOCodeData('', '', 'Rapanui', 'rapanui'),
'rar': ISOCodeData(
'',
'',
'Rarotongan; Cook Islands Maori',
'rarotonga; maori des îles Cook',
),
'roa': ISOCodeData('', '', 'Romance languages', 'romanes, langues'),
'roh': ISOCodeData('', 'rm', 'Romansh', 'romanche'),
'rom': ISOCodeData('', '', 'Romany', 'tsigane'),
'rum': ISOCodeData(
'ron',
'ro',
'Romanian; Moldavian; Moldovan',
'roumain; moldave',
),
'run': ISOCodeData('', 'rn', 'Rundi', 'rundi'),
'rup': ISOCodeData(
'',
'',
'Aromanian; Arumanian; Macedo-Romanian',
'aroumain; macédo-roumain',
),
'rus': ISOCodeData('', 'ru', 'Russian', 'russe'),
'sad': ISOCodeData('', '', 'Sandawe', 'sandawe'),
'sag': ISOCodeData('', 'sg', 'Sango', 'sango'),
'sah': ISOCodeData('', '', 'Yakut', 'iakoute'),
'sai': ISOCodeData(
'',
'',
'South American Indian languages',
'sud-amérindiennes, langues',
),
'sal': ISOCodeData('', '', 'Salishan languages', 'salishennes, langues'),
'sam': ISOCodeData('', '', 'Samaritan Aramaic', 'samaritain'),
'san': ISOCodeData('', 'sa', 'Sanskrit', 'sanskrit'),
'sas': ISOCodeData('', '', 'Sasak', 'sasak'),
'sat': ISOCodeData('', '', 'Santali', 'santal'),
'scn': ISOCodeData('', '', 'Sicilian', 'sicilien'),
'sco': ISOCodeData('', '', 'Scots', 'écossais'),
'sel': ISOCodeData('', '', 'Selkup', 'selkoupe'),
'sem': ISOCodeData('', '', 'Semitic languages', 'sémitiques, langues'),
'sga': ISOCodeData(
'',
'',
'Irish, Old (to 900)',
"irlandais ancien (jusqu'à 900)",
),
'sgn': ISOCodeData('', '', 'Sign Languages', 'langues des signes'),
'shn': ISOCodeData('', '', 'Shan', 'chan'),
'sid': ISOCodeData('', '', 'Sidamo', 'sidamo'),
'sin': ISOCodeData('', 'si', 'Sinhala; Sinhalese', 'singhalais'),
'sio': ISOCodeData('', '', 'Siouan languages', 'sioux, langues'),
'sit': ISOCodeData(
'',
'',
'Sino-Tibetan languages',
'sino-tibétaines, langues',
),
'sla': ISOCodeData('', '', 'Slavic languages', 'slaves, langues'),
'slo': ISOCodeData('slk', 'sk', 'Slovak', 'slovaque'),
'slv': ISOCodeData('', 'sl', 'Slovenian', 'slovène'),
'sma': ISOCodeData('', '', 'Southern Sami', 'sami du Sud'),
'sme': ISOCodeData('', 'se', 'Northern Sami', 'sami du Nord'),
'smi': ISOCodeData('', '', 'Sami languages', 'sames, langues'),
'smj': ISOCodeData('', '', 'Lule Sami', 'sami de Lule'),
'smn': ISOCodeData('', '', 'Inari Sami', "sami d'Inari"),
'smo': ISOCodeData('', 'sm', 'Samoan', 'samoan'),
'sms': ISOCodeData('', '', 'Skolt Sami', 'sami skolt'),
'sna': ISOCodeData('', 'sn', 'Shona', 'shona'),
'snd': ISOCodeData('', 'sd', 'Sindhi', 'sindhi'),
'snk': ISOCodeData('', '', 'Soninke', 'soninké'),
'sog': ISOCodeData('', '', 'Sogdian', 'sogdien'),
'som': ISOCodeData('', 'so', 'Somali', 'somali'),
'son': ISOCodeData('', '', 'Songhai languages', 'songhai, langues'),
'sot': ISOCodeData('', 'st', 'Sotho, Southern', 'sotho du Sud'),
'spa': ISOCodeData('', 'es', 'Spanish; Castilian', 'espagnol; castillan'),
'srd': ISOCodeData('', 'sc', 'Sardinian', 'sarde'),
'srn': ISOCodeData('', '', 'Sranan Tongo', 'sranan tongo'),
'srp': ISOCodeData('', 'sr', 'Serbian', 'serbe'),
'srr': ISOCodeData('', '', 'Serer', 'sérère'),
'ssa': ISOCodeData(
'',
'',
'Nilo-Saharan languages',
'nilo-sahariennes, langues',
),
'ssw': ISOCodeData('', 'ss', 'Swati', 'swati'),
'suk': ISOCodeData('', '', 'Sukuma', 'sukuma'),
'sun': ISOCodeData('', 'su', 'Sundanese', 'soundanais'),
'sus': ISOCodeData('', '', 'Susu', 'soussou'),
'sux': ISOCodeData('', '', 'Sumerian', 'sumérien'),
'swa': ISOCodeData('', 'sw', 'Swahili', 'swahili'),
'swe': ISOCodeData('', 'sv', 'Swedish', 'suédois'),
'syc': ISOCodeData('', '', 'Classical Syriac', 'syriaque classique'),
'syr': ISOCodeData('', '', 'Syriac', 'syriaque'),
'tah': ISOCodeData('', 'ty', 'Tahitian', 'tahitien'),
'tai': ISOCodeData('', '', 'Tai languages', 'tai, langues'),
'tam': ISOCodeData('', 'ta', 'Tamil', 'tamoul'),
'tat': ISOCodeData('', 'tt', 'Tatar', 'tatar'),
'tel': ISOCodeData('', 'te', 'Telugu', 'télougou'),
'tem': ISOCodeData('', '', 'Timne', 'temne'),
'ter': ISOCodeData('', '', 'Tereno', 'tereno'),
'tet': ISOCodeData('', '', 'Tetum', 'tetum'),
'tgk': ISOCodeData('', 'tg', 'Tajik', 'tadjik'),
'tgl': ISOCodeData('', 'tl', 'Tagalog', 'tagalog'),
'tha': ISOCodeData('', 'th', 'Thai', 'thaï'),
'tib': ISOCodeData('bod', 'bo', 'Tibetan', 'tibétain'),
'tig': ISOCodeData('', '', 'Tigre', 'tigré'),
'tir': ISOCodeData('', 'ti', 'Tigrinya', 'tigrigna'),
'tiv': ISOCodeData('', '', 'Tiv', 'tiv'),
'tkl': ISOCodeData('', '', 'Tokelau', 'tokelau'),
'tlh': ISOCodeData('', '', 'Klingon; tlhIngan-Hol', 'klingon'),
'tli': ISOCodeData('', '', 'Tlingit', 'tlingit'),
'tmh': ISOCodeData('', '', 'Tamashek', 'tamacheq'),
'tog': ISOCodeData('', '', 'Tonga (Nyasa)', 'tonga (Nyasa)'),
'ton': ISOCodeData(
'',
'to',
'Tonga (Tonga Islands)',
'tongan (Îles Tonga)',
),
'tpi': ISOCodeData('', '', 'Tok Pisin', 'tok pisin'),
'tsi': ISOCodeData('', '', 'Tsimshian', 'tsimshian'),
'tsn': ISOCodeData('', 'tn', 'Tswana', 'tswana'),
'tso': ISOCodeData('', 'ts', 'Tsonga', 'tsonga'),
'tuk': ISOCodeData('', 'tk', 'Turkmen', 'turkmène'),
'tum': ISOCodeData('', '', 'Tumbuka', 'tumbuka'),
'tup': ISOCodeData('', '', 'Tupi languages', 'tupi, langues'),
'tur': ISOCodeData('', 'tr', 'Turkish', 'turc'),
'tut': ISOCodeData('', '', 'Altaic languages', 'altaïques, langues'),
'tvl': ISOCodeData('', '', 'Tuvalu', 'tuvalu'),
'twi': ISOCodeData('', 'tw', 'Twi', 'twi'),
'tyv': ISOCodeData('', '', 'Tuvinian', 'touva'),
'udm': ISOCodeData('', '', 'Udmurt', 'oudmourte'),
'uga': ISOCodeData('', '', 'Ugaritic', 'ougaritique'),
'uig': ISOCodeData('', 'ug', 'Uighur; Uyghur', 'ouïgour'),
'ukr': ISOCodeData('', 'uk', 'Ukrainian', 'ukrainien'),
'umb': ISOCodeData('', '', 'Umbundu', 'umbundu'),
'und': ISOCodeData('', '', 'Undetermined', 'indéterminée'),
'urd': ISOCodeData('', 'ur', 'Urdu', 'ourdou'),
'uzb': ISOCodeData('', 'uz', 'Uzbek', 'ouszbek'),
'vai': ISOCodeData('', '', 'Vai', 'vaï'),
'ven': ISOCodeData('', 've', 'Venda', 'venda'),
'vie': ISOCodeData('', 'vi', 'Vietnamese', 'vietnamien'),
'vol': ISOCodeData('', 'vo', 'Volapük', 'volapük'),
'vot': ISOCodeData('', '', 'Votic', 'vote'),
'wak': ISOCodeData('', '', 'Wakashan languages', 'wakashanes, langues'),
'wal': ISOCodeData('', '', 'Wolaitta; Wolaytta', 'wolaitta; wolaytta'),
'war': ISOCodeData('', '', 'Waray', 'waray'),
'was': ISOCodeData('', '', 'Washo', 'washo'),
'wel': ISOCodeData('cym', 'cy', 'Welsh', 'gallois'),
'wen': ISOCodeData('', '', 'Sorbian languages', 'sorabes, langues'),
'wln': ISOCodeData('', 'wa', 'Walloon', 'wallon'),
'wol': ISOCodeData('', 'wo', 'Wolof', 'wolof'),
'xal': ISOCodeData('', '', 'Kalmyk; Oirat', 'kalmouk; oïrat'),
'xho': ISOCodeData('', 'xh', 'Xhosa', 'xhosa'),
'yao': ISOCodeData('', '', 'Yao', 'yao'),
'yap': ISOCodeData('', '', 'Yapese', 'yapois'),
'yid': ISOCodeData('', 'yi', 'Yiddish', 'yiddish'),
'yor': ISOCodeData('', 'yo', 'Yoruba', 'yoruba'),
'ypk': ISOCodeData('', '', 'Yupik languages', 'yupik, langues'),
'zap': ISOCodeData('', '', 'Zapotec', 'zapotèque'),
'zbl': ISOCodeData(
'',
'',
'Blissymbols; Blissymbolics; Bliss',
'symboles Bliss; Bliss',
),
'zen': ISOCodeData('', '', 'Zenaga', 'zenaga'),
'zgh': ISOCodeData(
'',
'',
'Standard Moroccan Tamazight',
'amazighe standard marocain',
),
'zha': ISOCodeData('', 'za', 'Zhuang; Chuang', 'zhuang; chuang'),
'znd': ISOCodeData('', '', 'Zande languages', 'zandé, langues'),
'zul': ISOCodeData('', 'zu', 'Zulu', 'zoulou'),
'zun': ISOCodeData('', '', 'Zuni', 'zuni'),
'zxx': ISOCodeData(
'',
'',
'No linguistic content; Not applicable',
'pas de contenu linguistique; non applicable',
),
'zza': ISOCodeData(
'',
'',
'Zaza; Dimili; Dimli; Kirdki; Kirmanjki; Zazaki',
'zaza; dimili; dimli; kirdki; kirmanjki; zazaki',
),
}
def iso_639_2_from_3(iso3: str) -> str:
    """Look up the ``alpha_2`` code stored for a three-letter language code.

    Args:
        iso3: Key to look up in the ``ISO_639_3`` table.

    Returns:
        The ``alpha_2`` attribute of the matching table entry, or an empty
        string when ``iso3`` is not a key of the table.
    """
    # NOTE(review): judging by the table rows, ``alpha_2`` appears to be the
    # two-letter code and is itself empty for many languages - confirm
    # against the ISOCodeData definition.
    entry = ISO_639_3.get(iso3)
    return "" if entry is None else entry.alpha_2
================================================
FILE: src/ocrmypdf/models/__init__.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""OCRmyPDF models for plugin options and cross-cutting concerns."""
from __future__ import annotations
================================================
FILE: src/ocrmypdf/models/ocr_element.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""OCR element dataclasses for representing OCR output structure.
This module provides a generic, engine-agnostic representation of OCR output.
The OcrElement dataclass can represent structural units from any OCR source
(hOCR, ALTO, custom engines, etc.) in a unified format suitable for rendering.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Literal
@dataclass
class BoundingBox:
    """Axis-aligned rectangle expressed in pixel coordinates.

    The origin is the top-left corner (the convention used by images and
    hOCR), so a valid box satisfies ``left <= right`` and ``top <= bottom``.

    Attributes:
        left: x-coordinate of the left edge
        top: y-coordinate of the top edge
        right: x-coordinate of the right edge
        bottom: y-coordinate of the bottom edge

    Raises:
        ValueError: At construction time, if ``right < left`` or
            ``bottom < top``.
    """

    left: float
    top: float
    right: float
    bottom: float

    def __post_init__(self):
        """Reject boxes whose opposite edges are inverted."""
        left, right = self.left, self.right
        if right < left:
            raise ValueError(
                f"Invalid bounding box: right ({right}) < left ({left})"
            )
        top, bottom = self.top, self.bottom
        if bottom < top:
            raise ValueError(
                f"Invalid bounding box: bottom ({bottom}) < top ({top})"
            )

    @property
    def width(self) -> float:
        """Horizontal extent: ``right - left``."""
        return self.right - self.left

    @property
    def height(self) -> float:
        """Vertical extent: ``bottom - top``."""
        return self.bottom - self.top
@dataclass
class Baseline:
    """Linear description of the line on which text characters sit.

    The baseline is modelled as ``y = slope * x + intercept``, measured
    relative to the bottom-left corner of the text line's bounding box.

    This mirrors hOCR, where the baseline is given relative to the bottom of
    the line's bbox: the intercept is the vertical offset from that bottom
    edge and the slope expresses rotation (positive = ascending
    left-to-right).

    Attributes:
        slope: Rise over run of the baseline
        intercept: Vertical offset of the baseline from the bbox bottom
    """

    slope: float = 0.0
    intercept: float = 0.0
@dataclass
class FontInfo:
    """Describes the font used to render a piece of text.

    Every field is optional: the name and size default to ``None`` and all
    style flags default to ``False``.

    Attributes:
        name: Font family name (e.g., "Times New Roman")
        size: Font size in points
        bold: True if the font is bold
        italic: True if the font is italic
        monospace: True if the font is monospace
        serif: True if the font is serif (as opposed to sans-serif)
        smallcaps: True if the font uses small caps
        underline: True if the text is underlined
    """

    name: str | None = None
    size: float | None = None
    bold: bool = False
    italic: bool = False
    monospace: bool = False
    serif: bool = False
    smallcaps: bool = False
    underline: bool = False
@dataclass
class OcrElement:
    """Engine-agnostic node in a tree of OCR output.

    Elements nest to form a tree: pages hold paragraphs, paragraphs hold
    lines, lines hold words, and so on. The exact hierarchy is up to the OCR
    engine; this one dataclass is used for every level.

    ``ocr_class`` borrows hOCR's naming (ocr_page, ocr_par, ocr_line,
    ocrx_word, ...) as a shared vocabulary, and output from other sources can
    be mapped onto these names.

    Common hOCR classes:
        - ocr_page: The root element for a page
        - ocr_carea: A content/column area
        - ocr_par: A paragraph
        - ocr_line: A line of text
        - ocr_header: A header line
        - ocr_footer: A footer line
        - ocr_caption: A caption line
        - ocr_textfloat: A floating text element
        - ocrx_word: A single word

    Attributes:
        ocr_class: The element type (e.g., "ocr_page", "ocr_line", "ocrx_word")
        bbox: Axis-aligned bounding box in source pixel coordinates (top-left origin)
        poly: Polygon vertices for oriented/non-rectangular bounds
        text: Text content (primarily for leaf nodes like words)
        confidence: OCR confidence score (0.0-1.0)
        children: Child elements (hierarchical structure)
        direction: Text direction ("ltr" or "rtl")
        language: Language code (e.g., "eng", "deu", "chi_sim")
        baseline: Text baseline information (slope and intercept)
        textangle: Text rotation angle in degrees (counter-clockwise from horizontal)
        font: Font information (name, size, style)
        dpi: Image resolution in dots per inch (typically for page-level)
        page_number: Physical page number (0-indexed)
        logical_page_number: Logical page number (as printed on the page)
    """

    ocr_class: str

    # Geometry
    bbox: BoundingBox | None = None
    poly: list[tuple[float, float]] | None = None

    # Recognized text
    text: str = ""

    # Recognition confidence, 0.0-1.0
    confidence: float | None = None

    # Nested elements; each instance gets its own fresh list
    children: list[OcrElement] = field(default_factory=list)

    # Writing direction and language
    direction: Literal["ltr", "rtl"] | None = None
    language: str | None = None

    # Baseline (meaningful for line elements)
    baseline: Baseline | None = None

    # Counter-clockwise rotation, in degrees
    textangle: float | None = None

    # Font description
    font: FontInfo | None = None

    # Properties that normally only apply to page elements
    dpi: float | None = None
    page_number: int | None = None
    logical_page_number: int | None = None

    def iter_by_class(self, *ocr_classes: str) -> list[OcrElement]:
        """Collect every element of the given class(es) in this subtree.

        The element itself is included when its own class matches.

        Args:
            *ocr_classes: One or more ocr_class values to match

        Returns:
            List of matching elements in depth-first order (a parent comes
            before its children)
        """
        matches = [self] if self.ocr_class in ocr_classes else []
        for child in self.children:
            matches += child.iter_by_class(*ocr_classes)
        return matches

    def find_by_class(self, *ocr_classes: str) -> OcrElement | None:
        """Locate the first element of the given class(es) in this subtree.

        The element itself is returned when its own class matches; otherwise
        the children are searched depth-first, in order.

        Args:
            *ocr_classes: One or more ocr_class values to match

        Returns:
            The first matching element, or None if there is no match
        """
        if self.ocr_class in ocr_classes:
            return self
        # Lazy generator: the search stops at the first child that has a hit.
        hits = (child.find_by_class(*ocr_classes) for child in self.children)
        return next((hit for hit in hits if hit is not None), None)

    def get_text_recursive(self) -> str:
        """Return the text of this element or, failing that, of its subtree.

        When this element has non-empty ``text`` of its own, that text is
        returned and the children are not consulted.

        Returns:
            The element's own text, or the non-empty texts of its children
            joined by single spaces
        """
        if self.text:
            return self.text
        pieces = (child.get_text_recursive() for child in self.children)
        return " ".join(filter(None, pieces))

    @property
    def words(self) -> list[OcrElement]:
        """All word elements (ocrx_word) within this element's subtree."""
        return self.iter_by_class("ocrx_word")

    @property
    def lines(self) -> list[OcrElement]:
        """All line-like elements within this element's subtree."""
        line_classes = (
            "ocr_line",
            "ocr_header",
            "ocr_footer",
            "ocr_caption",
            "ocr_textfloat",
        )
        return self.iter_by_class(*line_classes)

    @property
    def paragraphs(self) -> list[OcrElement]:
        """All paragraph elements (ocr_par) within this element's subtree."""
        return self.iter_by_class("ocr_par")
# Type alias for the text-direction values used by OcrElement.direction:
# "ltr" (left-to-right) or "rtl" (right-to-left).
TextDirection = Literal["ltr", "rtl"]
# hOCR class constants for convenience
class OcrClass:
    """Named constants for the OCR element class strings in common use.

    The values are hOCR class names; ``LINE_TYPES`` groups every class that
    denotes a line-like element.
    """

    # Page-level
    PAGE = "ocr_page"
    CAREA = "ocr_carea"

    # Block-level
    PARAGRAPH = "ocr_par"

    # Line-level
    LINE = "ocr_line"
    HEADER = "ocr_header"
    FOOTER = "ocr_footer"
    CAPTION = "ocr_caption"
    TEXTFLOAT = "ocr_textfloat"

    # Word-level
    WORD = "ocrx_word"

    # Character-level
    CHAR = "ocrx_cinfo"

    # Every class that should be treated as a line of text
    LINE_TYPES = frozenset((LINE, HEADER, FOOTER, CAPTION, TEXTFLOAT))
================================================
FILE: src/ocrmypdf/optimize.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Post-processing image optimization of OCR PDFs."""
from __future__ import annotations
import logging
import sys
import tempfile
import threading
from collections.abc import Callable, Iterator, MutableSet, Sequence
from os import fspath
from pathlib import Path
from typing import Any, NamedTuple, NewType
from zlib import compress
import img2pdf
from packaging.version import Version
from pikepdf import (
Array,
Dictionary,
Name,
Object,
ObjectStreamMode,
Pdf,
PdfError,
PdfImage,
Stream,
UnsupportedImageTypeError,
)
from pikepdf.models.image import HifiPrintImageNotTranscodableError
from PIL import Image
from ocrmypdf._concurrent import Executor, SerialExecutor
from ocrmypdf._exec import ghostscript, jbig2enc, pngquant
from ocrmypdf._jobcontext import PdfContext
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.exceptions import OutputFileAccessError
from ocrmypdf.helpers import IMG2PDF_KWARGS, safe_symlink
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# NOTE(review): presumably the fallback quality settings for JPEG and PNG
# recompression - the use sites are not visible here; confirm.
DEFAULT_JPEG_QUALITY = 75
DEFAULT_PNG_QUALITY = 70
# NOTE(review): unit and purpose of this threshold are not visible in this
# part of the file - confirm at the use site.
FLATE_JPEG_THRESHOLD = 10000

# Distinct static type for PDF object (xref) numbers; a plain int at runtime.
Xref = NewType('Xref', int)
class XrefExt(NamedTuple):
    """Pairs a PDF object number with the extension of its extracted file.

    Attributes:
        xref: Object number of the image in the PDF
        ext: File extension, including the leading dot (e.g. ``'.png'``)
    """

    xref: Xref
    ext: str
def img_name(root: Path, xref: Xref, ext: str) -> Path:
    """Build the path of the working file for an image.

    Args:
        root: Directory that holds the extracted images.
        xref: Object number of the image; zero-padded to eight digits.
        ext: File extension, appended verbatim (include the leading dot).

    Returns:
        ``root`` joined with ``<xref, 8 digits><ext>``.
    """
    filename = f'{xref:08d}{ext}'
    return root.joinpath(filename)
def png_name(root: Path, xref: Xref) -> Path:
    """Build the path of the PNG working file for the image ``xref``.

    Equivalent to ``img_name`` with the ``'.png'`` extension.
    """
    return img_name(root=root, xref=xref, ext='.png')
def jpg_name(root: Path, xref: Xref) -> Path:
    """Build the path of the JPEG working file for the image ``xref``.

    Equivalent to ``img_name`` with the ``'.jpg'`` extension.
    """
    return img_name(root=root, xref=xref, ext='.jpg')
def extract_image_filter(
    image: Stream, xref: Xref
) -> tuple[PdfImage, tuple[Name, Object]] | None:
    """Determine if an image is extractable.

    Runs a sequence of checks and returns ``None`` as soon as one of them
    says the image should be left alone. Most rejections are logged at debug
    level together with the xref.

    Args:
        image: The candidate stream object.
        xref: Object number of ``image``; used only in log messages.

    Returns:
        ``(pim, filtdp)`` where ``pim`` is the ``PdfImage`` wrapper for
        ``image`` and ``filtdp`` is the (filter, decode parameters) entry
        selected from ``pim.filter_decodeparms``; ``None`` if the image
        should be skipped.
    """
    if image.Subtype != Name.Image:
        return None
    # Streams with a non-integer /Length or under 100 bytes are skipped.
    if not isinstance(image.Length, int) or image.Length < 100:
        log.debug(f"xref {xref}: skipping image with small stream size")
        return None
    if (
        not isinstance(image.Width, int)
        or not isinstance(image.Height, int)
        or image.Width < 8
        or image.Height < 8
    ):  # Issue 732
        log.debug(f"xref {xref}: skipping image with unusually small dimensions")
        return None

    pim = PdfImage(image)

    if len(pim.filter_decodeparms) > 1:
        first_filtdp = pim.filter_decodeparms[0]
        second_filtdp = pim.filter_decodeparms[1]
        # The only multi-filter chain accepted is exactly two filters:
        # /FlateDecode with decode parameters present and /Predictor 1 (or
        # absent), followed by /DCTDecode without decode parameters. Such an
        # image is then handled through its /DCTDecode entry.
        if (
            len(pim.filter_decodeparms) == 2
            and first_filtdp[0] == Name.FlateDecode
            and first_filtdp[1] is not None
            and first_filtdp[1].get(Name.Predictor, 1) == 1
            and second_filtdp[0] == Name.DCTDecode
            and not second_filtdp[1]
        ):
            log.debug(
                f"xref {xref}: found image compressed as /FlateDecode /DCTDecode, "
                "marked for JPEG optimization"
            )
            filtdp = pim.filter_decodeparms[1]
        else:
            log.debug(f"xref {xref}: skipping image with multiple compression filters")
            return None
    else:
        filtdp = pim.filter_decodeparms[0]

    if pim.bits_per_component > 8:
        log.debug(f"xref {xref}: skipping wide gamut image")
        return None  # Don't mess with wide gamut images

    if filtdp[0] == Name.JPXDecode:
        log.debug(f"xref {xref}: skipping JPEG2000 image")
        return None  # Don't do JPEG2000

    # A /K value >= 0 (also the default when /K is absent) is treated as
    # CCITT Group 3 here.
    if filtdp[0] == Name.CCITTFaxDecode and filtdp[1].get('/K', 0) >= 0:
        log.debug(f"xref {xref}: skipping CCITT Group 3 image")
        return None  # pikepdf doesn't support Group 3 yet

    if Name.Decode in image:
        log.debug(f"xref {xref}: skipping image with Decode table")
        return None  # Don't mess with custom Decode tables

    if image.get(Name.SMask, Dictionary()).get(Name.Matte, None) is not None:
        # https://github.com/ocrmypdf/OCRmyPDF/issues/1536
        # Do not attempt to optimize images that have a SMask with a Matte.
        # That means alpha channel pre-blending is used, and we're not prepared
        # to deal with the complexities of that.
        log.debug(f"xref {xref}: skipping image whose SMask has Matte")
        return None

    return pim, filtdp
def extract_image_jbig2(
    *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options
) -> XrefExt | None:
    """Extract a 1-bit image to a file so it can later be encoded as JBIG2.

    The image is written to ``root`` under its zero-padded xref with a
    ``.prejbig2<ext>`` suffix, where ``<ext>`` is whatever
    ``PdfImage.extract_to`` reports.

    Args:
        pdf: Unused here; part of the common ``extract_fn`` signature.
        root: Directory that receives the extracted file.
        image: The candidate image stream.
        xref: Object number of ``image``.
        options: Unused here; part of the common ``extract_fn`` signature.

    Returns:
        ``XrefExt(xref, ".prejbig2" + ext)`` when a file was written.
        ``None`` when ``extract_image_filter`` rejects the image, when it is
        not 1 bit per component, is already JBIG2-encoded, jbig2enc is not
        available, it has neither a colorspace nor the image-mask flag, or
        pikepdf cannot extract it.

    Raises:
        NotImplementedError: Re-raised from extraction unless the message
            mentions ``/Decode``.
    """
    del options  # unused arg

    result = extract_image_filter(image, xref)
    if result is None:
        return None
    pim, filtdp = result

    if (
        pim.bits_per_component == 1
        and filtdp[0] != Name.JBIG2Decode
        and jbig2enc.available()
    ):
        # Save any colorspace associated with the image, so that we
        # will export a pure 1-bit PNG with no palette or ICC profile.
        # Showing the palette or ICC to jbig2enc will cause it to perform
        # colorspace transform to 1bpp, which will conflict with the palette
        # or ICC if it exists.
        colorspace = pim.obj.get(Name.ColorSpace, None)
        if colorspace is not None or pim.image_mask:
            try:
                # Set to DeviceGray temporarily; we are already at 1 bpc.
                pim.obj.ColorSpace = Name.DeviceGray
                imgname = root / f'{xref:08d}'
                with imgname.open('wb') as f:
                    ext = pim.extract_to(stream=f)
                # Rename the file so it has .prejbig2.ext extension
                # Making it unique avoids problems with Windows if the
                # same image is extracted multiple times
                imgname.rename(imgname.with_suffix(".prejbig2" + ext))
            except NotImplementedError as e:
                if '/Decode' in str(e):
                    log.debug(
                        f"xref {xref}: skipping image with unsupported Decode table"
                    )
                    return None
                raise
            except UnsupportedImageTypeError:
                return None
            finally:
                # Restore image colorspace after temporarily setting it to DeviceGray
                if colorspace is not None:
                    pim.obj.ColorSpace = colorspace
                else:
                    # There was no colorspace originally (image-mask case), so
                    # remove the temporary one again.
                    del pim.obj.ColorSpace
            return XrefExt(xref, ".prejbig2" + ext)
    return None
def _should_optimize_jpeg(options, filtdp):
if options.optimize >= 2:
return True
# Ghostscript 10.6.0+ introduced some sort of JPEG encoding issue.
# To resolve this, re-optimize the JPEG anyway.
return options.optimize < 2 and ghostscript.version() >= Version('10.6.0')
def extract_image_generic(
    *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options
) -> XrefExt | None:
    """Extract an image with more than 1 bit per component for optimization.

    Depending on the image, it is either copied out as a JPEG or rendered to
    a PNG in ``root``:

    - ``/DCTDecode`` images, when ``_should_optimize_jpeg`` agrees, are
      extracted as-is via ``PdfImage.extract_to``.
    - Indexed images in a simple colorspace are saved as PNG, but only when
      ``options.optimize >= 3``.
    - Non-indexed images in a simple colorspace are saved as PNG.

    Args:
        pdf: Unused here; part of the common ``extract_fn`` signature.
        root: Directory that receives the extracted file.
        image: The candidate image stream.
        xref: Object number of ``image``.
        options: Object providing ``optimize`` (the optimization level).

    Returns:
        ``XrefExt`` naming the written file's extension, or ``None`` when
        the image is rejected by ``extract_image_filter``, is 1 bit per
        component, cannot be extracted, or fits none of the cases above.
    """
    result = extract_image_filter(image, xref)
    if result is None:
        return None
    pim, filtdp = result

    # Don't try to PNG-optimize 1bpp images, since JBIG2 does it better.
    # Because of this early return, nothing below ever sees a 1-bit image; a
    # former trailing branch for 1-bit ICCBased images could never run and
    # has been removed.
    if pim.bits_per_component == 1:
        return None

    if filtdp[0] == Name.DCTDecode and _should_optimize_jpeg(options, filtdp):
        try:
            imgname = root / f'{xref:08d}'
            with imgname.open('wb') as f:
                ext = pim.extract_to(stream=f)
            # The extension is only known after extraction, so rename then.
            imgname.rename(imgname.with_suffix(ext))
        except (UnsupportedImageTypeError, HifiPrintImageNotTranscodableError):
            return None
        return XrefExt(xref, ext)
    elif (
        pim.indexed
        and pim.colorspace in pim.SIMPLE_COLORSPACES
        and options.optimize >= 3
    ):
        # Try to improve on indexed images - these are far from low hanging
        # fruit in most cases
        pim.as_pil_image().save(png_name(root, xref))
        return XrefExt(xref, '.png')
    elif not pim.indexed and pim.colorspace in pim.SIMPLE_COLORSPACES:
        # An optimization opportunity here, not currently taken, is directly
        # generating a PNG from compressed data
        try:
            pim.as_pil_image().save(png_name(root, xref))
        except NotImplementedError:
            log.warning("PDF contains an atypical image that cannot be optimized.")
            return None
        return XrefExt(xref, '.png')

    return None
def _find_image_xrefs_container(
pdf: Pdf,
container: Object,
pageno: int,
include_xrefs: MutableSet[Xref],
exclude_xrefs: MutableSet[Xref],
pageno_for_xref: dict[Xref, int],
depth: int = 0,
):
"""Find all image XRefs or Form XObject and add to the include/exclude sets."""
if depth > 10:
log.warning("Recursion depth exceeded in _find_image_xrefs_page")
return
try:
xobjs = container.Resources.XObject
except AttributeError:
return
for _imname, image in dict(xobjs).items():
if image.objgen[1] != 0:
continue # Ignore images in an incremental PDF
xref = Xref(image.objgen[0])
if xref in include_xrefs or xref in exclude_xrefs:
continue # Already processed
if Name.Subtype in image and image.Subtype == Name.Form:
# Recurse into Form XObjects
log.debug(f"Recursing into Form XObject {_imname} in page {pageno}")
_find_image_xrefs_container(
pdf,
image,
pageno,
include_xrefs,
exclude_xrefs,
pageno_for_xref,
depth + 1,
)
continue
if Name.SMask in image:
# Ignore soft masks
smask_xref = Xref(image.SMask.objgen[0])
exclude_xrefs.add(smask_xref)
log.debug(f"xref {smask_xref}: skipping image because it is an SMask")
include_xrefs.add(xref)
log.debug(f"xref {xref}: treating as an optimization candidate")
if xref not in pageno_for_xref:
pageno_for_xref[xref] = pageno
def _find_image_xrefs(pdf: Pdf):
    """Collect the xrefs of all images worth considering for optimization.

    Every page of ``pdf`` is scanned with ``_find_image_xrefs_container``.

    Returns:
        A 2-tuple ``(working_xrefs, pageno_for_xref)``: the set of candidate
        xrefs with the excluded ones removed, and the mapping from xref to
        page number that the scan filled in.
    """
    candidates: MutableSet[Xref] = set()
    excluded: MutableSet[Xref] = set()
    first_page: dict[Xref, int] = {}

    for pageno, page in enumerate(pdf.pages):
        _find_image_xrefs_container(
            pdf, page.obj, pageno, candidates, excluded, first_page
        )

    # An xref can land in both sets (e.g. an image also used as a soft mask);
    # exclusion wins.
    return candidates - excluded, first_page
def extract_images(
    pdf: Pdf,
    root: Path,
    options,
    extract_fn: Callable[..., XrefExt | None],
) -> Iterator[tuple[int, XrefExt]]:
    """Extract images using extract_fn.

    Enumerate images on each page, including those nested inside Form
    XObjects, and look up their xref/ID number in the PDF.

    Exclude images that are soft masks (i.e. alpha transparency related).

    Record the page number on which an image is first used, since images may be
    used on multiple pages (or multiple times on the same page).

    Currently we don't evaluate alternate images or thumbnails.

    extract_fn must decide if it wants to extract the image in this context.
    If it does, a tuple should be returned: (xref, ext) where ext is the file
    extension. extract_fn must also extract the file it finds interesting.

    Args:
        pdf: The PDF to scan.
        root: Directory handed to ``extract_fn`` for its output files.
        options: Passed through to ``extract_fn``.
        extract_fn: Called with keyword arguments ``pdf``, ``root``,
            ``image``, ``xref`` and ``options``.

    Yields:
        ``(pageno, XrefExt(xref, ext))`` for every image that ``extract_fn``
        extracted. An image for which ``extract_fn`` raises is logged with
        its traceback and skipped.
    """
    working_xrefs, pageno_for_xref = _find_image_xrefs(pdf)
    for xref in working_xrefs:
        image = pdf.get_object((xref, 0))
        try:
            result = extract_fn(
                pdf=pdf, root=root, image=image, xref=xref, options=options
            )
        except Exception:  # pylint: disable=broad-except
            # Best effort: one problematic image must not stop the others.
            log.exception(
                f"xref {xref}: While extracting this image, an error occurred"
            )
        else:
            if result:
                _, ext = result
                yield pageno_for_xref[xref], XrefExt(xref, ext)
def extract_images_generic(
    pdf: Pdf, root: Path, options
) -> tuple[list[Xref], list[Xref]]:
    """Extract every image of 2 or more bits per component worth improving.

    Runs ``extract_image_generic`` over the PDF's images and sorts the
    results by the extension of the file that was written.

    Returns:
        ``(jpegs, pngs)``: the xrefs extracted as ``.jpg`` and as ``.png``.
        Results with any other extension are logged but not returned.
    """
    jpegs = []
    pngs = []
    bucket_for_ext = {'.png': pngs, '.jpg': jpegs}
    for _, xref_ext in extract_images(pdf, root, options, extract_image_generic):
        log.debug('%s', xref_ext)
        bucket = bucket_for_ext.get(xref_ext.ext)
        if bucket is not None:
            bucket.append(xref_ext.xref)
    log.debug(f"Optimizable images: JPEGs: {len(jpegs)} PNGs: {len(pngs)}")
    return jpegs, pngs
def extract_images_jbig2(pdf: Pdf, root: Path, options) -> list[XrefExt]:
    """Extract every bitonal image that might be improved by JBIG2 encoding.

    Returns:
        The ``XrefExt`` of each image that ``extract_image_jbig2`` wrote to
        ``root``.
    """
    jbig2_images = [
        xref_ext
        for _pageno, xref_ext in extract_images(
            pdf, root, options, extract_image_jbig2
        )
    ]
    log.debug(f"Optimizable images: JBIG2: {len(jbig2_images)}")
    return jbig2_images
def _produce_jbig2_images(
    jbig2_images: list[XrefExt], root: Path, options, executor: Executor
) -> None:
    """Encode each extracted image as its own lossless JBIG2 file.

    One ``jbig2enc.convert_single`` task is scheduled per image on
    ``executor``, using threads and up to ``options.jobs`` workers. Each
    task receives the working directory, the extracted input file, the
    ``<xref>.jbig2`` output path inside ``root``, and
    ``options.jbig2_threshold``.
    """

    def conversion_tasks():
        # Lazily produce the argument tuple for each conversion.
        for xref, ext in jbig2_images:
            source = img_name(root, xref, ext)
            target = root / f'{xref:08d}.jbig2'
            yield (fspath(root), source, target, options.jbig2_threshold)

    progress = {
        'total': len(jbig2_images),
        'desc': "JBIG2",
        'unit': 'image',
        'disable': not options.progress_bar,
    }
    executor(
        use_threads=True,
        max_workers=options.jobs,
        progress_kwargs=progress,
        task=jbig2enc.convert_single,
        task_arguments=conversion_tasks(),
    )
def convert_to_jbig2(
    pdf: Pdf,
    jbig2_images: list[XrefExt],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Convert images to JBIG2 and insert into PDF.

    Each JBIG2 image is encoded independently using lossless compression.
    No symbol dictionary (JBIG2Globals) is used.

    After encoding, every image object in ``pdf`` is overwritten with the
    contents of its ``<xref>.jbig2`` file under ``root``.
    """
    _produce_jbig2_images(jbig2_images, root, options, executor)

    for xref, _ext in jbig2_images:
        encoded = (root / f'{xref:08d}.jbig2').read_bytes()
        target = pdf.get_object(xref, 0)
        target.write(encoded, filter=Name.JBIG2Decode, decode_parms=None)
def _optimize_jpeg(
    xref: Xref, in_jpg: Path, opt_jpg: Path, jpg_quality: int
) -> tuple[Xref, Path | None]:
    """Re-save a JPEG with optimization and keep it only if it is not larger.

    Args:
        xref: Identifier of the image, passed through to the result.
        in_jpg: Existing JPEG file to read.
        opt_jpg: Where the re-encoded JPEG is written.
        jpg_quality: Quality passed to the encoder when it is an int in
            1..100; otherwise the encoder's own default is used.

    Returns:
        ``(xref, opt_jpg)`` on success, or ``(xref, None)`` when the new
        file came out larger (in which case it is deleted).
    """
    use_explicit_quality = isinstance(jpg_quality, int) and 0 < jpg_quality <= 100
    with Image.open(in_jpg) as source:
        if use_explicit_quality:
            source.save(opt_jpg, optimize=True, quality=jpg_quality)
        else:
            source.save(opt_jpg, optimize=True)

    if opt_jpg.stat().st_size <= in_jpg.stat().st_size:
        return xref, opt_jpg

    log.debug(f"xref {xref}, jpeg, made larger - skip")
    opt_jpg.unlink()
    return xref, None
def transcode_jpegs(
    pdf: Pdf, jpegs: Sequence[Xref], root: Path, options, executor: Executor
) -> None:
    """Optimize JPEGs according to optimization settings.

    Every xref in ``jpegs`` is re-encoded by ``_optimize_jpeg``; whenever
    that yields a file, its bytes replace the image stream in ``pdf``.
    """

    def pending_jobs() -> Iterator[tuple[Xref, Path, Path, int]]:
        for xref in jpegs:
            original = jpg_name(root, xref)
            optimized = original.with_suffix('.opt.jpg')
            yield xref, original, optimized, options.jpg_quality

    def install_result(result: tuple[Xref, Path | None], pbar: ProgressBar):
        xref, optimized = result
        if optimized:
            # A JPEG file can be inserted into a PDF stream as is.
            payload = optimized.read_bytes()
            target = pdf.get_object(xref, 0)
            target.write(payload, filter=Name.DCTDecode)
        pbar.update()

    executor(
        use_threads=True,  # Processes are significantly slower at this task
        max_workers=options.jobs,
        progress_kwargs={
            'desc': "Recompressing JPEGs",
            'total': len(jpegs),
            'unit': 'image',
            'disable': not options.progress_bar,
        },
        task=_optimize_jpeg,
        task_arguments=pending_jobs(),
        task_finished=install_result,
    )
def _already_flate_encoded(image: Stream) -> bool:
    """Tell whether FlateDecode already appears among the image's filters.

    Handles a missing /Filter, a single filter name and an array of filters.
    """
    filters = image.get(Name.Filter)
    if filters is None:
        return False
    if not isinstance(filters, Array):
        return filters == Name.FlateDecode
    return Name.FlateDecode in list(filters)
def _find_deflatable_jpeg(
    *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options
) -> XrefExt | None:
    """Decide whether an image is a plain JPEG worth wrapping in FlateDecode.

    Returns:
        ``XrefExt(xref, '.memory')`` when the image qualifies, else ``None``.
        Nothing is written to disk by this function itself.
    """
    extracted = extract_image_filter(image, xref)
    if extracted is None:
        return None
    _pim, filtdp = extracted

    # Skip if already FlateDecode compressed - would double-compress
    if _already_flate_encoded(image):
        return None

    # Only a bare DCTDecode filter without decode parameters is eligible.
    is_plain_jpeg = filtdp[0] == Name.DCTDecode and not filtdp[1]
    if not is_plain_jpeg:
        return None

    # Don't flate very large images because it will slow down PDF viewers
    within_size_limit = (
        1 <= options.optimize <= 2
        and image.get(Name.Width, 0) < FLATE_JPEG_THRESHOLD
        and image.get(Name.Height, 0) < FLATE_JPEG_THRESHOLD
    )
    if within_size_limit or options.optimize == 3:
        return XrefExt(xref, '.memory')
    return None
def _deflate_jpeg(
    pdf: Pdf, lock: threading.Lock, xref: Xref, complevel: int
) -> tuple[Xref, bytes]:
    """Compress one JPEG stream's raw bytes at ``complevel``.

    The PDF is only touched while ``lock`` is held.

    Returns:
        ``(xref, compressed_bytes)``, or ``(xref, b'')`` when the stream
        cannot be read or compression does not make it smaller.
    """
    nothing_gained = (xref, b'')
    with lock:
        jpeg_stream = pdf.get_object(xref, 0)
        try:
            raw = jpeg_stream.read_raw_bytes()
        except PdfError:
            return nothing_gained
    deflated = compress(raw, complevel)
    if len(deflated) < len(raw):
        return xref, deflated
    return nothing_gained
def deflate_jpegs(pdf: Pdf, root: Path, options, executor: Executor) -> None:
    """Apply FlateDecode to JPEGs.

    This is a lossless compression method that is supported by all PDF viewers,
    and generally results in a smaller file size compared to straight DCTDecode
    images.

    Candidates are chosen by ``_find_deflatable_jpeg``. The compression
    level is 9 when ``options.optimize`` is 3 and 6 otherwise.
    """
    deflatable = []
    for _pageno, found in extract_images(pdf, root, options, _find_deflatable_jpeg):
        log.debug(f'xref {found.xref}: marking this JPEG as deflatable')
        deflatable.append(found.xref)

    if options.optimize == 3:
        complevel = 9
    else:
        complevel = 6

    # Our calls to xobj.write() in the completion callback need coordination
    lock = threading.Lock()
    jobs = ((pdf, lock, xref, complevel) for xref in deflatable)

    def store_result(result: tuple[Xref, bytes], pbar: ProgressBar):
        xref, compdata = result
        if compdata:
            with lock:
                target = pdf.get_object(xref, 0)
                target.write(compdata, filter=[Name.FlateDecode, Name.DCTDecode])
        pbar.update()

    executor(
        use_threads=True,  # We're sharing the pdf directly, must use threads
        max_workers=options.jobs,
        progress_kwargs={
            'desc': "Deflating JPEGs",
            'total': len(deflatable),
            'unit': 'image',
            'disable': not options.progress_bar,
        },
        task=_deflate_jpeg,
        task_arguments=jobs,
        task_finished=store_result,
    )
def _transcode_png(pdf: Pdf, filename: Path, xref: Xref) -> bool:
    """Replace image ``xref`` in ``pdf`` with the PNG stored at ``filename``.

    The PNG is first converted by img2pdf into a temporary PDF written next
    to it (``<name>.png.pdf``). The image object from that PDF is copied into
    ``pdf`` and its stream data and dictionary entries are transplanted onto
    the existing object.

    Args:
        pdf: The PDF being optimized; modified in place.
        filename: Path of the PNG file to embed.
        xref: Object number of the image to overwrite.

    Returns:
        Always ``True``.
    """
    output = filename.with_suffix('.png.pdf')
    with output.open('wb') as f:
        img2pdf.convert(fspath(filename), outputstream=f, **IMG2PDF_KWARGS)

    with Pdf.open(output) as pdf_image:
        # Take the first image listed on the first page of the temporary PDF.
        foreign_image = next(iter(pdf_image.pages[0].images.values()))
        local_image = pdf.copy_foreign(foreign_image)

        im_obj = pdf.get_object(xref, 0)
        # Stream data is copied raw, together with the filter that decodes it.
        im_obj.write(
            local_image.read_raw_bytes(),
            filter=local_image.Filter,
            decode_parms=local_image.DecodeParms,
        )

        # Don't copy keys from the new image...
        del_keys = set(im_obj.keys()) - set(local_image.keys())
        # ...except for the keep_fields, which are essential to displaying
        # the image correctly and preserving its metadata. (/Decode arrays
        # and /SMaskInData are implicitly discarded prior to this point.)
        keep_fields = {
            '/ID',
            '/Intent',
            '/Interpolate',
            '/Mask',
            '/Metadata',
            '/OC',
            '/OPI',
            '/SMask',
            '/StructParent',
        }
        del_keys -= keep_fields
        # Entries of the new image overwrite the old ones, except /Length and
        # the keep_fields, for which the original object's values stay.
        for key in local_image.keys():
            if key != Name.Length and str(key) not in keep_fields:
                im_obj[key] = local_image[key]
        # Entries that only the old image had (and are not kept) are removed.
        for key in del_keys:
            del im_obj[key]
    return True
def transcode_pngs(
    pdf: Pdf,
    images: Sequence[Xref],
    image_name_fn: Callable[[Path, Xref], Path],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Apply lossy transcoding to PNGs.

    Quantization only happens when ``options.optimize`` is at least 2. The
    quality range handed to pngquant is ``options.png_quality`` +/- 10,
    clamped to 10..100. Every image whose job was consumed by the executor
    is afterwards written back into ``pdf`` via ``_transcode_png``.

    Args:
        pdf: The PDF being optimized; modified in place.
        images: Xrefs of the images to quantize.
        image_name_fn: Maps ``(root, xref)`` to the input file of an image.
        root: Working directory that holds the extracted images.
        options: Optimization options.
        executor: Runs the quantization tasks.
    """
    quantized: MutableSet[Xref] = set()

    if options.optimize >= 2:
        quality_floor = max(10, options.png_quality - 10)
        quality_ceiling = min(100, options.png_quality + 10)

        def quantize_jobs():
            for xref in images:
                log.debug(image_name_fn(root, xref))
                yield (
                    image_name_fn(root, xref),
                    png_name(root, xref),
                    quality_floor,
                    quality_ceiling,
                )
                # Reached only when the consumer resumes the generator, i.e.
                # after this job has been handed out.
                quantized.add(xref)

        executor(
            use_threads=True,
            max_workers=options.jobs,
            progress_kwargs={
                'desc': "PNGs",
                'total': len(images),
                'unit': 'image',
                'disable': not options.progress_bar,
            },
            task=pngquant.quantize,
            task_arguments=quantize_jobs(),
        )

    for xref in quantized:
        _transcode_png(pdf, png_name(root, xref), xref)
# Executor that optimize() falls back to when the caller does not pass one.
DEFAULT_EXECUTOR = SerialExecutor()
def optimize(
    input_file: Path,
    output_file: Path,
    context: PdfContext,
    save_settings: dict[str, Any],
    executor: Executor = DEFAULT_EXECUTOR,
) -> Path:
    """Optimize images in a PDF file.

    With ``options.optimize == 0`` the output is just a link to the input.
    Otherwise JPEGs are recompressed and deflated, PNGs are quantized and
    bitonal images are converted to JBIG2. The result is saved beside
    ``output_file`` with an ``.opt.pdf`` suffix and is only used if it is
    not larger than the input.

    Unset (falsy) ``jpg_quality``/``png_quality`` options are filled in on
    ``context.options`` as a side effect.

    Args:
        input_file: PDF to optimize.
        output_file: Where the final result must appear.
        context: Provides ``options``.
        save_settings: Keyword arguments forwarded to ``Pdf.save``.
        executor: Runs the per-image tasks.

    Returns:
        ``output_file``.

    Raises:
        OutputFileAccessError: If the optimized file is empty after saving.
    """
    options = context.options
    if options.optimize == 0:
        safe_symlink(input_file, output_file)
        return output_file

    # Fill in quality defaults; level 3 uses more aggressive values.
    if not options.jpg_quality:
        if options.optimize < 3:
            options.jpg_quality = DEFAULT_JPEG_QUALITY
        else:
            options.jpg_quality = 40
    if not options.png_quality:
        if options.optimize < 3:
            options.png_quality = DEFAULT_PNG_QUALITY
        else:
            options.png_quality = 30

    target_file = output_file.with_suffix('.opt.pdf')
    with Pdf.open(input_file) as pdf:
        root = output_file.parent / 'images'
        root.mkdir(exist_ok=True)

        jpegs, pngs = extract_images_generic(pdf, root, options)
        transcode_jpegs(pdf, jpegs, root, options, executor)
        deflate_jpegs(pdf, root, options, executor)
        # NOTE: converting the JPEGs to PNG at optimize >= 2 is disabled.
        transcode_pngs(pdf, pngs, png_name, root, options, executor)

        jbig2_images = extract_images_jbig2(pdf, root, options)
        convert_to_jbig2(pdf, jbig2_images, root, options, executor)

        pdf.remove_unreferenced_resources()
        pdf.save(target_file, **save_settings)

    input_size = input_file.stat().st_size
    output_size = target_file.stat().st_size
    if output_size == 0:
        raise OutputFileAccessError(
            f"Output file not created after optimizing. We probably ran "
            f"out of disk space in the temporary folder: {tempfile.gettempdir()}."
        )

    savings = 1 - output_size / input_size
    if savings >= 0:
        safe_symlink(target_file, output_file)
        return output_file

    log.info(
        "Image optimization did not improve the file - "
        "optimizations will not be used"
    )
    # We still need to save the file
    with Pdf.open(input_file) as unoptimized:
        unoptimized.remove_unreferenced_resources()
        unoptimized.save(output_file, **save_settings)
    return output_file
def main(infile, outfile, level, jobs=1):
    """Entry point for direct optimization of a file.

    Args:
        infile: Path of the PDF to optimize.
        outfile: Path the optimized PDF is copied to.
        level: Optimization level; converted with ``int()``.
        jobs: Number of workers passed on through the options.
    """
    from shutil import copy  # pylint: disable=import-outside-toplevel
    from tempfile import TemporaryDirectory  # pylint: disable=import-outside-toplevel

    from ocrmypdf._options import OcrOptions  # pylint: disable=import-outside-toplevel

    infile = Path(infile)
    # Options carrying only what the optimizer needs; zero quality values
    # make optimize() choose its defaults.
    options = OcrOptions(
        input_file=infile,
        output_file=outfile,  # Required field
        jobs=jobs,
        optimize=int(level),
        jpg_quality=0,  # Use default
        png_quality=0,
        jbig2_threshold=0.85,
        quiet=True,
        progress_bar=False,
    )
    save_settings = {
        'compress_streams': True,
        'preserve_pdfa': True,
        'object_stream_mode': ObjectStreamMode.generate,
    }

    with TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        context = PdfContext(options, workdir, infile, None, None)
        tmpout = workdir / 'out.pdf'
        optimize(infile, tmpout, context, save_settings)
        # Copy while the temporary directory still exists.
        copy(fspath(tmpout), fspath(outfile))
# Direct invocation: <infile> <outfile> <optimization level>
if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2], sys.argv[3])
================================================
FILE: src/ocrmypdf/pdfa.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Utilities for PDF/A production and confirmation with Ghostscript."""
from __future__ import annotations
import base64
import logging
from collections.abc import Iterator
from importlib.resources import files as package_files
from pathlib import Path
import pikepdf
from pikepdf import Array, Dictionary, Name, Pdf, Stream
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# File name of the sRGB ICC profile, looked up in the ``ocrmypdf.data`` package.
SRGB_ICC_PROFILE_NAME = 'sRGB.icc'
def _postscript_objdef(
alias: str,
dictionary: dict[str, str],
*,
stream_name: str | None = None,
stream_data: bytes | None = None,
) -> Iterator[str]:
assert (stream_name is None) == (stream_data is None)
objtype = '/stream' if stream_name else '/dict'
if stream_name:
assert stream_data is not None
a85_data = base64.a85encode(stream_data, adobe=True).decode('ascii')
yield f'{stream_name} ' + a85_data
yield 'def'
if alias != '{Catalog}': # Catalog needs no definition
yield f'[/_objdef {alias} /type {objtype} /OBJ pdfmark'
yield f'[{alias} <<'
for key, val in dictionary.items():
yield f' {key} {val}'
yield '>> /PUT pdfmark'
if stream_name:
yield f'[{alias} {stream_name[1:]} /PUT pdfmark'
def _make_postscript(icc_name: str, icc_data: bytes, colors: int) -> Iterator[str]:
    """Yield the lines of the pdfmark PostScript used for PDF/A conversion.

    Three objects are emitted, separated by empty lines: the ICC profile
    stream, an OutputIntent dictionary referring to it, and the Catalog
    entry that lists the OutputIntent.

    Args:
        icc_name: Identifier written as the OutputConditionIdentifier.
        icc_data: Raw bytes of the ICC profile.
        colors: Number of color components, written as ``/N``.
    """
    yield '%!'

    profile_entries = {'/N': str(colors)}
    yield from _postscript_objdef(
        '{icc_PDFA}',  # Not an f-string
        profile_entries,
        stream_name='/ICCProfile',
        stream_data=icc_data,
    )
    yield ''

    intent_entries = {
        '/Type': '/OutputIntent',
        '/S': '/GTS_PDFA1',
        '/DestOutputProfile': '{icc_PDFA}',
        '/OutputConditionIdentifier': f'({icc_name})',  # Only f-string
    }
    yield from _postscript_objdef('{OutputIntent_PDFA}', intent_entries)
    yield ''

    catalog_entries = {'/OutputIntents': '[ {OutputIntent_PDFA} ]'}
    yield from _postscript_objdef('{Catalog}', catalog_entries)
def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
    """Create a Postscript PDFMARK file for Ghostscript PDF/A conversion.

    pdfmark is an extension to the Postscript language that describes some PDF
    features like bookmarks and annotations. It was originally specified for
    Adobe Distiller, for Postscript to PDF conversion.

    Ghostscript uses pdfmark for PDF to PDF/A conversion as well. To use
    Ghostscript to create a PDF/A, we need to create a pdfmark file with the
    necessary metadata.

    This function takes care of the many version-specific bugs and
    peculiarities in Ghostscript's handling of pdfmark.

    The only information we put in specifies that we want the file to be a
    PDF/A, and we want Ghostscript to convert objects to the sRGB colorspace
    if it runs into any object that it decides must be converted.

    Arguments:
        target_filename: filename to save
        icc: ICC identifier such as 'sRGB'

    Returns:
        ``target_filename``, unchanged.

    Raises:
        NotImplementedError: If ``icc`` is anything other than ``'sRGB'``.

    References:
        Adobe PDFMARK Reference:
        https://opensource.adobe.com/dc-acrobat-sdk-docs/library/pdfmark/
    """
    if icc != 'sRGB':
        raise NotImplementedError("Only supporting sRGB")

    # Use the shared loader so there is a single place that knows where the
    # profile lives.
    bytes_icc_profile = _load_srgb_icc_profile()
    postscript = '\n'.join(_make_postscript(icc, bytes_icc_profile, 3))

    # We should have encoded everything to pure ASCII by this point, and
    # to be safe, only allow ASCII in PostScript
    Path(target_filename).write_text(postscript, encoding='ascii')
    return target_filename
def file_claims_pdfa(filename: Path):
    """Determines if the file claims to be PDF/A compliant.

    This only checks if the XMP metadata contains a PDF/A marker. It does not
    do full PDF/A validation.

    Args:
        filename: Path of the PDF to inspect.

    Returns:
        A dict that always has the keys ``'pass'`` (bool), ``'output'``
        (``'pdfa'`` when a recognised PDF/A level is claimed, else ``'pdf'``)
        and ``'conformance'`` (description of the claimed level, or a note
        that no PDF/A metadata was found).
    """
    with pikepdf.open(filename) as pdf:
        pdfmeta = pdf.open_metadata()
        if not pdfmeta.pdfa_status:
            return {
                'pass': False,
                'output': 'pdf',
                'conformance': 'No PDF/A metadata in XMP',
            }
        valid_part_conforms = {'1a', '1b', '2a', '2b', '2u', '3a', '3b', '3u'}

        # Raw value in XMP metadata returned by pikepdf is uppercase, but ISO
        # uses lower case for conformance levels.
        pdfa_status_iso = pdfmeta.pdfa_status.lower()
        recognized = pdfa_status_iso in valid_part_conforms

        # Always populate every key: a claimed but unrecognised level used to
        # produce a dict without 'pass'/'output', so callers indexing
        # result['pass'] would fail with KeyError.
        pdfa_dict: dict[str, str | bool] = {
            'pass': recognized,
            'output': 'pdfa' if recognized else 'pdf',
            'conformance': f'PDF/A-{pdfa_status_iso}',
        }
    return pdfa_dict
def _load_srgb_icc_profile() -> bytes:
    """Return the raw bytes of the sRGB ICC profile from ``ocrmypdf.data``."""
    profile_resource = package_files('ocrmypdf.data') / SRGB_ICC_PROFILE_NAME
    return profile_resource.read_bytes()
def _pdfa_part_conformance(output_type: str) -> tuple[str, str]:
"""Extract PDF/A part and conformance from output_type.
Args:
output_type: One of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'
Returns:
Tuple of (part, conformance) e.g., ('2', 'B')
"""
mapping = {
'pdfa': ('2', 'B'),
'pdfa-1': ('1', 'B'),
'pdfa-2': ('2', 'B'),
'pdfa-3': ('3', 'B'),
}
return mapping.get(output_type, ('2', 'B'))
def add_pdfa_metadata(pdf: Pdf, part: str, conformance: str) -> None:
    """Add PDF/A XMP metadata declaration to a PDF.

    Args:
        pdf: An open pikepdf.Pdf object
        part: PDF/A part number ('1', '2', or '3')
        conformance: Conformance level ('A', 'B', or 'U')
    """
    declaration = (
        ('pdfaid:part', part),
        ('pdfaid:conformance', conformance),
    )
    with pdf.open_metadata() as meta:
        for xmp_key, value in declaration:
            meta[xmp_key] = value
def add_srgb_output_intent(pdf: Pdf) -> None:
    """Add sRGB ICC profile as OutputIntent to PDF catalog.

    This creates the required PDF/A OutputIntent structure with:
    - An ICC profile stream containing sRGB profile
    - An OutputIntent dictionary pointing to that profile
    - Updates the Catalog's OutputIntents array

    If the catalog already lists an OutputIntent whose
    OutputConditionIdentifier is 'sRGB', nothing is added.

    Args:
        pdf: An open pikepdf.Pdf object
    """
    # Make sure the catalog has an OutputIntents array to inspect and extend.
    if Name.OutputIntents not in pdf.Root:
        pdf.Root[Name.OutputIntents] = Array([])

    # Check for an existing sRGB OutputIntent before building anything, so
    # that a skipped call neither loads the profile nor leaves an unused
    # ICC stream object behind in the PDF.
    for intent in pdf.Root.OutputIntents:  # type: ignore[attr-defined]
        if str(intent.get(Name.OutputConditionIdentifier)) == 'sRGB':
            log.debug('sRGB OutputIntent already exists, skipping')
            return

    # Create ICC profile stream
    icc_stream = Stream(pdf, _load_srgb_icc_profile())
    icc_stream[Name.N] = 3  # RGB has 3 components

    # Create OutputIntent dictionary
    output_intent = Dictionary({
        '/Type': Name.OutputIntent,
        '/S': Name('/GTS_PDFA1'),
        '/OutputConditionIdentifier': 'sRGB',
        '/DestOutputProfile': icc_stream,
    })

    pdf.Root.OutputIntents.append(output_intent)
def speculative_pdfa_conversion(
    input_file: Path,
    output_file: Path,
    output_type: str,
) -> Path:
    """Attempt to convert a PDF to PDF/A by adding required structures.

    The input is opened, given an sRGB OutputIntent and an XMP declaration
    of PDF/A conformance, and saved to ``output_file``.

    This approach works for PDFs that are already mostly PDF/A compliant
    but lack the formal declarations. It does NOT perform color conversion,
    font embedding, or other transformations that Ghostscript does.

    Args:
        input_file: Path to input PDF
        output_file: Path where output PDF should be written
        output_type: One of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'

    Returns:
        Path to the output file

    Raises:
        pikepdf.PdfError: If the PDF cannot be opened or modified
    """
    declared_level = _pdfa_part_conformance(output_type)

    with Pdf.open(input_file) as pdf:
        add_srgb_output_intent(pdf)
        add_pdfa_metadata(pdf, *declared_level)
        pdf.save(output_file)

    log.debug('Speculative PDF/A conversion complete: %s', output_file)
    return output_file
================================================
FILE: src/ocrmypdf/pdfinfo/__init__.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""For extracting information about PDFs prior to OCR."""
from __future__ import annotations
from ocrmypdf.pdfinfo._types import Colorspace, Encoding, FloatRect
from ocrmypdf.pdfinfo.info import PageInfo, PdfInfo
# Names re-exported from the submodules as this package's public API.
__all__ = ["Colorspace", "Encoding", "FloatRect", "PageInfo", "PdfInfo"]
================================================
FILE: src/ocrmypdf/pdfinfo/_contentstream.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""PDF content stream interpretation."""
from __future__ import annotations
import re
from collections import defaultdict
from collections.abc import Mapping
from math import hypot, inf, isclose
from typing import NamedTuple
from warnings import warn
from pikepdf import Matrix, Object, PdfInlineImage, parse_content_stream
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo._types import UNIT_SQUARE
class XobjectSettings(NamedTuple):
    """Info about an XObject found in a PDF.

    One record is created for each ``Do`` operator encountered while a
    content stream is interpreted.
    """

    # Operand of the ``Do`` operator, i.e. the resource name of the XObject.
    name: str
    # Current transformation matrix in effect at the draw, in six-number
    # shorthand form.
    shorthand: tuple[float, float, float, float, float, float]
    # Number of entries on the graphics state stack at the draw.
    stack_depth: int
class InlineSettings(NamedTuple):
    """Info about an inline image found in a PDF.

    One record is created for each inline image encountered while a content
    stream is interpreted.
    """

    # The inline image object taken from the content stream.
    iimage: PdfInlineImage
    # Current transformation matrix in effect at the image, in six-number
    # shorthand form.
    shorthand: tuple[float, float, float, float, float, float]
    # Number of entries on the graphics state stack at the image.
    stack_depth: int
class ContentsInfo(NamedTuple):
    """Info about various objects found in a PDF.

    This is the result of interpreting one content stream.
    """

    # Every XObject draw, in the order encountered.
    xobject_settings: list[XobjectSettings]
    # Every inline image, in the order encountered.
    inline_images: list[InlineSettings]
    # True if any path painting operator was seen.
    found_vector: bool
    # True if any text showing operator was seen.
    found_text: bool
    # The XObject draws grouped by the string form of the XObject name.
    name_index: Mapping[str, list[XobjectSettings]]
class TextboxInfo(NamedTuple):
    """Info about a text box found in a PDF."""

    # Bounding box as four floats.
    # NOTE(review): corner order and units are not visible here - confirm
    # against the code that constructs TextboxInfo.
    bbox: tuple[float, float, float, float]
    # Presumably whether the text is rendered visibly - TODO confirm.
    is_visible: bool
    # Presumably whether the text failed to decode cleanly - TODO confirm.
    is_corrupt: bool
class VectorMarker:
    """Sentinel indicating vector drawing operations were found on a page.

    The class has no state; it is used purely as a marker type.
    """
class TextMarker:
    """Sentinel indicating text drawing operations were found on a page.

    The class has no state; it is used purely as a marker type.
    """
def _is_unit_square(shorthand):
    """Check if the shorthand represents a unit square transformation.

    Each value is converted to float and compared with the corresponding
    entry of ``UNIT_SQUARE`` using a relative tolerance of 1e-3.
    """
    as_floats = map(float, shorthand)
    for actual, expected in zip(as_floats, UNIT_SQUARE, strict=False):
        if not isclose(actual, expected, rel_tol=1e-3):
            return False
    return True
def _normalize_stack(graphobjs):
"""Convert runs of qQ's in the stack into single graphobjs."""
for operands, operator in graphobjs:
operator = str(operator)
if re.match(r'Q*q+$', operator): # Zero or more Q, one or more q
for char in operator: # Split into individual
yield ([], char) # Yield individual
else:
yield (operands, operator)
def _interpret_contents(contentstream: Object, initial_shorthand=UNIT_SQUARE):
    """Interpret the PDF content stream.

    The stack represents the state of the PDF graphics stack.  We are only
    interested in the current transformation matrix (CTM) so we only track
    this object; a full implementation would need to track many other items.

    The CTM is initialized to the mapping from user space to device space.
    PDF units are 1/72".  In a PDF viewer or printer this matrix is initialized
    to the transformation to device space.  For example if set to
    (1/72, 0, 0, 1/72, 0, 0) then all units would be calculated in inches.

    Images are always considered to be (0, 0) -> (1, 1).  Before drawing an
    image there should be a 'cm' that sets up an image coordinate system
    where drawing from (0, 0) -> (1, 1) will draw on the desired area of the
    page.

    PDF units suit our needs so we initialize ctm to the identity matrix.

    According to the PDF specification, the maximum stack depth is 32. Other
    viewers tolerate some amount beyond this.  We issue a warning if the
    stack depth exceeds the spec limit and set a hard limit beyond this to
    bound our memory requirements.  If the stack underflows behavior is
    undefined in the spec, but we just pretend nothing happened and leave the
    CTM unchanged.

    Args:
        contentstream: The content stream to parse.
        initial_shorthand: Six-number shorthand of the matrix the CTM
            starts from.

    Returns:
        A ContentsInfo listing every XObject draw and inline image together
        with the CTM and stack depth at that point, and flags telling
        whether vector or text operators were seen.

    Raises:
        RuntimeError: If the graphics stack grows beyond 128 entries.
        InputFileError: If a 'cm' operator has operands that do not form a
            matrix.
    """
    stack = []
    ctm = Matrix(initial_shorthand)
    xobject_settings: list[XobjectSettings] = []
    inline_images: list[InlineSettings] = []
    name_index = defaultdict(lambda: [])
    found_vector = False
    found_text = False
    vector_ops = set('S s f F f* B B* b b*'.split())
    text_showing_ops = set("""TJ Tj " '""".split())
    image_ops = set('BI ID EI q Q Do cm'.split())
    # Only these operators are requested from the parser.
    operator_whitelist = ' '.join(vector_ops | text_showing_ops | image_ops)

    for n, graphobj in enumerate(
        _normalize_stack(parse_content_stream(contentstream, operator_whitelist))
    ):
        operands, operator = graphobj
        if operator == 'q':
            # Save the current matrix so that a later 'Q' can restore it.
            stack.append(ctm)
            if len(stack) > 32:  # See docstring
                if len(stack) > 128:
                    raise RuntimeError(
                        f"PDF graphics stack overflowed hard limit at operator {n}"
                    )
                warn("PDF graphics stack overflowed spec limit")
        elif operator == 'Q':
            try:
                ctm = stack.pop()
            except IndexError:
                # Keeping the ctm the same seems to be the only sensible thing
                # to do. Just pretend nothing happened, keep calm and carry on.
                warn("PDF graphics stack underflowed - PDF may be malformed")
        elif operator == 'cm':
            try:
                # The new matrix is applied before the existing CTM.
                ctm = Matrix(operands) @ ctm
            except ValueError as e:
                raise InputFileError(
                    "PDF content stream is corrupt - this PDF is malformed. "
                    "Use a PDF editor that is capable of visually inspecting the PDF."
                ) from e
        elif operator == 'Do':
            # Record the draw both in encounter order and indexed by name.
            image_name = operands[0]
            settings = XobjectSettings(
                name=image_name, shorthand=ctm.shorthand, stack_depth=len(stack)
            )
            xobject_settings.append(settings)
            name_index[str(image_name)].append(settings)
        elif operator == 'INLINE IMAGE':  # BI/ID/EI are grouped into this
            iimage = operands[0]
            inline = InlineSettings(
                iimage=iimage, shorthand=ctm.shorthand, stack_depth=len(stack)
            )
            inline_images.append(inline)
        elif operator in vector_ops:
            found_vector = True
        elif operator in text_showing_ops:
            found_text = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_vector=found_vector,
        found_text=found_text,
        name_index=name_index,
    )
def _get_dpi(ctm_shorthand, image_size) -> Resolution:
    """Given the transformation matrix and image size, find the image DPI.

    PDFs do not include image resolution information within image data.
    Instead, the PDF page content stream describes the location where the
    image will be rasterized, and the effective resolution is the ratio of
    the pixel size to raster target size.

    Normally a scanned PDF has the paper size set appropriately but this is
    not guaranteed. The most common case is a cropped image will change the
    page size (/CropBox) without altering the page content stream. That means
    it is not sufficient to assume that the image fills the page, even though
    that is the most common case.

    A PDF image may be scaled (always), cropped, translated, rotated in place
    to an arbitrary angle (rarely) and skewed. Only equal area mappings can
    be expressed, that is, it is not necessary to consider distortions where
    the effective DPI varies with position.

    To determine the image scale, transform an offset vector v0 (0, 0), a
    width-axis vector vw (1, 0) and a height-axis vector vh (0, 1) with the
    matrix, which gives the dimensions of the image in PDF units. From there
    we can compare to actual image dimensions. PDF uses
    row vector * matrix_transposed unlike the traditional
    matrix * column vector.

    We want magnitude(width_vector - offset_vector) and
    magnitude(height_vector - offset_vector). Worked out algebraically, the
    effect of translation cancels out and the magnitudes become functions of
    the first four matrix entries only; that result is used here.

    pdfimages -list does calculate the DPI in some way that is not completely
    naive, but it does not get the DPI of rotated images right, so cannot be
    used anymore to validate this. Photoshop works, or using Acrobat to
    rotate the image back to normal.

    It does not matter if the image is partially cropped, or even out of the
    /MediaBox.

    Args:
        ctm_shorthand: Six-number shorthand of the matrix the image is
            drawn with.
        image_size: ``(width, height)`` of the image in pixels.

    Returns:
        Horizontal and vertical DPI; infinite where the image is drawn with
        zero extent.
    """
    a, b, c, d, _, _ = ctm_shorthand  # pylint: disable=invalid-name

    # Extent of the drawn image along each image axis, in PDF units.
    drawn_width, drawn_height = hypot(a, b), hypot(c, d)

    # Default user space has 72 units to the inch.
    units_per_inch = 72.0

    def pixels_per_inch(pixel_count, drawn_length):
        if drawn_length != 0:
            density = pixel_count / drawn_length
        else:
            density = inf
        return density * units_per_inch

    return Resolution(
        pixels_per_inch(image_size[0], drawn_width),
        pixels_per_inch(image_size[1], drawn_height),
    )
================================================
FILE: src/ocrmypdf/pdfinfo/_image.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""PDF image analysis."""
from __future__ import annotations
import logging
from collections.abc import Iterator
from decimal import Decimal
from pikepdf import (
Dictionary,
Matrix,
Name,
Object,
Pdf,
PdfImage,
PdfInlineImage,
Stream,
UnsupportedImageTypeError,
)
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo._contentstream import (
ContentsInfo,
TextMarker,
VectorMarker,
_get_dpi,
_interpret_contents,
_is_unit_square,
)
from ocrmypdf.pdfinfo._types import (
FRIENDLY_COLORSPACE,
FRIENDLY_COMP,
FRIENDLY_ENCODING,
UNIT_SQUARE,
Colorspace,
Encoding,
)
# Named after this module rather than being the root logger, so that records
# carry their origin and can be filtered per module.
logger = logging.getLogger(__name__)
class ImageInfo:
    """Information about an image found in a PDF.

    This gathers information from pikepdf and pdfminer.six, and is pickle-able
    so that it can be passed to a worker process, unlike objects from those
    libraries.
    """

    DPI_PREC = Decimal('1.000')

    _comp: int | None
    _name: str

    def __init__(
        self,
        *,
        name='',
        pdfimage: Object | None = None,
        inline: PdfInlineImage | None = None,
        shorthand=None,
    ):
        """Initialize an ImageInfo.

        Args:
            name: Name of the image; stored as ``str(name)``.
            pdfimage: The image XObject. Used only when ``inline`` is None,
                and must then be a Stream.
            inline: An inline image; takes precedence over ``pdfimage``.
            shorthand: Six-number shorthand of the matrix the image is drawn
                with; needed for the DPI calculation.

        Raises:
            ValueError: If neither a usable ``pdfimage`` nor ``inline`` is
                given.
        """
        self._name = str(name)
        self._shorthand = shorthand

        pim: PdfInlineImage | PdfImage

        if inline is not None:
            self._origin = 'inline'
            pim = inline
        elif pdfimage is not None and isinstance(pdfimage, Stream):
            self._origin = 'xobject'
            pim = PdfImage(pdfimage)
        else:
            raise ValueError("Either pdfimage or inline must be set")

        self._width = pim.width
        self._height = pim.height
        if (smask := pim.obj.get(Name.SMask, None)) is not None and isinstance(
            smask, Stream | Dictionary
        ):
            # SMask is pretty much an alpha channel, but in PDF it's possible
            # for channel to have different dimensions than the image
            # itself. Some PDF writers use this to create a grayscale stencil
            # mask. For our purposes, the effective size is the size of the
            # larger component (image or smask).
            self._width = max(smask.get(Name.Width, 0), self._width)
            self._height = max(smask.get(Name.Height, 0), self._height)
        if (mask := pim.obj.get(Name.Mask, None)) is not None and isinstance(
            mask, Stream | Dictionary
        ):
            # If the image has a /Mask entry, it has an explicit mask.
            # /Mask can be a Stream or an Array. If it's a Stream,
            # use its /Width and /Height if they are larger than the main
            # image's.
            self._width = max(mask.get(Name.Width, 0), self._width)
            self._height = max(mask.get(Name.Height, 0), self._height)

        # If /ImageMask is true, then this image is a stencil mask
        # (Images that draw with this stencil mask will have a reference to
        # it in their /Mask, but we don't actually need that information)
        if pim.image_mask:
            self._type = 'stencil'
        else:
            self._type = 'image'

        self._bpc = int(pim.bits_per_component)
        if (
            len(pim.filters) == 2
            and pim.filters[0] == '/FlateDecode'
            and pim.filters[1] == '/DCTDecode'
        ):
            # Special case: FlateDecode followed by DCTDecode
            self._enc = Encoding.flate_jpeg
        else:
            try:
                self._enc = FRIENDLY_ENCODING.get(pim.filters[0])
            except IndexError:
                # No filters at all.
                self._enc = None

        try:
            self._color = FRIENDLY_COLORSPACE.get(pim.colorspace or '')
        except NotImplementedError:
            self._color = None
        if self._enc == Encoding.jpeg2000:
            self._color = Colorspace.jpeg2000

        self._comp = None
        if self._color == Colorspace.icc and isinstance(pim, PdfImage):
            self._comp = self._init_icc(pim)
        else:
            if isinstance(self._color, Colorspace):
                self._comp = FRIENDLY_COMP.get(self._color)
            # Bit of a hack... infer grayscale if component count is uncertain
            # but encoding only supports monochrome.
            if self._comp is None and self._enc in (Encoding.ccitt, Encoding.jbig2):
                self._comp = FRIENDLY_COMP[Colorspace.gray]

    def _init_icc(self, pim: PdfImage):
        """Derive the component count from the image's ICC profile.

        Returns:
            1 for a GRAY profile, 4 for CMYK, 3 for anything else, or None
            (after logging a warning where applicable) when the profile is
            missing or unreadable.
        """
        try:
            icc = pim.icc
        except UnsupportedImageTypeError as e:
            logger.warning(
                f"An image with a corrupt or unreadable ICC profile was found. "
                f"Output PDF may not match the input PDF visually: {e}. {self}"
            )
            return None
        # Check the ICC profile to determine actual colorspace
        if icc is None or not hasattr(icc, 'profile'):
            logger.warning(
                f"An image with an ICC profile but no ICC profile data was found. "
                f"The output PDF may not match the input PDF visually. {self}"
            )
            return None
        try:
            if icc.profile.xcolor_space == 'GRAY':
                return 1
            elif icc.profile.xcolor_space == 'CMYK':
                return 4
            else:
                return 3
        except AttributeError:
            return None

    @property
    def name(self):
        """Name of the image as it appears in the PDF."""
        return self._name

    @property
    def type_(self):
        """Type of image, either 'image' or 'stencil'."""
        return self._type

    @property
    def width(self) -> int:
        """Width of the image in pixels."""
        return self._width

    @property
    def height(self) -> int:
        """Height of the image in pixels."""
        return self._height

    @property
    def bpc(self):
        """Bits per component."""
        return self._bpc

    @property
    def color(self):
        """Colorspace of the image, or '?' if unknown."""
        return self._color if self._color is not None else '?'

    @property
    def comp(self):
        """Number of components/channels in the image, or '?' if unknown."""
        return self._comp if self._comp is not None else '?'

    @property
    def enc(self):
        """Encoding of the image, or 'image' if unknown."""
        return self._enc if self._enc is not None else 'image'

    @property
    def renderable(self) -> bool:
        """Whether the image is renderable.

        Some PDFs in the wild have invalid images that are not renderable,
        due to unusual dimensions.

        Stencil masks are also not renderable, since they are not
        drawn, but rather they control how rendering happens.
        """
        return (
            self.dpi.is_finite
            and self.width >= 0
            and self.height >= 0
            and self.type_ != 'stencil'
        )

    @property
    def dpi(self) -> Resolution:
        """Dots per inch of the image.

        Calculated based on where and how the image is drawn in the PDF.
        """
        return _get_dpi(self._shorthand, (self._width, self._height))

    @property
    def printed_area(self) -> float:
        """Physical area of the image in square inches.

        Images that are not renderable, or that have a zero DPI on either
        axis, have an area of 0.0.
        """
        if not self.renderable:
            return 0.0
        dpi = self.dpi
        if dpi.x == 0 or dpi.y == 0:
            # A zero-pixel dimension gives a DPI of zero while the image
            # still counts as renderable; dividing would raise
            # ZeroDivisionError, and such an image covers no area anyway.
            return 0.0
        return float((self.width / dpi.x) * (self.height / dpi.y))

    def __repr__(self):
        """Return a string representation of the image."""
        # The DPI needs the drawing matrix. Fall back to '?' when none was
        # supplied so that repr() never raises; it is embedded in warnings
        # logged while the object is still being constructed.
        dpi = self.dpi if self._shorthand is not None else '?'
        return (
            f"<ImageInfo '{self.name}' {self.type_} {self.width}x{self.height} "
            f"{self.color} {self.comp} {self.bpc} {self.enc} {dpi}>"
        )
def _find_inline_images(contentsinfo: ContentsInfo) -> Iterator[ImageInfo]:
    """Yield an ImageInfo for every inline image in the contentstream.

    Inline images have no name of their own, so they are labelled
    ``inline-00``, ``inline-01``, ... in order of appearance.
    """
    for position, settings in enumerate(contentsinfo.inline_images):
        label = f'inline-{position:02d}'
        yield ImageInfo(
            name=label, shorthand=settings.shorthand, inline=settings.iimage
        )
def _image_xobjects(container) -> Iterator[tuple[Object, str]]:
    """Search for all XObject-based images in the container.

    Usually the container is a page, but it could also be a Form XObject
    that contains images. Form XObjects themselves are filtered out; they
    are dealt with elsewhere.

    Yields:
        ``(image, name)`` tuples, where ``name`` is the key under which the
        image is registered in /Resources /XObject, since the object does
        not know its own name.
    """
    if Name.Resources not in container:
        return
    resources = container[Name.Resources]
    if Name.XObject not in resources:
        return

    for key, candidate in resources[Name.XObject].items():
        is_image = (
            candidate is not None
            and Name.Subtype in candidate
            and candidate[Name.Subtype] == Name.Image
        )
        if is_image:
            yield (candidate, key)
def _find_regular_images(
    container: Object, contentsinfo: ContentsInfo
) -> Iterator[ImageInfo]:
    """Yield the images stored in the container's /Resources /XObject.

    Usually the container is a page, but it could also be a Form XObject
    that contains images.

    One ImageInfo is produced per drawing event, carrying the drawing
    shorthand in effect at that moment so the DPI at time of drawing can be
    derived. Images that are never drawn are skipped.
    """
    for pdfimage, xobj_name in _image_xobjects(container):
        if xobj_name not in contentsinfo.name_index:
            continue
        for draw in contentsinfo.name_index[xobj_name]:
            # At least one PDF in the wild (and test suite) draws an image
            # when the graphics stack depth is 0, meaning that the image
            # gets drawn into a square of 1x1 PDF units (or 1/72",
            # or 0.35 mm). The equivalent DPI will be >100,000. Exclude
            # these from our DPI calculation for the page.
            drawn_in_unit_square = draw.stack_depth == 0 and _is_unit_square(
                draw.shorthand
            )
            if not drawn_in_unit_square:
                yield ImageInfo(
                    name=draw.name, pdfimage=pdfimage, shorthand=draw.shorthand
                )
def _find_form_xobject_images(pdf: Pdf, container: Object, contentsinfo: ContentsInfo):
    """Yield whatever is found inside Form XObjects referenced by the container.

    The container may be a page, or a parent Form XObject. Each Form XObject
    is processed once per drawing event recorded in
    ``contentsinfo.xobject_settings``, using the shorthand in effect for
    that event.
    """
    if Name.Resources not in container:
        return
    resources = container[Name.Resources]
    if Name.XObject not in resources:
        return

    for xobj_name, candidate in resources[Name.XObject].as_dict().items():
        if candidate is None or candidate.get(Name.Subtype) != Name.Form:
            continue
        for settings in contentsinfo.xobject_settings:
            if settings.name == xobj_name:
                # Find images once for each time this Form XObject is drawn.
                # This could be optimized to cache the multiple drawing
                # events, but in practice both Form XObjects and multiple
                # drawing of the same object are very rare.
                yield from _process_content_streams(
                    pdf=pdf, container=candidate, shorthand=settings.shorthand
                )
def _process_content_streams(
    *, pdf: Pdf, container: Object, shorthand=None
) -> Iterator[VectorMarker | TextMarker | ImageInfo]:
    """Find all individual instances of images drawn in the container.

    Usually the container is a page, but it may also be a Form XObject.

    On a typical page images are stored inline or as regular images
    in an XObject.

    Form XObjects may include inline images, XObject images,
    and recursively, other Form XObjects; and also vector graphic objects.

    Every instance of an image being drawn somewhere is flattened and
    treated as a unique image, since if the same image is drawn multiple times
    on one page it may be drawn at differing resolutions, and our objective
    is to find the resolution at which the page can be rastered without
    downsampling.

    Args:
        pdf: The document the container belongs to; passed through when
            recursing into nested Form XObjects.
        container: A page or a Form XObject.
        shorthand: Transformation in effect when the container is drawn.
            For pages a falsy value means ``UNIT_SQUARE``; for Form XObjects
            a falsy value means the default ``Matrix()``.

    Yields:
        A ``VectorMarker`` if vector content was found, a ``TextMarker`` if
        text was found, and one ``ImageInfo`` per image drawing event.
        Nothing is yielded for a container that is neither a page with
        /Contents nor a Form XObject.
    """
    container_type = container.get(Name.Type)
    if container_type == Name.Page and Name.Contents in container:
        initial_shorthand = shorthand or UNIT_SQUARE
    elif (
        container_type == Name.XObject
        # Use .get(): a malformed XObject with no /Subtype must be skipped,
        # not crash the whole scan with a KeyError.
        and container.get(Name.Subtype) == Name.Form
    ):
        # Set the CTM to the state it was when the "Do" operator was
        # encountered that is drawing this instance of the Form XObject
        ctm = Matrix(shorthand) if shorthand else Matrix()

        # A Form XObject may provide its own matrix to map form space into
        # user space. Get this if one exists
        form_shorthand = container.get(Name.Matrix, Matrix())
        form_matrix = Matrix(form_shorthand)

        # Concatenate form matrix with CTM to ensure CTM is correct for
        # drawing this instance of the XObject
        ctm = form_matrix @ ctm
        initial_shorthand = ctm.shorthand
    else:
        return

    contentsinfo = _interpret_contents(container, initial_shorthand)

    if contentsinfo.found_vector:
        yield VectorMarker()
    if contentsinfo.found_text:
        yield TextMarker()
    yield from _find_inline_images(contentsinfo)
    yield from _find_regular_images(container, contentsinfo)
    yield from _find_form_xobject_images(pdf, container, contentsinfo)
================================================
FILE: src/ocrmypdf/pdfinfo/_types.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""PDF type definitions and constants."""
from __future__ import annotations
from enum import Enum, auto
class Colorspace(Enum):
    """Description of common image colorspaces in a PDF.

    Member values come from ``auto()``, so they follow declaration order.
    The PDF names that map to each member are listed in
    ``FRIENDLY_COLORSPACE``.
    """

    # pylint: disable=invalid-name
    gray = auto()  # /DeviceGray, /CalGray, or inline abbreviation /G
    rgb = auto()  # /DeviceRGB, /CalRGB, or inline abbreviation /RGB
    cmyk = auto()  # /DeviceCMYK, or inline abbreviation /CMYK
    lab = auto()  # /Lab
    icc = auto()  # /ICCBased
    index = auto()  # /Indexed, or inline abbreviation /I
    sep = auto()  # /Separation
    devn = auto()  # /DeviceN
    pattern = auto()  # /Pattern
    # NOTE(review): no PDF name maps to this member in FRIENDLY_COLORSPACE;
    # presumably it is assigned directly for JPEG 2000 images -- confirm.
    jpeg2000 = auto()
class Encoding(Enum):
    """Description of common image encodings in a PDF.

    Member values come from ``auto()``, so they follow declaration order.
    The PDF filter names that map to each member are listed in
    ``FRIENDLY_ENCODING``.
    """

    # pylint: disable=invalid-name
    ccitt = auto()  # /CCITTFaxDecode, or inline abbreviation /CCF
    jpeg = auto()  # /DCTDecode, or inline abbreviation /DCT
    jpeg2000 = auto()  # /JPXDecode
    jbig2 = auto()  # /JBIG2Decode
    asciihex = auto()  # inline abbreviation /AHx
    ascii85 = auto()  # inline abbreviation /A85
    lzw = auto()  # inline abbreviation /LZW
    flate = auto()  # inline abbreviation /Fl
    runlength = auto()  # inline abbreviation /RL
    # NOTE(review): no filter name maps to this member in FRIENDLY_ENCODING;
    # presumably it is set where stacked filters are detected -- confirm.
    flate_jpeg = auto()
# A rectangle expressed as four floats. The order of the coordinates is not
# fixed by this alias; each user documents its own convention.
FloatRect = tuple[float, float, float, float]

# Maps PDF colorspace names (including the leading slash) to Colorspace
# members.
FRIENDLY_COLORSPACE: dict[str, Colorspace] = {
    '/DeviceGray': Colorspace.gray,
    '/CalGray': Colorspace.gray,
    '/DeviceRGB': Colorspace.rgb,
    '/CalRGB': Colorspace.rgb,
    '/DeviceCMYK': Colorspace.cmyk,
    '/Lab': Colorspace.lab,
    '/ICCBased': Colorspace.icc,
    '/Indexed': Colorspace.index,
    '/Separation': Colorspace.sep,
    '/DeviceN': Colorspace.devn,
    '/Pattern': Colorspace.pattern,
    '/G': Colorspace.gray,  # Abbreviations permitted in inline images
    '/RGB': Colorspace.rgb,
    '/CMYK': Colorspace.cmyk,
    '/I': Colorspace.index,
}

# Maps PDF filter names (including the leading slash) to Encoding members.
# Only the filters listed here are recognized by this table.
FRIENDLY_ENCODING: dict[str, Encoding] = {
    '/CCITTFaxDecode': Encoding.ccitt,
    '/DCTDecode': Encoding.jpeg,
    '/JPXDecode': Encoding.jpeg2000,
    '/JBIG2Decode': Encoding.jbig2,
    '/CCF': Encoding.ccitt,  # Abbreviations permitted in inline images
    '/DCT': Encoding.jpeg,
    '/AHx': Encoding.asciihex,
    '/A85': Encoding.ascii85,
    '/LZW': Encoding.lzw,
    '/Fl': Encoding.flate,
    '/RL': Encoding.runlength,
}

# Number of color components per pixel for the colorspaces where that count
# is fixed. Colorspaces not listed have no entry in this table.
FRIENDLY_COMP: dict[Colorspace, int] = {
    Colorspace.gray: 1,
    Colorspace.rgb: 3,
    Colorspace.cmyk: 4,
    Colorspace.lab: 3,
    Colorspace.index: 1,
}

# Six-number matrix shorthand (a, b, c, d, e, f) of the identity transform.
UNIT_SQUARE = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)
================================================
FILE: src/ocrmypdf/pdfinfo/_worker.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""PDF page info worker process handling."""
from __future__ import annotations
import atexit
import logging
from collections.abc import Container, Sequence
from contextlib import contextmanager
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING
from pikepdf import Pdf
from ocrmypdf._concurrent import Executor
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import available_cpu_count, pikepdf_enable_mmap
if TYPE_CHECKING:
from ocrmypdf.pdfinfo.info import PageInfo
from ocrmypdf.pdfinfo.layout import PdfMinerState
# Module logger; getLogger() with no name returns the root logger.
logger = logging.getLogger()

# Per-process Pdf handle. Stays None until _pdf_pageinfo_sync_init opens a
# private copy because it was not handed an already-open Pdf.
worker_pdf = None  # pylint: disable=invalid-name
def _pdf_pageinfo_sync_init(pdf: Pdf, infile: Path, pdfminer_loglevel):
    """Initialize a worker before it starts gathering page information.

    Enables pikepdf mmap support and applies *pdfminer_loglevel* to the
    ``pdfminer`` logger. When *pdf* is None the worker opens its own copy of
    *infile*, stores it in the module-level ``worker_pdf`` and arranges for
    it to be closed when the process exits.
    """
    global worker_pdf  # pylint: disable=global-statement,invalid-name
    pikepdf_enable_mmap()

    logging.getLogger('pdfminer').setLevel(pdfminer_loglevel)

    if pdf is not None:
        # The caller shares an already-open Pdf; nothing to open here.
        return

    # The pdf is not opened, so open a copy for this worker process to use
    worker_pdf = Pdf.open(infile)

    def close_worker_pdf():
        worker_pdf.close()

    # Close when this process exits
    atexit.register(close_worker_pdf)
@contextmanager
def _pdf_pageinfo_sync_pdf(thread_pdf: Pdf | None, infile: Path):
    """Provide the Pdf object a page-info task should read from.

    Preference order: the Pdf passed in by the caller, then the per-process
    ``worker_pdf``, and only as a last resort a freshly opened copy of
    *infile*, which is closed again when the context exits.
    """
    shared_pdf = thread_pdf if thread_pdf is not None else worker_pdf
    if shared_pdf is not None:
        # Shared handles are owned elsewhere, so they are not closed here.
        yield shared_pdf
        return

    with Pdf.open(infile) as pdf:
        yield pdf
def _pdf_pageinfo_sync(
    pageno: int,
    thread_pdf: Pdf | None,
    infile: Path,
    check_pages: Container[int],
    detailed_analysis: bool,
    miner_state: PdfMinerState | None,
) -> PageInfo:
    """Build the PageInfo for one page; this is the task run by each worker."""
    # Import here to avoid circular import - info.py imports this module,
    # but PageInfo is defined in info.py
    from ocrmypdf.pdfinfo.info import PageInfo

    with _pdf_pageinfo_sync_pdf(thread_pdf, infile) as pdf:
        page_info = PageInfo(
            pdf=pdf,
            pageno=pageno,
            infile=infile,
            check_pages=check_pages,
            detailed_analysis=detailed_analysis,
            miner_state=miner_state,
        )
    return page_info
def _pdf_pageinfo_concurrent(
    pdf,
    executor: Executor,
    max_workers: int,
    use_threads: bool,
    infile,
    progbar,
    check_pages,
    detailed_analysis: bool = False,
    miner_state: PdfMinerState | None = None,
) -> Sequence[PageInfo | None]:
    """Gather PageInfo for every page of *pdf* using *executor*.

    Args:
        pdf: The already-open Pdf; its page count determines the work list.
        executor: Callable that runs the tasks (see the keyword arguments
            passed to it at the end of this function).
        max_workers: Upper bound on workers; ``None`` means use
            ``available_cpu_count()``.
        use_threads: Whether to use threads rather than processes.
        infile: Path to the PDF, for workers that must open their own copy.
        progbar: Truthy to show a progress bar.
        check_pages: Container of page numbers to examine in detail.
        detailed_analysis: Passed through to each PageInfo.
        miner_state: Passed through to each PageInfo.

    Returns:
        A list with one slot per page, filled in as tasks finish.

    Raises:
        InputFileError: If a task produced no page information.
    """
    total = len(pdf.pages)
    pages: list[PageInfo | None] = [None] * total

    def update_pageinfo(page: PageInfo, pbar: ProgressBar):
        if not page:
            raise InputFileError("Could not read a page in the PDF")
        pages[page.pageno] = page
        pbar.update()

    if max_workers is None:
        max_workers = available_cpu_count()

    # One extra worker for every four pages, capped by max_workers.
    n_workers = min(1 + total // 4, max_workers)
    if n_workers == 1:
        # If we decided on only one worker, there is no point in using
        # a separate process.
        use_threads = True

    if use_threads and n_workers > 1:
        # If we are using threads, there is no point in using more than one
        # worker thread - they will just fight over the GIL.
        n_workers = 1

    # If we use a thread, we can pass the already-open Pdf for them to use
    # If we use processes, we pass a None which tells the init function to open its
    # own
    initial_pdf = pdf if use_threads else None

    contexts = (
        (n, initial_pdf, infile, check_pages, detailed_analysis, miner_state)
        for n in range(total)
    )
    assert n_workers == 1 if use_threads else n_workers >= 1, "Not multithreadable"
    logger.debug(
        f"Gathering info with {n_workers} "
        + ('thread' if use_threads else 'process')
        + " workers"
    )
    executor(
        use_threads=use_threads,
        max_workers=n_workers,
        progress_kwargs=dict(
            total=total, desc="Scanning contents", unit='page', disable=not progbar
        ),
        worker_initializer=partial(
            _pdf_pageinfo_sync_init,
            initial_pdf,
            infile,
            logging.getLogger('pdfminer').level,
        ),
        task=_pdf_pageinfo_sync,
        task_arguments=contexts,
        task_finished=update_pageinfo,
    )
    return pages
================================================
FILE: src/ocrmypdf/pdfinfo/info.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Extract information about the content of a PDF."""
from __future__ import annotations
import logging
import statistics
from collections.abc import Callable, Container, Iterable, Iterator
from contextlib import nullcontext
from decimal import Decimal
from os import PathLike
from pathlib import Path
from typing import NamedTuple
from pdfminer.layout import LTPage, LTTextBox
from pikepdf import Name, Page, Pdf
from ocrmypdf._concurrent import Executor, SerialExecutor
from ocrmypdf.exceptions import EncryptedPdfError
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo._contentstream import TextboxInfo, TextMarker, VectorMarker
from ocrmypdf.pdfinfo._image import ImageInfo, _process_content_streams
from ocrmypdf.pdfinfo._types import FloatRect
from ocrmypdf.pdfinfo._worker import _pdf_pageinfo_concurrent
from ocrmypdf.pdfinfo.layout import (
LTStateAwareChar,
PdfMinerState,
get_text_boxes,
)
# Module logger; getLogger() with no name returns the root logger.
logger = logging.getLogger()
def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) -> bool:
    """Detect text while ignoring anything that lies only in the margins.

    A margin of 12.5% of the page size is excluded on every side. The page
    counts as having text as soon as one block overlaps the remaining
    interior. Rectangles are read as (left, top, right, bottom) with all
    coordinates in the first quadrant, so bottom < top.
    https://stackoverflow.com/questions/306316/determine-if-two-rectangles-overlap-each-other
    """
    margin_ratio = 0.125
    pw = float(page_width)  # pylint: disable=invalid-name
    ph = float(page_height)  # pylint: disable=invalid-name

    interior_left = margin_ratio * pw
    interior_top = (1 - margin_ratio) * ph
    interior_right = (1 - margin_ratio) * pw
    interior_bottom = margin_ratio * ph

    def overlaps_interior(rect: FloatRect) -> bool:
        return (
            rect[0] < interior_right
            and rect[2] > interior_left
            and rect[1] > interior_bottom
            and rect[3] < interior_top
        )

    return any(overlaps_interior(bbox) for bbox in text_blocks)
def simplify_textboxes(
    miner_page: LTPage, textbox_getter: Callable[[LTPage], Iterator[LTTextBox]]
) -> Iterator[TextboxInfo]:
    """Reduce each text box on the page to a small TextboxInfo record.

    Only the bounding box plus two flags taken from the first character of
    the first line are retained. We do this to save memory and ensure that
    our objects are pickleable. Boxes whose first character is not an
    LTStateAwareChar are skipped.
    """
    for box in textbox_getter(miner_page):
        # pylint: disable=protected-access
        first_char = box._objs[0]._objs[0]
        if isinstance(first_char, LTStateAwareChar):
            yield TextboxInfo(
                box.bbox,
                first_char.rendermode != 3,  # render mode 3 is treated as invisible
                first_char.get_text() == '\ufffd',  # replacement char marks corrupt
            )
class PageResolutionProfile(NamedTuple):
    """Information about the resolutions of a page."""

    # The weighted average DPI of the page, weighted by the area of each
    # image.
    weighted_dpi: float

    # The maximum DPI of an image on the page.
    max_dpi: float

    # The average DPI of the page divided by the maximum DPI of the page.
    #
    # This indicates the intensity of the resolution variation on the page.
    # If the ratio is 1.0 or close to 1.0, the page has all of its content
    # at a uniform resolution. If the ratio is much lower than 1.0, some
    # content is at a higher resolution than the rest of the page.
    average_to_max_dpi_ratio: float

    # The maximum-DPI area of the page divided by the total drawn area.
    #
    # This indicates the prevalence of high-resolution content on the page.
    area_ratio: float
class PageInfo:
    """Describe the kinds of content found on one page of a PDF.

    All information is gathered eagerly in the constructor and kept as plain
    attributes behind read-only properties (``rotation`` is the only one
    with a setter).
    """

    _has_text: bool | None
    _has_vector: bool | None
    _images: list[ImageInfo] = []

    def __init__(
        self,
        pdf: Pdf,
        pageno: int,
        infile: PathLike,
        check_pages: Container[int],
        detailed_analysis: bool = False,
        miner_state: PdfMinerState | None = None,
    ):
        """Gather information about page *pageno* (0-based) of *pdf*.

        Args:
            pdf: The open document.
            pageno: Index of the page within ``pdf.pages``.
            infile: Path of the document; stored on the instance.
            check_pages: Pages whose content should be examined. Pages
                outside this container only get their geometry recorded.
            detailed_analysis: Also collect text boxes through *miner_state*.
            miner_state: Source of the page layout analysis; used only when
                *detailed_analysis* is true and the page is being checked.
        """
        self._pageno = pageno
        self._infile = infile
        self._detailed_analysis = detailed_analysis
        self._gather_pageinfo(
            pdf, pageno, infile, check_pages, detailed_analysis, miner_state
        )

    def _gather_pageinfo(
        self,
        pdf: Pdf,
        pageno: int,
        infile: PathLike,
        check_pages: Container[int],
        detailed_analysis: bool,
        miner_state: PdfMinerState | None,
    ):
        """Populate every attribute of this object from the page."""

        def as_floats(box) -> list[float]:
            return [float(coord) for coord in box.as_list()]

        page: Page = pdf.pages[pageno]
        media = [Decimal(coord) for coord in page.mediabox.as_list()]
        width_pt = media[2] - media[0]
        height_pt = media[3] - media[1]

        self._artbox = as_floats(page.artbox)
        self._bleedbox = as_floats(page.bleedbox)
        self._cropbox = as_floats(page.cropbox)
        self._mediabox = as_floats(page.mediabox)
        self._trimbox = as_floats(page.trimbox)

        check_this_page = pageno in check_pages

        # Defaults for pages that get no detailed analysis: no text boxes
        # and "no information" about text.
        self._textboxes = []
        self._has_text = None
        if check_this_page and detailed_analysis:
            page_analysis = miner_state.get_page_analysis(pageno)
            if page_analysis is not None:
                self._textboxes = list(
                    simplify_textboxes(page_analysis, get_text_boxes)
                )
            self._has_text = _page_has_text(
                (box.bbox for box in self._textboxes), width_pt, height_pt
            )

        userunit = page.get(Name.UserUnit, Decimal(1.0))
        if not isinstance(userunit, Decimal):
            userunit = Decimal(userunit)
        self._userunit = userunit

        points_per_inch = Decimal(72.0)
        self._width_inches = width_pt * userunit / points_per_inch
        self._height_inches = height_pt * userunit / points_per_inch
        self._rotate = int(getattr(page.obj, 'Rotate', 0))

        userunit_shorthand = (userunit, 0, 0, userunit, 0, 0)

        if check_this_page:
            # NOTE: this resets _has_text, so for checked pages the value
            # finally stored comes from the content stream markers below.
            self._has_vector = False
            self._has_text = False
            self._images = []
            for found in _process_content_streams(
                pdf=pdf, container=page, shorthand=userunit_shorthand
            ):
                if isinstance(found, VectorMarker):
                    self._has_vector = True
                elif isinstance(found, TextMarker):
                    self._has_text = True
                elif isinstance(found, ImageInfo):
                    self._images.append(found)
                else:
                    raise NotImplementedError()
        else:
            self._has_vector = None  # i.e. "no information"
            self._has_text = None
            self._images = []

        self._dpi = None
        if self._images:
            renderable_dpis = (
                image.dpi for image in self._images if image.renderable
            )
            page_dpi = Resolution(0.0, 0.0).take_max(renderable_dpis)
            self._dpi = page_dpi
            self._width_pixels = int(round(page_dpi.x * float(self._width_inches)))
            self._height_pixels = int(
                round(page_dpi.y * float(self._height_inches))
            )

    @property
    def pageno(self) -> int:
        """Return page number (0-based)."""
        return self._pageno

    @property
    def has_text(self) -> bool:
        """Return True if page has text, False if not or unknown."""
        return bool(self._has_text)

    @property
    def has_corrupt_text(self) -> bool:
        """Return True if any text box on the page is flagged as corrupt.

        Raises:
            NotImplementedError: If detailed analysis was not requested.
        """
        if not self._detailed_analysis:
            raise NotImplementedError('Did not do detailed analysis')
        return any(tbox.is_corrupt for tbox in self._textboxes)

    @property
    def has_vector(self) -> bool:
        """Return True if page has vector graphics, False if not or unknown.

        Vector graphics are sometimes used to draw fonts, so it may not be
        obvious on visual inspection whether a page has text or not.
        """
        return bool(self._has_vector)

    @property
    def width_inches(self) -> Decimal:
        """Return width of page in inches."""
        return self._width_inches

    @property
    def height_inches(self) -> Decimal:
        """Return height of page in inches."""
        return self._height_inches

    @property
    def width_pixels(self) -> int:
        """Return width of page in pixels at the page's DPI."""
        exact_width = float(self.width_inches) * self.dpi.x
        return int(round(exact_width))

    @property
    def height_pixels(self) -> int:
        """Return height of page in pixels at the page's DPI."""
        exact_height = float(self.height_inches) * self.dpi.y
        return int(round(exact_height))

    @property
    def rotation(self) -> int:
        """Return rotation of page in degrees.

        Will only be a multiple of 90.
        """
        return self._rotate

    @rotation.setter
    def rotation(self, value):
        if value not in (0, 90, 180, 270, 360, -90, -180, -270):
            raise ValueError("rotation must be a cardinal angle")
        self._rotate = value

    @property
    def cropbox(self) -> FloatRect:
        """Return cropbox of page in PDF coordinates."""
        return self._cropbox

    @property
    def mediabox(self) -> FloatRect:
        """Return mediabox of page in PDF coordinates."""
        return self._mediabox

    @property
    def trimbox(self) -> FloatRect:
        """Return trimbox of page in PDF coordinates."""
        return self._trimbox

    @property
    def artbox(self) -> FloatRect:
        """Return artbox of page in PDF coordinates."""
        return self._artbox

    @property
    def bleedbox(self) -> FloatRect:
        """Return bleedbox of page in PDF coordinates."""
        return self._bleedbox

    @property
    def images(self) -> list[ImageInfo]:
        """Return images."""
        return self._images

    def get_textareas(self, visible: bool | None = None, corrupt: bool | None = None):
        """Return textareas bounding boxes in PDF coordinates on the page.

        Args:
            visible: Keep only boxes whose visibility equals this value;
                ``None`` disables the filter.
            corrupt: Keep only boxes whose corrupt flag equals this value;
                ``None`` disables the filter.

        Raises:
            NotImplementedError: If the page has no text boxes and both
                filters were given.
        """

        def wanted(box: TextboxInfo) -> bool:
            if visible is not None and box.is_visible != visible:
                return False
            if corrupt is not None and box.is_corrupt != corrupt:
                return False
            return True

        if not self._textboxes:
            if visible is not None and corrupt is not None:
                raise NotImplementedError('Incomplete information on textboxes')
            return self._textboxes

        return (box.bbox for box in self._textboxes if wanted(box))

    @property
    def dpi(self) -> Resolution:
        """Return DPI needed to render all images on the page.

        A zero resolution is returned when no DPI was recorded.
        """
        return Resolution(0.0, 0.0) if self._dpi is None else self._dpi

    @property
    def userunit(self) -> Decimal:
        """Return user unit of page."""
        return self._userunit

    @property
    def min_version(self) -> str:
        """Return minimum PDF version needed to render this page."""
        # NOTE(review): _gather_pageinfo always stores a Decimal in
        # _userunit, so this appears to return '1.6' for every page. A
        # comparison against 1 may have been intended -- confirm before
        # changing, since callers see the result.
        return '1.6' if self.userunit is not None else '1.5'

    def page_dpi_profile(self) -> PageResolutionProfile | None:
        """Return information about the DPIs of the page.

        This is useful to detect pages with a small proportion of
        high-resolution content that is forcing us to use a high DPI for the
        whole page. The ratio is weighted by the area of each image. If
        images overlap, the overlapped area counts.

        Vector graphics and text are ignored.

        Returns None if there is no meaningful DPI for the page.
        """
        drawable = [image for image in self._images if image.renderable]
        image_dpis = [image.dpi.to_scalar() for image in drawable]
        image_areas = [image.printed_area for image in drawable]

        total_drawn_area = sum(image_areas)
        if total_drawn_area == 0:
            return None

        weights = [area / total_drawn_area for area in image_areas]
        # Calculate harmonic mean of DPIs weighted by area
        weighted_dpi = statistics.harmonic_mean(image_dpis, weights)
        max_dpi = max(image_dpis)
        # Area of the first image that reaches the maximum DPI.
        max_dpi_area = image_areas[image_dpis.index(max_dpi)]

        return PageResolutionProfile(
            weighted_dpi,
            max_dpi,
            weighted_dpi / max_dpi,
            max_dpi_area / total_drawn_area,
        )

    def __repr__(self):
        """Return string representation."""
        # NOTE(review): the f-string is empty as it appears here, so this
        # returns ''. The original format text was probably lost when this
        # file was extracted -- confirm against the upstream source.
        return (
            f''
        )
# Executor used by PdfInfo when the caller does not supply one.
DEFAULT_EXECUTOR = SerialExecutor()


class PdfInfo:
    """Extract summary information about a PDF without retaining the PDF itself.

    Crucially this lets us get the information in a pure Python format so that
    it can be pickled and passed to a worker process.
    """

    _has_acroform: bool = False
    _has_signature: bool = False
    _needs_rendering: bool = False

    def __init__(
        self,
        infile: Path,
        *,
        detailed_analysis: bool = False,
        progbar: bool = False,
        max_workers: int | None = None,
        use_threads: bool = True,
        check_pages=None,
        executor: Executor = DEFAULT_EXECUTOR,
    ):
        """Open *infile*, scan its pages and record document-level flags.

        Args:
            infile: Path of the PDF to examine.
            detailed_analysis: Also run the pdfminer-based text analysis.
            progbar: Show a progress bar while scanning.
            max_workers: Upper bound on workers; ``None`` lets the scanner
                decide.
            use_threads: Use threads rather than processes.
            check_pages: Pages to examine; ``None`` means every page.
            executor: Executor that runs the per-page tasks.

        Raises:
            EncryptedPdfError: If the PDF reports itself as encrypted.
        """
        self._infile = infile
        if check_pages is None:
            # A range is a cheap container that holds any realistic page number.
            check_pages = range(0, 1_000_000_000)

        with Pdf.open(infile) as pdf:
            if pdf.is_encrypted:
                raise EncryptedPdfError()  # Triggered by encryption with empty passwd

            creator = str(pdf.docinfo.get(Name.Creator, ""))
            pscript5_mode = creator.startswith('PScript5')
            if detailed_analysis:
                self._miner_state = PdfMinerState(infile, pscript5_mode)
            else:
                self._miner_state = nullcontext()

            with self._miner_state as miner_state:
                self._pages = _pdf_pageinfo_concurrent(
                    pdf,
                    executor,
                    max_workers,
                    use_threads,
                    infile,
                    progbar,
                    check_pages=check_pages,
                    detailed_analysis=detailed_analysis,
                    miner_state=miner_state,
                )

            root = pdf.Root
            self._needs_rendering = root.get(Name.NeedsRendering, False)
            if Name.AcroForm in root:
                acroform = root.AcroForm
                has_fields = len(acroform.get(Name.Fields, [])) > 0
                if has_fields or Name.XFA in acroform:
                    self._has_acroform = True
                # Only the lowest bit of /SigFlags is examined.
                self._has_signature = bool(acroform.get(Name.SigFlags, 0) & 1)
            mark_info = root.get(Name.MarkInfo, {})
            self._is_tagged = bool(mark_info.get(Name.Marked, False))

    @property
    def pages(self) -> list[PageInfo | None]:
        """Return list of PageInfo objects, one per page in the PDF."""
        return self._pages

    @property
    def min_version(self) -> str:
        """Return minimum PDF version needed to render this PDF."""
        # The minimum PDF is the maximum version that any particular page needs
        page_versions = (page.min_version for page in self.pages if page)
        return max(page_versions)

    @property
    def has_userunit(self) -> bool:
        """Return True if any page has a user unit other than 1."""
        return any(page.userunit != 1.0 for page in self.pages if page)

    @property
    def has_acroform(self) -> bool:
        """Return True if the document catalog has an AcroForm."""
        return self._has_acroform

    @property
    def has_signature(self) -> bool:
        """Return True if the AcroForm's /SigFlags has its lowest bit set."""
        return self._has_signature

    @property
    def is_tagged(self) -> bool:
        """Return True if the document catalog indicates this is a Tagged PDF."""
        return self._is_tagged

    @property
    def filename(self) -> str | Path:
        """Return filename of PDF.

        Raises:
            NotImplementedError: If the input was not given as a str or Path.
        """
        if isinstance(self._infile, str | Path):
            return self._infile
        raise NotImplementedError("can't get filename from stream")

    @property
    def needs_rendering(self) -> bool:
        """Return the /NeedsRendering value from the document catalog.

        The flag is associated with XFA forms, which are not supported by
        most standard PDF renderers, so we need to detect and suppress them.
        """
        return self._needs_rendering

    def __getitem__(self, item) -> PageInfo:
        """Return PageInfo object for page number `item`."""
        return self._pages[item]

    def __len__(self):
        """Return number of pages in PDF."""
        return len(self._pages)

    def __repr__(self):
        """Return string representation."""
        # NOTE(review): the f-string is empty as it appears here, so this
        # returns ''. The original format text was probably lost when this
        # file was extracted -- confirm against the upstream source.
        return f""
def main():  # pragma: no cover
    """Print the PdfInfo of the file named on the command line.

    The document summary is printed first, then every page followed by the
    images found on it.
    """
    import argparse  # pylint: disable=import-outside-toplevel
    from pprint import pprint  # pylint: disable=import-outside-toplevel

    cli = argparse.ArgumentParser()
    cli.add_argument('infile')
    options = cli.parse_args()

    pdfinfo = PdfInfo(options.infile)
    pprint(pdfinfo)
    for page in pdfinfo.pages:
        pprint(page)
        for image in page.images:
            pprint(image)


if __name__ == '__main__':
    main()
================================================
FILE: src/ocrmypdf/pdfinfo/layout.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Detailed text position and layout analysis, building on pdfminer.six."""
from __future__ import annotations
import re
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from math import copysign
from os import PathLike
from pathlib import Path
from typing import Any
from unittest.mock import patch
import pdfminer
import pdfminer.encodingdb
import pdfminer.pdfdevice
import pdfminer.pdfinterp
import pdfminer.psparser
from deprecation import deprecated
from pdfminer.converter import PDFLayoutAnalyzer
from pdfminer.layout import LAParams, LTChar, LTPage, LTTextBox
from pdfminer.pdfcolor import PDFColorSpace
from pdfminer.pdfdevice import PDFTextSeq
from pdfminer.pdfdocument import PDFTextExtractionNotAllowed
from pdfminer.pdffont import FontWidthDict, PDFFont, PDFSimpleFont, PDFUnicodeNotDefined
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager, PDFTextState
from pdfminer.pdfpage import PDFPage
from pdfminer.utils import Matrix, bbox2str, matrix2str
from ocrmypdf.exceptions import EncryptedPdfError, InputFileError
# Matches a run of one or more ASCII digits.
STRIP_NAME = re.compile(r'[0-9]+')

# Keep a reference to pdfminer.six's own initializer so that the replacement
# defined below can delegate to it.
original_pdfsimplefont_init = PDFSimpleFont.__init__
def pdfsimplefont__init__(
    self,
    descriptor: Mapping[str, Any],
    widths: FontWidthDict,
    spec: Mapping[str, Any],
) -> None:
    """Monkeypatch pdfminer.six PDFSimpleFont.__init__.

    If there is no ToUnicode and no Encoding, pdfminer.six assumes that Unicode
    conversion is possible. This is incorrect, according to PDF Reference Manual
    9.10.2. This patch runs the original initializer and then, for such
    fonts, empties the cid-to-unicode table.
    """
    # Font encoding is specified either by a name of
    # built-in encoding or a dictionary that describes
    # the differences.
    original_pdfsimplefont_init(self, descriptor, widths, spec)

    if self.unicode_map or 'Encoding' in spec:
        # The font says how to map to Unicode; leave it as initialized.
        return
    self.cid2unicode = {}
# Install the patched initializer in place of pdfminer.six's own.
PDFSimpleFont.__init__ = pdfsimplefont__init__

# Patch pdfminer.six buffer size.
# The parser doesn't properly handle keyword tokens that are split across the
# end of the buffer, so increase the buffer size to something far larger than
# will ever be seen.
pdfminer.psparser.PSBaseParser.BUFSIZ = 256 * 1024 * 1024
def pdftype3font__pscript5_get_height(self):
    """Monkeypatch for PScript5.dll PDFs.

    The height of Type3 fonts is known to be incorrect in PScript5.dll
    generated PDFs. This patch uses the bbox height when it is non-zero and
    otherwise falls back to ascent minus descent; the result takes the sign
    of ``vscale``.
    """
    bbox_height = self.bbox[3] - self.bbox[1]
    height = bbox_height if bbox_height != 0 else self.ascent - self.descent
    direction = copysign(1.0, self.vscale)
    return height * direction
def pdftype3font__pscript5_get_descent(self):
    """Monkeypatch for PScript5.dll PDFs.

    The descent of Type3 fonts is known to be incorrect in PScript5.dll
    generated PDFs. This patch multiplies the descent by the sign of
    ``vscale``.
    """
    direction = copysign(1.0, self.vscale)
    return self.descent * direction
def pdftype3font__pscript5_get_ascent(self):
    """Monkeypatch for PScript5.dll PDFs.

    The ascent of Type3 fonts is known to be incorrect in PScript5.dll
    generated PDFs. This patch multiplies the ascent by the sign of
    ``vscale``.
    """
    direction = copysign(1.0, self.vscale)
    return self.ascent * direction
def _is_undefined_char(s: str) -> bool:
    """Tell whether *s* is an undefined-character placeholder.

    A placeholder is any string that starts with ``(cid:`` and ends with
    ``)``.
    """
    has_cid_prefix = s.startswith('(cid:')
    has_closing_paren = s.endswith(')')
    return has_cid_prefix and has_closing_paren
class LTStateAwareChar(LTChar):
    """An LTChar that also remembers the text render mode it was drawn with."""

    __slots__ = (
        'rendermode',
        '_text',
        'matrix',
        'fontname',
        'adv',
        'upright',
        'size',
        'width',
        'height',
        'bbox',
        'x0',
        'x1',
        'y0',
        'y1',
    )

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
        textstate: PDFTextState,
    ) -> None:
        """Initialize the character and record ``textstate.render``.

        All arguments except *textstate* are forwarded unchanged to LTChar.
        """
        super().__init__(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
        )
        self.rendermode = textstate.render

    def is_compatible(self, obj: object) -> bool:
        """Check if characters can be combined into a textline.

        We consider characters compatible if:
            - the Unicode mapping is known, and both have the same render mode
            - the Unicode mapping is unknown but both are part of the same font
              (and have the same render mode)
        """
        # pylint: disable=protected-access
        if not isinstance(obj, LTStateAwareChar):
            return False
        same_rendermode = self.rendermode == obj.rendermode
        if _is_undefined_char(self._text) or _is_undefined_char(obj._text):
            # At least one side has no Unicode mapping: also require the font
            # to match.
            return self.fontname == obj.fontname and same_rendermode
        return same_rendermode

    def get_text(self) -> str:
        """Get text from this character.

        Undefined characters are reported as U+FFFD.
        """
        # '\ufffd' is the standard 'Unknown symbol'
        return '\ufffd' if _is_undefined_char(self._text) else self._text

    def __repr__(self) -> str:
        """Return a string representation of this object."""
        details = " ".join(
            [
                f"{bbox2str(self.bbox)}",
                f"matrix={matrix2str(self.matrix)}",
                f"rendermode={self.rendermode!r}",
                f"font={self.fontname!r}",
                f"adv={self.adv}",
                f"text={self.get_text()!r}",
            ]
        )
        return f"<{self.__class__.__name__} {details}>"
class TextPositionTracker(PDFLayoutAnalyzer):
    """A page layout analyzer that pays attention to text visibility.

    Every character is recorded as an LTStateAwareChar carrying a copy of
    the text state in effect when its string was rendered. The finished
    page layout is kept in ``result``.
    """

    textstate: PDFTextState

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: LAParams | None = None,
    ):
        """Initialize the layout analyzer; no result is available yet."""
        super().__init__(rsrcmgr, pageno, laparams)
        self.result: LTPage | None = None

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Begin processing of a page."""
        super().begin_page(page, ctm)
        # Replace the current item with a page built from the mediabox.
        self.cur_item = LTPage(self.pageno, page.mediabox)

    def end_page(self, page: PDFPage) -> None:
        """End processing of a page and hand the layout to receive_layout."""
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        layout = self.cur_item
        if self.laparams is not None:
            layout.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(layout)

    def render_string(
        self,
        textstate: PDFTextState,
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> None:
        """Respond to render string event by updating text state."""
        # Keep a private copy so render_char can see the state of this string.
        self.textstate = textstate.copy()
        super().render_string(self.textstate, seq, ncs, graphicstate)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Respond to render char event by adding a state-aware character.

        Returns the advance (``adv``) of the character that was added.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)

        char = LTStateAwareChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            font.char_width(cid),
            font.char_disp(cid),
            ncs,
            graphicstate,
            self.textstate,
        )
        self.cur_item.add(char)
        return char.adv

    def receive_layout(self, ltpage: LTPage) -> None:
        """Receive layout handler; stores the page as the result."""
        self.result = ltpage

    def get_result(self) -> LTPage | None:
        """Get the result of the analysis."""
        return self.result
@contextmanager
def patch_pdfminer(pscript5_mode: bool):
    """Patch pdfminer.six to work around bugs in PDFs created by PScript5.

    With *pscript5_mode* false this context manager does nothing. Otherwise
    the ascent, descent and height getters of ``PDFType3Font`` are replaced
    for the duration of the context.
    """
    if not pscript5_mode:
        yield
        return

    with patch.multiple(
        'pdfminer.pdffont.PDFType3Font',
        spec=True,
        get_ascent=pdftype3font__pscript5_get_ascent,
        get_descent=pdftype3font__pscript5_get_descent,
        get_height=pdftype3font__pscript5_get_height,
    ):
        yield
@deprecated(deprecated_in='16.6.0', details='Use PdfMinerState instead.')
def get_page_analysis(
    infile: PathLike, pageno: int, pscript5_mode: bool
) -> LTPage | None:
    """Get the page analysis for a given page.

    Opens *infile*, lets pdfminer process page *pageno* (counting from 0)
    and returns the resulting layout.

    Raises:
        InputFileError: If pdfminer yields no page for *pageno*.
        EncryptedPdfError: If pdfminer reports that text extraction is not
            allowed.
    """
    rman = pdfminer.pdfinterp.PDFResourceManager(caching=True)
    # boxes_flow=None disables pdfminer's boxes-flow ordering.
    layout_params = LAParams(all_texts=True, detect_vertical=True, boxes_flow=None)
    dev = TextPositionTracker(rman, laparams=layout_params)
    interp = pdfminer.pdfinterp.PDFPageInterpreter(rman, dev)

    with patch_pdfminer(pscript5_mode):
        try:
            with Path(infile).open('rb') as stream:
                pages = PDFPage.get_pages(stream, pagenos=[pageno], maxpages=0)
                page = next(pages, None)
                if page is None:
                    raise InputFileError(
                        f"pdfminer could not process page {pageno} (counting from 0)."
                    )
                interp.process_page(page)
        except PDFTextExtractionNotAllowed as e:
            raise EncryptedPdfError() from e

    return dev.get_result()
class PdfMinerState:
    """Provide a context manager for using pdfminer.six.

    This ensures that the file is closed. It also provides a cache of pages
    from the PDF so that they can be reused if needed, to improve performance.
    """

    def __init__(self, infile: Path, pscript5_mode: bool) -> None:
        """Initialize the context manager.

        Args:
            infile: The path to the PDF file to be analyzed.
            pscript5_mode: Whether the PDF was generated by PScript5.dll.
        """
        self.infile = infile
        self.rman = pdfminer.pdfinterp.PDFResourceManager(caching=True)
        self.disable_boxes_flow = None
        self.page_iter = None
        self.page_cache: list[PDFPage] = []
        self.pscript5_mode = pscript5_mode
        self.file = None

    def __enter__(self):
        """Open the input file and start iterating over its pages."""
        self.file = Path(self.infile).open('rb')
        self.page_iter = PDFPage.get_pages(self.file)
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the input file.

        Nothing truthy is returned, so an exception raised inside the ``with``
        block propagates to the caller. Returning ``True`` here would silently
        swallow every error, including ``InputFileError`` raised by
        ``get_page_analysis`` and ``KeyboardInterrupt``.
        """
        if self.file:
            self.file.close()

    def get_page_analysis(self, pageno: int):
        """Get the page analysis for a given page.

        Pages are pulled from pdfminer lazily and cached, so a page that was
        already reached is not parsed from the file again.

        Args:
            pageno: Page to analyze, counting from 0.

        Returns:
            The result reported by the ``TextPositionTracker`` device.

        Raises:
            InputFileError: If the file has no page *pageno*, or the cached
                page object is falsy.
        """
        # Extend the cache until it contains the requested page.
        while len(self.page_cache) <= pageno:
            try:
                self.page_cache.append(next(self.page_iter))
            except StopIteration:
                raise InputFileError(
                    f"pdfminer did not find page {pageno} in the input file."
                ) from None
        page = self.page_cache[pageno]
        if not page:
            raise InputFileError(
                f"pdfminer could not process page {pageno} (counting from 0)."
            )
        # A fresh device per call, so results of different pages do not mix.
        dev = TextPositionTracker(
            self.rman,
            laparams=LAParams(
                all_texts=True, detect_vertical=True, boxes_flow=self.disable_boxes_flow
            ),
        )
        interp = pdfminer.pdfinterp.PDFPageInterpreter(self.rman, dev)
        with patch_pdfminer(self.pscript5_mode):
            interp.process_page(page)
        return dev.get_result()
def get_text_boxes(obj) -> Iterator[LTTextBox]:
    """Yield each ``LTTextBox`` found while walking the children of *obj*.

    Children that are text boxes are yielded directly and not descended into;
    every other child is searched recursively.
    """
    for node in obj:
        if isinstance(node, LTTextBox):
            yield node
            continue
        try:
            yield from get_text_boxes(node)
        except TypeError:
            # Iterating a non-iterable child raises TypeError; skip it. Any
            # other TypeError raised during the recursion is skipped as well.
            pass
================================================
FILE: src/ocrmypdf/pluginspec.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""OCRmyPDF pluggy plugin specification."""
from __future__ import annotations
from abc import ABC, abstractmethod
from argparse import ArgumentParser
from collections.abc import Sequence, Set
from enum import StrEnum
from logging import Handler
from pathlib import Path
from typing import TYPE_CHECKING, NamedTuple
import pluggy
from pydantic import BaseModel
from ocrmypdf import Executor, PdfContext
from ocrmypdf._options import OcrOptions
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.helpers import Resolution
if TYPE_CHECKING:
from PIL import Image
# pylint: disable=ungrouped-imports
from ocrmypdf._jobcontext import PageContext
from ocrmypdf.hocrtransform import OcrElement
from ocrmypdf.pdfinfo import PdfInfo
# pylint: enable=ungrouped-imports
class GhostscriptRasterDevice(StrEnum):
    """Possible raster devices for Ghostscript.

    As a ``StrEnum``, each member is also a plain string equal to its value.
    """

    # NOTE(review): the values look like Ghostscript output device names;
    # confirm against the Ghostscript documentation before adding new ones.
    JPEGGRAY = 'jpeggray'
    JPEGCOLOR = 'jpeg'
    PNGMONO = 'pngmono'
    PNGGRAY = 'pnggray'
    PNG256 = 'png256'
    PNG16M = 'png16m'
# Decorator that marks the functions below as pluggy hook specifications for
# the 'ocrmypdf' project name.
hookspec = pluggy.HookspecMarker('ocrmypdf')
# pylint: disable=unused-argument
# mypy: disable-error-code=empty-body
@hookspec(firstresult=True)
def get_logging_console() -> Handler:  # type: ignore[return-value]
    """Returns a custom logging handler.

    Generally this is necessary when logging output and a progress bar are
    both writing to ``sys.stderr``.

    Note:
        This is a :ref:`firstresult hook`.
    """
@hookspec
def initialize(plugin_manager: pluggy.PluginManager) -> None:
    """Called when this plugin is first loaded into OCRmyPDF.

    The primary intended use of this is for plugins to check compatibility with
    other plugins and possibly block other plugins. For example, a plugin that
    wishes to block ocrmypdf's built-in optimize plugin could do:

    .. code-block::

        plugin_manager.set_blocked('ocrmypdf.builtin_plugins.optimize')

    It would also be reasonable for a plugin implementation to check if it is
    unable to proceed, for example, because a required dependency is missing.
    (If the plugin's ability to proceed depends on options and arguments, use
    ``validate`` instead.)

    Args:
        plugin_manager: The pluggy plugin manager that is loading this plugin.

    Raises:
        ocrmypdf.exceptions.ExitCodeException: If options are not acceptable
            and the application should terminate gracefully with an informative
            message and error code.

    Note:
        This hook will be called from the main process, and may modify global
        state before child worker processes are forked.
    """
@hookspec
def add_options(parser: ArgumentParser) -> None:
    """Allows the plugin to add its own command line and API arguments.

    OCRmyPDF converts command line arguments to API arguments, so adding
    arguments here will cause new arguments to be processed for API calls
    to ``ocrmypdf.ocr``, or when invoked on the command line.

    Args:
        parser: The argument parser to which the plugin adds its arguments.

    Note:
        This hook will be called from the main process, and may modify global
        state before child worker processes are forked.
    """
@hookspec
def register_options() -> dict[str, type[BaseModel]]:
    """Return plugin's option models keyed by namespace.

    This hook allows plugins to register their option models with the
    plugin option registry. The returned dictionary should map namespace
    strings to Pydantic model classes.

    Returns:
        Dictionary mapping namespace strings to BaseModel classes.

    Example:
        .. code-block:: python

            @hookimpl
            def register_options():
                return {'tesseract': TesseractOptions}

    Note:
        This hook will be called from the main process during plugin
        infrastructure setup, before child worker processes are forked.
    """
@hookspec
def check_options(options: OcrOptions) -> None:
    """Called to ask the plugin to check all of the options.

    The plugin may check if options that it added are valid.

    Warnings or other messages may be passed to the user by creating a logger
    object using ``log = logging.getLogger(__name__)`` and logging to this.

    The plugin may also modify the *options*. All objects that are in options
    must be picklable so they can be marshalled to child worker processes.

    Args:
        options: The options to check, and possibly modify.

    Raises:
        ocrmypdf.exceptions.ExitCodeException: If options are not acceptable
            and the application should terminate gracefully with an informative
            message and error code.

    Note:
        This hook will be called from the main process, and may modify global
        state before child worker processes are forked.
    """
@hookspec(firstresult=True)
def get_executor(progressbar_class: type[ProgressBar]) -> Executor:  # type: ignore[return-value]
    """Called to obtain an object that manages parallel execution.

    This may be used to replace OCRmyPDF's default parallel execution system
    with a third party alternative. For example, you could make OCRmyPDF run in
    a distributed environment.

    OCRmyPDF's executors are analogous to the standard Python executors in
    ``concurrent.futures``, but they do not work the same way. Executors may
    be reused for different, unrelated batch operations, since all of the
    context for a given job is passed to :meth:`Executor.__call__`.

    Should be of type :class:`Executor` or otherwise conforming to the protocol
    of that call.

    Arguments:
        progressbar_class: A progress bar class (see
            ``get_progressbar_class``) supplied to the executor.

    Note:
        This hook will be called from the main process, and may modify global
        state before child worker processes are forked.

    Note:
        This is a :ref:`firstresult hook`.
    """
@hookspec(firstresult=True)
def get_progressbar_class() -> type[ProgressBar]:  # type: ignore[return-value]
    """Called to obtain a class that can be used to monitor progress.

    OCRmyPDF will call this function when it wants to display a progress bar.
    The class returned by this function must be compatible with the
    :class:`ProgressBar` protocol.

    Example:
        Here is how OCRmyPDF will use the progress bar:

        .. code-block:: python

            pbar_class = pm.hook.get_progressbar_class()
            with pbar_class(**progress_kwargs) as pbar:
                ...  # do some work
                pbar.update(1)
    """
@hookspec
def validate(pdfinfo: PdfInfo, options: OcrOptions) -> None:
    """Called to give a plugin an opportunity to review *options* and *pdfinfo*.

    *options* contains the "work order" to process a particular file. *pdfinfo*
    contains information about the input file obtained after loading and
    parsing. The plugin may modify the *options*. For example, you could decide
    that a certain type of file should be treated with
    ``options.force_ocr = True`` based on information in its *pdfinfo*.

    Args:
        pdfinfo: Information about the input file.
        options: The options for processing this file.

    Raises:
        ocrmypdf.exceptions.ExitCodeException: If options or pdfinfo are not
            acceptable and the application should terminate gracefully with an
            informative message and error code.

    Note:
        This hook will be called from the main process, and may modify global
        state before child worker processes are forked.
    """
@hookspec(firstresult=True)
def rasterize_pdf_page(
    input_file: Path,
    output_file: Path,
    raster_device: GhostscriptRasterDevice,
    raster_dpi: Resolution,
    pageno: int,
    page_dpi: Resolution | None,
    rotation: int | None,
    filter_vector: bool,
    stop_on_soft_error: bool,
    options: OcrOptions | None,
    use_cropbox: bool,
) -> Path:  # type: ignore[return-value]
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    The image is sized to match the integer pixel dimensions implied by
    raster_dpi even if those numbers are noninteger. The image's DPI will
    be overridden with the values in page_dpi.

    Args:
        input_file: The PDF to rasterize.
        output_file: The desired name of the rasterized image.
        raster_device: Type of image to produce at output_file.
        raster_dpi: Resolution in dots per inch at which to rasterize page.
        pageno: Page number to rasterize (beginning at page 1).
        page_dpi: Resolution, overriding output image DPI.
        rotation: Cardinal angle, clockwise, to rotate page.
        filter_vector: If True, remove vector graphics objects.
        stop_on_soft_error: If there is a "soft error" such that PDF page image
            generation can proceed, but may visually differ from the original,
            the implementer of this hook should raise a detailed exception. If
            ``False``, continue processing and report by logging it. If the hook
            cannot proceed, it should always raise an exception, regardless of
            this setting. One "soft error" would be a missing font that is
            required to properly rasterize the PDF.
        options: OCRmyPDF options. Plugins may use this to check settings like
            ``options.rasterizer`` to determine whether they should handle the
            request or defer to another plugin. Introduced in version 17.0.
        use_cropbox: If True, rasterize the page's CropBox instead of the
            MediaBox. Default is False (use MediaBox) for consistency with
            Ghostscript's default behavior.

    Returns:
        Path: output_file if successful

    Note:
        This hook will be called from child processes. Modifying global state
        will not affect the main process or other child processes.

    Note:
        This is a :ref:`firstresult hook`.
    """
@hookspec(firstresult=True)
def filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image:  # type: ignore[return-value]
    """Called to filter the image before it is sent to OCR.

    This is the image that OCR sees, not what the user sees when they view the
    PDF. In certain modes such as ``--redo-ocr``, portions of the image may be
    masked out to hide them from OCR.

    The main uses of this hook are expected to be hiding content from OCR,
    conditioning images to OCR better with filters, and adjusting images to
    match any constraints imposed by the OCR engine.

    The input image may be color, grayscale, or monochrome, and the
    output image may differ. For example, if you know that a custom OCR engine
    does not care about the color of the text, you could convert the image to
    grayscale or monochrome.

    Generally speaking, the output image should be a faithful representation
    of the input image. You *may* change the pixel width and height of the
    input image, but you must not change the aspect ratio, and you must
    calculate the DPI of the output image based on the new pixel width and
    height or the OCR text layer will be misaligned with the visual position.

    The built-in Tesseract OCR engine uses this hook itself to downsample
    very large images to fit its constraints.

    Args:
        page: Context for this page.
        image: The image that is about to be sent to OCR.

    Returns:
        The image that OCR should see.

    Note:
        This hook will be called from child processes. Modifying global state
        will not affect the main process or other child processes.

    Note:
        This is a :ref:`firstresult hook`.
    """
@hookspec(firstresult=True)
def filter_page_image(page: PageContext, image_filename: Path) -> Path:  # type: ignore[return-value]
    """Called to filter the whole page before it is inserted into the PDF.

    A whole page image is only produced when preprocessing command line
    arguments are issued or when ``--force-ocr`` is issued. If no whole page
    image is produced for a given page, this function will not be called. This
    is not the image that will be shown to OCR.

    If the function does not want to modify the image, it should return
    ``image_filename``. The hook may overwrite ``image_filename`` with a new
    file.

    The output image should preserve the same physical unit dimensions, that is
    ``(width / dpi_x, height / dpi_y)``. That is, if the image is resized, the
    DPI must be scaled by the same factor. If this is not preserved, the PDF
    page will be resized and the OCR layer misaligned. OCRmyPDF does nothing
    to enforce these constraints; it is up to the plugin to do sensible things.

    OCRmyPDF will create the PDF page based on the image format used (unless the
    hook is overridden). If you convert the image to a JPEG, the output page
    will be created as a JPEG, etc. If you change the colorspace, that change
    will be kept. Note that the OCRmyPDF image optimization stage, if enabled,
    may ultimately choose a different format.

    If the return value is a file that does not exist, ``FileNotFoundError``
    will occur. The return value should be a path to a file in the same folder
    as ``image_filename``.

    Args:
        page: Context for this page.
        image_filename: Filename of the whole page image.

    Returns:
        Path of the filtered image, or ``image_filename`` if unchanged.

    Note:
        This hook will be called from child processes. Modifying global state
        will not affect the main process or other child processes.

    Note:
        This is a :ref:`firstresult hook`.
    """
@hookspec(firstresult=True)
def filter_pdf_page(page: PageContext, image_filename: Path, output_pdf: Path) -> Path:  # type: ignore[return-value]
    """Called to convert a filtered whole page image into a PDF.

    A whole page image is only produced when preprocessing command line
    arguments are issued or when ``--force-ocr`` is issued. If no whole page
    image is produced for a given page, this function will not be called. This
    is not the image that will be shown to OCR. The whole page image is
    filtered in the ``filter_page_image`` hook, then this function is called
    for PDF conversion.

    This function will only be called when OCRmyPDF runs in a mode such as
    "force OCR" mode where rasterizing of all content is performed.

    Clever things could be done at this stage such as segmenting the page image
    into color regions or vector equivalents.

    The provider of the hook implementation is responsible for ensuring that the
    OCR text layer is aligned with the PDF produced here, or text misalignment
    will result.

    Currently this function must produce a single page PDF or the pipeline will
    fail. If the intent is to remove the PDF, then create a single page empty
    PDF.

    Args:
        page: Context for this page.
        image_filename: Filename of the input image used to create output_pdf,
            for "reference" if recreating the output_pdf entirely.
        output_pdf: The previously created output_pdf.

    Returns:
        output_pdf

    Note:
        This hook will be called from child processes. Modifying global state
        will not affect the main process or other child processes.

    Note:
        This is a :ref:`firstresult hook`.
    """
class OrientationConfidence(NamedTuple):
    """Expresses an OCR engine's confidence in page rotation.

    Attributes:
        angle: The clockwise angle (0, 90, 180, 270) that the page should be
            rotated. 0 means no rotation.
        confidence: How confident the OCR engine is that this is the correct
            rotation. 0 is not confident, 15 is very confident. Arbitrary units.
    """

    angle: int
    confidence: float
class OcrEngine(ABC):
    """A class representing an OCR engine with capabilities similar to Tesseract OCR.

    This could be used to create a plugin for another OCR engine instead of
    Tesseract OCR.
    """

    @staticmethod
    @abstractmethod
    def version() -> str:
        """Returns the version of the OCR engine."""

    @staticmethod
    @abstractmethod
    def creator_tag(options: OcrOptions) -> str:
        """Returns the creator tag to identify this software's role in creating the PDF.

        This tag will be inserted in the XMP metadata and DocumentInfo dictionary
        as appropriate. Ideally you should include the name of the OCR engine and
        its version. The text should not contain line breaks. This is to help
        developers like yourself identify the software that produced this file.

        OCRmyPDF will always prepend its name to this value.
        """

    @abstractmethod
    def __str__(self) -> str:
        """Returns name of OCR engine and version.

        This is used when OCRmyPDF wants to mention the name of the OCR engine
        to the user, usually in an error message.
        """

    @staticmethod
    @abstractmethod
    def languages(options: OcrOptions) -> Set[str]:
        """Returns the set of all languages that are supported by the engine.

        Languages are typically given as 3-letter ISO 639-2 codes, but actually
        can be any value understood by the OCR engine.
        """

    @staticmethod
    @abstractmethod
    def get_orientation(input_file: Path, options: OcrOptions) -> OrientationConfidence:
        """Returns the orientation of the image."""

    @staticmethod
    def get_deskew(input_file: Path, options: OcrOptions) -> float:
        """Returns the deskew angle of the image, in degrees.

        This default implementation always reports ``0.0``; it is not abstract,
        so engines only need to override it if they can measure skew.
        """
        return 0.0

    @staticmethod
    @abstractmethod
    def generate_hocr(
        input_file: Path, output_hocr: Path, output_text: Path, options: OcrOptions
    ) -> None:
        """Called to produce a hOCR file from a page image and sidecar text file.

        A hOCR file is an HTML-like file that describes the position of text on a
        page. OCRmyPDF can create a text only PDF from the hOCR file and graft it
        onto the output PDF.

        This function executes in a worker thread or worker process. OCRmyPDF
        automatically parallelizes OCR over pages. The OCR engine should not
        introduce more parallelism.

        Args:
            input_file: A page image on which to perform OCR.
            output_hocr: The expected name of the output hOCR file.
            output_text: The expected name of a text file containing the
                recognized text.
            options: The command line options.
        """

    @staticmethod
    @abstractmethod
    def generate_pdf(
        input_file: Path, output_pdf: Path, output_text: Path, options: OcrOptions
    ) -> None:
        """Called to produce a text only PDF from a page image.

        A text only PDF should contain no visible material of any kind, as it
        will be grafted onto the input PDF page. It must be sized to the
        exact dimensions of the input image.

        This function executes in a worker thread or worker process. OCRmyPDF
        automatically parallelizes OCR over pages. The OCR engine should not
        introduce more parallelism.

        Args:
            input_file: A page image on which to perform OCR.
            output_pdf: The expected name of the output PDF.
            output_text: The expected name of a text file containing the
                recognized text.
            options: The command line options.
        """

    @staticmethod
    def supports_generate_ocr() -> bool:
        """Return True if this engine supports the generate_ocr() API.

        The pipeline uses this to determine whether to call generate_ocr()
        or fall back to generate_hocr().

        Returns:
            False by default. Engines implementing generate_ocr() should
            override this to return True.
        """
        return False

    @staticmethod
    def generate_ocr(
        input_file: Path,
        options: OcrOptions,
        page_number: int = 0,
    ) -> tuple[OcrElement, str]:
        """Generate OCR results as an OcrElement tree.

        This is the modern API for OCR engines. Engines implementing this method
        can return structured OCR results directly without intermediate file
        formats.

        This function executes in a worker thread or worker process. OCRmyPDF
        automatically parallelizes OCR over pages. The OCR engine should not
        introduce more parallelism.

        Args:
            input_file: A page image on which to perform OCR.
            options: The command line options.
            page_number: Zero-indexed page number (for multi-page context).

        Returns:
            A tuple of (OcrElement tree for the page, plain text content).
            The OcrElement should have ocr_class=OcrClass.PAGE as its root.

        Raises:
            NotImplementedError: Always, in this default implementation.

        Note:
            This method is optional. Engines that don't implement it should
            leave the default implementation, and the pipeline will fall back to
            generate_hocr() or generate_pdf().
        """
        raise NotImplementedError("This OcrEngine does not implement generate_ocr()")
@hookspec(firstresult=True)
def get_ocr_engine(options: OcrOptions | None) -> OcrEngine:  # type: ignore[return-value]
    """Returns an OcrEngine to use for processing this file.

    The OcrEngine may be instantiated multiple times, by both the main process
    and child processes.

    When multiple OCR engine plugins are installed, plugins should check
    ``options.ocr_engine`` and return ``None`` if they are not the selected
    engine. The hook caller will then try the next plugin.

    Args:
        options: The current OcrOptions, used to determine which engine
            to select. May be None for backward compatibility with external
            plugins.

    Note:
        This is a :ref:`firstresult hook`.
    """
@hookspec(firstresult=True)
def generate_pdfa(
    pdf_pages: list[Path],
    pdfmark: Path,
    output_file: Path,
    context: PdfContext,
    pdf_version: str,
    pdfa_part: str,
    progressbar_class: type[ProgressBar] | None,
    stop_on_soft_error: bool,
) -> Path:  # type: ignore[return-value]
    """Generate a PDF/A.

    This API strongly assumes a PDF/A generator with Ghostscript's semantics.

    OCRmyPDF will modify the metadata and possibly linearize the PDF/A after it
    is generated.

    Arguments:
        pdf_pages: A list of one or more filenames, will be merged into
            output_file.
        pdfmark: A PostScript file intended for Ghostscript with details on
            how to perform the PDF/A conversion.
        output_file: The name of the desired output file.
        context: The current context.
        pdf_version: The minimum PDF version that the output file should be.
            At its own discretion, the PDF/A generator may raise the version,
            but should not lower it.
        pdfa_part: The desired PDF/A compliance level, such as ``'2b'``.
        progressbar_class: The class of a progress bar, which must implement
            the ProgressBar protocol. If None, no progress is reported.
        stop_on_soft_error: If there is a "soft error" such that PDF/A
            generation can proceed and produce a valid PDF/A, but output may be
            invalid or may not visually resemble the original, the implementer
            of this hook should raise a detailed exception. If ``False``,
            continue processing and report by logging it. If the hook cannot
            proceed, it should always raise an exception, regardless of this
            setting.

    Returns:
        Path: If successful, the hook should return ``output_file``.

    Note:
        This is a :ref:`firstresult hook`.

    Note:
        Before version 15.0.0, the ``context`` was not provided and
        ``compression`` was provided instead. Plugins should now read the
        context object to determine if compression is requested.
    """
@hookspec(firstresult=True)
def optimize_pdf(
    input_pdf: Path,
    output_pdf: Path,
    context: PdfContext,
    executor: Executor,
    linearize: bool,
) -> tuple[Path, Sequence[str]]:  # type: ignore[return-value]
    """Optimize a PDF after image, OCR and metadata processing.

    If the input_pdf is a PDF/A, the plugin should modify input_pdf in a way
    that preserves the PDF/A status, or report to the user when this is not
    possible.

    If the implementation fails to produce a smaller file than the input file,
    it should return input_pdf instead.

    A plugin that implements a new optimizer may need to suppress the built-in
    optimizer by implementing an ``initialize`` hook.

    Arguments:
        input_pdf: The input PDF, which has OCR added.
        output_pdf: The requested filename of the output PDF which should be
            created by this optimization hook.
        context: The current context.
        executor: An initialized executor which may be used during optimization,
            to distribute optimization tasks.
        linearize: If True, OCRmyPDF requires ``optimize_pdf`` to return a
            linearized, also known as fast web view PDF.

    Returns:
        Path: If optimization is successful, the hook should return
            ``output_pdf``. If optimization does not produce a smaller file,
            the hook should return ``input_pdf``.
        Sequence[str]: Any comments that the plugin wishes to report to the
            user, especially reasons it was not able to further optimize the
            file. For example, the plugin could report that a required third
            party was not installed, so a specific optimization was not
            attempted.

    Note:
        This is a :ref:`firstresult hook`.
    """
@hookspec(firstresult=True)
def is_optimization_enabled(context: PdfContext) -> bool:  # type: ignore[return-value]
    """For a given PdfContext, OCRmyPDF asks the plugin if optimization is enabled.

    An optimization plugin might be installed and active but could be disabled
    by user settings.

    If this returns False, OCRmyPDF will take certain actions to finalize the
    PDF.

    Args:
        context: The context for which the question is asked.

    Returns:
        True if the plugin's optimization is enabled.

    Note:
        This is a :ref:`firstresult hook`.
    """
================================================
FILE: src/ocrmypdf/py.typed
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
# ocrmypdf is typed
================================================
FILE: src/ocrmypdf/quality.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Utilities to measure OCR quality."""
from __future__ import annotations
import re
from collections.abc import Iterable
class OcrQualityDictionary:
    """Word list used to estimate how plausible a piece of OCR text is."""

    def __init__(self, *, wordlist: Iterable[str]):
        """Build the lookup set from *wordlist*.

        Entries whose capitalization matters should be stored capitalized.
        Entries containing spaces or punctuation can never be matched, because
        candidate words are split on those characters.
        """
        self.dictionary = set(wordlist)

    def measure_words_matched(self, ocr_text: str) -> float:
        """Return the fraction of unique OCR words found in the dictionary.

        Digits, underscores and non-word characters act as separators, and
        only words of at least three characters are considered. A word matches
        if it is in the dictionary as written, or if it contains uppercase
        characters and its lowercase form is in the dictionary.

        Returns:
            Number of unique words that match divided by the number of unique
            candidate words, or ``0.0`` when nothing matches.
        """
        without_digits = re.sub(r"[0-9_]+", ' ', ocr_text)
        only_words = re.sub(r'\W+', ' ', without_digits)
        # After the substitutions the only separator left is a plain space.
        candidates = {word for word in only_words.split() if len(word) >= 3}

        def is_known(word: str) -> bool:
            if word in self.dictionary:
                return True
            lowered = word.lower()
            return lowered != word and lowered in self.dictionary

        matches = sum(1 for word in candidates if is_known(word))
        if matches <= 0:
            return 0.0
        return matches / len(candidates)
================================================
FILE: src/ocrmypdf/subprocess/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Wrappers to manage subprocess calls."""
from __future__ import annotations
import logging
import os
import re
import sys
from collections.abc import Callable, Mapping, Sequence
from contextlib import suppress
from pathlib import Path
from subprocess import PIPE, STDOUT, CalledProcessError, CompletedProcess, Popen
from subprocess import run as subprocess_run
from packaging.version import Version
from ocrmypdf.exceptions import MissingDependencyError
# pylint: disable=logging-format-interpolation
# Module logger; ``_fix_process_args`` derives a child logger per program from it.
log = logging.getLogger(__name__)
# A command line: the program followed by its arguments.
Args = Sequence[Path | str]
# An environment mapping, either user-supplied or ``os.environ`` itself.
Environ = Mapping[str, str] | os._Environ  # pylint: disable=protected-access
def run(
    args: Args,
    *,
    env: Environ | None = None,
    logs_errors_to_stdout: bool = False,
    check: bool = False,
    **kwargs,
) -> CompletedProcess:
    """Wrapper around :py:func:`subprocess.run`.

    Besides running the process, this logs the subprocess's error output at
    debug level on a logger named after the program, whether or not the call
    succeeded. Arguments are first passed through ``_fix_process_args``, which
    on Windows may rewrite them to locate the program.

    Arguments should be identical to ``subprocess.run``, except for following:

    Args:
        args: Positional arguments to pass to ``subprocess.run``.
        env: A set of environment variables. If None, the OS environment is
            used.
        logs_errors_to_stdout: If True, indicates that the process writes its
            error messages to stdout rather than stderr, so stdout should be
            logged if there is an error. If False, stderr is logged. Could be
            used with stderr=STDOUT, stdout=PIPE for example.
        check: If True, raise an exception if the process exits with a non-zero
            status code. If False, the return value will indicate success or
            failure.
        kwargs: Additional arguments to pass to ``subprocess.run``.
    """
    args, env, process_log, _text = _fix_process_args(args, env, kwargs)

    stream_name = 'stdout' if logs_errors_to_stdout else 'stderr'
    captured = None
    try:
        proc = subprocess_run(args, env=env, check=check, **kwargs)
        captured = getattr(proc, stream_name, None)
    except CalledProcessError as e:
        # Grab the failing process's output for logging, then re-raise.
        captured = getattr(e, stream_name, None)
        raise
    finally:
        if process_log.isEnabledFor(logging.DEBUG) and captured:
            try:
                captured = captured.decode('utf-8', 'replace')
            except (AttributeError, UnicodeDecodeError):
                # Already text (no .decode) or undecodable: log it as it is.
                pass
            message = (
                "stdout/stderr = %s" if logs_errors_to_stdout else "stderr = %s"
            )
            process_log.debug(message, captured)
    return proc
def run_polling_stderr(
    args: Args,
    *,
    callback: Callable[[str], None],
    check: bool = False,
    env: Environ | None = None,
    **kwargs,
) -> CompletedProcess:
    """Run a process like ``ocrmypdf.subprocess.run``, and poll stderr.

    Every line produced on stderr will be forwarded to the callback function.
    The intended use is monitoring progress of subprocesses that output their
    own progress indicators. In addition, each line will be logged if debug
    logging is enabled.

    Requires stderr to be opened in text mode for ease of handling errors. In
    addition the expected encoding= and errors= arguments should be set. Note
    that if stdout is already set up, it need not be binary.

    Args:
        args: The command line to run.
        callback: Called once per stderr line, with the line including its
            line terminator.
        check: If True, raise ``CalledProcessError`` on a non-zero exit status.
        env: Environment variables. If None, the OS environment is used.
        kwargs: Additional arguments passed to ``subprocess.Popen``; must
            include ``text=True``.

    Returns:
        A ``CompletedProcess`` whose ``stderr`` is the concatenation of all
        lines read and whose ``stdout`` is None.

    Raises:
        CalledProcessError: If *check* is true and the exit status is non-zero.
    """
    args, env, process_log, text = _fix_process_args(args, env, kwargs)
    assert text, "Must use text=True"

    lines: list[str] = []
    with Popen(args, env=env, **kwargs) as proc:
        # Drain stderr to EOF unconditionally. Gating the read on
        # ``proc.poll() is None`` would lose all output of a child that exits
        # before the first poll, and would busy-spin when stderr is not piped.
        if proc.stderr is not None:
            for msg in iter(proc.stderr.readline, ''):
                if process_log.isEnabledFor(logging.DEBUG):
                    process_log.debug(msg.strip())
                callback(msg)
                lines.append(msg)
        # Block until the child has exited so returncode is populated.
        proc.wait()
    stderr = ''.join(lines)

    if check and proc.returncode != 0:
        raise CalledProcessError(proc.returncode, args, output=None, stderr=stderr)
    return CompletedProcess(args, proc.returncode, None, stderr=stderr)
def _fix_process_args(
    args: Args, env: Environ | None, kwargs
) -> tuple[Args, Environ, logging.Logger, bool]:
    """Normalize the inputs shared by the subprocess wrappers.

    Returns:
        A tuple of the (possibly rewritten) args, the environment to use
        (``os.environ`` when *env* is None or empty), a child logger named
        after the program's basename, and whether ``text`` is truthy in
        *kwargs*.
    """
    env = env or os.environ

    executable = str(args[0])
    if sys.platform == 'win32':
        # Search in spoof path if necessary
        # pylint: disable=import-outside-toplevel
        from ocrmypdf.subprocess._windows import fix_windows_args

        args = fix_windows_args(executable, args, env)

    log.debug("Running: %s", args)
    child_logger = log.getChild(os.path.basename(executable))
    wants_text = bool(kwargs.get('text', False))
    return args, env, child_logger, wants_text
def get_version(
    program: str,
    *,
    version_arg: str = '--version',
    regex=r'(\d+(\.\d+)*)',
    env: Environ | None = None,
) -> str:
    """Get the version of the specified program.

    Arguments:
        program: The program to version check.
        version_arg: The argument needed to ask for its version, e.g.
            ``--version``.
        regex: A regular expression matched against the start of the program's
            stripped output; its first group is the version.
        env: Custom ``os.environ`` in which to run program.

    Returns:
        The text captured by the first group of *regex*.

    Raises:
        MissingDependencyError: If the program cannot be found, exits with an
            error, or prints output that *regex* does not match.
    """
    command = [program, version_arg]
    try:
        completed = run(
            command,
            close_fds=True,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            check=True,
            env=env,
        )
    except FileNotFoundError as e:
        raise MissingDependencyError(
            f"Could not find program '{program}' on the PATH"
        ) from e
    except CalledProcessError as e:
        if e.returncode == 0:
            raise MissingDependencyError(
                f"Could not find program '{program}' on the PATH"
            ) from e
        log.exception(e)
        raise MissingDependencyError(
            f"Ran program '{program}' but it exited with an error:\n{e.output}"
        ) from e

    output: str = completed.stdout
    found = re.match(regex, output.strip())
    if found is None:
        raise MissingDependencyError(
            f"The program '{program}' did not report its version. "
            f"Message was:\n{output}"
        )
    return found.group(1)
# Message templates for reporting missing or outdated external programs.
# They are filled in with ``str.format`` using the named placeholders.

# Logged when a program is missing and no specific feature was named.
MISSING_PROGRAM = '''
The program '{program}' could not be executed or was not found on your
system PATH.
'''
# Logged when a missing program is needed for particular arguments.
MISSING_OPTIONAL_PROGRAM = '''
The program '{program}' could not be executed or was not found on your
system PATH. This program is required when you use the
{required_for} arguments. You could try omitting these arguments, or install
the package.
'''
# Logged as a warning when a missing program is only recommended.
MISSING_RECOMMEND_PROGRAM = '''
The program '{program}' could not be executed or was not found on your
system PATH. This program is recommended when using the {required_for} arguments,
but not required, so we will proceed. For best results, install the program.
'''
# Logged when the installed version is too old.
OLD_VERSION = '''
OCRmyPDF requires '{program}' {need_version} or higher. Your system appears
to have {found_version}. Please update this program.
'''
# Logged when the installed version is too old for particular arguments.
OLD_VERSION_REQUIRED_FOR = '''
OCRmyPDF requires '{program}' {need_version} or higher when run with the
{required_for} arguments. {program} {found_version} is installed.
If you omit these arguments, OCRmyPDF may be able to
proceed. For best results, update the program.
'''
# Installation advice appended on macOS ('darwin').
OSX_INSTALL_ADVICE = '''
If you have homebrew installed, try these command to install the missing
package:
brew install {package}
'''
# Installation advice appended on Linux.
LINUX_INSTALL_ADVICE = '''
On systems with the aptitude package manager (Debian, Ubuntu), try these
commands:
sudo apt update
sudo apt install {package}
On RPM-based systems (Red Hat, Fedora), try this command:
sudo dnf install {package}
'''
# Installation advice appended on Windows.
WINDOWS_INSTALL_ADVICE = '''
If not already installed, install the Chocolatey package manager. Then use
a command prompt to install the missing package:
choco install {package}
'''
def _get_platform() -> str:
    """Return a simplified name for the platform Python is running on.

    ``sys.platform`` values beginning with ``freebsd``, ``linux`` or ``win``
    are collapsed to ``'freebsd'``, ``'linux'`` and ``'windows'`` respectively.
    Any other value (for example ``'darwin'``) is returned unchanged.
    """
    current = sys.platform
    # Checked in order; the first matching prefix wins.
    known_prefixes = (
        ('freebsd', 'freebsd'),
        ('linux', 'linux'),
        ('win', 'windows'),
    )
    for prefix, canonical in known_prefixes:
        if current.startswith(prefix):
            return canonical
    return current
def _error_trailer(program: str, package: str | Mapping[str, str], **kwargs) -> None:
    """Log platform-specific advice on how to install a missing package.

    Args:
        program: Name of the program; used as the fallback package name when
            ``package`` is a mapping without an entry for this platform.
        package: Either the package name, or a mapping from platform name
            (as returned by ``_get_platform()``) to package name.
        **kwargs: Accepted and discarded, so callers can forward extra fields.

    Nothing is logged on platforms other than darwin, linux and windows.
    """
    del kwargs
    platform_name = _get_platform()
    if isinstance(package, Mapping):
        package = package.get(platform_name, program)

    advice_by_platform = {
        'darwin': OSX_INSTALL_ADVICE,
        'linux': LINUX_INSTALL_ADVICE,
        'windows': WINDOWS_INSTALL_ADVICE,
    }
    advice = advice_by_platform.get(platform_name)
    if advice is not None:
        log.info(advice.format(program=program, package=package))
def _error_missing_program(
    program: str, package: str, required_for: str | None, recommended: bool
) -> None:
    """Log that ``program`` could not be run, followed by install advice.

    A recommended program produces a warning; otherwise an error is logged,
    using the "optional" wording when ``required_for`` is set and the plain
    wording when it is not.
    """
    fields = {
        'program': program,
        'package': package,
        'required_for': required_for,
        'recommended': recommended,
    }
    if recommended:
        emit, template = log.warning, MISSING_RECOMMEND_PROGRAM
    elif required_for:
        emit, template = log.error, MISSING_OPTIONAL_PROGRAM
    else:
        emit, template = log.error, MISSING_PROGRAM
    emit(template.format(**fields))
    _error_trailer(**fields)
def _error_old_version(
    program: str,
    package: str,
    need_version: str,
    found_version: str,
    required_for: str | None,
) -> None:
    """Log that the installed ``program`` is too old, followed by install advice.

    The message names the feature in ``required_for`` when one is given,
    otherwise the generic "requires version X or higher" wording is used.
    """
    fields = {
        'program': program,
        'package': package,
        'need_version': need_version,
        'found_version': found_version,
        'required_for': required_for,
    }
    template = OLD_VERSION_REQUIRED_FOR if required_for else OLD_VERSION
    log.error(template.format(**fields))
    _error_trailer(**fields)
def check_external_program(
    *,
    program: str,
    package: str,
    version_checker: Callable[[], Version],
    need_version: str | Version,
    required_for: str | None = None,
    recommended: bool = False,
    version_parser: type[Version] = Version,
) -> None:
    """Verify that an external program is present and new enough.

    Args:
        program: Name of the program being checked.
        package: Name of the software package that usually provides the
            program; typically identical to ``program``.
        version_checker: Zero-argument callable returning the installed
            version of the program.
        need_version: Minimum acceptable version. Strings are converted with
            ``version_parser``.
        required_for: Name of the argument or feature that needs this program,
            if it is not needed unconditionally.
        recommended: When true, problems are logged but no exception is
            raised, so execution can continue without the program.
        version_parser: Class used to parse ``need_version`` when it is not
            already a ``Version``; for programs with unconventional version
            numbers.

    Raises:
        MissingDependencyError: If the program is missing, cannot be run, or
            is older than ``need_version``, and ``recommended`` is false.
    """
    if isinstance(need_version, Version):
        minimum = need_version
    else:
        minimum = version_parser(need_version)

    try:
        found_version = version_checker()
    except (CalledProcessError, FileNotFoundError, MissingDependencyError) as e:
        _error_missing_program(program, package, required_for, recommended)
        if recommended:
            return
        if isinstance(e, MissingDependencyError):
            # Already the right exception type: propagate it untouched.
            raise
        raise MissingDependencyError(program) from e

    if found_version and found_version < minimum:
        _error_old_version(
            program, package, str(minimum), str(found_version), required_for
        )
        if not recommended:
            raise MissingDependencyError(program)

    log.debug('Found %s %s', program, found_version)
================================================
FILE: src/ocrmypdf/subprocess/_windows.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Find Tesseract and Ghostscript binaries on Windows using the registry."""
from __future__ import annotations
import logging
import os
import re
import shutil
import sys
from collections.abc import Callable, Iterable, Iterator
from itertools import chain
from pathlib import Path
from typing import Any, TypeAlias, TypeVar
from packaging.version import InvalidVersion, Version
if sys.platform == 'win32':
# mypy understands 'if sys.platform' better than try/except ModuleNotFoundError
import winreg # pylint: disable=import-error
HKEYType: TypeAlias = winreg.HKEYType
else:
from unittest.mock import Mock
winreg = Mock(
spec=['HKEYType', 'EnumKey', 'EnumValue', 'HKEY_LOCAL_MACHINE', 'OpenKey']
)
# mypy does not understand winreg.HKeyType where winreg is a Mock (fair enough!)
HKEYType: TypeAlias = Any # type: ignore
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Generic type variables used in the signatures of registry_enum() (T) and
# unique_everseen() (T for elements, Tkey for the deduplication key).
T = TypeVar('T')
Tkey = TypeVar('Tkey')
def ghostscript_version_key(s: str) -> tuple[int, int, int]:
    """Turn a dotted Ghostscript version string into a sortable 3-tuple.

    Missing components are filled with zero, so ``'9.24'`` becomes
    ``(9, 24, 0)``. A string with any non-integer component yields
    ``(0, 0, 0)``.
    """
    pieces = s.split('.', maxsplit=3)
    try:
        numbers = list(map(int, pieces))
    except ValueError:
        return (0, 0, 0)
    # Pad on the right, then keep exactly major/minor/patch.
    major, minor, patch = (numbers + [0, 0, 0])[:3]
    return (major, minor, patch)
def registry_enum(key: HKEYType, enum_fn: Callable[[HKEYType, int], T]) -> Iterator[T]:
    """Yield ``enum_fn(key, 0)``, ``enum_fn(key, 1)``, ... until it raises OSError.

    Args:
        key: Registry key handle passed through to ``enum_fn``.
        enum_fn: Callable taking ``(key, index)`` that raises ``OSError`` once
            ``index`` is past the last item.

    Raises:
        ValueError: If 999 items were produced without ``enum_fn`` signalling
            the end of the enumeration.
    """
    limit = 999
    for index in range(limit):
        try:
            yield enum_fn(key, index)
        except OSError:
            # End of enumeration.
            return
    raise ValueError(f"Too many registry keys under {key}")
def registry_subkeys(key: HKEYType) -> Iterator[str]:
    """Iterate over ``key`` using ``winreg.EnumKey`` as the enumerator."""
    enumerate_subkey = winreg.EnumKey
    return registry_enum(key, enumerate_subkey)
def registry_values(key: HKEYType) -> Iterator[tuple[str, Any, int]]:
    """Iterate over ``key`` using ``winreg.EnumValue`` as the enumerator."""
    enumerate_value = winreg.EnumValue
    return registry_enum(key, enumerate_value)
def registry_path_ghostscript(env=None) -> Iterator[Path]:
    """Yield Ghostscript ``bin`` directories derived from the Windows registry.

    The subkey of ``SOFTWARE\\Artifex\\GPL Ghostscript`` that sorts highest
    under ``ghostscript_version_key`` is opened, and for every value stored
    there ``Path(value) / 'bin'`` is yielded.

    Args:
        env: Ignored; accepted so this function has the same call signature
            as the other entries in ``SHIMS``.

    Registry access failures (``OSError``) are logged as warnings and end the
    iteration instead of propagating. Nothing is yielded when the Ghostscript
    key has no subkeys.
    """
    del env  # unused (but needed for protocol)
    gs_root = r"SOFTWARE\Artifex\GPL Ghostscript"
    try:
        with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, gs_root) as k:
            # max() returns one of the subkey *names*, so the "no subkeys"
            # fallback must not be a version tuple: a tuple would be
            # interpolated into the registry path below as "(0, 0, 0)".
            latest_gs = max(
                registry_subkeys(k), key=ghostscript_version_key, default=None
            )
        if latest_gs is None:
            log.warning("No Ghostscript version found in registry under %s", gs_root)
            return
        with winreg.OpenKey(
            winreg.HKEY_LOCAL_MACHINE, rf"{gs_root}\{latest_gs}"
        ) as k:
            for _, gs_path, _ in registry_values(k):
                yield Path(gs_path) / 'bin'
    except OSError as e:
        log.warning(e)
def registry_path_tesseract(env=None) -> Iterator[Path]:
    """Yield the Tesseract install directory recorded in the Windows registry.

    Every value named ``InstallDir`` under ``SOFTWARE\\Tesseract-OCR`` is
    yielded as a ``Path``. ``env`` is ignored; it exists so the function has
    the same call signature as the other entries in ``SHIMS``. Registry access
    failures (``OSError``) are logged as warnings instead of propagating.
    """
    del env  # unused (but needed for protocol)
    try:
        with winreg.OpenKey(
            winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Tesseract-OCR"
        ) as hkey:
            for value_name, value_data, _value_type in registry_values(hkey):
                if value_name != 'InstallDir':
                    continue
                yield Path(value_data)
    except OSError as err:
        log.warning(err)
def _gs_version_in_path_key(path: Path) -> tuple[str, Version | None]:
    """Build a sort key for Ghostscript and Tesseract install paths.

    Paths that contain a Ghostscript version directory followed by ``bin``
    map to ``('gs', Version(...))``, e.g.

        %PROGRAMFILES%/gs/gs9.56.1/bin  ->  ('gs', Version('9.56.1'))
        %PROGRAMFILES%/gs/9.24/bin      ->  ('gs', Version('9.24'))

    Anything else, such as a Tesseract directory, maps to the final path
    component and ``None``:

        %PROGRAMFILES%/Tesseract-OCR    ->  ('Tesseract-OCR', None)

    Ghostscript entries therefore sort by parsed version, so gs10.0 ranks
    above gs9.99.
    """
    fallback = (path.name, None)
    found = re.search(r'gs[/\\]?([0-9.]+)[/\\]bin', str(path))
    if found is None:
        return fallback
    try:
        return 'gs', Version(found.group(1))
    except InvalidVersion:
        # Looked like a version directory but did not parse as one.
        return fallback
def program_files_paths(env=None) -> Iterator[Path]:
    """Find Tesseract and Ghostscript directories under ``%PROGRAMFILES%``.

    Args:
        env: Environment mapping to read ``PROGRAMFILES`` from. ``os.environ``
            is used when this is ``None`` or empty.

    Returns:
        An iterator over the ``tesseract-ocr`` directory and every ``bin``
        directory below ``gs`` (names matched case-insensitively), sorted in
        descending order of ``_gs_version_in_path_key``. The iterator is empty
        when ``PROGRAMFILES`` is unset, empty, or not an existing directory.
    """
    if not env:
        env = os.environ
    program_files = env.get('PROGRAMFILES', '')
    if not program_files:
        # Path('') means the current directory; never search there, since a
        # stray "gs" or "tesseract-ocr" folder would be treated as installed.
        return iter(())
    root = Path(program_files)
    if not root.is_dir():
        # iterdir() on a missing directory would raise instead of finding nothing.
        return iter(())

    def path_walker() -> Iterator[Path]:
        for path in root.iterdir():
            if not path.is_dir():
                continue
            name = path.name.lower()
            if name == 'tesseract-ocr':
                yield path
            elif name == 'gs':
                yield from (p for p in path.glob('**/bin') if p.is_dir())

    return iter(
        sorted(
            path_walker(),
            key=_gs_version_in_path_key,
            reverse=True,
        )
    )
def paths_from_env(env=None) -> Iterator[Path]:
    """Yield each non-empty entry of ``os.get_exec_path(env)`` as a ``Path``."""
    search_dirs = os.get_exec_path(env)
    return (Path(directory) for directory in search_dirs if directory)
def shim_path(new_paths: Callable[[Any], Iterator[Path]], env=None) -> str:
    """Join the paths produced by ``new_paths(env)`` into a PATH-style string.

    Args:
        new_paths: Callable that receives the environment mapping and returns
            an iterator of paths.
        env: Environment mapping handed to ``new_paths``; ``os.environ`` is
            used when this is ``None`` or empty.

    Returns:
        The truthy paths converted to ``str`` and joined with ``os.pathsep``.
    """
    effective_env = env if env else os.environ
    entries = [
        str(candidate) for candidate in new_paths(effective_env) if candidate
    ]
    return os.pathsep.join(entries)
# Path providers, in the order they are consulted. Each takes an environment
# mapping and returns an iterator of directories. fix_windows_args() stops at
# the first provider whose directories contain the wanted program, while
# shim_env_path() concatenates the output of all of them.
SHIMS = [
    paths_from_env,
    registry_path_ghostscript,
    registry_path_tesseract,
    program_files_paths,
]
def fix_windows_args(program: str, args, env):
    """Rewrite a command line so that it can be launched on Windows.

    Args:
        program: Name of the program being run; only its ``.py`` suffix is
            inspected.
        args: Argument list whose first element is the executable.
        env: Environment mapping handed to each path provider in ``SHIMS``.

    Returns:
        The argument list. If ``program`` ends in ``.py``, a new list prefixed
        with ``sys.executable`` is built; otherwise ``args`` itself is used.
        In both cases element 0 is replaced by the first location that
        ``shutil.which`` finds among the ``SHIMS`` search paths, if any.
    """
    # Python scripts are run with this interpreter (to support test suite shims)
    runs_python_script = program.lower().endswith('.py')
    if runs_python_script:
        args = [sys.executable] + args

    # Lazily probe each provider's directories; stop at the first hit.
    located = (
        shutil.which(args[0], path=shim_path(shim, env)) for shim in SHIMS
    )
    resolved = next((hit for hit in located if hit), None)
    if resolved:
        args[0] = resolved
    return args
def unique_everseen(
    iterable: Iterable[T], key: Callable[[T], Tkey] | None = None
) -> Iterator[T]:
    """Yield the unique elements of ``iterable``, preserving first-seen order.

    Args:
        iterable: Source of elements.
        key: Optional function computing the value used to decide whether an
            element has been seen before. When omitted, the element itself is
            used, so elements must then be hashable.

    Examples:
        unique_everseen('AAAABBBCCDAABBB')    --> A B C D
        unique_everseen('ABBCcAD', str.lower) --> A B C D
    """
    seen: set = set()
    for element in iterable:
        marker = element if key is None else key(element)
        if marker not in seen:
            seen.add(marker)
            yield element
def _casefold_path(path: Path) -> str:
    """Return the case-folded string form of ``path``."""
    as_text = str(path)
    return as_text.casefold()
def shim_env_path(env=None):
    """Build a PATH-style string from every path provider in ``SHIMS``.

    The providers are queried in order with ``env`` (``os.environ`` when
    ``env`` is ``None``). Paths whose case-folded string form has already
    appeared are dropped, and the rest are joined with ``os.pathsep``.
    """
    active_env = os.environ if env is None else env

    def all_candidates():
        for shim in SHIMS:
            yield from shim(active_env)

    distinct = unique_everseen(all_candidates(), key=_casefold_path)
    return os.pathsep.join(map(str, distinct))
================================================
FILE: tests/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Tests."""
from __future__ import annotations
================================================
FILE: tests/cache/manifest.jsonl
================================================
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000003_rasterize.png__stdout", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000003_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000004_rasterize.png__stdout", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000004_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000005_rasterize.png__stdout", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000005_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000001_rasterize.png__stdout", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000001_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/2400dpi.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000006_rasterize.png__stdout", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000006_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__thresholding_method=1__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "-c", "thresholding_method=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__thresholding_method=2__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "-c", "thresholding_method=2", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/graph_ocred.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--oem__1__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "--oem", "1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000001_rasterize.png__stdout", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000001_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000001_rasterize.png__stdout", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000001_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000001_rasterize.png__stdout", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000001_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000001_rasterize.png__stdout", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000001_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000001_ocr.png", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000003_ocr.png", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "/tmp/pytest-of-jb/pytest-73/popen-gw4/test_hocr_to_pdf_api0/000001_ocr.png", "/tmp/pytest-of-jb/pytest-73/popen-gw4/test_hocr_to_pdf_api0/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000002_ocr.png", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/2400dpi.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__thresholding_method=1__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "-c", "thresholding_method=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/graph_ocred.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__thresholding_method=2__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "-c", "thresholding_method=2", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--oem__1__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "--oem", "1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__osd__--psm__0__000002_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000002_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__osd__--psm__0__000003_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000003_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__osd__--psm__0__000004_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000004_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/poster.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__deu__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/francais.pdf", "args": ["-l", "deu", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
================================================
FILE: tests/conftest.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import logging
import platform
import sys
from pathlib import Path
from subprocess import CompletedProcess, run
import pytest
from ocrmypdf import api, pdfinfo
from ocrmypdf._exec import unpaper
from ocrmypdf.api import setup_plugin_infrastructure
from ocrmypdf.cli import get_options_and_plugins
from ocrmypdf.exceptions import ExitCode
class Gs106WarningFilter(logging.Filter):
    """Logging filter that drops the expected Ghostscript 10.6.x JPEG warning."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Return False for the expected Ghostscript warning, True otherwise."""
        expected = "Ghostscript 10.6.x contains JPEG encoding errors"
        message = record.getMessage()
        # Every record is allowed through unless it carries the expected text.
        return expected not in message
@pytest.fixture(autouse=True)
def suppress_gs106_warning():
    """Hide the expected Ghostscript 10.6.x JPEG encoding warning during a test."""
    gs_filter = Gs106WarningFilter()
    root = logging.getLogger()
    # The filter lives on the root logger only while the test body runs.
    root.addFilter(gs_filter)
    yield
    root.removeFilter(gs_filter)
def is_linux():
    """Return True when ``platform.system()`` reports ``'Linux'``."""
    system_name = platform.system()
    return system_name == 'Linux'
def is_macos():
    """Return True when ``platform.system()`` reports ``'Darwin'``."""
    system_name = platform.system()
    return system_name == 'Darwin'
def have_unpaper():
    """Report whether ``unpaper.version()`` can be called without raising."""
    try:
        unpaper.version()
    except Exception:  # pylint: disable=broad-except
        # Any failure at all is treated as "unpaper is not usable".
        available = False
    else:
        available = True
    return available
# Resolved path of the directory that contains this conftest.py.
TESTS_ROOT = Path(__file__).parent.resolve()
# Bound to the same path object as TESTS_ROOT.
PROJECT_ROOT = TESTS_ROOT
@pytest.fixture(scope="session")
def resources() -> Path:
    """Session-scoped path of the ``resources`` directory under TESTS_ROOT."""
    tests_dir = Path(TESTS_ROOT)
    return tests_dir / 'resources'
@pytest.fixture
def ocrmypdf_exec() -> list[str]:
    """Command prefix that runs ocrmypdf as a module under the current interpreter."""
    command = [sys.executable]
    command += ['-m', 'ocrmypdf']
    return command
@pytest.fixture(scope="function")
def outdir(tmp_path) -> Path:
    """Per-test output directory; this is pytest's ``tmp_path`` unchanged."""
    output_directory = tmp_path
    return output_directory
@pytest.fixture(scope="function")
def outpdf(tmp_path) -> Path:
    """Path ``out.pdf`` inside the per-test temporary directory."""
    return tmp_path.joinpath('out.pdf')
@pytest.fixture(scope="function")
def outtxt(tmp_path) -> Path:
    """Path ``out.txt`` inside the per-test temporary directory."""
    return tmp_path.joinpath('out.txt')
@pytest.fixture(scope="function")
def no_outpdf(tmp_path) -> Path:
    """Path for an output file that the test is NOT expected to create.

    The fixture only documents that expectation. It performs no check itself,
    because an assertion failing inside a fixture is reported as an error
    rather than as a test failure. The test must confirm on its own that the
    file was never written.
    """
    return tmp_path.joinpath('no_output.pdf')
@pytest.fixture(scope="session")
def multipage(resources):
    """Session-scoped path of ``multipage.pdf`` in the resources directory."""
    return resources.joinpath('multipage.pdf')
def check_ocrmypdf(input_file: Path, output_file: Path, *args) -> Path:
    """Run the OCR pipeline in-process and assert a plausible PDF was written.

    ``None`` entries in ``args`` are dropped; the rest are stringified and
    placed after the input and output paths. Returns ``output_file``.
    """
    cli_args = [str(input_file), str(output_file)]
    cli_args.extend(str(arg) for arg in args if arg is not None)
    options, plugin_manager = get_options_and_plugins(args=cli_args)
    api.check_options(options, plugin_manager)
    exit_status = api.run_pipeline(options, plugin_manager=plugin_manager)
    assert exit_status == 0
    assert output_file.exists(), "Output file not created"
    assert output_file.stat().st_size > 100, "PDF too small or empty"
    return output_file
def run_ocrmypdf_api(input_file: Path, output_file: Path, *args) -> ExitCode:
    """Run ocrmypdf in-process through its API and return a CLI-style ExitCode.

    This stands in for launching the command line interface in a subprocess.
    Staying in-process makes the run easier to follow with a debugger or a
    coverage tool.

    The result of ``api.run_pipeline_cli`` is returned as-is, so failures are
    expected to arrive as an exit code rather than as an exception. The caller
    must inspect the returned code to decide whether the test passed.
    """
    cli_args = [str(input_file), str(output_file)]
    # None placeholders are skipped; every other argument is stringified.
    cli_args.extend(str(arg) for arg in args if arg is not None)
    options, plugin_manager = get_options_and_plugins(args=cli_args)
    api.check_options(options, plugin_manager)
    exit_code = api.run_pipeline_cli(options, plugin_manager=plugin_manager)
    return exit_code
def run_ocrmypdf(
    input_file: Path, output_file: Path, *args, text: bool = True
) -> CompletedProcess:
    """Launch ocrmypdf in a subprocess and hand the raw result to the test.

    A failure inside ocrmypdf shows up in the captured output and the return
    code of the result, not as an exception object. ``None`` entries in
    ``args`` are dropped; the remaining options come before the input and
    output paths on the command line.
    """
    command = [sys.executable, '-m', 'ocrmypdf']
    command.extend(str(arg) for arg in args if arg is not None)
    command.extend([str(input_file), str(output_file)])
    # check=False: a non-zero exit status is returned to the caller, not raised.
    return run(command, capture_output=True, text=text, check=False)
def first_page_dimensions(pdf: Path):
    """Return ``(width_inches, height_inches)`` of the first page of ``pdf``."""
    first_page = pdfinfo.PdfInfo(pdf)[0]
    return (first_page.width_inches, first_page.height_inches)
def pytest_addoption(parser):
    """Register the ``--runslow`` command line flag, which is off by default."""
    help_text = (
        "run slow tests only useful for development (unlikely to be "
        "useful for downstream packagers)"
    )
    parser.addoption("--runslow", action="store_true", default=False, help=help_text)
def pytest_collection_modifyitems(config, items):
    """Mark items carrying the ``slow`` keyword as skipped unless ``--runslow`` is set."""
    # With --runslow on the command line nothing is skipped.
    if not config.getoption("--runslow"):
        skip_slow = pytest.mark.skip(reason="need --runslow option to run")
        for slow_item in (item for item in items if "slow" in item.keywords):
            slow_item.add_marker(skip_slow)
def get_test_plugin_manager(plugins=None):
    """Return the plugin manager built by ``setup_plugin_infrastructure`` for tests.

    A falsy ``plugins`` value (including ``None``) is replaced by an empty list.
    """
    plugin_list = plugins if plugins else []
    return setup_plugin_infrastructure(plugins=plugin_list)
================================================
FILE: tests/plugins/gs_feature_elision.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
from __future__ import annotations
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
from ocrmypdf.subprocess import run_polling_stderr
# Text that run_append_stderr appends to the captured stderr of the patched
# Ghostscript call.
ELISION_WARNING = """GPL Ghostscript 9.50: Setting Overprint Mode to 1
not permitted in PDF/A-2, overprint mode not set"""
def run_append_stderr(*args, **kwargs):
    """Call ``run_polling_stderr`` and append ELISION_WARNING to its stderr.

    The warning is framed by a newline on each side. The modified result
    object is returned.
    """
    result = run_polling_stderr(*args, **kwargs)
    result.stderr += f'\n{ELISION_WARNING}\n'
    return result
@hookimpl
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
    """Generate PDF/A with Ghostscript while injecting ELISION_WARNING into stderr.

    ``run_polling_stderr`` in ``ocrmypdf._exec.ghostscript`` is patched to go
    through ``run_append_stderr``. The hook asserts the patched function was
    called exactly once and returns ``output_file``.
    """
    gs_kwargs = {
        'pdf_pages': pdf_pages,
        'pdfmark': pdfmark,
        'output_file': output_file,
        'context': context,
        'pdf_version': pdf_version,
        'pdfa_part': pdfa_part,
        'progressbar_class': None,
        'stop_on_soft_error': True,
    }
    with patch(
        'ocrmypdf._exec.ghostscript.run_polling_stderr',
        side_effect=run_append_stderr,
    ) as fake_run:
        ghostscript.generate_pdfa(**gs_kwargs)
        fake_run.assert_called_once()
        return output_file
================================================
FILE: tests/plugins/gs_pdfa_failure.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
from __future__ import annotations
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
from ocrmypdf.subprocess import run_polling_stderr
def run_rig_args(args, **kwargs):
    """Call ``run_polling_stderr`` with the PDF/A-related arguments filtered out.

    Every argument that starts with ``-dPDFA`` or ends with ``.ps`` is
    dropped. The remaining arguments are passed on in their original order,
    so the Ghostscript run no longer requests PDF/A output.
    """

    def keep(arg):
        return not (arg.startswith('-dPDFA') or arg.endswith('.ps'))

    rigged_args = list(filter(keep, args))
    return run_polling_stderr(rigged_args, **kwargs)
@hookimpl
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
    """Generate a PDF with Ghostscript after stripping the PDF/A arguments.

    ``run_polling_stderr`` in ``ocrmypdf._exec.ghostscript`` is patched to go
    through ``run_rig_args``. The hook asserts the patched function was called
    and returns ``output_file``.
    """
    gs_kwargs = {
        'pdf_pages': pdf_pages,
        'pdfmark': pdfmark,
        'output_file': output_file,
        'context': context,
        'pdf_version': pdf_version,
        'pdfa_part': pdfa_part,
        'progressbar_class': None,
        'stop_on_soft_error': True,
    }
    with patch(
        'ocrmypdf._exec.ghostscript.run_polling_stderr',
        side_effect=run_rig_args,
    ) as fake_run:
        ghostscript.generate_pdfa(**gs_kwargs)
        fake_run.assert_called()
        return output_file
================================================
FILE: tests/plugins/gs_raster_failure.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
from __future__ import annotations
from pathlib import Path
from subprocess import CalledProcessError
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
def raise_gs_fail(*args, **kwargs):
    """Ignore all arguments and raise a CalledProcessError for a failed ``gs`` run."""
    failure = CalledProcessError(
        returncode=1,
        cmd='gs',
        output=b"",
        stderr=b"TEST ERROR: gs_raster_failure.py",
    )
    raise failure
@hookimpl
def rasterize_pdf_page(
    input_file,
    output_file,
    raster_device,
    raster_dpi,
    pageno,
    page_dpi,
    rotation,
    filter_vector,
    stop_on_soft_error,
    options,
    use_cropbox,
) -> Path:
    """Rasterize a page with Ghostscript while its ``run`` call is forced to fail.

    ``ocrmypdf._exec.ghostscript.run`` is patched so every call raises through
    ``raise_gs_fail``. All hook arguments are forwarded unchanged to the
    built-in Ghostscript plugin.
    """
    raster_kwargs = {
        'input_file': input_file,
        'output_file': output_file,
        'raster_device': raster_device,
        'raster_dpi': raster_dpi,
        'pageno': pageno,
        'page_dpi': page_dpi,
        'rotation': rotation,
        'filter_vector': filter_vector,
        'stop_on_soft_error': stop_on_soft_error,
        'options': options,
        'use_cropbox': use_cropbox,
    }
    with patch('ocrmypdf._exec.ghostscript.run', side_effect=raise_gs_fail) as fake_run:
        ghostscript.rasterize_pdf_page(**raster_kwargs)
        fake_run.assert_called()
        return output_file
================================================
FILE: tests/plugins/gs_raster_soft_error.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
from __future__ import annotations
from pathlib import Path
from subprocess import CalledProcessError
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
from ocrmypdf.subprocess import run
def fail_if_stoponerror(args, **kwargs):
    """Raise CalledProcessError when ``-dPDFSTOPONERROR`` is among ``args``.

    Without that flag the call is delegated to ``run`` unchanged.
    """
    if '-dPDFSTOPONERROR' not in args:
        return run(args, **kwargs)
    raise CalledProcessError(1, 'gs', output=b"", stderr=b"PDF STOP ON ERROR")
@hookimpl
def rasterize_pdf_page(
    input_file,
    output_file,
    raster_device,
    raster_dpi,
    pageno,
    page_dpi,
    rotation,
    filter_vector,
    stop_on_soft_error,
    options,
    use_cropbox,
) -> Path:
    """Rasterize a page with Ghostscript, failing only when ``-dPDFSTOPONERROR`` is used.

    ``ocrmypdf._exec.ghostscript.run`` is patched to go through
    ``fail_if_stoponerror``. All hook arguments are forwarded unchanged to the
    built-in Ghostscript plugin.
    """
    raster_kwargs = {
        'input_file': input_file,
        'output_file': output_file,
        'raster_device': raster_device,
        'raster_dpi': raster_dpi,
        'pageno': pageno,
        'page_dpi': page_dpi,
        'rotation': rotation,
        'filter_vector': filter_vector,
        'stop_on_soft_error': stop_on_soft_error,
        'options': options,
        'use_cropbox': use_cropbox,
    }
    with patch(
        'ocrmypdf._exec.ghostscript.run', side_effect=fail_if_stoponerror
    ) as fake_run:
        ghostscript.rasterize_pdf_page(**raster_kwargs)
        fake_run.assert_called()
        return output_file
================================================
FILE: tests/plugins/gs_render_failure.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
from __future__ import annotations
from subprocess import CalledProcessError
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
def raise_gs_fail(*args, **kwargs):
    """Ignore all arguments and raise a CalledProcessError for a failed ``gs`` run."""
    failure = CalledProcessError(
        returncode=1,
        cmd='gs',
        output=b"",
        stderr=b"TEST ERROR: gs_render_failure.py",
    )
    raise failure
@hookimpl
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
    """Generate PDF/A with Ghostscript while its subprocess call is forced to fail.

    ``run_polling_stderr`` in ``ocrmypdf._exec.ghostscript`` is patched so
    every call raises through ``raise_gs_fail``.
    """
    gs_kwargs = {
        'pdf_pages': pdf_pages,
        'pdfmark': pdfmark,
        'output_file': output_file,
        'context': context,
        'pdf_version': pdf_version,
        'pdfa_part': pdfa_part,
        'progressbar_class': None,
        'stop_on_soft_error': True,
    }
    with patch(
        'ocrmypdf._exec.ghostscript.run_polling_stderr',
        side_effect=raise_gs_fail,
    ) as fake_run:
        ghostscript.generate_pdfa(**gs_kwargs)
        fake_run.assert_called()
        return output_file
================================================
FILE: tests/plugins/gs_render_soft_error.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
from __future__ import annotations
from subprocess import CalledProcessError
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
from ocrmypdf.subprocess import run_polling_stderr
def fail_if_stoponerror(args, **kwargs):
    """Raise CalledProcessError when ``-dPDFSTOPONERROR`` is among ``args``.

    Without that flag the call is delegated to ``run_polling_stderr`` unchanged.
    """
    if '-dPDFSTOPONERROR' not in args:
        return run_polling_stderr(args, **kwargs)
    raise CalledProcessError(1, 'gs', output=b"", stderr=b"PDF STOP ON ERROR")
@hookimpl
def generate_pdfa(
    pdf_pages,
    pdfmark,
    output_file,
    context,
    pdf_version,
    pdfa_part,
    stop_on_soft_error,
):
    """Generate PDF/A with Ghostscript, failing only when ``-dPDFSTOPONERROR`` is used.

    ``run_polling_stderr`` in ``ocrmypdf._exec.ghostscript`` is patched to go
    through ``fail_if_stoponerror``. The caller's ``stop_on_soft_error`` value
    is forwarded as given.
    """
    gs_kwargs = {
        'pdf_pages': pdf_pages,
        'pdfmark': pdfmark,
        'output_file': output_file,
        'context': context,
        'pdf_version': pdf_version,
        'pdfa_part': pdfa_part,
        'progressbar_class': None,
        'stop_on_soft_error': stop_on_soft_error,
    }
    with patch(
        'ocrmypdf._exec.ghostscript.run_polling_stderr',
        side_effect=fail_if_stoponerror,
    ) as fake_run:
        ghostscript.generate_pdfa(**gs_kwargs)
        fake_run.assert_called()
        return output_file
================================================
FILE: tests/plugins/tesseract_badutf8.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract bad utf8.
In some cases, some versions of Tesseract can output binary gibberish or data
that is not UTF-8 compatible, so we are forced to check that we can convert it
and present it to the user.
"""
from __future__ import annotations
from contextlib import contextmanager
from subprocess import CalledProcessError
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine
def bad_utf8(*args, **kwargs):
    """Ignore all arguments and raise a tesseract failure with non-UTF-8 output."""
    # "Invalid UTF-8" in Shift JIS
    undecodable = b'\x96\xb3\x8c\xf8\x82\xc8UTF-8\x0a'
    raise CalledProcessError(1, 'tesseract', output=undecodable, stderr=b"")
@contextmanager
def patch_tesseract_run():
    """Patch ``ocrmypdf._exec.tesseract.run`` so every call raises via ``bad_utf8``.

    When the with-body finishes normally, the patched function is asserted to
    have been called. That check follows the ``yield`` without a ``finally``,
    so it is skipped whenever the with-body raises.
    """
    with patch('ocrmypdf._exec.tesseract.run', side_effect=bad_utf8) as fake_run:
        yield
        fake_run.assert_called()
class BadUtf8OcrEngine(TesseractOcrEngine):
    """Tesseract engine whose OCR calls run with the subprocess runner patched to fail."""

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run the stock hOCR generation under ``patch_tesseract_run``."""
        hocr_args = (input_file, output_hocr, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_hocr(*hocr_args)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run the stock PDF generation under ``patch_tesseract_run``."""
        pdf_args = (input_file, output_pdf, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_pdf(*pdf_args)
@hookimpl
def get_ocr_engine():
    """Plugin hook: supply a ``BadUtf8OcrEngine`` instance as the OCR engine."""
    engine = BadUtf8OcrEngine()
    return engine
================================================
FILE: tests/plugins/tesseract_big_image_error.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
from __future__ import annotations
from contextlib import contextmanager
from subprocess import CalledProcessError
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine
def raise_size_exception(*args, **kwargs):
    """Ignore all arguments and raise tesseract's "Image too large" failure."""
    too_large = b"Image too large: (33830, 14959)\nError during processing."
    raise CalledProcessError(1, 'tesseract', output=too_large, stderr=b"")
@contextmanager
def patch_tesseract_run():
    """Patch ``ocrmypdf._exec.tesseract.run`` so every call raises via ``raise_size_exception``.

    When the with-body finishes normally, the patched function is asserted to
    have been called. That check follows the ``yield`` without a ``finally``,
    so it is skipped whenever the with-body raises.
    """
    with patch(
        'ocrmypdf._exec.tesseract.run', side_effect=raise_size_exception
    ) as fake_run:
        yield
        fake_run.assert_called()
class BigImageErrorOcrEngine(TesseractOcrEngine):
    """Tesseract engine whose subprocess calls all fail with "Image too large"."""

    @staticmethod
    def get_orientation(input_file, options):
        """Run the stock orientation check under ``patch_tesseract_run``."""
        with patch_tesseract_run():
            orientation = TesseractOcrEngine.get_orientation(input_file, options)
            return orientation

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run the stock hOCR generation under ``patch_tesseract_run``."""
        hocr_args = (input_file, output_hocr, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_hocr(*hocr_args)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run the stock PDF generation under ``patch_tesseract_run``."""
        pdf_args = (input_file, output_pdf, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_pdf(*pdf_args)
@hookimpl
def get_ocr_engine():
    """Plugin hook: supply a ``BigImageErrorOcrEngine`` instance as the OCR engine."""
    engine = BigImageErrorOcrEngine()
    return engine
================================================
FILE: tests/plugins/tesseract_cache.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Cache output of tesseract to speed up test suite.
The cache is keyed by the input test file. The input arguments are slugged
into a hideous filename that more or less represents them literally. Joined
together, this becomes the name of the cache folder. A few named files like
stdout, stderr, hocr, pdf, describe the output to reproduce.
Changes to tests/resources/ or image processing algorithms don't trigger a
cache miss. By design, an input image that varies according to platform
differences (e.g. JPEG decoders are allowed to produce differing outputs,
and in practice they do) will still be a cache hit. By design, an
invocation of tesseract with the same parameters from a different test case
will be a hit. It's fragile.
The tests/cache/manifest.jsonl is a JSON lines file that contains
information about the system that produced the results used when the cache
was generated. This is mainly a log to answer questions about how the files
were produced.
Certain operations are not cached and routed to Tesseract OCR directly.
Assumes Tesseract 4+.
"""
from __future__ import annotations
import argparse
import json
import logging
import platform
import re
import shutil
import threading
from functools import partial
from pathlib import Path
from subprocess import PIPE, CalledProcessError, CompletedProcess
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine
from ocrmypdf.subprocess import run
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Parent of the directory that holds this plugin file.
TESTS_ROOT = Path(__file__).resolve().parent.parent
# Root folder under which the cached Tesseract outputs and manifest are kept.
CACHE_ROOT = TESTS_ROOT / 'cache'
# Parser for the Tesseract command line as cached_run needs to understand it:
# cached_run feeds it the argv (minus the program name) to find the image name,
# the output base and the requested config files.
parser = argparse.ArgumentParser(
    prog='tesseract-cache', description='cache output of tesseract'
)
parser.add_argument('-l', '--language', action='append')
parser.add_argument('imagename')
parser.add_argument('outputbase')
parser.add_argument('configfiles', nargs='*')
parser.add_argument('--user-words', type=str)
parser.add_argument('--user-patterns', type=str)
parser.add_argument('-c', action='append')
parser.add_argument('--psm', type=int)
parser.add_argument('--oem', type=int)
def get_cache_folder(source_pdf, run_args, parsed_args):
    """Return the cache directory for one Tesseract invocation.

    The path is ``CACHE_ROOT / <stem of source_pdf> / <slug>``. The slug is
    built from ``run_args[1:]`` joined with ``'__'``: the image and output
    base arguments are reduced to their file names, ``-c`` and any argument
    starting with ``textonly`` are left out, and every ``/`` that remains is
    replaced by ``'___'``.
    """
    image_arg = parsed_args.imagename
    output_arg = parsed_args.outputbase
    # Leading empty piece so we don't start with a '-' which makes rm difficult.
    pieces = ['']
    for arg in run_args[1:]:
        if arg == image_arg:
            pieces.append(Path(image_arg).name)
        elif arg == output_arg:
            pieces.append(Path(output_arg).name)
        elif arg == '-c' or arg.startswith('textonly'):
            continue
        else:
            pieces.append(arg)
    argv_slug = '__'.join(pieces).replace('/', '___')
    return Path(CACHE_ROOT) / Path(source_pdf).stem / argv_slug
def cached_run(options, run_args, **run_kwargs):
    """Stand-in for the Tesseract subprocess runner that replays cached results.

    Args:
        options: Object whose ``input_file`` attribute names the source file;
            it selects the cache folder and is recorded in the manifest.
        run_args: Full Tesseract argv, program name first. Entries are
            converted to ``str`` before use.
        **run_kwargs: Keyword arguments meant for ``run``. On a cache miss
            ``stdout`` and ``stderr`` are replaced by ``PIPE``.

    Returns:
        On a cache hit, a ``CompletedProcess`` with return code 0 and the
        cached stdout/stderr bytes. Otherwise the result of the real ``run``
        call.

    Raises:
        CalledProcessError: Re-raised after logging when the real run fails.
    """
    run_args = [str(arg) for arg in run_args]  # flatten PosixPaths
    args = parser.parse_args(run_args[1:])
    # Input read from stdin is not cached; go straight to the real runner.
    if args.imagename in ('stdin', '-'):
        return run(run_args, **run_kwargs)
    source_file = options.input_file
    cache_folder = get_cache_folder(source_file, run_args, args)
    cache_folder.mkdir(parents=True, exist_ok=True)
    log.debug(f"Using Tesseract cache {cache_folder}")
    # Determine what configfiles we need
    configfiles = args.configfiles if args.configfiles else ['txt']

    # Check if cache has all required files
    def cache_complete():
        if not (cache_folder / 'stderr.bin').exists():
            return False
        if not (cache_folder / 'stdout.bin').exists():
            return False
        # Output files only exist when Tesseract was not writing to stdout.
        if args.outputbase != 'stdout':
            for configfile in configfiles:
                if not (cache_folder / f'{configfile}.bin').exists():
                    return False
        return True

    if cache_complete():
        log.debug("Cache HIT")
        # Replicate stdout/err
        if args.outputbase != 'stdout':
            for configfile in configfiles:
                # cp cache -> output
                tessfile = args.outputbase + '.' + configfile
                shutil.copy(str(cache_folder / configfile) + '.bin', tessfile)
        return CompletedProcess(
            args=run_args,
            returncode=0,
            stdout=(cache_folder / 'stdout.bin').read_bytes(),
            stderr=(cache_folder / 'stderr.bin').read_bytes(),
        )
    log.debug("Cache MISS")
    # stdout/stderr are always captured with PIPE below, so drop the caller's.
    cache_kwargs = {
        k: v for k, v in run_kwargs.items() if k not in ('stdout', 'stderr')
    }
    # Don't pass timeout=0 to the actual run call - it would timeout immediately
    # A timeout of 0 means "use default/no timeout" in the caching context
    if cache_kwargs.get('timeout') == 0.0:
        cache_kwargs['timeout'] = None
    if 'check' not in cache_kwargs:
        cache_kwargs['check'] = True
    try:
        p = run(run_args, stdout=PIPE, stderr=PIPE, **cache_kwargs)
    except CalledProcessError as e:
        log.exception(e)
        raise  # Pass exception onward
    # Update cache
    (cache_folder / 'stdout.bin').write_bytes(p.stdout)
    (cache_folder / 'stderr.bin').write_bytes(p.stderr)
    if args.outputbase != 'stdout':
        for configfile in configfiles:
            # Only these output kinds are copied into the cache.
            if configfile not in ('fpdf2', 'hocr', 'pdf', 'txt'):
                continue
            # cp pwd/{outputbase}.{configfile} -> {cache}/{configfile}
            tessfile = args.outputbase + '.' + configfile
            shutil.copy(tessfile, str(cache_folder / configfile) + '.bin')

    # Rewrite paths under an "ocrmypdf.io.*" directory as $TMPDIR/<rest> for
    # the manifest entry.
    def clean_sys_argv():
        for arg in run_args[1:]:
            yield re.sub(r'.*/ocrmypdf[.]io[.][^/]+[/](.*)', r'$TMPDIR/\1', arg)

    manifest = {
        'tesseract_version': TesseractOcrEngine.version().replace('\n', ' '),
        'system': platform.system(),
        'python': platform.python_version(),
        'argv_slug': cache_folder.name,
        'sourcefile': str(Path(source_file).relative_to(TESTS_ROOT)),
        'args': list(clean_sys_argv()),
    }
    # Append one JSON line describing how this cache entry was produced.
    with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f:
        json.dump(manifest, f)
        f.write('\n')
        f.flush()
    return p
class CacheOcrEngine(TesseractOcrEngine):
    """Tesseract engine whose subprocess calls are routed through ``cached_run``."""

    # Concurrent threads (with --use-threads) might try to use different parts
    # of the OcrEngine, so a lock protects the state of the patched module
    # whenever it is patched. Should refactor ocrmypdf._exec.tesseract so that
    # it does not need to be patched at all for testing.
    lock = threading.Lock()

    @staticmethod
    def _patched_run(options):
        """Return a patcher that swaps ``ocrmypdf._exec.tesseract.run`` for ``cached_run``."""
        return patch('ocrmypdf._exec.tesseract.run', new=partial(cached_run, options))

    @staticmethod
    def get_orientation(input_file, options):
        """Run the stock orientation check with the cached runner, under the lock."""
        with CacheOcrEngine.lock, CacheOcrEngine._patched_run(options):
            return TesseractOcrEngine.get_orientation(input_file, options)

    @staticmethod
    def get_deskew(input_file, options) -> float:
        """Run the stock deskew detection with the cached runner, under the lock."""
        with CacheOcrEngine.lock, CacheOcrEngine._patched_run(options):
            return TesseractOcrEngine.get_deskew(input_file, options)

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run the stock hOCR generation with the cached runner, under the lock."""
        hocr_args = (input_file, output_hocr, output_text, options)
        with CacheOcrEngine.lock, CacheOcrEngine._patched_run(options):
            TesseractOcrEngine.generate_hocr(*hocr_args)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run the stock PDF generation with the cached runner, under the lock."""
        pdf_args = (input_file, output_pdf, output_text, options)
        with CacheOcrEngine.lock, CacheOcrEngine._patched_run(options):
            TesseractOcrEngine.generate_pdf(*pdf_args)
@hookimpl
def get_ocr_engine():
    """Plugin hook: supply a ``CacheOcrEngine`` instance as the OCR engine."""
    engine = CacheOcrEngine()
    return engine
================================================
FILE: tests/plugins/tesseract_crash.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
from __future__ import annotations
import signal
from contextlib import contextmanager
from subprocess import CalledProcessError
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine
def raise_crash(*args, **kwargs):
    """Ignore all arguments and raise the error of a tesseract killed by SIGABRT."""
    # The exit status is 128 plus the signal number.
    abort_status = 128 + signal.SIGABRT
    crash_message = (
        b"libc++abi.dylib: terminating with uncaught exception of type "
        + b"std::bad_alloc: std::bad_alloc"
    )
    raise CalledProcessError(
        abort_status, 'tesseract', output=b"", stderr=crash_message
    )
@contextmanager
def patch_tesseract_run():
    """Patch ``ocrmypdf._exec.tesseract.run`` so every call raises via ``raise_crash``.

    When the with-body finishes normally, the patched function is asserted to
    have been called. That check follows the ``yield`` without a ``finally``,
    so it is skipped whenever the with-body raises.
    """
    with patch('ocrmypdf._exec.tesseract.run', side_effect=raise_crash) as fake_run:
        yield
        fake_run.assert_called()
class CrashOcrEngine(TesseractOcrEngine):
    """Tesseract engine whose subprocess calls all fail like a crashed process."""

    @staticmethod
    def get_orientation(input_file, options):
        """Run the stock orientation check under ``patch_tesseract_run``."""
        with patch_tesseract_run():
            orientation = TesseractOcrEngine.get_orientation(input_file, options)
            return orientation

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run the stock hOCR generation under ``patch_tesseract_run``."""
        hocr_args = (input_file, output_hocr, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_hocr(*hocr_args)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run the stock PDF generation under ``patch_tesseract_run``."""
        pdf_args = (input_file, output_pdf, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_pdf(*pdf_args)
@hookimpl
def get_ocr_engine():
    """Plugin hook: supply a ``CrashOcrEngine`` instance as the OCR engine."""
    engine = CrashOcrEngine()
    return engine
================================================
FILE: tests/plugins/tesseract_debug_rotate.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op/fixed rotate plugin.
To quickly run tests where getting OCR output is not necessary and we want to test
the rotation pipeline.
In generate_hocr mode, create a .hocr file that specifies no text found.
In 'pdf' mode, convert the image to PDF using another program.
In orientation check mode, report 0, 90, 180, 270... based on page number.
"""
from __future__ import annotations
import pikepdf
from PIL import Image
from ocrmypdf import OcrEngine, OrientationConfidence, hookimpl
from ocrmypdf.helpers import page_number
# Template for the .hocr file written by generate_hocr, which fills it through
# str.format with the image width and height (as strings), in that order.
HOCR_TEMPLATE = '''
'''
class FixedRotateNoopOcrEngine(OcrEngine):
    """OCR engine that performs no OCR and reports a page-number-based rotation."""

    @staticmethod
    def version():
        """Return the fixed version string reported by this engine."""
        return '4.1.1'

    @staticmethod
    def creator_tag(options):
        """Return the creator tag, which depends on ``options.pdf_renderer``."""
        if options.pdf_renderer == 'sandwich':
            tag = '-PDF'
        else:
            tag = '-hOCR'
        return f"NO-OP {tag} {FixedRotateNoopOcrEngine.version()}"

    def __str__(self):
        return f"NO-OP {FixedRotateNoopOcrEngine.version()}"

    @staticmethod
    def languages(options):
        """Return the set of supported languages: always ``{'eng'}``."""
        return {'eng'}

    @staticmethod
    def get_orientation(input_file, options):
        """Report 0, 90, 180, 270, ... degrees for pages 1, 2, 3, 4, ..."""
        page_index = page_number(input_file) - 1
        return OrientationConfidence(angle=(page_index * 90) % 360, confidence=99.9)

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Write HOCR_TEMPLATE filled with the image size, and an empty text file."""
        with Image.open(input_file) as im:
            with open(output_hocr, 'w', encoding='utf-8') as hocr:
                width, height = im.size
                hocr.write(HOCR_TEMPLATE.format(str(width), str(height)))
        with open(output_text, 'w') as text:
            text.write('')

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Write a one-page blank PDF sized like the image, and an empty text file."""
        with Image.open(input_file) as im:
            dpi = im.info['dpi']
            # Pixels divided by DPI gives inches.
            width_in = im.size[0] / dpi[0]
            height_in = im.size[1] / dpi[1]
        # 72 PDF points per inch.
        ptsize = width_in * 72, height_in * 72
        pdf = pikepdf.new()
        pdf.add_blank_page(page_size=ptsize)
        pdf.save(output_pdf, static_id=True)
        output_text.write_text('')
@hookimpl
def get_ocr_engine():
    """Plugin hook: supply a ``FixedRotateNoopOcrEngine`` instance as the OCR engine."""
    engine = FixedRotateNoopOcrEngine()
    return engine
================================================
FILE: tests/plugins/tesseract_noop.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op plugin.
To quickly run tests where getting OCR output is not necessary.
In generate_hocr mode, create a .hocr file that specifies no text found.
In 'pdf' mode, convert the image to PDF using another program.
In orientation check mode, report the orientation is upright.
"""
from __future__ import annotations
import pikepdf
from PIL import Image
from ocrmypdf import OcrEngine, OrientationConfidence, hookimpl
# Template for the .hocr file written by generate_hocr, which fills it through
# str.format with the image width and height (as strings), in that order.
HOCR_TEMPLATE = '''
'''
class NoopOcrEngine(OcrEngine):
    """OCR engine that performs no OCR and reports every page as upright."""

    @staticmethod
    def version():
        """Return the fixed version string reported by this engine."""
        return '4.1.1'

    @staticmethod
    def creator_tag(options):
        """Return the creator tag, which depends on ``options.pdf_renderer``."""
        if options.pdf_renderer == 'sandwich':
            tag = '-PDF'
        else:
            tag = '-hOCR'
        return f"NO-OP {tag} {NoopOcrEngine.version()}"

    def __str__(self):
        return f"NO-OP {NoopOcrEngine.version()}"

    @staticmethod
    def languages(options):
        """Return the set of supported languages: always ``{'eng'}``."""
        return {'eng'}

    @staticmethod
    def get_orientation(input_file, options):
        """Report an angle of 0 with a confidence of 0.0."""
        upright = OrientationConfidence(angle=0, confidence=0.0)
        return upright

    @staticmethod
    def get_deskew(input_file, options):
        """Report a deskew angle of 0.0."""
        return 0.0

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Write HOCR_TEMPLATE filled with the image size, and an empty text file."""
        with Image.open(input_file) as im:
            with open(output_hocr, 'w', encoding='utf-8') as hocr:
                width, height = im.size
                hocr.write(HOCR_TEMPLATE.format(str(width), str(height)))
        with open(output_text, 'w') as text:
            text.write('')

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Write a one-page blank PDF sized like the image, and an empty text file."""
        with Image.open(input_file) as im:
            dpi = im.info['dpi']
            # Pixels divided by DPI gives inches.
            width_in = im.size[0] / dpi[0]
            height_in = im.size[1] / dpi[1]
        # 72 PDF points per inch.
        ptsize = width_in * 72, height_in * 72
        pdf = pikepdf.new()
        pdf.add_blank_page(page_size=ptsize)
        pdf.save(output_pdf, static_id=True)
        output_text.write_text('')
@hookimpl
def get_ocr_engine():
    """Plugin hook: supply a ``NoopOcrEngine`` instance as the OCR engine."""
    engine = NoopOcrEngine()
    return engine
================================================
FILE: tests/plugins/tesseract_simulate_oom_killer.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op plugin that simulates the OOM killer on page 4.
OCRmyPDF can use a lot of memory, so much that it might trigger the
OOM killer on Linux or similar features on other platforms. We want to
ensure we fail with an error rather than deadlock in such cases.
Page 4 was chosen because of this number's association with bad luck
in many East Asian cultures.
"""
# type: ignore
from __future__ import annotations
import os
import signal
from pathlib import Path
from ocrmypdf import hookimpl
# type: ignore
# Ugly hack that lets us use the NoopOcrEngine without setting up packaging for
# our tests.
# This hack also requires us to set type: ignore
# The sibling file tesseract_noop.py is read, compiled and executed here, and
# NoopOcrEngine is then looked up by name in the resulting namespace.
parent_file = Path(__file__).with_name('tesseract_noop.py')
parent = compile(parent_file.read_text(), parent_file, mode='exec')
exec(parent)
NoopOcrEngine = locals()['NoopOcrEngine']
class Page4Engine(NoopOcrEngine):  # type: ignore
    """No-op engine that kills its own process when asked to handle page 4."""

    def __str__(self):
        return f"NO-OP Page 4 {NoopOcrEngine.version()}"

    @staticmethod
    def generate_hocr(input_file: Path, output_hocr, output_text, options):
        """Delegate to NoopOcrEngine, except for files whose stem starts with ``000004``."""
        if not input_file.stem.startswith('000004'):
            return NoopOcrEngine.generate_hocr(
                input_file, output_hocr, output_text, options
            )
        # Page 4: send SIGKILL to our own process.
        os.kill(os.getpid(), signal.SIGKILL)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Delegate to NoopOcrEngine, except for files whose stem starts with ``000004``."""
        if not input_file.stem.startswith('000004'):
            return NoopOcrEngine.generate_pdf(
                input_file, output_pdf, output_text, options
            )
        # Page 4: send SIGKILL to our own process.
        os.kill(os.getpid(), signal.SIGKILL)
@hookimpl
def check_options(options):
    """Plugin hook: reject configurations where ``options.use_threads`` is set."""
    threads_requested = options.use_threads
    if threads_requested:
        raise ValueError("I'm not compatible with use_threads")
@hookimpl
def get_ocr_engine():
    """Plugin hook: supply a ``Page4Engine`` instance as the OCR engine."""
    engine = Page4Engine()
    return engine
================================================
FILE: tests/resources/README.rst
================================================
.. SPDX-FileCopyrightText: 2022 James R. Barlow
.. SPDX-License-Identifier: CC-BY-SA-4.0
These test files are used in OCRmyPDF's test suite. They do not necessarily produce OCR results
at all and are not necessarily meant as examples of OCR output. Some are even invalid PDFs that might
crash certain PDF viewers.
Some of these images were obtained from the public domain. Others are copyrighted and may have
licenses associated. Refer to ``.reuse/dep5`` file in OCRmyPDF's Git repository for information on
the copyright holder(s) and license(s) applicable to these resources.
.. list-table::
:widths: 15 35 50
:header-rows: 1
* - File
- Source
- Purpose
* - c02-22.pdf
- `Project Gutenberg`_, Adventures of Huckleberry Finn, page 22
- difficult OCR image (obscure fonts and illustrations)
* - graph.pdf
- `Wikimedia: Simple_line_graph_of_ACE_2012_results_by_candidate_sj01.png`_
- image with slanted text
* - lichtenstein.pdf
- `Wikimedia: JPEG2000 Lichtenstein`_
- JPEG2000 image
* - linn.png, linn.pdf, linn.txt
- `Wikimedia: LinnSequencer`_
- image with two columns
* - typewriter.png, 2400dpi.pdf
- `Wikimedia: Triumph typewrtier text Linzensoep`_
- simple text
* - baiona.png
- `Wikimedia: Baionako udalerri mugakideak`_
- multilingual text and images
* - aspect.pdf
- synthetic
- test image with 200 x 100 DPI resolution
* - blank.pdf
- synthetic
- blank PDF generated by Adobe Illustrator CC 17, containing a lot of application-specific metadata/bloat
* - cmyk.pdf
- synthetic
- a CMYK image created in Photoshop
* - crom.png
- synthetic
- test for non-dictionary words
* - enormous.pdf
- synthetic
- very large PDF page
* - epson.pdf
- synthetic
- a linearized PDF containing some unusual indirect objects, created by an Epson printer; printout of a Wikipedia article (CC-BY-SA)
* - formxobject.pdf
- synthetic
- hand-crafted PDF containing an image inside a Form XObject
* - francais.pdf
- synthetic
- a page containing French accents (diacritics)
* - hugemono.pdf
- synthetic
- large monochrome 35000x35000 image in JBIG2 encoding
* - invalid.pdf
- synthetic
- a PDF file header followed by EOF marker
* - kcs.pdf
- synthetic
- PDF file generated by Kodak Capture Desktop Software 1.2; has invalid table of contents
* - livecycle.pdf
- synthetic
- a minimal PDF that claims to use dynamic XFA forms
* - masks.pdf
- synthetic
- file containing explicit masks and a stencil mask drawn without a proper transformation matrix; printout of a German Wikipedia article (CC-BY-SA)
* - missing_docinfo.pdf
- synthetic
- PDF file with no /DocumentInfo section
* - overlay.pdf
- synthetic
- PDF file generated by PDFPen pro that triggered content stream parse errors
* - negzero.pdf
- synthetic
- copy of formxobject.pdf with token that qpdf doesn't like
* - no_contents.pdf
- synthetic
- synthetic PDF with a blank page that has no /Contents entry
* - truetype_font_nomapping.pdf
- synthetic
- example of a PDF with an embedded subsetted TrueType font with no Unicode mapping
* - trivial.pdf
- synthetic
- smallest possible valid PDF-1.3 with all required fields
* - type3_font_nomapping.pdf
- synthetic
- example of a PDF with an embedded subsetted TrueType font with no Unicode mapping
* - vector.pdf
- synthetic
- a PDF with vector art and text rendered as curves with no fonts
Assemblies
==========
These test resources are assemblies or derivatives from other previously mentioned files, released under the same license terms as their input files.
- baiona_gray.png (from baiona.png, grayscale version)
- baiona_colormapped.png (from baiona.png, palette version)
- baiona_alpha.png (from baiona.png, RGB+A version)
- cardinal.pdf (four cardinal directions, baked-in rotated copies of linn.png)
- ccitt.pdf (linn.png, converted to CCITT encoding)
- graph_ocred.pdf (from graph.pdf)
- jbig2.pdf (from linn.png)
- multipage.pdf (from several other files)
- palette.pdf (from baiona_colormapped.png)
- poster.pdf (from linn.png)
- rotated_skew.pdf (a /Rotate'd and skewed document from linn.png)
- skew.pdf (from linn.png, skew simulated by adjusting the transformation matrix)
- toc.pdf (from formxobject.pdf, trivial.pdf)
.. _`Wikimedia: LinnSequencer`: https://upload.wikimedia.org/wikipedia/en/b/b7/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg
.. _`Project Gutenberg`: https://www.gutenberg.org/files/76/76-h/76-h.htm#c2
.. _`Wikimedia: Simple_line_graph_of_ACE_2012_results_by_candidate_sj01.png`: https://en.wikipedia.org/wiki/File:Simple_line_graph_of_ACE_2012_results_by_candidate_sj01.png
.. _`Wikimedia: JPEG2000 Lichtenstein`: https://en.wikipedia.org/wiki/JPEG_2000#/media/File:Jpeg2000_2-level_wavelet_transform-lichtenstein.png
.. _`Linux (Wikipedia Article)`: https://de.wikipedia.org/wiki/Linux
.. _`Wikimedia: Triumph typewrtier text Linzensoep`: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif
.. _`Wikimedia: Baionako udalerri mugakideak`: https://commons.wikimedia.org/wiki/File:Baionako_udalerri_mugakideak.png
================================================
FILE: tests/resources/arabic.hocr
================================================
مرحبا
بالعالم
هذا
نص
عربي
سلام
فارسی
================================================
FILE: tests/resources/cjk.hocr
================================================
你好
世界
繁體
中文
こんにちは
世界
안녕하세요
세계
================================================
FILE: tests/resources/devanagari.hocr
================================================
नमस्ते
दुनिया
यह
हिंदी
पाठ
है
संस्कृत
भाषा
================================================
FILE: tests/resources/hello_world_scripts.hocr
================================================
Multilingual Hello World Script Test
Hello!
¡Hola!
Bonjour!
Grüß Gott!
Привет!
Γειά σου!
你好!
こんにちは!
안녕하세요!
Merhaba!
नमस्ते!
!مرحبا
שלום
Olá!
Ciao!
Cześć!
您好!
Здравствуй!
Χαίρετε!
!أهلاً
================================================
FILE: tests/resources/latin.hocr
================================================
The
quick
brown
fox
jumps
Café
résumé
naïve
Größe
Zürich
Ärger
================================================
FILE: tests/resources/linn.txt
================================================
The LinnSequencer
32 Track MIDI Sequence Recorder
The LinnSequencer is a state—of—the-art composition and performance tool for the professional musician. It is
extremely powerful, yet amazingly simple to learn and use. It’s many remarkable features include:
0 Operation is similar to multi-track tape recorder with PLAY, STOP, RECORD, FAST
FORWARD, REWIND, and LOCATE controls.
0 Each of the 100 sequences contains 32 simultaneous, polyphonic tracks. Each track may
be assigned to one of 16 MIDI channels. Simultaneously plays up to 16 polyphonic
synthesizers !
0 Ultra-fast 3 1/2 ” disk drive stores complex songs in seconds and holds over 110,000 notes
per disk!
0 One or all tracks may be TRANSPOSED at the touch of a key.
0 Exclusive real—time ERASE function makes editing FAST.
0 Exclusive REPEAT function automatically repeats any held notes at a pre-selected
rhythmic value.
0 TIMING CORRECTION works during playback and operates without ‘chopping’ notes.
0 Optional SMPTE time code synchronization.
0 Optional remote control.
Recording a Sequence
To record a sequence, simply press RECORD and PLAY,
then play your MIDI keyboard in time to the Sequencer’s
click track. When the sequence loops back around to bar 1,
you’ll hear what you played—only all timing errors will be
corrected! (Timing correction may be adjusted 0r defeated).
Any additional notes played will be added into the track
—existing notes are not erased while recording!
FAST FORWARD, REWIND, and LOCATE controls
may be used at any time to quickly access any location in
your sequence for spot-recording. To overdub a new part,
select a different track and start recording—while you
record, the first‘track will play in perfect sync (unless you
MUTE it, or SOLO another track). In this way, up to 32
tracks may be overdubbed! All MIDI effects are recorded
including pitch bend, modulation, velocity, aftertouch,
sustain pedal, and program changes!
Editing
To erase a wrong note, simply hold ERASE and press
the note to be erased just before it plays in the sequence-—
when played back, it will be gone. Notes may also be
added, erased, or changed using the SINGLE STEP func-
tion. To overdub notes at specific points within a sequence,
Additional Features
simply use LOCATE, FAST FORWARD, or REWIND to
find the desired bar number, then start recording.
The INSERT/ COPY function allows you to move bars
from one location to another—in the same sequence or a
different one. For example, you might insert a copy of the
first verse between the second chorus and the bridge.
DELETE BARS operates the same way to remove
unwanted sections.
Creating a Song
One way to create a song is to record each track all the
way through (up to 999 bars). Another way is to record
each basic section (verse, chorus, etc.) in individual
sequences, then use the CREATE SONG function to “chain”
them together. CREATE SONG will then automatically
copy all the parts into a new sequence. If desired, you can
even set the last few bars to repeat infinitely, for a fadeout.
Composition Without Compromise
The technology you use should never be so complex that
it interferes with the creative process. That’s precisely why
the LinnSequencer is designed to let you compose, record
and edit while devoting your undivided attention to your
music. See your Linn dealer today for a demonstration!
0 Simple, easy to learn operation—the 32 character LCD display clearly guides you through all operations. If needed, the
HELP button displays additional explanations.
0 Non-destructive recording—existing notes are not erased while recording.
0 Two FOOTSWIT CH INPUTS may be assigned to remotely control many of the commonly used functions, including
ERASE, REPEAT, PLAY/ STOP, or LOCATE.
0 Two TRIGGER OUTPUTS may be programmed to output pulses at any selected note value.
0 Will sync to standard LinnDrum or Linn 9000 sync tone.
0 Utilizes ultra high—speed, 8 MHZ 80186 16 bit computer internally for FAST operation.
0 TEMPO may be specified in BEATS-PER—MINUTE or FRAMES-PER—BEAT at 24, 25, or 30 frames per second,
(even drop frame!)
0 TEMPO may be entered numerically, adjustable in tenths of a Beat-Per-Minute increments, or by tapping quarter notes
on the TAP TEMPO button.
0 TEMPO CHANGES may be programmed into a sequence, with smooth transitions if desired.
0 Any TIME SIGNATURE may be used, and may be changed within a song.
EDI]
Linn Electronics, Inc.
18720 Oxnard Street, Tarzana, CA 91356
(818) 708-8131 TELEX #298949 LINN UR
================================================
FILE: tests/resources/multilingual.hocr
================================================
English
Text
Here
مرحبا
بك
================================================
FILE: tests/test_acroform.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import logging
import pikepdf
import pytest
import ocrmypdf
from .conftest import check_ocrmypdf
# pylint: disable=redefined-outer-name
@pytest.fixture
def acroform(resources):
    """Path of the ``acroform.pdf`` file inside the resources directory."""
    pdf_path = resources / 'acroform.pdf'
    return pdf_path
def test_acroform_and_redo(acroform, no_outpdf):
    """``--redo-ocr`` on acroform.pdf must raise InputFileError with a hint."""
    expected_error = ocrmypdf.exceptions.InputFileError
    expected_message = r'.*--redo-ocr.*is not currently possible.*'
    with pytest.raises(expected_error, match=expected_message):
        check_ocrmypdf(acroform, no_outpdf, '--redo-ocr')
def test_acroform_message(acroform, caplog, outpdf):
    """Processing acroform.pdf logs text about fillable forms and --force-ocr."""
    caplog.set_level(logging.INFO)
    plugin_args = ('--plugin', 'tests/plugins/tesseract_noop.py')
    check_ocrmypdf(acroform, outpdf, *plugin_args)
    for expected_fragment in ('fillable form', '--force-ocr'):
        assert expected_fragment in caplog.text
@pytest.fixture
def digitally_signed(acroform, outdir):
    """Yield a copy of acroform.pdf saved with AcroForm ``SigFlags`` set to 3."""
    signed_path = outdir / 'acroform_signed.pdf'
    with pikepdf.open(acroform) as source_pdf:
        source_pdf.Root.AcroForm.SigFlags = 3
        source_pdf.save(signed_path)
    yield signed_path
def test_digital_signature(digitally_signed, no_outpdf):
    """Processing the signed fixture file raises DigitalSignatureError."""
    expected_error = ocrmypdf.exceptions.DigitalSignatureError
    with pytest.raises(expected_error):
        check_ocrmypdf(digitally_signed, no_outpdf)
def test_digital_signature_invalidate(digitally_signed, no_outpdf):
    """The signed fixture file is processed when invalidation is requested."""
    extra_args = ('--force-ocr', '--invalidate-digital-signatures')
    check_ocrmypdf(digitally_signed, no_outpdf, *extra_args)
================================================
FILE: tests/test_annots.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
from pikepdf import Array, Dictionary, Name, NameTree, Pdf
from ocrmypdf._annots import remove_broken_goto_annotations
def test_remove_broken_goto_annotations(resources):
    """Check which GoTo annotations lose their /D entry.

    link.pdf is first expected to be left untouched. After a /Dests name
    tree is added and the two page annotations are pointed at a name that
    is absent from the tree ('Missing') and one that is present ('Valid'),
    only the former is expected to lose its /D key.
    """
    with Pdf.open(resources / 'link.pdf') as pdf:
        untouched = remove_broken_goto_annotations(pdf)
        assert not untouched, "File should not be modified"

        # Build the /Dests name tree and hang it off /Root /Names
        dest_tree = NameTree.new(pdf)
        names = pdf.Root[Name.Names] = pdf.make_indirect(Dictionary())
        names[Name.Dests] = dest_tree.obj

        # A named destination that exists but is broken
        dest_tree['Invalid'] = pdf.make_indirect(Dictionary())
        # A named destination that is well formed
        dest_tree['Valid'] = Array([pdf.pages[0].obj, Name.XYZ, 0, 0, 0])

        first_page, second_page = pdf.pages[0], pdf.pages[1]
        first_page.Annots[0].A.D = 'Missing'
        second_page.Annots[0].A.D = 'Valid'

        modified = remove_broken_goto_annotations(pdf)
        assert modified, "File should be modified"
        assert Name.D not in pdf.pages[0].Annots[0].A
        assert Name.D in pdf.pages[1].Annots[0].A
================================================
FILE: tests/test_api.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import pickle
from io import BytesIO
from pathlib import Path
import pytest
from pdfminer.high_level import extract_text
import ocrmypdf
import ocrmypdf._pipelines
import ocrmypdf.api
def test_language_list():
    """ocr() with a language list on a nonexistent input raises a known error."""
    acceptable_errors = (
        ocrmypdf.exceptions.InputFileError,
        ocrmypdf.exceptions.MissingDependencyError,
    )
    with pytest.raises(acceptable_errors):
        ocrmypdf.ocr('doesnotexist.pdf', '_.pdf', language=['eng', 'deu'])
def test_language_parameter_mapped_to_languages():
    """Check the API 'language' parameter ends up in OcrOptions 'languages'.

    Regression test for GitHub issue #1640: the Python API ignored the language
    parameter, always defaulting to 'eng'.
    """
    from ocrmypdf._options import OcrOptions
    from ocrmypdf.api import create_options, setup_plugin_infrastructure
    from ocrmypdf.cli import get_parser

    setup_plugin_infrastructure()
    parser = get_parser()

    # (value passed as ``language``, expected ``options.languages``)
    cases = [
        # single-item list
        (['tam'], ['tam']),
        # list of multiple languages
        (['fra', 'deu'], ['fra', 'deu']),
        # bare string (single language)
        ('tam', ['tam']),
        # '+'-separated string is split like CLI --language
        ('eng+spa', ['eng', 'spa']),
        # '+'-separated entry within a list is also split
        (['eng+spa'], ['eng', 'spa']),
    ]
    for language, expected_languages in cases:
        options = create_options(
            input_file='test.pdf',
            output_file='output.pdf',
            parser=parser,
            language=language,
        )
        assert options.languages == expected_languages
def test_stream_api(resources: Path):
    """Run ocr() with binary streams for both input and output.

    The input file is opened in a ``with`` block so its handle is closed
    once ocr() returns instead of being left open until garbage collection.
    The test passes when the first 1024 bytes of the output contain ``%PDF``.
    """
    out = BytesIO()
    with (resources / 'graph.pdf').open('rb') as in_:
        ocrmypdf.ocr(in_, out, tesseract_timeout=0.0)
    out.seek(0)
    assert b'%PDF' in out.read(1024)
def test_sidecar_stringio(resources: Path, outdir: Path, outpdf: Path):
    """ocr() writes sidecar text into an in-memory binary stream."""
    sidecar_stream = BytesIO()
    source_pdf = resources / 'ccitt.pdf'
    ocrmypdf.ocr(
        source_pdf,
        outpdf,
        plugins=['tests/plugins/tesseract_cache.py'],
        sidecar=sidecar_stream,
    )
    sidecar_stream.seek(0)
    sidecar_bytes = sidecar_stream.getvalue()
    assert b'the' in sidecar_bytes
def test_hocr_api_multipage(resources: Path, outdir: Path, outpdf: Path):
    """_pdf_to_hocr on multipage.pdf, then _hocr_to_ocr_pdf on the result.

    With ``skip_text=True`` hOCR files are expected for pages 1 and 6 but
    not for page 4.
    """
    ocrmypdf.api._pdf_to_hocr(
        resources / 'multipage.pdf',
        outdir,
        language='eng',
        skip_text=True,
        plugins=['tests/plugins/tesseract_cache.py'],
    )
    expected_present = ['000001_ocr_hocr.hocr', '000006_ocr_hocr.hocr']
    for hocr_name in expected_present:
        assert (outdir / hocr_name).exists()
    skipped_page = outdir / '000004_ocr_hocr.hocr'
    assert not skipped_page.exists()

    ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf)
    assert outpdf.exists()
def test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path):
    """Edits made to the intermediate hOCR file show up in the final PDF text."""
    ocrmypdf.api._pdf_to_hocr(
        resources / 'ccitt.pdf',
        outdir,
        language='eng',
        skip_text=True,
        plugins=['tests/plugins/tesseract_cache.py'],
    )
    hocr_file = outdir / '000001_ocr_hocr.hocr'
    assert hocr_file.exists()

    # Replace every 'the' with 'hocr' so the edit is detectable afterwards
    original_markup = hocr_file.read_text(encoding='utf-8')
    hocr_file.write_text(original_markup.replace('the', 'hocr'), encoding='utf-8')

    ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf, optimize=0)

    text = extract_text(outpdf)
    assert 'hocr' in text
    assert 'the' not in text
def test_hocr_result_json():
    """HOCRResult serializes to the expected JSON text and round-trips."""
    result_cls = ocrmypdf._pipelines._common.HOCRResult
    result = result_cls(
        pageno=1,
        pdf_page_from_image=Path('a'),
        hocr=Path('b'),
        textpdf=Path('c'),
        orientation_correction=180,
    )
    expected_json = (
        '{"pageno": 1, "pdf_page_from_image": {"Path": "a"}, "hocr": {"Path": "b"}, '
        '"textpdf": {"Path": "c"}, "orientation_correction": 180, "ocr_tree": null}'
    )
    assert result.to_json() == expected_json
    assert ocrmypdf._pipelines._common.HOCRResult.from_json(result.to_json()) == result
def test_hocr_result_pickle():
    """HOCRResult compares equal to itself after a pickle round-trip."""
    result = ocrmypdf._pipelines._common.HOCRResult(
        pageno=1,
        pdf_page_from_image=Path('a'),
        hocr=Path('b'),
        textpdf=Path('c'),
        orientation_correction=180,
    )
    serialized = pickle.dumps(result)
    restored = pickle.loads(serialized)
    assert result == restored
def test_nested_plugin_option_access():
    """Plugin options are reachable both flat and through nested namespaces."""
    from ocrmypdf._options import OcrOptions
    from ocrmypdf.api import setup_plugin_infrastructure

    # Plugin models must be registered before options are created
    setup_plugin_infrastructure()

    options = OcrOptions(
        input_file='test.pdf',
        output_file='output.pdf',
        tesseract_timeout=120.0,
        tesseract_oem=1,
        optimize=2,
    )

    # Flat attribute access
    assert options.tesseract_timeout == 120.0
    assert options.tesseract_oem == 1
    assert options.optimize == 2

    # Nested access: tesseract namespace
    tesseract_ns = options.tesseract
    assert tesseract_ns is not None
    assert tesseract_ns.timeout == 120.0
    assert tesseract_ns.oem == 1

    # Nested access: ghostscript namespace
    ghostscript_ns = options.ghostscript
    assert ghostscript_ns is not None
    assert ghostscript_ns.color_conversion_strategy == "LeaveColorUnchanged"

    # A second lookup must hand back the very same object
    assert options.tesseract is tesseract_ns
def test_default_tesseract_timeout():
    """OcrOptions without an explicit tesseract_timeout uses the plugin default.

    Regression test for GitHub issue #1636: when using the Python API without
    specifying tesseract_timeout, the default was 0.0 which caused Tesseract
    to immediately time out and produce no OCR output.
    """
    from ocrmypdf._options import OcrOptions
    from ocrmypdf.api import setup_plugin_infrastructure

    setup_plugin_infrastructure()

    options = OcrOptions(input_file='test.pdf', output_file='output.pdf')

    # The flat option stays unset ...
    assert options.tesseract_timeout is None
    # ... while the nested namespace reports the 180 second default
    nested_timeout = options.tesseract.timeout
    assert nested_timeout == 180.0
================================================
FILE: tests/test_check_pdf.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
from ocrmypdf.helpers import check_pdf
def test_pdf_error(resources):
    """check_pdf is truthy for blank.pdf and falsy for this Python source file."""
    real_pdf = resources / 'blank.pdf'
    assert check_pdf(real_pdf)

    not_a_pdf = __file__
    assert not check_pdf(not_a_pdf)
================================================
FILE: tests/test_completion.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import os
from subprocess import run
import pytest
from ocrmypdf.helpers import running_in_docker
# Module-wide marker: skip every test in this file when running_in_docker()
# reports true.
pytestmark = pytest.mark.skipif(running_in_docker(), reason="docker can't complete")
def test_fish():
    """``fish -n`` on the fish completion script must print nothing to stderr."""
    command = ['fish', '-n', 'misc/completion/ocrmypdf.fish']
    try:
        proc = run(command, check=True, encoding='utf-8', capture_output=True)
    except FileNotFoundError:
        pytest.xfail('fish is not installed')
    else:
        assert proc.stderr == '', proc.stderr
@pytest.mark.skipif(
    os.name == 'nt', reason="Windows CI workers have bash but are best left alone"
)
def test_bash():
    """``bash -n`` on the bash completion script must print nothing to stderr."""
    command = ['bash', '-n', 'misc/completion/ocrmypdf.bash']
    try:
        proc = run(command, check=True, encoding='utf-8', capture_output=True)
    except FileNotFoundError:
        pytest.xfail('bash is not installed')
    else:
        assert proc.stderr == '', proc.stderr
================================================
FILE: tests/test_concurrency.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import os
import platform
import pytest
from ocrmypdf import ExitCode
from .conftest import run_ocrmypdf_api
@pytest.mark.skipif(os.name == 'nt', reason="Windows doesn't have SIGKILL")
@pytest.mark.skipif(
    # python_version_tuple() returns strings; comparing those against
    # ('3', '12') is lexicographic (e.g. '9' >= '12'), so compare the
    # major/minor parts as integers instead.
    tuple(int(part) for part in platform.python_version_tuple()[:2]) >= (3, 12),
    reason="can deadlock due to fork",
)
def test_simulate_oom_killer(multipage, no_outpdf):
    """A worker killed mid-run must yield ExitCode.child_process_error.

    Runs with threads disabled and the tesseract_simulate_oom_killer plugin
    loaded. Skipped on Windows and on Python 3.12 or newer.
    """
    exitcode = run_ocrmypdf_api(
        multipage,
        no_outpdf,
        '--force-ocr',
        '--no-use-threads',
        '--plugin',
        'tests/plugins/tesseract_simulate_oom_killer.py',
    )
    assert exitcode == ExitCode.child_process_error
================================================
FILE: tests/test_fpdf_renderer.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Tests for fpdf2-based PDF renderer."""
from __future__ import annotations
from pathlib import Path
import pytest
from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import (
DebugRenderOptions,
Fpdf2MultiPageRenderer,
Fpdf2PdfRenderer,
)
from ocrmypdf.hocrtransform.hocr_parser import HocrParser
from ocrmypdf.models.ocr_element import OcrClass
@pytest.fixture
def font_dir():
    """Path of ``src/ocrmypdf/data`` relative to this file's grandparent directory."""
    repo_root = Path(__file__).parent.parent
    return repo_root / "src" / "ocrmypdf" / "data"
@pytest.fixture
def multi_font_manager(font_dir):
    """A MultiFontManager constructed from the ``font_dir`` fixture."""
    manager = MultiFontManager(font_dir)
    return manager
@pytest.fixture
def resources():
    """Path of the ``resources`` directory next to this test file."""
    tests_dir = Path(__file__).parent
    return tests_dir / "resources"
class TestFpdf2RendererImports:
    """Smoke test: the fpdf_renderer package exposes its public names."""

    def test_imports(self):
        """The three public names import and are not None."""
        from ocrmypdf.fpdf_renderer import (
            DebugRenderOptions,
            Fpdf2MultiPageRenderer,
            Fpdf2PdfRenderer,
        )

        imported = (DebugRenderOptions, Fpdf2PdfRenderer, Fpdf2MultiPageRenderer)
        for exported_name in imported:
            assert exported_name is not None
class TestDebugRenderOptions:
    """Checks on the DebugRenderOptions flags."""

    def test_defaults(self):
        """All three render flags are False when nothing is passed."""
        opts = DebugRenderOptions()
        flags = (opts.render_baseline, opts.render_line_bbox, opts.render_word_bbox)
        for flag in flags:
            assert flag is False

    def test_custom_values(self):
        """Flags passed as True are stored as True."""
        opts = DebugRenderOptions(
            render_baseline=True,
            render_line_bbox=True,
            render_word_bbox=True,
        )
        flags = (opts.render_baseline, opts.render_line_bbox, opts.render_word_bbox)
        for flag in flags:
            assert flag is True
class TestFpdf2PdfRenderer:
    """Tests for the single-page Fpdf2PdfRenderer."""

    @staticmethod
    def _single_word_page(text, right):
        """Build a 612x792 page holding one line with one word.

        The word and its line share the bounding box
        (left=100, top=100, right=``right``, bottom=130).
        """
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        word = OcrElement(
            ocr_class=OcrClass.WORD,
            text=text,
            bbox=BoundingBox(left=100, top=100, right=right, bottom=130),
        )
        line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(left=100, top=100, right=right, bottom=130),
            children=[word],
        )
        return OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=612, bottom=792),
            children=[line],
        )

    def test_requires_page_element(self, multi_font_manager):
        """A root element that is not an ocr_page is rejected with ValueError."""
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        # A bare word, deliberately not wrapped in a page
        not_a_page = OcrElement(
            ocr_class=OcrClass.WORD,
            text="test",
            bbox=BoundingBox(left=0, top=0, right=100, bottom=20),
        )
        with pytest.raises(ValueError, match="Root element must be ocr_page"):
            Fpdf2PdfRenderer(
                page=not_a_page,
                dpi=300,
                multi_font_manager=multi_font_manager,
            )

    def test_requires_bbox(self, multi_font_manager):
        """A page element without a bounding box is rejected with ValueError."""
        from ocrmypdf.models.ocr_element import OcrElement

        bare_page = OcrElement(ocr_class=OcrClass.PAGE)
        with pytest.raises(ValueError, match="Page must have bounding box"):
            Fpdf2PdfRenderer(
                page=bare_page,
                dpi=300,
                multi_font_manager=multi_font_manager,
            )

    def test_render_simple_page(self, multi_font_manager, tmp_path):
        """Rendering a one-word page with visible text writes a non-empty file."""
        page = self._single_word_page("Hello", right=200)
        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=72,  # 1:1 mapping to PDF points
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        output_path = tmp_path / "test_simple.pdf"
        renderer.render(output_path)

        assert output_path.exists()
        assert output_path.stat().st_size > 0

    def test_render_invisible_text(self, multi_font_manager, tmp_path):
        """Rendering a one-word page as invisible text writes a non-empty file."""
        page = self._single_word_page("Invisible", right=250)
        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=72,
            multi_font_manager=multi_font_manager,
            invisible_text=True,  # This is the default
        )
        output_path = tmp_path / "test_invisible.pdf"
        renderer.render(output_path)

        assert output_path.exists()
        assert output_path.stat().st_size > 0
class TestFpdf2MultiPageRenderer:
    """Test Fpdf2MultiPageRenderer."""

    def test_requires_pages(self, multi_font_manager, tmp_path):
        """An empty ``pages_data`` list must raise ValueError.

        The output path comes from pytest's ``tmp_path`` rather than a
        hard-coded ``/tmp`` location, so the test is portable and cannot
        touch a file shared with other runs.
        """
        with pytest.raises(ValueError, match="No pages to render"):
            renderer = Fpdf2MultiPageRenderer(
                pages_data=[],
                multi_font_manager=multi_font_manager,
            )
            renderer.render(tmp_path / "test.pdf")

    def test_render_multiple_pages(self, multi_font_manager, tmp_path):
        """Rendering three one-word pages writes a non-empty file."""
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        pages_data = []
        for i in range(3):
            word = OcrElement(
                ocr_class=OcrClass.WORD,
                text=f"Page{i+1}",
                bbox=BoundingBox(left=100, top=100, right=200, bottom=130),
            )
            line = OcrElement(
                ocr_class=OcrClass.LINE,
                bbox=BoundingBox(left=100, top=100, right=200, bottom=130),
                children=[word],
            )
            page = OcrElement(
                ocr_class=OcrClass.PAGE,
                bbox=BoundingBox(left=0, top=0, right=612, bottom=792),
                children=[line],
            )
            # Each entry is (page number, page element, dpi)
            pages_data.append((i + 1, page, 72))

        renderer = Fpdf2MultiPageRenderer(
            pages_data=pages_data,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        output_path = tmp_path / "test_multipage.pdf"
        renderer.render(output_path)

        assert output_path.exists()
        assert output_path.stat().st_size > 0
class TestFpdf2RendererWithHocr:
    """Render the hOCR files from the resources directory with fpdf2."""

    @staticmethod
    def _parse_or_skip(resources, filename):
        """Parse ``resources / filename``; skip the test when the file is absent."""
        hocr_path = resources / filename
        if not hocr_path.exists():
            pytest.skip(f"{filename} not found")
        return HocrParser(hocr_path).parse()

    @staticmethod
    def _assert_written(output_path):
        """Assert the rendered file exists and is not empty."""
        assert output_path.exists()
        assert output_path.stat().st_size > 0

    def _render_visible(self, page, multi_font_manager, output_path):
        """Render ``page`` at 300 DPI with visible text and check the output."""
        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=300,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        renderer.render(output_path)
        self._assert_written(output_path)

    def test_render_latin_hocr(self, resources, multi_font_manager, tmp_path):
        """Test rendering Latin text from hOCR."""
        page = self._parse_or_skip(resources, "latin.hocr")

        # The parser must hand back a page that has a bounding box
        assert page.ocr_class == OcrClass.PAGE
        assert page.bbox is not None

        self._render_visible(page, multi_font_manager, tmp_path / "latin_fpdf2.pdf")

    def test_render_cjk_hocr(self, resources, multi_font_manager, tmp_path):
        """Test rendering CJK text from hOCR."""
        page = self._parse_or_skip(resources, "cjk.hocr")
        self._render_visible(page, multi_font_manager, tmp_path / "cjk_fpdf2.pdf")

    def test_render_arabic_hocr(self, resources, multi_font_manager, tmp_path):
        """Test rendering Arabic text from hOCR."""
        page = self._parse_or_skip(resources, "arabic.hocr")
        self._render_visible(page, multi_font_manager, tmp_path / "arabic_fpdf2.pdf")

    def test_render_hello_world_scripts_hocr(
        self, resources, multi_font_manager, tmp_path
    ):
        """Test rendering comprehensive multilingual 'Hello!' hOCR file.

        This tests all major scripts including:
        - Latin (English, Spanish, French, German, Italian, Polish, Portuguese, Turkish)
        - Cyrillic (Russian)
        - Greek
        - CJK (Chinese Simplified, Chinese Traditional, Japanese, Korean)
        - Devanagari (Hindi)
        - Arabic (RTL)
        - Hebrew (RTL)

        Also includes rotated baselines to exercise skew handling.
        """
        page = self._parse_or_skip(resources, "hello_world_scripts.hocr")

        # The page must have parsed with the expected geometry
        assert page.ocr_class == OcrClass.PAGE
        assert page.bbox is not None
        # Should have 2550x3300 at 300 DPI
        assert page.bbox.right == 2550
        assert page.bbox.bottom == 3300

        # Visible text so the output can be inspected by eye
        self._render_visible(
            page, multi_font_manager, tmp_path / "hello_world_scripts_fpdf2.pdf"
        )

    def test_render_hello_world_scripts_multipage(
        self, resources, multi_font_manager, tmp_path
    ):
        """Test rendering hello_world_scripts.hocr using MultiPageRenderer.

        Uses Fpdf2MultiPageRenderer to render the multilingual test file,
        demonstrating font handling across all major writing systems.
        """
        page = self._parse_or_skip(resources, "hello_world_scripts.hocr")

        # One entry per page: (page_number, page_element, dpi)
        pages_data = [(1, page, 300)]
        renderer = Fpdf2MultiPageRenderer(
            pages_data=pages_data,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        output_path = tmp_path / "hello_world_scripts_multipage.pdf"
        renderer.render(output_path)
        self._assert_written(output_path)
class TestWordSegmentation:
"""Test that rendered PDFs have proper word segmentation for pdfminer.six."""
def test_word_segmentation_with_pdfminer(self, multi_font_manager, tmp_path):
    """Test that pdfminer.six can extract words with proper spacing.

    Three words on a single line are rendered and the text is read back
    with pdfminer.six; the words must come out individually rather than
    run together.
    """
    from pdfminer.high_level import extract_text

    from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

    # (text, left edge, right edge) of each word on the shared line
    word_specs = [("Hello", 100, 200), ("World", 220, 320), ("Test", 340, 420)]
    words = [
        OcrElement(
            ocr_class=OcrClass.WORD,
            text=text,
            bbox=BoundingBox(left=left, top=100, right=right, bottom=130),
        )
        for text, left, right in word_specs
    ]
    line = OcrElement(
        ocr_class=OcrClass.LINE,
        bbox=BoundingBox(left=100, top=100, right=420, bottom=130),
        children=words,
    )
    page = OcrElement(
        ocr_class=OcrClass.PAGE,
        bbox=BoundingBox(left=0, top=0, right=612, bottom=792),
        children=[line],
    )

    renderer = Fpdf2PdfRenderer(
        page=page,
        dpi=72,  # 1:1 mapping to PDF points
        multi_font_manager=multi_font_manager,
        invisible_text=False,
    )
    output_path = tmp_path / "test_word_segmentation.pdf"
    renderer.render(output_path)

    # Read the text back with pdfminer.six
    extracted_text = extract_text(str(output_path))

    # Every word is present somewhere in the extracted text
    for expected_word in ("Hello", "World", "Test"):
        assert expected_word in extracted_text

    # Adjacent words must not be glued together
    for glued in ("HelloWorld", "WorldTest"):
        assert glued not in extracted_text

    # Splitting on whitespace yields each word as its own token
    words_found = extracted_text.split()
    for expected_word in ("Hello", "World", "Test"):
        assert expected_word in words_found
def test_cjk_no_spurious_spaces(self, multi_font_manager, tmp_path):
"""Test that CJK text does not get spurious spaces inserted.
CJK scripts don't use spaces between characters/words, so we should
not insert spaces between adjacent CJK words.
"""
from pdfminer.high_level import extract_text
from ocrmypdf.models.ocr_element import BoundingBox, OcrElement
# Create a page with CJK words (Chinese characters)
# 你好 = "Hello" in Chinese
# 世界 = "World" in Chinese
word1 = OcrElement(
ocr_class=OcrClass.WORD,
text="你好",
bbox=BoundingBox(left=100, top=100, right=160, bottom=130),
)
word2 = OcrElement(
ocr_class=OcrClass.WORD,
text="世界",
bbox=BoundingBox(left=170, top=100, right=230, bottom=130),
)
line = OcrElement(
ocr_class=OcrClass.LINE,
bbox=BoundingBox(left=100, top=100, right=230, bottom=130),
children=[word1, word2],
)
page = OcrElement(
ocr_class=OcrClass.PAGE,
bbox=BoundingBox(left=0, top=0, right=612, bottom=792),
children=[line],
)
renderer = Fpdf2PdfRenderer(
page=page,
dpi=72,
multi_font_manager=multi_font_manager,
invisible_text=False,
)
output_path = tmp_path / "test_cjk_segmentation.pdf"
renderer.render(output_path)
# Extract text using pdfminer.six
extracted_text = extract_text(str(output_path))
# CJK text should be present
assert "你好" in extracted_text
assert "世界" in extracted_text
# There should NOT be spaces between CJK characters
# (but pdfminer may add some whitespace, so we check the raw chars)
extracted_chars = extracted_text.replace(" ", "").replace("\n", "")
assert "你好世界" in extracted_chars or (
"你好" in extracted_chars and "世界" in extracted_chars
)
def test_latin_hocr_word_segmentation(
self, resources, multi_font_manager, tmp_path
):
"""Test word segmentation with real Latin hOCR file."""
from pdfminer.high_level import extract_text
hocr_path = resources / "latin.hocr"
if not hocr_path.exists():
pytest.skip("latin.hocr not found")
parser = HocrParser(hocr_path)
page = parser.parse()
renderer = Fpdf2PdfRenderer(
page=page,
dpi=300,
multi_font_manager=multi_font_manager,
invisible_text=False,
)
output_path = tmp_path / "latin_segmentation.pdf"
renderer.render(output_path)
# Extract text using pdfminer.six
extracted_text = extract_text(str(output_path))
# The Latin text should have proper word segmentation
# Words should be separable
words = extracted_text.split()
assert len(words) > 0
# Check that common English words are properly segmented
# (not stuck together)
text_no_newlines = extracted_text.replace("\n", " ")
# There should be spaces in the extracted text
assert " " in text_no_newlines
================================================
FILE: tests/test_ghostscript.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import logging
import secrets
import subprocess
import sys
from decimal import Decimal
from unittest.mock import patch
import pikepdf
import pytest
from packaging.version import Version
from PIL import Image, UnidentifiedImageError
from ocrmypdf._exec import ghostscript
from ocrmypdf._exec.ghostscript import DuplicateFilter, rasterize_pdf
from ocrmypdf.builtin_plugins.ghostscript import _repair_gs106_jpeg_corruption
from ocrmypdf.exceptions import ColorConversionNeededError, ExitCode, InputFileError
from ocrmypdf.helpers import Resolution
from ocrmypdf.pluginspec import GhostscriptRasterDevice
from .conftest import check_ocrmypdf, run_ocrmypdf_api
# pylint: disable=redefined-outer-name
@pytest.fixture
def francais(resources):
    """Provide the francais.pdf sample as ``(path, open pikepdf.Pdf)``.

    The Pdf is opened for the duration of the requesting test and closed
    afterwards, so the file handle is not leaked between tests.

    Yields:
        A ``(path, pdf)`` tuple: the path to francais.pdf and the opened Pdf.
    """
    path = resources / 'francais.pdf'
    with pikepdf.open(path) as pdf:
        yield path, pdf
def test_rasterize_size(francais, outdir):
    """Rasterize at a DPI chosen to hit an exact pixel size and verify it.

    The raster DPI is derived from the page size so the output should be
    50x30 pixels, while the DPI recorded in the PNG is forced separately.
    """
    path, pdf = francais
    mediabox = pdf.pages[0].mediabox
    width_pts, height_pts = mediabox[2], mediabox[3]
    # The size arithmetic below relies on the page origin being (0, 0).
    assert mediabox[0] == mediabox[1] == 0

    width_in = width_pts / Decimal(72)
    height_in = height_pts / Decimal(72)
    target_size = Decimal('50.0'), Decimal('30.0')
    forced_dpi = Resolution(42.0, 4242.0)
    out_png = outdir / 'out.png'

    rasterize_pdf(
        path,
        out_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=Resolution(target_size[0] / width_in, target_size[1] / height_in),
        page_dpi=forced_dpi,
    )

    with Image.open(out_png) as im:
        assert im.size == target_size
        assert im.info['dpi'] == forced_dpi
def test_rasterize_rotated(francais, outdir, caplog):
    """Rasterize with a 90 degree rotation and expect swapped axes.

    Both the pixel dimensions and the DPI recorded in the image should come
    back with their X and Y components exchanged.
    """
    path, pdf = francais
    mediabox = pdf.pages[0].mediabox
    width_pts, height_pts = mediabox[2], mediabox[3]
    # The size arithmetic below relies on the page origin being (0, 0).
    assert mediabox[0] == mediabox[1] == 0

    width_in = width_pts / Decimal(72)
    height_in = height_pts / Decimal(72)
    target_size = Decimal('50.0'), Decimal('30.0')
    forced_dpi = Resolution(42.0, 4242.0)
    out_png = outdir / 'out.png'

    caplog.set_level(logging.DEBUG)
    rasterize_pdf(
        path,
        out_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=Resolution(target_size[0] / width_in, target_size[1] / height_in),
        page_dpi=forced_dpi,
        rotation=90,
    )

    with Image.open(out_png) as im:
        swapped_size = (target_size[1], target_size[0])
        assert im.size == swapped_size
        assert im.info['dpi'] == forced_dpi.flip_axis()
def test_rasterize_low_dpi(francais, outdir):
    """Check that DPI values below 10 on both axes still give the right size.

    Ghostscript may fail with DPI values below 10. The workaround renders at
    a minimum of 10 DPI and resizes the output to the expected dimensions.
    """
    path, pdf = francais
    mediabox = pdf.pages[0].mediabox
    width_pts, height_pts = mediabox[2], mediabox[3]
    # The size arithmetic below relies on the page origin being (0, 0).
    assert mediabox[0] == mediabox[1] == 0

    width_in = float(width_pts) / 72
    height_in = float(height_pts) / 72

    # A tiny target forces the requested DPI below 10 on both axes.
    target_size = (5, 3)
    forced_dpi = Resolution(72.0, 72.0)
    out_png = outdir / 'out_low_dpi.png'

    rasterize_pdf(
        path,
        out_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=Resolution(target_size[0] / width_in, target_size[1] / height_in),
        page_dpi=forced_dpi,
    )

    with Image.open(out_png) as im:
        assert im.size == target_size
        assert im.info['dpi'] == forced_dpi
def test_rasterize_low_dpi_one_axis(francais, outdir):
    """Check output size when only the X axis DPI falls below 10."""
    path, pdf = francais
    mediabox = pdf.pages[0].mediabox
    width_pts, height_pts = mediabox[2], mediabox[3]
    # The size arithmetic below relies on the page origin being (0, 0).
    assert mediabox[0] == mediabox[1] == 0

    width_in = float(width_pts) / 72
    height_in = float(height_pts) / 72

    # Narrow but tall target: low DPI on X only, ordinary DPI on Y.
    target_size = (5, 50)
    forced_dpi = Resolution(72.0, 72.0)
    out_png = outdir / 'out_low_dpi_x.png'

    rasterize_pdf(
        path,
        out_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=Resolution(target_size[0] / width_in, target_size[1] / height_in),
        page_dpi=forced_dpi,
    )

    with Image.open(out_png) as im:
        assert im.size == target_size
        assert im.info['dpi'] == forced_dpi
def test_gs_render_failure(resources, outpdf, caplog):
    """A failing Ghostscript render plugin must surface as a child process error."""
    plugin_args = [
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_render_failure.py',
    ]
    # '--output-type pdfa' is required to trigger Ghostscript PDF/A generation.
    exitcode = run_ocrmypdf_api(
        resources / 'blank.pdf', outpdf, '--output-type', 'pdfa', *plugin_args
    )
    assert 'TEST ERROR: gs_render_failure.py' in caplog.text
    assert exitcode == ExitCode.child_process_error
def test_gs_raster_failure(resources, outpdf, caplog):
    """A failing Ghostscript raster plugin must surface as a child process error."""
    plugin_args = [
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_raster_failure.py',
    ]
    exitcode = run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *plugin_args)
    assert 'TEST ERROR: gs_raster_failure.py' in caplog.text
    assert exitcode == ExitCode.child_process_error
def test_ghostscript_pdfa_failure(resources, outpdf, caplog):
    """A failing PDF/A plugin must yield the pdfa_conversion_failed exit code."""
    plugin_args = [
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_pdfa_failure.py',
    ]
    # '--output-type pdfa' is required to trigger Ghostscript PDF/A generation.
    exitcode = run_ocrmypdf_api(
        resources / 'francais.pdf', outpdf, '--output-type', 'pdfa', *plugin_args
    )
    expected = ExitCode.pdfa_conversion_failed
    assert exitcode == expected, "Unexpected return when PDF/A fails"
def test_ghostscript_feature_elision(resources, outpdf):
    """Run the pipeline with the gs_feature_elision test plugin loaded."""
    plugin_args = [
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_feature_elision.py',
    ]
    check_ocrmypdf(resources / 'francais.pdf', outpdf, *plugin_args)
def test_ghostscript_mandatory_color_conversion(resources, outpdf):
    """PDF/A output of jbig2_baddevicen.pdf must raise ColorConversionNeededError."""
    source = resources / 'jbig2_baddevicen.pdf'
    # '--output-type pdfa' is required to trigger Ghostscript PDF/A generation.
    cli_args = [
        '--output-type',
        'pdfa',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    with pytest.raises(ColorConversionNeededError):
        check_ocrmypdf(source, outpdf, *cli_args)
def test_rasterize_pdf_errors(resources, no_outpdf, caplog):
    """An empty output from a 'successful' Ghostscript run must be reported.

    The subprocess runner is replaced by a fake that reports return code 0
    with an error on stderr and produces no file.
    """
    # Ghostscript can produce empty files with return code 0.
    fake_result = subprocess.CompletedProcess(
        ['fakegs'], returncode=0, stdout=b'', stderr=b'error this is an error'
    )
    with patch('ocrmypdf._exec.ghostscript.run', return_value=fake_result):
        with pytest.raises(UnidentifiedImageError):
            rasterize_pdf(
                resources / 'francais.pdf',
                no_outpdf,
                raster_device=GhostscriptRasterDevice.PNGMONO,
                raster_dpi=Resolution(100, 100),
            )
        logged = caplog.text
        assert "this is an error" in logged
        assert "invalid page image file" in logged
class TestDuplicateFilter:
    """Tests for DuplicateFilter's collapsing of repeated log messages."""

    @pytest.fixture(scope='function')
    def duplicate_filter_logger(self):
        """Return a DEBUG-level logger with a DuplicateFilter attached."""
        # A random suffix gives every test its own logger, keeping tests isolated.
        unique_name = __name__ + secrets.token_urlsafe(8)
        log = logging.getLogger(unique_name)
        log.setLevel(logging.DEBUG)
        log.addFilter(DuplicateFilter(log))
        return log

    @pytest.mark.xfail(
        (3, 13, 3) <= sys.version_info[:3] <= (3, 13, 5),
        reason="https://github.com/python/cpython/pull/135858",
    )
    def test_filter_duplicate_messages(self, duplicate_filter_logger, caplog):
        """Consecutive repeats are replaced by a single suppression summary."""
        emitted = (
            ("test error message", 3),
            ("another error message", 2),
            ("yet another error message", 1),
        )
        for message, repeats in emitted:
            for _ in range(repeats):
                duplicate_filter_logger.error(message)

        assert [record.msg for record in caplog.records] == [
            "test error message",
            "(suppressed 2 repeated lines)",
            "another error message",
            "(suppressed 1 repeated lines)",
            "yet another error message",
        ]

    def test_filter_does_not_affect_unique_messages(
        self, duplicate_filter_logger, caplog
    ):
        """Messages that never repeat pass through unchanged."""
        unique_messages = [
            "test error message",
            "another error message",
            "yet another error message",
        ]
        for message in unique_messages:
            duplicate_filter_logger.error(message)

        assert [record.msg for record in caplog.records] == unique_messages

    @pytest.mark.xfail(
        (3, 13, 3) <= sys.version_info[:3] <= (3, 13, 5),
        reason="https://github.com/python/cpython/pull/135858",
    )
    def test_filter_alt_messages(self, duplicate_filter_logger, caplog):
        """Two alternating messages are also counted as repeats."""
        first = "test error message"
        second = "another error message"
        sequence = [first, second, first, second, first, first, second]
        sequence.append("yet another error message")
        for message in sequence:
            duplicate_filter_logger.error(message)

        assert [record.msg for record in caplog.records] == [
            "test error message",
            "another error message",
            "(suppressed 5 repeated lines)",
            "yet another error message",
        ]
@pytest.fixture
def pdf_with_invalid_image(outdir):
    """Write a one-page PDF whose only image XObject is deliberately invalid.

    The image sets both ``ColorSpace`` and ``ImageMask`` (see issue 1451).
    The file is written only inside ``outdir``, so nothing is left behind
    in the current working directory.

    Returns:
        Path to ``invalid_image.pdf`` inside ``outdir``.
    """
    Name = pikepdf.Name
    target = outdir / 'invalid_image.pdf'
    with pikepdf.new() as pdf:
        pdf.add_blank_page()
        pdf.pages[0].Contents = pdf.make_stream(b'612 0 0 612 0 0 cm /Image Do')
        # Create an invalid image object that has both ColorSpace and ImageMask set
        pdf.pages[0].Resources = pikepdf.Dictionary(
            XObject=pdf.make_indirect(
                pikepdf.Dictionary(
                    Image=pdf.make_stream(
                        b"\xf0\x0f" * 8,
                        ColorSpace=Name.DeviceGray,
                        BitsPerComponent=1,
                        Width=8,
                        Height=8,
                        ImageMask=True,
                        Subtype=Name.Image,
                        Type=Name.XObject,
                    )
                )
            )
        )
        pdf.save(target)
    return target
@pytest.mark.xfail(
    ghostscript.version() < Version('10.04.0'),
    reason="Older Ghostscript behavior is different",
)
def test_recoverable_image_error(pdf_with_invalid_image, outdir, caplog):
    """With stop_on_error=False the invalid image is logged and rendering goes on."""
    source = outdir / 'invalid_image.pdf'
    destination = outdir / 'out.png'
    rasterize_pdf(
        source,
        destination,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=Resolution(10, 10),
        stop_on_error=False,
    )
    # Ghostscript is expected to print the error but continue.
    assert 'Image has both ImageMask and ColorSpace' in caplog.text
@pytest.mark.xfail(
    ghostscript.version() < Version('10.04.0'),
    reason="Older Ghostscript behavior is different",
)
def test_recoverable_image_error_with_stop(pdf_with_invalid_image, outdir, caplog):
    """With stop_on_error=True the invalid image must raise InputFileError.

    Ghostscript prints an error and exits but can still produce a viable
    image of the page minus the broken image; that case is intercepted and
    reported as InputFileError.
    """
    source = outdir / 'invalid_image.pdf'
    destination = outdir / 'out.png'
    expected_hint = "Try using --continue-on-soft-render-error"
    with pytest.raises(InputFileError, match=expected_hint):
        rasterize_pdf(
            source,
            destination,
            raster_device=GhostscriptRasterDevice.PNGMONO,
            raster_dpi=Resolution(100, 100),
            stop_on_error=True,
        )
    # out.png is deliberately not inspected: per the original author's note
    # it is not created, and would be blank if it were.
class TestGs106JpegCorruptionRepair:
    """Test the Ghostscript 10.6 JPEG corruption repair function."""

    @staticmethod
    def _iter_dct_images(pdf):
        """Yield each DCTDecode image XObject found in the pages' resources.

        Only XObjects listed directly in a page's /Resources are examined.
        """
        Name = pikepdf.Name
        for page in pdf.pages:
            if Name.Resources not in page:
                continue
            resources_dict = page[Name.Resources]
            if Name.XObject not in resources_dict:
                continue
            xobjects = resources_dict[Name.XObject]
            for key in xobjects.keys():
                candidate = xobjects[key]
                is_image = candidate.get(Name.Subtype) == Name.Image
                if is_image and candidate.get(Name.Filter) == Name.DCTDecode:
                    yield candidate

    @pytest.fixture
    def create_damaged_pdf(self, resources, outdir):
        """Return a factory that writes a copy of a PDF with truncated JPEGs."""

        def _create_damaged(source_pdf_name='francais.pdf', truncate_bytes=2):
            source_path = resources / source_pdf_name
            damaged_path = outdir / 'damaged.pdf'
            damaged_count = 0
            with pikepdf.open(source_path) as pdf:
                for image in self._iter_dct_images(pdf):
                    # Drop the last bytes of the raw JPEG data, keeping the filter.
                    shortened = image.read_raw_bytes()[:-truncate_bytes]
                    image.write(shortened, filter=pikepdf.Name.DCTDecode)
                    damaged_count += 1
                pdf.save(damaged_path)
            return source_path, damaged_path, damaged_count

        return _create_damaged

    def test_repair_truncated_jpeg(self, create_damaged_pdf, caplog):
        """Test that truncated JPEG images are repaired."""
        caplog.set_level(logging.DEBUG)
        source_path, damaged_path, damaged_count = create_damaged_pdf()
        assert damaged_count > 0, "Test PDF should have DCTDecode images"

        # Raw JPEG bytes from the untouched source, for comparison.
        with pikepdf.open(source_path) as pdf:
            original_bytes_list = [
                image.read_raw_bytes() for image in self._iter_dct_images(pdf)
            ]

        repaired = _repair_gs106_jpeg_corruption(source_path, damaged_path)
        assert repaired is True, "Repair should have been performed"

        # The repair rewrites damaged_path in place; read it back.
        with pikepdf.open(damaged_path) as pdf:
            repaired_bytes_list = [
                image.read_raw_bytes() for image in self._iter_dct_images(pdf)
            ]

        assert len(repaired_bytes_list) == len(original_bytes_list)
        pairs = zip(original_bytes_list, repaired_bytes_list, strict=False)
        for orig, repaired_bytes in pairs:
            assert orig == repaired_bytes, "Repaired bytes should match original"

        # The repair is expected to announce itself in the log.
        assert "JPEG corruption detected" in caplog.text

    def test_no_repair_when_not_truncated(self, resources, outdir, caplog):
        """Test that no repair is done when images are not truncated."""
        caplog.set_level(logging.DEBUG)
        source_path = resources / 'francais.pdf'
        output_path = outdir / 'undamaged.pdf'

        # Re-save the source unchanged so there is nothing to repair.
        with pikepdf.open(source_path) as pdf:
            pdf.save(output_path)

        repaired = _repair_gs106_jpeg_corruption(source_path, output_path)
        assert repaired is False, "No repair should have been performed"
        assert "JPEG corruption detected" not in caplog.text

    def test_no_repair_when_truncation_too_large(self, create_damaged_pdf, caplog):
        """Test that images truncated by more than 15 bytes are not repaired."""
        caplog.set_level(logging.DEBUG)
        source_path, damaged_path, _ = create_damaged_pdf(truncate_bytes=20)

        repaired = _repair_gs106_jpeg_corruption(source_path, damaged_path)
        assert repaired is False, "Should not repair truncation > 15 bytes"
        assert "JPEG corruption detected" not in caplog.text
================================================
FILE: tests/test_graft.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
from unittest.mock import patch
import pikepdf
import ocrmypdf
def test_no_glyphless_graft(resources, outdir):
    """Run OCR on a three-page PDF with MAX_REPLACE_PAGES patched down to 2.

    The input is francais.pdf with the pages of aspect.pdf and cmyk.pdf
    appended.
    """
    combined = outdir / 'test.pdf'
    with (
        pikepdf.open(resources / 'francais.pdf') as base,
        pikepdf.open(resources / 'aspect.pdf') as aspect,
        pikepdf.open(resources / 'cmyk.pdf') as cmyk,
    ):
        for extra in (aspect, cmyk):
            base.pages.extend(extra.pages)
        base.save(combined)

    ocr_options = {'deskew': True, 'tesseract_timeout': 0, 'force_ocr': True}
    with patch('ocrmypdf._graft.MAX_REPLACE_PAGES', 2):
        ocrmypdf.ocr(combined, outdir / 'out.pdf', **ocr_options)
    # This test needs asserts
def test_links(resources, outpdf):
    """After redo-OCR, each page's link annotation must point at the other page."""
    ocrmypdf.ocr(
        resources / 'link.pdf',
        outpdf,
        redo_ocr=True,
        oversample=200,
        output_type='pdf',
    )
    with pikepdf.open(outpdf) as pdf:
        first, second = pdf.pages[0], pdf.pages[1]
        # The destination of the first annotation on each page is the other page.
        assert first.Annots[0].A.D[0].objgen == second.objgen
        assert second.Annots[0].A.D[0].objgen == first.objgen
def test_redo_ocr_with_offset_mediabox(resources, outdir):
    """Check that redo_ocr copes with a mediabox whose origin is not zero.

    Regression test for issue #1630: PDFs with mediabox origins like
    [0, 100, width, height+100] (common in cropped/JSTOR-style PDFs) had
    their OCR text shifted vertically because the text layer CTM did not
    account for the page origin offset.
    """
    y_offset = 100
    input_pdf = outdir / 'offset_mediabox_input.pdf'
    output_pdf = outdir / 'offset_redo_ocr.pdf'

    # Build the input: shift the Y extent of the first page's mediabox.
    with pikepdf.open(resources / 'graph_ocred.pdf') as pdf:
        first_page = pdf.pages[0]
        box = list(first_page.MediaBox)
        first_page.MediaBox = [box[0], box[1] + y_offset, box[2], box[3] + y_offset]
        pdf.save(input_pdf)

    # This is the call that used to misplace the text layer.
    ocrmypdf.ocr(input_pdf, output_pdf, redo_ocr=True)

    with pikepdf.open(output_pdf) as pdf:
        page = pdf.pages[0]
        mediabox = list(page.MediaBox)

        # The shifted origin must survive the round trip.
        assert (
            float(mediabox[1]) == y_offset
        ), f"MediaBox Y origin should be preserved at {y_offset}, got {mediabox[1]}"

        # The content stream should carry a CTM that translates by the origin;
        # before the fix the CTM was omitted when rotation was 0.
        content = page.Contents.read_bytes()
        assert b'cm' in content, (
            "Content stream should include a CTM to translate by the page origin"
        )
def test_strip_invisble_text():
    """strip_invisible_text must drop invisible text and keep visible text.

    The page content holds, in order: an invisible text object inside a
    ``q``/``Q`` pair (``3 Tr`` set before it), a visible text object, a
    text object that sets ``3 Tr`` itself, and a further text object
    drawn after that ``3 Tr``.
    """
    Instruction = pikepdf.ContentStreamInstruction
    Operator = pikepdf.Operator

    def text_object(label, *, tr_operands=None):
        """Build BT ... ET instructions that show *label* via ``Tj``.

        When *tr_operands* is given, a ``Tr`` instruction with those
        operands is emitted right after ``BT``.
        """
        instructions = [Instruction((), Operator('BT'))]
        if tr_operands is not None:
            instructions.append(Instruction(tr_operands, Operator('Tr')))
        instructions.extend(
            [
                Instruction((pikepdf.Name('/F0'), 12), Operator('Tf')),
                Instruction((288, 720), Operator('Td')),
                Instruction((pikepdf.String(label),), Operator('Tj')),
                Instruction((), Operator('ET')),
            ]
        )
        return instructions

    def count(string, page):
        """Count the ``Tj`` instructions on *page* whose operand is *string*."""
        return sum(
            1
            for operands, operator in pikepdf.parse_content_stream(page)
            if operator == Operator('Tj') and operands[0] == pikepdf.String(string)
        )

    pdf = pikepdf.Pdf.new()
    page = pdf.add_blank_page()

    stream = [
        Instruction([], Operator('q')),
        Instruction([3], Operator('Tr')),
        *text_object('invisible'),
        Instruction([], Operator('Q')),
        *text_object('visible'),
        *text_object('invisible', tr_operands=[3]),
        *text_object('invisible'),
    ]
    page.Contents = pikepdf.Stream(pdf, pikepdf.unparse_content_stream(stream))

    nr_visible_pre = count('visible', page)
    ocrmypdf._graft.strip_invisible_text(pdf, page)
    nr_visible_post = count('visible', page)

    assert (
        nr_visible_pre == nr_visible_post
    ), 'Number of visible text elements did not change'
    assert count('invisible', page) == 0, 'No invisible elems left'
================================================
FILE: tests/test_helpers.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import logging
import multiprocessing
import os
from pathlib import Path
from unittest.mock import MagicMock
import pytest
from packaging.version import Version
from ocrmypdf import helpers
from ocrmypdf.helpers import running_in_docker
# Skip marker for tests that create POSIX symlinks; skips when os.name is 'nt'.
needs_symlink = pytest.mark.skipif(os.name == 'nt', reason='needs posix symlink')
# Skip marker for Windows-specific tests; skips when os.name is anything but 'nt'.
windows_only = pytest.mark.skipif(os.name != 'nt', reason="Windows test")
class TestSafeSymlink:
    """Tests for helpers.safe_symlink."""

    def test_safe_symlink_link_self(self, tmp_path, caplog):
        """Linking a path to itself should log a warning as the first record."""
        same_path = tmp_path / 'self'
        helpers.safe_symlink(same_path, same_path)
        first_record = caplog.record_tuples[0]
        assert first_record[1] == logging.WARNING

    def test_safe_symlink_overwrite(self, tmp_path):
        """An existing regular file at the link location must not be replaced."""
        existing = tmp_path / 'regular_file'
        existing.touch()
        with pytest.raises(FileExistsError):
            helpers.safe_symlink(tmp_path / 'input', existing)

    @needs_symlink
    def test_safe_symlink_relink(self, tmp_path):
        """An existing symlink may be re-pointed at a different file."""
        file_a = tmp_path / 'regular_file_a'
        file_b = tmp_path / 'regular_file_b'
        link = tmp_path / 'link'

        file_a.touch()
        file_b.write_bytes(b'ABC')
        link.symlink_to(file_a)

        helpers.safe_symlink(file_b, link)

        # Either the link resolves to file_b, or it at least has file_b's content.
        assert link.samefile(file_b) or link.read_bytes() == b'ABC'
def test_no_cpu_count(monkeypatch):
    """available_cpu_count falls back to 1, with a warning, if cpu_count fails."""
    calls = []

    def cpu_count_raises():
        # Record the call, then mimic a platform without cpu_count support.
        calls.append(True)
        raise NotImplementedError()

    monkeypatch.setattr(multiprocessing, 'cpu_count', cpu_count_raises)
    with pytest.warns(expected_warning=UserWarning):
        assert helpers.available_cpu_count() == 1
    assert calls, "Patched function called during test"
# Skip marker for tests that fail on Docker; running_in_docker() is called
# once, when this module is imported.
skipif_docker = pytest.mark.skipif(running_in_docker(), reason="fails on Docker")
class TestFileIsWritable:
    """Tests for helpers.is_file_writable."""

    @pytest.fixture
    def non_existent(self, tmp_path):
        """Path inside tmp_path that is never created."""
        return tmp_path / 'nofile'

    @pytest.fixture
    def basic_file(self, tmp_path):
        """Empty regular file inside tmp_path."""
        path = tmp_path / 'basic'
        path.touch()
        return path

    def test_plain(self, non_existent):
        """A path that does not exist yet counts as writable."""
        writable = helpers.is_file_writable(non_existent)
        assert writable

    @needs_symlink
    def test_symlink_loop(self, tmp_path):
        """A symlink that points at itself is not writable."""
        self_link = tmp_path / 'loop'
        self_link.symlink_to(self_link)
        assert not helpers.is_file_writable(self_link)

    @skipif_docker
    def test_chmod(self, basic_file):
        """Removing write permission makes the file non-writable."""
        assert helpers.is_file_writable(basic_file)
        for mode in (0o400, 0o000):
            basic_file.chmod(mode)
            assert not helpers.is_file_writable(basic_file)

    def test_permission_error(self, basic_file):
        """A PermissionError raised by is_file() is reported as not writable."""
        fake_path = MagicMock(spec_set=basic_file)
        fake_path.is_symlink.return_value = False
        fake_path.exists.return_value = True
        fake_path.is_file.side_effect = PermissionError
        assert not helpers.is_file_writable(fake_path)
@windows_only
def test_gs_install_locations():
    """A 'gs9.52' install directory maps to the key ('gs', Version('9.52'))."""
    # pylint: disable=import-outside-toplevel
    from ocrmypdf.subprocess._windows import _gs_version_in_path_key

    gs_bin_dir = Path("C:\\Program Files\\gs\\gs9.52\\bin")
    expected_key = ('gs', Version('9.52'))
    assert _gs_version_in_path_key(gs_bin_dir) == expected_key
@windows_only
def test_shim_paths(tmp_path):
    """shim_env_path keeps PATH first and appends discovered program dirs.

    The expected order is the original PATH entry, then tesseract-ocr, then
    the Ghostscript bin directories with gs9.52.3 ahead of 9.51.
    """
    # pylint: disable=import-outside-toplevel
    from ocrmypdf.subprocess._windows import shim_env_path

    progfiles = tmp_path / 'Program Files'
    progfiles.mkdir()
    (progfiles / 'tesseract-ocr').mkdir()
    for gs_dir in ('9.51', 'gs9.52.3'):
        (progfiles / 'gs' / gs_dir / 'bin').mkdir(parents=True)
    syspath = tmp_path / 'bin'

    env = {'PROGRAMFILES': str(progfiles), 'PATH': str(syspath)}
    results = shim_env_path(env=env).split(os.pathsep)

    assert results[0] == str(syspath), results
    assert results[-3].endswith('tesseract-ocr'), results
    assert results[-2].endswith(os.path.join('gs9.52.3', 'bin')), results
    assert results[-1].endswith(os.path.join('gs', '9.51', 'bin')), results
def test_resolution():
    """Exercise is_square, equality, str() and take_max on helpers.Resolution."""
    Resolution = helpers.Resolution
    square_100 = Resolution(100, 100)
    square_200 = Resolution(200, 200)
    mixed = Resolution(100, 200)

    assert square_100.is_square
    assert not mixed.is_square
    assert square_100 == Resolution(100, 100)
    assert str(square_100) != str(square_200)

    # take_max picks the largest X candidate and the largest Y candidate.
    expected_max = Resolution(300, 400)
    assert square_100.take_max([200, 300], [400]) == expected_max
================================================
FILE: tests/test_hocr_parser.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Unit tests for HocrParser class."""
from __future__ import annotations
from pathlib import Path
from textwrap import dedent
import pytest
from ocrmypdf.hocrtransform import (
HocrParseError,
HocrParser,
OcrClass,
)
@pytest.fixture
def simple_hocr(tmp_path: Path) -> Path:
    """Create a simple valid hOCR file.

    Writes the literal below to ``simple.hocr`` inside ``tmp_path`` as
    UTF-8 and returns the path of the written file.
    """
    # NOTE(review): as it appears here the literal holds only text content;
    # the markup that tests using this fixture assert on (bbox, language,
    # baseline, confidence) is not visible. Confirm against the repository copy.
    content = dedent("""\
Test
Hello
World
""")
    hocr_file = tmp_path / "simple.hocr"
    hocr_file.write_text(content, encoding='utf-8')
    return hocr_file
@pytest.fixture
def multiline_hocr(tmp_path: Path) -> Path:
    """Create an hOCR file with multiple lines and paragraphs.

    Writes the literal below to ``multiline.hocr`` inside ``tmp_path`` as
    UTF-8 and returns the path of the written file.
    """
    # NOTE(review): as it appears here the literal holds only text content;
    # the paragraph/line markup and language attributes the tests rely on
    # are not visible. Confirm against the repository copy.
    content = dedent("""\
Line
one
Line
two
German
text
""")
    hocr_file = tmp_path / "multiline.hocr"
    hocr_file.write_text(content, encoding='utf-8')
    return hocr_file
@pytest.fixture
def rtl_hocr(tmp_path: Path) -> Path:
    """Create an hOCR file with RTL text.

    Writes the literal below to ``rtl.hocr`` inside ``tmp_path`` as UTF-8
    and returns the path of the written file.
    """
    # NOTE(review): as it appears here the literal holds only the Arabic
    # word; the direction/language markup the tests rely on is not visible.
    # Confirm against the repository copy.
    content = dedent("""\
مرحبا
""")
    hocr_file = tmp_path / "rtl.hocr"
    hocr_file.write_text(content, encoding='utf-8')
    return hocr_file
@pytest.fixture
def rotated_hocr(tmp_path: Path) -> Path:
    """Create an hOCR file with rotated text (textangle).

    Writes the literal below to ``rotated.hocr`` inside ``tmp_path`` as
    UTF-8 and returns the path of the written file.
    """
    # NOTE(review): as it appears here the literal holds only text content;
    # the textangle markup the tests rely on is not visible. Confirm
    # against the repository copy.
    content = dedent("""\
Rotated
""")
    hocr_file = tmp_path / "rotated.hocr"
    hocr_file.write_text(content, encoding='utf-8')
    return hocr_file
@pytest.fixture
def header_hocr(tmp_path: Path) -> Path:
    """Create an hOCR file with different line types.

    Writes the literal below to ``header.hocr`` inside ``tmp_path`` as
    UTF-8 and returns the path of the written file.
    """
    # NOTE(review): as it appears here the literal holds only text content;
    # the header/line/caption markup the tests rely on is not visible.
    # Confirm against the repository copy.
    content = dedent("""\
Chapter
One
Body
text
Figure
1
""")
    hocr_file = tmp_path / "header.hocr"
    hocr_file.write_text(content, encoding='utf-8')
    return hocr_file
@pytest.fixture
def font_info_hocr(tmp_path: Path) -> Path:
    """Create an hOCR file with font information.

    Writes the literal below to ``font_info.hocr`` inside ``tmp_path`` as
    UTF-8 and returns the path of the written file.
    """
    # NOTE(review): as it appears here the literal holds only text content;
    # the font name/size markup the tests rely on is not visible. Confirm
    # against the repository copy.
    content = dedent("""\
Styled
""")
    hocr_file = tmp_path / "font_info.hocr"
    hocr_file.write_text(content, encoding='utf-8')
    return hocr_file
class TestHocrParserBasic:
    """Core parsing behaviour of HocrParser on the simple fixture."""

    def test_parse_simple_hocr(self, simple_hocr):
        """The root element is a PAGE with a 1000x500 bounding box."""
        page = HocrParser(simple_hocr).parse()
        assert page.ocr_class == OcrClass.PAGE
        assert page.bbox is not None
        assert page.bbox.width == 1000
        assert page.bbox.height == 500

    def test_parse_page_number(self, simple_hocr):
        """The parsed page reports page number 0."""
        page = HocrParser(simple_hocr).parse()
        assert page.page_number == 0

    def test_parse_paragraphs(self, simple_hocr):
        """There is exactly one paragraph, in English, left to right."""
        page = HocrParser(simple_hocr).parse()
        assert len(page.paragraphs) == 1
        only_paragraph = page.paragraphs[0]
        assert only_paragraph.ocr_class == OcrClass.PARAGRAPH
        assert only_paragraph.language == "eng"
        assert only_paragraph.direction == "ltr"

    def test_parse_lines(self, simple_hocr):
        """There is exactly one line, carrying a bbox and a baseline."""
        page = HocrParser(simple_hocr).parse()
        all_lines = page.lines
        assert len(all_lines) == 1
        only_line = all_lines[0]
        assert only_line.ocr_class == OcrClass.LINE
        assert only_line.bbox is not None
        assert only_line.baseline is not None
        assert only_line.baseline.slope == pytest.approx(0.01)
        assert only_line.baseline.intercept == -5

    def test_parse_words(self, simple_hocr):
        """The two words come back in document order."""
        page = HocrParser(simple_hocr).parse()
        all_words = page.words
        assert len(all_words) == 2
        assert all_words[0].text == "Hello"
        assert all_words[1].text == "World"

    def test_parse_word_confidence(self, simple_hocr):
        """Word confidences are 0.95 and 0.90 respectively."""
        page = HocrParser(simple_hocr).parse()
        first, second = page.words[0], page.words[1]
        assert first.confidence == pytest.approx(0.95)
        assert second.confidence == pytest.approx(0.90)

    def test_parse_word_bbox(self, simple_hocr):
        """The first word's bounding box is (100, 100, 200, 150)."""
        page = HocrParser(simple_hocr).parse()
        box = page.words[0].bbox
        assert box is not None
        assert box.left == 100
        assert box.top == 100
        assert box.right == 200
        assert box.bottom == 150
class TestHocrParserMultiline:
    """Parsing of hOCR with several lines and paragraphs."""

    def test_multiple_lines(self, multiline_hocr):
        """Two paragraphs hold three lines in total (two in the first, one in the second)."""
        page = HocrParser(multiline_hocr).parse()
        assert len(page.paragraphs) == 2
        assert len(page.lines) == 3

    def test_multiple_paragraphs_languages(self, multiline_hocr):
        """Each paragraph keeps its own language code."""
        page = HocrParser(multiline_hocr).parse()
        first, second = page.paragraphs[0], page.paragraphs[1]
        assert first.language == "eng"
        assert second.language == "deu"

    def test_word_count(self, multiline_hocr):
        """Three lines of two words each give six words."""
        page = HocrParser(multiline_hocr).parse()
        assert len(page.words) == 6
class TestHocrParserRTL:
    """Parsing of right-to-left text."""

    def test_rtl_direction(self, rtl_hocr):
        """The paragraph is marked right-to-left and Arabic."""
        page = HocrParser(rtl_hocr).parse()
        first_paragraph = page.paragraphs[0]
        assert first_paragraph.direction == "rtl"
        assert first_paragraph.language == "ara"

    def test_rtl_line_inherits_direction(self, rtl_hocr):
        """The line reports the right-to-left direction as well."""
        page = HocrParser(rtl_hocr).parse()
        first_line = page.lines[0]
        assert first_line.direction == "rtl"
class TestHocrParserRotation:
    """Parsing of rotated text."""

    def test_textangle(self, rotated_hocr):
        """Expect the first line's textangle to be approximately 5.5."""
        first_line = HocrParser(rotated_hocr).parse().lines[0]
        assert first_line.textangle == pytest.approx(5.5)
class TestHocrParserLineTypes:
    """Parsing of the different line classes."""

    def test_header_line(self, header_hocr):
        """Expect three lines covering the HEADER, LINE and CAPTION classes."""
        parsed_lines = HocrParser(header_hocr).parse().lines
        assert len(parsed_lines) == 3
        # Collect the class of every parsed line, then check each expected one
        seen_classes = []
        for parsed_line in parsed_lines:
            seen_classes.append(parsed_line.ocr_class)
        assert OcrClass.HEADER in seen_classes
        assert OcrClass.LINE in seen_classes
        assert OcrClass.CAPTION in seen_classes

    def test_all_line_types_have_words(self, header_hocr):
        """Expect every parsed line to have at least one child."""
        parsed_page = HocrParser(header_hocr).parse()
        for parsed_line in parsed_page.lines:
            child_count = len(parsed_line.children)
            assert child_count > 0
class TestHocrParserFontInfo:
    """Parsing of font information attached to words."""

    def test_font_name_and_size(self, font_info_hocr):
        """Expect the first word's font to be "Arial" at roughly 12.5."""
        font = HocrParser(font_info_hocr).parse().words[0].font
        assert font is not None
        assert font.name == "Arial"
        assert font.size == pytest.approx(12.5)
class TestHocrParserErrors:
"""Test error handling in HocrParser."""
# NOTE(review): the markup inside the string literals written by the tests
# below appears to have been stripped from this copy of the file (only the
# text content remains, and one literal is split across two lines). Restore
# the original hOCR/HTML payloads before relying on these tests -- TODO confirm
# against the upstream source.
# Constructing a parser for a path that does not exist must raise
# FileNotFoundError at construction time.
def test_missing_file(self, tmp_path):
with pytest.raises(FileNotFoundError):
HocrParser(tmp_path / "nonexistent.hocr")
# Malformed content must raise HocrParseError from the constructor itself,
# before parse() is ever called.
def test_invalid_xml(self, tmp_path):
hocr_file = tmp_path / "invalid.hocr"
hocr_file.write_text("not closed", encoding='utf-8')
with pytest.raises(HocrParseError):
HocrParser(hocr_file)
# Here the constructor is expected to succeed; parse() must raise
# HocrParseError with a message matching "No ocr_page".
def test_missing_ocr_page(self, tmp_path):
hocr_file = tmp_path / "no_page.hocr"
hocr_file.write_text(
"No ocr_page
", encoding='utf-8'
)
parser = HocrParser(hocr_file)
with pytest.raises(HocrParseError, match="No ocr_page"):
parser.parse()
# Here the constructor is expected to succeed; parse() must raise
# HocrParseError with a message matching "bbox".
def test_missing_page_bbox(self, tmp_path):
hocr_file = tmp_path / "no_bbox.hocr"
hocr_file.write_text(
"No bbox",
encoding='utf-8',
)
parser = HocrParser(hocr_file)
with pytest.raises(HocrParseError, match="bbox"):
parser.parse()
class TestHocrParserEdgeCases:
"""Test edge cases in HocrParser."""
# NOTE(review): every dedent(...) payload in this class appears to have lost
# its hOCR markup in this copy of the file (only the text nodes remain), so
# the inputs shown here cannot exercise the behavior each test describes.
# Restore the original payloads -- TODO confirm against the upstream source.
# Each test writes its payload to a file under tmp_path, parses it, and
# asserts on the resulting page.
def test_empty_word_text(self, tmp_path):
"""Words with empty text should be skipped."""
content = dedent("""\
Valid
""")
hocr_file = tmp_path / "empty_word.hocr"
hocr_file.write_text(content, encoding='utf-8')
parser = HocrParser(hocr_file)
page = parser.parse()
# Only the non-empty word should be parsed
assert len(page.words) == 1
assert page.words[0].text == "Valid"
def test_whitespace_only_word(self, tmp_path):
"""Words with only whitespace should be skipped."""
content = dedent("""\
Valid
""")
hocr_file = tmp_path / "whitespace_word.hocr"
hocr_file.write_text(content, encoding='utf-8')
parser = HocrParser(hocr_file)
page = parser.parse()
# Exactly one word, "Valid", is expected to survive parsing
assert len(page.words) == 1
assert page.words[0].text == "Valid"
def test_line_without_bbox(self, tmp_path):
"""Lines without bbox should be skipped."""
content = dedent("""\
Word
Valid
""")
hocr_file = tmp_path / "no_line_bbox.hocr"
hocr_file.write_text(content, encoding='utf-8')
parser = HocrParser(hocr_file)
page = parser.parse()
# Only line with bbox should be parsed
assert len(page.lines) == 1
assert page.words[0].text == "Valid"
def test_unicode_normalization(self, tmp_path):
"""Text should be NFKC normalized."""
# Use a string with combining characters
# NOTE(review): the payload below shows plain "fi"; presumably the original
# contained a ligature character that was lost in this copy -- verify.
content = dedent("""\
fi
""")
hocr_file = tmp_path / "unicode.hocr"
hocr_file.write_text(content, encoding='utf-8')
parser = HocrParser(hocr_file)
page = parser.parse()
# fi ligature should be normalized to "fi"
assert page.words[0].text == "fi"
def test_words_directly_under_page(self, tmp_path):
"""Test fallback for words directly under page (no paragraph structure)."""
content = dedent("""\
Direct
Word
""")
hocr_file = tmp_path / "direct_words.hocr"
hocr_file.write_text(content, encoding='utf-8')
parser = HocrParser(hocr_file)
page = parser.parse()
# Words should be parsed as direct children
assert len(page.children) == 2
assert page.children[0].text == "Direct"
assert page.children[1].text == "Word"
def test_no_namespace(self, tmp_path):
"""Test parsing hOCR without XHTML namespace."""
content = dedent("""\
NoNS
""")
hocr_file = tmp_path / "no_namespace.hocr"
hocr_file.write_text(content, encoding='utf-8')
parser = HocrParser(hocr_file)
page = parser.parse()
# A single word with text "NoNS" is expected
assert len(page.words) == 1
assert page.words[0].text == "NoNS"
================================================
FILE: tests/test_hocrtransform.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import re
from io import StringIO
from pathlib import Path
import pytest
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from PIL import Image
from ocrmypdf._exec.tesseract import generate_hocr
from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import Fpdf2PdfRenderer
from ocrmypdf.helpers import check_pdf
from ocrmypdf.hocrtransform import HocrParser
from .conftest import check_ocrmypdf
def text_from_pdf(filename):
    """Return the text pdfminer extracts from every page of *filename*.

    Args:
        filename: Path of the PDF to read; it is opened in binary mode.

    Returns:
        The accumulated text of all pages as a single string.
    """
    text_buffer = StringIO()
    with open(filename, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        resources = PDFResourceManager()
        converter = TextConverter(resources, text_buffer, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        # Each processed page appends its text to text_buffer via the converter
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
    return text_buffer.getvalue()
# pylint: disable=redefined-outer-name
@pytest.fixture
def font_dir():
    """Return the ``src/ocrmypdf/data`` directory relative to the repository root."""
    repo_root = Path(__file__).parent.parent
    return repo_root / "src" / "ocrmypdf" / "data"
@pytest.fixture
def multi_font_manager(font_dir):
    """Build a MultiFontManager over the ``font_dir`` fixture's directory."""
    manager = MultiFontManager(font_dir)
    return manager
@pytest.fixture
def blank_hocr(tmp_path):
    """Generate hOCR for an all-zero 8x8 1-bit TIFF and return the hOCR path."""
    tiff_path = tmp_path / 'blank.tif'
    hocr_path = tmp_path / 'blank.hocr'

    blank_image = Image.new('1', (8, 8), 0)
    blank_image.save(tiff_path, format='TIFF')

    generate_hocr(
        input_file=tiff_path,
        output_hocr=hocr_path,
        output_text=tmp_path / 'blank.txt',
        languages=['eng'],
        engine_mode=1,
        tessconfig=[],
        pagesegmode=3,
        thresholding=0,
        user_words=None,
        user_patterns=None,
        timeout=None,
    )
    return hocr_path
def test_mono_image(blank_hocr, outdir, multi_font_manager):
    """Render the blank hOCR page to PDF with fpdf2 and validate the result."""
    # Write an 8x8 1-bit image with the main diagonal set
    diagonal = Image.new('1', (8, 8), 0)
    for offset in range(8):
        diagonal.putpixel((offset, offset), 1)
    diagonal.save(outdir / 'mono.tif', format='TIFF')

    # Parse the hOCR file
    ocr_page = HocrParser(str(blank_hocr)).parse()

    # Fall back to 8 when the hOCR carries no (or a falsy) DPI
    effective_dpi = ocr_page.dpi or 8

    # Render to PDF using fpdf2, then check the output PDF
    pdf_path = outdir / 'mono.pdf'
    Fpdf2PdfRenderer(
        page=ocr_page,
        dpi=effective_dpi,
        multi_font_manager=multi_font_manager,
        invisible_text=True,
    ).render(pdf_path)
    check_pdf(pdf_path)
@pytest.mark.slow
def test_fpdf2_matches_sandwich(resources, outdir):
    """Test that fpdf2 renderer produces similar output to sandwich renderer."""
    source_pdf = resources / 'ccitt.pdf'
    fpdf2_pdf = outdir / 'fpdf2.pdf'
    sandwich_pdf = outdir / 'tess.pdf'

    # Note: hocr renderer now redirects to fpdf2
    check_ocrmypdf(source_pdf, fpdf2_pdf, '--pdf-renderer=fpdf2')
    check_ocrmypdf(source_pdf, sandwich_pdf, '--pdf-renderer=sandwich')

    # Slight differences in spacing and word order can appear, so at least ensure
    # that we get all of the same words...
    def clean(s):
        """Collapse whitespace runs to one space and return the set of words."""
        collapsed = re.sub(r'\s+', ' ', s)
        return set(collapsed.split(' '))

    fpdf2_words = clean(text_from_pdf(fpdf2_pdf))
    tess_words = clean(text_from_pdf(sandwich_pdf))

    # Jaccard-style ratio: shared words over all words seen in either output
    shared = fpdf2_words & tess_words
    combined = fpdf2_words | tess_words
    similarity = len(shared) / len(combined)
    assert similarity > 0.99
================================================
FILE: tests/test_image_input.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
from unittest.mock import patch
import img2pdf
import pikepdf
import pytest
from PIL import Image
import ocrmypdf
from .conftest import check_ocrmypdf, run_ocrmypdf_api
# pylint: disable=redefined-outer-name
@pytest.fixture
def baiona(resources):
    """Yield the ``baiona_gray.png`` test image and close it on teardown.

    ``Image.open`` keeps the underlying file open until the image is closed,
    so the image is opened in a ``with`` block and handed to the test via
    ``yield``; the file handle is released once the test finishes instead of
    being leaked.
    """
    with Image.open(resources / 'baiona_gray.png') as im:
        yield im
def test_image_to_pdf(resources, outpdf):
    """Convert ``crom.png`` to PDF at 200 DPI using the tesseract_noop plugin."""
    cli_args = (
        '--image-dpi',
        '200',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )
    check_ocrmypdf(resources / 'crom.png', outpdf, *cli_args)
def test_no_dpi_info(caplog, baiona, outdir, no_outpdf):
    """An image without DPI info fails as input_file and the log names --image-dpi."""
    assert 'dpi' not in baiona.info
    image_path = outdir / 'baiona_no_dpi.png'
    baiona.save(image_path)

    exit_code = run_ocrmypdf_api(image_path, no_outpdf)

    assert exit_code == ocrmypdf.ExitCode.input_file
    assert "--image-dpi" in caplog.text
def test_dpi_not_credible(caplog, baiona, outdir, no_outpdf):
    """An image saved with dpi=(30, 30) fails as input_file with "not credible" logged."""
    assert 'dpi' not in baiona.info
    image_path = outdir / 'baiona_no_dpi.png'
    baiona.save(image_path, dpi=(30, 30))

    exit_code = run_ocrmypdf_api(image_path, no_outpdf)

    assert exit_code == ocrmypdf.ExitCode.input_file
    assert "not credible" in caplog.text
def test_cmyk_no_icc(caplog, resources, no_outpdf):
    """``baiona_cmyk.jpg`` fails as input_file and "no ICC profile" is logged."""
    cmyk_image = resources / 'baiona_cmyk.jpg'
    exit_code = run_ocrmypdf_api(cmyk_image, no_outpdf)
    assert exit_code == ocrmypdf.ExitCode.input_file
    assert "no ICC profile" in caplog.text
def test_img2pdf_fails(resources, no_outpdf):
    """When img2pdf.convert raises ImageOpenError the run ends with input_file."""
    simulated_failure = img2pdf.ImageOpenError()
    convert_patch = patch(
        'ocrmypdf._pipeline.img2pdf.convert', side_effect=simulated_failure
    )
    with convert_patch as mock:
        exit_code = run_ocrmypdf_api(
            resources / 'baiona_gray.png', no_outpdf, '--image-dpi', '200'
        )
        assert exit_code == ocrmypdf.ExitCode.input_file
        # The patched convert function must actually have been reached
        mock.assert_called()
@pytest.mark.xfail(reason="remove background disabled")
def test_jpeg_in_jpeg_out(resources, outpdf):
    """With --remove-background, the first page image should stay DCTDecode."""
    cli_args = [
        '--image-dpi',
        '100',
        '--output-type',
        'pdf',  # specifically check pdf because Ghostscript may convert to JPEG
        '--remove-background',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'baiona_color.jpg', outpdf, *cli_args)

    with pikepdf.open(outpdf) as pdf:
        first_page_images = pdf.pages[0].images
        first_image = next(iter(first_page_images.values()))
        assert first_image.Filter == pikepdf.Name.DCTDecode
================================================
FILE: tests/test_imageops.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import hypothesis.strategies as st
from hypothesis import given
from PIL import Image
from ocrmypdf.imageops import (
_calculate_downsample,
bytes_per_pixel,
calculate_downsample,
downsample_image,
)
def test_bytes_per_pixel():
    """Check bytes_per_pixel for the RGB, RGBA, LA and L modes."""
    expected_by_mode = (('RGB', 4), ('RGBA', 4), ('LA', 2), ('L', 1))
    for mode, expected in expected_by_mode:
        assert bytes_per_pixel(mode) == expected
def test_calculate_downsample():
    """Check calculate_downsample on a 100x100 RGB image for each limit kind."""
    image = Image.new('RGB', (100, 100))
    half = (50, 50)
    unchanged = (100, 100)

    assert calculate_downsample(image, max_size=(50, 50)) == half
    assert calculate_downsample(image, max_pixels=2500) == half
    assert calculate_downsample(image, max_bytes=10000) == half
    # A byte budget large enough for the whole image leaves the size as is
    assert calculate_downsample(image, max_bytes=100000) == unchanged
@given(
    st.one_of(st.just("RGB"), st.just('L')),
    st.integers(min_value=1, max_value=100000),
    st.integers(min_value=1, max_value=100000),
    st.integers(min_value=64, max_value=100000),
    st.integers(min_value=64, max_value=100000),
    st.integers(min_value=64 * 64, max_value=1000000),
)
def test_calculate_downsample_hypothesis(mode, im_w, im_h, max_x, max_y, max_bytes):
    """Property: the downsampled size respects max_size and max_bytes."""
    image_size = (im_w, im_h)
    size_limit = (max_x, max_y)
    result = _calculate_downsample(
        image_size,
        bytes_per_pixel(mode),
        max_size=size_limit,
        max_bytes=max_bytes,
    )
    # Neither dimension may exceed its limit...
    assert result[0] <= max_x
    assert result[1] <= max_y
    # ...and the total byte footprint must fit the budget
    footprint = result[0] * result[1] * bytes_per_pixel(mode)
    assert footprint <= max_bytes
def test_downsample_image():
    """Halving a 100x100, 300-DPI image yields 50x50 at 150 DPI."""
    original = Image.new('RGB', (100, 100))
    original.info['dpi'] = (300, 300)

    downsampled = downsample_image(original, (50, 50))

    assert downsampled.size == (50, 50)
    assert downsampled.info['dpi'] == (150, 150)
================================================
FILE: tests/test_json_serialization.py
================================================
"""Test JSON serialization of OcrOptions for multiprocessing compatibility."""
from __future__ import annotations
import multiprocessing
from io import BytesIO
from pathlib import Path, PurePath
import pytest
from ocrmypdf._options import OcrOptions
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOptions
@pytest.fixture(autouse=True)
def register_plugin_models():
    """Register the tesseract plugin model on OcrOptions before each test."""
    plugin_models = {'tesseract': TesseractOptions}
    OcrOptions.register_plugin_models(plugin_models)
    yield
    # No teardown is performed after the test; the registration is left in place.
def worker_function(options_json: str) -> str:
    """Rebuild OcrOptions from JSON and report selected fields as a JSON string.

    Args:
        options_json: JSON text accepted by ``OcrOptions.model_validate_json_safe``.

    Returns:
        A JSON object string with the input/output file paths, languages,
        optimize level, tesseract timeout, fast_web_view and the number of
        ``extra_attrs`` keys that do not start with an underscore.
    """
    import json

    # Imports and registration are repeated here so they also happen when this
    # function runs inside a worker process.
    from ocrmypdf._options import OcrOptions
    from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOptions

    OcrOptions.register_plugin_models({'tesseract': TesseractOptions})

    # Reconstruct OcrOptions from JSON
    options = OcrOptions.model_validate_json_safe(options_json)

    # Count only user-added extra_attrs (keys starting with '_' are excluded)
    user_attrs_count = sum(
        1 for key in options.extra_attrs.keys() if not key.startswith('_')
    )

    summary = {
        'input_file': str(options.input_file),
        'output_file': str(options.output_file),
        'languages': options.languages,
        'optimize': options.optimize,
        'tesseract_timeout': options.tesseract.timeout,
        'fast_web_view': options.fast_web_view,
        'extra_attrs_count': user_attrs_count,
    }
    return json.dumps(summary)
def test_json_serialization_multiprocessing():
    """Round-trip OcrOptions through JSON locally and in a two-process pool."""
    import json

    def without_private_keys(attrs):
        """Return *attrs* minus keys that start with an underscore."""
        return {k: v for k, v in attrs.items() if not k.startswith('_')}

    # Create OcrOptions with various field types
    options = OcrOptions(
        input_file=Path('/test/input.pdf'),
        output_file=Path('/test/output.pdf'),
        languages=['eng', 'deu'],
        optimize=2,
        tesseract_timeout=120.0,
        fast_web_view=2.5,
        deskew=True,
        clean=False,
    )

    # Add some extra attributes
    options.extra_attrs['custom_field'] = 'test_value'
    options.extra_attrs['numeric_field'] = 42

    # Serialize to JSON, then deserialize again in the main process
    options_json = options.model_dump_json_safe()
    reconstructed = OcrOptions.model_validate_json_safe(options_json)

    assert reconstructed.input_file == options.input_file
    assert reconstructed.output_file == options.output_file
    assert reconstructed.languages == options.languages
    assert reconstructed.optimize == options.optimize
    assert reconstructed.tesseract_timeout == options.tesseract.timeout
    assert reconstructed.fast_web_view == options.fast_web_view
    assert reconstructed.deskew == options.deskew
    assert reconstructed.clean == options.clean

    # Compare user-added extra_attrs (excluding keys that start with '_')
    assert without_private_keys(reconstructed.extra_attrs) == without_private_keys(
        options.extra_attrs
    )

    # Send the same JSON string to two worker invocations
    payloads = [options_json, options_json]
    with multiprocessing.Pool(processes=2) as pool:
        results = pool.map(worker_function, payloads)

    # Verify what each worker reported back
    for result_json in results:
        reported = json.loads(result_json)
        assert PurePath(reported['input_file']) == PurePath('/test/input.pdf')
        assert PurePath(reported['output_file']) == PurePath('/test/output.pdf')
        assert reported['languages'] == ['eng', 'deu']
        assert reported['optimize'] == 2
        assert reported['tesseract_timeout'] == 120.0
        assert reported['fast_web_view'] == 2.5
        assert reported['extra_attrs_count'] == 2  # custom_field and numeric_field
def test_json_serialization_with_streams():
    """Stream inputs/outputs come back from the JSON round trip as 'stream'."""
    options = OcrOptions(
        input_file=BytesIO(b'fake pdf data'),
        output_file=BytesIO(),
        languages=['eng'],
        optimize=1,
    )

    # Round trip: serialize, then deserialize
    serialized = options.model_dump_json_safe()
    reconstructed = OcrOptions.model_validate_json_safe(serialized)

    # Both stream fields are expected to be the placeholder string
    assert reconstructed.input_file == 'stream'
    assert reconstructed.output_file == 'stream'
    # Ordinary fields are expected to survive unchanged
    assert reconstructed.languages == ['eng']
    assert reconstructed.optimize == 1
def test_json_serialization_with_none_values():
    """Defaults, including None ones, survive the JSON round trip."""
    # Only the required fields are given; everything else keeps its default
    options = OcrOptions(
        input_file=Path('/test/input.pdf'),
        output_file=Path('/test/output.pdf'),
        languages=['eng'],
    )

    serialized = options.model_dump_json_safe()
    reconstructed = OcrOptions.model_validate_json_safe(serialized)

    # Fields expected to be None after the round trip
    assert reconstructed.tesseract_timeout is None  # Default value
    # Fields expected to carry a non-None default
    assert reconstructed.fast_web_view == 1.0  # Default value, not None
    strategy = reconstructed.color_conversion_strategy
    assert strategy == "LeaveColorUnchanged"  # Default value
    assert reconstructed.pdfa_image_compression is None  # This one is actually None

    # Explicitly supplied values are preserved
    assert reconstructed.input_file == options.input_file
    assert reconstructed.output_file == options.output_file
    assert reconstructed.languages == options.languages
================================================
FILE: tests/test_logging.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import logging
from ocrmypdf._pipelines._common import configure_debug_logging
def test_debug_logging(tmp_path):
    """Exercise configure_debug_logging without validating what it writes."""
    # See https://github.com/pytest-dev/pytest/issues/5502 for pytest logging quirks
    prefix = 'test_debug_logging'
    logger = logging.getLogger(prefix)
    log_path = tmp_path / 'test.log'
    _handler, remove_handler = configure_debug_logging(log_path, prefix)
    logger.info("test message")
    remove_handler()
================================================
FILE: tests/test_main.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import os
import shutil
import sys
from math import isclose
from pathlib import Path
from subprocess import run
from unittest.mock import patch
import pikepdf
import pytest
from PIL import Image
import ocrmypdf
from ocrmypdf._exec import tesseract
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
from ocrmypdf.helpers import running_in_docker
from ocrmypdf.pdfa import file_claims_pdfa
from ocrmypdf.pdfinfo import Colorspace, Encoding, PdfInfo
from ocrmypdf.subprocess import get_version
from .conftest import (
check_ocrmypdf,
first_page_dimensions,
have_unpaper,
is_macos,
run_ocrmypdf,
run_ocrmypdf_api,
)
# pylint: disable=redefined-outer-name
# Values passed to --pdf-renderer by the tests parametrized over 'renderer'.
RENDERERS = ['fpdf2', 'sandwich']
def test_quick(resources, outpdf):
    """Smoke test: process ``ccitt.pdf`` with the tesseract_cache plugin."""
    plugin_args = ('--plugin', 'tests/plugins/tesseract_cache.py')
    check_ocrmypdf(resources / 'ccitt.pdf', outpdf, *plugin_args)
@pytest.mark.parametrize('renderer', RENDERERS)
def test_oversample(renderer, resources, outpdf):
    """With --oversample 350 the first page's horizontal DPI is within 1 of 350."""
    cli_args = [
        '--oversample',
        '350',
        '-f',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    oversampled_pdf = check_ocrmypdf(resources / 'skew.pdf', outpdf, *cli_args)

    horizontal_dpi = PdfInfo(oversampled_pdf)[0].dpi.x
    print(horizontal_dpi)
    assert abs(horizontal_dpi - 350) < 1
def test_repeat_ocr(resources, no_outpdf):
    """Running on ``graph_ocred.pdf`` without flags reports already_done_ocr."""
    exit_code = run_ocrmypdf_api(resources / 'graph_ocred.pdf', no_outpdf)
    assert exit_code == ExitCode.already_done_ocr
def test_force_ocr(resources, outpdf):
    """With -f, the output's first page has text."""
    cli_args = ('-f', '--plugin', 'tests/plugins/tesseract_cache.py')
    result_pdf = check_ocrmypdf(resources / 'graph_ocred.pdf', outpdf, *cli_args)
    first_page = PdfInfo(result_pdf)[0]
    assert first_page.has_text
def test_skip_ocr(resources, outpdf):
    """With -s, the output's first page has text."""
    cli_args = ('-s', '--plugin', 'tests/plugins/tesseract_cache.py')
    result_pdf = check_ocrmypdf(resources / 'graph_ocred.pdf', outpdf, *cli_args)
    first_page = PdfInfo(result_pdf)[0]
    assert first_page.has_text
def test_redo_ocr(resources, outpdf):
    """--redo-ocr keeps text on page one but changes its text areas."""
    source_pdf = resources / 'graph_ocred.pdf'
    before = PdfInfo(source_pdf, detailed_analysis=True)

    result_pdf = check_ocrmypdf(source_pdf, outpdf, '--redo-ocr')
    after = PdfInfo(result_pdf, detailed_analysis=True)

    # Both the input and the output must carry text on the first page
    assert before[0].has_text and after[0].has_text

    areas_before = before[0].get_textareas()
    areas_after = after[0].get_textareas()
    assert areas_before != areas_after, "Expected text to be different after re-OCR"
def test_argsfile(resources, outdir):
    """Pass options through an ``@argsfile`` containing one argument per line."""
    path_argsfile = outdir / 'test_argsfile.txt'
    argsfile_entries = (
        '--title',
        'ArgsFile Test',
        '--author',
        'Test Cases',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )
    with open(str(path_argsfile), 'w') as argsfile:
        print(*argsfile_entries, sep='\n', end='\n', file=argsfile)

    # Note: the args file path is also passed as the output path here.
    argsfile_reference = '@' + str(outdir / 'test_argsfile.txt')
    check_ocrmypdf(resources / 'graph.pdf', path_argsfile, argsfile_reference)
@pytest.mark.parametrize('renderer', RENDERERS)
def test_ocr_timeout(renderer, resources, outpdf):
    """With --tesseract-timeout 0 the output's first page has no text."""
    cli_args = ['--tesseract-timeout', '0', '--pdf-renderer', renderer]
    result_pdf = check_ocrmypdf(resources / 'skew.pdf', outpdf, *cli_args)
    first_page = PdfInfo(result_pdf)[0]
    assert not first_page.has_text
def test_skip_big(resources, outpdf):
    """With --skip-big 1 the output's first page has no text."""
    cli_args = ('--skip-big', '1', '--plugin', 'tests/plugins/tesseract_cache.py')
    result_pdf = check_ocrmypdf(resources / 'jbig2.pdf', outpdf, *cli_args)
    first_page = PdfInfo(result_pdf)[0]
    assert not first_page.has_text
@pytest.mark.parametrize('renderer', RENDERERS)
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_maximum_options(renderer, output_type, multipage, outpdf):
    """Run the multipage input with many options enabled at once."""
    cli_args = [
        '-d',
        # -ci is only passed when unpaper is available; otherwise None is passed
        '-ci' if have_unpaper() else None,
        '-f',
        '-k',
        '--oversample',
        '300',
        '--skip-big',
        '10',
        '--title',
        'Too Many Weird Files',
        '--author',
        'py.test',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(multipage, outpdf, *cli_args)
@pytest.mark.skipif(
    tesseract.version() >= tesseract.TesseractVersion('5'),
    reason="tess 5 tries harder to find its files",
)
def test_tesseract_missing_tessdata(monkeypatch, resources, no_outpdf, tmpdir):
    """Pointing TESSDATA_PREFIX at an empty dir raises MissingDependencyError."""
    empty_prefix = os.fspath(tmpdir)
    monkeypatch.setenv("TESSDATA_PREFIX", empty_prefix)
    cli_args = ('-v', '1', '--skip-text')
    with pytest.raises(MissingDependencyError):
        run_ocrmypdf_api(resources / 'graph.pdf', no_outpdf, *cli_args)
def test_invalid_input_pdf(resources, no_outpdf):
    """``invalid.pdf`` is rejected with ExitCode.input_file."""
    exit_code = run_ocrmypdf_api(resources / 'invalid.pdf', no_outpdf)
    assert exit_code == ExitCode.input_file
def test_blank_input_pdf(resources, outpdf):
    """``blank.pdf`` is processed with ExitCode.ok."""
    exit_code = run_ocrmypdf_api(resources / 'blank.pdf', outpdf)
    assert exit_code == ExitCode.ok
def test_force_ocr_on_pdf_with_no_images(resources, no_outpdf):
    """--force-ocr on a blank PDF still invokes the (crashing) OCR plugin."""
    # As a correctness test, make sure that --force-ocr on a PDF with no
    # content still triggers tesseract. If tesseract crashes, then it was
    # called.
    cli_args = ('--force-ocr', '--plugin', 'tests/plugins/tesseract_crash.py')
    exitcode = run_ocrmypdf_api(resources / 'blank.pdf', no_outpdf, *cli_args)

    assert exitcode == ExitCode.child_process_error
    assert not no_outpdf.exists()
@pytest.mark.skipif(
    is_macos(),
    reason="takes too long to install language packs in macOS homebrew",
)
def test_german(resources, outdir):
    """Process ``francais.pdf`` with ``-l deu`` and a sidecar file."""
    # Produce a sidecar too - implicit test that system locale is set up
    # properly. It is fine that we are testing -l deu on a French file because
    # we are exercising the functionality not going for accuracy.
    sidecar = outdir / 'francais.txt'
    cli_args = [
        '-l',
        'deu',  # more commonly installed
        '--sidecar',
        sidecar,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    try:
        check_ocrmypdf(resources / 'francais.pdf', outdir / 'francais.pdf', *cli_args)
    except MissingDependencyError:
        # A missing 'deu' pack is an expected failure; anything else propagates
        deu_installed = 'deu' in tesseract.get_languages()
        if not deu_installed:
            pytest.xfail(reason="tesseract-deu language pack not installed")
        raise
def test_klingon(resources, outpdf):
    """Requesting language ``klz`` raises MissingDependencyError."""
    source_pdf = resources / 'francais.pdf'
    with pytest.raises(MissingDependencyError):
        run_ocrmypdf_api(source_pdf, outpdf, '-l', 'klz')
def test_missing_docinfo(resources, outpdf):
    """``missing_docinfo.pdf`` is processed with ExitCode.ok."""
    cli_args = [
        '-l',
        'eng',
        '--skip-text',
        '--plugin',
        # The plugin is deliberately given as a Path object rather than a str
        Path('tests/plugins/tesseract_noop.py'),
    ]
    exit_code = run_ocrmypdf_api(resources / 'missing_docinfo.pdf', outpdf, *cli_args)
    assert exit_code == ExitCode.ok
def test_uppercase_extension(resources, outdir):
    """Input and output files with an uppercase ``.PDF`` extension are accepted."""
    uppercase_input = outdir / "UPPERCASE.PDF"
    uppercase_output = outdir / "UPPERCASE_OUT.PDF"
    shutil.copy(str(resources / "skew.pdf"), str(uppercase_input))
    check_ocrmypdf(
        uppercase_input,
        uppercase_output,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )
def test_input_file_not_found(caplog, no_outpdf):
    """A nonexistent input yields input_file and the log names the file."""
    missing_name = "does not exist.pdf"
    exit_code = run_ocrmypdf_api(missing_name, no_outpdf)
    assert exit_code == ExitCode.input_file
    assert missing_name in caplog.text
@pytest.mark.skipif(os.name == 'nt' or running_in_docker(), reason="chmod")
def test_input_file_not_readable(caplog, resources, outdir, no_outpdf):
    """An input with mode 000 yields input_file and the log names the file."""
    unreadable_pdf = outdir / 'trivial.pdf'
    shutil.copy(resources / 'trivial.pdf', unreadable_pdf)
    # Strip every permission bit so the file cannot be opened
    unreadable_pdf.chmod(0o000)

    exit_code = run_ocrmypdf_api(unreadable_pdf, no_outpdf)

    assert exit_code == ExitCode.input_file
    assert str(unreadable_pdf) in caplog.text
def test_input_file_not_a_pdf(caplog, no_outpdf):
    """Feeding this Python source file as input yields input_file."""
    this_file = __file__  # Try to OCR this file
    exit_code = run_ocrmypdf_api(this_file, no_outpdf)
    assert exit_code == ExitCode.input_file
    on_windows = os.name == 'nt'
    if not on_windows:  # name will be mangled with \\'s on nt
        assert this_file in caplog.text
@pytest.mark.parametrize('renderer', RENDERERS)
def test_pagesegmode(renderer, resources, outpdf):
    """Run with --tesseract-pagesegmode 7 for each renderer."""
    cli_args = [
        '--tesseract-pagesegmode',
        '7',
        '-v',
        '1',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(resources / 'skew.pdf', outpdf, *cli_args)
def test_tesseract_oem(resources, outpdf):
    """Run with --tesseract-oem 1."""
    cli_args = (
        '--tesseract-oem',
        '1',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *cli_args)
@pytest.mark.parametrize('value', ['auto', 'otsu', 'adaptive-otsu', 'sauvola'])
def test_tesseract_thresholding(value, resources, outpdf):
    """Each accepted --tesseract-thresholding value runs successfully."""
    cli_args = [
        '--tesseract-thresholding',
        value,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *cli_args)
@pytest.mark.parametrize('value', ['abcxyz'])
def test_tesseract_thresholding_invalid(value, resources, no_outpdf):
    """An unknown thresholding value raises SystemExit matching '2'."""
    cli_args = [
        '--tesseract-thresholding',
        value,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    with pytest.raises(SystemExit, match='2'):
        run_ocrmypdf_api(resources / 'trivial.pdf', no_outpdf, *cli_args)
@pytest.mark.parametrize('renderer', RENDERERS)
def test_tesseract_crash(renderer, resources, no_outpdf, caplog):
    """A crashing OCR plugin yields child_process_error and no output file."""
    cli_args = [
        '-v',
        '1',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_crash.py',
    ]
    exitcode = run_ocrmypdf_api(resources / 'ccitt.pdf', no_outpdf, *cli_args)

    assert exitcode == ExitCode.child_process_error
    assert not no_outpdf.exists()
    assert "SubprocessOutputError" in caplog.text
def test_tesseract_crash_autorotate(resources, no_outpdf, caplog):
    """With -r, a crashing OCR plugin logs "uncaught exception"."""
    cli_args = ('-r', '--plugin', 'tests/plugins/tesseract_crash.py')
    exitcode = run_ocrmypdf_api(resources / 'ccitt.pdf', no_outpdf, *cli_args)

    assert exitcode == ExitCode.child_process_error
    assert not no_outpdf.exists()
    assert "uncaught exception" in caplog.text
@pytest.mark.parametrize('renderer', RENDERERS)
@pytest.mark.slow
def test_tesseract_image_too_big(renderer, resources, outpdf):
    """``hugemono.pdf`` still completes with the big-image-error plugin."""
    cli_args = [
        '-r',
        '--pdf-renderer',
        renderer,
        '--max-image-mpixels',
        '0',
        '--plugin',
        'tests/plugins/tesseract_big_image_error.py',
    ]
    check_ocrmypdf(resources / 'hugemono.pdf', outpdf, *cli_args)
@pytest.mark.parametrize('encryption_level', [2, 3, 4, 6])
def test_encrypted(resources, outpdf, encryption_level, caplog):
    """Encrypted input is rejected with ``ExitCode.encrypted_pdf``.

    ``jbig2.pdf`` is re-saved with pikepdf using the requested encryption
    revision (AES for R >= 4, metadata encryption for R == 6), then fed back
    to OCRmyPDF; the run must fail and log 'encryption must be removed'.
    """
    # macOS is identified by sys.platform == 'darwin'. os.name is 'posix' there
    # and is never 'darwin', so testing os.name would make this skip unreachable.
    if (
        sys.platform == 'darwin'
        and sys.version_info >= (3, 12)
        and encryption_level <= 4
    ):
        # Error is: RuntimeError: unable to load openssl legacy provider
        # pikepdf obtains encryption from qpdf, which gets it from openssl among other
        # providers.
        # Error message itself comes from here:
        # https://github.com/qpdf/qpdf/blob/da3eae39c8e5261196bbc1b460e5b556c6836dbf/libqpdf/QPDFCrypto_openssl.cc#L56
        # Somehow pikepdf + Python 3.12 + macOS does not have this problem, despite
        # using Homebrew's qpdf. Possibly the difference is that pikepdf's Python 3.12
        # comes from cibuildwheel, and our macOS Python 3.12 comes from GitHub Actions
        # setup-python. It may be necessary to build a custom qpdf for macOS.
        # In any case, OCRmyPDF doesn't support loading encrypted files at all, it
        # just complains about encryption, and it's using pikepdf to generate encrypted
        # files for testing.
        pytest.skip("GitHub Python 3.12 on macOS does not have openssl legacy support")

    encryption = pikepdf.models.encryption.Encryption(
        owner='ocrmypdf',
        user='ocrmypdf',
        R=encryption_level,
        aes=(encryption_level >= 4),
        metadata=(encryption_level == 6),
    )

    # Produce the encrypted fixture in place at outpdf
    with pikepdf.open(resources / 'jbig2.pdf') as pdf:
        pdf.save(outpdf, encryption=encryption)

    # The encrypted file serves as both input and output path
    exitcode = run_ocrmypdf_api(
        outpdf,
        outpdf,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )
    assert exitcode == ExitCode.encrypted_pdf
    assert 'encryption must be removed' in caplog.text
def test_jbig2_passthrough(resources, outpdf):
    """The first image of the output is still JBIG2-encoded."""
    cli_args = (
        '--output-type',
        'pdf',
        '--pdf-renderer',
        'fpdf2',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )
    result_pdf = check_ocrmypdf(resources / 'jbig2.pdf', outpdf, *cli_args)
    first_image = PdfInfo(result_pdf)[0].images[0]
    assert first_image.enc == Encoding.jbig2
def test_masks(resources, outpdf):
    """``ocrmypdf.ocr`` on ``masks.pdf`` returns ExitCode.ok."""
    exit_code = ocrmypdf.ocr(
        resources / 'masks.pdf', outpdf, plugins=['tests/plugins/tesseract_noop.py']
    )
    assert exit_code == ExitCode.ok
def test_linearized_pdf_and_indirect_object(resources, outpdf):
    """Process ``epson.pdf`` with the tesseract_noop plugin."""
    plugin_args = ('--plugin', 'tests/plugins/tesseract_noop.py')
    check_ocrmypdf(resources / 'epson.pdf', outpdf, *plugin_args)
def test_very_high_dpi(resources, outpdf):
    """Checks for a Decimal quantize error with high DPI, etc."""
    plugin_args = ('--plugin', 'tests/plugins/tesseract_cache.py')
    check_ocrmypdf(resources / '2400dpi.pdf', outpdf, *plugin_args)

    first_image = PdfInfo(outpdf)[0].images[0]
    # The image must report equal horizontal/vertical DPI, close to 2400
    assert isclose(first_image.dpi.x, first_image.dpi.y)
    assert isclose(first_image.dpi.x, 2400)
def test_overlay(resources, outpdf):
    """Process ``overlay.pdf`` with --skip-text."""
    cli_args = ('--skip-text', '--plugin', 'tests/plugins/tesseract_noop.py')
    check_ocrmypdf(resources / 'overlay.pdf', outpdf, *cli_args)
@pytest.fixture
def protected_file(outdir):
    """Yield an empty ``protected.pdf`` whose mode is set to 0o400."""
    read_only_pdf = outdir / 'protected.pdf'
    read_only_pdf.touch()
    read_only_pdf.chmod(0o400)  # Read-only
    yield read_only_pdf
@pytest.mark.skipif(
    os.name == 'nt' or os.geteuid() == 0, reason="root can write to anything"
)
def test_destination_not_writable(resources, protected_file):
    """Writing to a read-only destination yields file_access_error."""
    plugin_args = ('--plugin', 'tests/plugins/tesseract_noop.py')
    exitcode = run_ocrmypdf_api(resources / 'jbig2.pdf', protected_file, *plugin_args)
    assert exitcode == ExitCode.file_access_error
@pytest.fixture
def valid_tess_config(outdir):
    """Yield the path of a ``test.cfg`` containing three parameter lines."""
    config_text = (
        'load_system_dawg 0\n'
        'language_model_penalty_non_dict_word 0\n'
        'language_model_penalty_non_freq_dict_word 0\n'
    )
    cfg_file = outdir / 'test.cfg'
    with cfg_file.open('w') as cfg_stream:
        cfg_stream.write(config_text)
    yield cfg_file
def test_tesseract_config_valid(resources, valid_tess_config, outpdf):
    """Page 1 of ``3small.pdf`` processes with the valid config file."""
    cli_args = ['--tesseract-config', valid_tess_config, '--pages', '1']
    check_ocrmypdf(resources / '3small.pdf', outpdf, *cli_args)
@pytest.fixture
def invalid_tess_config(outdir):
    """Yield the path of a ``test.cfg`` whose content is not a valid config."""
    config_text = 'THIS FILE IS INVALID\n'
    cfg_file = outdir / 'test.cfg'
    with cfg_file.open('w') as cfg_stream:
        cfg_stream.write(config_text)
    yield cfg_file
@pytest.mark.slow  # This test sometimes times out in CI
@pytest.mark.parametrize('renderer', RENDERERS)
def test_tesseract_config_invalid(renderer, resources, invalid_tess_config, outpdf):
    """An invalid config file yields invalid_config and an error on stderr."""
    cli_args = [
        '--pdf-renderer',
        renderer,
        '--tesseract-config',
        invalid_tess_config,
    ]
    p = run_ocrmypdf(resources / 'ccitt.pdf', outpdf, *cli_args)

    # Either of these messages (case-insensitively) counts as the error report
    has_param_error = "parameter not found" in p.stderr.lower()
    assert (
        has_param_error or "error occurred while parsing" in p.stderr.lower()
    ), "No error message"
    assert p.returncode == ExitCode.invalid_config
def test_user_words_ocr(resources, outdir):
    """Run with --user-words and --sidecar on ``crom.png``."""
    # Does not actually test if --user-words causes output to differ
    word_list = outdir / 'wordlist.txt'
    sidecar_after = outdir / 'sidecar.txt'
    with word_list.open('w') as words_stream:
        words_stream.write('cromulent\n')  # a perfectly cromulent word

    cli_args = [
        '--image-dpi',
        150,
        '--sidecar',
        sidecar_after,
        '--user-words',
        word_list,
    ]
    check_ocrmypdf(resources / 'crom.png', outdir / 'out.pdf', *cli_args)
def test_form_xobject(resources, outpdf):
    """Process ``formxobject.pdf`` with --force-ocr."""
    cli_args = ('--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py')
    check_ocrmypdf(resources / 'formxobject.pdf', outpdf, *cli_args)
@pytest.mark.parametrize('renderer', RENDERERS)
def test_pagesize_consistency(renderer, resources, outpdf):
    """First-page dimensions are preserved (rel_tol 1e-4) after processing."""
    infile = resources / '3small.pdf'
    before_dims = first_page_dimensions(infile)

    cli_args = [
        '--pdf-renderer',
        renderer,
        # The unpaper-backed options are only passed when unpaper is available
        '--clean' if have_unpaper() else None,
        '--deskew',
        # '--remove-background',
        '--clean-final' if have_unpaper() else None,
        '-k',
        '--pages',
        '1',
    ]
    check_ocrmypdf(infile, outpdf, *cli_args)

    after_dims = first_page_dimensions(outpdf)
    for axis in (0, 1):
        assert isclose(before_dims[axis], after_dims[axis], rel_tol=1e-4)
def test_skip_big_with_no_images(resources, outpdf):
    """Combine ``--skip-big 5`` with ``--force-ocr`` on blank.pdf; must succeed."""
    options = [
        '--skip-big',
        '5',
        '--force-ocr',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'blank.pdf', outpdf, *options)
def test_no_contents(resources, outpdf):
    """Force OCR on no_contents.pdf using the no-op Tesseract plugin."""
    options = ['--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / 'no_contents.pdf', outpdf, *options)
@pytest.mark.parametrize(
    'image', ['baiona.png', 'baiona_gray.png', 'baiona_alpha.png', 'baiona_color.jpg']
)
def test_compression_preserved(ocrmypdf_exec, resources, image, outpdf):
    """Feed an image on stdin and check encoding and colorspace are kept.

    Images whose mode is RGBA or LA are expected to be rejected with an
    error mentioning 'alpha'. For the others, a PNG input must not become
    JPEG, a JPG input must stay JPEG, and the colorspace must match the
    input image mode.
    """
    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed from the image. Reading it inside a context
    # manager guarantees the file handle is released even when the alpha
    # branch returns early or an assertion below fails.
    with Image.open(input_file) as im:
        image_mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--optimize',
            '0',
            '--image-dpi',
            '150',
            '--output-type',
            'pdf',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
            '-',
            output_file,
        ]
        p = run(
            p_args,
            capture_output=True,
            stdin=input_stream,
            text=True,
            check=False,
        )

    if image_mode in ('RGBA', 'LA'):
        # If alpha image is input, expect an error
        assert p.returncode != ExitCode.ok and 'alpha' in p.stderr
        return

    assert p.returncode == ExitCode.ok, p.stderr

    pdfinfo = PdfInfo(output_file)
    pdfimage = pdfinfo[0].images[0]

    if input_file.endswith('.png'):
        assert pdfimage.enc != Encoding.jpeg, "Lossless compression changed to lossy!"
    elif input_file.endswith('.jpg'):
        assert pdfimage.enc == Encoding.jpeg, "Lossy compression changed to lossless!"

    if image_mode.startswith('RGB') or image_mode.startswith('BGR'):
        assert pdfimage.color == Colorspace.rgb, "Colorspace changed"
    elif image_mode.startswith('L'):
        assert pdfimage.color == Colorspace.gray, "Colorspace changed"
@pytest.mark.parametrize(
    'image,compression',
    [
        ('baiona.png', 'jpeg'),
        ('baiona_gray.png', 'lossless'),
        ('baiona_color.jpg', 'lossless'),
    ],
)
def test_compression_changed(ocrmypdf_exec, resources, image, compression, outpdf):
    """Feed an image on stdin with ``--pdfa-image-compression`` and check the result.

    With 'jpeg' the output image must be JPEG. Otherwise a JPG input is
    still expected to be JPEG, and any other input must be neither JPEG
    nor JPEG 2000. The colorspace must match the input image mode.
    """
    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed from the image. Reading it inside a context
    # manager guarantees the file handle is released even if an assertion
    # below fails before the end of the test.
    with Image.open(input_file) as im:
        image_mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi',
            '150',
            '--output-type',
            'pdfa',
            '--optimize',
            '0',
            '--pdfa-image-compression',
            compression,
            '--plugin',
            'tests/plugins/tesseract_noop.py',
            '-',
            output_file,
        ]
        p = run(
            p_args,
            capture_output=True,
            stdin=input_stream,
            text=True,
            check=False,
        )

    assert p.returncode == ExitCode.ok, p.stderr

    pdfinfo = PdfInfo(output_file)
    pdfimage = pdfinfo[0].images[0]

    if compression == "jpeg":
        assert pdfimage.enc == Encoding.jpeg
    elif image.endswith('jpg'):
        # Ghostscript JPEG passthrough - no issue
        assert pdfimage.enc == Encoding.jpeg
    else:
        assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000)

    if image_mode.startswith('RGB') or image_mode.startswith('BGR'):
        assert pdfimage.color == Colorspace.rgb, "Colorspace changed"
    elif image_mode.startswith('L'):
        assert pdfimage.color == Colorspace.gray, "Colorspace changed"
def test_sidecar_pagecount(resources, outpdf):
    """The sidecar text must contain one form feed per page boundary."""
    source_pdf = resources / '3small.pdf'
    sidecar_path = outpdf.with_suffix('.txt')
    check_ocrmypdf(
        source_pdf,
        outpdf,
        '--skip-text',
        '--sidecar',
        sidecar_path,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )

    page_count = len(PdfInfo(resources / '3small.pdf'))
    with open(sidecar_path, encoding='utf-8') as stream:
        sidecar_text = stream.read()

    # There should be a formfeed between each pair of pages, so the count of
    # formfeeds is the page count less one
    formfeeds = sidecar_text.count('\f')
    assert (
        formfeeds == page_count - 1
    ), "Sidecar page count does not match PDF page count"
def test_sidecar_nonempty(resources, outpdf):
    """The sidecar produced for ccitt.pdf must contain the substring 'the'."""
    sidecar_path = outpdf.with_suffix('.txt')
    options = ['--sidecar', sidecar_path, '--plugin', 'tests/plugins/tesseract_cache.py']
    check_ocrmypdf(resources / 'ccitt.pdf', outpdf, *options)

    with open(sidecar_path, encoding='utf-8') as stream:
        sidecar_text = stream.read()
    assert 'the' in sidecar_text
@pytest.mark.parametrize('pdfa_level', ['1', '2', '3'])
def test_pdfa_n(pdfa_level, resources, outpdf):
    """``--output-type pdfa-N`` must yield a file claiming PDF/A-Nb conformance."""
    requested_type = 'pdfa-' + pdfa_level
    check_ocrmypdf(
        resources / 'ccitt.pdf',
        outpdf,
        '--output-type',
        requested_type,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )

    claimed = file_claims_pdfa(outpdf)
    assert claimed['conformance'] == f'PDF/A-{pdfa_level}b'
def test_decompression_bomb_error(resources, outpdf, caplog):
    """Processing hugemono.pdf with default limits must log a decompression bomb hint."""
    run_ocrmypdf_api(resources / 'hugemono.pdf', outpdf)
    logged = caplog.text
    assert 'decompression bomb' in logged
    assert 'max-image-mpixels' in caplog.text
@pytest.mark.slow
def test_decompression_bomb_succeeds(resources, outpdf):
    """With ``--max-image-mpixels 2000`` hugemono.pdf must process with exit code 0."""
    limit_option = ('--max-image-mpixels', '2000')
    result = run_ocrmypdf_api(resources / 'hugemono.pdf', outpdf, *limit_option)
    assert result == 0
def test_text_curves(resources, outpdf):
    """Without ``--force-ocr`` no image may be added to page 0 of vector.pdf."""
    noop_plugin = ('--plugin', 'tests/plugins/tesseract_noop.py')
    with patch('ocrmypdf._pipeline.VECTOR_PAGE_DPI', 100):
        check_ocrmypdf(resources / 'vector.pdf', outpdf, *noop_plugin)

        first_page = PdfInfo(outpdf).pages[0]
        assert len(first_page.images) == 0, "added images to the vector PDF"
def test_text_curves_force(resources, outpdf):
    """With ``--force-ocr`` page 0 of vector.pdf must end up containing an image."""
    options = ('--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py')
    with patch('ocrmypdf._pipeline.VECTOR_PAGE_DPI', 100):
        check_ocrmypdf(resources / 'vector.pdf', outpdf, *options)

        first_page = PdfInfo(outpdf).pages[0]
        assert len(first_page.images) != 0, "force did not rasterize"
def test_output_is_dir(resources, outdir, caplog):
    """Passing a directory as the output must fail with a file access error."""
    options = ('--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py')
    result = run_ocrmypdf_api(resources / 'trivial.pdf', outdir, *options)
    assert result == ExitCode.file_access_error
    assert 'is not a writable file' in caplog.text
@pytest.mark.skipif(os.name == 'nt', reason="symlink needs admin permissions")
def test_output_is_symlink(resources, outdir):
    """Writing through a symlink must create a non-empty file at the link target."""
    target = outdir / 'out.pdf'
    link = Path(outdir / 'this_is_a_symlink')
    link.symlink_to(target)

    options = ('--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py')
    result = run_ocrmypdf_api(resources / 'trivial.pdf', link, *options)

    assert result == ExitCode.ok
    assert (outdir / 'out.pdf').stat().st_size > 0, 'target file not created'
def test_livecycle(resources, no_outpdf, caplog):
    """Processing livecycle.pdf must be rejected with the input_file exit code."""
    result = run_ocrmypdf_api(resources / 'livecycle.pdf', no_outpdf)
    # The captured log is attached as the failure message for diagnosis.
    assert result == ExitCode.input_file, caplog.text
def test_version_check():
    """``get_version`` must raise MissingDependencyError for each of three bad probes."""
    failing_calls = [
        (('NOT_FOUND_UNLIKELY_ON_PATH',), {}),
        (('sh',), {'version_arg': '-c'}),
        (('echo',), {}),
    ]
    for positional, keyword in failing_calls:
        with pytest.raises(MissingDependencyError):
            get_version(*positional, **keyword)
@pytest.mark.parametrize(
    'threshold, optimize, output_type, expected',
    [
        [1.0, 0, 'pdfa', False],
        [1.0, 0, 'pdf', False],
        [0.0, 0, 'pdfa', True],
        [0.0, 0, 'pdf', True],
        [1.0, 1, 'pdfa', False],
        [1.0, 1, 'pdf', False],
        [0.0, 1, 'pdfa', True],
        [0.0, 1, 'pdf', True],
    ],
)
def test_fast_web_view(resources, outpdf, threshold, optimize, output_type, expected):
    """Check whether the output is linearized for each ``--fast-web-view`` threshold."""
    options = [
        '--fast-web-view',
        threshold,
        '--optimize',
        optimize,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *options)

    with pikepdf.open(outpdf) as result_pdf:
        assert result_pdf.is_linearized == expected
def test_image_dpi_not_image(caplog, resources, outpdf):
    """Passing ``--image-dpi`` with a PDF input must log that the option is ignored."""
    options = ['--image-dpi', '100', '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *options)
    assert '--image-dpi is being ignored' in caplog.text
def test_outputtype_none_bad_setup(resources, outpdf):
    """``--output-type=none`` with a real output file must fail with bad_args."""
    options = ['--output-type=none', '--plugin', 'tests/plugins/tesseract_noop.py']
    proc = run_ocrmypdf(resources / 'trivial.pdf', outpdf, *options)
    assert proc.returncode == ExitCode.bad_args
    assert 'Set the output file to' in proc.stderr
def test_outputtype_none(resources, outtxt):
    """``--output-type=none`` with output '-' must succeed and write the sidecar."""
    options = [
        '--output-type=none',
        '--sidecar',
        outtxt,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    result = run_ocrmypdf_api(resources / 'trivial.pdf', '-', *options)
    assert result == ExitCode.ok
    assert outtxt.exists()
@pytest.fixture
def graph_bad_icc(resources, outdir):
    """Yield a copy of graph.pdf whose '/Im0' image uses an invalid ICC profile."""
    damaged_pdf = outdir / 'graph-bad-icc.pdf'
    with pikepdf.open(resources / 'graph.pdf') as pdf:
        # The stream content is deliberately not a real ICC profile.
        bogus_profile = pdf.make_stream(
            b'invalid icc profile', N=3, Alternate=pikepdf.Name.DeviceRGB
        )
        bad_colorspace = pikepdf.Array([pikepdf.Name.ICCBased, bogus_profile])
        pdf.pages[0].Resources.XObject['/Im0'].ColorSpace = bad_colorspace
        pdf.save(damaged_pdf)
    yield damaged_pdf
def test_corrupt_icc(graph_bad_icc, outpdf, caplog):
    """A corrupt ICC profile must be logged but must not prevent success."""
    exit_status = run_ocrmypdf_api(graph_bad_icc, outpdf)
    assert exit_status == ExitCode.ok

    logged_messages = (record.message for record in caplog.records)
    assert any(
        'corrupt or unreadable ICC profile' in message for message in logged_messages
    )
================================================
FILE: tests/test_metadata.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import datetime as dt
import warnings
from shutil import copyfile
import pikepdf
import pytest
from pikepdf.models.metadata import decode_pdf_date
from ocrmypdf._jobcontext import PdfContext
from ocrmypdf._metadata import metadata_fixup
from ocrmypdf._pipeline import convert_to_pdfa
from ocrmypdf.api import setup_plugin_infrastructure
from ocrmypdf.cli import get_options_and_plugins
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.pdfa import file_claims_pdfa, generate_pdfa_ps
from ocrmypdf.pdfinfo import PdfInfo
from .conftest import check_ocrmypdf, run_ocrmypdf, run_ocrmypdf_api
@pytest.mark.parametrize("output_type", ['pdfa', 'pdf'])
def test_preserve_docinfo(output_type, resources, outpdf):
    """/Title and /Author must be identical in the input and output docinfo."""
    source_pdf = resources / 'graph.pdf'
    output = check_ocrmypdf(
        source_pdf,
        outpdf,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    with pikepdf.open(resources / 'graph.pdf') as pdf_before:
        with pikepdf.open(output) as pdf_after:
            for key in ('/Title', '/Author'):
                assert pdf_before.docinfo[key] == pdf_after.docinfo[key]

    claimed = file_claims_pdfa(str(output))
    assert claimed['output'] == output_type
@pytest.mark.parametrize("output_type", ['pdfa', 'pdf'])
def test_override_metadata(output_type, resources, outpdf, caplog):
    """``--title`` and ``--author`` must replace docinfo values; creation date is kept."""
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    options = [
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    exitcode = run_ocrmypdf_api(input_file, outpdf, *options)
    assert exitcode == ExitCode.ok, caplog.text

    with pikepdf.open(input_file) as before, pikepdf.open(outpdf) as after:
        new_info = after.docinfo
        assert new_info.Title == german, after.docinfo
        assert new_info.Author == chinese, after.docinfo
        assert new_info.get('/Keywords', '') == ''

        created_before = decode_pdf_date(str(before.docinfo.CreationDate))
        created_after = decode_pdf_date(str(after.docinfo.CreationDate))
        assert created_before == created_after

    claimed = file_claims_pdfa(outpdf)
    assert claimed['output'] == output_type
@pytest.mark.parametrize('output_type', ['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'])
@pytest.mark.parametrize('field', ['title', 'author', 'subject', 'keywords'])
def test_unset_metadata(output_type, field, resources, outpdf, caplog):
    """Passing '' for one metadata field must remove only that field's value."""
    input_file = resources / 'meta.pdf'

    # magic strings contained in the input pdf metadata
    meta = {
        'title': b'NFY5f7Ft2DWMkxLhXwxvFf7eWR2KeK3vEDcd',
        'author': b'yXaryipxyRk9dVjWjSSaVaNCKeLRgEVzPRMp',
        'subject': b't49vimctvnuH7ZeAjAkv52ACvWFjcnm5MPJr',
        'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh',
    }

    options = [
        f'--{field}',
        '',
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    exitcode = run_ocrmypdf_api(input_file, outpdf, *options)
    assert exitcode == ExitCode.ok, caplog.text

    # We mainly want to ensure that when '' is passed, the corresponding
    # metadata is unset in the output pdf. Since metadata is not compressed,
    # the best way to guarantee the metadata of interest didn't carry
    # forward is to just check to ensure the corresponding magic string
    # isn't contained anywhere in the output pdf. We'll also check to ensure
    # it's in the input pdf and that any values not unset are still in the
    # output pdf.
    with open(input_file, 'rb') as before, open(outpdf, 'rb') as after:
        before_data = before.read()
        after_data = after.read()

    for name, magic in meta.items():
        assert magic in before_data
        carried_over = magic in after_data
        if name == field:
            assert not carried_over
        else:
            assert carried_over
def test_high_unicode(resources, no_outpdf):
    """A ``--subject`` containing U+1030C must be rejected with bad_args for PDF/A."""
    # Ghostscript doesn't support high Unicode, so neither do we, to be
    # safe
    high_unicode = 'U+1030C is: 𐌌'
    options = [
        '--subject',
        high_unicode,
        '--output-type',
        'pdfa',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    proc = run_ocrmypdf(resources / 'c02-22.pdf', no_outpdf, *options)
    assert proc.returncode == ExitCode.bad_args, proc.stderr
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_bookmarks_preserved(output_type, ocr_option, resources, outpdf):
    """The table of contents read via fitz must be equal before and after OCR."""
    # Skips the test when the optional 'fitz' module cannot be imported.
    fitz = pytest.importorskip('fitz')
    input_file = resources / 'toc.pdf'
    toc_before = fitz.Document(str(input_file)).get_toc()

    options = [
        ocr_option,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(input_file, outpdf, *options)

    toc_after = fitz.Document(str(outpdf)).get_toc()
    print(toc_before)
    print(toc_after)
    assert toc_before == toc_after
def seconds_between_dates(date1, date2):
    """Return the signed number of seconds elapsed from *date1* to *date2*."""
    elapsed = date2 - date1
    return elapsed.total_seconds()
@pytest.mark.parametrize('infile', ['trivial.pdf', 'jbig2.pdf'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_creation_date_preserved(output_type, resources, infile, outpdf):
    """The creation date must be kept (or created) and ModDate must be recent."""
    input_file = resources / infile
    options = [
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(input_file, outpdf, *options)

    with pikepdf.open(input_file) as pdf_before, pikepdf.open(outpdf) as pdf_after:
        info_before = pdf_before.trailer.get('/Info', {})
        info_after = pdf_after.trailer.get('/Info', {})

        if info_before:
            # We expect that the creation date stayed the same
            created_before = decode_pdf_date(str(info_before['/CreationDate']))
            created_after = decode_pdf_date(str(info_after['/CreationDate']))
            assert seconds_between_dates(created_before, created_after) < 1000
        else:
            # No /Info in the input: the output must still carry a creation date.
            assert info_after.get('/CreationDate', '') != ''

        # We expect that the modified date is quite recent
        modified_after = decode_pdf_date(str(info_after['/ModDate']))
        assert seconds_between_dates(modified_after, dt.datetime.now(dt.UTC)) < 1000
@pytest.fixture
def libxmp_file_to_dict():
    """Return ``libxmp.utils.file_to_dict``, skipping the test if it cannot be imported."""
    try:
        with warnings.catch_warnings():
            # libxmp imports distutils.Version, which is deprecated
            warnings.filterwarnings(
                "ignore",
                message=r".*distutils Version classes are deprecated.*",
                category=DeprecationWarning,
            )
            from libxmp.utils import (  # pylint: disable=import-outside-toplevel
                file_to_dict,
            )
    except Exception:  # pylint: disable=broad-except
        # pytest.skip raises, so the return below is only reached on success.
        pytest.skip("libxmp not available or libexempi3 not installed")
    return file_to_dict
@pytest.mark.parametrize(
    'test_file,output_type',
    [
        ('graph.pdf', 'pdf'),  # PDF with full metadata
        ('graph.pdf', 'pdfa'),  # PDF/A with full metadata
        ('overlay.pdf', 'pdfa'),  # /Title()
        ('3small.pdf', 'pdfa'),
    ],
)
def test_xml_metadata_preserved(
    libxmp_file_to_dict, test_file, output_type, resources, outpdf
):
    """Selected XMP properties must be unchanged after processing.

    Properties present in the input must be present and equal in the output.
    A property appearing only in the output is tolerated only if it is listed
    in ``acquired_properties``.
    """
    input_file = resources / test_file
    raw_before = libxmp_file_to_dict(str(input_file))

    check_ocrmypdf(
        input_file,
        outpdf,
        '--output-type',
        output_type,
        '--skip-text',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    raw_after = libxmp_file_to_dict(str(outpdf))

    equal_properties = [
        'dc:contributor',
        'dc:coverage',
        'dc:creator',
        'dc:description',
        'dc:format',
        'dc:identifier',
        'dc:language',
        'dc:publisher',
        'dc:relation',
        'dc:rights',
        'dc:source',
        'dc:subject',
        'dc:title',
        'dc:type',
        'pdf:keywords',
    ]
    acquired_properties = ['dc:format']

    def flatten(xmpdict):
        # Cleanup messy data structure.
        # Top level is key-value mapping of namespaces to keys under namespace,
        # so we put everything in the same namespace. Each entry is a
        # (key, value, {infodict}) tuple; we don't care about infodict, so
        # just flatten to keys and values. Later duplicates overwrite earlier.
        return {
            key: value
            for entries in xmpdict.values()
            for key, value, *_ in entries
        }

    before = flatten(raw_before)
    after = flatten(raw_after)

    for prop in equal_properties:
        if prop in before:
            assert prop in after, f'{prop} dropped from xmp'
            assert before[prop] == after[prop]

        # libxmp presents multivalued entries (e.g. dc:title) as:
        # 'dc:title': '' <- there's a title
        # 'dc:title[1]: 'The Title' <- the actual title
        # 'dc:title[1]/?xml:lang': 'x-default' <- language info
        propidx = f'{prop}[1]'
        if propidx in before:
            expected_value = before[propidx]
            assert (
                after.get(propidx) == expected_value
                or after.get(prop) == expected_value
            )

        if prop in after and prop not in before:
            assert prop in acquired_properties, (
                f"acquired unexpected property {prop} with value "
                f"{after.get(propidx) or after.get(prop)}"
            )
def test_kodak_toc(resources, outpdf):
    """If the output outline has a /First entry, it must be a dictionary."""
    options = ['--output-type', 'pdf', '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / 'kcs.pdf', outpdf, *options)

    with pikepdf.open(outpdf) as result_pdf:
        outlines = result_pdf.Root.Outlines
        if pikepdf.Name.First in outlines:
            assert isinstance(result_pdf.Root.Outlines.First, pikepdf.Dictionary)
def test_metadata_fixup_warning(resources, outdir, caplog):
    """``metadata_fixup`` must warn only once the origin has the added prism2 metadata."""
    options, _pm = get_options_and_plugins(
        ['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
    )
    working_pdf = outdir / 'graph.pdf'
    modified_pdf = outdir / 'graph_mod.pdf'
    copyfile(resources / 'graph.pdf', working_pdf)

    # Use the new setup function instead of get_plugin_manager directly
    plugin_manager = setup_plugin_infrastructure([])

    # First pass: origin and working file are the same unmodified copy.
    context = PdfContext(options, outdir, working_pdf, None, plugin_manager)
    metadata_fixup(working_file=working_pdf, context=context, pdf_save_settings={})
    for record in caplog.records:
        assert record.levelname != 'WARNING', "Unexpected warning"

    # Now add some metadata that will not be copyable
    with pikepdf.open(working_pdf) as graph:
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
        graph.save(modified_pdf)

    # Second pass: the modified file is the origin, the plain copy is the working file.
    context = PdfContext(options, outdir, modified_pdf, None, plugin_manager)
    metadata_fixup(working_file=working_pdf, context=context, pdf_save_settings={})
    levels = [record.levelname for record in caplog.records]
    assert any(level == 'WARNING' for level in levels)
# Byte marker that test_prevent_gs_invalid_xml searches for in the generated
# PDF to locate the start of the XMP section.
XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
def test_prevent_gs_invalid_xml(resources, outdir):
generate_pdfa_ps(outdir / 'pdfa.ps')
# Inject a string with a trailing nul character into the DocumentInfo
# dictionary of this PDF, as often occurs in practice.
with pikepdf.open(resources / 'trivial.pdf') as pdf:
pdf.Root.DocumentInfo = pikepdf.Dictionary(
Title=b'String with trailing nul\x00'
)
pdf.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)
options, _ = get_options_and_plugins(
args=[
'-j',
'1',
'--output-type',
'pdfa-2',
'a.pdf',
'b.pdf',
]
)
pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
# Use the new setup function
plugin_manager = setup_plugin_infrastructure([])
context = PdfContext(
options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, plugin_manager
)
convert_to_pdfa(
str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
)
contents = (outdir / 'pdfa.pdf').read_bytes()
# Since the XML may be invalid, we scan instead of actually feeding it
# to a parser.
xmp_start = contents.find(XMP_MAGIC)
xmp_end = contents.rfind(b' bool:
"""Check if CJK font is available (from system)."""
return 'NotoSansCJK-Regular' in manager.fonts
def has_arabic_font(manager: MultiFontManager) -> bool:
    """Report whether 'NotoSansArabic-Regular' is among the manager's fonts."""
    loaded_fonts = manager.fonts
    return 'NotoSansArabic-Regular' in loaded_fonts
def has_devanagari_font(manager: MultiFontManager) -> bool:
    """Report whether 'NotoSansDevanagari-Regular' is among the manager's fonts."""
    loaded_fonts = manager.fonts
    return 'NotoSansDevanagari-Regular' in loaded_fonts
# Marker for tests that require CJK fonts.
# The skip condition is given as a string expression that constructs a
# default MultiFontManager and passes it to has_cjk_font.
requires_cjk = pytest.mark.skipif(
    "not has_cjk_font(MultiFontManager())",
    reason="CJK font not available (not installed on system)"
)
# --- MultiFontManager Initialization Tests ---
def test_init_loads_builtin_fonts(multi_font_manager):
    """The manager must expose at least the NotoSans-Regular and Occulta fonts."""
    loaded = multi_font_manager.fonts
    # Only NotoSans-Regular and Occulta are bundled
    for bundled_name in ('NotoSans-Regular', 'Occulta'):
        assert bundled_name in loaded
    # At least 2 builtin fonts should be loaded
    assert len(multi_font_manager.fonts) >= 2
    # Arabic, Devanagari, CJK are optional (system fonts)
def test_missing_font_directory():
    """Constructing a manager on a nonexistent directory raises FileNotFoundError."""
    bogus_dir = Path("/nonexistent/path")
    with pytest.raises(FileNotFoundError):
        MultiFontManager(bogus_dir)
# --- Arabic Script Language Tests ---
# These tests require Arabic fonts to be installed on the system
def test_select_font_for_arabic_language(multi_font_manager):
    """The 'ara' language hint selects NotoSansArabic-Regular when it is loaded."""
    arabic_loaded = has_arabic_font(multi_font_manager)
    if not arabic_loaded:
        pytest.skip("Arabic font not available")
    chosen = multi_font_manager.select_font_for_word("مرحبا", "ara")
    expected = multi_font_manager.fonts['NotoSansArabic-Regular']
    assert chosen == expected
def test_select_font_for_persian_language(multi_font_manager):
    """The 'per' language hint selects NotoSansArabic-Regular when it is loaded."""
    arabic_loaded = has_arabic_font(multi_font_manager)
    if not arabic_loaded:
        pytest.skip("Arabic font not available")
    chosen = multi_font_manager.select_font_for_word("سلام", "per")
    expected = multi_font_manager.fonts['NotoSansArabic-Regular']
    assert chosen == expected
def test_select_font_for_urdu_language(multi_font_manager):
    """The 'urd' language hint selects NotoSansArabic-Regular when it is loaded."""
    arabic_loaded = has_arabic_font(multi_font_manager)
    if not arabic_loaded:
        pytest.skip("Arabic font not available")
    chosen = multi_font_manager.select_font_for_word("ہیلو", "urd")
    expected = multi_font_manager.fonts['NotoSansArabic-Regular']
    assert chosen == expected
def test_farsi_language_code(multi_font_manager):
    """The 'fas' language hint selects NotoSansArabic-Regular when it is loaded."""
    arabic_loaded = has_arabic_font(multi_font_manager)
    if not arabic_loaded:
        pytest.skip("Arabic font not available")
    chosen = multi_font_manager.select_font_for_word("سلام", "fas")
    expected = multi_font_manager.fonts['NotoSansArabic-Regular']
    assert chosen == expected
# --- Devanagari Script Language Tests ---
# These tests require Devanagari fonts to be installed on the system
def test_select_font_for_hindi_language(multi_font_manager):
    """The 'hin' language hint selects NotoSansDevanagari-Regular when it is loaded."""
    devanagari_loaded = has_devanagari_font(multi_font_manager)
    if not devanagari_loaded:
        pytest.skip("Devanagari font not available")
    chosen = multi_font_manager.select_font_for_word("नमस्ते", "hin")
    expected = multi_font_manager.fonts['NotoSansDevanagari-Regular']
    assert chosen == expected
def test_select_font_for_sanskrit_language(multi_font_manager):
    """The 'san' language hint selects NotoSansDevanagari-Regular when it is loaded."""
    devanagari_loaded = has_devanagari_font(multi_font_manager)
    if not devanagari_loaded:
        pytest.skip("Devanagari font not available")
    chosen = multi_font_manager.select_font_for_word("संस्कृतम्", "san")
    expected = multi_font_manager.fonts['NotoSansDevanagari-Regular']
    assert chosen == expected
def test_select_font_for_marathi_language(multi_font_manager):
    """The 'mar' language hint selects NotoSansDevanagari-Regular when it is loaded."""
    devanagari_loaded = has_devanagari_font(multi_font_manager)
    if not devanagari_loaded:
        pytest.skip("Devanagari font not available")
    chosen = multi_font_manager.select_font_for_word("मराठी", "mar")
    expected = multi_font_manager.fonts['NotoSansDevanagari-Regular']
    assert chosen == expected
def test_select_font_for_nepali_language(multi_font_manager):
    """The 'nep' language hint selects NotoSansDevanagari-Regular when it is loaded."""
    devanagari_loaded = has_devanagari_font(multi_font_manager)
    if not devanagari_loaded:
        pytest.skip("Devanagari font not available")
    chosen = multi_font_manager.select_font_for_word("नेपाली", "nep")
    expected = multi_font_manager.fonts['NotoSansDevanagari-Regular']
    assert chosen == expected
# --- CJK Language Tests ---
# These tests require CJK fonts to be installed on the system
def test_select_font_for_chinese_language(multi_font_manager):
    """The 'zho' language hint selects NotoSansCJK-Regular when it is loaded."""
    cjk_loaded = has_cjk_font(multi_font_manager)
    if not cjk_loaded:
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("你好", "zho")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected
def test_select_font_for_chinese_generic(multi_font_manager):
    """The 'chi' language hint selects NotoSansCJK-Regular when it is loaded."""
    cjk_loaded = has_cjk_font(multi_font_manager)
    if not cjk_loaded:
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("中文", "chi")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected
def test_select_font_for_chinese_simplified(multi_font_manager):
    """The 'chi_sim' language hint selects NotoSansCJK-Regular when it is loaded."""
    cjk_loaded = has_cjk_font(multi_font_manager)
    if not cjk_loaded:
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("简体字", "chi_sim")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected
def test_select_font_for_chinese_traditional(multi_font_manager):
    """The 'chi_tra' language hint selects NotoSansCJK-Regular when it is loaded."""
    cjk_loaded = has_cjk_font(multi_font_manager)
    if not cjk_loaded:
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("漢字", "chi_tra")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected
def test_select_font_for_japanese_language(multi_font_manager):
    """The 'jpn' language hint selects NotoSansCJK-Regular when it is loaded."""
    cjk_loaded = has_cjk_font(multi_font_manager)
    if not cjk_loaded:
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("こんにちは", "jpn")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected
def test_select_font_for_korean_language(multi_font_manager):
    """The 'kor' language hint selects NotoSansCJK-Regular when it is loaded."""
    cjk_loaded = has_cjk_font(multi_font_manager)
    if not cjk_loaded:
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("안녕하세요", "kor")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected
# --- Latin/English Tests ---
def test_select_font_for_english_text(multi_font_manager):
    """English text with the 'eng' hint is rendered with NotoSans-Regular."""
    chosen = multi_font_manager.select_font_for_word("Hello World", "eng")
    expected = multi_font_manager.fonts['NotoSans-Regular']
    assert chosen == expected
def test_select_font_without_language_hint(multi_font_manager):
    """Latin text with a ``None`` language hint still resolves to NotoSans-Regular."""
    chosen = multi_font_manager.select_font_for_word("Hello", None)
    expected = multi_font_manager.fonts['NotoSans-Regular']
    assert chosen == expected
# --- Fallback Behavior Tests ---
def test_select_font_arabic_text_without_language_hint(multi_font_manager):
    """Arabic text with no language hint resolves to NotoSansArabic-Regular."""
    arabic_loaded = has_arabic_font(multi_font_manager)
    if not arabic_loaded:
        pytest.skip("Arabic font not available")
    chosen = multi_font_manager.select_font_for_word("مرحبا", None)
    # Should get NotoSansArabic-Regular via fallback chain glyph checking
    expected = multi_font_manager.fonts['NotoSansArabic-Regular']
    assert chosen == expected
def test_devanagari_text_without_language_hint(multi_font_manager):
    """Devanagari text with no language hint must resolve to some font (not None)."""
    # NotoSans-Regular includes Devanagari glyphs, so it's selected first in fallback
    chosen = multi_font_manager.select_font_for_word("नमस्ते", None)
    # Could be NotoSans-Regular or NotoSansDevanagari-Regular depending on availability
    assert chosen is not None
def test_cjk_text_without_language_hint(multi_font_manager):
    """CJK text with no language hint resolves to NotoSansCJK-Regular."""
    cjk_loaded = has_cjk_font(multi_font_manager)
    if not cjk_loaded:
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("你好", None)
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected
def test_fallback_to_occulta_font(multi_font_manager):
    """Selecting a font with the unmapped hint 'xyz' still yields a loaded font.

    NOTE(review): despite the test name, the word used is plain "test" and
    the assertion only checks that the result is one of the loaded fonts,
    not that it is Occulta specifically.
    """
    chosen = multi_font_manager.select_font_for_word("test", "xyz")
    # Should return some valid font
    loaded_fonts = multi_font_manager.fonts.values()
    assert chosen in loaded_fonts
def test_fallback_fonts_constant(multi_font_manager):
    """FALLBACK_FONTS must list the four core Noto font names."""
    # Check that core fonts are in fallback list
    core_names = (
        'NotoSans-Regular',
        'NotoSansArabic-Regular',
        'NotoSansDevanagari-Regular',
        'NotoSansCJK-Regular',
    )
    for name in core_names:
        assert name in MultiFontManager.FALLBACK_FONTS
    # Only NotoSans-Regular is bundled; other scripts are system fonts
    assert 'NotoSans-Regular' in multi_font_manager.fonts
# --- Glyph Coverage Tests ---
def test_has_all_glyphs_for_english(multi_font_manager):
    """NotoSans-Regular must cover both plain and accented Latin sample text."""
    for sample in ("Hello World", "café"):
        assert multi_font_manager.has_all_glyphs('NotoSans-Regular', sample)
def test_has_all_glyphs_for_arabic(multi_font_manager):
    """NotoSansArabic-Regular, when loaded, must cover the Arabic sample word."""
    arabic_loaded = has_arabic_font(multi_font_manager)
    if not arabic_loaded:
        pytest.skip("Arabic font not available")
    covered = multi_font_manager.has_all_glyphs('NotoSansArabic-Regular', "مرحبا")
    assert covered
def test_has_all_glyphs_for_devanagari(multi_font_manager):
    """NotoSansDevanagari-Regular, when loaded, must cover the Devanagari sample word."""
    devanagari_loaded = has_devanagari_font(multi_font_manager)
    if not devanagari_loaded:
        pytest.skip("Devanagari font not available")
    covered = multi_font_manager.has_all_glyphs('NotoSansDevanagari-Regular', "नमस्ते")
    assert covered
def test_has_all_glyphs_for_cjk(multi_font_manager):
    """NotoSansCJK-Regular, when loaded, must cover the CJK sample word."""
    cjk_loaded = has_cjk_font(multi_font_manager)
    if not cjk_loaded:
        pytest.skip("CJK font not available")
    covered = multi_font_manager.has_all_glyphs('NotoSansCJK-Regular', "你好")
    assert covered
def test_empty_text_has_all_glyphs(multi_font_manager):
    """An empty string counts as fully covered by NotoSans-Regular."""
    covered = multi_font_manager.has_all_glyphs('NotoSans-Regular', "")
    assert covered
def test_has_all_glyphs_missing_font(multi_font_manager):
    """Glyph coverage for an unknown font name must be falsy."""
    covered = multi_font_manager.has_all_glyphs('NonExistentFont', "test")
    assert not covered
# --- Caching Tests ---
def test_font_selection_caching(multi_font_manager):
    """A selection must be recorded in ``_selection_cache`` and be repeatable."""
    first_pick = multi_font_manager.select_font_for_word("Hello", "eng")

    # The cache is keyed by the (word, language) pair.
    assert ("Hello", "eng") in multi_font_manager._selection_cache

    second_pick = multi_font_manager.select_font_for_word("Hello", "eng")
    assert first_pick == second_pick
# --- Language Font Map Tests ---
def test_language_font_map_coverage():
    """Every value in LANGUAGE_FONT_MAP must be a string starting with 'NotoSans'."""
    # Only NotoSans-Regular is bundled now
    # This test just verifies the structure is valid
    mapped_fonts = MultiFontManager.LANGUAGE_FONT_MAP.values()
    for mapped_name in mapped_fonts:
        # All font names should be valid strings
        assert isinstance(mapped_name, str)
        assert mapped_name.startswith('NotoSans')
# --- get_all_fonts Tests ---
def test_get_all_fonts(multi_font_manager):
    """``get_all_fonts`` must return a dict holding at least the two bundled fonts."""
    loaded = multi_font_manager.get_all_fonts()
    assert isinstance(loaded, dict)
    # At least 2 builtin fonts should be loaded (NotoSans-Regular and Occulta)
    assert len(loaded) >= 2
    for bundled_name in ('NotoSans-Regular', 'Occulta'):
        assert bundled_name in loaded
    # Arabic, Devanagari, CJK are optional (system fonts)
# --- FontProvider Tests ---
class MockFontProvider:
    """In-memory font provider used to simulate a limited set of fonts."""

    def __init__(
        self, available_fonts: dict[str, FontManager], fallback: FontManager
    ):
        """Store the name-to-font mapping and the fallback font."""
        self._fallback = fallback
        self._fonts = available_fonts

    def get_font(self, font_name: str) -> FontManager | None:
        """Return the font registered under *font_name*, or None if absent."""
        return self._fonts.get(font_name, None)

    def get_available_fonts(self) -> list[str]:
        """Return the registered font names as a list."""
        return list(self._fonts)

    def get_fallback_font(self) -> FontManager:
        """Return the fallback font given at construction."""
        return self._fallback
def test_custom_font_provider(font_dir):
    """A manager built on an injected provider exposes exactly that provider's fonts."""
    noto = FontManager(font_dir / 'NotoSans-Regular.ttf')
    occulta = FontManager(font_dir / 'Occulta.ttf')
    supplied = {'NotoSans-Regular': noto, 'Occulta': occulta}

    provider = MockFontProvider(supplied, supplied['Occulta'])
    manager = MultiFontManager(font_provider=provider)

    # Should only have the fonts we provided
    assert len(manager.fonts) == 2
    for name in ('NotoSans-Regular', 'Occulta'):
        assert name in manager.fonts
def test_missing_font_uses_fallback(font_dir):
    """Check that a word whose script font is absent gets the fallback font."""
    noto = FontManager(font_dir / 'NotoSans-Regular.ttf')
    occulta = FontManager(font_dir / 'Occulta.ttf')
    supplied = {'NotoSans-Regular': noto, 'Occulta': occulta}
    manager = MultiFontManager(font_provider=MockFontProvider(supplied, occulta))
    # No Arabic font was supplied, so the Arabic word must resolve to Occulta
    selected = manager.select_font_for_word("مرحبا", "ara")
    assert selected == occulta
def test_builtin_font_provider_loads_expected_fonts(font_dir):
    """Check that BuiltinFontProvider exposes exactly the two bundled fonts."""
    names = BuiltinFontProvider(font_dir).get_available_fonts()
    for bundled in ('NotoSans-Regular', 'Occulta'):
        assert bundled in names
    # Only Latin (NotoSans) and the glyphless fallback (Occulta) are bundled.
    # Other scripts (Arabic, Devanagari, CJK, etc.) are discovered from system
    # fonts by SystemFontProvider to reduce package size.
    assert len(names) == 2
def test_builtin_font_provider_get_font(font_dir):
    """Check get_font() yields a FontManager for a known name, else None."""
    provider = BuiltinFontProvider(font_dir)

    known = provider.get_font('NotoSans-Regular')
    assert known is not None
    assert isinstance(known, FontManager)

    unknown = provider.get_font('NonExistent')
    assert unknown is None
def test_builtin_font_provider_get_fallback(font_dir):
    """Check that the fallback font equals the provider's 'Occulta' font."""
    provider = BuiltinFontProvider(font_dir)
    fallback_font = provider.get_fallback_font()
    assert fallback_font is not None
    occulta = provider.get_font('Occulta')
    assert fallback_font == occulta
def test_builtin_font_provider_missing_font_logs_warning(tmp_path, font_dir, caplog):
    """Check that an absent NotoSans-Regular font is reported in the log."""
    # Populate the temporary font directory with Occulta.ttf only
    occulta_bytes = (font_dir / 'Occulta.ttf').read_bytes()
    (tmp_path / 'Occulta.ttf').write_bytes(occulta_bytes)

    with caplog.at_level(logging.WARNING):
        provider = BuiltinFontProvider(tmp_path)

    # The missing font must be mentioned in the captured log output
    captured = caplog.text
    assert 'NotoSans-Regular' in captured
    assert 'not found' in captured
    # Occulta was present, so a fallback font is still available
    assert provider.get_fallback_font() is not None
def test_builtin_font_provider_missing_occulta_raises(tmp_path):
    """Check that a font directory without Occulta.ttf raises FileNotFoundError."""
    expected_error = pytest.raises(FileNotFoundError, match="Required fallback font")
    with expected_error:
        BuiltinFontProvider(tmp_path)
================================================
FILE: tests/test_multilingual_direct.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Direct tests for multilingual text rendering with fpdf2 renderer.
This tests the fpdf2 renderer with various language groups:
- Latin (English, French, German with diacritics)
- Arabic (Arabic, Persian - RTL scripts)
- CJK (Chinese Simplified/Traditional, Japanese, Korean)
- Devanagari (Hindi, Sanskrit)
"""
from __future__ import annotations
import shutil
import subprocess
from pathlib import Path
import pytest
from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import DebugRenderOptions, Fpdf2PdfRenderer
from ocrmypdf.hocrtransform.hocr_parser import HocrParser
RESOURCES = Path(__file__).parent / "resources"
@pytest.fixture
def pdftotext():
    """Return a function that extracts text from a PDF using pdftotext.

    Skips the requesting test if the pdftotext executable is not on PATH.
    """
    pdftotext_path = shutil.which('pdftotext')
    if pdftotext_path is None:
        pytest.skip("pdftotext not available")

    def extract_text(pdf_path: Path) -> str:
        """Run pdftotext on ``pdf_path`` and return its stdout decoded as UTF-8."""
        # Invoke the exact executable located above rather than re-resolving
        # the bare name, so the availability check and the call always agree.
        return subprocess.check_output(
            [pdftotext_path, '-enc', 'UTF-8', str(pdf_path), '-'],
            text=True,
            encoding='utf-8',
        )

    return extract_text
@pytest.fixture
def font_dir():
    """Provide the ``src/ocrmypdf/data`` directory, two levels above this file."""
    project_root = Path(__file__).parent.parent
    return project_root / "src" / "ocrmypdf" / "data"
@pytest.fixture
def multi_font_manager(font_dir):
    """Provide a MultiFontManager built from the ``font_dir`` fixture."""
    manager = MultiFontManager(font_dir)
    return manager
@pytest.fixture
def multi_font_manager_arabic(font_dir):
    """Provide a MultiFontManager, skipping the test when it lacks an Arabic font."""
    manager = MultiFontManager(font_dir)
    if manager.has_font("NotoSansArabic-Regular"):
        return manager
    pytest.skip("NotoSansArabic font not available")
# =============================================================================
# Latin Script Tests
# =============================================================================
class TestLatinScript:
    """Rendering and font-selection tests for Latin-script hOCR input."""

    @pytest.fixture
    def latin_hocr(self):
        """Provide the path of the Latin hOCR resource file."""
        return RESOURCES / "latin.hocr"

    def test_render_latin_basic(
        self, latin_hocr, multi_font_manager, tmp_path, pdftotext
    ):
        """Render Latin text with diacritics and check it can be extracted."""
        page = HocrParser(latin_hocr).parse()
        assert page is not None

        paragraphs = list(page.paragraphs)
        assert len(paragraphs) == 3  # English, French, German

        # Paragraph languages, in document order
        assert paragraphs[0].language == 'eng'
        assert paragraphs[1].language == 'fra'
        assert paragraphs[2].language == 'deu'

        # Produce the PDF with visible text
        pdf_path = tmp_path / "latin_output.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        ).render(pdf_path)
        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

        # At least one expected word per language must survive extraction
        extracted = pdftotext(pdf_path)
        assert any(word in extracted for word in ('quick', 'brown', 'fox'))
        assert any(word in extracted for word in ('Café', 'résumé', 'naïve'))
        assert any(word in extracted for word in ('Größe', 'Zürich', 'Ärger'))

    def test_latin_font_selection(self, latin_hocr, multi_font_manager):
        """Check a font is selected for each word and NotoSans-Regular covers it."""
        page = HocrParser(latin_hocr).parse()
        for line in page.lines:
            for word in line.children:
                if not word.text:
                    continue
                chosen = multi_font_manager.select_font_for_word(
                    word.text, line.language
                )
                assert chosen is not None
                # Every Latin word must be fully covered by NotoSans-Regular
                assert multi_font_manager.has_all_glyphs(
                    'NotoSans-Regular', word.text
                )
# =============================================================================
# Arabic Script Tests
# =============================================================================
class TestArabicScript:
    """Rendering, font-selection and direction tests for Arabic-script hOCR input."""

    @pytest.fixture
    def arabic_hocr(self):
        """Provide the path of the Arabic hOCR resource file."""
        return RESOURCES / "arabic.hocr"

    def test_render_arabic_basic(
        self, arabic_hocr, multi_font_manager_arabic, tmp_path, pdftotext
    ):
        """Render Arabic text and check that expected words can be extracted."""
        page = HocrParser(arabic_hocr).parse()
        assert page is not None

        paragraphs = list(page.paragraphs)
        assert len(paragraphs) == 3  # Arabic paragraphs and Persian

        # Produce the PDF with visible text
        pdf_path = tmp_path / "arabic_output.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager_arabic,
            invisible_text=False,
        ).render(pdf_path)
        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

        extracted = pdftotext(pdf_path)
        # Arabic words: مرحبا بالعالم (Hello world)
        assert any(word in extracted for word in ('مرحبا', 'بالعالم'))
        # هذا نص عربي (This is Arabic text)
        assert any(word in extracted for word in ('عربي', 'نص'))

    def test_arabic_font_selection(self, arabic_hocr, multi_font_manager_arabic):
        """Check NotoSansArabic-Regular covers every word on 'ara'/'per' lines."""
        page = HocrParser(arabic_hocr).parse()
        for line in page.lines:
            for word in line.children:
                if not word.text or line.language not in ('ara', 'per'):
                    continue
                chosen = multi_font_manager_arabic.select_font_for_word(
                    word.text, line.language
                )
                assert chosen is not None
                # The Arabic font must have glyphs for the whole word
                covered = multi_font_manager_arabic.has_all_glyphs(
                    'NotoSansArabic-Regular', word.text
                )
                assert covered, f"NotoSansArabic cannot render '{word.text}'"

    def test_arabic_rtl_handling(self, arabic_hocr):
        """Check that 'ara'/'per' paragraphs are parsed with direction 'rtl'."""
        page = HocrParser(arabic_hocr).parse()
        rtl_paragraphs = (
            para for para in page.paragraphs if para.language in ('ara', 'per')
        )
        for para in rtl_paragraphs:
            assert para.direction == 'rtl', "Arabic paragraph should have RTL direction"
# =============================================================================
# CJK Script Tests
# =============================================================================
def _latin_font_works(multi_font_manager) -> bool:
    """Report whether the manager has the 'NotoSans-Regular' font."""
    latin_font = 'NotoSans-Regular'
    return multi_font_manager.has_font(latin_font)
def _arabic_font_works(multi_font_manager) -> bool:
    """Report whether the manager has the 'NotoSansArabic-Regular' font."""
    arabic_font = 'NotoSansArabic-Regular'
    return multi_font_manager.has_font(arabic_font)
def _devanagari_font_works(multi_font_manager) -> bool:
    """Report whether the manager has the 'NotoSansDevanagari-Regular' font."""
    devanagari_font = 'NotoSansDevanagari-Regular'
    return multi_font_manager.has_font(devanagari_font)
def _cjk_font_works(multi_font_manager) -> bool:
    """Report whether the manager has the 'NotoSansCJK-Regular' font.

    Only availability is checked via ``has_font``; no further validation of
    the font is performed here.
    """
    cjk_font = 'NotoSansCJK-Regular'
    return multi_font_manager.has_font(cjk_font)
class TestCJKScript:
    """Rendering and font-selection tests for Chinese, Japanese and Korean hOCR input."""

    @pytest.fixture
    def cjk_hocr(self):
        """Provide the path of the CJK hOCR resource file."""
        return RESOURCES / "cjk.hocr"

    def test_render_cjk_basic(self, cjk_hocr, multi_font_manager, tmp_path, pdftotext):
        """Render CJK text and check that expected words can be extracted."""
        if not _cjk_font_works(multi_font_manager):
            pytest.skip("CJK font not available or corrupted")

        page = HocrParser(cjk_hocr).parse()
        assert page is not None

        paragraphs = list(page.paragraphs)
        assert len(paragraphs) == 4  # Chinese Simplified, Traditional, Japanese, Korean

        # Each of the four languages must appear among the paragraphs
        found_languages = [para.language for para in paragraphs]
        for expected in ('chi_sim', 'chi_tra', 'jpn', 'kor'):
            assert expected in found_languages

        # Produce the PDF with visible text
        pdf_path = tmp_path / "cjk_output.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        ).render(pdf_path)
        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

        extracted = pdftotext(pdf_path)
        # Chinese: 你好 世界 (Hello world)
        assert any(word in extracted for word in ('你好', '世界'))
        # Japanese: こんにちは (Hello)
        assert any(word in extracted for word in ('こんにちは', '世界'))
        # Korean: 안녕하세요 (Hello)
        assert any(word in extracted for word in ('안녕하세요', '세계'))

    def test_cjk_font_selection(self, cjk_hocr, multi_font_manager):
        """Check NotoSansCJK-Regular covers every word on CJK-language lines."""
        if not _cjk_font_works(multi_font_manager):
            pytest.skip("CJK font not available or corrupted")

        page = HocrParser(cjk_hocr).parse()
        cjk_languages = {'chi_sim', 'chi_tra', 'jpn', 'kor', 'zho', 'chi'}
        for line in page.lines:
            for word in line.children:
                if not word.text or line.language not in cjk_languages:
                    continue
                chosen = multi_font_manager.select_font_for_word(
                    word.text, line.language
                )
                assert chosen is not None
                # The CJK font must have glyphs for the whole word
                covered = multi_font_manager.has_all_glyphs(
                    'NotoSansCJK-Regular', word.text
                )
                assert covered, f"NotoSansCJK cannot render '{word.text}'"
# =============================================================================
# Devanagari Script Tests
# =============================================================================
class TestDevanagariScript:
    """Rendering and font-selection tests for Devanagari-script hOCR input."""

    @pytest.fixture
    def devanagari_hocr(self):
        """Provide the path of the Devanagari hOCR resource file."""
        return RESOURCES / "devanagari.hocr"

    def test_render_devanagari_basic(
        self, devanagari_hocr, multi_font_manager, tmp_path, pdftotext
    ):
        """Render Devanagari text and check that expected words can be extracted."""
        page = HocrParser(devanagari_hocr).parse()
        assert page is not None

        paragraphs = list(page.paragraphs)
        assert len(paragraphs) == 3  # Hindi paragraphs and Sanskrit

        # Produce the PDF with visible text
        pdf_path = tmp_path / "devanagari_output.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        ).render(pdf_path)
        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

        extracted = pdftotext(pdf_path)
        # Hindi: नमस्ते दुनिया (Hello world)
        assert any(word in extracted for word in ('नमस्ते', 'दुनिया'))
        # यह हिंदी पाठ है (This is Hindi text)
        assert any(word in extracted for word in ('हिंदी', 'पाठ'))

    def test_devanagari_font_selection(self, devanagari_hocr, multi_font_manager):
        """Check NotoSansDevanagari-Regular covers every word on Devanagari lines."""
        if not multi_font_manager.has_font('NotoSansDevanagari-Regular'):
            pytest.skip("Devanagari font not available")

        page = HocrParser(devanagari_hocr).parse()
        devanagari_languages = {'hin', 'san', 'mar', 'nep'}
        for line in page.lines:
            for word in line.children:
                if not word.text or line.language not in devanagari_languages:
                    continue
                chosen = multi_font_manager.select_font_for_word(
                    word.text, line.language
                )
                assert chosen is not None
                # The Devanagari font must have glyphs for the whole word
                covered = multi_font_manager.has_all_glyphs(
                    'NotoSansDevanagari-Regular', word.text
                )
                assert covered, f"NotoSansDevanagari cannot render '{word.text}'"
# =============================================================================
# Mixed Language / Multilingual Tests
# =============================================================================
class TestMultilingual:
    """Rendering and font-selection tests for a mixed English/Arabic hOCR file."""

    @pytest.fixture
    def multilingual_hocr(self):
        """Provide the path of the multilingual hOCR resource file."""
        return RESOURCES / "multilingual.hocr"

    def test_render_multilingual_hocr_basic(
        self, multilingual_hocr, multi_font_manager_arabic, tmp_path, pdftotext
    ):
        """Render the English/Arabic page and check both scripts are extractable."""
        page = HocrParser(multilingual_hocr).parse()
        assert page is not None

        paragraphs = list(page.paragraphs)
        assert len(paragraphs) == 2  # English and Arabic paragraphs

        # Paragraph languages, in document order
        assert paragraphs[0].language == 'eng'
        assert paragraphs[1].language == 'ara'

        # Produce the PDF with visible text
        pdf_path = tmp_path / "multilingual_output.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager_arabic,
            invisible_text=False,
        ).render(pdf_path)
        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

        extracted = pdftotext(pdf_path)
        # English side
        assert any(word in extracted for word in ('English', 'Text', 'Here'))
        # Arabic text: مرحبا بك
        assert any(word in extracted for word in ('مرحبا', 'بك'))

    def test_render_multilingual_with_debug_options(
        self, multilingual_hocr, multi_font_manager, tmp_path
    ):
        """Render with baseline and bbox debug drawing on; expect a non-empty PDF."""
        page = HocrParser(multilingual_hocr).parse()

        pdf_path = tmp_path / "multilingual_debug.pdf"
        debug_drawing = DebugRenderOptions(
            render_baseline=True,
            render_line_bbox=True,
            render_word_bbox=True,
        )
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
            debug_render_options=debug_drawing,
        ).render(pdf_path)

        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

    def test_multilingual_invisible_text(
        self, multilingual_hocr, multi_font_manager, tmp_path, pdftotext
    ):
        """Render with invisible_text=True and check text is still extractable."""
        page = HocrParser(multilingual_hocr).parse()

        pdf_path = tmp_path / "multilingual_invisible.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager,
            invisible_text=True,
        ).render(pdf_path)
        assert pdf_path.exists()

        # Invisible text must still come out of the extractor
        extracted = pdftotext(pdf_path)
        assert len(extracted.strip()) > 0

    def test_multilingual_font_selection(
        self, multilingual_hocr, multi_font_manager_arabic
    ):
        """Check a font is selected for every word and Arabic words are covered."""
        page = HocrParser(multilingual_hocr).parse()

        # Pair each non-empty word with the language of its line
        tagged_words = [
            (word.text, line.language)
            for line in page.lines
            for word in line.children
            if word.text
        ]

        # The fixture must contain words of both languages
        english = [text for text, lang in tagged_words if lang == 'eng']
        arabic = [text for text, lang in tagged_words if lang == 'ara']
        assert len(english) > 0, "Should have English words"
        assert len(arabic) > 0, "Should have Arabic words"

        for text, lang in tagged_words:
            chosen = multi_font_manager_arabic.select_font_for_word(text, lang)
            assert chosen is not None, f"No font selected for '{text}' ({lang})"
            if lang != 'ara':
                continue
            covered = multi_font_manager_arabic.has_all_glyphs(
                'NotoSansArabic-Regular', text
            )
            assert covered, f"NotoSansArabic cannot render '{text}'"
# =============================================================================
# Baseline and Structure Tests
# =============================================================================
class TestBaselineHandling:
    """Checks on baseline data parsed from the multilingual hOCR file."""

    @pytest.fixture
    def multilingual_hocr(self):
        """Provide the path of the multilingual hOCR resource file."""
        return RESOURCES / "multilingual.hocr"

    def test_multilingual_baseline_handling(self, multilingual_hocr):
        """Check that every parsed baseline has a slope within [-1.0, 1.0]."""
        page = HocrParser(multilingual_hocr).parse()
        for line in page.lines:
            if not line.baseline:
                continue
            slope_in_range = -1.0 <= line.baseline.slope <= 1.0
            assert slope_in_range, "Baseline slope should be reasonable"
# =============================================================================
# Font Coverage Tests
# =============================================================================
class TestFontCoverage:
    """Glyph-coverage checks for the per-script Noto fonts."""

    def test_noto_sans_latin_coverage(self, multi_font_manager):
        """Check NotoSans-Regular covers Latin samples including diacritics."""
        if not _latin_font_works(multi_font_manager):
            pytest.skip("NotoSans font not available")

        for sample in (
            "Hello World",
            "Café résumé naïve",
            "Größe Zürich Ärger",
            "ÀÁÂÃÄÅÆÇÈÉÊË",
            "àáâãäåæçèéêë",
        ):
            covered = multi_font_manager.has_all_glyphs('NotoSans-Regular', sample)
            assert covered, f"NotoSans should cover: {sample}"

    def test_noto_sans_arabic_coverage(self, multi_font_manager_arabic):
        """Check NotoSansArabic-Regular covers the Arabic samples."""
        for sample in (
            "مرحبا",  # Hello
            "بالعالم",  # World
            "العربية",  # Arabic
        ):
            covered = multi_font_manager_arabic.has_all_glyphs(
                'NotoSansArabic-Regular', sample
            )
            assert covered, f"NotoSansArabic should cover: {sample}"

    def test_noto_sans_devanagari_coverage(self, multi_font_manager):
        """Check NotoSansDevanagari-Regular covers the Devanagari samples."""
        if not _devanagari_font_works(multi_font_manager):
            pytest.skip("NotoSansDevanagari font not available")

        for sample in (
            "नमस्ते",  # Hello
            "हिंदी",  # Hindi
            "संस्कृत",  # Sanskrit
        ):
            covered = multi_font_manager.has_all_glyphs(
                'NotoSansDevanagari-Regular', sample
            )
            assert covered, f"NotoSansDevanagari should cover: {sample}"

    def test_noto_sans_cjk_coverage(self, multi_font_manager):
        """Check NotoSansCJK-Regular covers the Chinese, Japanese and Korean samples."""
        if not _cjk_font_works(multi_font_manager):
            pytest.skip("CJK font not available or corrupted")

        for sample in (
            "你好",  # Chinese: Hello
            "世界",  # Chinese: World
            "こんにちは",  # Japanese: Hello
            "안녕하세요",  # Korean: Hello
        ):
            covered = multi_font_manager.has_all_glyphs('NotoSansCJK-Regular', sample)
            assert covered, f"NotoSansCJK should cover: {sample}"
if __name__ == "__main__":
    # Running the file directly executes its tests verbosely with output
    # capture disabled, then exits with pytest's return code.
    import sys

    exit_code = pytest.main([__file__, "-v", "-s"])
    sys.exit(exit_code)
================================================
FILE: tests/test_null_ocr_engine.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Unit tests for NullOcrEngine (--ocr-engine none).
Tests verify that the Null OCR engine exists and functions correctly
for scenarios where users want PDF processing without OCR.
"""
from __future__ import annotations
from pathlib import Path
from unittest.mock import MagicMock
import pytest
class TestNullOcrEngineExists:
    """Import-level checks for the null_ocr builtin plugin."""

    def test_null_ocr_module_importable(self):
        """The null_ocr module can be imported from ocrmypdf.builtin_plugins."""
        from ocrmypdf.builtin_plugins import null_ocr as plugin_module

        assert plugin_module is not None

    def test_null_ocr_engine_class_exists(self):
        """The null_ocr module exposes a NullOcrEngine name."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as engine_cls

        assert engine_cls is not None
class TestNullOcrEngineInterface:
    """Checks of the values returned by NullOcrEngine's OcrEngine methods."""

    def test_version_returns_none(self):
        """version() yields the string 'none'."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        reported = NullOcrEngine.version()
        assert reported == "none"

    def test_creator_tag(self):
        """creator_tag() mentions 'no ocr', 'null' or 'none' (case-insensitive)."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        lowered = NullOcrEngine.creator_tag(MagicMock()).lower()
        assert any(marker in lowered for marker in ("no ocr", "null", "none"))

    def test_languages_returns_empty_set(self):
        """languages() yields an empty set."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        supported = NullOcrEngine.languages(MagicMock())
        assert supported == set()

    def test_supports_generate_ocr_returns_true(self):
        """supports_generate_ocr() yields True."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        supported = NullOcrEngine.supports_generate_ocr()
        assert supported is True

    def test_get_orientation_returns_zero(self):
        """get_orientation() yields a result whose angle is 0."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        orientation = NullOcrEngine.get_orientation(Path("test.png"), MagicMock())
        assert orientation.angle == 0

    def test_get_deskew_returns_zero(self):
        """get_deskew() yields 0.0."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        skew = NullOcrEngine.get_deskew(Path("test.png"), MagicMock())
        assert skew == 0.0
class TestNullOcrEngineGenerateOcr:
    """Checks of what NullOcrEngine.generate_ocr() returns for a blank image."""

    @pytest.fixture
    def sample_image(self, tmp_path):
        """Write a white 100x100 RGB PNG saved with dpi=(300, 300); return its path."""
        from PIL import Image

        target = tmp_path / "test.png"
        Image.new('RGB', (100, 100), color='white').save(target, dpi=(300, 300))
        return target

    def test_generate_ocr_returns_tuple(self, sample_image):
        """The result is a 2-tuple of (OcrElement, str)."""
        from ocrmypdf import OcrElement
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        outcome = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2
        assert isinstance(outcome[0], OcrElement)
        assert isinstance(outcome[1], str)

    def test_generate_ocr_returns_empty_text(self, sample_image):
        """The text part of the result is the empty string."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        outcome = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)
        _, recognized = outcome
        assert recognized == ""

    def test_generate_ocr_returns_page_element(self, sample_image):
        """The element part of the result has ocr_class equal to OcrClass.PAGE."""
        from ocrmypdf import OcrClass
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        outcome = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)
        root, _ = outcome
        assert root.ocr_class == OcrClass.PAGE

    def test_generate_ocr_page_has_correct_dimensions(self, sample_image):
        """The page element's bbox right/bottom match the 100x100 image."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        outcome = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)
        root, _ = outcome
        # The fixture image is 100x100
        assert root.bbox.right == 100
        assert root.bbox.bottom == 100
class TestOcrEngineOption:
    """Test --ocr-engine CLI option."""

    @staticmethod
    def _ocr_engine_action():
        """Return the argparse action registered for ``--ocr-engine``.

        Fails the calling test when the parser does not define the option, so
        that no test in this class can pass vacuously if the option is missing.
        """
        from ocrmypdf.cli import get_parser

        parser = get_parser()
        # argparse offers no public lookup by option string, so scan the
        # parser's registered actions.
        for action in parser._actions:
            if '--ocr-engine' in action.option_strings:
                return action
        pytest.fail("--ocr-engine option not found")

    def test_ocr_engine_option_accepted(self):
        """CLI should accept --ocr-engine option."""
        from ocrmypdf.cli import get_parser

        parser = get_parser()
        # Should not raise
        args = parser.parse_args(['--ocr-engine', 'none', 'in.pdf', 'out.pdf'])
        assert args.ocr_engine == 'none'

    def test_ocr_engine_choices_include_none(self):
        """--ocr-engine should include 'none' as a choice."""
        action = self._ocr_engine_action()
        assert 'none' in action.choices

    def test_ocr_engine_choices_include_auto(self):
        """--ocr-engine should include 'auto' as a choice and as the default."""
        # Previously this test looped over the actions without a for/else, so
        # it silently passed when the option was absent; the helper now fails.
        action = self._ocr_engine_action()
        assert 'auto' in action.choices
        assert action.default == 'auto'
================================================
FILE: tests/test_ocr_element.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Unit tests for OcrElement dataclass and related classes."""
from __future__ import annotations
import pytest
from ocrmypdf.hocrtransform import (
Baseline,
BoundingBox,
FontInfo,
OcrClass,
OcrElement,
)
class TestBoundingBox:
    """Construction, derived-size and validation checks for BoundingBox."""

    def test_basic_creation(self):
        """The four edges are stored as given."""
        box = BoundingBox(left=10, top=20, right=100, bottom=50)
        edges = (box.left, box.top, box.right, box.bottom)
        assert edges == (10, 20, 100, 50)

    def test_width_height(self):
        """width and height are right-left and bottom-top for this box."""
        box = BoundingBox(left=10, top=20, right=110, bottom=70)
        assert box.width == 100
        assert box.height == 50

    def test_zero_size_box(self):
        """A box with coincident edges is accepted and has zero size."""
        box = BoundingBox(left=10, top=20, right=10, bottom=20)
        assert box.width == 0
        assert box.height == 0

    def test_invalid_left_right(self):
        """right < left raises ValueError mentioning right and left."""
        expected_error = pytest.raises(ValueError, match="right.*left")
        with expected_error:
            BoundingBox(left=100, top=20, right=10, bottom=50)

    def test_invalid_top_bottom(self):
        """bottom < top raises ValueError mentioning bottom and top."""
        expected_error = pytest.raises(ValueError, match="bottom.*top")
        with expected_error:
            BoundingBox(left=10, top=50, right=100, bottom=20)
class TestBaseline:
    """Default and explicit-value checks for Baseline."""

    def test_defaults(self):
        """A Baseline built without arguments has slope and intercept 0.0."""
        default_line = Baseline()
        assert default_line.slope == 0.0
        assert default_line.intercept == 0.0

    def test_with_values(self):
        """Explicit slope and intercept are stored as given."""
        tilted = Baseline(slope=0.01, intercept=-5)
        assert tilted.slope == 0.01
        assert tilted.intercept == -5
class TestFontInfo:
    """Default and explicit-value checks for FontInfo."""

    def test_defaults(self):
        """A FontInfo built without arguments has no name/size and is plain."""
        info = FontInfo()
        assert info.name is None
        assert info.size is None
        assert info.bold is False
        assert info.italic is False

    def test_with_values(self):
        """Given fields are stored; italic keeps its default of False."""
        info = FontInfo(name="Arial", size=12.0, bold=True)
        assert info.name == "Arial"
        assert info.size == 12.0
        assert info.bold is True
        assert info.italic is False
class TestOcrElement:
    """Construction, tree-traversal and attribute-storage checks for OcrElement."""

    def test_minimal_element(self):
        """An element built with class and text has no bbox and no children."""
        element = OcrElement(ocr_class=OcrClass.WORD, text="hello")
        assert element.ocr_class == "ocrx_word"
        assert element.text == "hello"
        assert element.bbox is None
        assert element.children == []

    def test_element_with_bbox(self):
        """A supplied bounding box is stored on the element."""
        box = BoundingBox(left=0, top=0, right=100, bottom=50)
        element = OcrElement(ocr_class=OcrClass.LINE, bbox=box)
        assert element.bbox == box
        assert element.bbox.width == 100

    def test_element_hierarchy(self):
        """Children are reachable level by level: page > paragraph > line > words."""
        first = OcrElement(ocr_class=OcrClass.WORD, text="Hello")
        second = OcrElement(ocr_class=OcrClass.WORD, text="World")
        line = OcrElement(ocr_class=OcrClass.LINE, children=[first, second])
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, children=[line])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[paragraph])

        assert len(page.children) == 1
        assert len(page.children[0].children) == 1
        assert len(page.children[0].children[0].children) == 2

    def test_iter_by_class_single(self):
        """iter_by_class finds the single nested word."""
        leaf = OcrElement(ocr_class=OcrClass.WORD, text="test")
        line = OcrElement(ocr_class=OcrClass.LINE, children=[leaf])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        matches = page.iter_by_class(OcrClass.WORD)
        assert len(matches) == 1
        assert matches[0].text == "test"

    def test_iter_by_class_multiple(self):
        """iter_by_class returns all nested words in their original order."""
        leaves = [
            OcrElement(ocr_class=OcrClass.WORD, text=label)
            for label in ("one", "two", "three")
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=leaves)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        matches = page.iter_by_class(OcrClass.WORD)
        assert len(matches) == 3
        assert [match.text for match in matches] == ["one", "two", "three"]

    def test_iter_by_class_multiple_types(self):
        """iter_by_class with two classes returns elements of either class."""
        line = OcrElement(ocr_class=OcrClass.LINE)
        header = OcrElement(ocr_class=OcrClass.HEADER)
        caption = OcrElement(ocr_class=OcrClass.CAPTION)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line, header, caption])

        matches = page.iter_by_class(OcrClass.LINE, OcrClass.HEADER)
        assert len(matches) == 2

    def test_find_by_class(self):
        """find_by_class returns the nested word when one exists."""
        leaf = OcrElement(ocr_class=OcrClass.WORD, text="found")
        line = OcrElement(ocr_class=OcrClass.LINE, children=[leaf])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        match = page.find_by_class(OcrClass.WORD)
        assert match is not None
        assert match.text == "found"

    def test_find_by_class_not_found(self):
        """find_by_class returns None when no element has the class."""
        line = OcrElement(ocr_class=OcrClass.LINE)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        match = page.find_by_class(OcrClass.WORD)
        assert match is None

    def test_get_text_recursive_leaf(self):
        """A childless word returns its own text."""
        leaf = OcrElement(ocr_class=OcrClass.WORD, text="hello")
        assert leaf.get_text_recursive() == "hello"

    def test_get_text_recursive_nested(self):
        """A line of two words returns their texts joined by a space."""
        first = OcrElement(ocr_class=OcrClass.WORD, text="Hello")
        second = OcrElement(ocr_class=OcrClass.WORD, text="World")
        line = OcrElement(ocr_class=OcrClass.LINE, children=[first, second])
        assert line.get_text_recursive() == "Hello World"

    def test_words_property(self):
        """page.words lists the nested words in order."""
        leaves = [
            OcrElement(ocr_class=OcrClass.WORD, text=label) for label in ("a", "b")
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=leaves)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        assert len(page.words) == 2
        assert page.words[0].text == "a"

    def test_lines_property(self):
        """page.lines counts both LINE and HEADER elements."""
        plain_line = OcrElement(ocr_class=OcrClass.LINE)
        header_line = OcrElement(ocr_class=OcrClass.HEADER)  # Also a line type
        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH, children=[plain_line, header_line]
        )
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[paragraph])

        assert len(page.lines) == 2

    def test_paragraphs_property(self):
        """page.paragraphs counts the paragraph children."""
        first = OcrElement(ocr_class=OcrClass.PARAGRAPH)
        second = OcrElement(ocr_class=OcrClass.PARAGRAPH)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[first, second])

        assert len(page.paragraphs) == 2

    def test_direction_ltr(self):
        """direction='ltr' is stored as given."""
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, direction="ltr")
        assert paragraph.direction == "ltr"

    def test_direction_rtl(self):
        """direction='rtl' is stored as given."""
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, direction="rtl")
        assert paragraph.direction == "rtl"

    def test_language(self):
        """language is stored as given."""
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, language="eng")
        assert paragraph.language == "eng"

    def test_baseline(self):
        """A supplied Baseline is reachable with its slope and intercept."""
        line = OcrElement(
            ocr_class=OcrClass.LINE, baseline=Baseline(slope=0.01, intercept=-3)
        )
        assert line.baseline.slope == 0.01
        assert line.baseline.intercept == -3

    def test_textangle(self):
        """textangle is stored as given."""
        line = OcrElement(ocr_class=OcrClass.LINE, textangle=5.0)
        assert line.textangle == 5.0

    def test_confidence(self):
        """confidence is stored as given."""
        word = OcrElement(ocr_class=OcrClass.WORD, confidence=0.95)
        assert word.confidence == 0.95

    def test_page_properties(self):
        """dpi, page_number and logical_page_number are stored as given."""
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            dpi=300.0,
            page_number=0,
            logical_page_number=1,
        )
        assert page.dpi == 300.0
        assert page.page_number == 0
        assert page.logical_page_number == 1
class TestOcrClass:
    """Checks of the OcrClass constant values and the LINE_TYPES grouping."""

    def test_class_values(self):
        """Each constant equals its hOCR class string."""
        assert OcrClass.PAGE == "ocr_page"
        assert OcrClass.PARAGRAPH == "ocr_par"
        assert OcrClass.LINE == "ocr_line"
        assert OcrClass.WORD == "ocrx_word"
        assert OcrClass.HEADER == "ocr_header"
        assert OcrClass.CAPTION == "ocr_caption"

    def test_line_types_frozenset(self):
        """LINE, HEADER and CAPTION are in LINE_TYPES; WORD is not."""
        line_types = OcrClass.LINE_TYPES
        assert OcrClass.LINE in line_types
        assert OcrClass.HEADER in line_types
        assert OcrClass.CAPTION in line_types
        assert OcrClass.WORD not in line_types
================================================
FILE: tests/test_ocr_engine_interface.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Unit tests for OcrEngine interface extensions.
These tests verify that the OcrEngine ABC has the new generate_ocr() method
and that OcrElement classes are exported from the public API.
"""
from __future__ import annotations
from pathlib import Path
from unittest.mock import MagicMock
import pytest
from ocrmypdf.pluginspec import OcrEngine
class TestOcrEngineInterface:
    """Test that OcrEngine ABC has required methods."""

    @staticmethod
    def _new_minimal_engine():
        """Instantiate a bare-bones concrete OcrEngine subclass.

        The subclass overrides only the methods listed below and leaves
        ``generate_ocr()`` and ``supports_generate_ocr()`` untouched, so the
        calling tests observe whatever OcrEngine itself provides for them.
        """
        from ocrmypdf.pluginspec import OrientationConfidence

        class MinimalEngine(OcrEngine):
            @staticmethod
            def version():
                return "1.0"

            @staticmethod
            def creator_tag(options):
                return "test"

            def __str__(self):
                return "test"

            @staticmethod
            def languages(options):
                return set()

            @staticmethod
            def get_orientation(input_file, options):
                return OrientationConfidence(0, 0.0)

            @staticmethod
            def get_deskew(input_file, options):
                return 0.0

            @staticmethod
            def generate_hocr(input_file, output_hocr, output_text, options):
                pass

            @staticmethod
            def generate_pdf(input_file, output_pdf, output_text, options):
                pass

        return MinimalEngine()

    def test_generate_ocr_method_exists(self):
        """OcrEngine must expose an attribute named generate_ocr."""
        assert hasattr(OcrEngine, 'generate_ocr')

    def test_supports_generate_ocr_method_exists(self):
        """OcrEngine must expose an attribute named supports_generate_ocr."""
        assert hasattr(OcrEngine, 'supports_generate_ocr')

    def test_supports_generate_ocr_default_false(self):
        """Without an override, supports_generate_ocr() returns False."""
        engine = self._new_minimal_engine()
        assert engine.supports_generate_ocr() is False

    def test_generate_ocr_raises_not_implemented_by_default(self):
        """Without an override, generate_ocr() raises NotImplementedError."""
        engine = self._new_minimal_engine()
        with pytest.raises(NotImplementedError):
            engine.generate_ocr(Path("test.png"), MagicMock(), 0)
class TestOcrElementExport:
    """Test that OcrElement is exported from public API."""

    def test_ocrelement_importable_from_ocrmypdf(self):
        """``from ocrmypdf import OcrElement`` succeeds and yields an object."""
        from ocrmypdf import OcrElement as exported

        assert exported is not None

    def test_ocrclass_importable_from_ocrmypdf(self):
        """``from ocrmypdf import OcrClass`` succeeds and yields an object."""
        from ocrmypdf import OcrClass as exported

        assert exported is not None

    def test_boundingbox_importable_from_ocrmypdf(self):
        """``from ocrmypdf import BoundingBox`` succeeds and yields an object."""
        from ocrmypdf import BoundingBox as exported

        assert exported is not None
================================================
FILE: tests/test_ocr_engine_selection.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Unit tests for OCR engine selection mechanism.
Tests verify that the --ocr-engine option works correctly and that
engine-specific options are available.
"""
from __future__ import annotations
import pytest
class TestOcrEngineCliOption:
    """Test --ocr-engine CLI option."""

    @staticmethod
    def _parsed_engine(*leading_args):
        """Parse ``leading_args`` plus in.pdf/out.pdf and return ``ocr_engine``."""
        from ocrmypdf.cli import get_parser

        parser = get_parser()
        namespace = parser.parse_args([*leading_args, 'in.pdf', 'out.pdf'])
        return namespace.ocr_engine

    def test_ocr_engine_option_exists(self):
        """The parser registers an action whose flags include --ocr-engine."""
        from ocrmypdf.cli import get_parser

        parser = get_parser()
        known_flags = [
            flag for action in parser._actions for flag in action.option_strings
        ]
        assert '--ocr-engine' in known_flags

    def test_ocr_engine_accepts_tesseract(self):
        """Passing 'tesseract' is accepted and stored unchanged."""
        assert self._parsed_engine('--ocr-engine', 'tesseract') == 'tesseract'

    def test_ocr_engine_accepts_auto(self):
        """Passing 'auto' is accepted and stored unchanged."""
        assert self._parsed_engine('--ocr-engine', 'auto') == 'auto'

    def test_ocr_engine_accepts_none(self):
        """Passing 'none' is accepted and stored unchanged."""
        assert self._parsed_engine('--ocr-engine', 'none') == 'none'

    def test_ocr_engine_default_is_auto(self):
        """Omitting the option leaves ocr_engine at 'auto'."""
        assert self._parsed_engine() == 'auto'

    def test_ocr_engine_rejects_invalid(self):
        """An unknown engine name makes parse_args() raise SystemExit."""
        from ocrmypdf.cli import get_parser

        # Build the parser outside the raises-block so that only parse_args()
        # is allowed to exit.
        parser = get_parser()
        bad_argv = ['--ocr-engine', 'invalid_engine', 'in.pdf', 'out.pdf']
        with pytest.raises(SystemExit):
            parser.parse_args(bad_argv)
class TestOcrEngineOptionsModel:
    """Test OcrOptions has ocr_engine field."""

    def test_ocr_options_has_ocr_engine_field(self):
        """'ocr_engine' appears among OcrOptions.model_fields."""
        from ocrmypdf._options import OcrOptions

        declared_fields = OcrOptions.model_fields
        assert 'ocr_engine' in declared_fields
class TestOcrEnginePluginSelection:
    """Test that get_ocr_engine() hook selects correct engine based on options."""

    @staticmethod
    def _options_with_engine(engine_name):
        """Return a MagicMock options object whose ``ocr_engine`` is set."""
        from unittest.mock import MagicMock

        options = MagicMock()
        options.ocr_engine = engine_name
        return options

    def test_tesseract_selected_when_auto(self):
        """The tesseract plugin hook yields TesseractOcrEngine for 'auto'."""
        from ocrmypdf.builtin_plugins import tesseract_ocr
        from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine

        chosen = tesseract_ocr.get_ocr_engine(
            options=self._options_with_engine('auto')
        )
        assert isinstance(chosen, TesseractOcrEngine)

    def test_tesseract_selected_when_tesseract(self):
        """The tesseract plugin hook yields TesseractOcrEngine for 'tesseract'."""
        from ocrmypdf.builtin_plugins import tesseract_ocr
        from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine

        chosen = tesseract_ocr.get_ocr_engine(
            options=self._options_with_engine('tesseract')
        )
        assert isinstance(chosen, TesseractOcrEngine)

    def test_null_selected_when_none(self):
        """The null plugin hook yields NullOcrEngine for 'none'."""
        from ocrmypdf.builtin_plugins import null_ocr
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        chosen = null_ocr.get_ocr_engine(options=self._options_with_engine('none'))
        assert isinstance(chosen, NullOcrEngine)

    def test_null_returns_none_when_auto(self):
        """The null plugin hook returns None for 'auto'."""
        from ocrmypdf.builtin_plugins import null_ocr

        chosen = null_ocr.get_ocr_engine(options=self._options_with_engine('auto'))
        assert chosen is None
================================================
FILE: tests/test_optimize.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
from io import BytesIO
from os import fspath
from pathlib import Path
from unittest.mock import patch
import img2pdf
import pikepdf
import pytest
from pikepdf import Array, Dictionary, Name
from PIL import Image, ImageDraw
from ocrmypdf import optimize as opt
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._exec.ghostscript import rasterize_pdf
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution
from ocrmypdf.optimize import PdfImage, extract_image_filter
from ocrmypdf.pluginspec import GhostscriptRasterDevice
from tests.conftest import check_ocrmypdf
# Skip markers for tests that rely on optional external executables.
_pngquant_missing = not pngquant.available()
_jbig2enc_missing = not jbig2enc.available()

needs_pngquant = pytest.mark.skipif(_pngquant_missing, reason="pngquant not installed")
needs_jbig2enc = pytest.mark.skipif(_jbig2enc_missing, reason="jbig2enc not installed")
# pylint:disable=redefined-outer-name
@pytest.fixture(scope="session")
def palette(resources):
    """Session-scoped path to ``palette.pdf`` under the resources directory."""
    palette_pdf = resources / 'palette.pdf'
    return palette_pdf
@needs_pngquant
@pytest.mark.parametrize('pdf', ['multipage', 'palette'])
def test_basic(multipage, palette, pdf, outpdf):
    """Level-3 optimization keeps 98% of the output size within the input size."""
    if pdf == 'multipage':
        infile = multipage
    else:
        infile = palette

    opt.main(infile, outpdf, level=3)

    optimized_size = Path(outpdf).stat().st_size
    original_size = Path(infile).stat().st_size
    assert 0.98 * optimized_size <= original_size
@needs_pngquant
def test_mono_not_inverted(resources, outdir):
    """After optimizing 2400dpi.pdf, the top-left raster pixel is still light."""
    optimized_pdf = outdir / 'out.pdf'
    raster_png = outdir / 'im.png'

    opt.main(resources / '2400dpi.pdf', optimized_pdf, level=3)

    # Rasterize at 10x10 dpi in grayscale and inspect the pixel at (0, 0).
    rasterize_pdf(
        optimized_pdf,
        raster_png,
        raster_device=GhostscriptRasterDevice.PNGGRAY,
        raster_dpi=Resolution(10, 10),
    )

    with Image.open(fspath(raster_png)) as im:
        corner = im.getpixel((0, 0))
        assert corner > 240, "Expected white background"
@needs_pngquant
def test_jpg_png_params(resources, outpdf):
    """Run crom.png through level-3 optimization with explicit quality flags."""
    cli_options = [
        '--image-dpi', '200',
        '--optimize', '3',
        '--jpg-quality', '50',
        '--png-quality', '20',
        '--plugin', 'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'crom.png', outpdf, *cli_options)
@needs_jbig2enc
def test_jbig2_lossless(resources, outpdf):
    """The first image of the optimized ccitt.pdf is JBIG2 with no decode parms."""
    cli_options = [
        '--image-dpi', '200',
        '--optimize', '3',
        '--jpg-quality', '50',
        '--png-quality', '20',
        '--plugin', 'tests/plugins/tesseract_noop.py',
        '--jbig2-threshold', '0.7',
    ]
    check_ocrmypdf(resources / 'ccitt.pdf', outpdf, *cli_options)

    with pikepdf.open(outpdf) as pdf:
        first_image = next(iter(pdf.pages[0].images.values()))
        pim = pikepdf.PdfImage(first_image)
        assert pim.filters[0] == '/JBIG2Decode'
        # Lossless JBIG2 has no JBIG2Globals (no shared symbol dictionary),
        # so the decode parameters are expected to be empty.
        assert len(pim.decode_parms) == 0
@needs_pngquant
@needs_jbig2enc
def test_flate_to_jbig2(resources, outdir):
    """An 8-bit copy of a 1bpp/palette image ends up JBIG2-encoded."""
    # pngquant must be able to bring the image back down to 1bpp, so start
    # from an image that is already 1bpp (or palettized), widen it to 8-bit
    # grayscale, and confirm the pipeline converts it back down.
    widened_png = outdir / 'type8.png'
    result_pdf = outdir / 'out.pdf'

    with Image.open(fspath(resources / 'typewriter.png')) as im:
        assert im.mode in ('1', 'P')
        im.convert('L').save(fspath(widened_png))

    cli_options = [
        '--image-dpi', '100',
        '--png-quality', '50',
        '--optimize', '3',
        '--plugin', 'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(widened_png, result_pdf, *cli_options)

    with pikepdf.open(result_pdf) as pdf:
        first_image = next(iter(pdf.pages[0].images.values()))
        pim = pikepdf.PdfImage(first_image)
        assert pim.filters[0] == '/JBIG2Decode'
@needs_pngquant
def test_multiple_pngs(resources, outdir):
    """Each image in the optimized output has fewer raw bytes than its source.

    pngquant's ``quantize`` is patched with a stand-in that paints the whole
    image with fill value 128 before saving it to the requested output path.
    """
    source_pdf = outdir / 'in.pdf'
    result_pdf = outdir / 'out.pdf'

    # Build the input PDF from two PNG resources.
    with Path.open(source_pdf, 'wb') as inpdf:
        img2pdf.convert(
            fspath(resources / 'baiona_colormapped.png'),
            fspath(resources / 'baiona_gray.png'),
            outputstream=inpdf,
            **IMG2PDF_KWARGS,
        )

    def mockquant(input_file, output_file, *_args):
        with Image.open(input_file) as im:
            full_canvas = (0, 0, im.width, im.height)
            ImageDraw.Draw(im).rectangle(full_canvas, fill=128)
            im.save(output_file)

    cli_options = [
        '--optimize', '3',
        '--jobs', '1',
        '--use-threads',
        '--output-type', 'pdf',
        '--plugin', 'tests/plugins/tesseract_noop.py',
    ]
    with patch('ocrmypdf.optimize.pngquant.quantize') as mock:
        mock.side_effect = mockquant
        check_ocrmypdf(source_pdf, result_pdf, *cli_options)
        mock.assert_called()

    with (
        pikepdf.open(source_pdf) as inpdf,
        pikepdf.open(result_pdf) as outpdf,
    ):
        for n in range(len(inpdf.pages)):
            inim = next(iter(inpdf.pages[n].images.values()))
            outim = next(iter(outpdf.pages[n].images.values()))
            bytes_before = len(inim.read_raw_bytes())
            bytes_after = len(outim.read_raw_bytes())
            assert bytes_after < bytes_before, n
def test_optimize_off(resources, outpdf):
    """Processing trivial.pdf with --optimize=0 completes via check_ocrmypdf."""
    cli_options = [
        '--optimize=0',
        '--output-type', 'pdf',
        '--plugin', 'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *cli_options)
def test_group3(resources):
    """extract_image_filter accepts ccitt.pdf's image until /K is set to 0."""
    with pikepdf.open(resources / 'ccitt.pdf') as pdf:
        im = pdf.pages[0].Resources.XObject['/Im1']

        as_shipped = opt.extract_image_filter(im, im.objgen[0])
        assert as_shipped is not None, "Group 4 should be allowed"

        # Rewrite the decode parameter in place and check the image again.
        im.DecodeParms['/K'] = 0
        after_edit = opt.extract_image_filter(im, im.objgen[0])
        assert after_edit is None, "Group 3 should be disallowed"
def test_find_formx(resources):
    """formxobject.pdf yields exactly one image xref, mapped to page 0."""
    with pikepdf.open(resources / 'formxobject.pdf') as pdf:
        found = opt._find_image_xrefs(pdf)
        working, pagenos = found
        assert len(working) == 1
        only_xref = next(iter(working))
        assert pagenos[only_xref] == 0
def test_extract_image_filter_with_pdf_image():
    """A Flate+DCT image yields the PdfImage and its second filter entry."""
    image = Dictionary()
    image.Subtype = Name.Image
    image.Length = 200
    image.Width = 10
    image.Height = 10
    image.Filter = [Name.FlateDecode, Name.DCTDecode]
    # The reference PdfImage is created before BitsPerComponent is assigned;
    # that ordering is kept exactly as in the original test.
    reference = PdfImage(image)
    image.BitsPerComponent = 8

    outcome = extract_image_filter(image, None)
    assert outcome == (reference, reference.filter_decodeparms[1])
def test_extract_image_filter_with_non_image():
    """A dictionary whose Subtype is Form is rejected (returns None)."""
    form_xobject = Dictionary(Subtype=Name.Form)
    outcome = extract_image_filter(form_xobject, None)
    assert outcome is None
def test_extract_image_filter_with_small_stream_size():
    """An image dictionary with Length 50 is rejected (returns None)."""
    tiny_stream = Dictionary(Subtype=Name.Image, Length=50)
    outcome = extract_image_filter(tiny_stream, None)
    assert outcome is None
def test_extract_image_filter_with_small_dimensions():
    """A 5x5 image dictionary is rejected (returns None)."""
    tiny_image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=5,
        Height=5,
    )
    outcome = extract_image_filter(tiny_image, None)
    assert outcome is None
def test_extract_image_filter_with_multiple_compression_filters():
    """An ASCII85 + Flate + DCT filter chain is rejected (returns None)."""
    filter_chain = [Name.ASCII85Decode, Name.FlateDecode, Name.DCTDecode]
    image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        BitsPerComponent=8,
        Filter=filter_chain,
    )
    outcome = extract_image_filter(image, None)
    assert outcome is None
def test_extract_image_filter_with_wide_gamut_image():
    """A Flate image with 16 bits per component is rejected (returns None)."""
    image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        BitsPerComponent=16,
        Filter=Name.FlateDecode,
    )
    outcome = extract_image_filter(image, None)
    assert outcome is None
def test_extract_image_filter_with_jpeg2000_image():
    """A JPXDecode stream holding real JPEG2000 data is rejected (returns None)."""
    # Encode a 10x10 RGB image as JPEG2000 in memory.
    encoded = BytesIO()
    Image.new('RGB', (10, 10)).save(encoded, format='JPEG2000')

    pdf = pikepdf.new()
    stream_keys = {
        'Subtype': Name.Image,
        'Length': 200,
        'Width': 10,
        'Height': 10,
        'BitsPerComponent': 8,
        'Filter': Name.JPXDecode,
    }
    stream = pdf.make_stream(data=encoded.getvalue(), **stream_keys)

    outcome = extract_image_filter(stream, None)
    assert outcome is None
def test_extract_image_filter_with_ccitt_group_3_image():
    """A CCITTFaxDecode image whose DecodeParms has K=1 is rejected."""
    decode_parms = Array([Dictionary(K=1)])
    image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        BitsPerComponent=1,
        Filter=Name.CCITTFaxDecode,
        DecodeParms=decode_parms,
    )
    outcome = extract_image_filter(image, None)
    assert outcome is None
# Triggers pikepdf bug
# def test_extract_image_filter_with_decode_table():
# image = Dictionary()
# image.Subtype = Name.Image
# image.Length = 200
# image.Width = 10
# image.Height = 10
# image.Filter = Name.FlateDecode
# image.BitsPerComponent = 8
# image.ColorSpace = Name.DeviceGray
# image.Decode = [42, 0]
# assert extract_image_filter(image, None) is None
def test_extract_image_filter_with_rgb_smask_matte():
    """An RGB image whose SMask carries a Matte entry is rejected."""
    soft_mask = Dictionary(
        Type=Name.Image,
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        Filter=Name.FlateDecode,
        BitsPerComponent=8,
        ColorSpace=Name.DeviceGray,
        Matte=Array([1, 2, 3]),
    )
    image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        Filter=Name.FlateDecode,
        BitsPerComponent=8,
        ColorSpace=Name.DeviceRGB,
        SMask=soft_mask,
    )
    outcome = extract_image_filter(image, None)
    assert outcome is None
================================================
FILE: tests/test_page_boxes.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import pikepdf
import pytest
from ocrmypdf._exec import verapdf
from .conftest import check_ocrmypdf
# Rectangles used as page boxes by the tests in this module, in the order
# that MediaBox/CropBox arrays are written.
page_rect = [0, 0, 612, 792]
inset_rect = [200, 200, 612, 792]
wh_rect = [0, 0, 412, 592]
neg_rect = [-100, -100, 512, 692]

# When speculative PDF/A succeeds (verapdf available), MediaBox is preserved.
# Ghostscript would normalize MediaBox to start at origin, but speculative
# conversion bypasses Ghostscript.
_pdfa_inset_expected = inset_rect if verapdf.available() else wh_rect

# Columns: renderer, output_type, in_pdf, mode, crop_to, crop_expected
mediabox_testdata = [
    ('fpdf2', 'pdfa', 'ccitt.pdf', None, inset_rect, _pdfa_inset_expected),
    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, _pdfa_inset_expected),
    ('fpdf2', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('fpdf2', 'pdfa', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    ('fpdf2', 'pdf', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    ('fpdf2', 'pdfa', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
    ('fpdf2', 'pdf', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
]
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', mediabox_testdata
)
def test_media_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Set page 1's MediaBox, process the file, and compare the output MediaBox."""
    cropped_pdf = outdir / 'cropped.pdf'
    processed_pdf = outdir / 'processed.pdf'

    # Write a copy of the input whose first page has the requested MediaBox.
    with pikepdf.open(resources / in_pdf) as pdf:
        pdf.pages[0].MediaBox = crop_to
        pdf.save(cropped_pdf)

    args = ['--jobs', '1', '--pdf-renderer', renderer, '--output-type', output_type]
    # ``mode`` is an optional extra flag; None means no extra flag is passed.
    extra_args = [mode] if mode else []
    check_ocrmypdf(cropped_pdf, processed_pdf, *args, *extra_args)

    with pikepdf.open(processed_pdf) as pdf:
        observed = [float(x) for x in pdf.pages[0].mediabox]
        assert observed == crop_expected
# Columns: renderer, output_type, in_pdf, mode, crop_to, crop_expected.
# Every case uses ccitt.pdf and expects the CropBox to come back unchanged.
cropbox_testdata = [
    (renderer, output_type, 'ccitt.pdf', mode, inset_rect, inset_rect)
    for renderer, output_type, mode in (
        ('fpdf2', 'pdfa', None),
        ('sandwich', 'pdfa', None),
        ('fpdf2', 'pdf', None),
        ('sandwich', 'pdf', None),
        ('fpdf2', 'pdfa', '--force-ocr'),
        ('fpdf2', 'pdf', '--force-ocr'),
    )
]
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', cropbox_testdata
)
def test_crop_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Set page 1's CropBox, process the file, and compare the output CropBox."""
    cropped_pdf = outdir / 'cropped.pdf'
    processed_pdf = outdir / 'processed.pdf'

    # Write a copy of the input whose first page has the requested CropBox.
    with pikepdf.open(resources / in_pdf) as pdf:
        pdf.pages[0].CropBox = crop_to
        pdf.save(cropped_pdf)

    args = [
        '--jobs', '1',
        '--pdf-renderer', renderer,
        '--output-type', output_type,
        '--optimize', '0',
    ]
    # ``mode`` is an optional extra flag; None means no extra flag is passed.
    extra_args = [mode] if mode else []
    check_ocrmypdf(cropped_pdf, processed_pdf, *args, *extra_args)

    with pikepdf.open(processed_pdf) as pdf:
        observed = [float(x) for x in pdf.pages[0].cropbox]
        assert observed == crop_expected
================================================
FILE: tests/test_page_numbers.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import pytest
import ocrmypdf
from ocrmypdf._options import _pages_from_ranges
from ocrmypdf.exceptions import BadArgsError
from ocrmypdf.pdfinfo import PdfInfo
@pytest.mark.parametrize(
    'pages, result',
    [
        ('1', {0}),
        ('1,2', {0, 1}),
        ('1-3', {0, 1, 2}),
        ('2,5,6', {1, 4, 5}),
        ('11-15, 18, ', {10, 11, 12, 13, 14, 17}),
        (',,3', {2}),
        ('3, 3, 3, 3,', {2}),
        ('3, 2, 1, 42', {0, 1, 2, 41}),
        ('-1', BadArgsError),
        ('1,3,-11', BadArgsError),
        ('1-,', BadArgsError),
        ('start-end', BadArgsError),
        ('1-0', BadArgsError),
        ('99-98', BadArgsError),
        ('0-0', BadArgsError),
        ('1-0,3-4', BadArgsError),
        (',', BadArgsError),
        ('', BadArgsError),
    ],
)
def test_pages(pages, result):
    """Check _pages_from_ranges against an expected set or exception type.

    ``result`` is either the expected set of page indexes or an exception
    class that parsing ``pages`` must raise.
    """
    expects_error = isinstance(result, type)
    if not expects_error:
        assert _pages_from_ranges(pages) == result
        return

    with pytest.raises(result):
        _pages_from_ranges(pages)
def test_nonmonotonic_warning(caplog):
    """Unsorted page lists still parse, and 'out of order' is logged."""
    selected = _pages_from_ranges('1, 3, 2')
    assert selected == {0, 1, 2}
    assert 'out of order' in caplog.text
def test_limited_pages(multipage, outpdf):
    """With pages='5-6', page index 0 has no text; indexes 4 and 5 do."""
    ocr_kwargs = {
        'pages': '5-6',
        'optimize': 0,
        'output_type': 'pdf',
        'plugins': ['tests/plugins/tesseract_cache.py'],
    }
    ocrmypdf.ocr(multipage, outpdf, **ocr_kwargs)

    page_infos = PdfInfo(outpdf).pages
    assert not page_infos[0].has_text
    assert page_infos[4].has_text
    assert page_infos[5].has_text
================================================
FILE: tests/test_pdf_renderer.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Unit tests for Fpdf2PdfRenderer class."""
from __future__ import annotations
from io import StringIO
from pathlib import Path
import pytest
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import DebugRenderOptions, Fpdf2PdfRenderer
from ocrmypdf.helpers import check_pdf
from ocrmypdf.hocrtransform import (
Baseline,
BoundingBox,
OcrClass,
OcrElement,
)
def text_from_pdf(filename: Path) -> str:
    """Return the text pdfminer extracts from every page of ``filename``.

    Each page is run through a ``TextConverter`` that writes into a single
    in-memory buffer; the buffer's accumulated contents are returned.
    """
    text_buffer = StringIO()
    with open(filename, 'rb') as in_file:
        document = PDFDocument(PDFParser(in_file))
        resource_manager = PDFResourceManager()
        converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, converter)
        for pdf_page in PDFPage.create_pages(document):
            interpreter.process_page(pdf_page)
    return text_buffer.getvalue()
@pytest.fixture
def font_dir():
    """Path to ``src/ocrmypdf/data``, two directory levels above this file."""
    project_root = Path(__file__).parent.parent
    return project_root / "src" / "ocrmypdf" / "data"
@pytest.fixture
def multi_font_manager(font_dir):
    """Provide a MultiFontManager constructed from the ``font_dir`` fixture."""
    manager = MultiFontManager(font_dir)
    return manager
def create_simple_page(
    width: float = 1000,
    height: float = 500,
    words: list[tuple[str, tuple[float, float, float, float]]] | None = None,
) -> OcrElement:
    """Build a page -> paragraph -> line -> words OcrElement tree for tests.

    The line and paragraph always use the fixed box (100, 100, 900, 150); the
    paragraph is left-to-right English and the line has a flat baseline.

    Args:
        width: Right edge of the page bounding box (left edge is 0)
        height: Bottom edge of the page bounding box (top edge is 0)
        words: List of (text, (left, top, right, bottom)) tuples; when None,
            the two words "Hello" and "World" are used

    Returns:
        OcrElement representing the page
    """
    if words is None:
        words = [("Hello", (100, 100, 200, 150)), ("World", (250, 100, 350, 150))]

    word_elements = []
    for text, bbox in words:
        word_box = BoundingBox(
            left=bbox[0], top=bbox[1], right=bbox[2], bottom=bbox[3]
        )
        word_elements.append(
            OcrElement(ocr_class=OcrClass.WORD, text=text, bbox=word_box)
        )

    # Line and paragraph share the same coordinates but get separate boxes.
    text_area = {"left": 100, "top": 100, "right": 900, "bottom": 150}

    line = OcrElement(
        ocr_class=OcrClass.LINE,
        bbox=BoundingBox(**text_area),
        baseline=Baseline(slope=0.0, intercept=0),
        children=word_elements,
    )
    paragraph = OcrElement(
        ocr_class=OcrClass.PARAGRAPH,
        bbox=BoundingBox(**text_area),
        direction="ltr",
        language="eng",
        children=[line],
    )
    return OcrElement(
        ocr_class=OcrClass.PAGE,
        bbox=BoundingBox(left=0, top=0, right=width, bottom=height),
        children=[paragraph],
    )
class TestFpdf2PdfRendererBasic:
    """Basic Fpdf2PdfRenderer functionality tests."""

    @staticmethod
    def _render_default_page(destination, multi_font_manager, **renderer_kwargs):
        """Render ``create_simple_page()`` at 72 dpi into ``destination``."""
        page = create_simple_page()
        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=72.0,
            multi_font_manager=multi_font_manager,
            **renderer_kwargs,
        )
        renderer.render(destination)

    def test_render_simple_page(self, tmp_path, multi_font_manager):
        """Rendering the default page produces a file that passes check_pdf."""
        output_pdf = tmp_path / "simple.pdf"
        self._render_default_page(output_pdf, multi_font_manager)
        assert output_pdf.exists()
        check_pdf(str(output_pdf))

    def test_rendered_text_extractable(self, tmp_path, multi_font_manager):
        """Both default words can be extracted from the rendered PDF."""
        output_pdf = tmp_path / "extractable.pdf"
        self._render_default_page(output_pdf, multi_font_manager)
        extracted_text = text_from_pdf(output_pdf)
        assert "Hello" in extracted_text
        assert "World" in extracted_text

    def test_invisible_text_mode(self, tmp_path, multi_font_manager):
        """With invisible_text=True the text is still extractable."""
        output_pdf = tmp_path / "invisible.pdf"
        self._render_default_page(
            output_pdf, multi_font_manager, invisible_text=True
        )
        extracted_text = text_from_pdf(output_pdf)
        assert "Hello" in extracted_text

    def test_visible_text_mode(self, tmp_path, multi_font_manager):
        """With invisible_text=False the text is extractable."""
        output_pdf = tmp_path / "visible.pdf"
        self._render_default_page(
            output_pdf, multi_font_manager, invisible_text=False
        )
        extracted_text = text_from_pdf(output_pdf)
        assert "Hello" in extracted_text
class TestFpdf2PdfRendererPageSize:
    """Test page size calculations."""

    def test_page_dimensions(self, tmp_path, multi_font_manager):
        """A 1000x500 page at 72 dpi is reported as 1000x500 points."""
        renderer = Fpdf2PdfRenderer(
            page=create_simple_page(width=1000, height=500),
            dpi=72.0,
            multi_font_manager=multi_font_manager,
        )
        transform = renderer.coord_transform
        assert transform.page_width_pt == pytest.approx(1000.0)
        assert transform.page_height_pt == pytest.approx(500.0)
        renderer.render(tmp_path / "dimensions.pdf")

    def test_high_dpi_page(self, tmp_path, multi_font_manager):
        """A 720x360 page at 144 dpi is reported as 360x180 points."""
        output_pdf = tmp_path / "high_dpi.pdf"
        renderer = Fpdf2PdfRenderer(
            page=create_simple_page(width=720, height=360),
            dpi=144.0,
            multi_font_manager=multi_font_manager,
        )
        transform = renderer.coord_transform
        assert transform.page_width_pt == pytest.approx(360.0)
        assert transform.page_height_pt == pytest.approx(180.0)
        renderer.render(output_pdf)
        check_pdf(str(output_pdf))
class TestFpdf2PdfRendererMultiLine:
    """Test rendering of multi-line content."""

    def test_multiple_lines(self, tmp_path, multi_font_manager):
        """Words from two stacked lines are all extractable after rendering."""

        def build_line(top, second_word):
            """Make a 50-unit-tall line reading "Line <second_word>"."""
            bottom = top + 50
            line_words = [
                OcrElement(
                    ocr_class=OcrClass.WORD,
                    text="Line",
                    bbox=BoundingBox(left=100, top=top, right=180, bottom=bottom),
                ),
                OcrElement(
                    ocr_class=OcrClass.WORD,
                    text=second_word,
                    bbox=BoundingBox(left=190, top=top, right=250, bottom=bottom),
                ),
            ]
            return OcrElement(
                ocr_class=OcrClass.LINE,
                bbox=BoundingBox(left=100, top=top, right=900, bottom=bottom),
                baseline=Baseline(slope=0.0, intercept=0),
                children=line_words,
            )

        line1 = build_line(100, "one")
        line2 = build_line(200, "two")

        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=BoundingBox(left=100, top=100, right=900, bottom=250),
            direction="ltr",
            language="eng",
            children=[line1, line2],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[paragraph],
        )

        output_pdf = tmp_path / "multiline.pdf"
        Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(output_pdf)

        extracted_text = text_from_pdf(output_pdf)
        for expected_word in ("Line", "one", "two"):
            assert expected_word in extracted_text
class TestFpdf2PdfRendererTextDirection:
    """Test rendering of different text directions."""

    @staticmethod
    def _render_and_check(page, output_pdf, multi_font_manager):
        """Render ``page`` at 72 dpi and run check_pdf on the result."""
        renderer = Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        )
        renderer.render(output_pdf)
        check_pdf(str(output_pdf))

    def test_ltr_text(self, tmp_path, multi_font_manager):
        """The default left-to-right page renders to a PDF passing check_pdf."""
        self._render_and_check(
            create_simple_page(), tmp_path / "ltr.pdf", multi_font_manager
        )

    def test_rtl_text(self, tmp_path, multi_font_manager):
        """A right-to-left Arabic word renders to a PDF passing check_pdf."""
        text_area = {"left": 100, "top": 100, "right": 900, "bottom": 150}

        word = OcrElement(
            ocr_class=OcrClass.WORD,
            text="مرحبا",
            bbox=BoundingBox(left=100, top=100, right=200, bottom=150),
        )
        line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(**text_area),
            baseline=Baseline(slope=0.0, intercept=0),
            direction="rtl",
            children=[word],
        )
        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=BoundingBox(**text_area),
            direction="rtl",
            language="ara",
            children=[line],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[paragraph],
        )

        self._render_and_check(page, tmp_path / "rtl.pdf", multi_font_manager)
class TestFpdf2PdfRendererBaseline:
    """Test baseline handling in rendering."""

    def test_sloped_baseline(self, tmp_path, multi_font_manager):
        """A line with slope 0.02 and intercept -5 renders extractable text."""
        text_area = {"left": 100, "top": 100, "right": 900, "bottom": 150}

        word = OcrElement(
            ocr_class=OcrClass.WORD,
            text="Sloped",
            bbox=BoundingBox(left=100, top=100, right=200, bottom=150),
        )
        line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(**text_area),
            baseline=Baseline(slope=0.02, intercept=-5),
            children=[word],
        )
        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=BoundingBox(**text_area),
            direction="ltr",
            language="eng",
            children=[line],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[paragraph],
        )

        output_pdf = tmp_path / "sloped.pdf"
        Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(output_pdf)

        check_pdf(str(output_pdf))
        assert "Sloped" in text_from_pdf(output_pdf)
class TestFpdf2PdfRendererTextangle:
    """Test textangle (rotation) handling in rendering."""

    def test_rotated_text(self, tmp_path, multi_font_manager):
        """A line with textangle=5.0 renders a valid PDF with extractable text."""
        text_area = {"left": 100, "top": 100, "right": 900, "bottom": 150}

        word = OcrElement(
            ocr_class=OcrClass.WORD,
            text="Rotated",
            bbox=BoundingBox(left=100, top=100, right=200, bottom=150),
        )
        line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(**text_area),
            baseline=Baseline(slope=0.0, intercept=0),
            textangle=5.0,
            children=[word],
        )
        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=BoundingBox(**text_area),
            direction="ltr",
            language="eng",
            children=[line],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[paragraph],
        )

        output_pdf = tmp_path / "rotated.pdf"
        Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(output_pdf)

        check_pdf(str(output_pdf))
        assert "Rotated" in text_from_pdf(output_pdf)
class TestFpdf2PdfRendererWordBreaks:
    """Test word rendering."""

    def test_word_breaks_english(self, tmp_path, multi_font_manager):
        """Both English words of the default page appear in the extracted text."""
        output_pdf = tmp_path / "english.pdf"
        Fpdf2PdfRenderer(
            page=create_simple_page(),
            dpi=72.0,
            multi_font_manager=multi_font_manager,
        ).render(output_pdf)

        extracted_text = text_from_pdf(output_pdf)
        for expected_word in ("Hello", "World"):
            assert expected_word in extracted_text

    def test_cjk_text(self, tmp_path, multi_font_manager):
        """Two Chinese words tagged chi_sim render to a PDF passing check_pdf."""
        text_area = {"left": 100, "top": 100, "right": 900, "bottom": 150}

        words = [
            OcrElement(
                ocr_class=OcrClass.WORD,
                text="你好",
                bbox=BoundingBox(left=100, top=100, right=150, bottom=150),
            ),
            OcrElement(
                ocr_class=OcrClass.WORD,
                text="世界",
                bbox=BoundingBox(left=160, top=100, right=210, bottom=150),
            ),
        ]
        line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(**text_area),
            baseline=Baseline(slope=0.0, intercept=0),
            children=words,
        )
        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=BoundingBox(**text_area),
            direction="ltr",
            language="chi_sim",  # Simplified Chinese
            children=[line],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[paragraph],
        )

        output_pdf = tmp_path / "chinese.pdf"
        Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(output_pdf)
        check_pdf(str(output_pdf))
class TestFpdf2PdfRendererDebugOptions:
    """Test debug rendering options."""

    def test_debug_render_options_default(self, multi_font_manager):
        """A renderer built without debug options has all three flags False."""
        renderer = Fpdf2PdfRenderer(
            page=create_simple_page(),
            dpi=72.0,
            multi_font_manager=multi_font_manager,
        )
        debug = renderer.debug_options
        assert debug.render_baseline is False
        assert debug.render_word_bbox is False
        assert debug.render_line_bbox is False

    def test_debug_render_options_enabled(self, tmp_path, multi_font_manager):
        """With every debug flag on, the PDF is valid and text is extractable."""
        page = create_simple_page()
        output_pdf = tmp_path / "debug.pdf"
        all_debug_on = DebugRenderOptions(
            render_baseline=True,
            render_word_bbox=True,
            render_line_bbox=True,
        )
        Fpdf2PdfRenderer(
            page=page,
            dpi=72.0,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
            debug_render_options=all_debug_on,
        ).render(output_pdf)

        check_pdf(str(output_pdf))
        assert "Hello" in text_from_pdf(output_pdf)
class TestFpdf2PdfRendererErrors:
    """Constructor validation checks for Fpdf2PdfRenderer."""

    def test_invalid_ocr_class(self, multi_font_manager):
        """Passing a LINE element as the page raises ValueError mentioning ocr_page."""
        not_a_page = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(left=0, top=0, right=100, bottom=50),
        )
        with pytest.raises(ValueError, match="ocr_page"):
            Fpdf2PdfRenderer(
                page=not_a_page, dpi=72.0, multi_font_manager=multi_font_manager
            )

    def test_page_without_bbox(self, multi_font_manager):
        """A PAGE element lacking a bbox raises ValueError mentioning the bounding box."""
        boxless_page = OcrElement(ocr_class=OcrClass.PAGE)
        with pytest.raises(ValueError, match="bounding box"):
            Fpdf2PdfRenderer(
                page=boxless_page, dpi=72.0, multi_font_manager=multi_font_manager
            )
class TestFpdf2PdfRendererLineTypes:
    """Rendering checks for line-level elements other than plain text lines."""

    @staticmethod
    def _render_single_word(line_class, text, top, output_pdf, multi_font_manager):
        """Render a page with one word inside a line of ``line_class``.

        The word and its enclosing line and paragraph occupy the vertical band
        ``top`` .. ``top + 50``. The output PDF is validated with ``check_pdf``
        and the text extracted from it is returned.
        """
        bottom = top + 50
        word = OcrElement(
            ocr_class=OcrClass.WORD,
            text=text,
            bbox=BoundingBox(left=100, top=top, right=200, bottom=bottom),
        )
        line = OcrElement(
            ocr_class=line_class,
            bbox=BoundingBox(left=100, top=top, right=900, bottom=bottom),
            baseline=Baseline(slope=0.0, intercept=0),
            children=[word],
        )
        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=BoundingBox(left=100, top=top, right=900, bottom=bottom),
            direction="ltr",
            language="eng",
            children=[line],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[paragraph],
        )

        Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(output_pdf)
        check_pdf(str(output_pdf))
        return text_from_pdf(output_pdf)

    def test_header_line(self, tmp_path, multi_font_manager):
        """A word inside a HEADER element ends up in the extracted text."""
        extracted = self._render_single_word(
            OcrClass.HEADER, "Header", 50, tmp_path / "header.pdf", multi_font_manager
        )
        assert "Header" in extracted

    def test_caption_line(self, tmp_path, multi_font_manager):
        """A word inside a CAPTION element ends up in the extracted text."""
        extracted = self._render_single_word(
            OcrClass.CAPTION,
            "Caption",
            300,
            tmp_path / "caption.pdf",
            multi_font_manager,
        )
        assert "Caption" in extracted
================================================
FILE: tests/test_pdfa.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import os
import pikepdf
import pytest
from ocrmypdf.exceptions import MissingDependencyError
from .conftest import check_ocrmypdf
@pytest.mark.parametrize('optimize', (0, 3))
@pytest.mark.parametrize('pdfa_level', (1, 2, 3))
def test_pdfa(resources, outpdf, optimize, pdfa_level):
    """Convert francais.pdf to each PDF/A level and verify the result.

    Checks that object streams are present only for PDF/A-2 and PDF/A-3, and
    that the XMP metadata reports the requested conformance level.
    """
    try:
        check_ocrmypdf(
            resources / 'francais.pdf',
            outpdf,
            '--plugin',
            'tests/plugins/tesseract_noop.py',
            f'--output-type=pdfa-{pdfa_level}',
            f'--optimize={optimize}',
        )
    except MissingDependencyError as e:
        if 'pngquant' in str(e) and optimize in (2, 3) and os.name == 'nt':
            pytest.xfail("pngquant currently not available on Windows")
        # Any other missing dependency is a real failure. Swallowing it would
        # let the assertions below run against an output that was never
        # produced and hide the actual cause.
        raise

    if pdfa_level in (2, 3):
        # PDF/A-2 allows ObjStm
        assert b'/ObjStm' in outpdf.read_bytes()
    elif pdfa_level == 1:
        # PDF/A-1 might allow ObjStm, but Acrobat does not approve it, so
        # we don't use it
        assert b'/ObjStm' not in outpdf.read_bytes()

    with pikepdf.open(outpdf) as pdf, pdf.open_metadata() as m:
        assert m.pdfa_status == f'{pdfa_level}B'
================================================
FILE: tests/test_pdfinfo.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import pickle
import warnings
from io import BytesIO
from math import isclose
import img2pdf
import pikepdf
import pytest
from PIL import Image
from reportlab.lib.units import inch
from reportlab.pdfgen.canvas import Canvas
from ocrmypdf import pdfinfo
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution
from ocrmypdf.pdfinfo import Colorspace, Encoding
from ocrmypdf.pdfinfo._contentstream import _interpret_contents
from ocrmypdf.pdfinfo.layout import PDFPage
# Ignore DeprecationWarnings whose originating module matches
# reportlab.lib.rl_safe_eval.
warnings.filterwarnings(
    "ignore", category=DeprecationWarning, module="reportlab.lib.rl_safe_eval"
)
# pylint: disable=protected-access
@pytest.fixture
def single_page_text(outdir):
    """Write a one-page 8in x 6in PDF holding one line of 12pt Helvetica text.

    Returns the path of the generated file.
    """
    target = outdir / 'text.pdf'
    canvas = Canvas(str(target), pagesize=(8 * inch, 6 * inch))

    text_object = canvas.beginText()
    text_object.setFont('Helvetica', 12)
    text_object.setTextOrigin(1 * inch, 3 * inch)
    text_object.textLine(
        "Methink'st thou art a general offence and every man should beat thee."
    )
    canvas.drawText(text_object)

    canvas.showPage()
    canvas.save()
    return target
def test_single_page_text(single_page_text):
    """A text-only PDF is reported as one page that has text and no images."""
    document = pdfinfo.PdfInfo(single_page_text)
    assert len(document) == 1

    first_page = document[0]
    assert first_page.has_text
    assert len(first_page.images) == 0
@pytest.fixture(scope='session')
def eight_by_eight():
    """Return an 8x8 1-bit image that is all 0 except for a main diagonal of 1s."""
    bitmap = Image.new('1', (8, 8), 0)
    for diagonal in range(8):
        bitmap.putpixel((diagonal, diagonal), 1)
    return bitmap
@pytest.fixture
def eight_by_eight_regular_image(eight_by_eight, outpdf):
    """Convert the 8x8 test image to a PDF at 8 DPI via img2pdf.

    The image is serialised to PNG in memory, then written to ``outpdf``,
    whose path is returned.
    """
    png_buffer = BytesIO()
    eight_by_eight.save(png_buffer, format='PNG')
    png_buffer.seek(0)

    # Declare 8 DPI on both axes
    dpi_spec = (img2pdf.ImgSize.dpi, 8)
    layout = img2pdf.get_layout_fun(None, (dpi_spec, dpi_spec), None, None, None)

    with outpdf.open('wb') as pdf_stream:
        img2pdf.convert(
            png_buffer,
            producer="img2pdf",
            layout_fun=layout,
            outputstream=pdf_stream,
            **IMG2PDF_KWARGS,
        )
    return outpdf
def test_single_page_image(eight_by_eight_regular_image):
    """The image-only PDF reports one gray, 8 px wide image at 8 DPI and no text."""
    document = pdfinfo.PdfInfo(eight_by_eight_regular_image)
    assert len(document) == 1

    first_page = document[0]
    assert not first_page.has_text
    assert len(first_page.images) == 1

    embedded = first_page.images[0]
    assert embedded.width == 8
    assert embedded.color == Colorspace.gray

    # In a 1"x1" placement the DPI equals the pixel width
    for axis_dpi in (embedded.dpi.x, embedded.dpi.y):
        assert isclose(axis_dpi, 8)
@pytest.fixture
def eight_by_eight_inline_image(eight_by_eight, outpdf):
    """Write a PDF that embeds the 8x8 test image as an inline image.

    Returns ``outpdf``.
    """
    canvas = Canvas(str(outpdf), pagesize=(8 * 72, 6 * 72))
    # 72x72 pt is a 1"x1" placement at the page origin
    canvas.drawInlineImage(eight_by_eight, 0, 0, width=72, height=72)
    canvas.showPage()
    canvas.save()
    return outpdf
def test_single_page_inline_image(eight_by_eight_inline_image):
    """The inline image is reported as gray, 8 px wide, with a horizontal DPI of 8."""
    document = pdfinfo.PdfInfo(eight_by_eight_inline_image)
    print(document)

    inline = document[0].images[0]
    assert isclose(inline.dpi.x, 8)
    assert inline.color == Colorspace.gray
    assert inline.width == 8
def test_jpeg(resources):
    """The first image of c02-22.pdf is reported as JPEG with a horizontal DPI of 150."""
    document = pdfinfo.PdfInfo(resources / 'c02-22.pdf')
    first_image = document[0].images[0]

    assert first_image.enc == Encoding.jpeg
    assert isclose(first_image.dpi.x, 150)
@pytest.fixture
def flate_jpeg_pdf(outpdf):
    """Build a one-page PDF whose image uses the FlateDecode+DCTDecode filter chain.

    A 64x64 RGB image is encoded as JPEG, the JPEG bytes are zlib-compressed,
    and the result is stored as an image XObject drawn over a 72x72 pt page.
    Returns ``outpdf``.
    """
    from zlib import compress

    # Solid-colour RGB image, JPEG-encoded in memory
    jpeg_buffer = BytesIO()
    Image.new('RGB', (64, 64), color=(128, 64, 192)).save(jpeg_buffer, format='JPEG')

    # Second compression layer on top of the JPEG bytes
    deflated_jpeg = compress(jpeg_buffer.getvalue())

    with pikepdf.Pdf.new() as pdf:
        pdf.add_blank_page(page_size=(72, 72))
        first_page = pdf.pages[0]

        image_xobject = pikepdf.Stream(
            pdf,
            deflated_jpeg,
            BitsPerComponent=8,
            ColorSpace=pikepdf.Name.DeviceRGB,
            Filter=[pikepdf.Name.FlateDecode, pikepdf.Name.DCTDecode],
            Height=64,
            Subtype=pikepdf.Name.Image,
            Type=pikepdf.Name.XObject,
            Width=64,
        )
        resource_name = first_page.add_resource(
            image_xobject, pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        # Scale the unit image to cover the whole page and draw it
        first_page.Contents = pikepdf.Stream(
            pdf, b"q 72 0 0 72 0 0 cm %s Do Q" % bytes(resource_name)
        )
        pdf.save(outpdf)
    return outpdf
def test_flate_jpeg(flate_jpeg_pdf):
    """An image filtered with FlateDecode+DCTDecode is classified as flate_jpeg."""
    document = pdfinfo.PdfInfo(flate_jpeg_pdf)
    first_image = document[0].images[0]
    assert first_image.enc == Encoding.flate_jpeg
def test_form_xobject(resources):
    """The first image reported for formxobject.pdf is 50 px wide."""
    document = pdfinfo.PdfInfo(resources / 'formxobject.pdf')
    first_image = document[0].images[0]
    assert first_image.width == 50
def test_no_contents(resources):
    """The first page of no_contents.pdf reports no images and no text."""
    document = pdfinfo.PdfInfo(resources / 'no_contents.pdf')
    first_page = document[0]
    assert len(first_page.images) == 0
    assert not first_page.has_text
def test_oversized_page(resources):
    """The first image of poster.pdf must satisfy the oversize check."""
    document = pdfinfo.PdfInfo(resources / 'poster.pdf')
    poster_image = document[0].images[0]
    assert (
        poster_image.width * poster_image.dpi.x > 200
    ), "this is supposed to be oversized"
def test_pickle(resources):
    """PdfInfo for graph_ocred.pdf can be serialised with pickle."""
    # Multiprocessing requires our information to be picklable. A failure
    # here most likely means some unpicklable pikepdf or other external
    # object is being kept around.
    document = pdfinfo.PdfInfo(resources / 'graph_ocred.pdf')
    pickle.dumps(document)
def test_vector(resources):
    """The first page of vector.pdf has vector content and no text."""
    first_page = pdfinfo.PdfInfo(resources / 'vector.pdf')[0]
    assert first_page.has_vector
    assert not first_page.has_text
def test_ocr_detection(resources):
    """The first page of graph_ocred.pdf has text and no vector content."""
    first_page = pdfinfo.PdfInfo(resources / 'graph_ocred.pdf')[0]
    assert not first_page.has_vector
    assert first_page.has_text
@pytest.mark.parametrize(
    'testfile', ('truetype_font_nomapping.pdf', 'type3_font_nomapping.pdf')
)
def test_corrupt_font_detection(resources, testfile):
    """Detailed analysis flags the first page of each test file as corrupt text."""
    document = pdfinfo.PdfInfo(resources / testfile, detailed_analysis=True)
    first_page = document[0]
    assert first_page.has_corrupt_text
def test_stack_abuse():
    """Unbalanced q/Q operators produce warnings, and extreme nesting raises."""
    owner = pikepdf.Pdf.new()

    # 35 unmatched pushes warn about overflow; surplus pops warn about underflow
    warning_cases = (
        (b'q ' * 35, "overflowed"),
        (b'q Q Q Q Q', "underflowed"),
    )
    for content, expected_message in warning_cases:
        contents = pikepdf.Stream(owner, content)
        with pytest.warns(UserWarning, match=expected_message):
            _interpret_contents(contents)

    # 135 unmatched pushes warn and also raise RuntimeError
    contents = pikepdf.Stream(owner, b'q ' * 135)
    with pytest.warns(UserWarning), pytest.raises(RuntimeError):
        _interpret_contents(contents)
def test_pages_issue700(monkeypatch, resources):
    """When PDFPage.get_pages yields nothing, an InputFileError naming pdfminer is raised."""
    # Replace page enumeration with one that returns an empty iterator
    monkeypatch.setattr(PDFPage, 'get_pages', lambda *args, **kwargs: iter([]))

    with pytest.raises(InputFileError, match="pdfminer"):
        info = pdfinfo.PdfInfo(
            resources / 'cardinal.pdf',
            detailed_analysis=True,
            progbar=False,
            max_workers=1,
        )
        info._miner_state.get_page_analysis(0)
@pytest.fixture
def image_scale0(resources, outpdf):
    """Write a PDF that draws a form XObject through an all-zero transform matrix.

    The first page of cmyk.pdf is copied in as a form XObject and placed on a
    blank 72x72 pt page. Returns ``outpdf``.
    """
    with pikepdf.open(resources / 'cmyk.pdf') as source_pdf:
        form = source_pdf.pages[0].as_form_xobject()

        target_pdf = pikepdf.Pdf.new()
        target_pdf.add_blank_page(page_size=(72, 72))
        target_page = target_pdf.pages[0]

        resource_name = target_page.add_resource(
            target_pdf.copy_foreign(form), pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        print(resource_name)

        # Every matrix entry is zero, so the object is drawn at zero scale
        target_page.Contents = pikepdf.Stream(
            target_pdf, b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(resource_name)
        )
        target_pdf.save(outpdf)
    return outpdf
def test_image_scale0(image_scale0):
    """A zero-scaled image has a non-finite DPI and the page DPI is Resolution(0, 0)."""
    info = pdfinfo.PdfInfo(
        image_scale0, detailed_analysis=True, progbar=False, max_workers=1
    )
    first_page = info.pages[0]
    assert not first_page._images[0].dpi.is_finite
    assert first_page.dpi == Resolution(0, 0)
================================================
FILE: tests/test_pipeline.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import warnings
from unittest.mock import Mock
import pytest
from PIL import Image
from reportlab.lib.units import inch
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen.canvas import Canvas
from ocrmypdf import _pipeline, pdfinfo
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo import Encoding
# Ignore DeprecationWarnings whose originating module matches
# reportlab.lib.rl_safe_eval.
warnings.filterwarnings(
    "ignore", category=DeprecationWarning, module="reportlab.lib.rl_safe_eval"
)
@pytest.fixture(scope='session')
def rgb_image():
    """Return an ImageReader over an 8x8 RGB image with red, green and blue pixels.

    The three coloured pixels sit on the diagonal at (4, 4), (5, 5) and (6, 6).
    """
    picture = Image.new('RGB', (8, 8))
    coloured_pixels = (
        ((4, 4), (255, 0, 0)),
        ((5, 5), (0, 255, 0)),
        ((6, 6), (0, 0, 255)),
    )
    for position, colour in coloured_pixels:
        picture.putpixel(position, colour)
    return ImageReader(picture)
# Oversample resolution whose first component is fed to the mocked options in
# test_dpi_needed.
DUMMY_OVERSAMPLE_RESOLUTION = Resolution(42.0, 42.0)
# Square resolution built from the pipeline's VECTOR_PAGE_DPI constant.
VECTOR_RESOLUTION = Resolution(_pipeline.VECTOR_PAGE_DPI, _pipeline.VECTOR_PAGE_DPI)
@pytest.mark.parametrize(
    'image, text, vector, result',
    [
        (False, False, False, VECTOR_RESOLUTION),
        (False, True, False, VECTOR_RESOLUTION),
        (True, False, False, DUMMY_OVERSAMPLE_RESOLUTION),
        (True, True, False, VECTOR_RESOLUTION),
        (False, False, True, VECTOR_RESOLUTION),
        (False, True, True, VECTOR_RESOLUTION),
        (True, False, True, VECTOR_RESOLUTION),
        (True, True, True, VECTOR_RESOLUTION),
    ],
)
def test_dpi_needed(image, text, vector, result, rgb_image, outdir):
    """Check the square DPI chosen for each mix of image, text and vector content.

    A 5in x 5in page is drawn with the requested content, analysed, and handed
    to both DPI helpers through a mocked context.
    """
    pdf_path = outdir / 'dpi.pdf'
    canvas = Canvas(str(pdf_path), pagesize=(5 * inch, 5 * inch))
    if image:
        canvas.drawImage(
            rgb_image, 1 * inch, 1 * inch, width=1 * inch, height=1 * inch
        )
    if text:
        canvas.drawString(1 * inch, 4 * inch, "Actual text")
    if vector:
        canvas.ellipse(3 * inch, 3 * inch, 4 * inch, 4 * inch)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(pdf_path)

    context = Mock()
    context.options.oversample = DUMMY_OVERSAMPLE_RESOLUTION[0]
    context.pageinfo = info[0]

    assert _pipeline.get_canvas_square_dpi(context) == result
    assert _pipeline.get_page_square_dpi(context) == result
@pytest.mark.parametrize(
    # The leading name only makes -v output nicer
    'name,input,output',
    (
        ('empty_input', (), ()),
        ('no_values', ('', '', '', '', ''), (((1, 5), None),)),
        (
            'no_empty_values',
            ('v', 'w', 'x', 'y', 'z'),
            (((1, 1), 'v'), ((2, 2), 'w'), ((3, 3), 'x'), ((4, 4), 'y'), ((5, 5), 'z')),
        ),
        (
            'skip_head',
            ('', '', 'x', 'y', 'z'),
            (((1, 2), None), ((3, 3), 'x'), ((4, 4), 'y'), ((5, 5), 'z')),
        ),
        (
            'skip_tail',
            ('x', 'y', 'z', '', ''),
            (((1, 1), 'x'), ((2, 2), 'y'), ((3, 3), 'z'), ((4, 5), None)),
        ),
        (
            'range_in_middle',
            ('x', '', '', '', 'y'),
            (((1, 1), 'x'), ((2, 4), None), ((5, 5), 'y')),
        ),
        (
            'range_in_middle_2',
            ('x', '', '', 'y', '', '', '', 'z'),
            (
                ((1, 1), 'x'),
                ((2, 3), None),
                ((4, 4), 'y'),
                ((5, 7), None),
                ((8, 8), 'z'),
            ),
        ),
    ),
)
def test_enumerate_compress_ranges(name, input, output):
    """Each input sequence must produce the listed ((start, end), value) ranges.

    In the expected data, runs of empty strings appear as a single range whose
    value is None, and positions are 1-based.
    """
    produced = tuple(_pipeline.enumerate_compress_ranges(input))
    assert output == produced
@pytest.mark.parametrize(
    'encodings, expected',
    [
        # No images at all -> False
        ([], False),
        # Only JPEG-based encodings -> True
        ([Encoding.jpeg], True),
        ([Encoding.flate_jpeg], True),
        ([Encoding.jpeg, Encoding.flate_jpeg], True),
        # Any non-JPEG encoding present -> False
        ([Encoding.flate], False),
        ([Encoding.jpeg, Encoding.flate], False),
        ([Encoding.flate_jpeg, Encoding.flate], False),
    ],
)
def test_should_visible_page_image_use_jpg(encodings, expected):
    """should_visible_page_image_use_jpg treats flate_jpeg like jpeg."""
    fake_images = [Mock(enc=encoding) for encoding in encodings]
    fake_page = Mock()
    fake_page.images = fake_images
    assert _pipeline.should_visible_page_image_use_jpg(fake_page) == expected
================================================
FILE: tests/test_pipeline_generate_ocr.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Unit tests for pipeline support of generate_ocr().
These tests verify that the pipeline supports the new generate_ocr() API
alongside the existing hOCR path.
"""
from __future__ import annotations
import dataclasses
from pathlib import Path
from unittest.mock import MagicMock, patch
from ocrmypdf import BoundingBox, OcrElement
class TestOcrEngineDirect:
    """Checks for the ``ocr_engine_direct()`` pipeline function."""

    def test_ocr_engine_direct_function_exists(self):
        """The _pipeline module exposes an ocr_engine_direct attribute."""
        from ocrmypdf import _pipeline

        assert hasattr(_pipeline, 'ocr_engine_direct')

    def test_ocr_engine_direct_returns_tuple(self, tmp_path):
        """Calling ocr_engine_direct with a mocked engine yields a 2-tuple."""
        from ocrmypdf._pipeline import ocr_engine_direct

        # Engine stub that advertises generate_ocr support and returns a page
        # element together with its text
        engine = MagicMock()
        engine.supports_generate_ocr.return_value = True
        engine.generate_ocr.return_value = (
            OcrElement(ocr_class='ocr_page', bbox=BoundingBox(0, 0, 100, 100)),
            "test text",
        )

        # Page context stub wired to hand out that engine
        page_context = MagicMock()
        page_context.plugin_manager.get_ocr_engine.return_value = engine
        page_context.get_path.return_value = tmp_path / Path("test.txt")
        page_context.pageno = 0

        with patch('builtins.open', MagicMock()):
            outcome = ocr_engine_direct(Path("test.png"), page_context)

        assert isinstance(outcome, tuple)
        assert len(outcome) == 2
class TestPageResultExtension:
    """Checks for the ocr_tree field added to the PageResult NamedTuple."""

    def test_page_result_has_ocr_tree_field(self):
        """ocr_tree is listed among PageResult's fields."""
        from ocrmypdf._pipelines._common import PageResult

        # NamedTuples expose their field names through _fields
        field_names = PageResult._fields
        assert 'ocr_tree' in field_names

    def test_page_result_ocr_tree_default_none(self):
        """A PageResult built from pageno alone has ocr_tree set to None."""
        from ocrmypdf._pipelines._common import PageResult

        assert PageResult(pageno=0).ocr_tree is None
class TestFpdf2DirectPage:
    """Checks for the Fpdf2DirectPage dataclass in ocrmypdf._graft."""

    def test_fpdf2_direct_page_exists(self):
        """Fpdf2DirectPage can be imported and is not None."""
        from ocrmypdf._graft import Fpdf2DirectPage

        assert Fpdf2DirectPage is not None

    def test_fpdf2_direct_page_has_ocr_tree(self):
        """ocr_tree is one of Fpdf2DirectPage's dataclass fields."""
        from ocrmypdf._graft import Fpdf2DirectPage

        field_names = {
            field.name for field in dataclasses.fields(Fpdf2DirectPage)
        }
        assert 'ocr_tree' in field_names
class TestHOCRResultExtension:
    """Checks for the ocr_tree field added to the HOCRResult dataclass."""

    def test_hocr_result_has_ocr_tree_field(self):
        """ocr_tree is one of HOCRResult's dataclass fields."""
        from ocrmypdf._pipelines._common import HOCRResult

        field_names = {field.name for field in dataclasses.fields(HOCRResult)}
        assert 'ocr_tree' in field_names

    def test_hocr_result_ocr_tree_default_none(self):
        """A HOCRResult built from pageno alone has ocr_tree set to None."""
        from ocrmypdf._pipelines._common import HOCRResult

        assert HOCRResult(pageno=0).ocr_tree is None
================================================
FILE: tests/test_preprocessing.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
from math import isclose
import pytest
from PIL import Image
from ocrmypdf._exec import ghostscript, tesseract
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo import PdfInfo
from ocrmypdf.pluginspec import GhostscriptRasterDevice
from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf
# PDF renderer names passed to --pdf-renderer by the parametrized resolution tests.
RENDERERS = ['fpdf2', 'sandwich']
def test_deskew(resources, outdir):
    """Deskew skew.pdf and confirm the remaining skew is under half a degree."""
    # Produce the deskewed PDF
    straightened_pdf = check_ocrmypdf(
        resources / 'skew.pdf', outdir / 'skew.pdf', '-d'
    )

    # Rasterize its first page back to a monochrome PNG
    straightened_png = outdir / 'deskewed.png'
    ghostscript.rasterize_pdf(
        straightened_pdf,
        straightened_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=Resolution(150, 150),
        pageno=1,
    )

    # Ask Tesseract for the residual skew angle to confirm the deskew worked
    residual_angle = tesseract.get_deskew(straightened_png, [], None, 5.0)
    print(residual_angle)
    assert -0.5 < residual_angle < 0.5, "Deskewing failed"
def test_deskew_blank_page(resources, outpdf):
    """Deskewing blank.pdf must complete even though Tesseract dislikes blank pages."""
    blank_input = resources / 'blank.pdf'
    check_ocrmypdf(blank_input, outpdf, '--deskew')
@pytest.mark.xfail(reason="remove background disabled")
def test_remove_background(resources, outdir):
    """After --remove-background the page should span pure black to pure white."""
    full_range = ((0, 255), (0, 255), (0, 255))
    source_image = resources / 'baiona_color.jpg'

    # Precondition: the input must not already cover the full range
    with Image.open(source_image) as original:
        assert original.getextrema() != full_range

    cleaned_pdf = check_ocrmypdf(
        source_image,
        outdir / 'test_remove_bg.pdf',
        '--remove-background',
        '--image-dpi',
        '150',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    cleaned_png = outdir / 'remove_bg.png'
    ghostscript.rasterize_pdf(
        cleaned_pdf,
        cleaned_png,
        raster_device=GhostscriptRasterDevice.PNG16M,
        raster_dpi=Resolution(100, 100),
        pageno=1,
    )

    # Every channel of the output should reach both 0 and 255
    with Image.open(cleaned_png) as rendered:
        assert rendered.getextrema() == full_range
# 5 input files * 2 renderers * 2 output types = 20 test cases
@pytest.mark.parametrize(
    "pdf", ['palette.pdf', 'cmyk.pdf', 'ccitt.pdf', 'jbig2.pdf', 'lichtenstein.pdf']
)
@pytest.mark.parametrize("renderer", ['sandwich', 'fpdf2'])
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
def test_exotic_image(pdf, renderer, output_type, resources, outdir):
    """OCR each unusual-image PDF and confirm that a sidecar text file is written."""
    result_pdf = outdir / f'test_{pdf}_{renderer}.pdf'
    # Add the -c flag to deskew only when unpaper is installed
    preprocess_flag = '-dc' if have_unpaper() else '-d'

    check_ocrmypdf(
        resources / pdf,
        result_pdf,
        preprocess_flag,
        '-v',
        '1',
        '--output-type',
        output_type,
        '--sidecar',
        '--skip-text',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )

    sidecar = result_pdf.with_suffix('.pdf.txt')
    assert sidecar.exists()
@pytest.mark.parametrize('renderer', RENDERERS)
def test_non_square_resolution(renderer, resources, outpdf):
    """A page with unequal x/y DPI keeps that resolution after processing."""
    source = resources / 'aspect.pdf'

    # Precondition: the input really has a non-square resolution
    before = PdfInfo(source)
    assert before[0].dpi.x != before[0].dpi.y

    completed = run_ocrmypdf(
        source,
        outpdf,
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )
    # PDF/A conversion can fail for this file if Ghostscript >= 10.3, so the
    # exit code is only checked when that is not what happened
    if completed.returncode != ExitCode.pdfa_conversion_failed:
        completed.check_returncode()

    after = PdfInfo(outpdf)
    # The resolution must be unchanged
    assert before[0].dpi == after[0].dpi
@pytest.mark.parametrize('renderer', RENDERERS)
def test_convert_to_square_resolution(renderer, resources, outpdf):
    """--force-ocr turns a non-square resolution into a square one, keeping page size."""
    source = resources / 'aspect.pdf'

    # Precondition: the input really has a non-square resolution
    before = PdfInfo(source)
    assert before[0].dpi.x != before[0].dpi.y

    # --force-ocr means a forced conversion to square resolution
    check_ocrmypdf(
        source,
        outpdf,
        '--force-ocr',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )

    after = PdfInfo(outpdf)
    page_in, page_out = before[0], after[0]

    # Resolution should now be equal on both axes
    assert page_out.dpi.x == page_out.dpi.y

    # The page keeps its physical size
    assert isclose(page_in.width_inches, page_out.width_inches)
    assert isclose(page_in.height_inches, page_out.height_inches)

    # The page was rasterized into a new image, so that image should cover
    # the entire page
    image_width_inches = page_out.images[0].width / page_out.images[0].dpi.x
    image_height_inches = page_out.images[0].height / page_out.images[0].dpi.y
    assert isclose(page_out.width_inches, image_width_inches)
    assert isclose(page_out.height_inches, image_height_inches)
================================================
FILE: tests/test_quality.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
from ocrmypdf import quality as qual
def test_quality_measurement():
    """Check dictionary de-duplication and the word-match ratio on three samples."""
    vocabulary = ["words", "words", "quick", "brown", "fox", "dog", "lazy"]
    quality = qual.OcrQualityDictionary(wordlist=vocabulary)
    assert len(quality.dictionary) == 6  # 6 unique

    pangram_score = quality.measure_words_matched(
        "The quick brown fox jumps quickly over the lazy dog"
    )
    assert pangram_score == 0.5

    assert quality.measure_words_matched("12345 10% _f 7fox -brown | words") == 1.0
    assert quality.measure_words_matched("quick quick quick") == 1.0
================================================
FILE: tests/test_rasterizer.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Tests for the --rasterizer CLI option."""
from __future__ import annotations
from io import BytesIO
import img2pdf
import pikepdf
import pytest
from PIL import Image
from ocrmypdf._options import OcrOptions
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution
from .conftest import check_ocrmypdf
# Check if pypdfium2 is available. The import is attempted only for its
# success or failure; PYPDFIUM_AVAILABLE records the outcome and is used by the
# skipif markers and conditional assertions in this module.
try:
    import pypdfium2  # noqa: F401
    PYPDFIUM_AVAILABLE = True
except ImportError:
    PYPDFIUM_AVAILABLE = False
class TestRasterizerOption:
    """Checks for the --rasterizer CLI option."""

    @staticmethod
    def _ocr_graph(rasterizer, resources, outpdf):
        """Run OCRmyPDF on graph.pdf with ``rasterizer`` and the tesseract_noop plugin."""
        check_ocrmypdf(
            resources / 'graph.pdf',
            outpdf,
            '--rasterizer',
            rasterizer,
            '--plugin',
            'tests/plugins/tesseract_noop.py',
        )

    def test_rasterizer_auto_default(self, resources, outpdf):
        """--rasterizer auto (the default) completes successfully."""
        self._ocr_graph('auto', resources, outpdf)

    def test_rasterizer_ghostscript(self, resources, outpdf):
        """--rasterizer ghostscript completes successfully."""
        self._ocr_graph('ghostscript', resources, outpdf)

    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
    def test_rasterizer_pypdfium(self, resources, outpdf):
        """--rasterizer pypdfium completes successfully when pypdfium2 is installed."""
        self._ocr_graph('pypdfium', resources, outpdf)

    def test_rasterizer_invalid(self):
        """An unknown rasterizer name is rejected with ValueError."""
        with pytest.raises(ValueError, match="rasterizer must be one of"):
            OcrOptions(
                input_file='test.pdf',
                output_file='out.pdf',
                rasterizer='invalid',
            )
class TestRasterizerWithRotation:
    """Checks for --rasterizer combined with --rotate-pages."""

    @staticmethod
    def _ocr_cardinal_rotated(rasterizer, resources, outpdf):
        """Run OCRmyPDF on cardinal.pdf with page rotation and ``rasterizer``."""
        check_ocrmypdf(
            resources / 'cardinal.pdf',
            outpdf,
            '--rasterizer',
            rasterizer,
            '--rotate-pages',
            '--rotate-pages-threshold',
            '0.1',
            '--plugin',
            'tests/plugins/tesseract_cache.py',
        )

    def test_ghostscript_with_rotation(self, resources, outpdf):
        """The Ghostscript rasterizer works together with page rotation."""
        self._ocr_cardinal_rotated('ghostscript', resources, outpdf)

    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
    def test_pypdfium_with_rotation(self, resources, outpdf):
        """The pypdfium rasterizer works together with page rotation."""
        self._ocr_cardinal_rotated('pypdfium', resources, outpdf)

    def test_auto_with_rotation(self, resources, outpdf):
        """The auto rasterizer works together with page rotation."""
        self._ocr_cardinal_rotated('auto', resources, outpdf)
class TestRasterizerHookDirect:
    """Call the rasterize_pdf_page hook directly under each rasterizer option."""

    @staticmethod
    def _rasterize_graph(resources, tmp_path, rasterizer, image_name):
        """Invoke the rasterize_pdf_page hook on page 1 of graph.pdf.

        Options are built with the requested ``rasterizer``; the page is
        rendered to ``tmp_path / image_name`` as pngmono at 50 DPI.
        Returns ``(image_path, hook_result)``.
        """
        plugin_manager = get_plugin_manager([])
        options = OcrOptions(
            input_file=resources / 'graph.pdf',
            output_file=tmp_path / 'out.pdf',
            rasterizer=rasterizer,
        )
        image_path = tmp_path / image_name
        hook_result = plugin_manager.rasterize_pdf_page(
            input_file=resources / 'graph.pdf',
            output_file=image_path,
            raster_device='pngmono',
            raster_dpi=Resolution(50, 50),
            page_dpi=Resolution(50, 50),
            pageno=1,
            rotation=0,
            filter_vector=False,
            stop_on_soft_error=True,
            options=options,
            use_cropbox=False,
        )
        return image_path, hook_result

    def test_ghostscript_hook_respects_option(self, resources, tmp_path):
        """Test that Ghostscript hook returns None when pypdfium is requested."""
        image_path, hook_result = self._rasterize_graph(
            resources, tmp_path, 'pypdfium', 'ghostscript_test.png'
        )
        # When pypdfium is requested:
        # - If pypdfium IS available, pypdfium handles it and returns the path
        # - If pypdfium is NOT available, both plugins return None
        #   (ghostscript returns None because pypdfium was requested,
        #   pypdfium returns None because it's not installed)
        if PYPDFIUM_AVAILABLE:
            assert hook_result == image_path
        else:
            assert hook_result is None

    def test_pypdfium_hook_respects_option(self, resources, tmp_path):
        """Test that pypdfium hook returns None when ghostscript is requested."""
        image_path, hook_result = self._rasterize_graph(
            resources, tmp_path, 'ghostscript', 'pypdfium_test.png'
        )
        # Ghostscript should handle it
        assert hook_result == image_path
        assert image_path.exists()

    def test_auto_uses_pypdfium_when_available(self, resources, tmp_path):
        """Test that auto mode uses pypdfium when available."""
        image_path, hook_result = self._rasterize_graph(
            resources, tmp_path, 'auto', 'auto_test.png'
        )
        assert hook_result == image_path
        assert image_path.exists()
def _create_gradient_image(width: int, height: int) -> Image.Image:
    """Create an RGB test image with gradients meant to expose rasterization errors.

    Per pixel:
    - red falls from 255 to 0 and blue rises from 0 to 255, left to right
    - green rises from 0 to 255, top to bottom
    - alternating diagonal bands (20 px along x + y) are brightened by 40 on
      every channel, capped at 255
    """
    canvas = Image.new('RGB', (width, height))
    raster = canvas.load()
    for row in range(height):
        # Green depends only on the row
        base_green = int(255 * (row / height))
        for col in range(width):
            base_red = int(255 * (1 - col / width))
            base_blue = int(255 * (col / width))
            # Every second diagonal band is brightened
            boost = 40 if ((col + row) // 20) % 2 else 0
            raster[col, row] = (
                min(255, base_red + boost),
                min(255, base_green + boost),
                min(255, base_blue + boost),
            )
    return canvas
@pytest.fixture
def pdf_with_nonstandard_boxes(tmp_path):
    """Build a one-page PDF whose MediaBox, CropBox and TrimBox all differ.

    A 200x300 gradient image is converted to PDF at 72 DPI, then the page
    boxes are overwritten. Returns the path of the saved file.
    """
    # Gradient image, PNG-encoded in memory
    png_buffer = BytesIO()
    _create_gradient_image(200, 300).save(png_buffer, format='PNG')

    # Wrap the PNG in a PDF
    pdf_buffer = BytesIO()
    img2pdf.convert(
        png_buffer.getvalue(),
        layout_fun=img2pdf.get_fixed_dpi_layout_fun((72, 72)),
        outputstream=pdf_buffer,
        **IMG2PDF_KWARGS,
    )
    pdf_buffer.seek(0)

    # Replace the page boxes with progressively smaller rectangles
    target = tmp_path / 'nonstandard_boxes.pdf'
    with pikepdf.open(pdf_buffer) as pdf:
        first_page = pdf.pages[0]
        # MediaBox larger than the content
        first_page.MediaBox = pikepdf.Array([0, 0, 400, 500])
        # CropBox smaller - this is what viewers typically show
        first_page.CropBox = pikepdf.Array([50, 50, 350, 450])
        # TrimBox smaller still - the intended trim area
        first_page.TrimBox = pikepdf.Array([75, 75, 325, 425])
        pdf.save(target)
    return target
@pytest.fixture
def pdf_with_negative_mediabox(tmp_path):
    """Build a one-page PDF whose MediaBox origin has negative coordinates.

    A 200x300 gradient image is converted to PDF at 72 DPI, then the MediaBox
    is overwritten. Returns the path of the saved file.
    """
    # Gradient image, PNG-encoded in memory
    png_buffer = BytesIO()
    _create_gradient_image(200, 300).save(png_buffer, format='PNG')

    pdf_buffer = BytesIO()
    img2pdf.convert(
        png_buffer.getvalue(),
        layout_fun=img2pdf.get_fixed_dpi_layout_fun((72, 72)),
        outputstream=pdf_buffer,
        **IMG2PDF_KWARGS,
    )
    pdf_buffer.seek(0)

    target = tmp_path / 'negative_mediabox.pdf'
    with pikepdf.open(pdf_buffer) as pdf:
        # A negative origin is valid PDF but unusual
        pdf.pages[0].MediaBox = pikepdf.Array([-100, -100, 300, 400])
        pdf.save(target)
    return target
class TestRasterizerWithNonStandardBoxes:
"""Test rasterizers with PDFs having nonstandard MediaBox/TrimBox/CropBox."""
def test_ghostscript_nonstandard_boxes(self, pdf_with_nonstandard_boxes, outpdf):
    """The Ghostscript rasterizer processes a PDF with nonstandard page boxes."""
    cli_options = (
        '--rasterizer',
        'ghostscript',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )
    check_ocrmypdf(pdf_with_nonstandard_boxes, outpdf, *cli_options)
@pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
def test_pypdfium_nonstandard_boxes(self, pdf_with_nonstandard_boxes, outpdf):
    """The pypdfium rasterizer processes a PDF with nonstandard page boxes."""
    cli_options = (
        '--rasterizer',
        'pypdfium',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )
    check_ocrmypdf(pdf_with_nonstandard_boxes, outpdf, *cli_options)
def test_ghostscript_negative_mediabox(self, pdf_with_negative_mediabox, outpdf):
"""Test Ghostscript handles negative MediaBox origin."""
check_ocrmypdf(
pdf_with_negative_mediabox,
outpdf,
'--rasterizer',
'ghostscript',
'--plugin',
'tests/plugins/tesseract_noop.py',
)
@pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
def test_pypdfium_negative_mediabox(self, pdf_with_negative_mediabox, outpdf):
"""Test pypdfium handles negative MediaBox origin."""
check_ocrmypdf(
pdf_with_negative_mediabox,
outpdf,
'--rasterizer',
'pypdfium',
'--plugin',
'tests/plugins/tesseract_noop.py',
)
def test_compare_rasterizers_nonstandard_boxes(
self, pdf_with_nonstandard_boxes, tmp_path
):
"""Compare output dimensions between rasterizers for nonstandard boxes."""
pm = get_plugin_manager([])
options_gs = OcrOptions(
input_file=pdf_with_nonstandard_boxes,
output_file=tmp_path / 'out_gs.pdf',
rasterizer='ghostscript',
)
img_gs = tmp_path / 'gs.png'
pm.rasterize_pdf_page(
input_file=pdf_with_nonstandard_boxes,
output_file=img_gs,
raster_device='png16m',
raster_dpi=Resolution(72, 72),
page_dpi=Resolution(72, 72),
pageno=1,
rotation=0,
filter_vector=False,
stop_on_soft_error=True,
options=options_gs,
use_cropbox=False,
)
with Image.open(img_gs) as im_gs:
gs_size = im_gs.size
if PYPDFIUM_AVAILABLE:
options_pdfium = OcrOptions(
input_file=pdf_with_nonstandard_boxes,
output_file=tmp_path / 'out_pdfium.pdf',
rasterizer='pypdfium',
)
img_pdfium = tmp_path / 'pdfium.png'
pm.rasterize_pdf_page(
input_file=pdf_with_nonstandard_boxes,
output_file=img_pdfium,
raster_device='png16m',
raster_dpi=Resolution(72, 72),
page_dpi=Resolution(72, 72),
pageno=1,
rotation=0,
filter_vector=False,
stop_on_soft_error=True,
options=options_pdfium,
use_cropbox=False,
)
with Image.open(img_pdfium) as im_pdfium:
pdfium_size = im_pdfium.size
# Both rasterizers should now produce MediaBox dimensions (400x500)
# when use_cropbox=False (the default)
assert gs_size == (400, 500), f"Ghostscript size: {gs_size}"
assert pdfium_size == (400, 500), f"pypdfium size: {pdfium_size}"
class TestRasterizerWithRotationAndBoxes:
    """Test rasterizer + rotation + nonstandard boxes combinations."""

    # The pdf_with_nonstandard_boxes fixture creates a PDF with:
    # - MediaBox: [0, 0, 400, 500] → 400x500 points
    # - CropBox: [50, 50, 350, 450] → 300x400 points
    # - TrimBox: [75, 75, 325, 425] → 250x350 points
    #
    # With use_cropbox=False (default), both rasterizers use MediaBox
    MEDIABOX_WIDTH = 400
    MEDIABOX_HEIGHT = 500

    # Rotations, in degrees, exercised by every test in this class.
    _ROTATIONS = (0, 90, 180, 270)

    def _get_expected_size(self, rotation: int) -> tuple[int, int]:
        """Get expected image dimensions after rotation."""
        width, height = self.MEDIABOX_WIDTH, self.MEDIABOX_HEIGHT
        if rotation in (0, 180):
            return (width, height)
        # 90, 270: width and height trade places
        return (height, width)

    @staticmethod
    def _rasterize(pm, pdf, img_path, options, rotation):
        """Rasterize page 1 of ``pdf`` to ``img_path`` at 72 DPI.

        Every test in this class uses the same rasterization settings and
        varies only the rasterizer (carried by ``options``) and the rotation.
        """
        pm.rasterize_pdf_page(
            input_file=pdf,
            output_file=img_path,
            raster_device='png16m',
            raster_dpi=Resolution(72, 72),
            page_dpi=Resolution(72, 72),
            pageno=1,
            rotation=rotation,
            filter_vector=False,
            stop_on_soft_error=True,
            options=options,
            use_cropbox=False,
        )

    def _check_rotation_dimensions(self, pdf, tmp_path, rasterizer, stem):
        """Rasterize ``pdf`` at each rotation and check the image dimensions.

        Args:
            pdf: Path of the PDF to rasterize.
            tmp_path: Directory that receives the PNG output.
            rasterizer: Rasterizer name placed in the ``OcrOptions``.
            stem: Prefix for the PNG file names (``<stem>_rot<angle>.png``).
        """
        pm = get_plugin_manager([])
        options = OcrOptions(
            input_file=pdf,
            output_file=tmp_path / 'out.pdf',
            rasterizer=rasterizer,
        )
        for rotation in self._ROTATIONS:
            img_path = tmp_path / f'{stem}_rot{rotation}.png'
            self._rasterize(pm, pdf, img_path, options, rotation)
            assert img_path.exists(), f"Failed to rasterize with rotation {rotation}"
            with Image.open(img_path) as img:
                expected = self._get_expected_size(rotation)
                # Allow small tolerance for rounding
                assert abs(img.size[0] - expected[0]) <= 2, (
                    f"Width mismatch at {rotation}°: got {img.size[0]}, "
                    f"expected {expected[0]}"
                )
                assert abs(img.size[1] - expected[1]) <= 2, (
                    f"Height mismatch at {rotation}°: got {img.size[1]}, "
                    f"expected {expected[1]}"
                )

    def test_ghostscript_rotation_dimensions(
        self, pdf_with_nonstandard_boxes, tmp_path
    ):
        """Test Ghostscript produces correct dimensions with rotation."""
        self._check_rotation_dimensions(
            pdf_with_nonstandard_boxes, tmp_path, 'ghostscript', 'gs'
        )

    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
    def test_pypdfium_rotation_dimensions(self, pdf_with_nonstandard_boxes, tmp_path):
        """Test pypdfium produces correct dimensions with rotation."""
        self._check_rotation_dimensions(
            pdf_with_nonstandard_boxes, tmp_path, 'pypdfium', 'pdfium'
        )

    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
    def test_rasterizers_produce_same_dimensions(
        self, pdf_with_nonstandard_boxes, tmp_path
    ):
        """Verify ghostscript and pypdfium produce the same MediaBox dimensions.

        With use_cropbox=False (the default), both rasterizers should render
        to the MediaBox. Each result is checked against the expected MediaBox
        size, allowing a 2 pixel difference per axis.
        """
        pm = get_plugin_manager([])
        for rotation in self._ROTATIONS:
            # Rasterize with Ghostscript
            gs_options = OcrOptions(
                input_file=pdf_with_nonstandard_boxes,
                output_file=tmp_path / 'out.pdf',
                rasterizer='ghostscript',
            )
            gs_img_path = tmp_path / f'gs_cmp_rot{rotation}.png'
            self._rasterize(
                pm, pdf_with_nonstandard_boxes, gs_img_path, gs_options, rotation
            )

            # Rasterize with pypdfium
            pdfium_options = OcrOptions(
                input_file=pdf_with_nonstandard_boxes,
                output_file=tmp_path / 'out.pdf',
                rasterizer='pypdfium',
            )
            pdfium_img_path = tmp_path / f'pdfium_cmp_rot{rotation}.png'
            self._rasterize(
                pm,
                pdf_with_nonstandard_boxes,
                pdfium_img_path,
                pdfium_options,
                rotation,
            )

            # Verify both produce the same MediaBox dimensions
            with (
                Image.open(gs_img_path) as gs_img,
                Image.open(pdfium_img_path) as pdfium_img,
            ):
                expected = self._get_expected_size(rotation)
                for label, image in (('GS', gs_img), ('pdfium', pdfium_img)):
                    assert abs(image.size[0] - expected[0]) <= 2, (
                        f"{label} width at {rotation}°: {image.size[0]}, "
                        f"expected {expected[0]}"
                    )
                    assert abs(image.size[1] - expected[1]) <= 2, (
                        f"{label} height at {rotation}°: {image.size[1]}, "
                        f"expected {expected[1]}"
                    )
================================================
FILE: tests/test_rotation.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import operator
from io import BytesIO
from math import cos, pi, sin
from os import fspath
from subprocess import run
import img2pdf
import pikepdf
import pytest
from PIL import Image, ImageChops
from reportlab.pdfgen.canvas import Canvas
from ocrmypdf._exec import ghostscript
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution
from ocrmypdf.pdfinfo import PdfInfo
from ocrmypdf.pluginspec import GhostscriptRasterDevice
from .conftest import check_ocrmypdf, run_ocrmypdf_api
# pylintx: disable=unused-variable
# PDF renderer names that test_autorotate is parametrized over; each value is
# passed on the ocrmypdf command line as the --pdf-renderer argument.
RENDERERS = ['fpdf2', 'sandwich']
def compare_images_monochrome(
    outdir, reference_pdf, reference_pageno, test_pdf, test_pageno
):
    """Return the fraction of matching pixels between two rasterized PDF pages.

    Each page is rendered by Ghostscript to a 1-bit PNG at 100 DPI inside
    ``outdir``. A PNG that already exists is reused (its path is printed)
    instead of being rendered again.

    Args:
        outdir: Directory that holds the intermediate PNG files.
        reference_pdf: PDF containing the reference page.
        reference_pageno: Page number of the reference page.
        test_pdf: PDF containing the page under test.
        test_pageno: Page number of the page under test.

    Returns:
        The number of identical pixels divided by the total number of pixels.
    """
    reference_png = outdir / f'{reference_pdf.name}.ref{reference_pageno:04d}.png'
    test_png = outdir / f'{test_pdf.name}.test{test_pageno:04d}.png'

    render_jobs = (
        (reference_pdf, reference_pageno, reference_png),
        (test_pdf, test_pageno, test_png),
    )
    for pdf, pageno, png in render_jobs:
        if png.exists():
            # Already rendered by an earlier call; reuse it.
            print(png)
            continue
        ghostscript.rasterize_pdf(
            pdf,
            png,
            raster_device=GhostscriptRasterDevice.PNGMONO,
            raster_dpi=Resolution(100, 100),
            pageno=pageno,
            rotation=0,
        )

    with Image.open(reference_png) as reference_im, Image.open(test_png) as test_im:
        assert reference_im.mode == test_im.mode == '1'
        assert reference_im.size == test_im.size, "Images must be the same size"

        # XOR leaves 0 where the two images agree and 255 where they differ.
        xor_image = ImageChops.logical_xor(reference_im, test_im)

        # getcolors() yields (count, color) pairs; index them by color.
        histogram = dict((color, count) for count, color in xor_image.getcolors())
        matching = histogram.get(0, 0)
        differing = histogram.get(255, 0)
        return matching / (matching + differing)
def test_monochrome_comparison(resources, outdir):
    """Self-test of compare_images_monochrome using cardinal.pdf.

    A page compared with a differently rotated page must score poorly, and a
    page compared with itself must score highly.
    """
    cardinal = resources / 'cardinal.pdf'

    # Page 1 faces north and page 3 faces south, so they should not match.
    mismatched_score = compare_images_monochrome(outdir, cardinal, 1, cardinal, 3)
    assert mismatched_score < 0.90

    # The same page on both sides should match.
    identical_score = compare_images_monochrome(outdir, cardinal, 2, cardinal, 2)
    assert identical_score > 0.95
@pytest.mark.slow
@pytest.mark.parametrize('renderer', RENDERERS)
def test_autorotate(renderer, resources, outdir):
    """Check that every page of cardinal.pdf is rotated upright.

    cardinal.pdf contains four copies of an image rotated in each cardinal
    direction. The rotations are "burned in", not tagged with /Rotate. After
    processing, each output page is compared with the upright page 1.
    """
    source = resources / 'cardinal.pdf'
    result = outdir / 'out.pdf'
    cli_args = [
        '-r',
        '-v',
        '1',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(source, result, *cli_args)

    for page in (1, 2, 3, 4):
        score = compare_images_monochrome(
            outdir,
            reference_pdf=source,
            reference_pageno=1,
            test_pdf=result,
            test_pageno=page,
        )
        assert score > 0.95
@pytest.mark.parametrize(
    'threshold, op, comparison_threshold',
    [
        ('1', operator.ge, 0.95),  # Low threshold -> always rotate -> high score
        ('99', operator.le, 0.90),  # High threshold -> never rotate -> low score
    ],
)
def test_autorotate_threshold(threshold, op, comparison_threshold, resources, outdir):
    """Check that --rotate-pages-threshold controls whether page 3 is rotated.

    Output page 3 is compared with the upright reference page 1. ``op`` and
    ``comparison_threshold`` describe the score expected for ``threshold``.
    """
    source = resources / 'cardinal.pdf'
    result = outdir / 'out.pdf'
    cli_args = [
        '--rotate-pages-threshold',
        threshold,
        '-r',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(source, result, *cli_args)

    score = compare_images_monochrome(
        outdir,
        reference_pdf=source,
        reference_pageno=1,
        test_pdf=result,
        test_pageno=3,
    )
    assert op(score, comparison_threshold)
@pytest.mark.parametrize('rasterizer', ['pypdfium', 'ghostscript'])
def test_rotated_skew_timeout(resources, outpdf, rasterizer):
    """Check rotated skew timeout.

    The input document contains an image that is rotated 90 degrees into place
    with a /Rotate tag and intentionally skewed by altering the transformation
    matrix.

    This guards against a bug where preprocessing combined with a tesseract
    timeout produced a page whose dimensions did not match the original's.
    """
    source = resources / 'rotated_skew.pdf'
    before = PdfInfo(source)[0]
    assert (
        before.height_pixels < before.width_pixels
    ), "Expected the input page to be landscape"
    assert before.rotation == 90, "Expected a rotated page"

    cli_args = [
        '--pdf-renderer',
        'fpdf2',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--rasterizer',
        rasterizer,
    ]
    produced = check_ocrmypdf(source, outpdf, *cli_args)

    after = PdfInfo(produced)[0]
    out_width, out_height = after.width_pixels, after.height_pixels
    assert out_height > out_width, "Expected the output page to be portrait"
    assert after.rotation == 0, "Expected no page rotation for output"
    # Width and height trade places once the /Rotate tag is applied to content.
    assert (before.width_pixels, before.height_pixels) == (
        out_height,
        out_width,
    ), "Expected page rotation to be baked in"
@pytest.mark.parametrize('rasterizer', ['pypdfium', 'ghostscript'])
def test_rotate_deskew_ocr_timeout(resources, outdir, rasterizer):
    """Check that a page is still deskewed when tesseract times out.

    rotated_skew.pdf is processed with rotation, deskew and a zero tesseract
    timeout, and the result is compared with page 1 of ccitt.pdf.
    """
    deskewed = outdir / 'deskewed.pdf'
    cli_args = [
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--pdf-renderer',
        'fpdf2',
        '--rasterizer',
        rasterizer,
    ]
    check_ocrmypdf(resources / 'rotated_skew.pdf', deskewed, *cli_args)

    score = compare_images_monochrome(
        outdir,
        reference_pdf=resources / 'ccitt.pdf',
        reference_pageno=1,
        test_pdf=deskewed,
        test_pageno=1,
    )

    # Confirm that the page still got deskewed.
    # pypdfium anti-aliases, so it looks better but scores lower (0.88) on the
    # monochrome comparison; ghostscript looks ugly but scores above 0.95.
    assert score > 0.85
def make_rotate_test(imagefile, outdir, prefix, image_angle, page_angle, cropbox=None):
    """Create a one-page PDF from an image with controlled rotations.

    Args:
        imagefile: Path of the source image.
        outdir: Directory in which the PDF is written.
        prefix: Leading component of the output file name.
        image_angle: Angle used to transpose the image pixels; the transpose
            applied is ``ROTATE_<-image_angle % 360>``. Zero leaves the image
            untouched.
        page_angle: Value stored in the page's /Rotate entry.
        cropbox: Optional /CropBox to set on the page; skipped when falsy.

    Returns:
        Path of the PDF, named ``<prefix>_<image_angle>_<page_angle>.pdf``.
    """
    png_buffer = BytesIO()
    with Image.open(fspath(imagefile)) as source:
        rotated = source
        if image_angle != 0:
            transpose_method = getattr(
                Image.Transpose, f'ROTATE_{-image_angle % 360}'
            )
            rotated = source.transpose(transpose_method)
        rotated.save(png_buffer, format='PNG')

    pdf_buffer = BytesIO()
    img2pdf.convert(
        png_buffer.getvalue(),
        layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
        outputstream=pdf_buffer,
        **IMG2PDF_KWARGS,
    )
    pdf_buffer.seek(0)

    target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
    with pikepdf.open(pdf_buffer) as pdf:
        first_page = pdf.pages[0]
        first_page.Rotate = page_angle
        if cropbox:
            first_page.CropBox = cropbox
        pdf.save(target)
    return target
@pytest.mark.slow
@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))
def test_rotate_page_level(image_angle, page_angle, resources, outdir, caplog):
    """Run page rotation on every image-angle / page-angle combination.

    The output is compared with an unrotated reference built from the same
    image.
    """
    typewriter = resources / 'typewriter.png'
    reference = make_rotate_test(typewriter, outdir, 'ref', 0, 0)
    candidate = make_rotate_test(
        typewriter, outdir, 'test', image_angle, page_angle
    )
    out = candidate.with_suffix('.out.pdf')

    cli_args = ['-O0', '--rotate-pages', '--rotate-pages-threshold', '0.001']
    exitcode = run_ocrmypdf_api(candidate, out, *cli_args)
    assert exitcode == 0, caplog.text

    score = compare_images_monochrome(outdir, reference, 1, out, 1)
    assert score > 0.2
@pytest.mark.slow
@pytest.mark.parametrize('page_rotate_angle', (0, 90, 180, 270))
def test_page_rotate_tag(page_rotate_angle, resources, outdir, caplog):
    """Check that /Rotate-corrected pages are processed and yield text.

    The image is misrotated, then restored to the correct orientation with a
    /Rotate entry. The text of the output PDF is extracted with ``pdftotext``
    and must contain ``'is a'``.
    """
    source = make_rotate_test(
        resources / 'crom.png', outdir, 'test', -page_rotate_angle, page_rotate_angle
    )
    out = source.with_suffix('.out.pdf')

    exitcode = run_ocrmypdf_api(source, out, '-O0')
    assert exitcode == 0, caplog.text

    extraction = run(['pdftotext', '-enc', 'UTF-8', out, '-'], capture_output=True)
    test_text = extraction.stdout.strip().decode('utf-8')
    assert 'is a' in test_text, test_text
@pytest.mark.parametrize('page_rotate_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('renderer', ['sandwich', 'fpdf2'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_rotate_and_crop(
    resources, outdir, page_rotate_angle, renderer, output_type, caplog
):
    """Check rotation handling on a page that also carries a CropBox.

    The reference and the test PDF are built from the same image and CropBox.
    The test PDF has its image misrotated and compensated by /Rotate.
    """
    cropbox = (100, 200, 1000, 800)
    typewriter = resources / 'typewriter.png'
    reference = make_rotate_test(typewriter, outdir, 'ref', 0, 0, cropbox)
    candidate = make_rotate_test(
        typewriter,
        outdir,
        'test',
        -page_rotate_angle,
        page_rotate_angle,
        cropbox,
    )
    out = candidate.with_suffix('.out.pdf')

    cli_args = [
        '-O0',
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
        '--no-progress-bar',
    ]
    exitcode = run_ocrmypdf_api(candidate, out, *cli_args)
    assert exitcode == 0, caplog.text

    score = compare_images_monochrome(outdir, reference, 1, out, 1)
    assert score > 0.9
@pytest.mark.parametrize('rasterizer', ['pypdfium', 'ghostscript'])
def test_rasterize_rotates(resources, tmp_path, rasterizer):
    """Check that the rasterizer applies the requested rotation.

    Page 1 of graph.pdf is rasterized at 20 DPI with 90 and 180 degree
    rotations, and the size of each resulting image is verified. Every image
    is opened in a ``with`` block so its file handle is released promptly.
    """
    from ocrmypdf._options import OcrOptions

    pm = get_plugin_manager([])
    graph_pdf = resources / 'graph.pdf'
    options = OcrOptions(
        input_file=graph_pdf,
        output_file=tmp_path / 'out.pdf',
        rasterizer=rasterizer,
    )

    # (rotation in degrees, expected (width, height) in pixels)
    cases = ((90, (83, 200)), (180, (200, 83)))
    for rotation, expected_size in cases:
        img = tmp_path / f'img{rotation}.png'
        pm.rasterize_pdf_page(
            input_file=graph_pdf,
            output_file=img,
            raster_device=GhostscriptRasterDevice.PNGMONO,
            raster_dpi=Resolution(20, 20),
            page_dpi=Resolution(20, 20),
            pageno=1,
            rotation=rotation,
            filter_vector=False,
            stop_on_soft_error=True,
            options=options,
            use_cropbox=False,
        )
        with Image.open(img) as im:
            assert im.size == expected_size, "Image not rotated"
def test_simulated_scan(outdir):
    """Check page orientation after OCR of a synthetic four-page "scan".

    Each page carries the text ``Page N`` drawn at a different angle (2, 91,
    179 and 271 degrees). After processing with deskew and page rotation,
    pages at index 1 and 3 must be landscape and pages at index 0 and 2 must
    be portrait.
    """
    scan_path = outdir / 'fakescan.pdf'
    result_path = outdir / 'out.pdf'

    canvas = Canvas(
        fspath(scan_path),
        pagesize=(209.8, 297.6),
    )
    # (angle in degrees, x, y) for the text on each page
    page_vars = [(2, 36, 250), (91, 170, 240), (179, 190, 36), (271, 36, 36)]
    for page_number, (angle, x, y) in enumerate(page_vars, start=1):
        radians = angle / 180.0 * pi
        cos_a, sin_a = cos(radians), sin(radians)
        text = canvas.beginText()
        text.setFont('Helvetica', 20)
        text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, x, y)
        text.textOut(f'Page {page_number}')
        canvas.drawText(text)
        canvas.showPage()
    canvas.save()

    check_ocrmypdf(
        scan_path,
        result_path,
        '--force-ocr',
        '--deskew',
        '--rotate-pages',
        '--plugin',
        'tests/plugins/tesseract_debug_rotate.py',
    )

    landscape_checks = (
        (1, "Wrong orientation: not landscape"),
        (3, "Wrong orientation: Not landscape"),
    )
    with pikepdf.open(result_path) as pdf:
        for index, message in landscape_checks:
            box = pdf.pages[index].mediabox
            assert box[2] > box[3], message
        for index in (0, 2):
            box = pdf.pages[index].mediabox
            assert box[2] < box[3], "Wrong orientation: Not portrait"
================================================
FILE: tests/test_semfree.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import sys
import pytest
from ocrmypdf.exceptions import ExitCode
from .conftest import is_linux, run_ocrmypdf_api
@pytest.mark.skipif(not is_linux(), reason='semfree plugin only works on Linux')
@pytest.mark.skipif(
    sys.version_info >= (3, 14),
    reason='semfree plugin only works on Python 3.13 or earlier',
)
def test_semfree(resources, outpdf):
    """Run ocrmypdf with the semfree plugin and expect a deprecation warning.

    The run passes if it exits with ``ok`` or ``pdfa_conversion_failed``.
    """
    cli_args = [
        '--skip-text',
        '--skip-big',
        '2',
        '--plugin',
        'ocrmypdf.extra_plugins.semfree',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    with pytest.warns(DeprecationWarning, match="semfree.py is deprecated"):
        exitcode = run_ocrmypdf_api(resources / 'multipage.pdf', outpdf, *cli_args)
    assert exitcode in (ExitCode.ok, ExitCode.pdfa_conversion_failed)
================================================
FILE: tests/test_soft_error.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import os
import pytest
from ocrmypdf.exceptions import ExitCode
from .conftest import run_ocrmypdf_api
def test_raster_continue_on_soft_error(resources, outpdf):
    """Expect success when a soft raster error occurs and continuing is allowed."""
    cli_args = [
        '--continue-on-soft-render-error',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_raster_soft_error.py',
    ]
    outcome = run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *cli_args)
    assert outcome == ExitCode.ok
def test_raster_stop_on_soft_error(resources, outpdf):
    """Expect a child process error when a soft raster error is not tolerated."""
    cli_args = [
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_raster_soft_error.py',
    ]
    outcome = run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *cli_args)
    assert outcome == ExitCode.child_process_error
def test_render_continue_on_soft_error(resources, outpdf):
    """Expect success when a soft render error occurs and continuing is allowed."""
    cli_args = [
        # PDF/A output is required to trigger Ghostscript PDF/A generation.
        '--output-type',
        'pdfa',
        '--continue-on-soft-render-error',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_render_soft_error.py',
    ]
    outcome = run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *cli_args)
    assert outcome == ExitCode.ok
@pytest.mark.skipif(os.name == 'nt', reason='Ghostscript on Windows errors out')
def test_render_stop_on_soft_error(resources, outpdf):
    """Expect a child process error when a soft render error is not tolerated."""
    cli_args = [
        # PDF/A output is required to trigger Ghostscript PDF/A generation.
        '--output-type',
        'pdfa',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_render_soft_error.py',
    ]
    outcome = run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *cli_args)
    assert outcome == ExitCode.child_process_error
================================================
FILE: tests/test_stdio.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import os
from subprocess import DEVNULL, PIPE, run
import pytest
from ocrmypdf.helpers import check_pdf
from .conftest import run_ocrmypdf
def test_stdin(ocrmypdf_exec, resources, outpdf):
    """Feed the input PDF to ocrmypdf through standard input.

    Equivalent to: ``ocrmypdf - output.pdf < testfile.pdf``
    """
    source = str(resources / 'francais.pdf')
    command = ocrmypdf_exec + [
        '-',
        str(outpdf),
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    with open(source, 'rb') as pdf_stream:
        run(command, capture_output=True, stdin=pdf_stream, check=True)
def test_stdout(ocrmypdf_exec, resources, outpdf):
    """Capture the output PDF from ocrmypdf's standard output.

    Equivalent to: ``ocrmypdf francais.pdf - > test_stdout.pdf``

    Skipped when the ``COV_CORE_DATAFILE`` environment variable is set.
    """
    if 'COV_CORE_DATAFILE' in os.environ:
        pytest.skip("Coverage uses stdout")

    destination = str(outpdf)
    command = ocrmypdf_exec + [
        str(resources / 'francais.pdf'),
        '-',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    with open(destination, 'wb') as pdf_stream:
        run(command, stdout=pdf_stream, stderr=PIPE, stdin=DEVNULL, check=True)

    assert check_pdf(destination)
@pytest.mark.skipif(os.name == 'nt', reason='Windows does not support /dev/null')
def test_dev_null(resources):
    """Check that output can be sent to the null device without using stdout.

    Skipped when the ``COV_CORE_DATAFILE`` environment variable is set.
    """
    if 'COV_CORE_DATAFILE' in os.environ:
        pytest.skip("Coverage uses stdout")

    cli_args = ['--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py']
    completed = run_ocrmypdf(resources / 'trivial.pdf', os.devnull, *cli_args)

    assert completed.returncode == 0, "could not send output to /dev/null"
    assert len(completed.stdout) == 0, "wrote to stdout"
================================================
FILE: tests/test_system_font_provider.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Unit tests for SystemFontProvider and ChainedFontProvider."""
from __future__ import annotations
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from ocrmypdf.font import (
BuiltinFontProvider,
ChainedFontProvider,
SystemFontProvider,
)
# --- SystemFontProvider Platform Detection Tests ---
class TestSystemFontProviderPlatform:
    """Check how SystemFontProvider maps ``sys.platform`` to a platform name."""

    @staticmethod
    def _detect(sys_platform):
        """Return ``_get_platform()`` while ``sys.platform`` is patched."""
        provider = SystemFontProvider()
        with patch.object(sys, 'platform', sys_platform):
            return provider._get_platform()

    def test_get_platform_linux(self):
        """``linux`` is reported as ``linux``."""
        assert self._detect('linux') == 'linux'

    def test_get_platform_darwin(self):
        """``darwin`` (macOS) is reported as ``darwin``."""
        assert self._detect('darwin') == 'darwin'

    def test_get_platform_windows(self):
        """``win32`` is reported as ``windows``."""
        assert self._detect('win32') == 'windows'

    def test_get_platform_freebsd(self):
        """A versioned FreeBSD value (``freebsd13``) is reported as ``freebsd``."""
        assert self._detect('freebsd13') == 'freebsd'
class TestSystemFontProviderDirectories:
    """Check which font directories SystemFontProvider reports per platform."""

    def test_linux_font_dirs(self):
        """Linux reports the shared system font directories."""
        provider = SystemFontProvider()
        with patch.object(sys, 'platform', 'linux'):
            provider._font_dirs = None  # Drop any cached result
            font_dirs = provider._get_font_dirs()
            for expected in ('/usr/share/fonts', '/usr/local/share/fonts'):
                assert Path(expected) in font_dirs

    def test_darwin_font_dirs(self):
        """macOS reports the Library font directories."""
        provider = SystemFontProvider()
        with patch.object(sys, 'platform', 'darwin'):
            provider._font_dirs = None  # Drop any cached result
            font_dirs = provider._get_font_dirs()
            for expected in ('/Library/Fonts', '/System/Library/Fonts'):
                assert Path(expected) in font_dirs

    def test_windows_font_dirs_with_windir(self):
        """Windows reports a Fonts directory when WINDIR is set."""
        provider = SystemFontProvider()
        with (
            patch.object(sys, 'platform', 'win32'),
            patch.dict('os.environ', {'WINDIR': r'D:\Windows'}),
        ):
            provider._font_dirs = None  # Drop any cached result
            font_dirs = provider._get_font_dirs()
            # Compare as strings to sidestep Path normalization differences
            # between platforms.
            as_text = [str(entry) for entry in font_dirs]
            assert any('Fonts' in entry for entry in as_text)

    def test_windows_font_dirs_default(self):
        """Windows falls back to a Windows\\Fonts path with an empty environment."""
        provider = SystemFontProvider()
        with (
            patch.object(sys, 'platform', 'win32'),
            patch.dict('os.environ', {}, clear=True),
        ):
            provider._font_dirs = None  # Drop any cached result
            font_dirs = provider._get_font_dirs()
            as_text = [str(entry) for entry in font_dirs]
            assert any('Windows' in entry and 'Fonts' in entry for entry in as_text)

    def test_windows_font_dirs_with_localappdata(self):
        """Windows adds a per-user font directory when LOCALAPPDATA is set."""
        provider = SystemFontProvider()
        fake_environ = {
            'WINDIR': r'C:\Windows',
            'LOCALAPPDATA': r'C:\Users\Test\AppData\Local',
        }
        with (
            patch.object(sys, 'platform', 'win32'),
            patch.dict('os.environ', fake_environ),
        ):
            provider._font_dirs = None  # Drop any cached result
            font_dirs = provider._get_font_dirs()
            as_text = [str(entry) for entry in font_dirs]
            # Expect exactly the system directory plus the user directory.
            assert len(font_dirs) == 2
            assert any('Windows' in entry and 'Fonts' in entry for entry in as_text)
            assert any(
                'AppData' in entry and 'Local' in entry and 'Fonts' in entry
                for entry in as_text
            )

    def test_font_dirs_cached(self):
        """Repeated lookups return the very same object."""
        provider = SystemFontProvider()
        first_lookup = provider._get_font_dirs()
        second_lookup = provider._get_font_dirs()
        assert first_lookup is second_lookup  # Identity, so nothing was recomputed
class TestSystemFontProviderLazyLoading:
    """Check that SystemFontProvider defers work and caches lookup results."""

    def test_no_scanning_on_init(self):
        """A new provider starts with empty positive and negative caches."""
        provider = SystemFontProvider()
        assert len(provider._font_cache) == 0
        assert len(provider._not_found) == 0

    def test_get_font_unknown_name_returns_none(self):
        """An unknown font name yields None and is recorded as not found."""
        provider = SystemFontProvider()
        font_name = 'UnknownFont-Regular'
        assert provider.get_font(font_name) is None
        # The miss is remembered so that the negative result is cached.
        assert font_name in provider._not_found

    def test_negative_cache(self):
        """A font that was not found is not searched for a second time."""
        provider = SystemFontProvider()
        font_name = 'NotoSansCJK-Regular'

        # First lookup: the file search reports nothing.
        with patch.object(provider, '_find_font_file', return_value=None):
            first = provider.get_font(font_name)
            assert first is None
            assert font_name in provider._not_found

        # Second lookup: the file search must not be consulted again.
        provider._find_font_file = MagicMock(return_value=None)
        second = provider.get_font(font_name)
        assert second is None
        provider._find_font_file.assert_not_called()

    def test_positive_cache(self):
        """A font that was found is served from the cache afterwards."""
        provider = SystemFontProvider()
        data_dir = Path(__file__).parent.parent / "src" / "ocrmypdf" / "data"
        font_path = data_dir / "NotoSans-Regular.ttf"
        if not font_path.exists():
            pytest.skip("Test font not available")

        font_name = 'NotoSans-Regular'
        with patch.object(provider, '_find_font_file', return_value=font_path):
            first = provider.get_font(font_name)
            assert first is not None
            assert font_name in provider._font_cache

        # Second lookup: the cached object is returned without a file search.
        provider._find_font_file = MagicMock()
        second = provider.get_font(font_name)
        assert second is first
        provider._find_font_file.assert_not_called()
class TestSystemFontProviderAvailableFonts:
    """Check get_available_fonts and get_fallback_font on SystemFontProvider."""

    def test_returns_all_patterns(self):
        """get_available_fonts lists the known font pattern names."""
        available = SystemFontProvider().get_available_fonts()
        expected_names = (
            'NotoSans-Regular',
            'NotoSansCJK-Regular',
            'NotoSansArabic-Regular',
            'NotoSansThai-Regular',
        )
        for font_name in expected_names:
            assert font_name in available

    def test_fallback_font_raises(self):
        """get_fallback_font raises NotImplementedError."""
        provider = SystemFontProvider()
        with pytest.raises(NotImplementedError):
            provider.get_fallback_font()
# --- ChainedFontProvider Tests ---
class TestChainedFontProvider:
"""Test ChainedFontProvider."""
def test_requires_at_least_one_provider(self):
"""Test that empty provider list raises error."""
with pytest.raises(ValueError, match="At least one provider"):
ChainedFontProvider([])
def test_get_font_tries_providers_in_order(self):
"""Test that get_font tries providers in order."""
provider1 = MagicMock()
provider1.get_font.return_value = None
provider2 = MagicMock()
mock_font = MagicMock()
provider2.get_font.return_value = mock_font
chain = ChainedFontProvider([provider1, provider2])
result = chain.get_font('TestFont')
provider1.get_font.assert_called_once_with('TestFont')
provider2.get_font.assert_called_once_with('TestFont')
assert result is mock_font
def test_get_font_stops_on_first_match(self):
"""Test that get_font stops after first successful match."""
mock_font = MagicMock()
provider1 = MagicMock()
provider1.get_font.return_value = mock_font
provider2 = MagicMock()
chain = ChainedFontProvider([provider1, provider2])
result = chain.get_font('TestFont')
provider1.get_font.assert_called_once()
provider2.get_font.assert_not_called()
assert result is mock_font
def test_get_font_returns_none_if_all_fail(self):
"""Test that get_font returns None if all providers fail."""
provider1 = MagicMock()
provider1.get_font.return_value = None
provider2 = MagicMock()
provider2.get_font.return_value = None
chain = ChainedFontProvider([provider1, provider2])
result = chain.get_font('TestFont')
assert result is None
def test_get_available_fonts_combines_providers(self):
"""Test that get_available_fonts combines all providers."""
provider1 = MagicMock()
provider1.get_available_fonts.return_value = ['Font1', 'Font2']
provider2 = MagicMock()
provider2.get_available_fonts.return_value = ['Font2', 'Font3']
chain = ChainedFontProvider([provider1, provider2])
fonts = chain.get_available_fonts()
assert fonts == ['Font1', 'Font2', 'Font3'] # Deduplicated, order preserved
def test_get_fallback_font_from_first_provider(self):
"""Test that get_fallback_font uses first available fallback."""
mock_font = MagicMock()
provider1 = MagicMock()
provider1.get_fallback_font.return_value = mock_font
provider2 = MagicMock()
chain = ChainedFontProvider([provider1, provider2])
result = chain.get_fallback_font()
assert result is mock_font
provider2.get_fallback_font.assert_not_called()
def test_get_fallback_font_skips_not_implemented(self):
"""Test that get_fallback_font skips providers that raise."""
provider1 = MagicMock()
provider1.get_fallback_font.side_effect = NotImplementedError()
mock_font = MagicMock()
provider2 = MagicMock()
provider2.get_fallback_font.return_value = mock_font
chain = ChainedFontProvider([provider1, provider2])
result = chain.get_fallback_font()
assert result is mock_font
def test_get_fallback_font_raises_if_none_available(self):
    """RuntimeError is raised when every provider fails to supply a fallback."""
    providers = []
    for failure in (NotImplementedError(), KeyError()):
        provider = MagicMock()
        provider.get_fallback_font.side_effect = failure
        providers.append(provider)

    chain = ChainedFontProvider(providers)

    with pytest.raises(RuntimeError, match="No fallback font available"):
        chain.get_fallback_font()
class TestChainedFontProviderIntegration:
    """Run ChainedFontProvider on top of the real builtin and system providers."""

    @pytest.fixture
    def font_dir(self):
        """Locate src/ocrmypdf/data two directories above this test file."""
        project_dir = Path(__file__).parent.parent
        return project_dir / "src" / "ocrmypdf" / "data"

    def test_builtin_then_system_chain(self, font_dir):
        """A builtin-first chain resolves NotoSans-Regular and a fallback font."""
        providers = [BuiltinFontProvider(font_dir), SystemFontProvider()]
        chain = ChainedFontProvider(providers)

        # NotoSans-Regular is expected to come from the builtin provider.
        assert chain.get_font('NotoSans-Regular') is not None
        # The builtin provider is also expected to supply the fallback.
        assert chain.get_fallback_font() is not None

    def test_system_fonts_extend_builtin(self, font_dir):
        """Every font the builtin provider lists is also listed by the chain."""
        builtin = BuiltinFontProvider(font_dir)
        chain = ChainedFontProvider([builtin, SystemFontProvider()])

        builtin_names = set(builtin.get_available_fonts())
        chained_names = set(chain.get_available_fonts())

        # The chain may add system fonts but must never drop a builtin one.
        assert builtin_names.issubset(chained_names)
================================================
FILE: tests/test_tagged.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import pytest
import ocrmypdf
def test_block_tagged(resources):
    """OCR of tagged.pdf with default options raises TaggedPDFError."""
    tagged_input = resources / 'tagged.pdf'
    with pytest.raises(ocrmypdf.exceptions.TaggedPDFError):
        ocrmypdf.ocr(tagged_input, '_.pdf')
def test_force_tagged_warns(resources, outpdf, caplog):
    """With force_ocr the tagged PDF is processed and a warning is logged."""
    caplog.set_level('WARNING')
    ocr_kwargs = {
        'force_ocr': True,
        'plugins': ['tests/plugins/tesseract_noop.py'],
    }

    ocrmypdf.ocr(resources / 'tagged.pdf', outpdf, **ocr_kwargs)

    assert 'marked as a Tagged PDF' in caplog.text
def test_tagged_pdf_mode_ignore_with_skip_text(resources, outpdf, caplog):
    """tagged_pdf_mode='ignore' plus skip_text logs the tagged warning, no error."""
    caplog.set_level('WARNING')
    ocr_kwargs = {
        'tagged_pdf_mode': 'ignore',
        # Tagged PDF has text, so skip pages with text
        'skip_text': True,
        'plugins': ['tests/plugins/tesseract_noop.py'],
    }

    ocrmypdf.ocr(resources / 'tagged.pdf', outpdf, **ocr_kwargs)

    assert 'marked as a Tagged PDF' in caplog.text
def test_tagged_pdf_mode_ignore_with_force(resources, outpdf, caplog):
    """tagged_pdf_mode='ignore' plus force_ocr still logs the tagged warning."""
    caplog.set_level('WARNING')
    ocr_kwargs = {
        'tagged_pdf_mode': 'ignore',
        'force_ocr': True,
        'plugins': ['tests/plugins/tesseract_noop.py'],
    }

    ocrmypdf.ocr(resources / 'tagged.pdf', outpdf, **ocr_kwargs)

    assert 'marked as a Tagged PDF' in caplog.text
================================================
FILE: tests/test_tesseract.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import logging
import os
import subprocess
from os import fspath
from pathlib import Path
import pytest
from ocrmypdf import pdfinfo
from ocrmypdf._exec import tesseract
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
from .conftest import check_ocrmypdf, run_ocrmypdf_api
# pylint: disable=redefined-outer-name
@pytest.mark.parametrize('basename', ['graph_ocred.pdf', 'cardinal.pdf'])
def test_skip_pages_does_not_replicate(resources, basename, outdir):
    """Run sandwich rendering with --force-ocr and a zero Tesseract timeout.

    Each output page must hold exactly one image and keep the dimensions of
    the corresponding input page.
    """
    infile = resources / basename
    outpdf = outdir / basename
    cli_args = [
        '--pdf-renderer',
        'sandwich',
        '--force-ocr',
        '--tesseract-timeout',
        '0',
    ]
    check_ocrmypdf(infile, outpdf, *cli_args)

    source_info = pdfinfo.PdfInfo(infile)
    result_info = pdfinfo.PdfInfo(outpdf)

    for result_page in result_info:
        image_count = len(result_page.images)
        assert image_count == 1, "skipped page was replicated"

    for index, result_page in enumerate(result_info):
        source_page = source_info[index]
        assert result_page.width_inches == source_page.width_inches, "output resized"
        assert (
            result_page.height_inches == source_page.height_inches
        ), "output resized"
def test_content_preservation(resources, outpdf):
    """masks.pdf keeps more than one image on its first page after processing."""
    cli_args = ['--pdf-renderer', 'fpdf2', '--tesseract-timeout', '0']
    check_ocrmypdf(resources / 'masks.pdf', outpdf, *cli_args)

    first_page = pdfinfo.PdfInfo(outpdf)[0]
    assert len(first_page.images) > 1, "masks were rasterized"
@pytest.mark.skipif(
    tesseract.version() >= tesseract.TesseractVersion('5'), reason="doesn't fool Tess 5"
)
def test_no_languages(tmp_path, monkeypatch):
    """Pointing TESSDATA_PREFIX at an empty tessdata dir raises MissingDependencyError."""
    empty_tessdata = tmp_path / 'tessdata'
    empty_tessdata.mkdir()
    monkeypatch.setenv('TESSDATA_PREFIX', fspath(tmp_path))

    with pytest.raises(MissingDependencyError):
        tesseract.get_languages()
def test_image_too_large_hocr(monkeypatch, resources, outdir):
    """generate_hocr writes an empty hOCR file when tesseract reports 'Image too large'."""

    def dummy_run(args, *, env=None, **kwargs):
        # Stand-in for tesseract.run that always fails the way an oversized
        # image would.
        raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')

    monkeypatch.setattr(tesseract, 'run', dummy_run)

    hocr_path = outdir / 'out.hocr'
    hocr_kwargs = {
        'input_file': resources / 'crom.png',
        'output_hocr': hocr_path,
        'output_text': outdir / 'out.txt',
        'languages': ['eng'],
        'engine_mode': None,
        'tessconfig': [],
        'timeout': 180.0,
        'pagesegmode': None,
        'thresholding': 0,
        'user_words': None,
        'user_patterns': None,
    }
    tesseract.generate_hocr(**hocr_kwargs)

    assert Path(hocr_path).read_text() == ''
def test_image_too_large_pdf(monkeypatch, resources, outdir):
    """generate_pdf writes a '[skipped page]' sidecar when tesseract reports 'Image too large'."""

    def dummy_run(args, *, env=None, **kwargs):
        # Stand-in for tesseract.run that always fails the way an oversized
        # image would.
        raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')

    monkeypatch.setattr(tesseract, 'run', dummy_run)

    pdf_path = outdir / 'pdf.pdf'
    text_path = outdir / 'txt.txt'
    pdf_kwargs = {
        'input_file': resources / 'crom.png',
        'output_pdf': pdf_path,
        'output_text': text_path,
        'languages': ['eng'],
        'engine_mode': None,
        'tessconfig': [],
        'timeout': 180.0,
        'pagesegmode': None,
        'thresholding': 0,
        'user_words': None,
        'user_patterns': None,
    }
    tesseract.generate_pdf(**pdf_kwargs)

    assert Path(text_path).read_text() == '[skipped page]'
    if os.name != 'nt':  # different semantics
        assert Path(pdf_path).stat().st_size == 0
def test_timeout(caplog):
    """page_timedout logs a message containing 'took too long'."""
    timeout_seconds = 5
    tesseract.page_timedout(timeout_seconds)

    captured = caplog.text
    assert "took too long" in captured
@pytest.mark.parametrize(
    'in_, logged',
    [
        (b'Tesseract Open Source', ''),
        (b'lots of diacritics blah blah', 'diacritics'),
        (b'Warning in pixReadMem', ''),
        (b'OSD: Weak margin', 'unsure about page orientation'),
        (b'Error in pixScanForForeground', ''),
        (b'Error in boxClipToRectangle', ''),
        (b'an unexpected error', 'an unexpected error'),
        (b'a dire warning', 'a dire warning'),
        (b'read_params_file something', 'read_params_file'),
        (b'an innocent message', 'innocent'),
        (b'\x7f\x7f\x80innocent unicode failure', 'innocent'),
    ],
)
def test_tesseract_log_output(caplog, in_, logged):
    """Check what tesseract_log_output logs for each raw output sample.

    An empty ``logged`` expectation means nothing at all may be logged;
    otherwise the expected fragment must appear in the captured log text.
    """
    caplog.set_level(logging.INFO)
    tesseract.tesseract_log_output(in_)

    if logged:
        assert logged in caplog.text
    else:
        assert caplog.text == ''
def test_tesseract_log_output_raises(caplog):
    """A 'parameter not found' line raises TesseractConfigError and is logged."""
    bad_output = b'parameter not found: moo'
    with pytest.raises(tesseract.TesseractConfigError):
        tesseract.tesseract_log_output(bad_output)

    # Checked after the raises block so the assertion is actually reached.
    assert 'not found' in caplog.text
def test_blocked_language(resources, no_outpdf):
    """Requesting 'osd' or 'equ' as the OCR language raises BadArgsError."""
    infile = resources / 'masks.pdf'
    blocked_languages = ('osd', 'equ')
    for language in blocked_languages:
        cli_args = ['-l', language]
        with pytest.raises(BadArgsError):
            run_ocrmypdf_api(infile, no_outpdf, *cli_args)
================================================
FILE: tests/test_unpaper.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import logging
from os import fspath
from unittest.mock import Mock, patch
import pytest
from packaging.version import Version
from pydantic import ValidationError
from ocrmypdf._exec import unpaper
from ocrmypdf._validation import check_options
from ocrmypdf.cli import get_options_and_plugins
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf_api
# pylint: disable=redefined-outer-name
# Skip marker for tests in this module: applied as a decorator, it skips the
# test whenever have_unpaper() returns a falsy value.
needs_unpaper = pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
def test_no_unpaper(resources, no_outpdf):
    """--clean raises MissingDependencyError when unpaper.version raises FileNotFoundError."""
    cli_args = ["--clean", fspath(resources / "c02-22.pdf"), fspath(no_outpdf)]
    options, pm = get_options_and_plugins(cli_args)

    version_patch = patch(
        "ocrmypdf._exec.unpaper.version", side_effect=FileNotFoundError("unpaper")
    )
    with version_patch as version_mock:
        with pytest.raises(MissingDependencyError):
            check_options(options, pm)
        version_mock.assert_called()
def test_old_unpaper(resources, no_outpdf):
    """--clean raises MissingDependencyError when unpaper reports version 0.5."""
    cli_args = ["--clean", fspath(resources / "c02-22.pdf"), fspath(no_outpdf)]
    options, pm = get_options_and_plugins(cli_args)

    version_patch = patch(
        "ocrmypdf._exec.unpaper.version", return_value=Version('0.5')
    )
    with version_patch as version_mock:
        with pytest.raises(MissingDependencyError):
            check_options(options, pm)
        version_mock.assert_called()
def test_unpaper_version_chatter(resources, no_outpdf):
    """Version output preceded by a warning line leads to MissingDependencyError."""
    cli_args = ["--clean", fspath(resources / "c02-22.pdf"), fspath(no_outpdf)]
    options, pm = get_options_and_plugins(cli_args)

    # The subprocess result carries an extra line before the version number.
    chatty_result = Mock(stdout='Warning: using insecure memory!\n7.0.0\n')
    with patch("ocrmypdf.subprocess.run", return_value=chatty_result) as run_mock:
        with pytest.raises(MissingDependencyError):
            check_options(options, pm)
        run_mock.assert_called()
@needs_unpaper
def test_clean(resources, outpdf):
    """skew.pdf is processed successfully with -c and the no-op Tesseract plugin."""
    cli_args = ["-c", '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / "skew.pdf", outpdf, *cli_args)
@needs_unpaper
def test_unpaper_args_valid(resources, outpdf):
    """A well-formed --unpaper-args string is accepted."""
    cli_args = [
        "-c",
        "--unpaper-args",
        "--layout double",  # Spaces required here
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / "skew.pdf", outpdf, *cli_args)
@needs_unpaper
def test_unpaper_args_invalid_filename(resources, outpdf, caplog):
    """A file path passed through --unpaper-args is rejected with ValidationError."""
    cli_args = [
        "-c",
        "--unpaper-args",
        "/etc/passwd",
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    with pytest.raises(ValidationError, match="No filenames allowed"):
        run_ocrmypdf_api(resources / "skew.pdf", outpdf, *cli_args)
@needs_unpaper
def test_unpaper_args_invalid(resources, outpdf):
    """Arguments unpaper cannot parse surface as a child_process_error exit code."""
    cli_args = [
        "-c",
        "--unpaper-args",
        "unpaper is not going to like these arguments",
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    exitcode = run_ocrmypdf_api(resources / "skew.pdf", outpdf, *cli_args)

    # Can't tell difference between unpaper choking on bad arguments or some
    # other unpaper failure
    assert exitcode == ExitCode.child_process_error
@needs_unpaper
def test_unpaper_image_too_big(resources, outdir, caplog):
    """With the pixel limit patched to 42, clean returns the input path and warns."""
    infile = resources / 'crom.png'
    with patch('ocrmypdf._exec.unpaper.UNPAPER_IMAGE_PIXEL_LIMIT', 42):
        cleaned = unpaper.clean(infile, outdir / 'out.png', dpi=300)
        assert cleaned == infile

        warning_messages = [
            record.message
            for record in caplog.get_records('call')
            if record.levelno == logging.WARNING
        ]
        assert any(
            'too large for cleaning' in message for message in warning_messages
        )
@needs_unpaper
def test_palette_image(resources, outpdf):
    """palette.pdf is processed successfully with -c and the no-op Tesseract plugin."""
    cli_args = ["-c", '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / "palette.pdf", outpdf, *cli_args)
================================================
FILE: tests/test_userunit.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
from math import isclose
import pytest
from ocrmypdf.pdfinfo import PdfInfo
from .conftest import check_ocrmypdf
# pylint: disable=redefined-outer-name
@pytest.fixture
def poster(resources):
    """Path of the poster.pdf test resource."""
    poster_pdf = resources / 'poster.pdf'
    return poster_pdf
@pytest.mark.parametrize("mode", ['pdf', 'pdfa'])
def test_userunit_pdf_passes(mode, poster, outpdf):
    """The first page's width in inches is unchanged for both output types."""
    width_before = PdfInfo(poster)[0].width_inches

    cli_args = [
        f'--output-type={mode}',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(poster, outpdf, *cli_args)

    width_after = PdfInfo(outpdf)[0].width_inches
    assert isclose(width_before, width_after)
def test_rotate_interaction(poster, outpdf):
    """poster.pdf is processed successfully when --rotate-pages is enabled."""
    cli_args = [
        '--output-type=pdf',
        '--rotate-pages',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(poster, outpdf, *cli_args)
================================================
FILE: tests/test_validation.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import logging
import os
from unittest.mock import patch
import pikepdf
import pytest
from ocrmypdf import _validation as vd
from ocrmypdf._concurrent import NullProgressBar, SerialExecutor
from ocrmypdf._exec.tesseract import TesseractVersion
from ocrmypdf._options import OcrOptions
from ocrmypdf.api import create_options, setup_plugin_infrastructure
from ocrmypdf.cli import get_parser
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
from ocrmypdf.pdfinfo import PdfInfo
from .conftest import run_ocrmypdf_api
def make_opts_pm(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):
    """Build an options object and plugin manager for validation tests.

    ``language`` is forwarded to ``create_options`` unless it is None; all
    other keyword arguments are forwarded unchanged. Returns the pair
    ``(options, plugin_manager)``.
    """
    if language is not None:
        kwargs['language'] = language

    requested_plugins = kwargs.get('plugins', [])
    plugin_manager = setup_plugin_infrastructure(plugins=requested_plugins)

    # Plugins register their own options on the parser before it is used.
    parser = get_parser()
    plugin_manager.add_options(parser=parser)

    options = create_options(
        input_file=input_file, output_file=output_file, parser=parser, **kwargs
    )
    return options, plugin_manager
def make_opts(*args, **kwargs):
    """Same as make_opts_pm, but return only the options object."""
    return make_opts_pm(*args, **kwargs)[0]
def make_ocr_opts(input_file='a.pdf', output_file='b.pdf', **kwargs):
    """Construct OcrOptions directly so its Pydantic validation can be tested."""
    options = OcrOptions(input_file=input_file, output_file=output_file, **kwargs)
    return options
def test_old_tesseract_error():
    """A reported Tesseract version of 4.00.00alpha raises MissingDependencyError."""
    old_version = TesseractVersion('4.00.00alpha')
    with patch('ocrmypdf._exec.tesseract.version', return_value=old_version):
        with pytest.raises(MissingDependencyError):
            vd.check_options(*make_opts_pm(pdf_renderer='sandwich', language='eng'))
def test_tesseract_not_installed(caplog):
    """A FileNotFoundError from subprocess.run is reported as a missing program."""
    run_patch = patch(
        'ocrmypdf.subprocess.run', side_effect=FileNotFoundError('tesseract')
    )
    with run_patch as not_found:
        with pytest.raises(MissingDependencyError, match="Could not find program"):
            vd.check_options(*make_opts_pm())

        # Checked after the raises block so the log assertions are reached.
        logged = caplog.text
        assert (
            "'tesseract' could not be executed" in logged
        ), "Error message not printed"
        assert 'install' in logged, "Install advice not printed"
        not_found.assert_called()
def test_lossless_redo():
    """Combining redo_ocr with deskew is rejected by OcrOptions."""
    expected_message = "--redo-ocr.*is not currently compatible"
    with pytest.raises(ValueError, match=expected_message):
        make_ocr_opts(redo_ocr=True, deskew=True)
def test_mutex_options():
    """Any two of force_ocr, skip_text and redo_ocr together are rejected."""
    expected_message = "Choose only one of --force-ocr, --skip-text, --redo-ocr"
    conflicting_pairs = [
        {'force_ocr': True, 'skip_text': True},
        {'redo_ocr': True, 'skip_text': True},
        {'redo_ocr': True, 'force_ocr': True},
    ]
    for flags in conflicting_pairs:
        with pytest.raises(ValueError, match=expected_message):
            make_ocr_opts(**flags)
def test_optimizing(caplog):
    """Quality settings given alongside optimize=0 produce an 'ignored' message."""
    opts, pm = make_opts_pm(optimize=0, png_quality=18, jpeg_quality=10)
    vd.check_options(opts, pm)

    assert 'will be ignored because' in caplog.text
def test_pillow_options():
    """max_image_mpixels accepts 0 and rejects negative values."""
    # Zero is a valid setting (validation now lives in OcrOptions).
    zero_limit = make_ocr_opts(max_image_mpixels=0)
    assert zero_limit.max_image_mpixels == 0

    # Anything below zero must be refused.
    expected_message = "max_image_mpixels must be non-negative"
    with pytest.raises(ValueError, match=expected_message):
        make_ocr_opts(max_image_mpixels=-1)
def test_output_tty():
    """Writing to '-' while stdout reports being a TTY raises BadArgsError."""
    with patch('sys.stdout.isatty', return_value=True):
        with pytest.raises(BadArgsError):
            vd.check_requested_output_file(make_opts(output_file='-'))
def test_report_file_size(tmp_path, caplog):
    """report_output_file_size is silent for equal sizes and explains growth.

    Covers equal-size input and output, a larger output with a supplied
    reason, a larger output with no reasons, and the optimization-disabled
    reason.
    """
    logging.getLogger('pikepdf._qpdf').setLevel(logging.CRITICAL)  # Suppress logging
    source_pdf = tmp_path / 'a.pdf'
    result_pdf = tmp_path / 'b.pdf'

    # Identical files: nothing should be reported.
    pdf = pikepdf.new()
    pdf.save(source_pdf)
    pdf.save(result_pdf)
    opts = make_opts(output_type='pdf')
    vd.report_output_file_size(opts, source_pdf, result_pdf)
    assert caplog.text == ''
    caplog.clear()

    # Pad the output with twice as much filler as the input so it is larger.
    filler = b'Dummy' * 5000
    pdf.Root.Dummy = filler
    pdf.save(source_pdf)
    pdf.Root.Dummy2 = filler + filler
    pdf.save(result_pdf)

    reason_cases = [
        (['The optional dependency...'], 'optional dependency'),
        ([], 'No reason'),
    ]
    for reasons, expected_fragment in reason_cases:
        vd.report_output_file_size(opts, source_pdf, result_pdf, reasons)
        assert expected_fragment in caplog.text
        caplog.clear()

    opts = make_opts(source_pdf, result_pdf, optimize=0, output_type='pdf')
    vd.report_output_file_size(
        opts, source_pdf, result_pdf, ["Optimization was disabled."]
    )
    assert 'disabled' in caplog.text
    caplog.clear()
def test_false_action_store_true():
    """keep_temporary_files follows the boolean it was given, True or False."""
    enabled = make_opts(keep_temporary_files=True)
    assert enabled.keep_temporary_files

    disabled = make_opts(keep_temporary_files=False)
    assert not disabled.keep_temporary_files
@pytest.mark.parametrize('progress_bar', [True, False])
def test_no_progress_bar(progress_bar, resources):
    """The progress bar class receives ``disable`` as the inverse of progress_bar."""
    opts, pm = make_opts_pm(
        progress_bar=progress_bar, input_file=(resources / 'trivial.pdf')
    )
    vd.check_options(opts, pm)

    seen_disable = None

    class CheckProgressBar(NullProgressBar):
        """Progress bar that records the ``disable`` flag it was built with."""

        def __init__(self, disable, **kwargs):
            nonlocal seen_disable
            seen_disable = disable
            super().__init__(disable=disable, **kwargs)

    recording_executor = SerialExecutor(pbar_class=CheckProgressBar)
    pdfinfo = PdfInfo(
        opts.input_file, progbar=opts.progress_bar, executor=recording_executor
    )

    assert pdfinfo is not None
    assert seen_disable is not None
    assert seen_disable != progress_bar
def make_version(version):
    """Return a zero-argument callable that yields TesseractVersion(version)."""

    def version_checker():
        # The version string is parsed on every call, not when the factory runs.
        return TesseractVersion(version)

    return version_checker
def test_version_comparison():
    """check_external_program accepts or rejects a table of version strings.

    Each row gives the program, package, reported version, required version,
    optional version parser, and whether the check should pass. Rejected rows
    must raise MissingDependencyError.
    """
    cases = [
        ("dummy_basic", "dummy", '9.0', '8.0.2', None, True),
        ("dummy_doubledigit", "dummy", '10.0', '8.0.2', None, True),
        ("tesseract", "tesseract", '4.0.0-beta.1', '4.1.1', TesseractVersion, False),
        (
            "tesseract",
            "tesseract",
            'v5.0.0-alpha.20200201',
            '4.1.1',
            TesseractVersion,
            True,
        ),
        ("tesseract", "tesseract", '5.0.0-rc1.20211030', '4.1.1', TesseractVersion, True),
        # Used in some Windows builds
        ("tesseract", "tesseract", 'v4.1.1.20181030', '4.1.1', TesseractVersion, True),
        ("gs", "ghostscript", '10.0', '9.50', None, True),
        (
            "tesseract",
            "tesseract",
            '4.1.1-rc2-25-g9707',
            '4.1.1',
            TesseractVersion,
            False,
        ),
        ("dummy_fails", "dummy", '1.0', '2.0', None, False),
    ]

    for program, package, reported, needed, parser, should_pass in cases:
        call_kwargs = {
            'program': program,
            'package': package,
            'version_checker': make_version(reported),
            'need_version': needed,
        }
        # Only pass version_parser where the case specifies one, so the other
        # cases use the callee's default.
        if parser is not None:
            call_kwargs['version_parser'] = parser

        if should_pass:
            vd.check_external_program(**call_kwargs)
        else:
            with pytest.raises(MissingDependencyError):
                vd.check_external_program(**call_kwargs)
def test_optional_program_recommended(caplog):
    """A missing recommended program produces a WARNING mentioning 'recommended'."""
    caplog.clear()

    def raiser():
        # Version checker that behaves as if the program is not installed.
        raise FileNotFoundError('jbig2')

    with caplog.at_level(logging.WARNING):
        vd.check_external_program(
            program="jbig2",
            package="jbig2enc",
            version_checker=raiser,
            need_version='42',
            required_for='this test case',
            recommended=True,
        )

    warning_texts = [
        message
        for _logger_name, level, message in caplog.record_tuples
        if level == logging.WARNING
    ]
    assert any("recommended" in message for message in warning_texts)
def test_pagesegmode_warning(caplog):
    """tesseract_pagesegmode='0' makes check_options log a 'disable OCR' message."""
    opts = make_opts(tesseract_pagesegmode='0')
    requested_plugins = opts.plugins or []
    pm = setup_plugin_infrastructure(plugins=requested_plugins)

    vd.check_options(opts, pm)

    assert 'disable OCR' in caplog.text
def test_two_languages():
    """check_options_languages accepts two languages that are both listed as available."""
    options = create_options(
        input_file='a.pdf',
        output_file='b.pdf',
        parser=get_parser(),
        languages=['fakelang1', 'fakelang2'],
    )
    available_languages = ['fakelang1', 'fakelang2']

    vd.check_options_languages(options, available_languages)
def test_sidecar_equals_output(resources, no_outpdf):
    """Using the output path as the sidecar path raises BadArgsError."""
    infile = resources / 'trivial.pdf'
    with pytest.raises(BadArgsError, match=r'--sidecar'):
        run_ocrmypdf_api(infile, no_outpdf, '--sidecar', no_outpdf)
def test_devnull_sidecar(resources):
    """A bare --sidecar with os.devnull as the output raises BadArgsError."""
    infile = resources / 'trivial.pdf'
    with pytest.raises(BadArgsError, match=r'--sidecar.*NUL'):
        run_ocrmypdf_api(infile, os.devnull, '--sidecar')
================================================
FILE: tests/test_verapdf.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: CC-BY-SA-4.0
"""Tests for verapdf wrapper and speculative PDF/A conversion."""
from __future__ import annotations
import pikepdf
import pytest
from pikepdf import Name
from ocrmypdf._exec import verapdf
from ocrmypdf.pdfa import (
_pdfa_part_conformance,
add_pdfa_metadata,
add_srgb_output_intent,
speculative_pdfa_conversion,
)
class TestVerapdfModule:
    """Checks for the verapdf wrapper module."""

    def test_output_type_to_flavour(self):
        """Each output type maps to the expected flavour string."""
        expected_flavours = [
            ('pdfa', '2b'),
            ('pdfa-1', '1b'),
            ('pdfa-2', '2b'),
            ('pdfa-3', '3b'),
            # Unknown should default to 2b
            ('unknown', '2b'),
        ]
        for output_type, flavour in expected_flavours:
            assert verapdf.output_type_to_flavour(output_type) == flavour

    @pytest.mark.skipif(not verapdf.available(), reason='verapdf not installed')
    def test_version(self):
        """The reported verapdf version has a major component of at least 1."""
        assert verapdf.version().major >= 1

    @pytest.mark.skipif(not verapdf.available(), reason='verapdf not installed')
    def test_validate_non_pdfa(self, tmp_path):
        """A plain one-page PDF fails 2b validation with failed rules reported."""
        plain_pdf = tmp_path / 'test.pdf'
        with pikepdf.new() as pdf:
            pdf.add_blank_page()
            pdf.save(plain_pdf)

        outcome = verapdf.validate(plain_pdf, '2b')

        assert not outcome.valid
        assert outcome.failed_rules > 0
class TestPdfaPartConformance:
    """Checks for the _pdfa_part_conformance helper."""

    def test_pdfa_part_conformance(self):
        """Each output type maps to the expected (part, conformance) pair."""
        expected_pairs = [
            ('pdfa', ('2', 'B')),
            ('pdfa-1', ('1', 'B')),
            ('pdfa-2', ('2', 'B')),
            ('pdfa-3', ('3', 'B')),
            # Unknown should default to 2B
            ('unknown', ('2', 'B')),
        ]
        for output_type, part_conformance in expected_pairs:
            assert _pdfa_part_conformance(output_type) == part_conformance
class TestAddPdfaMetadata:
    """Checks for the add_pdfa_metadata function."""

    def test_add_pdfa_metadata(self, tmp_path):
        """pdfa_status reads '2B' right after the call and after reopening the file."""
        pdf_path = tmp_path / 'test.pdf'
        with pikepdf.new() as blank:
            blank.add_blank_page()
            blank.save(pdf_path)

        with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
            add_pdfa_metadata(pdf, '2', 'B')
            with pdf.open_metadata() as meta:
                assert meta.pdfa_status == '2B'
            pdf.save(pdf_path)

        # Reopen from disk to confirm the status survived the save.
        with pikepdf.open(pdf_path) as reopened:
            with reopened.open_metadata() as meta:
                assert meta.pdfa_status == '2B'
class TestAddSrgbOutputIntent:
    """Checks for the add_srgb_output_intent function."""

    @staticmethod
    def _write_blank_pdf(tmp_path):
        """Save a one-page blank PDF as test.pdf in tmp_path and return its path."""
        pdf_path = tmp_path / 'test.pdf'
        with pikepdf.new() as blank:
            blank.add_blank_page()
            blank.save(pdf_path)
        return pdf_path

    def test_add_srgb_output_intent(self, tmp_path):
        """One OutputIntent identified as 'sRGB' is added to the document root."""
        pdf_path = self._write_blank_pdf(tmp_path)

        with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
            add_srgb_output_intent(pdf)

            assert Name.OutputIntents in pdf.Root
            intents = pdf.Root.OutputIntents
            assert len(intents) == 1
            identifier = intents[0].get(Name.OutputConditionIdentifier)
            assert str(identifier) == 'sRGB'
            pdf.save(pdf_path)

    def test_add_srgb_output_intent_idempotent(self, tmp_path):
        """Calling the function twice still leaves exactly one OutputIntent."""
        pdf_path = self._write_blank_pdf(tmp_path)

        with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
            add_srgb_output_intent(pdf)
            add_srgb_output_intent(pdf)  # Second call should be a no-op

            assert len(pdf.Root.OutputIntents) == 1
            pdf.save(pdf_path)
class TestSpeculativePdfaConversion:
    """Checks for speculative_pdfa_conversion."""

    def test_speculative_conversion_creates_pdfa_structures(self, tmp_path, resources):
        """The converted file exists, has OutputIntents and reports status '2B'."""
        converted = speculative_pdfa_conversion(
            resources / 'graph.pdf', tmp_path / 'output.pdf', 'pdfa-2'
        )
        assert converted.exists()

        with pikepdf.open(converted) as pdf:
            assert Name.OutputIntents in pdf.Root
            with pdf.open_metadata() as meta:
                assert meta.pdfa_status == '2B'

    def test_speculative_conversion_different_parts(self, tmp_path, resources):
        """Each requested PDF/A part is reflected in the output's pdfa_status."""
        input_pdf = resources / 'graph.pdf'
        expected_status_by_type = {
            'pdfa-1': '1B',
            'pdfa-2': '2B',
            'pdfa-3': '3B',
        }
        for output_type, expected_status in expected_status_by_type.items():
            output_pdf = tmp_path / f'output_{output_type}.pdf'
            speculative_pdfa_conversion(input_pdf, output_pdf, output_type)

            with pikepdf.open(output_pdf) as pdf:
                with pdf.open_metadata() as meta:
                    assert meta.pdfa_status == expected_status
@pytest.mark.skipif(not verapdf.available(), reason='verapdf not installed')
class TestVerapdfIntegration:
    """Integration checks that need a working verapdf installation."""

    def test_speculative_conversion_validation(self, tmp_path, resources):
        """Validate a speculatively converted file with verapdf.

        Only the shape of the validation result is checked, not whether the
        file passes: most test PDFs have problems (fonts, colorspaces, etc.)
        that need Ghostscript to repair, so a failing verdict is acceptable.
        """
        converted_pdf = tmp_path / 'output.pdf'
        speculative_pdfa_conversion(resources / 'graph.pdf', converted_pdf, 'pdfa-2')

        # Validation must produce a result, whatever its verdict.
        outcome = verapdf.validate(converted_pdf, '2b')

        assert isinstance(outcome.valid, bool)
        assert isinstance(outcome.failed_rules, int)
================================================
FILE: tests/test_watcher.py
================================================
from __future__ import annotations
import datetime as dt
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path
import pytest
# Skip every test in this module when the watchdog package cannot be imported.
watchdog = pytest.importorskip('watchdog')
@pytest.mark.parametrize('year_month', [True, False])
def test_watcher(tmp_path, resources, year_month):
    """Run misc/watcher.py as a subprocess and check that it emits an output PDF.

    The watcher is started on fresh input, output and processed directories
    under ``tmp_path``. After a startup delay, ``trivial.pdf`` is copied into
    the input directory; after a second delay the output file must exist.
    When ``year_month`` is true, OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 is set and
    the file is expected under ``<year>/<month>`` inside the output directory.

    The watcher process is always terminated, even if an assertion fails.
    """
    input_dir = tmp_path / 'input'
    output_dir = tmp_path / 'output'
    processed_dir = tmp_path / 'processed'
    for directory in (input_dir, output_dir, processed_dir):
        directory.mkdir()

    env_extra = {'OCR_OUTPUT_DIRECTORY_YEAR_MONTH': '1'} if year_month else {}
    watcher_script = Path(__file__).parent.parent / 'misc' / 'watcher.py'
    proc = subprocess.Popen(
        [
            sys.executable,
            str(watcher_script),
            str(input_dir),
            str(output_dir),
            str(processed_dir),
        ],
        cwd=str(tmp_path),
        env=os.environ.copy() | env_extra,
    )
    try:
        # Delay before dropping the file in, so the watcher is up first.
        time.sleep(5)
        shutil.copy(resources / 'trivial.pdf', input_dir / 'trivial.pdf')
        # Delay before looking for the output file.
        time.sleep(5)

        if year_month:
            # Read the date once so year and month come from the same instant.
            today = dt.date.today()
            expected_pdf = (
                output_dir / f'{today.year}' / f'{today.month:02d}' / 'trivial.pdf'
            )
        else:
            expected_pdf = output_dir / 'trivial.pdf'
        assert expected_pdf.exists()
    finally:
        # Without this, a failed assertion would leave the watcher running.
        proc.terminate()
        proc.wait()