Repository: kcroker/dpsprep
Branch: master
Commit: 93f377b6e0d0
Files: 32
Total size: 63.1 KB

Directory structure:
gitextract_m64660jx/

├── .github/
│   └── workflows/
│       └── test.yml
├── .gitignore
├── .python-version
├── CHANGELOG.md
├── LICENSE
├── Makefile
├── README.md
├── dpsprep/
│   ├── __init__.py
│   ├── conftest.py
│   ├── dpsprep.py
│   ├── images.py
│   ├── logging.py
│   ├── ocrmypdf.py
│   ├── outline.py
│   ├── pdf.py
│   ├── py.typed
│   ├── sexpr.py
│   ├── test_images.py
│   ├── test_outline.py
│   ├── test_text.py
│   ├── text.py
│   └── workdir.py
├── dpsprep.1
├── dpsprep.1.ronn
├── fixtures/
│   ├── .gitattributes
│   ├── Makefile
│   ├── lipsum.tex
│   ├── lipsum_01.txt
│   ├── lipsum_lines.djvu
│   ├── lipsum_words.djvu
│   └── lipsum_words_invalid.djvu
└── pyproject.toml

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/test.yml
================================================
# CI workflow: lint and test the project on every push, on Ubuntu and macOS.
name: Run tests

on: [push]

jobs:
  test:
    strategy:
      # Let the other OS finish even if one of them fails
      fail-fast: false
      matrix:
        os: [ubuntu-24.04, macos-14]

    runs-on: ${{ matrix.os }}

    steps:
    - uses: actions/checkout@v3

    # DjVuLibre is a native prerequisite and must be installed per platform
    - name: Install prerequisites on Ubuntu
      if: matrix.os == 'ubuntu-24.04'
      run: |
        sudo apt update
        sudo apt install --yes libdjvulibre21 libdjvulibre-dev

    - name: Install prerequisites on macOS
      if: matrix.os == 'macos-14'
      run: brew install djvulibre libtiff

    # Windows is currently not part of the matrix; the steps are kept for reference
    # - name: Install prerequisites on Windows
    #   if: matrix.os == 'windows-2022'
    #   run: |
    #     choco install djvu-libre
    #     vcpkg install tiff

    - uses: astral-sh/setup-uv@v7
    - name: Install dependencies
      run: uv sync --all-extras

    # The "lint" and "test" targets are defined in the top-level Makefile
    - name: Lint
      run: make lint

    - name: Test
      run: make test


================================================
FILE: .gitignore
================================================
.pytest_cache
.ruff_cache
.tests


================================================
FILE: .python-version
================================================
3.11


================================================
FILE: CHANGELOG.md
================================================
## v2.5.4 (2026-04-24)

* Run `uv` security audit and update some dependencies

## v2.5.3 (2026-03-25)

* Fix broken workflow without text layer translation
* Shorter names for temporary directories
* Code maintenance

## v2.5.2 (2026-03-25)

* Relax dependency versions

## v2.5.1 (2026-03-14)

* Allow manually configuring PDF page resolution (DPI)

## v2.5.0 (2026-03-13)

* Account for DjVu file resolution
* Simplify image diffing and regenerate better-quality fixtures

## v2.4.2 (2026-02-24)

* Fix issue where only the main process has its logger configured

## v2.4.1 (2026-02-24)

* Fix compatibility issues with the new OCRmyPDF API
* Remove support for Python 3.10

## v2.4.0 (2026-02-24)

* Migrate to `uv` from `pyenv` + `poetry`
* Update dependencies

## v2.3.1 (2025-10-28)

* Fix mixed-up email format

## v2.3.0 (2025-10-28)

* Remove support for Python 3.9
* Migrate to standardized `pyproject.toml`
* Update dependencies

## v2.2.15 (2025-07-02)

* Add support for installation via `pipx`

## v2.2.14 (2025-05-27)

* Improve installation notes
* Bump djvulibre-python version

## v2.2.13 (2025-02-12)

* Fail-safe quality settings for non-JPEG images

## v2.2.12 (2025-01-27)

* Update pytest_image_diff and fix newly broken tests

## v2.2.11 (2025-01-26)

* Update dependencies

## v2.2.10 (2024-10-25)

* Improve interface with OCRmyPDF
* Fix CI build

## v2.2.9 (2024-10-25)

* Improve type hints
* Update dependencies

## v2.2.8 (2024-10-18)

* Support single characters in the text layer

## v2.2.7 (2024-08-27)

* Improve tab and newline handling

## v2.2.6 (2024-08-05)

* Fix accidental whitespace removal from text blocks

## v2.2.5 (2024-07-20)

* Re-add ability to force the image mode (RGB/Grayscale/Monochrome)

## v2.2.4 (2024-02-24)

* Update dependencies

## v2.2.3 (2023-12-09)

* Fix CI build
* Ignore invalid UTF-8 sequences
* Ignore unrecognized page titles in the outline (#23)

## v2.2.2 (2023-10-29)

* Update dependencies

## v2.2.1 (2023-11-06)

* Handle invalid PDF pages
* Fix exception in text layer processing (#20)

## v2.2.0 (2023-10-28)

* Add options for disabling the text layer and for directly running OCR

## v2.1.5 (2023-10-27)

* Fix inverted colors in images (#16)

## v2.1.4 (2023-10-06)

* Fix typo in logging code

## v2.1.3 (2023-10-06)

* Improve logging

## v2.1.2 (2023-10-02)

* Accidental version bump

## v2.1.1 (2023-10-02)

* Remove debug code

## v2.1.0 (2023-10-02)

* Add support for OCRmyPDF

## v2.0.2 (2023-08-03)

* Update some other dependencies
* Replace `python-djvulibre` with `djvulibre-python`

## v2.0.1 (2023-06-22)

* Minor improvements in packaging

## v2.0.0 (2023-05-04)

* Full rewrite


================================================
FILE: LICENSE
================================================
Copyright (C) 2015 Kevin Arthur Schiff Croker

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.


================================================
FILE: Makefile
================================================
# "lint" and "test" do not produce files, so they are declared phony.
.PHONY: lint test

# Static checks: ruff for linting and mypy for type checking.
lint:
	uv run ruff check
	uv run mypy

# Run the test suite via pytest.
test:
	uv run pytest

# Regenerate the roff man page from its ronn source.
dpsprep.1: dpsprep.1.ronn
	ronn --roff dpsprep.1.ronn


================================================
FILE: README.md
================================================
# dpsprep

[![Tests](https://github.com/kcroker/dpsprep/actions/workflows/test.yml/badge.svg)](https://github.com/kcroker/dpsprep/actions/workflows/test.yml) [![AUR Package](https://img.shields.io/aur/version/dpsprep)](https://aur.archlinux.org/packages/dpsprep)

This tool, initially made specifically for use with Sony's Digital Paper System (DPS), is now a general-purpose DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines (e.g. TOC) and text layers (e.g. OCR).

## Usage

Full example (the name of the PDF is optional and inferred from the input name):

    dpsprep --pool-size=8 --quality=50 input.djvu output.pdf

If you have [OCRmyPDF](https://github.com/ocrmypdf/OCRmyPDF) installed, you can use its PDF optimizer:

    dpsprep -O3 input.djvu

You can also skip translating the text layer (it is sometimes not translated well) and redo the OCR (rather than launching the `ocrmypdf` CLI, we use the API directly and accept options in JSON format):

    dpsprep --ocr '{"language": ["rus", "eng"]}' input.djvu

Consult the man file ([online](./dpsprep.1.ronn)) for details; there are a lot of options to consider.

See the next section for different ways to run the program.

## Installation

### Automated

An easy way to install a `dpsprep` executable for the current user is via [`uv`](https://docs.astral.sh/uv/):

    uv tool install dpsprep --from git+https://github.com/kcroker/dpsprep

For better compression (see below), the `compress` extra must be specified:

    uv tool install dpsprep --from git+https://github.com/kcroker/dpsprep[compress]

Sometimes a particular feature branch needs to be tested. For installing a fixed revision (i.e. commit/branch/tag), the following should work (if `extra-name` is needed, use `dpsprep@rev[extra-name]`):

    uv tool install dpsprep --from git+https://github.com/kcroker/dpsprep@rev

The only hard prerequisite is `djvulibre` (e.g. `djvulibre` on Arch, `libdjvulibre-dev` on Ubuntu, etc.). We use the Python bindings from the package [`djvulibre-python`](https://github.com/FriedrichFroebel/python-djvulibre) (not to be confused with the unmaintained [`python-djvulibre`](https://github.com/jwilk-archive/python-djvulibre); see [this pull request](https://github.com/kcroker/dpsprep/pull/10)).

> [!TIP]
> A few people have reported installation problems; see [this possible solution](https://github.com/kcroker/dpsprep/issues/38) and [this sample Dockerfile](https://github.com/kcroker/dpsprep/pull/37).

> [!NOTE]
> Note that Windows support in `djvulibre-python` requires 64-bit `djvulibre`, and they only officially distribute 32-bit Windows packages. If you manage to make it work, consider opening a pull request.

Optional prerequisites are:
* `libtiff` for bitonal image compression.
* `libjpeg` (or `libjpeg-turbo`) for multitonal (RGB or grayscale) compression.
* `OCRmyPDF` and `jbig2enc` for PDF optimization (see the next section).

`libtiff` depends on `libjpeg`, so installing `libtiff` will likely install both.

For details on how these dependencies can be installed, see the GitHub Actions [workflow](./.github/workflows/test.yml) and the [dpsprep](https://aur.archlinux.org/packages/dpsprep) package for Arch Linux.

### Manual

Setting up the project is again done via `uv`. Once inside the cloned repository, the environment for the program can be set up by simply running `uv sync --all-extras`. After that, the following should work:

    uv run dpsprep [OPTIONS] SRC [DEST]

> [!NOTE]
> Previous versions used [`pyenv`](https://github.com/pyenv/pyenv) for managing Python versions and [`poetry`](https://python-poetry.org/) for managing dependencies and building. Since then the project migrated to `uv`, which subsumes both and provides other niceties.

You can also build and install the project, for example via [`pipx`](https://pipx.pypa.io/en/stable/):

    uv build --wheel
    pipx install --include-deps dist/*.whl

> [!TIP]
> The build can fail if the [`uv_build`](https://docs.astral.sh/uv/concepts/build-backend/) Python package is not installed. Make sure not only the `uv` binary, but also the corresponding Python package is available. For example, in the Arch repositories, these are distinct packages, `uv` and `python-uv`. Alternatively, try to install the [`uv-build`](https://pypi.org/project/uv-build/) PyPI package (`python-uv-build` in Arch) explicitly in this case.

If you want `dpsprep` to be able to use `ocrmypdf` from `pipx`'s isolated environment, you must [inject](https://fig.io/manual/pipx/inject) it explicitly via

    pipx inject dpsprep ocrmypdf

> [!TIP]
> If you are packaging this for some other package manager, consider using PEP-517 tools as shown in [this PKGBUILD file](https://aur.archlinux.org/cgit/aur.git/tree/PKGBUILD?h=dpsprep).

> [!NOTE]
> Previous versions of the tool itself used to depend on third-party binaries, but this is no longer the case. The test fixtures are checked in, however regenerating them (see [`./fixtures/Makefile`](./fixtures/Makefile)) requires `pdflatex` (texlive, among others), `gs` (Ghostscript), `oxipng` (oxipng), `pdftotext` (Poppler), `djvudigital` (GSDjVU) and `djvused` (DjVuLibre). Similarly, the man file is checked in, but building it from markdown depends on `ronn`.

## Details

### Compression

We perform compression in two stages:

* The first one is the default compression provided by [Pillow](https://github.com/python-pillow/Pillow). For bitonal images, [the PDF generation code says](https://github.com/python-pillow/Pillow/blob/a088d54509e42e4eeed37d618b42d775c0d16ef5/src/PIL/PdfImagePlugin.py#L138C16-L138C16) that, if `libtiff` is available, `group4` compression is used.

* If [OCRmyPDF](https://github.com/ocrmypdf/OCRmyPDF) is installed, its PDF optimization can be used via the flags `-O1` to `-O3` (this involves no OCR). This allows us to use advanced techniques, including JBIG2 compression via `jbig2enc`.

If manually running OCRmyPDF, note that the optimization command suggested [in the documentation](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html#optimize-images-without-performing-ocr) (setting `--tesseract-timeout` to `0`) may ruin existing text layers. To perform only PDF optimization you can use the following undocumented tool instead:

    python -m ocrmypdf.optimize <input_file> <level> <output_file>

### Text layer

The visible contents of a DjVu file are well-compressed images (see [here](http://yann.lecun.com/ex/djvu/index.html)). But a DjVu file also contains a "text layer" stored as metadata attached to invisible rectangular blocks. PDF does not support such constructs, so we do a little hack.

We render each page as an image and put it as a background in the PDF. We then use a font, [`invisible1.ttf`](./dpsprep/invisible.ttf), taken from [here](https://www.angelfire.com/pr/pgpf/if.html), to "draw" text. Every time we draw a block of text, we rescale the font so that the width of the text matches that of the corresponding DjVu block.

> [!NOTE]
> The font is small (12kb) and contains (invisible) Latin, Cyrillic and Greek characters. Even Chinese characters seem to be working correctly, at least with [Evince](https://gitlab.gnome.org/GNOME/evince).

The following screenshot displays the result of converting a DjVu document:

![Image](./screenshots/lipsum_with_image.png)

The following screenshot displays the same document without the background image and with the invisible font replaced by Times New Roman:

![Image](./screenshots/lipsum_with_text.png)

Since the image is actually drawn on top of the text, there is no harm in using an actual visible font, possibly rendered using a transparent "color". Still, when searching and selecting text, the scrambled letters from the second image would be highlighted. With the invisible font, there are no visible glyphs to highlight, so an illusory "block" containing the text is highlighted instead.

See [`./dpsprep/text.py`](./dpsprep/text.py) for the implementation.

## Kevin's notes regarding the first version

I wrote this with the specific intent of converting ebooks in the DJVU format into PDFs for use with the fantastic (but pricey) 
Sony Digital Paper System.

DjVu technology is strikingly superior for many ebook applications, yet the Sony Digital Paper System (rev 1.3 US)
only supports PDF technology: this is because its primary design purpose is not as an ereader.  The device, however, 
is quite nearly the **perfect** ereader.

Unfortunately, all presently available DjVu to PDF tools seem to just dump flattened enormous TIFF images.  This is ridiculous.
Since PDF really can't do that much better on the way it stores image data, a 5-6x bloat cannot be avoided.  However, none of the 
existing tools preserve:

* The OCR'd text content
* Table of Contents or Internal links

This is kind of silly, but until Sony's Digital Paper, there was no need to move functional DjVu files to PDFs.
In order to make workable PDFs from DjVu files for use on the Digital Paper System, I have implemented in one location the following
procedures detailed here:

By automating the procedure of user zetah for extracting the text and getting it in the correct locations:
http://askubuntu.com/questions/46233/converting-djvu-to-pdf (OCR text transfer)

By implementing the procedure of user pyrocrasty for extracting the outline, and putting it into the PDF generated above:
http://superuser.com/questions/801893/converting-djvu-to-pdf-and-preserving-table-of-contents-how-is-it-possible (bookmark transfer)


================================================
FILE: dpsprep/__init__.py
================================================
from .dpsprep import dpsprep


__all__ = ['dpsprep']


================================================
FILE: dpsprep/conftest.py
================================================
import loguru
import pytest


@pytest.fixture(autouse=True)
def disable_loguru() -> None:
    """Remove all loguru handlers before every test (the fixture is autouse)."""
    loguru.logger.remove()


================================================
FILE: dpsprep/dpsprep.py
================================================
import json
import multiprocessing.pool
import shutil
from time import time

import click
import djvu.decode
import loguru
import pdfrw

from .images import ImageMode, failsafe_save_djvu_page, process_djvu_page
from .logging import configure_loguru, human_readable_size
from .ocrmypdf import optimize_pdf, perform_ocr
from .outline import OutlineTransformVisitor
from .pdf import combine_pdfs_on_fs_with_text, combine_pdfs_on_fs_without_text, is_valid_pdf
from .text import djvu_pages_to_text_fpdf
from .workdir import WorkingDirectory


def process_page_bg(workdir: WorkingDirectory, mode: ImageMode, quality: int | None, dpi: int | None, i: int, *, verbose: bool) -> None:  # noqa: PLR0913
    """Render the background image of page ``i`` (zero-based) into its own PDF file.

    This function is submitted to the multiprocessing pool, so it configures
    loguru itself and opens its own copy of the DjVu document. A page whose PDF
    already exists in the working directory and passes ``is_valid_pdf`` is skipped.
    """
    configure_loguru(verbose=verbose)
    page_number = i + 1
    target = workdir.get_page_pdf_path(i)

    if not target.exists():
        loguru.logger.debug(f'Processing image data from page {page_number}.')
    elif is_valid_pdf(target):
        loguru.logger.debug(f'Image data from page {page_number} already processed.')
        return
    else:
        loguru.logger.debug(f'Invalid page generated for {page_number}, regenerating.')

    started_at = time()
    source_uri = djvu.decode.FileURI(workdir.src)
    document = djvu.decode.Context().new_document(source_uri)
    document.decoding_job.wait()

    page_bg = process_djvu_page(document.pages[i], mode, i)
    failsafe_save_djvu_page(page_bg, target, quality, dpi, page_number)

    size_text = human_readable_size(target.stat().st_size)
    loguru.logger.debug(f'Image data with size {size_text} from page {page_number} processed in {time() - started_at:.2f}s and written to working directory.')


def process_text(workdir: WorkingDirectory, dpi: int | None, *, verbose: bool) -> None:
    """Convert the text layer of the whole document into a single PDF file.

    This function is submitted to the multiprocessing pool, so it configures
    loguru itself and opens its own copy of the DjVu document. Nothing is done
    if the text layer PDF already exists in the working directory.
    """
    configure_loguru(verbose=verbose)
    text_pdf = workdir.text_layer_pdf_path

    if text_pdf.exists():
        loguru.logger.info('Text data already processed.')
        return

    loguru.logger.debug('Processing text data.')
    started_at = time()

    source_uri = djvu.decode.FileURI(workdir.src)
    document = djvu.decode.Context().new_document(source_uri)
    document.decoding_job.wait()

    djvu_pages_to_text_fpdf(document.pages, dpi).output(str(text_pdf))

    size_text = human_readable_size(text_pdf.stat().st_size)
    loguru.logger.info(f'Text data with size {size_text} processed in {time() - started_at:.2f}s and written to working directory')


@click.option('-d', '--delete-working', is_flag=True, help='Delete any existing files in the working directory prior to writing to it.')
@click.option('-w', '--preserve-working', is_flag=True, help='Preserve the working directory after script termination.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite destination file.')
@click.option('-v', '--verbose', is_flag=True, help='Display debug messages.')
@click.option('-t', '--no-text', is_flag=True, help='Disable the generation of text layers. Implied by --ocr.')
@click.option('-O1', 'optlevel', flag_value=1, help='Use the lossless PDF image optimization from OCRmyPDF (without performing OCR).')
@click.option('-O2', 'optlevel', flag_value=2, help='Use the PDF image optimization from OCRmyPDF.')
@click.option('-O3', 'optlevel', flag_value=3, help='Use the aggressive lossy PDF image optimization from OCRmyPDF.')
# multiprocessing.Pool rejects fewer than one process, so zero is refused at the CLI level
@click.option('-p', '--pool-size', type=click.IntRange(min=1), default=4, help='Size of MultiProcessing pool for handling page-by-page operations.')
@click.option('-q', '--quality', type=click.IntRange(min=0, max=100), help="Quality of images in output. Used only for JPEG compression, i.e. RGB and Grayscale images. Passed directly to Pillow and to OCRmyPDF's optimizer.")
@click.option('-m', '--mode', type=click.Choice(['infer', 'bitonal', 'grayscale', 'rgb']), default='infer', help='Override the image modes encoded in the DjVu file for individual pages. It sometimes makes sense to force bitonal images since they compress well.')
@click.option('--dpi', type=click.IntRange(min=1), help='Override DPI values encoded in the DjVu file for individual pages.')
@click.option('--ocr', type=str, is_flag=False, flag_value='{}', help='Perform OCR via OCRmyPDF rather than trying to convert the text layer. If this parameter has a value, it should be a JSON dictionary of options to be passed to OCRmyPDF.')
@click.version_option()
@click.argument('dest', type=click.Path(exists=False, resolve_path=True), required=False)
@click.argument('src', type=click.Path(exists=True, resolve_path=True), required=True)
@click.command()
def dpsprep(  # noqa: C901, PLR0912, PLR0913, PLR0915
    src: str,
    dest: str | None,
    quality: int | None,
    dpi: int | None,
    pool_size: int,
    mode: ImageMode,
    optlevel: int | None,
    ocr: str | None,
    *,
    verbose: bool,
    overwrite: bool,
    delete_working: bool,
    preserve_working: bool,
    no_text: bool,
) -> None:
    # Command-line entry point: convert the DjVu file `src` into the PDF `dest`.
    #
    # NOTE: This is deliberately a comment rather than a docstring, because click
    # would otherwise display the docstring as the help text of the command.
    #
    # Steps:
    #   1. Validate the OCR options and the destination.
    #   2. Prepare the working directory.
    #   3. Render every page (and, unless disabled, the text layer) in a process pool.
    #   4. Convert the outline and combine everything into a single PDF.
    #   5. Optionally run OCR and/or the OCRmyPDF optimizer.
    #   6. Copy the result to the destination and clean up.
    #
    # Exits via SystemExit if the OCR options are not a JSON dictionary or if the
    # destination exists and --overwrite was not given.
    configure_loguru(verbose=verbose)
    workdir = WorkingDirectory(src, dest)

    if ocr is None:
        ocr_options = None
    else:
        try:
            ocr_options = json.loads(ocr)
        except ValueError as err:
            msg = f'The OCR options {ocr!r} are not valid JSON.'
            raise SystemExit(msg) from err
        else:
            if not isinstance(ocr_options, dict):
                msg = f'The OCR options {ocr!r} are not a JSON dictionary.'
                raise SystemExit(msg)

        # Running OCR replaces the translated text layer
        no_text = True

    if not overwrite and workdir.dest.exists():
        msg = f'File {workdir.dest} already exists.'
        raise SystemExit(msg)

    start_time = time()

    if workdir.workdir.exists():
        if delete_working:
            loguru.logger.debug(f'Removing existing working directory {workdir.workdir}.')
            workdir.destroy()
            loguru.logger.info(f'Removed existing working directory {workdir.workdir}.')
        else:
            loguru.logger.info(f'Reusing working directory {workdir.workdir}.')
    else:
        loguru.logger.info(f'Working directory {workdir.workdir} has been created.')

    workdir.create_if_necessary()

    document = djvu.decode.Context().new_document(
        djvu.decode.FileURI(workdir.src),
    )
    document.decoding_job.wait()

    djvu_size = workdir.src.stat().st_size
    loguru.logger.info(f'Processing {workdir.src} with {len(document.pages)} pages and size {human_readable_size(djvu_size)} using {pool_size} workers.')

    # The context manager terminates the pool on exit, so that no workers linger
    # if a task fails and its exception propagates out of the loop below.
    with multiprocessing.Pool(processes=pool_size) as pool:
        tasks = list[multiprocessing.pool.AsyncResult]()

        if not no_text:
            tasks.append(pool.apply_async(func=process_text, args=[workdir, dpi], kwds={'verbose': verbose}))

        for i in range(len(document.pages)):
            # Cannot pass the page object itself because it does not support serialization for IPC
            tasks.append(pool.apply_async(func=process_page_bg, args=[workdir, mode, quality, dpi, i], kwds={'verbose': verbose}))

        pool.close()
        pool_is_working = True

        # Poll with a timeout rather than blocking indefinitely on a single task;
        # task.get() also re-raises any exception raised inside a worker.
        while pool_is_working:
            pool_is_working = False

            for task in tasks:
                try:
                    task.get(timeout=25)
                except multiprocessing.TimeoutError:
                    pool_is_working = True

        pool.join()

    loguru.logger.info('Processed all pages.')

    outline = pdfrw.IndirectPdfDict()

    if len(document.outline.sexpr) > 0:
        loguru.logger.info('Processing metadata.')
        outline = OutlineTransformVisitor().visit(document.outline.sexpr)
        loguru.logger.info('Metadata processed.')
    else:
        loguru.logger.info('No metadata to process.')

    loguru.logger.info('Combining everything.')

    if no_text:
        combine_pdfs_on_fs_without_text(workdir, outline, len(document.pages))

        ocr_success = False

        # Compare against None explicitly: a bare "--ocr" yields an empty (falsy)
        # dictionary of options, which must still trigger OCR with default settings.
        if ocr_options is not None:
            loguru.logger.info('Performing OCR.')
            ocr_success = perform_ocr(workdir, ocr_options)
        else:
            loguru.logger.info('Skipping the text layer.')

        if not ocr_success:
            shutil.copy(workdir.combined_pdf_without_text_path, workdir.combined_pdf_path)
    else:
        combine_pdfs_on_fs_with_text(workdir, outline)

    combined_size = workdir.combined_pdf_path.stat().st_size
    loguru.logger.info(f'Produced a combined output file with size {human_readable_size(combined_size)} in {time() - start_time:.2f}s. This is {round(100 * combined_size / djvu_size, 2)}% of the DjVu source file.')

    opt_success = False

    if optlevel is not None:
        loguru.logger.info(f'Performing level {optlevel} optimization.')
        opt_success = optimize_pdf(workdir, optlevel, quality, pool_size)

    if opt_success:
        opt_size = workdir.optimized_pdf_path.stat().st_size

        loguru.logger.info(f'The optimized file has size {human_readable_size(opt_size)}, which is {round(100 * opt_size / combined_size, 2)}% of the raw combined file and {round(100 * opt_size / djvu_size, 2)}% of the DjVu source file.')

        # The optimizer is not guaranteed to shrink the file, so keep the smaller one
        if opt_size < combined_size:
            loguru.logger.info('Using the optimized file.')
            shutil.copy(workdir.optimized_pdf_path, workdir.dest)
        else:
            loguru.logger.info('Using the raw combined file.')
            shutil.copy(workdir.combined_pdf_path, workdir.dest)
    else:
        shutil.copy(workdir.combined_pdf_path, workdir.dest)

    if preserve_working:
        loguru.logger.info(f'Working directory {workdir.workdir} will be preserved.')
    else:
        loguru.logger.info(f'Deleting the working directory {workdir.workdir}.')
        workdir.destroy()


================================================
FILE: dpsprep/images.py
================================================
import pathlib
from typing import Literal, NamedTuple

import djvu.decode
import loguru
import PIL.features
from PIL import Image, ImageOps


# Image modes accepted from the command line; 'infer' is resolved per page in process_djvu_page
ImageMode = Literal['rgb', 'grayscale', 'bitonal', 'infer']


# DjVuLibre pixel formats used for rendering, keyed by (resolved) image mode
djvu_pixel_formats = {
    'rgb': djvu.decode.PixelFormatRgb(byte_order='RGB'),
    'grayscale': djvu.decode.PixelFormatGrey(),
    'bitonal': djvu.decode.PixelFormatPackedBits('>'),
}


# Apply the same row/axis orientation flags to every pixel format
# NOTE(review): presumably chosen so that the rendered buffer matches the row order Pillow expects -- confirm
for pixel_format in djvu_pixel_formats.values():
    pixel_format.rows_top_to_bottom = 1
    pixel_format.y_top_to_bottom = 0


# Pillow image modes corresponding to the keys of djvu_pixel_formats
pil_modes = {
    'rgb': 'RGB',
    'grayscale': 'L',
    'bitonal': '1',
}


class ProcessedPageBackground(NamedTuple):
    """A rendered page image together with the resolution reported for the page."""

    # The page rendered as a Pillow image
    pil_image: Image.Image
    # The DPI of the decoded page; used as the PDF resolution unless overridden
    resolution: int


def process_djvu_page(page: djvu.decode.Page, mode: ImageMode, i: int) -> ProcessedPageBackground:
    """Decode a DjVu page and render it into a Pillow image.

    Args:
        page: The DjVu page to decode.
        mode: The requested image mode. With 'infer', pages that DjVuLibre
            reports as bitonal are rendered as bitonal and all others as RGB.
        i: Zero-based page index, used only for log messages.

    Returns:
        The rendered image and the page DPI. If libdjvu reports the page data
        as not available, a blank bitonal page of the same size is returned.
    """
    job = page.decode(wait=True)
    size = job.size
    width, height = size
    full_page = (0, 0, width, height)
    pixels = bytearray(3 * width * height)  # RGB at most

    if mode == 'infer':
        if job.type == djvu.decode.PAGE_TYPE_BITONAL:
            mode = 'bitonal'
        else:
            mode = 'rgb'

    if mode != 'bitonal':
        if not PIL.features.check_codec('jpg'):
            loguru.logger.warning('Multitonal image compression may suffer because Pillow has been built without libjpeg support.')
    elif not PIL.features.check_codec('libtiff'):
        loguru.logger.warning('Bitonal image compression may suffer because Pillow has been built without libtiff support.')

    try:
        job.render(
            # RENDER_COLOR is simply a default value and doesn't actually imply colors
            mode=djvu.decode.RENDER_COLOR,
            page_rect=full_page,
            render_rect=full_page,
            pixel_format=djvu_pixel_formats[mode],
            buffer=pixels,
        )
    except djvu.decode.NotAvailable:
        loguru.logger.warning(f'libdjvu claims that data for page {i + 1} is not available. Producing a blank page.')
        blank = Image.new(pil_modes['bitonal'], size, 1)
        return ProcessedPageBackground(blank, job.dpi)

    image = Image.frombuffer(pil_modes[mode], size, pixels, 'raw')

    if mode == 'bitonal':
        # I have experimentally determined that we need to invert the black-and-white images. -- Ianis, 2023-05-13
        # See also https://github.com/kcroker/dpsprep/issues/16
        image = ImageOps.invert(image)

    return ProcessedPageBackground(image, job.dpi)


def failsafe_save_djvu_page(page_bg: ProcessedPageBackground, target: pathlib.Path, quality: int | None, dpi: int | None, page_number: int) -> None:
    """Save a rendered page as a single-page PDF, retrying without quality on failure.

    Args:
        page_bg: The rendered page image and its resolution.
        target: Path of the PDF file to write.
        quality: Optional quality setting passed to Pillow. If saving with it
            raises ValueError, the page is saved again without it.
        dpi: Optional override for the resolution stored in ``page_bg``.
        page_number: One-based page number, used only for log messages.
    """
    resolution = dpi or page_bg.resolution

    if quality is not None:
        # Compare modes for equality; a membership test on a string would be a substring test
        if page_bg.pil_image.mode == pil_modes['bitonal'] and PIL.features.check_codec('libtiff'):
            loguru.logger.warning('Pillow uses TIFF for encoding bitonal PDF images. The encoder does not support a "quality" setting. If the conversion fails, please try again without specifying quality.')

        try:
            page_bg.pil_image.save(
                target,
                format='PDF',
                quality=quality,
                resolution=resolution,
            )
        except ValueError:
            loguru.logger.warning(f'Failed to encode page {page_number}. Trying again without setting quality.')
        else:
            return

    page_bg.pil_image.save(
        target,
        format='PDF',
        resolution=resolution,
    )


================================================
FILE: dpsprep/logging.py
================================================
import os
import sys
from types import TracebackType

import loguru


# Reference to the stdout stream as it was when this module was imported.
# configure_loguru() uses it as the log sink.
cached_stdout = sys.stdout


def configure_loguru(*, verbose: bool) -> None:
    """Replace all loguru handlers with a single handler writing to the cached stdout.

    Messages are shown from the DEBUG level when ``verbose`` is set and from
    the INFO level otherwise.
    """
    threshold = 'DEBUG' if verbose else 'INFO'
    template = '<level>{level}</level> <green>{time:HH:mm:ss}</green> <level>{message}</level>'

    loguru.logger.remove()
    loguru.logger.add(cached_stdout, format=template, level=threshold)


def human_readable_size(size: int) -> str:
    """Format a size in bytes as a string in bytes, KiB or MiB.

    Sizes below 1 KiB are shown as a plain byte count; larger ones are shown
    with two decimal places, with MiB being the largest unit used.
    """
    kib = 1024
    mib = kib ** 2

    if size < kib:
        return f'{size} bytes'

    if size < mib:
        return f'{size / kib:.02f} KiB'

    return f'{size / mib:.02f} MiB'


# img2pdf abuses debug logging by using print
# This is a way to temporarily silence it
class SilencePrint:
    def __enter__(self) -> None:
        sys.stdout = open(os.devnull, 'w', encoding='utf-8')

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
     ) -> None:
        sys.stdout.close()
        sys.stdout = cached_stdout


================================================
FILE: dpsprep/ocrmypdf.py
================================================
# We use OCRmyPDF in a non-canonical way: only optimize the file without performing any OCR.
# The optimization procedure provides good results and preserves the text layer and outline.
# The code here is based on
#   https://github.com/ocrmypdf/OCRmyPDF/blob/fb006ef39f7f8842dec1976bebe4bcd5ca2e8df8/src/ocrmypdf/optimize.py#L724
# with some simplifications for OCRmyPDF 17

# ruff: noqa: PLC0415

import shutil
from typing import Any

import loguru

from .workdir import WorkingDirectory


def optimize_pdf(workdir: WorkingDirectory, optlevel: int, quality: int | None, pool_size: int) -> bool:
    """Run OCRmyPDF's optimizer (no OCR) on the combined PDF and write the optimized PDF.

    Args:
        workdir: Provides the paths of the combined PDF, the optimized PDF and OCRmyPDF's scratch directory.
        optlevel: Passed to OCRmyPDF as the `optimize` option.
        quality: Passed as both JPEG and PNG quality; `None` is passed on as 0.
        pool_size: Passed to OCRmyPDF as the number of jobs.

    Returns:
        False if OCRmyPDF cannot be imported (a warning is logged), True once `optimize` has returned.
    """
    try:
        # ObjectStreamMode is actually from pikepdf, but I did not want to include that as a dependency
        from ocrmypdf._options import OcrOptions
        from ocrmypdf.optimize import ObjectStreamMode, PdfContext, optimize
        from ocrmypdf.pdfinfo import PdfInfo
    except ImportError:
        loguru.logger.warning('Cannot detect OCRmyPDF. No optimizations will be performed on the output file.')
        return False

    # When 0, the qualities should be adjusted inside OCRmyPDF's "optimize" function
    image_quality = quality or 0
    source_pdf = workdir.combined_pdf_path

    # NOTE(review): `input_file` names the text-less PDF while the file actually optimized below
    # is the combined one; confirm whether OCRmyPDF reads `input_file` on this code path.
    ocr_options = OcrOptions(
        input_file=workdir.combined_pdf_without_text_path,
        output_file=source_pdf,
        # Jobs correspond to CPU cores rather than threads, but it seems better to use the available pool size parameter
        jobs=pool_size,
        optimize=optlevel,
        jpg_quality=image_quality,
        png_quality=image_quality,
    )

    pdf_context = PdfContext(ocr_options, workdir.ocrmypdf_tmp_path, source_pdf, PdfInfo(source_pdf), None)

    save_settings = {
        'compress_streams': True,
        'preserve_pdfa': True,
        'object_stream_mode': ObjectStreamMode.generate,
    }

    optimize(source_pdf, workdir.optimized_pdf_path, pdf_context, save_settings)

    return True


def perform_ocr(workdir: WorkingDirectory, options: dict[str, Any]) -> bool:
    """Run OCRmyPDF on the text-less combined PDF, producing the combined PDF.

    Args:
        workdir: Provides the input (text-less) and output (combined) PDF paths.
        options: Extra keyword arguments forwarded verbatim to `ocrmypdf.api.ocr`.

    Returns:
        True if OCR succeeded. False if OCRmyPDF is not importable, or if it failed; in the
        latter case the text-less PDF is copied to the output path so that it still exists.
    """
    try:
        from ocrmypdf import api
    except ImportError:
        loguru.logger.warning('Cannot detect OCRmyPDF. No OCR will be performed on the output file.')
        return False

    source_pdf = workdir.combined_pdf_without_text_path
    target_pdf = workdir.combined_pdf_path

    try:
        api.ocr(input_file_or_options=source_pdf, output_file=target_pdf, **options)
    except Exception as err:
        # Best effort: fall back to the PDF without a text layer
        loguru.logger.warning(f'OCRmyPDF failed: {err}')
        shutil.copy(source_pdf, target_pdf)
        return False

    return True


================================================
FILE: dpsprep/outline.py
================================================
import djvu.sexpr
import loguru
from pdfrw import IndirectPdfDict, PdfDict, PdfName

from .sexpr import SExpressionVisitor


# Based on
# https://github.com/pmaupin/pdfrw/issues/52#issuecomment-271190546
class OutlineTransformVisitor(SExpressionVisitor[PdfDict]):
    """Convert a DjVu `(bookmarks ...)` S-expression into a tree of pdfrw outline dictionaries.

    The `bookmarks` list becomes the outline root. Every plain list inside it is treated
    as one bookmark of the form `(title page-reference child...)` and is linked into its
    parent through the `First`/`Last`/`Prev`/`Next`/`Count` entries.
    """

    def visit_plain_list(self, node: djvu.sexpr.ListExpression, parent: IndirectPdfDict) -> PdfDict | None:
        """Append the bookmark described by `node` to `parent` and recurse into its children.

        Args:
            node: A list whose first two elements are the title and the page reference
                (e.g. `#100`); any remaining elements are nested bookmarks.
            parent: The outline root or the bookmark that owns this entry.

        Returns:
            The created bookmark, or None (after logging a warning) when the entry is
            malformed or its page reference cannot be turned into a page number.
        """
        try:
            title, page, *rest = node
        except ValueError:
            # Fewer than two elements, e.g. an empty list
            loguru.logger.warning('Ignoring a bookmark entry that lacks a title or a page reference.')
            return None

        # The value getter performs UTF-8 decoding and may fail, so it is read only once here.
        # Only the repr is logged on failure since reading the value again would raise again.
        try:
            page_ref = page.value
        except UnicodeDecodeError:
            loguru.logger.warning(f'Could not decode page reference {page!r}.')
            return None

        # I have experimentally determined that we need to translate page indices. -- Ianis, 2023-05-03
        try:
            page_number = int(page_ref[1:]) - 1
        except ValueError:
            # As far as I understand, python-djvulibre doesn't support Djvu's page titles. -- Ianis, 2023-12-09
            loguru.logger.warning(f'Could not determine page number from the page title {page_ref}.')
            return None

        try:
            title_text = title.value
        except UnicodeDecodeError:
            loguru.logger.warning(f'Could not decode page title {title!r}; leaving it in escaped form.')
            title_text = str(title)

        bookmark = IndirectPdfDict(
            Parent = parent,
            Title = title_text,
            A = PdfDict(
                D = [page_number, PdfName.Fit],
                S = PdfName.GoTo,
            ),
        )

        if parent.Count is None:
            # First child of this parent
            parent.Count = 0
            parent.First = bookmark
        else:
            # Chain after the previous last sibling
            bookmark.Prev = parent.Last
            bookmark.Prev.Next = bookmark

        parent.Count += 1
        parent.Last = bookmark

        for child in rest:
            self.visit(child, parent=bookmark)

        return bookmark

    def visit_list_bookmarks(self, node: djvu.sexpr.ListExpression) -> PdfDict:
        """Build the outline root from a `(bookmarks ...)` list and return it."""
        _, *rest = node

        outline = IndirectPdfDict()

        for child in rest:
            self.visit(child, parent=outline)

        return outline


================================================
FILE: dpsprep/pdf.py
================================================
import pathlib

import pdfrw

from .workdir import WorkingDirectory


def is_valid_pdf(path: pathlib.Path) -> bool:
    """Report whether pdfrw can parse the file at `path`.

    Only `PdfParseError` is interpreted as "invalid"; any other exception propagates.
    """
    try:
        pdfrw.PdfReader(path)
    except pdfrw.errors.PdfParseError:
        return False

    return True


def combine_pdfs_on_fs_with_text(workdir: WorkingDirectory, outline: pdfrw.IndirectPdfDict) -> None:
    """Merge each per-page image PDF onto the matching text-layer page and write the combined PDF.

    Args:
        workdir: Provides the text-layer PDF, the per-page image PDFs and the output path.
        outline: Stored as the `Outlines` entry of the output document's root.
    """
    output = pdfrw.PdfWriter()
    text_layer = pdfrw.PdfReader(workdir.text_layer_pdf_path)

    for page_index, text_page in enumerate(text_layer.pages):
        # We take the one-page text PDF and add the image layer on top
        # Even if the font was not invisible, it would be hidden visually (but not during search or text highlight)
        image_page = pdfrw.PdfReader(workdir.get_page_pdf_path(page_index)).pages[0]
        pdfrw.PageMerge(text_page).add(image_page).render()
        output.addpage(text_page)

    output.trailer.Root.Outlines = outline
    output.write(workdir.combined_pdf_path)


def combine_pdfs_on_fs_without_text(workdir: WorkingDirectory, outline: pdfrw.IndirectPdfDict, max_page: int) -> None:
    """Concatenate the first `max_page` per-page image PDFs into the text-less combined PDF.

    Args:
        workdir: Provides the per-page image PDFs and the output path.
        outline: Stored as the `Outlines` entry of the output document's root.
        max_page: Number of pages to include, starting from page index 0.
    """
    output = pdfrw.PdfWriter()

    for page_index in range(max_page):
        page_pdf = pdfrw.PdfReader(workdir.get_page_pdf_path(page_index))
        output.addpage(page_pdf.pages[0])

    output.trailer.Root.Outlines = outline
    output.write(workdir.combined_pdf_without_text_path)


================================================
FILE: dpsprep/py.typed
================================================


================================================
FILE: dpsprep/sexpr.py
================================================
from typing import Generic, TypeVar

import djvu.sexpr
import loguru


T = TypeVar('T')
R = TypeVar('R')


class SExpressionVisitor(Generic[R]):
    """Dispatcher over DjVu S-expressions.

    Subclasses opt in to node kinds by defining `visit_int`, `visit_string`,
    `visit_plain_list` and `visit_list_<symbol>` methods. A node without a matching
    handler is reported through a warning and produces None.
    """

    def visit_list(self, node: djvu.sexpr.ListExpression, **kwargs: T) -> R | None:
        """Dispatch a list: by its leading symbol if it has one, otherwise as a plain list."""
        head = node[0] if len(node) > 0 else None

        if isinstance(head, djvu.sexpr.SymbolExpression):
            handler = getattr(self, f'visit_list_{head}', None)
            if handler is not None:
                return handler(node, **kwargs)
            loguru.logger.warning(f"Don't know how to visit ListExpression of type {str(head)!r}.")
            return None

        if not hasattr(self, 'visit_plain_list'):
            loguru.logger.warning("Don't know how to visit a plain ListExpression.")
            return None

        return self.visit_plain_list(node, **kwargs)

    def visit_other(self, node: djvu.sexpr.Expression, **kwargs: T) -> R | None:  # noqa: ARG002
        """Fallback for expression types that have no dedicated dispatch."""
        loguru.logger.warning(f"Don't know how to visit S-expression type {type(node)!r}.")
        return None

    def visit(self, node: djvu.sexpr.Expression, **kwargs: T) -> R | None:
        """Entry point: route `node` to the handler for its expression type."""
        # Leaf types that are handled by an optional, fixed-name method
        leaf_handlers = (
            (djvu.sexpr.IntExpression, 'visit_int', 'IntExpression'),
            (djvu.sexpr.StringExpression, 'visit_string', 'StringExpression'),
        )

        for expression_type, method_name, label in leaf_handlers:
            if not isinstance(node, expression_type):
                continue

            if hasattr(self, method_name):
                return getattr(self, method_name)(node, **kwargs)

            loguru.logger.warning(f"Don't know how to visit {label}.")
            return None

        if isinstance(node, djvu.sexpr.ListExpression):
            return self.visit_list(node, **kwargs)

        return self.visit_other(node, **kwargs)


================================================
FILE: dpsprep/test_images.py
================================================
import djvu.decode
from PIL import Image, ImageChops, ImageStat

from .images import process_djvu_page


# A simple score function for Pillow images.
# We previously used the pytest-image-diff module, which used the diffimg module.
# It turned out that diffimg uses a similar approach, so we dropped the dependency in favor of a few-liner.
def calculate_image_diff_score(a: Image.Image, b: Image.Image) -> float:
    """Score how different two images are: the largest per-band RMS of their difference, divided by 256.

    Both images must have the same size and the same mode.
    """
    assert a.size == b.size, 'We only support diffing images with identical sizes'
    assert a.mode == b.mode, 'We only support diffing images with the same mode'

    band_rms = ImageStat.Stat(ImageChops.difference(a, b)).rms
    return max(band_rms) / 256  # The ImageStat module uses 256 bins


def test_process_djvu_page_bitonal() -> None:
    """Process the first page of the words fixture and compare it with the reference PNG.

    Checks that the reported resolution equals the DPI of the decoded page and that
    the image differs from `fixtures/lipsum_01.png` by a score below 0.05.
    """
    document = djvu.decode.Context().new_document(
        djvu.decode.FileURI('fixtures/lipsum_words.djvu'),
    )
    document.decoding_job.wait()

    fixture = Image.open('fixtures/lipsum_01.png')
    result = process_djvu_page(document.pages[0], mode='infer', i=0)

    # Decode the page independently to obtain the DPI to compare against
    page_decode_job = document.pages[0].decode()
    page_decode_job.wait()
    assert result.resolution == page_decode_job.dpi

    assert calculate_image_diff_score(fixture, result.pil_image) < 0.05


================================================
FILE: dpsprep/test_outline.py
================================================
from djvu import sexpr
from pdfrw import IndirectPdfDict

from .outline import OutlineTransformVisitor


def test_basic_outline() -> None:
    """A single bookmark becomes the only child of the outline, with its page index shifted by one."""
    src = sexpr.ListExpression([
        sexpr.SymbolExpression(sexpr.Symbol('bookmarks')),
        sexpr.ListExpression([
            sexpr.StringExpression(b'Chapter 2'),
            sexpr.StringExpression(b'#100'),
        ]),
    ])

    visitor = OutlineTransformVisitor()
    bookmarks = visitor.visit(src)
    assert bookmarks is not None
    assert bookmarks.Count == 1
    assert bookmarks.First.Title == 'Chapter 2'
    assert bookmarks.First.A.D[0] == 99  # The page number


def test_nested_outline() -> None:
    """A bookmark nested inside another one is attached to that bookmark rather than to the root."""
    src = sexpr.ListExpression([
        sexpr.SymbolExpression(sexpr.Symbol('bookmarks')),
        sexpr.ListExpression([
            sexpr.StringExpression(b'Chapter 2'),
            sexpr.StringExpression(b'#100'),
            sexpr.ListExpression([
                sexpr.StringExpression(b'Chapter 2.1'),
                sexpr.StringExpression(b'#200'),
            ]),
        ]),
    ])

    visitor = OutlineTransformVisitor()
    bookmarks = visitor.visit(src)
    assert bookmarks is not None
    # Each level counts only its direct children
    assert bookmarks.Count == 1
    assert bookmarks.First.Count == 1
    assert bookmarks.First.A.D[0] == 99  # The page number of chapter 2
    assert bookmarks.First.First.A.D[0] == 199  # The page number of chapter 2.1


# Sometimes the page numbers are instead page titles, which our libdjvu bindings do not support
# We ignore them since there is not much we can do in this case
# See https://github.com/kcroker/dpsprep/issues/23
def test_outline_with_page_titles() -> None:
    """Bookmarks whose page reference is not numeric are all dropped, leaving an empty outline."""
    src = sexpr.ListExpression([
        sexpr.SymbolExpression(sexpr.Symbol('bookmarks')),
        sexpr.ListExpression([
            sexpr.StringExpression(b'Preface'),
            sexpr.StringExpression(b'#f007.djvu'),
        ]),
        sexpr.ListExpression([
            sexpr.StringExpression(b'Contents'),
            sexpr.StringExpression(b'#f011.djvu'),
        ]),
        sexpr.ListExpression([
            sexpr.StringExpression(b'0 Prologue'),
            sexpr.StringExpression(b'#p001.djvu'),
        ]),
    ])

    visitor = OutlineTransformVisitor()
    bookmarks = visitor.visit(src)
    empty_pdf_dict = IndirectPdfDict()
    assert bookmarks == empty_pdf_dict


def test_outline_with_invalid_unicode() -> None:
    """A title that is not valid UTF-8 is kept, in its escaped S-expression form."""
    src = sexpr.ListExpression([
        sexpr.SymbolExpression(sexpr.Symbol('bookmarks')),
        sexpr.ListExpression([
            # Octal escape \247 followed by the character "0"
            sexpr.StringExpression(b'\2470'),
            sexpr.StringExpression(b'#1'),
        ]),
    ])

    visitor = OutlineTransformVisitor()
    bookmarks = visitor.visit(src)
    assert bookmarks is not None
    assert bookmarks.Count == 1
    assert bookmarks.First.Title == '"\\2470"'


================================================
FILE: dpsprep/test_text.py
================================================
import pathlib
import string

import djvu.decode
import pytest

from .text import TextExtractVisitor


def remove_whitespace(src: str) -> str:
    """Return `src` without any of the characters listed in `string.whitespace`."""
    return ''.join(char for char in src if char not in string.whitespace)


def test_extract_djvu_page_text_words() -> None:
    """Text extracted from the words fixture matches the reference text once whitespace is ignored."""
    document = djvu.decode.Context().new_document(
        djvu.decode.FileURI('fixtures/lipsum_words.djvu'),
    )
    document.decoding_job.wait()

    djvu_page = document.pages[0]
    djvu_page.get_info()
    djvu_text = TextExtractVisitor().visit(djvu_page.text.sexpr)

    assert djvu_text is not None

    source_pdf_text = pathlib.Path('fixtures/lipsum_01.txt').read_text(encoding='utf-8')

    assert remove_whitespace(djvu_text) == remove_whitespace(source_pdf_text)


def test_extract_djvu_page_text_lines() -> None:
    """Text extracted from the lines fixture matches the reference text once whitespace is ignored."""
    document = djvu.decode.Context().new_document(
        djvu.decode.FileURI('fixtures/lipsum_lines.djvu'),
    )
    document.decoding_job.wait()

    djvu_page = document.pages[0]
    djvu_page.get_info()
    djvu_text = TextExtractVisitor().visit(djvu_page.text.sexpr)

    assert djvu_text is not None

    source_pdf_text = pathlib.Path('fixtures/lipsum_01.txt').read_text(encoding='utf-8')

    assert remove_whitespace(djvu_text) == remove_whitespace(source_pdf_text)


def test_invalid_utf8() -> None:
    """A word that cannot be decoded as UTF-8 is extracted as an empty string instead of raising."""
    document = djvu.decode.Context().new_document(
        djvu.decode.FileURI('fixtures/lipsum_words_invalid.djvu'),
    )
    document.decoding_job.wait()

    djvu_page = document.pages[0]
    djvu_page.get_info()
    first_word_sexpr = djvu_page.text.sexpr[5][5]

    # djvulibre cannot decode the first word
    with pytest.raises(UnicodeDecodeError):
        first_word_sexpr.value  # noqa: B018

    first_word = TextExtractVisitor().visit(first_word_sexpr)
    assert first_word == ''


================================================
FILE: dpsprep/text.py
================================================
# ruff: noqa: RUF059

import unicodedata
from collections.abc import Iterable, Sequence
from pathlib import Path

import djvu.sexpr
import loguru
from fpdf import FPDF

from .sexpr import SExpressionVisitor


BASE_FONT_SIZE = 10
TAB_SIZE = 4


class TextExtractVisitor(SExpressionVisitor[str]):
    """Collect the plain text of a DjVu text-layer S-expression.

    Words and characters yield their string content, lines join their children with
    spaces, and paragraphs, columns, regions and pages join theirs with newlines.
    """

    def iter_chars(self, string: str) -> Iterable[str]:
        """Yield the characters of `string`, normalizing separators and dropping unsupported ones."""
        for char in string:
            category = unicodedata.category(char)

            # Line Separator (Zl) | Space Separator (Zs)
            if category in {'Zl', 'Zs'}:
                yield ' '
                continue

            # Paragraph Separator (Zp)
            if category == 'Zp':
                yield '\n'
                continue

            # Control (Cc): tabs and newlines are replaced, everything else is dropped
            if category == 'Cc':
                if char == '\t':
                    yield ' ' * TAB_SIZE
                elif char == '\n':
                    yield ' '
                continue

            # These break FPDF.
            # A full list of categories can be found in https://www.compart.com/en/unicode/category
            # Format (Cf) | Private Use (Co) | Surrogate 'Cs':
            if category in {'Cf', 'Co', 'Cs'}:
                continue

            yield char

    def visit_string(self, node: djvu.sexpr.StringExpression) -> str:
        """Return the cleaned-up content of a string node, or '' if it cannot be decoded."""
        try:
            # This getter is not static - it does UTF-8 conversion and fails for some DjVu files
            decoded = node.value
        except ValueError as err:
            loguru.logger.warning(f'Could not decode {node!r}: {err}')
            return ''

        return ''.join(self.iter_chars(decoded))

    def visit_plain_list(self, node: djvu.sexpr.ListExpression) -> str:  # noqa: ARG002
        """Lists without a leading symbol contribute no text."""
        return ''

    def visit_list_word(self, node: djvu.sexpr.ListExpression) -> str | None:
        """Return the text of the content element that follows the four coordinates."""
        _symbol, _x1, _y1, _x2, _y2, content, *_rest = node
        return self.visit(content)

    visit_list_char = visit_list_word

    def _join_children(self, node: djvu.sexpr.ListExpression, delimiter: str) -> str:
        """Join the text of everything after the symbol and the four coordinates with `delimiter`."""
        _symbol, _x1, _y1, _x2, _y2, *children = node
        return delimiter.join(self.visit(child) or '' for child in children)

    def visit_list_line(self, node: djvu.sexpr.ListExpression) -> str:
        """Join the children of a line with single spaces."""
        return self._join_children(node, ' ')

    def visit_list_para(self, node: djvu.sexpr.ListExpression) -> str:
        """Join the children of a paragraph with newlines."""
        return self._join_children(node, '\n')

    visit_list_column = visit_list_para
    visit_list_region = visit_list_para
    visit_list_page = visit_list_para


class TextDrawVisitor(SExpressionVisitor):
    """Draw the text of a DjVu text-layer S-expression onto the current page of an FPDF document.

    Coordinates taken from the S-expression are divided by `dpi` before being handed
    to FPDF, and the font size is scaled so that each string spans its box horizontally.
    """

    pdf: FPDF
    dpi: int
    extractor: TextExtractVisitor

    def __init__(self, pdf: FPDF, dpi: int) -> None:
        self.pdf = pdf
        self.dpi = dpi
        self.extractor = TextExtractVisitor()

    def draw_text(self, x1: int, x2: int, y1: int, y2: int, text: str) -> None:  # noqa: ARG002
        """Place `text` at (x1, y1), sized to be as wide as the distance from x1 to x2."""
        _page_width, page_height = self.pdf.pages[self.pdf.page].dimensions()

        if page_height is None:
            loguru.logger.warning(f'Cannot draw {text!r} because page height is not set.')
            return

        # Measure the string at the base size first, then rescale to the desired width
        self.pdf.set_font('Invisible', size=BASE_FONT_SIZE)
        desired_width = (x2 - x1) / self.dpi
        actual_width = self.pdf.get_string_width(text)

        if actual_width == 0:
            return

        fitted_size = int(BASE_FONT_SIZE * desired_width / actual_width)
        self.pdf.set_font('Invisible', size=fitted_size)

        try:
            # The vertical coordinate is flipped relative to the page height
            self.pdf.text(x=x1 / self.dpi, y=page_height / 72 - y1 / self.dpi, text=text)
        except TypeError as err:
            loguru.logger.warning(f'FPDF refuses to draw {text!r}: {err}')

    def iter_loose_string_content(self, expressions: list[djvu.sexpr.Expression]) -> Iterable[str]:
        """Yield the extracted text of every string expression found directly in `expressions`."""
        for expression in expressions:
            if isinstance(expression, djvu.sexpr.StringExpression):
                extracted = self.extractor.visit(expression)
                if extracted is not None:
                    yield extracted

    def get_loose_string_content(self, expressions: list[djvu.sexpr.Expression], delimiter: str) -> str:
        """Join the loose string content of `expressions` with `delimiter`."""
        return delimiter.join(self.iter_loose_string_content(expressions))

    def visit_list_word(self, node: djvu.sexpr.ListExpression) -> None:
        """Draw the text extracted from a word (or char) node inside its box."""
        _symbol, x1, y1, x2, y2, *_rest = node
        extracted = self.extractor.visit(node)

        if extracted is None:
            return

        self.draw_text(x1.value, x2.value, y1.value, y2.value, extracted)

    visit_list_char = visit_list_word

    def _draw_loose_text_then_descend(self, node: djvu.sexpr.ListExpression, delimiter: str) -> None:
        """Draw the strings sitting directly in `node`, then visit its non-string children."""
        _symbol, x1, y1, x2, y2, *children = node

        loose_text = self.get_loose_string_content(children, delimiter)

        if loose_text:
            self.draw_text(x1.value, x2.value, y1.value, y2.value, loose_text)

        for child in children:
            if isinstance(child, djvu.sexpr.StringExpression):
                continue
            self.visit(child)

    def visit_list_line(self, node: djvu.sexpr.ListExpression) -> None:
        """Handle a line: loose strings are joined with spaces."""
        self._draw_loose_text_then_descend(node, ' ')

    def visit_list_para(self, node: djvu.sexpr.ListExpression) -> None:
        """Handle a paragraph: loose strings are joined with newlines."""
        self._draw_loose_text_then_descend(node, '\n')

    def visit_list_column(self, node: djvu.sexpr.ListExpression) -> None:
        """Visit every child of a column, page or region; nothing is drawn for the node itself."""
        _symbol, _x1, _y1, _x2, _y2, *children = node

        for child in children:
            self.visit(child)

    visit_list_page = visit_list_column
    visit_list_region = visit_list_column


def djvu_pages_to_text_fpdf(pages: Sequence[djvu.decode.Page], dpi: int | None) -> FPDF:
    """Build an FPDF document holding one text-only page per DjVu page.

    Args:
        pages: The DjVu pages to process, in output order.
        dpi: DPI to use for every page; when falsy, each page's own decoded DPI is used.

    Returns:
        The FPDF document (measured in inches) with the text drawn in the "Invisible" font.
    """
    pdf = FPDF(unit='in')
    font_path = Path(__file__).parent / 'invisible1.ttf'
    pdf.add_font(family='Invisible', fname=font_path, style='')

    for page_number, page in enumerate(pages, start=1):
        decoded = page.decode(wait=True)
        effective_dpi = dpi or decoded.dpi
        page_size = (decoded.width / effective_dpi, decoded.height / effective_dpi)
        pdf.add_page(format=page_size)
        loguru.logger.debug(f'Processing text for page {page_number}.')
        TextDrawVisitor(pdf, effective_dpi).visit(page.text.sexpr)

    return pdf


================================================
FILE: dpsprep/workdir.py
================================================
import hashlib
import os
import pathlib
import shutil
import tempfile

import loguru


HASHING_BUFFER_SIZE = 64 * 1024


def get_file_hash(path: os.PathLike | str) -> str:
    """Return the 4-byte BLAKE2b digest of the file's contents as a hexadecimal string.

    The file is read in chunks of `HASHING_BUFFER_SIZE` bytes.
    """
    digest = hashlib.blake2b(digest_size=4)

    with open(path, 'rb') as stream:
        # An empty read marks the end of the file
        for chunk in iter(lambda: stream.read(HASHING_BUFFER_SIZE), b''):
            digest.update(chunk)

    return digest.hexdigest()


class WorkingDirectory:
    """Paths of the source, the destination and the intermediate files of one conversion.

    The intermediate files live in `<tmp root>/dpsprep/<hash of the source file>`, where
    the root is `/var/tmp` when it is usable and the system temporary directory otherwise.
    """

    src: pathlib.Path
    dest: pathlib.Path
    workdir: pathlib.Path

    def __init__(self, src: os.PathLike | str, dest: os.PathLike | str | None) -> None:
        """Resolve the source, destination and working directory paths.

        Args:
            src: The source file. It is read here in order to hash its contents.
            dest: The destination file. When None, the source's file name with a `.pdf`
                suffix is used, relative to the current directory.
        """
        self.src = pathlib.Path(src)

        if dest is None:
            self.dest = pathlib.Path(pathlib.Path(src).with_suffix('.pdf').name)
        else:
            self.dest = pathlib.Path(dest)

        # Working path
        # If possible, we avoid the ephemeral storage /tmp
        persistent_tmp = pathlib.Path('/var/tmp')  # noqa: S108

        # os.access answers whether this process may create and traverse entries in the directory.
        # Masking st_mode with W_OK | X_OK would only look at the permission bits for "others"
        # and would pass as soon as one of the two bits is set.
        if persistent_tmp.is_dir() and os.access(persistent_tmp, os.W_OK | os.X_OK):
            loguru.logger.debug('Using non-ephemeral storage "/var/tmp".')
            root = persistent_tmp
        else:
            loguru.logger.debug(f'Using default system storage {tempfile.gettempdir()!r}.')
            root = pathlib.Path(tempfile.gettempdir())

        self.workdir = root / 'dpsprep' / get_file_hash(self.src)

    def create_if_necessary(self) -> None:
        """Create the working directory and the OCRmyPDF scratch directory if they are missing."""
        if not self.workdir.exists():
            loguru.logger.debug(f'Creating {str(self.workdir)!r}.')
            self.workdir.mkdir(parents=True, exist_ok=True)

        if not self.ocrmypdf_tmp_path.exists():
            loguru.logger.debug(f'Creating {str(self.ocrmypdf_tmp_path)!r}.')
            self.ocrmypdf_tmp_path.mkdir(parents=True, exist_ok=True)

    def get_page_pdf_path(self, i: int) -> pathlib.Path:
        """Return the path of the image PDF for the page with zero-based index `i`."""
        # File names are one-based
        return self.workdir / f'page_bg_{i + 1}.pdf'

    @property
    def text_layer_pdf_path(self) -> pathlib.Path:
        """Path of the PDF holding only the text layer."""
        return self.workdir / 'text_layer.pdf'

    @property
    def ocrmypdf_tmp_path(self) -> pathlib.Path:
        """Path of the scratch directory handed to OCRmyPDF."""
        return self.workdir / 'ocrmypdf'

    @property
    def combined_pdf_without_text_path(self) -> pathlib.Path:
        """Path of the combined PDF that has no text layer."""
        return self.workdir / 'combined_without_text.pdf'

    @property
    def combined_pdf_path(self) -> pathlib.Path:
        """Path of the combined PDF."""
        return self.workdir / 'combined.pdf'

    @property
    def optimized_pdf_path(self) -> pathlib.Path:
        """Path of the optimized PDF."""
        return self.workdir / 'optimized.pdf'

    def destroy(self) -> None:
        """Delete the working directory together with everything in it."""
        shutil.rmtree(self.workdir)


================================================
FILE: dpsprep.1
================================================
.\" generated with Ronn-NG/v0.10.1
.\" http://github.com/apjanke/ronn-ng/tree/0.10.1
.TH "DPSPREP" "1" "March 2026" ""
.SH "NAME"
\fBdpsprep\fR \- a DjVu to PDF converter
.SH "SYNOPSIS"
\fBdpsprep\fR \fIoptions\fR src [dest]
.SH "DESCRIPTION"
This tool, initially made specifically for use with Sony's Digital Paper System (DPS), is now a general\-purpose DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines (e\.g\. TOC) and text layers (e\.g\. OCR)\.
.SH "OPTIONS"
.IP "\(bu" 4
\fB\-q\fR, \fB\-\-quality\fR: Quality of images in output\. Used only for JPEG compression, i\.e\. RGB and Grayscale images\. Passed directly to Pillow and to OCRmyPDF's optimizer\.
.IP "\(bu" 4
\fB\-v\fR, \fB\-\-verbose\fR: Display debug messages\.
.IP "\(bu" 4
\fB\-o\fR, \fB\-\-overwrite\fR: Overwrite destination file\.
.IP "\(bu" 4
\fB\-w\fR, \fB\-\-preserve\-working\fR: Preserve the working directory after script termination\.
.IP "\(bu" 4
\fB\-d\fR, \fB\-\-delete\-working\fR: Delete any existing files in the working directory prior to writing to it\.
.IP "\(bu" 4
\fB\-t\fR, \fB\-\-no\-text\fR: Disable the generation of text layers\. Implied by \-\-ocr\.
.IP "\(bu" 4
\fB\-p\fR, \fB\-\-pool\-size\fR \fIint\fR: Size of MultiProcessing pool for handling page\-by\-page operations\.
.IP "\(bu" 4
\fB\-m\fR, \fB\-\-mode\fR \fIinfer|bitonal|grayscale|rgb\fR: Override the image modes encoded in the DjVu file for individual pages\. It sometimes makes sense to force bitonal images since they compress well\.
.IP "\(bu" 4
\fB\-\-dpi\fR \fIint\fR: Override DPI values encoded in the DjVu file for individual pages\.
.IP "\(bu" 4
\fB\-\-ocr\fR \fIJSON\fR: Perform OCR via OCRmyPDF rather than trying to convert the text layer\. If this parameter has a value, it should be a JSON dictionary of options to be passed to OCRmyPDF\.
.IP "\(bu" 4
\fB\-O1\fR: Use the lossless PDF image optimization from OCRmyPDF (without performing OCR)\.
.IP "\(bu" 4
\fB\-O2\fR: Use the PDF image optimization from OCRmyPDF\.
.IP "\(bu" 4
\fB\-O3\fR: Use the aggressive lossy PDF image optimization from OCRmyPDF\.
.IP "\(bu" 4
\fB\-\-help\fR: Show help message and exit\.
.IP "\(bu" 4
\fB\-\-version\fR: Show the version and exit\.
.IP "" 0
.SH "EXAMPLES"
Produce \fBfile\.pdf\fR in the current directory:
.IP "" 4
.nf
dpsprep /wherever/file\.djvu
.fi
.IP "" 0
.P
Produce \fBoutput\.pdf\fR with reduced image quality and aggressive PDF image optimizations:
.IP "" 4
.nf
dpsprep \-\-quality=30 \-O3 input\.djvu output\.pdf
.fi
.IP "" 0
.P
Produce an output file using a large pool of workers:
.IP "" 4
.nf
dpsprep \-\-pool\-size=16 input\.djvu
.fi
.IP "" 0
.P
Force bitonal images:
.IP "" 4
.nf
dpsprep \-\-mode bitonal input\.djvu
.fi
.IP "" 0
.P
Produce an output file by disregarding the text layer and running OCRmyPDF instead:
.IP "" 4
.nf
dpsprep \-\-ocr '{"language": ["rus", "eng"]}' input\.djvu
.fi
.IP "" 0
.P
Or simply disregard the text layer without OCR:
.IP "" 4
.nf
dpsprep \-\-no\-text input\.djvu
.fi
.IP "" 0
.SH "NOTE REGARDING COMPRESSION"
We perform compression in two stages:
.IP "\(bu" 4
The first one is the default compression provided by Pillow\. For bitonal images, the PDF generation code says that, if \fBlibtiff\fR is available, \fBgroup4\fR compression is used\.
.IP "\(bu" 4
If OCRmyPDF is installed, its PDF optimization can be used via the flags \fB\-O1\fR to \fB\-O3\fR (this involves no OCR)\. This allows us to use advanced techniques, including JBIG2 compression via \fBjbig2enc\fR\.
.IP "" 0
.P
If manually running OCRmyPDF, note that the optimization command suggested in the documentation (setting \fB\-\-tesseract\-timeout\fR to \fB0\fR) may ruin existing text layers\. To perform only PDF optimization you can use the following undocumented tool instead:
.IP "" 4
.nf
python \-m ocrmypdf\.optimize <input_file> <level> <output_file>
.fi
.IP "" 0


================================================
FILE: dpsprep.1.ronn
================================================
# dpsprep(1) -- a DjVu to PDF converter

## SYNOPSIS

`dpsprep` [options] src [dest]

## DESCRIPTION

This tool, initially made specifically for use with Sony's Digital Paper System (DPS), is now a general-purpose DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines (e.g. TOC) and text layers (e.g. OCR).

## OPTIONS

* `-q`, `--quality`:                            Quality of images in output. Used only for JPEG compression, i.e. RGB and Grayscale images. Passed directly to Pillow and to OCRmyPDF's optimizer.
* `-v`, `--verbose`:                            Display debug messages.
* `-o`, `--overwrite`:                          Overwrite destination file.
* `-w`, `--preserve-working`:                   Preserve the working directory after script termination.
* `-d`, `--delete-working`:                     Delete any existing files in the working directory prior to writing to it.
* `-t`, `--no-text`:                            Disable the generation of text layers. Implied by --ocr.
* `-p`, `--pool-size` <int>:                    Size of MultiProcessing pool for handling page-by-page operations.
* `-m`, `--mode` <infer|bitonal|grayscale|rgb>: Override the image modes encoded in the DjVu file for individual pages. It sometimes makes sense to force bitonal images since they compress well.
* `--dpi` <int>:                                Override DPI values encoded in the DjVu file for individual pages.
* `--ocr` <JSON>:                               Perform OCR via OCRmyPDF rather than trying to convert the text layer. If this parameter has a value, it should be a JSON dictionary of options to be passed to OCRmyPDF.
* `-O1`:                                        Use the lossless PDF image optimization from OCRmyPDF (without performing OCR).
* `-O2`:                                        Use the PDF image optimization from OCRmyPDF.
* `-O3`:                                        Use the aggressive lossy PDF image optimization from OCRmyPDF.
* `--help`:                                     Show help message and exit.
* `--version`:                                  Show the version and exit.

## EXAMPLES

Produce `file.pdf` in the current directory:

    dpsprep /wherever/file.djvu

Produce `output.pdf` with reduced image quality and aggressive PDF image optimizations:

    dpsprep --quality=30 -O3 input.djvu output.pdf

Produce an output file using a large pool of workers:

    dpsprep --pool-size=16 input.djvu

Force bitonal images:

    dpsprep --mode bitonal input.djvu

Produce an output file by disregarding the text layer and running OCRmyPDF instead:

    dpsprep --ocr '{"language": ["rus", "eng"]}' input.djvu

Or simply disregard the text layer without OCR:

    dpsprep --no-text input.djvu

## NOTE REGARDING COMPRESSION

We perform compression in two stages:

* The first one is the default compression provided by Pillow. For bitonal images, the PDF generation code says that, if `libtiff` is available, `group4` compression is used.

* If OCRmyPDF is installed, its PDF optimization can be used via the flags `-O1` to `-O3` (this involves no OCR). This allows us to use advanced techniques, including JBIG2 compression via `jbig2enc`.

If manually running OCRmyPDF, note that the optimization command suggested in the documentation (setting `--tesseract-timeout` to `0`) may ruin existing text layers. To perform only PDF optimization you can use the following undocumented tool instead:

    python -m ocrmypdf.optimize <input_file> <level> <output_file>


================================================
FILE: fixtures/.gitattributes
================================================
lipsum* linguist-generated


================================================
FILE: fixtures/Makefile
================================================
# Regenerates the test fixtures from lipsum.tex.

.PHONY: all clean

all: lipsum.pdf lipsum_01.txt lipsum_01.png lipsum_lines.djvu lipsum_words.djvu lipsum_words_invalid.djvu

# The short option is used because the long form `--force` is not available in every rm implementation
clean:
	rm -f *.djvu *.pdf *.png *.txt

# Compile the LaTeX source and remove the auxiliary files
%.pdf: %.tex
	pdflatex $*.tex
	rm $*.aux $*.log

# Reference text, limited to page 1
%_01.txt: %.pdf
	pdftotext -l 1 -layout $*.pdf $*_01.txt

# Reference monochrome image of page 1 at 600 DPI, then compacted with oxipng
%_01.png: %.pdf
	gs -sDEVICE=pngmono -r600 -dLastPage=1 -o $*_01.png $*.pdf
	oxipng $*_01.png

# DjVu file produced with the --words option
%_words.djvu: %.pdf
	djvudigital --dpi=600 --words $*.pdf
	mv $*.djvu $*_words.djvu

# DjVu file produced with the --lines option
%_lines.djvu: %.pdf
	djvudigital --dpi=600 --lines $*.pdf
	mv $*.djvu $*_lines.djvu

# Copy of a DjVu file in which every "Lorem" is replaced by the escape \270,
# which yields text that is not valid UTF-8
%_invalid.djvu: %.djvu
	cp $*.djvu $*_invalid.djvu
	djvused $*_invalid.djvu -e 'output-all' | \
		sed 's/Lorem/\\270/g' | \
		djvused $*_invalid.djvu -f /dev/stdin -s


================================================
FILE: fixtures/lipsum.tex
================================================
\documentclass{article}

\usepackage{lipsum}

\title{Lorem Ipsum}
\author{Cicero}

\begin{document}
  \lipsum
\end{document}


================================================
FILE: fixtures/lipsum_01.txt
================================================
    Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Ut purus elit,
vestibulum ut, placerat ac, adipiscing vitae, felis. Curabitur dictum gravida
mauris. Nam arcu libero, nonummy eget, consectetuer id, vulputate a, magna.
Donec vehicula augue eu neque. Pellentesque habitant morbi tristique senectus
et netus et malesuada fames ac turpis egestas. Mauris ut leo. Cras viverra
metus rhoncus sem. Nulla et lectus vestibulum urna fringilla ultrices. Phasellus
eu tellus sit amet tortor gravida placerat. Integer sapien est, iaculis in, pretium
quis, viverra ac, nunc. Praesent eget sem vel leo ultrices bibendum. Aenean
faucibus. Morbi dolor nulla, malesuada eu, pulvinar at, mollis ac, nulla. Cur-
abitur auctor semper nulla. Donec varius orci eget risus. Duis nibh mi, congue
eu, accumsan eleifend, sagittis quis, diam. Duis eget orci sit amet orci dignissim
rutrum.
    Nam dui ligula, fringilla a, euismod sodales, sollicitudin vel, wisi. Morbi
auctor lorem non justo. Nam lacus libero, pretium at, lobortis vitae, ultricies et,
tellus. Donec aliquet, tortor sed accumsan bibendum, erat ligula aliquet magna,
vitae ornare odio metus a mi. Morbi ac orci et nisl hendrerit mollis. Suspendisse
ut massa. Cras nec ante. Pellentesque a nulla. Cum sociis natoque penatibus et
magnis dis parturient montes, nascetur ridiculus mus. Aliquam tincidunt urna.
Nulla ullamcorper vestibulum turpis. Pellentesque cursus luctus mauris.
    Nulla malesuada porttitor diam. Donec felis erat, congue non, volutpat at,
tincidunt tristique, libero. Vivamus viverra fermentum felis. Donec nonummy
pellentesque ante. Phasellus adipiscing semper elit. Proin fermentum massa
ac quam. Sed diam turpis, molestie vitae, placerat a, molestie nec, leo. Mae-
cenas lacinia. Nam ipsum ligula, eleifend at, accumsan nec, suscipit a, ipsum.
Morbi blandit ligula feugiat magna. Nunc eleifend consequat lorem. Sed lacinia
nulla vitae enim. Pellentesque tincidunt purus vel magna. Integer non enim.
Praesent euismod nunc eu purus. Donec bibendum quam in tellus. Nullam cur-
sus pulvinar lectus. Donec et mi. Nam vulputate metus eu enim. Vestibulum
pellentesque felis eu massa.
    Quisque ullamcorper placerat ipsum. Cras nibh. Morbi vel justo vitae lacus
tincidunt ultrices. Lorem ipsum dolor sit amet, consectetuer adipiscing elit. In
hac habitasse platea dictumst. Integer tempus convallis augue. Etiam facilisis.
Nunc elementum fermentum wisi. Aenean placerat. Ut imperdiet, enim sed
gravida sollicitudin, felis odio placerat quam, ac pulvinar elit purus eget enim.
Nunc vitae tortor. Proin tempus nibh sit amet nisl. Vivamus quis tortor vitae
risus porta vehicula.
    Fusce mauris. Vestibulum luctus nibh at lectus. Sed bibendum, nulla a fau-
cibus semper, leo velit ultricies tellus, ac venenatis arcu wisi vel nisl. Vestibulum
diam. Aliquam pellentesque, augue quis sagittis posuere, turpis lacus congue
quam, in hendrerit risus eros eget felis. Maecenas eget erat in sapien mattis
porttitor. Vestibulum porttitor. Nulla facilisi. Sed a turpis eu lacus commodo
facilisis. Morbi fringilla, wisi in dignissim interdum, justo lectus sagittis dui, et
vehicula libero dui cursus dui. Mauris tempor ligula sed lacus. Duis cursus enim
ut augue. Cras ac magna. Cras nulla. Nulla egestas. Curabitur a leo. Quisque
egestas wisi eget nunc. Nam feugiat lacus vel est. Curabitur consectetuer.
    Suspendisse vel felis. Ut lorem lorem, interdum eu, tincidunt sit amet,


                                         1


================================================
FILE: pyproject.toml
================================================
[project]
name = "dpsprep"
version = "2.5.4"
description = "A DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines and text layers"
requires-python = ">=3.11, <4.0"
authors = [
  { name = "Kevin Arthur Schiff Croker" },
  { name = "Ianis Vasilev", email = "ianis@ivasilev.net" }
]
license = "GPL-3.0-or-later"
dependencies = [
  "click (>=8)",
  "djvulibre-python (>=0.9.3)",
  "fpdf2 (>=2.8)",
  "loguru (>=0.7)",
  "pdfrw (>=0.4)",
  "pillow (>=12.2.0)"
]

[project.urls]
Repository = "https://github.com/kcroker/dpsprep.git"
Changelog = "https://github.com/kcroker/dpsprep/blob/master/CHANGELOG.md"

[project.optional-dependencies]
compress = [
  "ocrmypdf (>=17)"
]

[project.scripts]
dpsprep = "dpsprep:dpsprep"

[dependency-groups]
dev = [
  "mypy (>=1.19)",
  "pytest (>=9.0.3)",
  "ruff (>=0.15)",
  "types-fpdf2 (>=2.8.4.20260322)"
]

[build-system]
# uv build complains if no upper bound is set, but it updates its minor versions often, so we put a major version just to shut it up
requires = ["uv_build (>=0.10, <1)"]
build-backend = "uv_build"

# uv
[tool.uv]
# Resolve every direct dependency to the lowest version its specifier allows;
# transitive dependencies still resolve to the latest compatible versions.
# Presumably chosen so that the declared lower bounds are actually exercised.
resolution = "lowest-direct"

[tool.uv.build-backend]
module-root = ""  # uv-build expects the code to be in src/dpsprep, but I did not want to move it when migrating to uv

# pytest
[tool.pytest.ini_options]
# tee-sys captures writes to sys.stdout/sys.stderr for the test report while
# still passing them through to the real streams.
addopts = "--capture tee-sys"

# ruff
[tool.ruff]
line-length = 120

[tool.ruff.lint]
# Rule families to enable; each entry is annotated with the linter it comes from.
select = [
  "A",     # flake8-builtins
  "ANN",   # flake8-annotations
  "ARG",   # flake8-unused-arguments
  "ASYNC", # flake8-async
  "B",     # flake8-bugbear
  "C4",    # flake8-comprehensions
  "C90",   # mccabe
  "COM",   # flake8-commas
  "E",     # pycodestyle error
  "F",     # pyflakes
  "FURB",  # refurb
  "I",     # isort
  "INP",   # flake8-no-pep420
  "N",     # pep8-naming
  "PERF",  # perflint
  "PL",    # pylint
  "PT",    # flake8-pytest-style
  "PTH",   # flake8-use-pathlib
  "Q",     # flake8-quotes
  "RUF",   # ruff
  "S",     # flake8-bandit
  "SIM",   # flake8-simplify
  "TC",    # flake8-type-checking
  "TRY",   # tryceratops
  "UP",    # pyupgrade
  "W",     # pycodestyle warning
]
# Individual rules switched off across the whole project.
ignore = [
  "E501",    # line-too-long
  "PLC1901", # compare-to-empty-string
  "PLR6301", # no-self-use
  "PTH123",  # builtin-open
  "RUF001", "RUF002", "RUF003", # ambiguous-unicode-character-{string,docstring,comment}
]

[tool.ruff.lint.isort]
lines-after-imports = 2

[tool.ruff.lint.flake8-quotes]
inline-quotes = "single"
multiline-quotes = "single"

[tool.ruff.lint.per-file-ignores]
# Test modules may use bare `assert` statements (S101) and compare against
# literal "magic" values (PLR2004).
"test_*.py" = ["S101", "PLR2004"]

# mypy
[tool.mypy]
packages = ["dpsprep"]

[[tool.mypy.overrides]]
# Silence mypy's missing-import errors for these third-party packages
# (presumably because they ship no type information).
module = [
  "djvu.*",
  "ocrmypdf.*",
  "pdfrw.*"
]
ignore_missing_imports = true