Repository: kcroker/dpsprep Branch: master Commit: 93f377b6e0d0 Files: 32 Total size: 63.1 KB Directory structure: gitextract_m64660jx/ ├── .github/ │ └── workflows/ │ └── test.yml ├── .gitignore ├── .python-version ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── dpsprep/ │ ├── __init__.py │ ├── conftest.py │ ├── dpsprep.py │ ├── images.py │ ├── logging.py │ ├── ocrmypdf.py │ ├── outline.py │ ├── pdf.py │ ├── py.typed │ ├── sexpr.py │ ├── test_images.py │ ├── test_outline.py │ ├── test_text.py │ ├── text.py │ └── workdir.py ├── dpsprep.1 ├── dpsprep.1.ronn ├── fixtures/ │ ├── .gitattributes │ ├── Makefile │ ├── lipsum.tex │ ├── lipsum_01.txt │ ├── lipsum_lines.djvu │ ├── lipsum_words.djvu │ └── lipsum_words_invalid.djvu └── pyproject.toml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/test.yml ================================================ name: Run tests on: [push] jobs: test: strategy: fail-fast: false matrix: os: [ubuntu-24.04, macos-14] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 - name: Install prerequisites on Ubuntu if: matrix.os == 'ubuntu-24.04' run: | sudo apt update sudo apt install --yes libdjvulibre21 libdjvulibre-dev - name: Install prerequisites on macOS if: matrix.os == 'macos-14' run: brew install djvulibre libtiff # - name: Install prerequisites on Windows # if: matrix.os == 'windows-2022' # run: | # choco install djvu-libre # vcpkg install tiff - uses: astral-sh/setup-uv@v7 - name: Install dependencies run: uv sync --all-extras - name: Lint run: make lint - name: Test run: make test ================================================ FILE: .gitignore ================================================ .pytest_cache .ruff_cache .tests ================================================ FILE: .python-version ================================================ 3.11 
================================================ FILE: CHANGELOG.md ================================================ ## v2.5.4 (2026-04-24) * Run `uv` security audit and update some dependencies ## v2.5.3 (2026-03-25) * Fix broken workflow without text layer translation * Shorter names for temporary directories * Code maintenance ## v2.5.2 (2026-03-25) * Relax dependency versions ## v2.5.1 (2026-03-14) * Allow manually configuring PDF page resolution (DPI) ## v2.5.0 (2026-03-13) * Account for DjVu file resolution * Simplify image diffing and regenerate better-quality fixtures ## v2.4.2 (2026-02-24) * Fix issue where only the main process has its logger configured ## v2.4.1 (2026-02-24) * Fix compatibility issues with the new OCRmyPDF API * Remove support for Python 3.10 ## v2.4.0 (2026-02-24) * Migrate to `uv` from `pyenv` + `poetry` * Update dependencies ## v2.3.1 (2025-10-28) * Fix mixed-up email format ## v2.3.0 (2025-10-28) * Remove support for Python 3.9 * Migrate to standardized `pyproject.toml` * Update dependencies ## v2.2.15 (2025-07-02) * Add support for installation via `pipx` ## v2.2.14 (2025-05-27) * Improve installation notes * Bump djvulibre-python version ## v2.2.13 (2025-02-12) * Fail-safe quality settings for non-JPEG images ## v2.2.12 (2025-01-27) * Update pytest_image_diff and fix newly broken tests ## v2.2.11 (2025-01-26) * Update dependencies ## v2.2.10 (2024-10-25) * Improve interface with OCRmyPDF * Fix CI build ## v2.2.9 (2024-10-25) * Improve type hints * Update dependencies ## v2.2.8 (2024-10-18) * Support single characters in the text layer ## v2.2.7 (2024-08-27) * Improve tab and newline handling ## v2.2.6 (2024-08-05) * Fix accidental whitespace removal from text blocks ## v2.2.5 (2024-07-20) * Re-add ability to force the image mode (RGB/Grayscale/Monochrome) ## v2.2.4 (2024-02-24) * Update dependencies ## v2.2.3 (2023-12-09) * Fix CI build * Ignore invalid UTF-8 sequences * Ignore unrecognized page titles in the outline (#23) ## 
v2.2.2 (2023-10-29) * Update dependencies ## v2.2.1 (2023-11-06) * Handle invalid PDF pages * Fix exception in text layer processing (#20) ## v2.2.0 (2023-10-28) * Add options for disabling the text layer and for directly running OCR ## v2.1.5 (2023-10-27) * Fix inverted colors in images (#16) ## v2.1.4 (2023-10-06) * Fix typo in logging code ## v2.1.3 (2023-10-06) * Improve logging ## v2.1.2 (2023-10-02) * Accidental version bump ## v2.1.1 (2023-10-02) * Remove debug code ## v2.1.0 (2023-10-02) * Add support for OCRmyPDF ## v2.0.2 (2023-08-03) * Update some other dependencies * Replace `python-djvulibre` with `djvulibre-python` ## v2.0.1 (2023-06-22) * Minor improvements in packaging ## v2.0.0 (2023-05-04) * Fully rewrite ================================================ FILE: LICENSE ================================================ Copyright (C) 2015 Kevin Arthur Schiff Croker This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
================================================ FILE: Makefile ================================================ .PHONY: lint test lint: uv run ruff check uv run mypy test: uv run pytest dpsprep.1: dpsprep.1.ronn ronn --roff dpsprep.1.ronn ================================================ FILE: README.md ================================================ # dpsprep [![Tests](https://github.com/kcroker/dpsprep/actions/workflows/test.yml/badge.svg)](https://github.com/kcroker/dpsprep/actions/workflows/test.yml) [![AUR Package](https://img.shields.io/aur/version/dpsprep)](https://aur.archlinux.org/packages/dpsprep) This tool, initially made specifically for use with Sony's Digital Paper System (DPS), is now a general-purpose DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines (e.g. TOC) and text layers (e.g. OCR). ## Usage Full example (the name of the PDF is optional and inferred from the input name): dpsprep --pool=8 --quality=50 input.djvu output.pdf If you have [OCRmyPDF](https://github.com/ocrmypdf/OCRmyPDF) installed, you can use its PDF optimizer: dpsprep -O3 input.djvu You can also skip translating the text layer (it is sometimes not translated well) and redo the OCR (rather than launching the `ocrmypdf` CLI, we use the API directly and accept options in JSON format): dpsprep --ocr '{"language": ["rus", "eng"]}' input.djvu Consult the man file ([online](./dpsprep.1.ronn)) for details; there are a lot of options to consider. See the next section for different ways to run the program. ## Installation ### Automated An easy way to install a `dpsprep` executable for the current user is via [`uv`](https://docs.astral.sh/uv/): uv tool install dpsprep --from git+https://github.com/kcroker/dpsprep For better compression (see below), the `compress` extra must be specified: uv tool install dpsprep --from git+https://github.com/kcroker/dpsprep[compress] Sometimes a particular feature branch needs to be tested.
For installing a fixed revision (i.e. commit/branch/tag), the following should work (if `extra-name` is needed, use `dpsprep@rev[extra-name]`): uv tool install dpsprep --from git+https://github.com/kcroker/dpsprep@rev The only hard prerequisite is `djvulibre` (e.g. `djvulibre` on Arch, `libdjvulibre-dev` on Ubuntu, etc.). We use the Python bindings from the package [`djvulibre-python`](https://github.com/FriedrichFroebel/python-djvulibre) (not to be confused with the unmaintained [`python-djvulibre`](https://github.com/jwilk-archive/python-djvulibre); see [this pull request](https://github.com/kcroker/dpsprep/pull/10)). > [!TIP] > A few people have reported installation problems; see [this possible solution](https://github.com/kcroker/dpsprep/issues/38) and [this sample Dockerfile](https://github.com/kcroker/dpsprep/pull/37). > [!NOTE] > Note that Windows support in `djvulibre-python` requires 64-bit `djvulibre`, and they only officially distribute 32-bit Windows packages. If you manage to make it work, consider opening a pull request. Optional prerequisites are: * `libtiff` for bitonal image compression. * `libjpeg` (or `libjpeg-turbo`) for multitonal (RGB or grayscale) compression. * `OCRmyPDF` and `jbig2enc` for PDF optimization (see the next section). `libtiff` depends on `libjpeg`, so installing `libtiff` will likely install both. For details on how these dependencies can be installed, see the GitHub Actions [workflow](./.github/workflows/test.yml) and the [dpsprep](https://aur.archlinux.org/packages/dpsprep) package for Arch Linux. ### Manual Setting up the project manually is again done via `uv`. Once inside the cloned repository, the environment for the program can be set up by simply running `uv sync --all-extras`.
After that, the following should work: uv run dpsprep [OPTIONS] SRC [DEST] > [!NOTE] > Previous versions used [`pyenv`](https://github.com/pyenv/pyenv) for managing Python versions and [`poetry`](https://python-poetry.org/) for managing dependencies and building. Since then the project migrated to `uv`, which subsumes both and provides other niceties. You can also build and install the project, for example via [`pipx`](https://pipx.pypa.io/en/stable/): uv build --wheel pipx install --include-deps dist/*.whl > [!TIP] > The build can fail if the [`uv_build`](https://docs.astral.sh/uv/concepts/build-backend/) Python package is not installed. Make sure not only the `uv` binary, but also the corresponding Python package is available. For example, in the Arch repositories, these are distinct packages, `uv` and `python-uv`. Alternatively, try to install the [`uv-build`](https://pypi.org/project/uv-build/) PyPI package (`python-uv-build` in Arch) explicitly in this case. If you want `dpsprep` to be able to use `ocrmypdf` from `pipx`'s isolated environment, you must [inject](https://fig.io/manual/pipx/inject) it explicitly via pipx inject dpsprep ocrmypdf > [!TIP] > If you are packaging this for some other package manager, consider using PEP-517 tools as shown in [this PKGBUILD file](https://aur.archlinux.org/cgit/aur.git/tree/PKGBUILD?h=dpsprep). > [!NOTE] > Previous versions of the tool itself used to depend on third-party binaries, but this is no longer the case. The test fixtures are checked in, however regenerating them (see [`./fixtures/Makefile`](./fixtures/Makefile)) requires `pdflatex` (texlive, among others), `gs` (Ghostscript), `oxipng` (oxipng), `pdftotext` (Poppler), `djvudigital` (GSDjVU) and `djvused` (DjVuLibre). Similarly, the man file is checked in, but building it from markdown depends on `ronn`.
## Details ### Compression We perform compression in two stages: * The first one is the default compression provided by [Pillow](https://github.com/python-pillow/Pillow). For bitonal images, [the PDF generation code says](https://github.com/python-pillow/Pillow/blob/a088d54509e42e4eeed37d618b42d775c0d16ef5/src/PIL/PdfImagePlugin.py#L138C16-L138C16) that, if `libtiff` is available, `group4` compression is used. * If [OCRmyPDF](https://github.com/ocrmypdf/OCRmyPDF) is installed, its PDF optimization can be used via the flags `-O1` to `-O3` (this involves no OCR). This allows us to use advanced techniques, including JBIG2 compression via `jbig2enc`. If manually running OCRmyPDF, note that the optimization command suggested [in the documentation](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html#optimize-images-without-performing-ocr) (setting `--tesseract-timeout` to `0`) may ruin existing text layers. To perform only PDF optimization you can use the following undocumented tool instead: python -m ocrmypdf.optimize ### Text layer The visible contents of a DjVu file are well-compressed images (see [here](http://yann.lecun.com/ex/djvu/index.html)). But a DjVu file also contains a "text layer" stored as metadata attached to invisible rectangular blocks. PDF does not support such constructs, so we do a little hack. We render each page as an image and put it as a background in the PDF. We then use a font, [`invisible1.ttf`](./dpsprep/invisible.ttf), taken from [here](https://www.angelfire.com/pr/pgpf/if.html), to "draw" text. Every time we draw a block of text, we rescale the font so that the width of the text matches that of the corresponding DjVu block. > [!NOTE] > The font is small (12kb) and contains (invisible) Latin, Cyrillic and Greek characters. Even Chinese characters seem to be working correctly, at least with [Evince](https://gitlab.gnome.org/GNOME/evince). 
The following screenshot displays the result of converting a DjVu document: ![Image](./screenshots/lipsum_with_image.png) The following screenshot displays the same document without the background image and with the invisible font replaced by Times New Roman: ![Image](./screenshots/lipsum_with_text.png) Since the image is actually drawn on top of the text, there is no harm in using an actual visible font, possibly rendered using a transparent "color". Still, when searching and selecting text, the scrambled letters from the second image would be highlighted. With the invisible font, there are no visible glyphs to highlight, so an illusory "block" containing the text is highlighted instead. See [`./dpsprep/text.py`](./dpsprep/text.py) for the implementation. ## Kevin's notes regarding the first version I wrote this with the specific intent of converting ebooks in the DJVU format into PDFs for use with the fantastic (but pricey) Sony Digital Paper System. DjVu technology is strikingly superior for many ebook applications, yet the Sony Digital Paper System (rev 1.3 US) only supports PDF technology: this is because its primary design purpose is not as an ereader. The device, however, is quite nearly the **perfect** ereader. Unfortunately, all presently available DjVu to PDF tools seem to just dump flattened enormous TIFF images. This is ridiculous. Since PDF really can't do that much better on the way it stores image data, a 5-6x bloat cannot be avoided. However, none of the existing tools preserve: * The OCR'd text content * Table of Contents or Internal links This is kind of silly, but until Sony's Digital Paper, there was no need to move functional DjVu files to PDFs. 
In order to make workable PDFs from DjVu files for use on the Digital Paper System, I have implemented in one location the following procedures detailed here: By automating the procedure of user zetah for extracting the text and getting it in the correct locations: http://askubuntu.com/questions/46233/converting-djvu-to-pdf (OCR text transfer) By implementing the procedure of user pyrocrasty for extracting the outline, and putting it into the PDF generated above: http://superuser.com/questions/801893/converting-djvu-to-pdf-and-preserving-table-of-contents-how-is-it-possible (bookmark transfer) ================================================ FILE: dpsprep/__init__.py ================================================ from .dpsprep import dpsprep __all__ = ['dpsprep'] ================================================ FILE: dpsprep/conftest.py ================================================ import loguru import pytest @pytest.fixture(autouse=True) def disable_loguru() -> None: loguru.logger.remove() ================================================ FILE: dpsprep/dpsprep.py ================================================ import json import multiprocessing.pool import shutil from time import time import click import djvu.decode import loguru import pdfrw from .images import ImageMode, failsafe_save_djvu_page, process_djvu_page from .logging import configure_loguru, human_readable_size from .ocrmypdf import optimize_pdf, perform_ocr from .outline import OutlineTransformVisitor from .pdf import combine_pdfs_on_fs_with_text, combine_pdfs_on_fs_without_text, is_valid_pdf from .text import djvu_pages_to_text_fpdf from .workdir import WorkingDirectory def process_page_bg(workdir: WorkingDirectory, mode: ImageMode, quality: int | None, dpi: int | None, i: int, *, verbose: bool) -> None: # noqa: PLR0913 configure_loguru(verbose=verbose) page_number = i + 1 if workdir.get_page_pdf_path(i).exists(): if is_valid_pdf(workdir.get_page_pdf_path(i)): loguru.logger.debug(f'Image data 
from page {page_number} already processed.') return loguru.logger.debug(f'Invalid page generated for {page_number}, regenerating.') else: loguru.logger.debug(f'Processing image data from page {page_number}.') start_time = time() document = djvu.decode.Context().new_document( djvu.decode.FileURI(workdir.src), ) document.decoding_job.wait() page_bg = process_djvu_page(document.pages[i], mode, i) failsafe_save_djvu_page( page_bg, workdir.get_page_pdf_path(i), quality, dpi, page_number, ) pdf_size = workdir.get_page_pdf_path(i).stat().st_size loguru.logger.debug(f'Image data with size {human_readable_size(pdf_size)} from page {page_number} processed in {time() - start_time:.2f}s and written to working directory.') def process_text(workdir: WorkingDirectory, dpi: int | None, *, verbose: bool) -> None: configure_loguru(verbose=verbose) if workdir.text_layer_pdf_path.exists(): loguru.logger.info('Text data already processed.') return loguru.logger.debug('Processing text data.') start_time = time() document = djvu.decode.Context().new_document( djvu.decode.FileURI(workdir.src), ) document.decoding_job.wait() fpdf = djvu_pages_to_text_fpdf(document.pages, dpi) fpdf.output(str(workdir.text_layer_pdf_path)) pdf_size = workdir.text_layer_pdf_path.stat().st_size loguru.logger.info(f'Text data with size {human_readable_size(pdf_size) } processed in {time() - start_time:.2f}s and written to working directory') @click.option('-d', '--delete-working', is_flag=True, help='Delete any existing files in the working directory prior to writing to it.') @click.option('-w', '--preserve-working', is_flag=True, help='Preserve the working directory after script termination.') @click.option('-o', '--overwrite', is_flag=True, help='Overwrite destination file.') @click.option('-v', '--verbose', is_flag=True, help='Display debug messages.') @click.option('-t', '--no-text', is_flag=True, help='Disable the generation of text layers. 
Implied by --ocr.') @click.option('-O1', 'optlevel', flag_value=1, help='Use the lossless PDF image optimization from OCRmyPDF (without performing OCR).') @click.option('-O2', 'optlevel', flag_value=2, help='Use the PDF image optimization from OCRmyPDF.') @click.option('-O3', 'optlevel', flag_value=3, help='Use the aggressive lossy PDF image optimization from OCRmyPDF.') @click.option('-p', '--pool-size', type=click.IntRange(min=0), default=4, help='Size of MultiProcessing pool for handling page-by-page operations.') @click.option('-q', '--quality', type=click.IntRange(min=0, max=100), help="Quality of images in output. Used only for JPEG compression, i.e. RGB and Grayscale images. Passed directly to Pillow and to OCRmyPDF's optimizer.") @click.option('-m', '--mode', type=click.Choice(['infer', 'bitonal', 'grayscale', 'rgb']), default='infer', help='Override the image modes encoded in the DjVu file for individual pages. It sometimes makes sense to force bitonal images since they compress well.') @click.option('--dpi', type=click.IntRange(min=1), help='Override DPI values encoded in the DjVu file for individual pages.') @click.option('--ocr', type=str, is_flag=False, flag_value='{}', help='Perform OCR via OCRmyPDF rather than trying to convert the text layer. 
If this parameter has a value, it should be a JSON dictionary of options to be passed to OCRmyPDF.') @click.version_option() @click.argument('dest', type=click.Path(exists=False, resolve_path=True), required=False) @click.argument('src', type=click.Path(exists=True, resolve_path=True), required=True) @click.command() def dpsprep( # noqa: C901, PLR0912, PLR0913, PLR0915 src: str, dest: str | None, quality: int | None, dpi: int | None, pool_size: int, mode: ImageMode, optlevel: int | None, ocr: str | None, *, verbose: bool, overwrite: bool, delete_working: bool, preserve_working: bool, no_text: bool, ) -> None: configure_loguru(verbose=verbose) workdir = WorkingDirectory(src, dest) if ocr is None: ocr_options = None else: try: ocr_options = json.loads(ocr) except ValueError as err: msg = f'The OCR options {ocr!r} are not valid JSON.' raise SystemExit(msg) from err else: if not isinstance(ocr_options, dict): msg = f'The OCR options {ocr!r} are not a JSON dictionary.' raise SystemExit(msg) no_text = True if not overwrite and workdir.dest.exists(): msg = f'File {workdir.dest} already exists.' 
raise SystemExit(msg) start_time = time() if workdir.workdir.exists(): if delete_working: loguru.logger.debug(f'Removing existing working directory {workdir.workdir}.') workdir.destroy() loguru.logger.info(f'Removed existing working directory {workdir.workdir}.') else: loguru.logger.info(f'Reusing working directory {workdir.workdir}.') else: loguru.logger.info(f'Working directory {workdir.workdir} has been created.') workdir.create_if_necessary() document = djvu.decode.Context().new_document( djvu.decode.FileURI(workdir.src), ) document.decoding_job.wait() djvu_size = workdir.src.stat().st_size loguru.logger.info(f'Processing {workdir.src} with {len(document.pages)} pages and size {human_readable_size(djvu_size)} using {pool_size} workers.') pool = multiprocessing.Pool(processes=pool_size) tasks = list[multiprocessing.pool.AsyncResult]() if not no_text: tasks.append(pool.apply_async(func=process_text, args=[workdir, dpi], kwds={'verbose': verbose})) for i in range(len(document.pages)): # Cannot pass the page object itself because it does not support serialization for IPC tasks.append(pool.apply_async(func=process_page_bg, args=[workdir, mode, quality, dpi, i], kwds={'verbose': verbose})) pool.close() pool_is_working = True while pool_is_working: pool_is_working = False for task in tasks: try: task.get(timeout=25) except multiprocessing.TimeoutError: pool_is_working = True pool.join() loguru.logger.info('Processed all pages.') outline = pdfrw.IndirectPdfDict() if len(document.outline.sexpr) > 0: loguru.logger.info('Processing metadata.') outline = OutlineTransformVisitor().visit(document.outline.sexpr) loguru.logger.info('Metadata processed.') else: loguru.logger.info('No metadata to process.') loguru.logger.info('Combining everything.') if no_text: combine_pdfs_on_fs_without_text(workdir, outline, len(document.pages)) ocr_success = False if ocr_options: loguru.logger.info('Performing OCR.') ocr_success = perform_ocr(workdir, ocr_options) else: 
loguru.logger.info('Skipping the text layer.') if not ocr_success: shutil.copy(workdir.combined_pdf_without_text_path, workdir.combined_pdf_path) else: combine_pdfs_on_fs_with_text(workdir, outline) combined_size = workdir.combined_pdf_path.stat().st_size loguru.logger.info(f'Produced a combined output file with size {human_readable_size(combined_size)} in {time() - start_time:.2f}s. This is {round(100 * combined_size / djvu_size, 2)}% of the DjVu source file.') opt_success = False if optlevel is not None: loguru.logger.info(f'Performing level {optlevel} optimization.') opt_success = optimize_pdf(workdir, optlevel, quality, pool_size) if opt_success: opt_size = workdir.optimized_pdf_path.stat().st_size loguru.logger.info(f'The optimized file has size {human_readable_size(opt_size)}, which is {round(100 * opt_size / combined_size, 2)}% of the raw combined file and {round(100 * opt_size / djvu_size, 2)}% of the DjVu source file.') if opt_size < combined_size: loguru.logger.info('Using the optimized file.') shutil.copy(workdir.optimized_pdf_path, workdir.dest) else: loguru.logger.info('Using the raw combined file.') shutil.copy(workdir.combined_pdf_path, workdir.dest) else: shutil.copy(workdir.combined_pdf_path, workdir.dest) if preserve_working: loguru.logger.info(f'Working directory {workdir.workdir} will be preserved.') else: loguru.logger.info(f'Deleting the working directory {workdir.workdir}.') workdir.destroy() ================================================ FILE: dpsprep/images.py ================================================ import pathlib from typing import Literal, NamedTuple import djvu.decode import loguru import PIL.features from PIL import Image, ImageOps ImageMode = Literal['rgb', 'grayscale', 'bitonal', 'infer'] djvu_pixel_formats = { 'rgb': djvu.decode.PixelFormatRgb(byte_order='RGB'), 'grayscale': djvu.decode.PixelFormatGrey(), 'bitonal': djvu.decode.PixelFormatPackedBits('>'), } for pixel_format in djvu_pixel_formats.values(): 
pixel_format.rows_top_to_bottom = 1 pixel_format.y_top_to_bottom = 0 pil_modes = { 'rgb': 'RGB', 'grayscale': 'L', 'bitonal': '1', } class ProcessedPageBackground(NamedTuple): pil_image: Image.Image resolution: int def process_djvu_page(page: djvu.decode.Page, mode: ImageMode, i: int) -> ProcessedPageBackground: page_job = page.decode(wait=True) width, height = page_job.size buffer = bytearray(3 * width * height) # RGB at most rect = (0, 0, width, height) if mode == 'infer': mode = 'bitonal' if page_job.type == djvu.decode.PAGE_TYPE_BITONAL else 'rgb' if mode == 'bitonal': if not PIL.features.check_codec('libtiff'): loguru.logger.warning('Bitonal image compression may suffer because Pillow has been built without libtiff support.') elif not PIL.features.check_codec('jpg'): loguru.logger.warning('Multitonal image compression may suffer because Pillow has been built without libjpeg support.') try: page_job.render( # RENDER_COLOR is simply a default value and doesn't actually imply colors mode=djvu.decode.RENDER_COLOR, page_rect=rect, render_rect=rect, pixel_format=djvu_pixel_formats[mode], buffer=buffer, ) except djvu.decode.NotAvailable: loguru.logger.warning(f'libdjvu claims that data for page {i + 1} is not available. Producing a blank page.') image = Image.new( pil_modes['bitonal'], page_job.size, 1, ) return ProcessedPageBackground(image, page_job.dpi) image = Image.frombuffer( pil_modes[mode], page_job.size, buffer, 'raw', ) return ProcessedPageBackground( # I have experimentally determined that we need to invert the black-and-white images. 
-- Ianis, 2023-05-13 # See also https://github.com/kcroker/dpsprep/issues/16 ImageOps.invert(image) if mode == 'bitonal' else image, page_job.dpi, ) def failsafe_save_djvu_page(page_bg: ProcessedPageBackground, target: pathlib.Path, quality: int | None, dpi: int | None, page_number: int) -> None: if quality is not None: if page_bg.pil_image.mode in pil_modes['bitonal'] and PIL.features.check_codec('libtiff'): loguru.logger.warning('Pillow uses TIFF for encoding bitonal PDF images. The encoder does not support a "quality" setting. If the conversion fails, please try again without specifying quality.') try: page_bg.pil_image.save( target, format='PDF', quality=quality, resolution=dpi or page_bg.resolution, ) except ValueError: loguru.logger.warning(f'Failed to encode page {page_number}. Trying again without setting quality.') else: return page_bg.pil_image.save( target, format='PDF', resolution=dpi or page_bg.resolution, ) ================================================ FILE: dpsprep/logging.py ================================================ import os import sys from types import TracebackType import loguru cached_stdout = sys.stdout def configure_loguru(*, verbose: bool) -> None: loguru.logger.remove() loguru.logger.add( cached_stdout, format='{level} {time:HH:mm:ss} {message}', level='DEBUG' if verbose else 'INFO', ) def human_readable_size(size: int) -> str: # ruff: disable[PLR2004] if size < 1024: return f'{size} bytes' if size < 1024 ** 2: return f'{size / 1024:.02f} KiB' return f'{size / 1024 ** 2:.02f} MiB' # ruff: enable[PLR2004] # img2pdf abuses debug logging by using print # This is a way to temporarily silence it class SilencePrint: def __enter__(self) -> None: sys.stdout = open(os.devnull, 'w', encoding='utf-8') def __exit__( self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: TracebackType | None, ) -> None: sys.stdout.close() sys.stdout = cached_stdout ================================================ FILE: 
dpsprep/ocrmypdf.py ================================================ # We use OCRmyPDF in a non-canonical way: only optimize the file without performing any OCR. # The optimization procedure provides good results and preserves the text layer and outline. # The code here is based on # https://github.com/ocrmypdf/OCRmyPDF/blob/fb006ef39f7f8842dec1976bebe4bcd5ca2e8df8/src/ocrmypdf/optimize.py#L724 # with some simplifications for OCRmyPDF 17 # ruff: noqa: PLC0415 import shutil from typing import Any import loguru from .workdir import WorkingDirectory def optimize_pdf(workdir: WorkingDirectory, optlevel: int, quality: int | None, pool_size: int) -> bool: try: # ObjectStreamMode is actually from pikepdf, but I did not want to include that as a dependency from ocrmypdf._options import OcrOptions from ocrmypdf.optimize import ObjectStreamMode, PdfContext, optimize from ocrmypdf.pdfinfo import PdfInfo except ImportError: loguru.logger.warning('Cannot detect OCRmyPDF. No optimizations will be performed on the output file.') return False options = OcrOptions( input_file=workdir.combined_pdf_without_text_path, output_file=workdir.combined_pdf_path, # Jobs correspond to CPU cores rather than threads, but it seems better to use the available pool size parameter jobs=pool_size, optimize=optlevel, # When 0, these should be adjusted inside OCRmyPDF's "optimize" function jpg_quality=quality or 0, png_quality=quality or 0, ) info = PdfInfo(workdir.combined_pdf_path) context = PdfContext(options, workdir.ocrmypdf_tmp_path, workdir.combined_pdf_path, info, None) optimize( workdir.combined_pdf_path, workdir.optimized_pdf_path, context, { 'compress_streams': True, 'preserve_pdfa': True, 'object_stream_mode': ObjectStreamMode.generate, }, ) return True def perform_ocr(workdir: WorkingDirectory, options: dict[str, Any]) -> bool: try: from ocrmypdf import api except ImportError: loguru.logger.warning('Cannot detect OCRmyPDF. 
No OCR will be performed on the output file.') return False try: api.ocr( input_file_or_options=workdir.combined_pdf_without_text_path, output_file=workdir.combined_pdf_path, **options, ) except Exception as err: loguru.logger.warning(f'OCRmyPDF failed: {err}') shutil.copy(workdir.combined_pdf_without_text_path, workdir.combined_pdf_path) return False else: return True ================================================ FILE: dpsprep/outline.py ================================================ import djvu.sexpr import loguru from pdfrw import IndirectPdfDict, PdfDict, PdfName from .sexpr import SExpressionVisitor # Based on # https://github.com/pmaupin/pdfrw/issues/52#issuecomment-271190546 class OutlineTransformVisitor(SExpressionVisitor[PdfDict]): def visit_plain_list(self, node: djvu.sexpr.StringExpression, parent: IndirectPdfDict) -> PdfDict: title, page, *rest = node # I have experimentally determined that we need to translate page indices. -- Ianis, 2023-05-03 try: page_number = int(page.value[1:]) - 1 except ValueError: # As far as I understand, python-djvulibre doesn't support Djvu's page titles. 
-- Ianis, 2023-12-09 loguru.logger.warning(f'Could not determine page number from the page title {page.value}.') return None try: title_text = title.value except UnicodeDecodeError: loguru.logger.warning(f'Could not decode page title {title!r}; leaving it in escaped form.') title_text = str(title) bookmark = IndirectPdfDict( Parent = parent, Title = title_text, A = PdfDict( D = [page_number, PdfName.Fit], S = PdfName.GoTo, ), ) if parent.Count is None: parent.Count = 0 parent.First = bookmark else: bookmark.Prev = parent.Last bookmark.Prev.Next = bookmark parent.Count += 1 parent.Last = bookmark for child in rest: self.visit(child, parent=bookmark) return bookmark def visit_list_bookmarks(self, node: djvu.sexpr.ListExpression) -> PdfDict: _, *rest = node outline = IndirectPdfDict() for child in rest: self.visit(child, parent=outline) return outline ================================================ FILE: dpsprep/pdf.py ================================================ import pathlib import pdfrw from .workdir import WorkingDirectory def is_valid_pdf(path: pathlib.Path) -> bool: try: pdfrw.PdfReader(path) except pdfrw.errors.PdfParseError: return False else: return True def combine_pdfs_on_fs_with_text(workdir: WorkingDirectory, outline: pdfrw.IndirectPdfDict) -> None: text_pdf = pdfrw.PdfReader(workdir.text_layer_pdf_path) writer = pdfrw.PdfWriter() for i, text_page in enumerate(text_pdf.pages): # We take the one-page text PDF and add the image layer on top # Even if the font was not invisible, it would be hidden visually (but not during search or text highlight) image_pdf = pdfrw.PdfReader(workdir.get_page_pdf_path(i)) image_page = image_pdf.pages[0] merger = pdfrw.PageMerge(text_page) merger.add(image_page).render() writer.addpage(text_page) writer.trailer.Root.Outlines = outline writer.write(workdir.combined_pdf_path) def combine_pdfs_on_fs_without_text(workdir: WorkingDirectory, outline: pdfrw.IndirectPdfDict, max_page: int) -> None: writer = pdfrw.PdfWriter() for 
class SExpressionVisitor(Generic[R]):
    """Base class for visitors that walk DjVu S-expressions.

    Subclasses opt into node types by defining methods:

    * ``visit_int`` and ``visit_string`` for integer and string expressions;
    * ``visit_list_<symbol>`` for lists whose first element is the symbol ``<symbol>``;
    * ``visit_plain_list`` for every other list.

    A node without a matching method is reported with a warning and yields ``None``.
    Keyword arguments given to ``visit`` are forwarded unchanged to the chosen method.
    """

    def visit_list(self, node: djvu.sexpr.ListExpression, **kwargs: T) -> R | None:
        """Dispatch a list either by its leading symbol or as a plain list."""
        starts_with_symbol = len(node) > 0 and isinstance(node[0], djvu.sexpr.SymbolExpression)

        if not starts_with_symbol:
            if hasattr(self, 'visit_plain_list'):
                return self.visit_plain_list(node, **kwargs)

            loguru.logger.warning("Don't know how to visit a plain ListExpression.")
            return None

        # The handler name is derived from the leading symbol, e.g. visit_list_bookmarks
        handler = getattr(self, f'visit_list_{node[0]}', None)

        if handler is not None:
            return handler(node, **kwargs)

        loguru.logger.warning(f"Don't know how to visit ListExpression of type {str(node[0])!r}.")
        return None

    def visit_other(self, node: djvu.sexpr.Expression, **kwargs: T) -> R | None:  # noqa: ARG002
        """Fallback for expressions that are neither integers, strings nor lists."""
        loguru.logger.warning(f"Don't know how to visit S-expression type {type(node)!r}.")
        return None

    def visit(self, node: djvu.sexpr.Expression, **kwargs: T) -> R | None:
        """Visit a node by forwarding it to the method matching its expression type."""
        # Integers are checked before strings, mirroring the order of the checks below them.
        optional_handlers = (
            (djvu.sexpr.IntExpression, 'visit_int', "Don't know how to visit IntExpression."),
            (djvu.sexpr.StringExpression, 'visit_string', "Don't know how to visit StringExpression."),
        )

        for expression_type, method_name, complaint in optional_handlers:
            if not isinstance(node, expression_type):
                continue

            if hasattr(self, method_name):
                return getattr(self, method_name)(node, **kwargs)

            loguru.logger.warning(complaint)
            return None

        if isinstance(node, djvu.sexpr.ListExpression):
            return self.visit_list(node, **kwargs)

        return self.visit_other(node, **kwargs)
def calculate_image_diff_score(a: Image.Image, b: Image.Image) -> float:
    """Score how much two Pillow images differ.

    Both images must have the same size and the same mode. The score is the
    largest per-band RMS of the pixel-wise difference, divided by 256.
    """
    assert a.size == b.size, 'We only support diffing images with identical sizes'
    assert a.mode == b.mode, 'We only support diffing images with the same mode'

    per_band_rms = ImageStat.Stat(ImageChops.difference(a, b)).rms
    worst_band = max(per_band_rms)
    return worst_band / 256  # The ImageStat module uses 256 bins
def remove_whitespace(src: str) -> str:
    """Return ``src`` with every character listed in ``string.whitespace`` removed."""
    return ''.join(char for char in src if char not in string.whitespace)
class TextDrawVisitor(SExpressionVisitor):
    """Visitor that draws a DjVu text layer onto the current page of an FPDF document.

    Text is drawn with the font registered as 'Invisible' and scaled so that its
    width matches the width of the bounding box stored in the DjVu text layer.
    The actual strings are obtained through a ``TextExtractVisitor``.
    """

    pdf: FPDF
    dpi: int
    extractor: TextExtractVisitor

    def __init__(self, pdf: FPDF, dpi: int) -> None:
        self.pdf = pdf
        self.dpi = dpi
        self.extractor = TextExtractVisitor()

    def draw_text(self, x1: int, x2: int, y1: int, y2: int, text: str) -> None:  # noqa: ARG002
        """Draw ``text`` at (x1, y1), sized to span the horizontal extent x1..x2.

        Coordinates are divided by ``self.dpi`` before being passed to FPDF.
        Nothing is drawn if the page height is unknown or the text has zero width.
        """
        _page_width, page_height = self.pdf.pages[self.pdf.page].dimensions()

        if page_height is None:
            loguru.logger.warning(f'Cannot draw {text!r} because page height is not set.')
            return

        # Measure the text at the base size, then rescale so that it fills the target width
        self.pdf.set_font('Invisible', size=BASE_FONT_SIZE)
        target_width = (x2 - x1) / self.dpi
        measured_width = self.pdf.get_string_width(text)

        if measured_width == 0:
            return

        fitted_size = int(BASE_FONT_SIZE * target_width / measured_width)
        self.pdf.set_font('Invisible', size=fitted_size)

        try:
            # NOTE(review): the height is divided by 72 and y1 is subtracted from it, which looks like a
            # points-to-inches conversion combined with a flip of the vertical axis - confirm.
            self.pdf.text(x=x1 / self.dpi, y=page_height / 72 - y1 / self.dpi, text=text)
        except TypeError as err:
            loguru.logger.warning(f'FPDF refuses to draw {text!r}: {err}')

    def iter_loose_string_content(self, expressions: list[djvu.sexpr.Expression]) -> Iterable[str]:
        """Yield the extracted text of every string expression found directly in ``expressions``."""
        for expression in expressions:
            if isinstance(expression, djvu.sexpr.StringExpression):
                extracted = self.extractor.visit(expression)

                if extracted is not None:
                    yield extracted

    def get_loose_string_content(self, expressions: list[djvu.sexpr.Expression], delimiter: str) -> str:
        """Join the loose string content of ``expressions`` using ``delimiter``."""
        loose_strings = self.iter_loose_string_content(expressions)
        return delimiter.join(loose_strings)

    def _draw_loose_strings_and_descend(self, node: djvu.sexpr.ListExpression, delimiter: str) -> None:
        """Draw strings placed directly inside ``node``, then visit its non-string children."""
        _tag, x1, y1, x2, y2, *children = node
        loose_text = self.get_loose_string_content(children, delimiter)

        if len(loose_text) > 0:
            self.draw_text(x1.value, x2.value, y1.value, y2.value, loose_text)

        for child in children:
            if not isinstance(child, djvu.sexpr.StringExpression):
                self.visit(child)

    def visit_list_word(self, node: djvu.sexpr.ListExpression) -> None:
        """Draw the text that the extractor produces for a word (or char) node."""
        _tag, x1, y1, x2, y2, *_rest = node
        word = self.extractor.visit(node)

        if word is None:
            return

        self.draw_text(x1.value, x2.value, y1.value, y2.value, word)

    visit_list_char = visit_list_word

    def visit_list_line(self, node: djvu.sexpr.ListExpression) -> None:
        """Handle a line node; loose strings are joined with spaces."""
        self._draw_loose_strings_and_descend(node, ' ')

    def visit_list_para(self, node: djvu.sexpr.ListExpression) -> None:
        """Handle a paragraph node; loose strings are joined with newlines."""
        self._draw_loose_strings_and_descend(node, '\n')

    def visit_list_column(self, node: djvu.sexpr.ListExpression) -> None:
        """Visit every child of a column, page or region node without drawing anything itself."""
        _tag, _x1, _y1, _x2, _y2, *children = node

        for child in children:
            self.visit(child)

    visit_list_page = visit_list_column
    visit_list_region = visit_list_column
class WorkingDirectory:
    """Locations of the intermediate files used while converting a single DjVu file.

    All intermediate files live in one directory named after the hash of the
    source file, so another run on the same file resolves to the same directory
    as long as the same temporary root is chosen.
    """

    # The DjVu file being converted
    src: pathlib.Path
    # The resulting PDF file; defaults to the source file name with a '.pdf' suffix, without its directory
    dest: pathlib.Path
    # The directory holding all intermediate files
    workdir: pathlib.Path

    def __init__(self, src: os.PathLike | str, dest: os.PathLike | str | None) -> None:
        """Resolve source, destination and working directory paths.

        Nothing is created on disk here; see ``create_if_necessary``.
        The source file is read in order to compute its hash.
        """
        self.src = pathlib.Path(src)

        if dest is None:
            self.dest = pathlib.Path(pathlib.Path(src).with_suffix('.pdf').name)
        else:
            self.dest = pathlib.Path(dest)

        # Working path
        # If possible, we avoid the ephemeral storage /tmp
        persistent_tmp = pathlib.Path('/var/tmp')  # noqa: S108

        # os.access checks what the current user may actually do.
        # Masking st_mode with W_OK | X_OK would only inspect the permission bits for "others",
        # and would succeed even if only one of the two bits is set.
        if persistent_tmp.is_dir() and os.access(persistent_tmp, os.W_OK | os.X_OK):
            loguru.logger.debug('Using non-ephemeral storage "/var/tmp".')
            root = persistent_tmp
        else:
            loguru.logger.debug(f'Using default system storage {tempfile.gettempdir()!r}.')
            root = pathlib.Path(tempfile.gettempdir())

        self.workdir = root / 'dpsprep' / get_file_hash(self.src)

    def create_if_necessary(self) -> None:
        """Create the working directory and the OCRmyPDF temporary directory if they are missing."""
        if not self.workdir.exists():
            loguru.logger.debug(f'Creating {str(self.workdir)!r}.')
            # exist_ok guards against the directory appearing between the check and the call
            self.workdir.mkdir(parents=True, exist_ok=True)

        if not self.ocrmypdf_tmp_path.exists():
            loguru.logger.debug(f'Creating {str(self.ocrmypdf_tmp_path)!r}.')
            # Previously this branch only logged the message without creating anything
            self.ocrmypdf_tmp_path.mkdir(parents=True, exist_ok=True)

    def get_page_pdf_path(self, i: int) -> pathlib.Path:
        """Return the path of the image PDF for page index ``i``; file names are numbered from ``i + 1``."""
        return self.workdir / f'page_bg_{i + 1}.pdf'

    @property
    def text_layer_pdf_path(self) -> pathlib.Path:
        """Path of the PDF containing only the text layer."""
        return self.workdir / 'text_layer.pdf'

    @property
    def ocrmypdf_tmp_path(self) -> pathlib.Path:
        """Path of the directory named 'ocrmypdf' inside the working directory."""
        return self.workdir / 'ocrmypdf'

    @property
    def combined_pdf_without_text_path(self) -> pathlib.Path:
        """Path of the combined PDF that has no text layer."""
        return self.workdir / 'combined_without_text.pdf'

    @property
    def combined_pdf_path(self) -> pathlib.Path:
        """Path of the combined PDF that includes the text layer."""
        return self.workdir / 'combined.pdf'

    @property
    def optimized_pdf_path(self) -> pathlib.Path:
        """Path of the optimized PDF."""
        return self.workdir / 'optimized.pdf'

    def destroy(self) -> None:
        """Recursively delete the working directory."""
        shutil.rmtree(self.workdir)
.IP "\(bu" 4 \fB\-p\fR, \fB\-\-pool\-size\fR \fIint\fR: Size of MultiProcessing pool for handling page\-by\-page operations\. .IP "\(bu" 4 \fB\-m\fR, \fB\-\-mode\fR \fIinfer|bitonal|grayscale|rgb\fR: Override the image modes encoded in the DjVu file for individual pages\. It sometimes makes sense to force bitonal images since they compress well\. .IP "\(bu" 4 \fB\-\-dpi\fR \fIint\fR: Override DPI values encoded in the DjVu file for individual pages\. .IP "\(bu" 4 \fB\-\-ocr\fR \fIJSON\fR: Perform OCR via OCRmyPDF rather than trying to convert the text layer\. If this parameter has a value, it should be a JSON dictionary of options to be passed to OCRmyPDF\. .IP "\(bu" 4 \fB\-O1\fR: Use the lossless PDF image optimization from OCRmyPDF (without performing OCR)\. .IP "\(bu" 4 \fB\-O2\fR: Use the PDF image optimization from OCRmyPDF\. .IP "\(bu" 4 \fB\-O3\fR: Use the aggressive lossy PDF image optimization from OCRmyPDF\. .IP "\(bu" 4 \fB\-\-help\fR: Show help message and exit\. .IP "\(bu" 4 \fB\-\-version\fR: Show the version and exit\. .IP "" 0 .SH "EXAMPLES" Produce \fBfile\.pdf\fR in the current directory: .IP "" 4 .nf dpsprep /wherever/file\.djvu .fi .IP "" 0 .P Produce \fBoutput\.pdf\fR with reduced image quality and aggressive PDF image optimizations: .IP "" 4 .nf dpsprep \-\-quality=30 \-O3 input\.djvu output\.pdf .fi .IP "" 0 .P Produce an output file using a large pool of workers: .IP "" 4 .nf dpsprep \-\-pool\-size=16 input\.djvu .fi .IP "" 0 .P Force bitonal images: .IP "" 4 .nf dpsprep \-\-mode bitonal input\.djvu .fi .IP "" 0 .P Produce an output file by disregarding the text layer and running OCRmyPDF instead: .IP "" 4 .nf dpsprep \-\-ocr '{"language": ["rus", "eng"]}' input\.djvu .fi .IP "" 0 .P Or simply disregard the text layer without OCR: .IP "" 4 .nf dpsprep \-\-no\-text input\.djvu .fi .IP "" 0 .SH "NOTE REGARDING COMPRESSION" We perform compression in two stages: .IP "\(bu" 4 The first one is the default compression provided by Pillow\.
For bitonal images, the PDF generation code says that, if \fBlibtiff\fR is available, \fBgroup4\fR compression is used\. .IP "\(bu" 4 If OCRmyPDF is installed, its PDF optimization can be used via the flags \fB\-O1\fR to \fB\-O3\fR (this involves no OCR)\. This allows us to use advanced techniques, including JBIG2 compression via \fBjbig2enc\fR\. .IP "" 0 .P If manually running OCRmyPDF, note that the optimization command suggested in the documentation (setting \fB\-\-tesseract\-timeout\fR to \fB0\fR) may ruin existing text layers\. To perform only PDF optimization you can use the following undocumented tool instead: .IP "" 4 .nf python \-m ocrmypdf\.optimize .fi .IP "" 0 ================================================ FILE: dpsprep.1.ronn ================================================ # dpsprep(1) -- a DjVu to PDF converter ## SYNOPSIS `dpsprep` [options] src [dest] ## DESCRIPTION This tool, initially made specifically for use with Sony's Digital Paper System (DPS), is now a general-purpose DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines (e.g. TOC) and text layers (e.g. OCR). ## OPTIONS * `-q`, `--quality`: Quality of images in output. Used only for JPEG compression, i.e. RGB and Grayscale images. Passed directly to Pillow and to OCRmyPDF's optimizer. * `-v`, `--verbose`: Display debug messages. * `-o`, `--overwrite`: Overwrite destination file. * `-w`, `--preserve-working`: Preserve the working directory after script termination. * `-d`, `--delete-working`: Delete any existing files in the working directory prior to writing to it. * `-t`, `--no-text`: Disable the generation of text layers. Implied by --ocr. * `-p`, `--pool-size` <int>: Size of MultiProcessing pool for handling page-by-page operations. * `-m`, `--mode` <infer|bitonal|grayscale|rgb>: Override the image modes encoded in the DjVu file for individual pages. It sometimes makes sense to force bitonal images since they compress well.
* `--dpi` <int>: Override DPI values encoded in the DjVu file for individual pages. * `--ocr` <JSON>: Perform OCR via OCRmyPDF rather than trying to convert the text layer. If this parameter has a value, it should be a JSON dictionary of options to be passed to OCRmyPDF. * `-O1`: Use the lossless PDF image optimization from OCRmyPDF (without performing OCR). * `-O2`: Use the PDF image optimization from OCRmyPDF. * `-O3`: Use the aggressive lossy PDF image optimization from OCRmyPDF. * `--help`: Show help message and exit. * `--version`: Show the version and exit. ## EXAMPLES Produce `file.pdf` in the current directory: dpsprep /wherever/file.djvu Produce `output.pdf` with reduced image quality and aggressive PDF image optimizations: dpsprep --quality=30 -O3 input.djvu output.pdf Produce an output file using a large pool of workers: dpsprep --pool-size=16 input.djvu Force bitonal images: dpsprep --mode bitonal input.djvu Produce an output file by disregarding the text layer and running OCRmyPDF instead: dpsprep --ocr '{"language": ["rus", "eng"]}' input.djvu Or simply disregard the text layer without OCR: dpsprep --no-text input.djvu ## NOTE REGARDING COMPRESSION We perform compression in two stages: * The first one is the default compression provided by Pillow. For bitonal images, the PDF generation code says that, if `libtiff` is available, `group4` compression is used. * If OCRmyPDF is installed, its PDF optimization can be used via the flags `-O1` to `-O3` (this involves no OCR). This allows us to use advanced techniques, including JBIG2 compression via `jbig2enc`. If manually running OCRmyPDF, note that the optimization command suggested in the documentation (setting `--tesseract-timeout` to `0`) may ruin existing text layers.
# Regenerates the test fixtures from lipsum.tex.
.PHONY: all clean

all: lipsum.pdf lipsum_01.txt lipsum_01.png lipsum_lines.djvu lipsum_words.djvu lipsum_words_invalid.djvu

# -f is the portable spelling; the long option --force is not accepted by BSD/macOS rm.
clean:
	rm -f *.djvu *.pdf *.png *.txt

# Compile the LaTeX source and remove the auxiliary files left behind by pdflatex.
%.pdf: %.tex
	pdflatex $*.tex
	rm $*.aux $*.log

# Text of the first page only.
%_01.txt: %.pdf
	pdftotext -l 1 -layout $*.pdf $*_01.txt

# Monochrome 600 DPI rendering of the first page only.
%_01.png: %.pdf
	gs -sDEVICE=pngmono -r600 -dLastPage=1 -o $*_01.png $*.pdf
	oxipng $*_01.png

# DjVu file produced with the --words option of djvudigital.
%_words.djvu: %.pdf
	djvudigital --dpi=600 --words $*.pdf
	mv $*.djvu $*_words.djvu

# DjVu file produced with the --lines option of djvudigital.
%_lines.djvu: %.pdf
	djvudigital --dpi=600 --lines $*.pdf
	mv $*.djvu $*_lines.djvu

# Copy of a DjVu file in which every "Lorem" in the djvused dump is replaced by the escape \270.
%_invalid.djvu: %.djvu
	cp $*.djvu $*_invalid.djvu
	djvused $*_invalid.djvu -e 'output-all' | \
		sed 's/Lorem/\\270/g' | \
		djvused $*_invalid.djvu -f /dev/stdin -s
Integer sapien est, iaculis in, pretium quis, viverra ac, nunc. Praesent eget sem vel leo ultrices bibendum. Aenean faucibus. Morbi dolor nulla, malesuada eu, pulvinar at, mollis ac, nulla. Cur- abitur auctor semper nulla. Donec varius orci eget risus. Duis nibh mi, congue eu, accumsan eleifend, sagittis quis, diam. Duis eget orci sit amet orci dignissim rutrum. Nam dui ligula, fringilla a, euismod sodales, sollicitudin vel, wisi. Morbi auctor lorem non justo. Nam lacus libero, pretium at, lobortis vitae, ultricies et, tellus. Donec aliquet, tortor sed accumsan bibendum, erat ligula aliquet magna, vitae ornare odio metus a mi. Morbi ac orci et nisl hendrerit mollis. Suspendisse ut massa. Cras nec ante. Pellentesque a nulla. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Aliquam tincidunt urna. Nulla ullamcorper vestibulum turpis. Pellentesque cursus luctus mauris. Nulla malesuada porttitor diam. Donec felis erat, congue non, volutpat at, tincidunt tristique, libero. Vivamus viverra fermentum felis. Donec nonummy pellentesque ante. Phasellus adipiscing semper elit. Proin fermentum massa ac quam. Sed diam turpis, molestie vitae, placerat a, molestie nec, leo. Mae- cenas lacinia. Nam ipsum ligula, eleifend at, accumsan nec, suscipit a, ipsum. Morbi blandit ligula feugiat magna. Nunc eleifend consequat lorem. Sed lacinia nulla vitae enim. Pellentesque tincidunt purus vel magna. Integer non enim. Praesent euismod nunc eu purus. Donec bibendum quam in tellus. Nullam cur- sus pulvinar lectus. Donec et mi. Nam vulputate metus eu enim. Vestibulum pellentesque felis eu massa. Quisque ullamcorper placerat ipsum. Cras nibh. Morbi vel justo vitae lacus tincidunt ultrices. Lorem ipsum dolor sit amet, consectetuer adipiscing elit. In hac habitasse platea dictumst. Integer tempus convallis augue. Etiam facilisis. Nunc elementum fermentum wisi. Aenean placerat. 
Ut imperdiet, enim sed gravida sollicitudin, felis odio placerat quam, ac pulvinar elit purus eget enim. Nunc vitae tortor. Proin tempus nibh sit amet nisl. Vivamus quis tortor vitae risus porta vehicula. Fusce mauris. Vestibulum luctus nibh at lectus. Sed bibendum, nulla a fau- cibus semper, leo velit ultricies tellus, ac venenatis arcu wisi vel nisl. Vestibulum diam. Aliquam pellentesque, augue quis sagittis posuere, turpis lacus congue quam, in hendrerit risus eros eget felis. Maecenas eget erat in sapien mattis porttitor. Vestibulum porttitor. Nulla facilisi. Sed a turpis eu lacus commodo facilisis. Morbi fringilla, wisi in dignissim interdum, justo lectus sagittis dui, et vehicula libero dui cursus dui. Mauris tempor ligula sed lacus. Duis cursus enim ut augue. Cras ac magna. Cras nulla. Nulla egestas. Curabitur a leo. Quisque egestas wisi eget nunc. Nam feugiat lacus vel est. Curabitur consectetuer. Suspendisse vel felis. Ut lorem lorem, interdum eu, tincidunt sit amet, 1 ================================================ FILE: pyproject.toml ================================================ [project] name = "dpsprep" version = "2.5.4" description = "A DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines and text layers" requires-python = ">=3.11, <4.0" authors = [ { name = "Kevin Arthur Schiff Croker" }, { name = "Ianis Vasilev", email = "ianis@ivasilev.net" } ] license = "GPL-3.0-or-later" dependencies = [ "click (>=8)", "djvulibre-python (>=0.9.3)", "fpdf2 (>=2.8)", "loguru (>=0.7)", "pdfrw (>=0.4)", "pillow (>=12.2.0)" ] [project.urls] Repository = "https://github.com/kcroker/dpsprep.git" Changelog = "https://github.com/kcroker/dpsprep/blob/master/CHANGELOG.md" [project.optional-dependencies] compress = [ "ocrmypdf (>=17)" ] [project.scripts] dpsprep = "dpsprep:dpsprep" [dependency-groups] dev = [ "mypy (>=1.19)", "pytest (>=9.0.3)", "ruff (>=0.15)", "types-fpdf2 (>=2.8.4.20260322)" ] [build-system] # uv build 
[tool.ruff.lint]
# Rule families that are enabled; the comment next to each code names the family
select = [
    "A", # flake8-builtins
    "ANN", # flake8-annotations
    "ARG", # flake8-unused-arguments
    "ASYNC", # flake8-async
    "B", # flake8-bugbear
    "C4", # flake8-comprehensions
    "C90", # mccabe
    "COM", # flake8-commas
    "E", # pycodestyle error
    "F", # pyflakes
    "FURB", # refurb
    "I", # isort
    "INP", # flake8-no-pep420
    "N", # pep8-naming
    "PERF", # perflint
    "PL", # pylint
    "PT", # flake8-pytest-style
    "PTH", # flake8-use-pathlib
    "Q", # flake8-quotes
    "RUF", # ruff
    "S", # flake8-bandit
    "SIM", # flake8-simplify
    "TC", # flake8-type-checking
    "TRY", # tryceratops
    "UP", # pyupgrade
    "W", # pycodestyle warning
]
# Individual rules switched off within the selected families
ignore = [
    "E501", # line-too-long
    "PLC1901", # compare-to-empty-string
    "PLR6301", # no-self-use
    "PTH123", # builtin-open
    "RUF001", "RUF002", "RUF003", # ambiguous-unicode-character-{string,docstring,comment}
]