Repository: ernestofgonzalez/epub-utils Branch: main Commit: 8c5417c331f2 Files: 60 Total size: 315.4 KB Directory structure: gitextract_obeqz0f5/ ├── .github/ │ └── workflows/ │ ├── docs.yml │ └── test.yml ├── .gitignore ├── .vscode/ │ └── settings.json ├── LICENSE ├── Makefile ├── README.md ├── docs/ │ ├── Makefile │ ├── api-reference.rst │ ├── api-tutorial.rst │ ├── changelog.rst │ ├── cli-reference.rst │ ├── cli-tutorial.rst │ ├── conf.py │ ├── contributing.rst │ ├── epub-standards.rst │ ├── examples.rst │ ├── formats.rst │ ├── index.rst │ └── installation.rst ├── epub_utils/ │ ├── __init__.py │ ├── __main__.py │ ├── cli.py │ ├── container.py │ ├── content/ │ │ ├── __init__.py │ │ ├── base.py │ │ └── xhtml.py │ ├── doc.py │ ├── exceptions.py │ ├── navigation/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── nav/ │ │ │ ├── __init__.py │ │ │ └── dom.py │ │ └── ncx/ │ │ ├── __init__.py │ │ └── dom.py │ ├── package/ │ │ ├── __init__.py │ │ ├── manifest.py │ │ ├── metadata.py │ │ └── spine.py │ └── printers.py ├── pytest.ini ├── requirements/ │ ├── requirements-docs.txt │ ├── requirements-linting.txt │ ├── requirements-testing.txt │ └── requirements.txt ├── requirements.txt ├── ruff.toml ├── setup.py └── tests/ ├── assets/ │ └── roads.epub ├── conftest.py ├── test_cli.py ├── test_container.py ├── test_doc.py ├── test_manifest.py ├── test_metadata.py ├── test_nav_navigation.py ├── test_ncx_navigation.py ├── test_package.py ├── test_spine.py └── test_xhtml_content.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/docs.yml ================================================ name: Publish documentation on: push: branches: - main jobs: docs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 - name: Install dependencies run: | pip install -r requirements/requirements-docs.txt - name: Sphinx build run: | 
sphinx-build docs _build - name: Deploy uses: peaceiris/actions-gh-pages@v3 if: ${{ github.ref == 'refs/heads/main' }} with: publish_branch: gh-pages github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: _build/ force_orphan: true ================================================ FILE: .github/workflows/test.yml ================================================ name: Test on: push: branches: - "main" pull_request: concurrency: group: ${{ github.head_ref || github.run_id }} cancel-in-progress: true jobs: test: name: Python ${{ matrix.python-version }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: max-parallel: 4 matrix: os: - ubuntu-24.04 - windows-2022 - macos-14 python-version: - "3.8" - "3.9" - "3.10" - "3.11" - "3.12" - "3.13" steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} allow-prereleases: true - name: Cache pip packages uses: actions/cache@v3 with: path: ~/.cache/pip key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} restore-keys: | ${{ runner.os }}-pip- - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt - name: Run tests run: | pytest ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # MacOS .DS_Store ================================================ FILE: .vscode/settings.json ================================================ { "python.testing.pytestEnabled": true } ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2025 Ernesto González Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================ FILE: Makefile ================================================ #!/usr/bin/env bash LIGHT_CYAN=\033[1;36m NO_COLOR=\033[0m .PHONY: docs help: @echo "test - run tests with pytest" @echo "coverage - get code coverage report" @echo "lint - lint the python code" @echo "format - format the python code" # Run tests test: @echo "${LIGHT_CYAN}Running tests...${NO_COLOR}" pytest # Get code coverage report coverage: @echo "${LIGHT_CYAN}Running tests and collecting coverage data...${NO_COLOR}" pytest coverage combine @echo "${LIGHT_CYAN}Reporting code coverage data...${NO_COLOR}" coverage report @echo "${LIGHT_CYAN}Creating HTML report...${NO_COLOR}" coverage html @echo "${LIGHT_CYAN}Creating coverage badge...${NO_COLOR}" @rm ./coverage.svg coverage-badge -o coverage.svg # Lint code lint: @echo "${LIGHT_CYAN}Linting code...${NO_COLOR}" ruff check # Format code format: @echo "${LIGHT_CYAN}Formatting code...${NO_COLOR}" ruff check --select I --fix ruff format ================================================ FILE: README.md ================================================ # epub-utils [![PyPI](https://img.shields.io/pypi/v/epub-utils.svg)](https://pypi.org/project/epub-utils/) [![Changelog](https://img.shields.io/github/v/release/ernestofgonzalez/epub-utils?include_prereleases&label=changelog)](https://ernestofgonzalez.github.io/epub-utils/changelog) [![Python 3.x](https://img.shields.io/pypi/pyversions/epub-utils.svg?logo=python&logoColor=white)](https://pypi.org/project/epub-utils/) [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/ernestofgonzalez/epub-utils/blob/main/LICENSE) A Python library and CLI tool for inspecting ePub from the terminal. 
## Features - **Complete EPUB Support** - Parse both EPUB 2.0.1 and EPUB 3.0+ specifications with container, package, manifest, spine, and table of contents inspection - **Rich Metadata Extraction** - Extract Dublin Core metadata (title, author, language, publisher) with key-value, XML, and raw output formats for easy scripting - **Content Analysis** - Access document content by manifest ID or file path, with plain text extraction for content analysis and word counting - **File System Navigation** - Browse and extract any file within EPUB archives (XHTML, CSS, images, fonts) with detailed file information including sizes and compression ratios - **Multiple Output Formats** - XML with syntax highlighting, raw content, key-value pairs, plain text, and formatted tables to suit different workflows - **CLI and Python API** - Comprehensive command-line tool for terminal workflows plus a clean Python library for programmatic access - **Standards Compliance** - Built-in validation capabilities and adherence to W3C/IDPF specifications for reliable EPUB processing - **Performance Optimized** - Lazy loading, efficient ZIP parsing, and optional lxml support for handling large EPUB collections ## Installation `epub-utils` is available as a [PyPI](https://pypi.org/) package ```bash pip install epub-utils ``` ## Use as a CLI tool The basic format is: ```bash epub-utils EPUB_PATH COMMAND [OPTIONS] ``` ### Commands - `container` - Display the container.xml contents ```bash # Show container.xml with syntax highlighting epub-utils book.epub container # Show container.xml as raw content epub-utils book.epub container --format raw # Show container.xml with pretty formatting epub-utils book.epub container --pretty-print ``` - `package` - Display the package OPF file contents ```bash # Show package.opf with syntax highlighting epub-utils book.epub package # Show package.opf as raw content epub-utils book.epub package --format raw ``` - `toc` - Display the table of contents file contents 
```bash # Show toc.ncx/nav.xhtml with syntax highlighting (auto-detect) epub-utils book.epub toc # Show toc.ncx/nav.xhtml as raw content epub-utils book.epub toc --format raw # Force NCX format (EPUB 2 navigation control file) epub-utils book.epub toc --ncx # Force Navigation Document (EPUB 3 navigation file) epub-utils book.epub toc --nav ``` - `metadata` - Display the metadata information from the package file ```bash # Show metadata with syntax highlighting epub-utils book.epub metadata # Show metadata as key-value pairs epub-utils book.epub metadata --format kv # Show metadata with pretty formatting epub-utils book.epub metadata --pretty-print ``` - `manifest` - Display the manifest information from the package file ```bash # Show manifest with syntax highlighting epub-utils book.epub manifest # Show manifest as raw content epub-utils book.epub manifest --format raw ``` - `spine` - Display the spine information from the package file ```bash # Show spine with syntax highlighting epub-utils book.epub spine # Show spine as raw content epub-utils book.epub spine --format raw ``` - `content` - Display the content of a document by its manifest item ID ```bash # Show content with syntax highlighting epub-utils book.epub content chapter1 # Show raw HTML/XML content epub-utils book.epub content chapter1 --format raw # Show plain text content (HTML tags stripped) epub-utils book.epub content chapter1 --format plain ``` - `files` - List all files in the EPUB archive or display content of a specific file ```bash # List all files in table format (default) epub-utils book.epub files # List all files as simple paths epub-utils book.epub files --format raw # Display content of a specific file by path epub-utils book.epub files OEBPS/chapter1.xhtml # Display XHTML file content in different formats epub-utils book.epub files OEBPS/chapter1.xhtml --format raw epub-utils book.epub files OEBPS/chapter1.xhtml --format xml --pretty-print epub-utils book.epub files 
OEBPS/chapter1.xhtml --format plain # Display non-XHTML files (CSS, images, etc.) epub-utils book.epub files OEBPS/styles/main.css epub-utils book.epub files META-INF/container.xml ``` ### Options - `-h, --help` - Show help message and exit - `-v, --version` - Show program version and exit - `-fmt, --format` - Output format (default: xml) - `xml` - Display with XML syntax highlighting (default) - `raw` - Display raw content without formatting - `plain` - Display plain text content (HTML tags stripped, for content command only) - `kv` - Display key-value pairs (where supported) - `-pp, --pretty-print` - Pretty-print XML output (applies to xml and raw formats only) ```bash # Display as raw content epub-utils book.epub package --format raw # Display with XML syntax highlighting (default) epub-utils book.epub package --format xml # Display as key-value pairs (for supported commands) epub-utils book.epub metadata --format kv # Display plain text content (content command only) epub-utils book.epub content chapter1 --format plain # Pretty-print XML with proper indentation epub-utils book.epub package --pretty-print # Combine format and pretty-print options epub-utils book.epub metadata --format raw --pretty-print ``` ## Use as a Python library ```python from epub_utils import Document # Load an EPUB document doc = Document("path/to/book.epub") ``` ### Basic Document Access Access the main components of an EPUB document: ```python # Get container information container = doc.container print(container.to_xml()) # Formatted XML with syntax highlighting print(container.to_str()) # Raw XML content # Get package information package = doc.package print(package.to_xml()) # Formatted XML with syntax highlighting print(package.to_str()) # Raw XML content # Get table of contents toc = doc.toc if toc: # TOC might be None if not present print(toc.to_xml()) # Formatted XML with syntax highlighting print(toc.to_str()) # Raw XML content # Access specific navigation formats ncx = doc.ncx # 
NCX format (EPUB 2 or EPUB 3 with NCX) if ncx: print("NCX navigation available") print(ncx.to_xml()) nav = doc.nav # Navigation Document (EPUB 3 only) if nav: print("Navigation Document available") print(nav.to_xml()) print(toc.to_str()) # Raw XML content ``` ### Working with Metadata Access and format metadata information: ```python # Access package metadata metadata = doc.package.metadata # Basic Dublin Core elements print(f"Title: {metadata.title}") print(f"Creator: {metadata.creator}") print(f"Identifier: {metadata.identifier}") print(f"Language: {metadata.language}") print(f"Publisher: {metadata.publisher}") print(f"Date: {metadata.date}") # Dynamic attribute access for any metadata field isbn = getattr(metadata, 'isbn', 'Not available') series = getattr(metadata, 'series', 'Not available') # Get formatted metadata output print(metadata.to_xml()) # Formatted XML with syntax highlighting print(metadata.to_str()) # Raw XML content print(metadata.to_kv()) # Key-value format for easy parsing ``` ### Working with Manifest Access the manifest to see all files in the EPUB: ```python # Get manifest information manifest = doc.package.manifest # Access all manifest items for item in manifest.items: print(f"ID: {item['id']}") print(f"File: {item['href']}") print(f"Type: {item['media_type']}") print(f"Properties: {item['properties']}") # Find specific items nav_item = manifest.find_by_property('nav') chapter = manifest.find_by_id('chapter1') xhtml_items = manifest.find_by_media_type('application/xhtml+xml') # Get formatted manifest output print(manifest.to_xml()) # Formatted XML with syntax highlighting print(manifest.to_str()) # Raw XML content ``` ### Working with Spine Access the spine to see the reading order: ```python # Get spine information spine = doc.package.spine # Access spine properties print(f"TOC reference: {spine.toc}") print(f"Page progression: {spine.page_progression_direction}") # Access spine items in reading order for itemref in spine.itemrefs: 
print(f"ID: {itemref['idref']}") print(f"Linear: {itemref['linear']}") print(f"Properties: {itemref['properties']}") # Find specific spine item spine_item = spine.find_by_idref('chapter1') # Get formatted spine output print(spine.to_xml()) # Formatted XML with syntax highlighting print(spine.to_str()) # Raw XML content ``` ### Content Extraction Extract content from specific documents within the EPUB: ```python # Access content by manifest item ID try: content = doc.find_content_by_id('chapter1') # Get content in different formats print(content.to_xml()) # Formatted XHTML with syntax highlighting print(content.to_str()) # Raw XHTML content print(content.to_plain()) # Plain text with HTML tags stripped # Access the parsed content tree for advanced processing tree = content.tree inner_text = content.inner_text except ValueError as e: print(f"Content not found: {e}") # Find publication resources by ID (for non-spine items) try: resource = doc.find_pub_resource_by_id('cover-image') except ValueError as e: print(f"Resource not found: {e}") ``` ### File Operations List and access files directly by their paths in the EPUB archive: ```python # Get information about all files files_info = doc.get_files_info() for file_info in files_info: print(f"Path: {file_info['path']}") print(f"Size: {file_info['size']} bytes") print(f"Compressed: {file_info['compressed_size']} bytes") print(f"Modified: {file_info['modified']}") # Access specific file by path try: # For XHTML files, returns XHTMLContent object xhtml_content = doc.get_file_by_path('OEBPS/chapter1.xhtml') print(xhtml_content.to_xml()) print(xhtml_content.to_plain()) # For other files, returns raw string content css_content = doc.get_file_by_path('OEBPS/styles/main.css') print(css_content) except ValueError as e: print(f"File not found: {e}") ``` ### Output Formatting Options All document components support flexible output formatting: ```python # Pretty-printed XML output print(metadata.to_str(pretty_print=True)) 
print(manifest.to_xml(pretty_print=True)) # Syntax highlighting can be controlled print(package.to_xml(highlight_syntax=True)) # With highlighting (default) print(package.to_xml(highlight_syntax=False)) # Without highlighting ``` ## Industry Standards & Compliance `epub-utils` provides comprehensive support for industry-standard ePub specifications and related technologies, ensuring broad compatibility across the digital publishing ecosystem. ### Supported EPUB Standards - **EPUB 2.0.1** (IDPF, 2010) - Complete OPF 2.0 package document support - NCX navigation control file support - Dublin Core metadata extraction - Legacy EPUB compatibility - **EPUB 3.0+** (IDPF/W3C, 2011-present) - EPUB 3.3 specification compliance - HTML5-based content documents - Navigation document (nav.xhtml) support - Enhanced accessibility features - Media overlays and scripting support ### Metadata Standards - **Dublin Core Metadata Initiative (DCMI)** - Dublin Core Metadata Element Set v1.1 - Dublin Core Metadata Terms (DCTERMS) - **Open Packaging Format (OPF)** - OPF 2.0 specification (EPUB 2.0.1) - OPF 3.0 specification (EPUB 3.0+) The library maintains strict adherence to published specifications while providing robust handling of real-world EPUB variations commonly found in commercial and open-source reading applications. ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/api-reference.rst ================================================ API Reference ============= This section provides complete API documentation for all classes and methods in epub-utils. Document Class -------------- .. py:class:: Document(path) Main class for working with EPUB files. :param str path: Path to the EPUB file **Example**: .. code-block:: python from epub_utils import Document doc = Document("book.epub") print(doc.package.metadata.title) .. py:attribute:: container Access to the container information. :type: Container :returns: Container object with container.xml information **Example**: .. code-block:: python container = doc.container print(f"Package path: {container.rootfile_path}") .. py:attribute:: package Access to the package (OPF) information. :type: Package :returns: Package object with OPF file information **Example**: .. code-block:: python package = doc.package print(f"Title: {package.metadata.title}") .. py:attribute:: toc Access to the table of contents. :type: TableOfContents :returns: Table of contents object **Example**: .. code-block:: python toc = doc.toc toc_xml = toc.to_xml() .. py:attribute:: ncx Access to the NCX (Navigation Control for XML) table of contents. :type: TableOfContents or None :returns: NCX table of contents object for EPUB 2, or for EPUB 3 if NCX is present, None otherwise **Example**: .. code-block:: python ncx = doc.ncx if ncx: ncx_xml = ncx.to_xml() **Note**: For EPUB 2, this returns the same as ``toc``. For EPUB 3, this specifically accesses the NCX file if present, which provides backward compatibility. .. py:attribute:: nav Access to the Navigation Document (EPUB 3 only). :type: TableOfContents or None :returns: Navigation Document table of contents object for EPUB 3, None for EPUB 2 or if not present **Example**: .. 
code-block:: python nav = doc.nav if nav: nav_xml = nav.to_xml() **Note**: This property specifically accesses EPUB 3 Navigation Documents. Returns None for EPUB 2 documents. .. py:method:: get_files_info() Get detailed information about all files in the EPUB. :returns: List of dictionaries containing file information :rtype: List[Dict[str, Union[str, int]]] Each dictionary contains: - ``path`` (str): File path within the EPUB - ``size`` (int): Uncompressed size in bytes - ``compressed_size`` (int): Compressed size in bytes - ``modified`` (str): Last modified date in ISO format **Example**: .. code-block:: python files = doc.get_files_info() for file_info in files: print(f"{file_info['path']}: {file_info['size']} bytes") .. py:method:: list_files() Get basic information about all files in the EPUB. :returns: List of dictionaries with basic file information :rtype: List[Dict[str, str]] **Example**: .. code-block:: python files = doc.list_files() print(f"EPUB contains {len(files)} files") Container Class --------------- .. py:class:: Container Represents the META-INF/container.xml file information. .. py:attribute:: rootfile_path Path to the main package file within the EPUB. :type: str .. py:attribute:: rootfile_media_type Media type of the main package file. :type: str .. py:method:: to_xml(highlight_syntax=True) Get formatted XML representation. :param bool highlight_syntax: Whether to apply syntax highlighting :returns: Formatted XML string :rtype: str .. py:method:: to_str() Get raw XML content. :returns: Raw XML string :rtype: str Package Class ------------- .. py:class:: Package Represents the main OPF package file. .. py:attribute:: metadata Package metadata information. :type: Metadata .. py:attribute:: manifest Package manifest information. :type: Manifest .. py:attribute:: spine Package spine information. :type: Spine .. py:method:: to_xml(highlight_syntax=True) Get formatted XML representation of the complete package. 
:param bool highlight_syntax: Whether to apply syntax highlighting :returns: Formatted XML string :rtype: str .. py:method:: to_str() Get raw XML content of the complete package. :returns: Raw XML string :rtype: str Metadata Class -------------- .. py:class:: Metadata Represents Dublin Core and EPUB-specific metadata. .. py:attribute:: title Book title from dc:title element. :type: str .. py:attribute:: creator Book author/creator from dc:creator element. :type: str .. py:attribute:: language Language code from dc:language element. :type: str .. py:attribute:: identifier Unique identifier from dc:identifier element. :type: str .. py:attribute:: publisher Publisher from dc:publisher element. :type: str .. py:attribute:: date Publication date from dc:date element. :type: str .. py:attribute:: subject Subject/keywords from dc:subject element. :type: str .. py:attribute:: description Description from dc:description element. :type: str .. py:attribute:: contributor Contributor from dc:contributor element. :type: str .. py:attribute:: type Resource type from dc:type element. :type: str .. py:attribute:: format Format from dc:format element. :type: str .. py:attribute:: source Source from dc:source element. :type: str .. py:attribute:: relation Relation from dc:relation element. :type: str .. py:attribute:: coverage Coverage from dc:coverage element. :type: str .. py:attribute:: rights Rights information from dc:rights element. :type: str .. py:method:: __getattr__(name) Dynamic attribute access for any metadata field. :param str name: Metadata field name :returns: Metadata value or empty string :rtype: str **Example**: .. code-block:: python # Access any metadata field isbn = metadata.isbn if hasattr(metadata, 'isbn') else 'Not available' series = getattr(metadata, 'series', 'Not available') .. py:method:: to_xml(highlight_syntax=True) Get formatted XML representation of metadata. 
:param bool highlight_syntax: Whether to apply syntax highlighting :returns: Formatted XML string :rtype: str .. py:method:: to_kv() Get metadata as key-value pairs. :returns: Key-value formatted string :rtype: str **Example**: .. code-block:: python kv_data = metadata.to_kv() print(kv_data) # Output: # title: The Great Gatsby # creator: F. Scott Fitzgerald # language: en .. py:method:: to_str() Get raw XML content of metadata. :returns: Raw XML string :rtype: str Manifest Class -------------- .. py:class:: Manifest Represents the package manifest section. .. py:attribute:: items Dictionary of manifest items. :type: Dict[str, Dict[str, str]] Each item contains: - ``href``: File path - ``media-type``: MIME type - Other attributes as needed **Example**: .. code-block:: python for item_id, item in manifest.items.items(): print(f"ID: {item_id}") print(f" File: {item['href']}") print(f" Type: {item['media-type']}") .. py:method:: to_xml(highlight_syntax=True) Get formatted XML representation. :param bool highlight_syntax: Whether to apply syntax highlighting :returns: Formatted XML string :rtype: str .. py:method:: to_str() Get raw XML content. :returns: Raw XML string :rtype: str Spine Class ----------- .. py:class:: Spine Represents the package spine section. .. py:attribute:: items List of spine items in reading order. :type: List[Dict[str, str]] **Example**: .. code-block:: python for item in spine.items: print(f"Reading order item: {item}") .. py:method:: to_xml(highlight_syntax=True) Get formatted XML representation. :param bool highlight_syntax: Whether to apply syntax highlighting :returns: Formatted XML string :rtype: str .. py:method:: to_str() Get raw XML content. :returns: Raw XML string :rtype: str TableOfContents Class --------------------- .. py:class:: TableOfContents Represents the table of contents (NCX or Navigation Document). .. py:method:: to_xml(highlight_syntax=True) Get formatted XML representation. 
:param bool highlight_syntax: Whether to apply syntax highlighting :returns: Formatted XML string :rtype: str .. py:method:: to_str() Get raw XML content. :returns: Raw XML string :rtype: str Content Classes --------------- .. py:class:: Content Base class for EPUB content documents. .. py:method:: to_xml(highlight_syntax=True) Get formatted content. :param bool highlight_syntax: Whether to apply syntax highlighting :returns: Formatted content string :rtype: str .. py:method:: to_str() Get raw content. :returns: Raw content string :rtype: str .. py:class:: XHTMLContent Specialized class for XHTML content documents. Inherits from Content with additional XHTML-specific methods. .. py:method:: to_plain() Get plain text content with HTML tags stripped. :returns: Plain text string :rtype: str **Example**: .. code-block:: python from epub_utils.content import XHTMLContent # This would typically be accessed through Document # content = XHTMLContent(raw_html) # plain_text = content.to_plain() Exception Classes ----------------- .. py:exception:: ParseError Raised when there's an error parsing EPUB content. Base class: ``Exception`` **Example**: .. code-block:: python from epub_utils import Document from epub_utils.exceptions import ParseError try: doc = Document("corrupted.epub") title = doc.package.metadata.title except ParseError as e: print(f"Failed to parse EPUB: {e}") except FileNotFoundError: print("EPUB file not found") Usage Examples -------------- Basic Usage ~~~~~~~~~~~ .. code-block:: python from epub_utils import Document # Load document doc = Document("book.epub") # Access metadata metadata = doc.package.metadata print(f"Title: {metadata.title}") print(f"Author: {metadata.creator}") # Check file structure files = doc.get_files_info() print(f"Contains {len(files)} files") # Get formatted output toc_xml = doc.toc.to_xml() metadata_kv = metadata.to_kv() Error Handling ~~~~~~~~~~~~~~ .. 
code-block:: python from epub_utils import Document from epub_utils.exceptions import ParseError def safe_load_epub(path): try: doc = Document(path) return { 'status': 'success', 'document': doc, 'title': getattr(doc.package.metadata, 'title', 'Unknown') } except ParseError as e: return { 'status': 'parse_error', 'error': str(e) } except FileNotFoundError: return { 'status': 'file_not_found', 'error': 'EPUB file not found' } except Exception as e: return { 'status': 'unknown_error', 'error': str(e) } Batch Processing ~~~~~~~~~~~~~~~~ .. code-block:: python import os from pathlib import Path from epub_utils import Document def process_epub_directory(directory): epub_files = Path(directory).glob("*.epub") results = [] for epub_path in epub_files: try: doc = Document(str(epub_path)) metadata = doc.package.metadata result = { 'file': epub_path.name, 'title': getattr(metadata, 'title', ''), 'author': getattr(metadata, 'creator', ''), 'language': getattr(metadata, 'language', ''), 'file_size': epub_path.stat().st_size, 'epub_files': len(doc.get_files_info()) } results.append(result) except Exception as e: results.append({ 'file': epub_path.name, 'error': str(e) }) return results Type Hints ---------- For better IDE support and type checking, here are the main type hints: .. code-block:: python from typing import Dict, List, Union, Optional from epub_utils import Document # Function signatures for reference def get_files_info(self) -> List[Dict[str, Union[str, int]]]: ... def list_files(self) -> List[Dict[str, str]]: ... def to_xml(self, highlight_syntax: bool = True) -> str: ... def to_str(self) -> str: ... def to_kv(self) -> str: ... # Type-safe usage example doc: Document = Document("book.epub") files_info: List[Dict[str, Union[str, int]]] = doc.get_files_info() title: str = doc.package.metadata.title kv_data: str = doc.package.metadata.to_kv() Module Structure ---------------- The ``epub-utils`` package is organized as follows: .. 
code-block:: text epub_utils/ ├── __init__.py # Main exports (Document, Container) ├── doc.py # Document class ├── container.py # Container class ├── package/ │ ├── __init__.py # Package class │ ├── metadata.py # Metadata class │ ├── manifest.py # Manifest class │ └── spine.py # Spine class ├── content/ │ ├── __init__.py # Content classes │ ├── base.py # Base Content class │ └── xhtml.py # XHTMLContent class ├── navigation/ │ ├── __init__.py # Navigation classes │ ├── base.py # Base navigation class │ ├── nav/ # EPUB 3 Navigation Document support │ └── ncx/ # NCX support ├── exceptions.py # Exception classes ├── printers.py # Output printing utilities └── cli.py # Command-line interface For detailed implementation examples, see :doc:`api-tutorial` and :doc:`examples`. ================================================ FILE: docs/api-tutorial.rst ================================================ Use as a Python library ======================= This guide covers using ``epub-utils`` as a Python library. The API is designed to be intuitive and follows Python best practices for ease of use and integration into your projects. Quick Start ----------- The main entry point is the ``Document`` class: .. code-block:: python from epub_utils import Document # Load an EPUB file doc = Document("path/to/book.epub") # Access various components print(f"Title: {doc.package.metadata.title}") print(f"Author: {doc.package.metadata.creator}") Core Classes ------------ Document Class ~~~~~~~~~~~~~~ The ``Document`` class is your main interface to an EPUB file: ..
code-block:: python from epub_utils import Document doc = Document("example.epub") # Access major components container = doc.container # Container information package = doc.package # Package/OPF file toc = doc.toc # Table of contents # Get file information files_info = doc.get_files_info() **Key Methods**: - ``get_files_info()``: Returns detailed information about all files in the EPUB - ``list_files()``: Returns a simple list of files with basic metadata Container Access ~~~~~~~~~~~~~~~~ The container provides information from the META-INF/container.xml file: .. code-block:: python # Access container properties print(f"Package path: {doc.container.rootfile_path}") print(f"Media type: {doc.container.rootfile_media_type}") # Get raw XML container_xml = doc.container.to_xml() raw_container = doc.container.to_str() Package and Metadata ~~~~~~~~~~~~~~~~~~~~~ The package object gives you access to the main OPF file and its metadata: .. code-block:: python package = doc.package # Access metadata metadata = package.metadata print(f"Title: {metadata.title}") print(f"Author: {metadata.creator}") print(f"Language: {metadata.language}") print(f"Identifier: {metadata.identifier}") print(f"Publisher: {metadata.publisher}") # Get all metadata as key-value pairs kv_metadata = metadata.to_kv() print(kv_metadata) # Access manifest and spine manifest = package.manifest spine = package.spine Working with Metadata ---------------------- Extracting Common Fields ~~~~~~~~~~~~~~~~~~~~~~~~~ The metadata object provides easy access to Dublin Core and EPUB-specific metadata: .. 
code-block:: python metadata = doc.package.metadata # Basic Dublin Core elements title = metadata.title creator = metadata.creator # Usually the author subject = metadata.subject # Keywords/topics description = metadata.description publisher = metadata.publisher contributor = metadata.contributor date = metadata.date type = metadata.type format = metadata.format identifier = metadata.identifier source = metadata.source language = metadata.language relation = metadata.relation coverage = metadata.coverage rights = metadata.rights Dynamic Attribute Access ~~~~~~~~~~~~~~~~~~~~~~~~ The metadata object supports dynamic attribute access for any metadata field: .. code-block:: python # Access any metadata field by name isbn = getattr(metadata, 'isbn', 'Not available') series = getattr(metadata, 'series', 'Not available') # Or use the more direct approach try: custom_field = metadata.custom_metadata_field except AttributeError: custom_field = "Field not found" Formatted Output ~~~~~~~~~~~~~~~~ Get metadata in different formats: .. code-block:: python # XML format with syntax highlighting xml_metadata = metadata.to_xml(highlight_syntax=True) # Raw XML without highlighting raw_xml = metadata.to_xml(highlight_syntax=False) # Key-value format for easy parsing kv_format = metadata.to_kv() Manifest and Spine ------------------- Working with the Manifest ~~~~~~~~~~~~~~~~~~~~~~~~~~ The manifest lists all files in the EPUB package: .. code-block:: python manifest = doc.package.manifest # Get all items items = manifest.items # Dictionary of manifest items # Find specific items for item_id, item in items.items(): print(f"ID: {item_id}") print(f" File: {item['href']}") print(f" Type: {item['media-type']}") # Get formatted output manifest_xml = manifest.to_xml() Understanding the Spine ~~~~~~~~~~~~~~~~~~~~~~~~ The spine defines the reading order: .. 
code-block:: python spine = doc.package.spine # Get spine items in reading order spine_items = spine.items # Get formatted output spine_xml = spine.to_xml() Table of Contents ----------------- Working with TOC ~~~~~~~~~~~~~~~~ Access the table of contents (either NCX or Navigation Document): .. code-block:: python toc = doc.toc # Get formatted TOC toc_xml = toc.to_xml() raw_toc = toc.to_str() Specific TOC Access ~~~~~~~~~~~~~~~~~~~ For fine-grained control over which table of contents format to access: .. code-block:: python # Access NCX specifically (EPUB 2 or EPUB 3 with NCX) ncx = doc.ncx if ncx: ncx_xml = ncx.to_xml() print("NCX navigation available") else: print("No NCX navigation found") # Access Navigation Document specifically (EPUB 3 only) nav = doc.nav if nav: nav_xml = nav.to_xml() print("Navigation Document available") else: print("No Navigation Document found (likely EPUB 2)") # Handle different EPUB versions package = doc.package if package.version.major >= 3: # EPUB 3 - prefer Navigation Document, fallback to NCX nav_doc = doc.nav or doc.ncx else: # EPUB 2 - use NCX nav_doc = doc.ncx if nav_doc: print("Table of contents found:", nav_doc.to_str()[:100]) Content Extraction ------------------ Accessing Document Content ~~~~~~~~~~~~~~~~~~~~~~~~~~ Extract content from specific documents within the EPUB: .. code-block:: python # First, find content IDs from the manifest manifest = doc.package.manifest content_items = { item_id: item for item_id, item in manifest.items.items() if item['media-type'] == 'application/xhtml+xml' } # Access content by ID for content_id in content_items: try: content = doc.get_content(content_id) # Process content as needed print(f"Content ID {content_id}: {len(content)} characters") except Exception as e: print(f"Could not access content {content_id}: {e}") File Information ---------------- Detailed File Analysis ~~~~~~~~~~~~~~~~~~~~~~ Get comprehensive information about all files in the EPUB: .. 
code-block:: python files_info = doc.get_files_info() for file_info in files_info: print(f"Path: {file_info['path']}") print(f"Size: {file_info['size']} bytes") print(f"Compressed: {file_info['compressed_size']} bytes") print(f"Modified: {file_info['modified']}") print("---") # Calculate total size total_size = sum(f['size'] for f in files_info) total_compressed = sum(f['compressed_size'] for f in files_info) compression_ratio = (1 - total_compressed / total_size) * 100 print(f"Total size: {total_size} bytes") print(f"Compressed size: {total_compressed} bytes") print(f"Compression ratio: {compression_ratio:.1f}%") Error Handling -------------- Robust Error Handling ~~~~~~~~~~~~~~~~~~~~~~ epub-utils provides specific exception types for better error handling: .. code-block:: python from epub_utils import Document from epub_utils.exceptions import ParseError try: doc = Document("potentially_corrupt.epub") # Try to access metadata title = doc.package.metadata.title print(f"Successfully loaded: {title}") except ParseError as e: print(f"EPUB parsing error: {e}") except FileNotFoundError: print("EPUB file not found") except Exception as e: print(f"Unexpected error: {e}") Graceful Degradation ~~~~~~~~~~~~~~~~~~~~ Handle missing or malformed metadata gracefully: .. 
code-block:: python def safe_get_metadata(doc, field_name, default="Unknown"): """Safely extract metadata field with fallback.""" try: return getattr(doc.package.metadata, field_name, default) except (AttributeError, ParseError): return default # Usage title = safe_get_metadata(doc, 'title', 'Untitled') author = safe_get_metadata(doc, 'creator', 'Unknown Author') Next Steps ---------- - Explore the complete :doc:`api-reference` for detailed class documentation - See more :doc:`examples` for advanced use cases - Learn about :doc:`epub-standards` to understand the underlying specifications - Check out the :doc:`cli-reference` for command-line equivalents ================================================ FILE: docs/changelog.rst ================================================ .. _changelog: ========= Changelog ========= .. _v_0_1_0a1: 0.1.0a1 (2025-06-14) -------------------- * Added ``toc`` retrieval as dictionary (:issue:`4`) * Added comprehensive navigation reading support (`#38 `__, `#39 `__, `#42 `__) * Added macOS test runner (`#41 `__) * Added support for Python 3.8 and Python 3.9 (`#40 `__) .. _v_0_0_0a5: 0.0.0a5 (2025-06-01) -------------------- * Added file retrieval by file path. (:issue:`22`) * Added pretty printing to XML inspection (:issue:`23`) .. _v_0_0_0a4: 0.0.0a4 (2025-05-26) -------------------- * Added file inspection and ``files`` CLI command. (`#20 `__) * Added content inspection and ``content`` CLI command (:issue:`5`) * Added manifest parsing and ``manifest`` CLI command (`#13 `__) * Added spine parsing and ``spine`` CLI command (`#9 `__) * Added key-value support for ``metadata`` CLI command * Fixed table of contents parsing for OEBPS 1 (`#11 `__). Thanks, `Christian Klein `__. .. _v_0_0_0a3: 0.0.0a3 (2025-05-04) -------------------- * Fixed ``toc`` command. (:issue:`1`) .. _v_0_0_0a2: 0.0.0a2 (2025-05-03) -------------------- * Added classifiers ..
_v_0_0_0a1: 0.0.0a1 (2025-05-03) -------------------- * Initial release to PyPI ================================================ FILE: docs/cli-reference.rst ================================================ CLI Reference ============= This reference documents all available command-line options and commands for ``epub-utils``. Synopsis -------- .. code-block:: text epub-utils [GLOBAL_OPTIONS] EPUB_FILE COMMAND [COMMAND_OPTIONS] Global Options -------------- ``-h, --help`` Show help message and exit ``-v, --version`` Show program version and exit ``-pp, --pretty-print`` Pretty-print XML output with proper indentation (applies to xml and raw formats only) Commands -------- All commands operate on an EPUB file and support the ``--format`` and ``--pretty-print`` options unless otherwise noted. container ~~~~~~~~~ Display the container.xml file contents. **Syntax**: .. code-block:: bash epub-utils EPUB_FILE container [--format FORMAT] [--pretty-print] **Description**: The container command shows the contents of META-INF/container.xml, which defines the location of the main package file within the EPUB. **Supported formats**: ``xml`` (default), ``raw`` **Examples**: .. code-block:: bash # Show container with syntax highlighting epub-utils book.epub container # Show raw container XML epub-utils book.epub container --format raw # Show container with pretty formatting epub-utils book.epub container --pretty-print # Combine both options epub-utils book.epub container --format raw --pretty-print epub-utils book.epub container --format raw **Sample output**: .. code-block:: xml package ~~~~~~~ Display the main package (OPF) file contents. **Syntax**: .. code-block:: bash epub-utils EPUB_FILE package [--format FORMAT] [--pretty-print] **Description**: The package command shows the complete OPF (Open Packaging Format) file, which contains metadata, manifest, and spine information. **Supported formats**: ``xml`` (default), ``raw`` **Examples**: ..
code-block:: bash # Show package with syntax highlighting epub-utils book.epub package # Show raw package XML for processing epub-utils book.epub package --format raw | xmllint --format - # Show package with pretty formatting epub-utils book.epub package --pretty-print toc ~~~ Display the table of contents file. **Syntax**: .. code-block:: bash epub-utils EPUB_FILE toc [--format FORMAT] [--pretty-print] [--ncx | --nav] **Description**: Shows the table of contents, which can be either an NCX file (EPUB 2.x) or a Navigation Document (EPUB 3.x). By default, automatically detects and uses the appropriate format for the EPUB version. **Options**: ``--ncx`` Force retrieval of NCX file (EPUB 2 navigation control file). For EPUB 2, this is the same as the default behavior. For EPUB 3, this specifically accesses the NCX file if present for backward compatibility. ``--nav`` Force retrieval of Navigation Document (EPUB 3 navigation file). Only works with EPUB 3 documents that have a Navigation Document. **Note**: The ``--ncx`` and ``--nav`` flags are mutually exclusive. **Supported formats**: ``xml`` (default), ``raw`` **Examples**: .. code-block:: bash # Show TOC with highlighting (auto-detect format) epub-utils book.epub toc # Extract navigation structure epub-utils book.epub toc --format raw # Show TOC with pretty formatting epub-utils book.epub toc --pretty-print # Force NCX format (EPUB 2 style) epub-utils book.epub toc --ncx # Force Navigation Document (EPUB 3 style) epub-utils book.epub toc --nav metadata ~~~~~~~~ Display metadata information from the package file. **Syntax**: .. code-block:: bash epub-utils EPUB_FILE metadata [--format FORMAT] [--pretty-print] **Description**: Extracts and displays Dublin Core and EPUB-specific metadata from the package file. **Supported formats**: ``xml`` (default), ``raw``, ``kv`` **Examples**: .. 
code-block:: bash # Show formatted metadata epub-utils book.epub metadata # Get key-value pairs for scripting epub-utils book.epub metadata --format kv # Raw metadata XML epub-utils book.epub metadata --format raw # Show metadata with pretty formatting epub-utils book.epub metadata --pretty-print **Key-value output format**: .. code-block:: text title: The Great Gatsby creator: F. Scott Fitzgerald language: en identifier: urn:uuid:12345678-1234-1234-1234-123456789abc publisher: Scribner date: 2021-01-01 subject: Fiction, Classic Literature manifest ~~~~~~~~ Display the manifest section from the package file. **Syntax**: .. code-block:: bash epub-utils EPUB_FILE manifest [--format FORMAT] [--pretty-print] **Description**: Shows the manifest, which lists all files included in the EPUB package with their IDs, file paths, and media types. **Supported formats**: ``xml`` (default), ``raw`` **Examples**: .. code-block:: bash # Show manifest with highlighting epub-utils book.epub manifest # Find all CSS files epub-utils book.epub manifest --format raw | grep 'media-type="text/css"' # Show manifest with pretty formatting epub-utils book.epub manifest --pretty-print epub-utils book.epub manifest --format raw | grep 'media-type="text/css"' # Count content files epub-utils book.epub manifest --format raw | grep -c 'application/xhtml+xml' spine ~~~~~ Display the spine section from the package file. **Syntax**: .. code-block:: bash epub-utils EPUB_FILE spine [--format FORMAT] [--pretty-print] **Description**: Shows the spine, which defines the default reading order of the book's content. **Supported formats**: ``xml`` (default), ``raw`` **Examples**: .. code-block:: bash # Show spine with highlighting epub-utils book.epub spine # Extract reading order epub-utils book.epub spine --format raw # Show spine with pretty formatting epub-utils book.epub spine --pretty-print content ~~~~~~~ Display the content of a document by its manifest item ID. **Syntax**: .. 
code-block:: bash epub-utils EPUB_FILE content ITEM_ID [--format FORMAT] [--pretty-print] **Description**: Extracts and displays the content of a specific document within the EPUB, identified by its manifest item ID. **Supported formats**: ``xml`` (default), ``raw``, ``plain`` **Arguments**: - ``ITEM_ID``: The ID of the item as defined in the manifest **Examples**: .. code-block:: bash # Show content with syntax highlighting epub-utils book.epub content chapter1 # Get raw HTML/XHTML epub-utils book.epub content intro --format raw # Extract plain text (no HTML tags) epub-utils book.epub content chapter2 --format plain # Show content with pretty formatting epub-utils book.epub content chapter1 --pretty-print **Finding item IDs**: .. code-block:: bash # First check the manifest for available IDs epub-utils book.epub manifest | grep 'id=' # Then extract specific content epub-utils book.epub content found_id --format plain files ~~~~~ List all files in the EPUB archive with metadata, or display content of a specific file. **Syntax**: .. code-block:: bash epub-utils EPUB_FILE files [FILE_PATH] [--format FORMAT] [--pretty-print] **Description**: When used without a file path, provides detailed information about all files contained within the EPUB archive, including sizes, compression ratios, and modification dates. When used with a file path, displays the content of the specified file within the EPUB archive. **Supported formats**: - For file listing: ``table`` (default), ``raw`` - For file content: ``raw``, ``xml`` (default), ``plain``, ``kv`` **Arguments**: - ``FILE_PATH`` (optional): Path to a specific file within the EPUB archive **Supported formats**: ``table`` (default), ``raw`` **Examples**: .. 
code-block:: bash # List all files in table format (default) epub-utils book.epub files # Get simple file list epub-utils book.epub files --format raw # Count total files epub-utils book.epub files --format raw | wc -l # Display content of a specific XHTML file epub-utils book.epub files OEBPS/chapter1.xhtml # Display XHTML file in different formats epub-utils book.epub files OEBPS/chapter1.xhtml --format raw epub-utils book.epub files OEBPS/chapter1.xhtml --format xml --pretty-print epub-utils book.epub files OEBPS/chapter1.xhtml --format plain # Display non-XHTML files (CSS, etc.) epub-utils book.epub files OEBPS/styles/main.css **Key differences from content command**: - ``files`` uses file paths within the EPUB archive - ``content`` uses manifest item IDs - ``files`` can access any file, including CSS, XML, and image files - ``content`` only accesses files listed in the manifest **Sample table output**: .. code-block:: text File Information for book.epub ┌────────────────────────────────────────┬──────────┬──────────────┬─────────────────────┐ │ Path │ Size │ Compressed │ Modified │ ├────────────────────────────────────────┼──────────┼──────────────┼─────────────────────┤ │ META-INF/container.xml │ 230 B │ 140 B │ 2021-01-01 10:00:00│ │ OEBPS/content.opf │ 2.1 KB │ 856 B │ 2021-01-01 10:00:00│ │ OEBPS/Text/chapter01.xhtml │ 12.4 KB │ 3.2 KB │ 2021-01-01 10:00:00│ └────────────────────────────────────────┴──────────┴──────────────┴─────────────────────┘ Format Options -------------- Most commands support the ``--format`` and ``--pretty-print`` options to control output formatting: ``xml`` (default for most commands) Syntax-highlighted, formatted XML output ``raw`` Unformatted content exactly as stored in the EPUB ``kv`` (metadata command only) Key-value pairs suitable for shell scripting ``plain`` (content command only) Plain text with HTML tags stripped ``table`` (files command only) Formatted table with aligned columns Pretty Print Option ~~~~~~~~~~~~~~~~~~~ 
The ``--pretty-print`` (or ``-pp``) option formats XML output with proper indentation and structure: .. code-block:: bash # Default output (with syntax highlighting but compact) epub-utils book.epub metadata # Pretty-printed output (with proper indentation) epub-utils book.epub metadata --pretty-print # Combine with raw format for clean, formatted XML epub-utils book.epub package --format raw --pretty-print **Note**: The pretty-print option applies to both ``xml`` and ``raw`` formats, but has no effect on ``kv``, ``plain``, or ``table`` formats. Exit Codes ---------- epub-utils uses standard exit codes: - ``0``: Success - ``1``: General error (file not found, invalid EPUB, etc.) - ``2``: Command-line usage error Scripts can check exit codes for error handling: .. code-block:: bash if epub-utils book.epub metadata >/dev/null 2>&1; then echo "EPUB is valid" else echo "EPUB has issues" fi Environment Variables --------------------- epub-utils respects these environment variables: ``NO_COLOR`` Disable color output when set to any value ``FORCE_COLOR`` Force color output even when not outputting to a terminal **Examples**: .. code-block:: bash # Disable colors NO_COLOR=1 epub-utils book.epub metadata # Force colors in pipes FORCE_COLOR=1 epub-utils book.epub metadata | less -R Common Usage Patterns --------------------- Validation Workflow ~~~~~~~~~~~~~~~~~~~ .. code-block:: bash #!/bin/zsh # validate-epub.sh - Basic EPUB validation epub_file="$1" echo "Validating: $epub_file" # Check container if ! epub-utils "$epub_file" container >/dev/null 2>&1; then echo "❌ Invalid container" exit 1 fi # Check package if ! epub-utils "$epub_file" package >/dev/null 2>&1; then echo "❌ Invalid package" exit 1 fi # Check required metadata metadata=$(epub-utils "$epub_file" metadata --format kv 2>/dev/null) if ! echo "$metadata" | grep -q "^title:"; then echo "⚠️ Missing title" fi if !
echo "$metadata" | grep -q "^creator:"; then echo "⚠️ Missing author" fi echo "✅ EPUB structure is valid" Metadata Extraction ~~~~~~~~~~~~~~~~~~~ .. code-block:: bash #!/bin/zsh # extract-metadata.sh - Extract metadata to CSV echo "filename,title,author,language,publisher" > metadata.csv for epub in *.epub; do if [[ -f "$epub" ]]; then metadata=$(epub-utils "$epub" metadata --format kv 2>/dev/null) title=$(echo "$metadata" | grep "^title:" | cut -d' ' -f2- | tr ',' ';') author=$(echo "$metadata" | grep "^creator:" | cut -d' ' -f2- | tr ',' ';') language=$(echo "$metadata" | grep "^language:" | cut -d' ' -f2-) publisher=$(echo "$metadata" | grep "^publisher:" | cut -d' ' -f2- | tr ',' ';') echo "$epub,$title,$author,$language,$publisher" >> metadata.csv fi done Content Analysis ~~~~~~~~~~~~~~~~ .. code-block:: bash #!/bin/zsh # analyze-content.sh - Analyze EPUB content structure epub_file="$1" echo "Content Analysis for: $epub_file" echo "==================================" # Get content files from manifest content_ids=$(epub-utils "$epub_file" manifest --format raw | \ grep 'media-type="application/xhtml+xml"' | \ sed 's/.*id="\([^"]*\)".*/\1/') total_words=0 for content_id in $content_ids; do if word_count=$(epub-utils "$epub_file" content "$content_id" --format plain 2>/dev/null | wc -w); then echo "Content ID '$content_id': $word_count words" total_words=$((total_words + word_count)) fi done echo "==================================" echo "Total words: $total_words" Error Handling -------------- Always handle errors when using epub-utils in scripts: .. code-block:: bash # Check if file exists first if [[ ! -f "$epub_file" ]]; then echo "Error: File '$epub_file' not found" >&2 exit 1 fi # Capture and handle command errors if ! 
output=$(epub-utils "$epub_file" metadata --format kv 2>&1); then echo "Error processing EPUB: $output" >&2 exit 1 fi # Check for specific issues if [[ -z "$output" ]]; then echo "Warning: No metadata found" >&2 fi Performance Tips ---------------- 1. **Use raw format for large-scale processing** to avoid syntax highlighting overhead 2. **Pipe efficiently** to avoid unnecessary intermediate files 3. **Process files in parallel** when handling many EPUBs 4. **Cache results** when running the same command multiple times .. code-block:: bash # Efficient parallel processing find . -name "*.epub" | xargs -n 1 -P 4 -I {} \ zsh -c 'echo "{}: $(epub-utils "{}" metadata --format kv | grep "^title:" | cut -d" " -f2-)"' Troubleshooting --------------- Common Issues and Solutions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ **"Invalid value for 'PATH': File does not exist"** Check the file path and ensure the EPUB file exists. **"ParseError: Unable to parse container.xml"** The EPUB file may be corrupted. Verify it's a valid ZIP file. **"Content with id 'X' not found"** Check available content IDs using the manifest command first. **No color output** Ensure your terminal supports colors and check the ``NO_COLOR`` environment variable. **Large file performance** Use ``--format raw`` for better performance with large files. ================================================ FILE: docs/cli-tutorial.rst ================================================ Use as a command-line tool ========================== This tutorial will guide you through using ``epub-utils`` from the command line. We'll cover all available commands with practical examples and tips for everyday usage. Getting Started --------------- The basic syntax for epub-utils is: .. code-block:: bash epub-utils [OPTIONS] EPUB_FILE COMMAND [COMMAND_OPTIONS] Let's start with a simple example: .. 
code-block:: bash # Display help epub-utils --help # Check version epub-utils --version Basic File Inspection --------------------- Container Information ~~~~~~~~~~~~~~~~~~~~~ The container command shows the EPUB's container.xml file, which points to the main package file: .. code-block:: bash # Show container with syntax highlighting (default) epub-utils book.epub container # Show raw XML without highlighting epub-utils book.epub container --format raw # Show container with pretty formatting epub-utils book.epub container --pretty-print **Example output**: .. code-block:: xml Package Information ~~~~~~~~~~~~~~~~~~~ The package command displays the main OPF (Open Packaging Format) file: .. code-block:: bash # Show package file with highlighting epub-utils book.epub package # Show raw package content epub-utils book.epub package --format raw # Show package with pretty formatting epub-utils book.epub package --pretty-print This reveals the complete EPUB structure including metadata, manifest, and spine. Working with Metadata ---------------------- Extracting Metadata ~~~~~~~~~~~~~~~~~~~~ The metadata command is perfect for getting book information: .. code-block:: bash # Pretty-printed metadata with highlighting epub-utils book.epub metadata # Key-value format for scripting epub-utils book.epub metadata --format kv # Metadata with pretty formatting epub-utils book.epub metadata --pretty-print **Example key-value output**: .. code-block:: text title: The Great Gatsby creator: F. Scott Fitzgerald language: en identifier: urn:uuid:12345678-1234-1234-1234-123456789abc publisher: Scribner date: 2021-01-01 subject: Fiction, Classic Literature Scripting with Metadata ~~~~~~~~~~~~~~~~~~~~~~~~ The key-value format is perfect for shell scripting: .. 
code-block:: bash # Extract just the title epub-utils book.epub metadata --format kv | grep "^title:" | cut -d' ' -f2- # Get author name author=$(epub-utils book.epub metadata --format kv | grep "^creator:" | cut -d' ' -f2-) echo "Author: $author" # Batch process multiple files for epub in *.epub; do title=$(epub-utils "$epub" metadata --format kv | grep "^title:" | cut -d' ' -f2-) echo "$epub: $title" done Understanding EPUB Structure ----------------------------- Table of Contents ~~~~~~~~~~~~~~~~~ View the navigation structure of your EPUB: .. code-block:: bash # Show table of contents with highlighting (auto-detect format) epub-utils book.epub toc # Raw TOC for processing epub-utils book.epub toc --format raw # TOC with pretty formatting epub-utils book.epub toc --pretty-print **EPUB Version-Specific Access**: For precise control over which navigation format to access: .. code-block:: bash # Force NCX format (EPUB 2 navigation control file) epub-utils book.epub toc --ncx # Force Navigation Document (EPUB 3 navigation file) epub-utils book.epub toc --nav **Use Cases**: - Use ``--ncx`` when you specifically need the EPUB 2 style navigation or want to access backward-compatible NCX in EPUB 3 - Use ``--nav`` when you specifically need the EPUB 3 Navigation Document features - Use the default (no flags) for general TOC access that works with any EPUB version Manifest Inspection ~~~~~~~~~~~~~~~~~~~ The manifest lists all files contained in the EPUB: .. code-block:: bash # View manifest with syntax highlighting epub-utils book.epub manifest # Raw manifest content epub-utils book.epub manifest --format raw # Manifest with pretty formatting epub-utils book.epub manifest --pretty-print **What you'll see**: Each item in the manifest includes: - ``id``: Unique identifier for the item - ``href``: File path within the EPUB - ``media-type``: MIME type of the file Spine Information ~~~~~~~~~~~~~~~~~ The spine defines the reading order of the book: .. 
code-block:: bash # View spine with highlighting epub-utils book.epub spine # Raw spine for processing epub-utils book.epub spine --format raw Content Extraction ------------------ Viewing Document Content ~~~~~~~~~~~~~~~~~~~~~~~~ Extract content from specific documents using their manifest ID: .. code-block:: bash # Show content with syntax highlighting epub-utils book.epub content chapter1 # Raw HTML/XHTML content epub-utils book.epub content chapter1 --format raw # Plain text (HTML tags stripped) epub-utils book.epub content chapter1 --format plain **Finding Content IDs**: Use the manifest command to see available content IDs: .. code-block:: bash # First, check the manifest for available IDs epub-utils book.epub manifest # Then extract specific content epub-utils book.epub content intro --format plain File Listing and Content Access ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Get detailed information about all files in the EPUB, or access specific file content: .. code-block:: bash # Formatted table of files epub-utils book.epub files # Raw file list epub-utils book.epub files --format raw # Display content of a specific file by path epub-utils book.epub files OEBPS/chapter1.xhtml # Access different file types epub-utils book.epub files META-INF/container.xml epub-utils book.epub files OEBPS/styles/main.css epub-utils book.epub files OEBPS/images/cover.jpg # Different output formats for XHTML content epub-utils book.epub files OEBPS/chapter1.xhtml --format raw epub-utils book.epub files OEBPS/chapter1.xhtml --format xml --pretty-print epub-utils book.epub files OEBPS/chapter1.xhtml --format plain **Key advantages of the files command**: - Access any file in the EPUB archive by its path - No need to know manifest item IDs - Works with all file types (XHTML, CSS, XML, images, etc.) - Complements the ``content`` command which uses manifest IDs Content Analysis ~~~~~~~~~~~~~~~~ Analyze EPUB content structure: .. 
code-block:: bash #!/bin/bash # analyze-content.sh - Analyze EPUB content structure epub_file="$1" echo "=== Content Analysis for $epub_file ===" # Get all content files from manifest epub-utils "$epub_file" manifest --format raw | \ grep 'media-type="application/xhtml+xml"' | \ sed 's/.*id="\([^"]*\)".*/\1/' | \ while read -r content_id; do echo "--- Content ID: $content_id ---" word_count=$(epub-utils "$epub_file" content "$content_id" --format plain | wc -w) echo "Word count: $word_count" echo "" done Output Format Options --------------------- epub-utils supports multiple output formats for different use cases: XML Format (Default) ~~~~~~~~~~~~~~~~~~~~ .. code-block:: bash epub-utils book.epub metadata # Produces syntax-highlighted, formatted XML Raw Format ~~~~~~~~~~ .. code-block:: bash epub-utils book.epub metadata --format raw # Produces unformatted XML, perfect for piping to other tools Key-Value Format ~~~~~~~~~~~~~~~~ .. code-block:: bash epub-utils book.epub metadata --format kv # Produces key: value pairs, ideal for scripting Plain Text Format ~~~~~~~~~~~~~~~~~ .. code-block:: bash epub-utils book.epub content chapter1 --format plain # Strips HTML tags, produces readable text Pretty-Print Option ~~~~~~~~~~~~~~~~~~~ Use the ``--pretty-print`` (or ``-pp``) option to format XML output with proper indentation: .. 
code-block:: bash # Default output (compact XML) epub-utils book.epub metadata --format raw # Pretty-formatted output (with indentation) epub-utils book.epub metadata --format raw --pretty-print # Works with syntax highlighting too epub-utils book.epub package --pretty-print Next Steps ---------- Now that you're familiar with the CLI basics, you might want to: - Explore the :doc:`api-tutorial` for programmatic access - Check out more :doc:`examples` for real-world use cases - Learn about :doc:`epub-standards` for deeper understanding - Contribute to the project via :doc:`contributing` ================================================ FILE: docs/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # For the full list of built-in configuration values, see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = 'epub-utils' copyright = '2025, Ernesto González' author = 'Ernesto González' release = '0.1.0a1' # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', 'sphinx_copybutton', 'sphinx_issues', ] templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # -- Napoleon settings ------------------------------------------------------- napoleon_google_docstring = True napoleon_numpy_docstring = True napoleon_include_init_with_doc = False napoleon_include_private_with_doc = False # -- Autodoc settings -------------------------------------------------------- autodoc_member_order = 'bysource' autodoc_default_flags = ['members'] autosummary_generate = True # -- 
Intersphinx mapping ----------------------------------------------------- intersphinx_mapping = { 'python': ('https://docs.python.org/3', None), 'lxml': ('https://lxml.de/', None), } # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = 'furo' html_static_path = ['_static'] # Add source link in footer html_show_sourcelink = True html_copy_source = True html_show_sphinx = True # -- Linking Github issues -------------------------------------------------- # https://github.com/sloria/sphinx-issues issues_github_path = 'ernestofgonzalez/epub-utils' ================================================ FILE: docs/contributing.rst ================================================ ============ Contributing ============ We welcome contributions to ``epub-utils``! This guide will help you get started with contributing to the project. Getting Started =============== Setting Up Development Environment ---------------------------------- 1. **Fork the Repository** Fork the ``epub-utils`` repository on GitHub to your own account. 2. **Clone Your Fork** .. code-block:: bash git clone https://github.com/yourusername/epub-utils.git cd epub-utils 3. **Set Up Development Environment** .. code-block:: bash # Create virtual environment python -m venv dev-env source dev-env/bin/activate # On Windows: dev-env\Scripts\activate # Install in development mode pip install -e ".[dev]" # Or install dependencies manually pip install -e . pip install pytest black flake8 mypy sphinx Project Structure ----------------- .. 
code-block:: text epub-utils/ ├── epub_utils/ │ ├── __init__.py │ ├── __main__.py │ ├── cli.py # Command-line interface │ ├── container.py │ ├── doc.py # Main Document class │ ├── exceptions.py │ ├── printers.py # Output formatting │ ├── content/ # Content documents (XHTML) │ ├── navigation/ # Navigation documents (nav, NCX) │ └── package/ # Package document (manifest, metadata, spine) ├── tests/ │ ├── assets/ # Test EPUB files │ ├── conftest.py │ ├── test_cli.py │ ├── test_doc.py │ └── ... ├── docs/ │ ├── conf.py │ ├── index.rst │ └── ... # Documentation files ├── requirements/ ├── Makefile ├── setup.py └── README.md Development Workflow ==================== Branch Strategy --------------- - ``main`` branch: Stable, release-ready code - ``develop`` branch: Integration branch for features - Feature branches: ``feature/your-feature-name`` - Bug fix branches: ``fix/issue-description`` Making Changes -------------- 1. **Create a Feature Branch** .. code-block:: bash git checkout -b feature/your-feature-name 2. **Make Your Changes** Follow the coding standards outlined below. 3. **Write Tests** All new features should include comprehensive tests. 4. **Run Tests Locally** .. code-block:: bash # Run all tests pytest # Run with coverage pytest --cov=epub_utils # Run specific test file pytest tests/test_doc.py 5. **Check Code Quality** .. code-block:: bash # Format code black epub_utils/ tests/ # Check linting flake8 epub_utils/ tests/ # Type checking mypy epub_utils/ 6. **Update Documentation** If your changes affect the API or add new features, update the documentation. 7. **Commit Your Changes** .. code-block:: bash git add . git commit -m "Add: Brief description of your changes" 8. **Push and Create Pull Request** .. code-block:: bash git push origin feature/your-feature-name Then create a pull request on GitHub.
Coding Standards ================ Python Style Guide ------------------ We follow PEP 8 with some modifications: - **Line length**: 88 characters (Black's default) - **String quotes**: Use double quotes for strings - **Import sorting**: Use isort or similar tool - **Docstrings**: Use Google-style docstrings Code Formatting --------------- We use **Black** for code formatting: .. code-block:: bash # Format all Python files black epub_utils/ tests/ # Check formatting without making changes black --check epub_utils/ tests/ Example of properly formatted code: .. code-block:: python import os def extract_metadata(epub_path: str, format_type: str = "dict") -> dict: """Extract metadata from an EPUB file. Args: epub_path: Path to the EPUB file. format_type: Output format ('dict', 'xml', 'json'). Returns: Dictionary containing extracted metadata. Raises: FileNotFoundError: If the EPUB file doesn't exist. ValueError: If format_type is not supported. """ if not os.path.exists(epub_path): raise FileNotFoundError(f"EPUB file not found: {epub_path}") if format_type not in ["dict", "xml", "json"]: raise ValueError(f"Unsupported format: {format_type}") # Implementation here... return {} Linting ------- We use **ruff** for linting: .. code-block:: bash # Check for linting errors make lint Type Hints ---------- Use type hints for all function signatures: .. code-block:: python from typing import Any, Dict, List, Optional, Union from pathlib import Path def process_files( file_paths: List[Union[str, Path]], output_format: str = "table" ) -> Optional[Dict[str, Any]]: """Process multiple EPUB files.""" pass Documentation Standards ======================= Docstring Format ---------------- Use Google-style docstrings: .. code-block:: python def complex_function(param1: str, param2: int, param3: bool = False) -> dict: """Brief description of the function. Longer description if needed. Explain the purpose, behavior, and any important details about the function. Args: param1: Description of the first parameter.
param2: Description of the second parameter. param3: Description of optional parameter. Defaults to False. Returns: Description of return value and its structure. Raises: ValueError: When param2 is negative. FileNotFoundError: When the specified file doesn't exist. Example: Basic usage example: >>> result = complex_function("test", 42) >>> print(result["status"]) success """ pass API Documentation ----------------- When adding new classes or functions to the public API: 1. **Add to __init__.py** exports if appropriate 2. **Update API reference** documentation 3. **Include usage examples** in docstrings 4. **Add to tutorials** if it's a major feature RST Documentation ----------------- When writing RST documentation: .. code-block:: rst Section Title ============= Subsection ---------- Code examples: .. code-block:: python # Python code here import epub_utils Shell commands: .. code-block:: bash epub-utils book.epub metadata Testing Guidelines ================== Test Structure -------------- - **Unit tests**: Test individual functions and methods - **Integration tests**: Test component interactions - **End-to-end tests**: Test complete workflows - **Performance tests**: Test with large files (optional) Writing Tests ------------- Use pytest for all tests: .. code-block:: python import pytest from epub_utils import Document from pathlib import Path def test_document_with_invalid_file(): """Test error handling with invalid file.""" with pytest.raises(FileNotFoundError): Document("nonexistent.epub") @pytest.mark.parametrize("format_type", ["dict", "xml", "json"]) def test_metadata_formats(doc_path, format_type): """Test different metadata formats.""" doc = Document(str(doc_path)) metadata = doc.get_metadata(format_type=format_type) assert metadata is not None Test Fixtures ------------- Create test EPUB files in ``tests/fixtures/``: ..
code-block:: python # tests/conftest.py import pytest from pathlib import Path @pytest.fixture def sample_epub(): """Provide path to sample EPUB for testing.""" return Path(__file__).parent / "fixtures" / "sample.epub" @pytest.fixture def invalid_epub(): """Provide path to invalid EPUB for error testing.""" return Path(__file__).parent / "fixtures" / "invalid.epub" Running Tests ------------- .. code-block:: bash # Run all tests make test # Run specific test file pytest tests/test_document.py Types of Contributions ====================== Bug Reports ----------- When reporting bugs: 1. Check existing issues first 2. Use the issue template if available 3. Provide minimal reproduction case 4. Include system information .. code-block:: text **Bug Description** Clear description of the bug. **Steps to Reproduce** 1. Step one 2. Step two 3. Step three **Expected Behavior** What should happen. **Actual Behavior** What actually happens. **Environment** - epub-utils version: - Python version: - Operating system: **Sample File** Attach or link to EPUB file if relevant. Feature Requests ---------------- For new features: 1. Describe the use case clearly 2. Explain why it's valuable to users 3. Suggest implementation approach if you have ideas 4. Consider backward compatibility Documentation Improvements -------------------------- Documentation contributions are highly valued: - Fix typos and grammar errors - Improve clarity of explanations - Add more examples to existing docs - Create new tutorials for common use cases - Update outdated information Code Contributions ------------------ Areas where contributions are welcome: 1. Performance improvements 2. New output formats 3. Additional EPUB validation 4. Better error handling 5. CLI usability enhancements 6. 
Support for EPUB 3 features Release Process =============== Versioning ---------- We follow `Semantic Versioning <https://semver.org/>`_: - MAJOR: Incompatible API changes - MINOR: New functionality (backward compatible) - PATCH: Bug fixes (backward compatible) Version format: ``MAJOR.MINOR.PATCH`` (e.g., ``1.2.3``) Development versions may include additional identifiers: - ``1.2.3-dev`` (development) - ``1.2.3rc1`` (release candidate) ================================================ FILE: docs/epub-standards.rst ================================================ ============== EPUB Standards ============== Understanding EPUB Specifications ================================= EPUB (Electronic Publication) is an open standard for digital books and publications. This guide covers the EPUB specifications and how epub-utils ensures compliance. EPUB 3.3 Specification ====================== Current Standard ---------------- EPUB 3.3 is the current specification, published by the W3C. It defines: - **Package Document**: Contains metadata, manifest, and spine - **Container Format**: ZIP-based archive structure - **Content Documents**: XHTML5, SVG, and other media types - **Navigation Document**: Replaces NCX for table of contents Key Components -------------- Container Structure ~~~~~~~~~~~~~~~~~~~ .. code-block:: text book.epub ├── META-INF/ │ ├── container.xml # Points to package document │ └── signatures.xml # Digital signatures (optional) ├── OEBPS/ # Content folder (common name) │ ├── package.opf # Package document │ ├── nav.xhtml # Navigation document │ ├── content/ # Text content │ ├── images/ # Images │ ├── styles/ # CSS files │ └── fonts/ # Font files (optional) └── mimetype # Must be first file, uncompressed Package Document (OPF) ~~~~~~~~~~~~~~~~~~~~~~ The package document defines three main sections: **Metadata Section**: .. code-block:: xml <metadata xmlns:dc="http://purl.org/dc/elements/1.1/"> <dc:title>Book Title</dc:title> <dc:creator>Author Name</dc:creator> <dc:identifier id="uid">urn:uuid:12345</dc:identifier> <dc:language>en</dc:language> <meta property="dcterms:modified">2024-01-01T00:00:00Z</meta> </metadata> **Manifest Section**: .. code-block:: xml <manifest> <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/> <item id="chapter1" href="content/chapter1.xhtml" media-type="application/xhtml+xml"/> <item id="style" href="styles/main.css" media-type="text/css"/> </manifest> **Spine Section**: ..
code-block:: xml <spine> <itemref idref="chapter1"/> </spine> Navigation Document ~~~~~~~~~~~~~~~~~~~ EPUB 3 uses XHTML navigation documents instead of NCX: .. code-block:: html <nav xmlns:epub="http://www.idpf.org/2007/ops" epub:type="toc" id="toc"> <h1>Navigation</h1> <ol> <li><a href="content/chapter1.xhtml">Chapter 1</a></li> </ol> </nav> EPUB Compliance with epub-utils =============================== Validation Capabilities ----------------------- epub-utils helps ensure EPUB compliance by: 1. **Structure Validation**: Checks container format 2. **Metadata Validation**: Verifies required elements 3. **Manifest Validation**: Ensures all files are declared 4. **Spine Validation**: Checks reading order 5. **Content Validation**: Basic XHTML structure checks Checking Compliance ------------------- Use epub-utils to validate EPUB structure: .. code-block:: bash # Check basic structure epub-utils book.epub container # Detailed manifest information epub-utils book.epub manifest # Examine package document epub-utils book.epub package --pretty-print Python API for Validation ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python from epub_utils import Document def validate_epub_structure(epub_path): """Validate basic EPUB structure.""" try: doc = Document(epub_path) # Check required components checks = { 'has_container': hasattr(doc, 'container'), 'has_package': hasattr(doc, 'package'), 'has_metadata': len(doc.metadata) > 0, 'has_manifest': len(doc.manifest) > 0, 'has_spine': len(doc.spine) > 0, } # Check required metadata required_metadata = ['title', 'language', 'identifier'] metadata_present = {} for item in doc.metadata: for req in required_metadata: if req in item.get('name', '').lower(): metadata_present[req] = True print("Structure Validation:") for check, passed in checks.items(): status = "✓" if passed else "✗" print(f" {status} {check}") print("\nRequired Metadata:") for req in required_metadata: status = "✓" if metadata_present.get(req) else "✗" print(f" {status} {req}") return all(checks.values()) and len(metadata_present) == len(required_metadata) except Exception as e: print(f"Validation failed: {e}") return False Common
Compliance Issues ======================== Missing Required Elements ------------------------- **Problem**: EPUB missing required metadata .. code-block:: bash # Check metadata completeness epub-utils metadata book.epub --format table **Solution**: Ensure these elements are present: - ``dc:title`` - ``dc:language`` - ``dc:identifier`` (with unique ID) - ``meta property="dcterms:modified"`` (EPUB 3) Invalid File References ----------------------- **Problem**: Manifest references files that don't exist .. code-block:: python def check_file_references(epub_path): """Check if all manifest files exist in the archive.""" doc = Document(epub_path) missing_files = [] for item in doc.manifest: file_path = item.get('href') if file_path: # Check if file exists in the EPUB try: # This would need zip file checking pass except: missing_files.append(file_path) if missing_files: print("Missing files referenced in manifest:") for file in missing_files: print(f" - {file}") Incorrect MIME Types -------------------- **Problem**: Wrong media-type attributes in manifest Common correct MIME types: - XHTML: ``application/xhtml+xml`` - CSS: ``text/css`` - JPEG: ``image/jpeg`` - PNG: ``image/png`` - NCX: ``application/x-dtbncx+xml`` EPUB 2 vs EPUB 3 Differences ============================ Format Evolution ----------------- +------------------+-------------------------+-------------------------+ | Feature | EPUB 2 | EPUB 3 | +==================+=========================+=========================+ | Navigation | NCX file required | XHTML nav document | +------------------+-------------------------+-------------------------+ | Content Types | XHTML 1.1, limited | XHTML5, SVG, MathML | +------------------+-------------------------+-------------------------+ | Metadata | Dublin Core only | Enhanced metadata | +------------------+-------------------------+-------------------------+ | Accessibility | Limited | Rich accessibility | 
+------------------+-------------------------+-------------------------+ | Scripting | Not allowed | Limited JavaScript | +------------------+-------------------------+-------------------------+ Migration Considerations ------------------------ When working with older EPUB 2 files: .. code-block:: python def detect_epub_version(epub_path): """Detect EPUB version from the package manifest.""" doc = Document(epub_path) # Check manifest for a nav item (EPUB 3 indicator) # This is a simplified example for item in doc.manifest: if 'nav' in item.get('properties', '').split(): return "EPUB 3" # Check for NCX file (EPUB 2 indicator) for item in doc.manifest: if item.get('media-type') == 'application/x-dtbncx+xml': return "EPUB 2" return "Unknown" Best Practices for Compliance ============================= Metadata Best Practices ----------------------- 1. **Always include required elements**: .. code-block:: xml <dc:title>Complete Book Title</dc:title> <dc:creator>Author Full Name</dc:creator> <dc:identifier id="uid">urn:uuid:unique-identifier</dc:identifier> <dc:language>en-US</dc:language> 2. **Use proper Dublin Core refinements**: .. code-block:: xml <dc:creator id="creator">Jane Doe</dc:creator> <meta refines="#creator" property="role" scheme="marc:relators">aut</meta> 3. **Include modification date for EPUB 3**: .. code-block:: xml <meta property="dcterms:modified">2024-05-25T10:30:00Z</meta> File Organization ----------------- 1. **Use consistent folder structure** 2. **Declare all files in manifest** 3. **Use proper MIME types** 4. **Include fallbacks for specialized content** Content Guidelines ------------------ 1. **Valid XHTML**: Ensure all content files are well-formed 2. **Proper encoding**: Use UTF-8 encoding 3. **Relative links**: Use relative paths for internal references 4. **Alt text**: Include alt attributes for images Testing and Validation Tools ============================ External Validators ------------------- - **EPUBCheck**: Official EPUB validator - **Ace by DAISY**: Accessibility checker - **pagina EPUB-Checker**: Desktop front end for EPUBCheck Integration with epub-utils --------------------------- ..
code-block:: bash # Basic structure check epub-utils info book.epub # Export for external validation epub-utils extract book.epub --output-dir validation/ # Run EPUBCheck on extracted content # Check specific components epub-utils manifest book.epub --format xml > manifest.xml epub-utils metadata book.epub --format xml > metadata.xml Future Standards ================ EPUB 3.3 and Beyond ------------------- Current developments in EPUB standards: - **Enhanced accessibility features** - **Better multimedia support** - **Improved metadata vocabularies** - **Web standards alignment** Staying Current --------------- - Monitor W3C EPUB Working Group - Test with latest validators - Follow accessibility guidelines (WCAG) - Use semantic markup Resources ========= Official Specifications ----------------------- - `EPUB 3.3 Specification `_ - `EPUB Accessibility 1.1 `_ - `EPUB Open Container Format 3.0.1 `_ Validation Tools ---------------- - `EPUBCheck `_ - `Ace Accessibility Checker `_ - `EPUB Validator `_ Developer Resources ------------------- - `EPUB 3 Best Practices `_ - `IDPF EPUB Resources `_ - `Accessibility Guidelines `_ ================================================ FILE: docs/examples.rst ================================================ Examples and Use Cases ====================== This page showcases real-world examples of using epub-utils for various tasks. Each example includes both CLI and Python API approaches where applicable. Digital Library Management -------------------------- Cataloging Your EPUB Collection ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **Scenario**: You have a large collection of EPUB files and want to create a comprehensive catalog. **CLI Approach**: .. code-block:: bash #!/bin/bash # catalog-epubs.sh - Create a catalog of all EPUB files echo "Creating EPUB catalog..." echo "File,Title,Author,Publisher,Language,Year,Files,Size" > epub_catalog.csv find . 
-name "*.epub" -type f | while read -r epub; do echo "Processing: $epub" # Extract metadata using epub-utils metadata=$(epub-utils "$epub" metadata --format kv 2>/dev/null) if [ $? -eq 0 ]; then title=$(echo "$metadata" | grep "^title:" | cut -d' ' -f2- | sed 's/,/;/g') author=$(echo "$metadata" | grep "^creator:" | cut -d' ' -f2- | sed 's/,/;/g') publisher=$(echo "$metadata" | grep "^publisher:" | cut -d' ' -f2- | sed 's/,/;/g') language=$(echo "$metadata" | grep "^language:" | cut -d' ' -f2-) year=$(echo "$metadata" | grep "^date:" | cut -d' ' -f2- | cut -d'-' -f1) # Count files and get size file_count=$(epub-utils "$epub" files --format raw 2>/dev/null | wc -l) size=$(stat -f%z "$epub" 2>/dev/null || stat -c%s "$epub" 2>/dev/null) echo "$epub,$title,$author,$publisher,$language,$year,$file_count,$size" >> epub_catalog.csv else echo "$epub,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR" >> epub_catalog.csv fi done echo "Catalog complete! See epub_catalog.csv" **Python Approach**: .. code-block:: python import csv import os from pathlib import Path from epub_utils import Document def create_epub_catalog(directory, output_file="epub_catalog.csv"): """Create a comprehensive catalog of EPUB files.""" fieldnames = [ 'filepath', 'filename', 'title', 'author', 'publisher', 'language', 'year', 'isbn', 'file_count', 'size_bytes', 'size_mb' ] epub_files = list(Path(directory).rglob("*.epub")) print(f"Found {len(epub_files)} EPUB files") with open(output_file, 'w', newline='', encoding='utf-8') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for i, epub_path in enumerate(epub_files, 1): print(f"Processing {i}/{len(epub_files)}: {epub_path.name}") try: doc = Document(str(epub_path)) metadata = doc.package.metadata # Extract date year date_str = getattr(metadata, 'date', '') year = date_str.split('-')[0] if date_str else '' # Get file size size_bytes = epub_path.stat().st_size size_mb = round(size_bytes / (1024 * 1024), 2) row = { 
'filepath': str(epub_path), 'filename': epub_path.name, 'title': getattr(metadata, 'title', ''), 'author': getattr(metadata, 'creator', ''), 'publisher': getattr(metadata, 'publisher', ''), 'language': getattr(metadata, 'language', ''), 'year': year, 'isbn': getattr(metadata, 'identifier', ''), 'file_count': len(doc.get_files_info()), 'size_bytes': size_bytes, 'size_mb': size_mb } writer.writerow(row) except Exception as e: print(f" Error: {e}") # Write error row writer.writerow({ 'filepath': str(epub_path), 'filename': epub_path.name, 'title': f'ERROR: {str(e)}', 'author': '', 'publisher': '', 'language': '', 'year': '', 'isbn': '', 'file_count': 0, 'size_bytes': epub_path.stat().st_size, 'size_mb': 0 }) # Usage create_epub_catalog("/path/to/your/epub/collection") Quality Assurance and Validation --------------------------------- EPUB Health Check ~~~~~~~~~~~~~~~~~ **Scenario**: Validate EPUB files and identify potential issues. .. code-block:: python from epub_utils import Document, ParseError import zipfile from pathlib import Path class EPUBHealthChecker: def __init__(self): self.issues = [] def check_epub(self, epub_path): """Comprehensive EPUB health check.""" self.issues = [] epub_path = Path(epub_path) print(f"Checking EPUB: {epub_path.name}") # Basic file checks if not epub_path.exists(): self.issues.append("File does not exist") return self.get_report() if epub_path.stat().st_size == 0: self.issues.append("File is empty") return self.get_report() # ZIP integrity check try: with zipfile.ZipFile(epub_path, 'r') as zf: corrupt_files = zf.testzip() if corrupt_files: self.issues.append(f"Corrupt ZIP file: {corrupt_files}") except zipfile.BadZipFile: self.issues.append("Invalid ZIP file") return self.get_report() # EPUB structure checks try: doc = Document(str(epub_path)) self._check_container(doc) self._check_package(doc) self._check_metadata(doc) self._check_manifest(doc) self._check_files(doc) except ParseError as e: self.issues.append(f"Parse error: {e}") 
except Exception as e: self.issues.append(f"Unexpected error: {e}") return self.get_report() def _check_container(self, doc): """Check container structure.""" try: container = doc.container if not container.rootfile_path: self.issues.append("No rootfile specified in container") except Exception as e: self.issues.append(f"Container error: {e}") def _check_package(self, doc): """Check package/OPF file.""" try: package = doc.package if not hasattr(package, 'metadata'): self.issues.append("Package missing metadata") if not hasattr(package, 'manifest'): self.issues.append("Package missing manifest") if not hasattr(package, 'spine'): self.issues.append("Package missing spine") except Exception as e: self.issues.append(f"Package error: {e}") def _check_metadata(self, doc): """Check metadata quality.""" try: metadata = doc.package.metadata # Check required fields if not getattr(metadata, 'title', '').strip(): self.issues.append("Missing or empty title") if not getattr(metadata, 'language', '').strip(): self.issues.append("Missing or empty language") if not getattr(metadata, 'identifier', '').strip(): self.issues.append("Missing or empty identifier") except Exception as e: self.issues.append(f"Metadata error: {e}") def _check_manifest(self, doc): """Check manifest integrity.""" try: manifest = doc.package.manifest if not manifest.items: self.issues.append("Empty manifest") # Check for common content types has_html = any( item.get('media-type') == 'application/xhtml+xml' for item in manifest.items.values() ) if not has_html: self.issues.append("No XHTML content files found") except Exception as e: self.issues.append(f"Manifest error: {e}") def _check_files(self, doc): """Check file structure.""" try: files_info = doc.get_files_info() if len(files_info) < 3: # At least container, package, and one content file self.issues.append("Very few files in EPUB (possibly incomplete)") # Check for suspiciously large files for file_info in files_info: if file_info['size'] > 10 * 1024 * 
1024: # 10MB self.issues.append(f"Large file found: {file_info['path']} ({file_info['size']} bytes)") except Exception as e: self.issues.append(f"File check error: {e}") def get_report(self): """Generate health check report.""" if not self.issues: return {"status": "healthy", "issues": []} else: return {"status": "issues_found", "issues": self.issues} # Usage checker = EPUBHealthChecker() report = checker.check_epub("book.epub") if report["status"] == "healthy": print("✓ EPUB is healthy!") else: print("⚠ Issues found:") for issue in report["issues"]: print(f" - {issue}") Metadata Management ------------------- Standardizing Metadata ~~~~~~~~~~~~~~~~~~~~~~ **Scenario**: Clean and standardize metadata across your EPUB collection. .. code-block:: python import re from epub_utils import Document class MetadataStandardizer: def __init__(self): self.language_codes = { 'english': 'en', 'spanish': 'es', 'french': 'fr', 'german': 'de', 'italian': 'it' # Add more as needed } def analyze_metadata(self, epub_path): """Analyze and suggest metadata improvements.""" doc = Document(epub_path) metadata = doc.package.metadata suggestions = [] # Check title title = getattr(metadata, 'title', '') if not title: suggestions.append("Missing title") elif len(title) > 200: suggestions.append("Title is very long (>200 chars)") elif title.isupper(): suggestions.append("Title is all uppercase - consider title case") # Check author creator = getattr(metadata, 'creator', '') if not creator: suggestions.append("Missing author/creator") elif ',' not in creator and len(creator.split()) > 2: suggestions.append("Author name might need reformatting (Last, First)") # Check language language = getattr(metadata, 'language', '') if not language: suggestions.append("Missing language code") elif len(language) > 3: # Might be full language name instead of code lang_lower = language.lower() if lang_lower in self.language_codes: suggestions.append(f"Use language code '{self.language_codes[lang_lower]}' 
instead of '{language}'") # Check identifier identifier = getattr(metadata, 'identifier', '') if not identifier: suggestions.append("Missing identifier") elif not self._is_valid_identifier(identifier): suggestions.append("Identifier format might be invalid") # Check date format date = getattr(metadata, 'date', '') if date and not re.match(r'\d{4}(-\d{2}-\d{2})?', date): suggestions.append("Date should be in YYYY or YYYY-MM-DD format") return { 'file': epub_path, 'current_metadata': { 'title': title, 'creator': creator, 'language': language, 'identifier': identifier, 'date': date }, 'suggestions': suggestions } def _is_valid_identifier(self, identifier): """Check if identifier looks valid.""" # Check for ISBN, DOI, UUID patterns patterns = [ r'urn:isbn:\d{10,13}', # ISBN URN r'isbn:\d{10,13}', # Simple ISBN r'urn:uuid:[a-f0-9-]{36}', # UUID URN r'doi:10\.\d+/.+', # DOI r'urn:doi:10\.\d+/.+' # DOI URN ] return any(re.match(pattern, identifier, re.I) for pattern in patterns) # Usage standardizer = MetadataStandardizer() analysis = standardizer.analyze_metadata("book.epub") print(f"Analyzing: {analysis['file']}") if analysis['suggestions']: print("Suggestions for improvement:") for suggestion in analysis['suggestions']: print(f" - {suggestion}") else: print("Metadata looks good!") Content Analysis and Statistics ------------------------------- Reading Level Analysis ~~~~~~~~~~~~~~~~~~~~~~ **Scenario**: Analyze EPUB content to determine reading complexity. .. 
code-block:: python import re import math from epub_utils import Document class ReadingLevelAnalyzer: def analyze_epub(self, epub_path): """Analyze reading level of an EPUB.""" doc = Document(epub_path) # Get all text content all_text = self._extract_all_text(doc) if not all_text.strip(): return {"error": "No readable text found"} # Calculate statistics stats = self._calculate_text_stats(all_text) # Calculate reading level scores flesch_score = self._flesch_reading_ease(stats) flesch_grade = self._flesch_kincaid_grade(stats) return { 'title': getattr(doc.package.metadata, 'title', 'Unknown'), 'word_count': stats['words'], 'sentence_count': stats['sentences'], 'syllable_count': stats['syllables'], 'avg_words_per_sentence': round(stats['words'] / stats['sentences'], 2), 'avg_syllables_per_word': round(stats['syllables'] / stats['words'], 2), 'flesch_reading_ease': round(flesch_score, 2), 'flesch_kincaid_grade': round(flesch_grade, 2), 'reading_level': self._interpret_flesch_score(flesch_score) } def _extract_all_text(self, doc): """Extract all readable text from EPUB.""" # This is a simplified version - real implementation would # need to parse XHTML content files try: manifest = doc.package.manifest # In a real implementation, you'd extract and parse each content file # For now, return placeholder return "Sample text for analysis. This would contain the actual book content." 
except Exception: return "" def _calculate_text_stats(self, text): """Calculate basic text statistics.""" # Clean text text = re.sub(r'[^\w\s\.\!\?]', '', text) # Count words words = len(text.split()) # Count sentences sentences = len(re.findall(r'[.!?]+', text)) if sentences == 0: sentences = 1 # Avoid division by zero # Count syllables (simplified) syllables = self._count_syllables(text) return { 'words': words, 'sentences': sentences, 'syllables': syllables } def _count_syllables(self, text): """Simplified syllable counting.""" words = text.lower().split() syllable_count = 0 for word in words: word = re.sub(r'[^a-z]', '', word) if word: # Simple syllable counting heuristic vowels = 'aeiouy' syllables = sum(1 for i, char in enumerate(word) if char in vowels and (i == 0 or word[i-1] not in vowels)) if word.endswith('e') and syllables > 1: syllables -= 1 syllable_count += max(1, syllables) return syllable_count def _flesch_reading_ease(self, stats): """Calculate Flesch Reading Ease score.""" return (206.835 - (1.015 * (stats['words'] / stats['sentences'])) - (84.6 * (stats['syllables'] / stats['words']))) def _flesch_kincaid_grade(self, stats): """Calculate Flesch-Kincaid Grade Level.""" return ((0.39 * (stats['words'] / stats['sentences'])) + (11.8 * (stats['syllables'] / stats['words'])) - 15.59) def _interpret_flesch_score(self, score): """Interpret Flesch Reading Ease score.""" if score >= 90: return "Very Easy (5th grade)" elif score >= 80: return "Easy (6th grade)" elif score >= 70: return "Fairly Easy (7th grade)" elif score >= 60: return "Standard (8th-9th grade)" elif score >= 50: return "Fairly Difficult (10th-12th grade)" elif score >= 30: return "Difficult (College level)" else: return "Very Difficult (Graduate level)" # Usage analyzer = ReadingLevelAnalyzer() analysis = analyzer.analyze_epub("book.epub") print(f"Reading Level Analysis for: {analysis['title']}") print(f"Word Count: {analysis['word_count']:,}") print(f"Reading Level: 
{analysis['reading_level']}") print(f"Flesch-Kincaid Grade: {analysis['flesch_kincaid_grade']}") Direct File Access and Extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **Scenario**: Extract specific files from EPUB archives for processing or analysis. **CLI Approach**: .. code-block:: bash #!/bin/bash # extract-epub-assets.sh - Extract and process EPUB content files epub_file="$1" output_dir="extracted_content" mkdir -p "$output_dir" echo "Extracting content from: $epub_file" # Get list of all XHTML content files epub-utils "$epub_file" files --format raw | grep '\.xhtml$' | while read -r file_path; do echo "Processing: $file_path" # Extract plain text content safe_name=$(echo "$file_path" | tr '/' '_') epub-utils "$epub_file" files "$file_path" --format plain > "$output_dir/${safe_name}.txt" # Extract styled HTML content epub-utils "$epub_file" files "$file_path" --format raw > "$output_dir/${safe_name}.html" done # Extract CSS files for styling reference epub-utils "$epub_file" files --format raw | grep '\.css$' | while read -r css_path; do echo "Extracting CSS: $css_path" safe_name=$(echo "$css_path" | tr '/' '_') epub-utils "$epub_file" files "$css_path" > "$output_dir/${safe_name}" done echo "Extraction complete! Files saved to $output_dir/" **Comparing files vs content commands**: .. code-block:: bash # Using files command (direct path access) epub-utils book.epub files OEBPS/chapter1.xhtml --format plain epub-utils book.epub files OEBPS/styles/main.css epub-utils book.epub files META-INF/container.xml # Using content command (requires manifest item ID) epub-utils book.epub manifest | grep chapter1 # Find the ID first epub-utils book.epub content chapter1-id --format plain **Key advantages of the files command**: - **Direct access**: Use actual file paths without needing manifest IDs - **Universal file access**: Access any file type (XHTML, CSS, XML, images, etc.) 
- **Simpler automation**: No need to parse manifest to find item IDs - **Better for file-system-based workflows**: Mirrors actual EPUB structure **Python equivalent using API**: .. code-block:: python from epub_utils import Document def extract_file_content(epub_path, file_path): """Extract content from a specific file in EPUB.""" doc = Document(epub_path) try: content = doc.get_file_by_path(file_path) # Handle different content types if hasattr(content, 'to_plain'): # XHTML content - can extract plain text return { 'raw_html': content.to_str(), 'plain_text': content.to_plain(), 'formatted_xml': content.to_xml(pretty_print=True) } else: # Other file types (CSS, XML, etc.) return {'raw_content': content} except ValueError as e: return {'error': str(e)} # Usage doc = Document("book.epub") # Extract chapter content chapter_content = extract_file_content("book.epub", "OEBPS/chapter1.xhtml") if 'plain_text' in chapter_content: print(f"Chapter text: {chapter_content['plain_text'][:200]}...") # Extract CSS for styling analysis css_content = extract_file_content("book.epub", "OEBPS/styles/main.css") if 'raw_content' in css_content: print(f"CSS rules: {len(css_content['raw_content'].split('{'))} rules found") Automation and Workflows ------------------------- Automated EPUB Processing Pipeline ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **Scenario**: Set up an automated pipeline for processing new EPUB files. .. 
code-block:: python import os import re import shutil import json from pathlib import Path from datetime import datetime from epub_utils import Document class EPUBProcessor: def __init__(self, input_dir, output_dir, processed_dir): self.input_dir = Path(input_dir) self.output_dir = Path(output_dir) self.processed_dir = Path(processed_dir) # Create directories if they don't exist self.output_dir.mkdir(exist_ok=True) self.processed_dir.mkdir(exist_ok=True) def process_new_files(self): """Process all new EPUB files in input directory.""" epub_files = list(self.input_dir.glob("*.epub")) if not epub_files: print("No EPUB files found to process") return print(f"Found {len(epub_files)} EPUB files to process") results = [] for epub_path in epub_files: result = self.process_single_file(epub_path) results.append(result) # Generate processing report self.generate_report(results) return results def process_single_file(self, epub_path): """Process a single EPUB file.""" print(f"Processing: {epub_path.name}") try: doc = Document(str(epub_path)) # Extract metadata metadata = self.extract_metadata(doc) # Validate file validation_result = self.validate_epub(doc) # Generate file info file_info = self.generate_file_info(epub_path, doc) # Create organized filename new_filename = self.create_organized_filename(metadata) # Move file to organized location organized_path = self.organize_file(epub_path, new_filename, metadata) result = { 'original_path': str(epub_path), 'new_path': str(organized_path), 'status': 'success', 'metadata': metadata, 'validation': validation_result, 'file_info': file_info, 'processed_at': datetime.now().isoformat() } # Move original to processed directory processed_path = self.processed_dir / epub_path.name shutil.move(str(epub_path), str(processed_path)) return result except Exception as e: result = { 'original_path': str(epub_path), 'status': 'error', 'error': str(e), 'processed_at': datetime.now().isoformat() } # Move problematic file to processed directory
processed_path = self.processed_dir / f"ERROR_{epub_path.name}" shutil.move(str(epub_path), str(processed_path)) return result def extract_metadata(self, doc): """Extract standardized metadata.""" metadata = doc.package.metadata return { 'title': getattr(metadata, 'title', '').strip(), 'author': getattr(metadata, 'creator', '').strip(), 'publisher': getattr(metadata, 'publisher', '').strip(), 'language': getattr(metadata, 'language', '').strip(), 'year': self.extract_year(getattr(metadata, 'date', '')), 'identifier': getattr(metadata, 'identifier', '').strip(), 'subject': getattr(metadata, 'subject', '').strip() } def extract_year(self, date_str): """Extract year from date string.""" if not date_str: return '' return date_str.split('-')[0] if '-' in date_str else date_str[:4] def validate_epub(self, doc): """Basic EPUB validation.""" issues = [] try: metadata = doc.package.metadata if not getattr(metadata, 'title', '').strip(): issues.append('Missing title') if not getattr(metadata, 'creator', '').strip(): issues.append('Missing author') if not getattr(metadata, 'language', '').strip(): issues.append('Missing language') # Check for content manifest = doc.package.manifest has_content = any( item.get('media-type') == 'application/xhtml+xml' for item in manifest.items.values() ) if not has_content: issues.append('No content files found') except Exception as e: issues.append(f'Validation error: {e}') return { 'is_valid': len(issues) == 0, 'issues': issues } def generate_file_info(self, epub_path, doc): """Generate file information.""" stat = epub_path.stat() return { 'filename': epub_path.name, 'size_bytes': stat.st_size, 'size_mb': round(stat.st_size / (1024 * 1024), 2), 'file_count': len(doc.get_files_info()), 'modified': datetime.fromtimestamp(stat.st_mtime).isoformat() } def create_organized_filename(self, metadata): """Create an organized filename from metadata.""" # Clean strings for filename def clean_for_filename(s): return re.sub(r'[^\w\s-]', '', 
s).strip()[:50] author = clean_for_filename(metadata['author'] or 'Unknown_Author') title = clean_for_filename(metadata['title'] or 'Unknown_Title') year = metadata['year'] or 'Unknown_Year' return f"{author} - {title} ({year}).epub" def organize_file(self, epub_path, new_filename, metadata): """Organize file into structured directory.""" # Create author directory author = metadata['author'] or 'Unknown_Author' author_dir = self.output_dir / author[:50] # Limit length author_dir.mkdir(exist_ok=True) # Create final path final_path = author_dir / new_filename # Copy file to organized location shutil.copy2(str(epub_path), str(final_path)) return final_path def generate_report(self, results): """Generate processing report.""" report_path = self.output_dir / f"processing_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" summary = { 'total_files': len(results), 'successful': len([r for r in results if r['status'] == 'success']), 'errors': len([r for r in results if r['status'] == 'error']), 'generated_at': datetime.now().isoformat(), 'results': results } with open(report_path, 'w', encoding='utf-8') as f: json.dump(summary, f, indent=2, ensure_ascii=False) print(f"Processing complete!") print(f"Successfully processed: {summary['successful']}") print(f"Errors: {summary['errors']}") print(f"Report saved to: {report_path}") # Usage processor = EPUBProcessor( input_dir="/path/to/new/epubs", output_dir="/path/to/organized/library", processed_dir="/path/to/processed/files" ) results = processor.process_new_files() Command-Line Power User Examples -------------------------------- Advanced Shell Scripts ~~~~~~~~~~~~~~~~~~~~~~ **Complex metadata extraction with error handling**: .. code-block:: bash #!/bin/bash # advanced-epub-analysis.sh set -euo pipefail EPUB_DIR="${1:-./}" OUTPUT_FILE="detailed_analysis.json" echo "Starting advanced EPUB analysis..." 
echo "Directory: $EPUB_DIR" echo "Output: $OUTPUT_FILE" # Initialize JSON output echo '{"analysis_date": "'$(date -Iseconds)'", "epubs": [' > "$OUTPUT_FILE" first=true find "$EPUB_DIR" -name "*.epub" -type f | while read -r epub; do echo "Analyzing: $(basename "$epub")" if [ "$first" = true ]; then first=false else echo "," >> "$OUTPUT_FILE" fi # Start JSON object for this EPUB echo ' {' >> "$OUTPUT_FILE" echo " \"file\": \"$epub\"," >> "$OUTPUT_FILE" # Extract metadata with error handling if metadata=$(epub-utils "$epub" metadata --format kv 2>/dev/null); then echo " \"metadata\": {" >> "$OUTPUT_FILE" # Parse metadata into JSON echo "$metadata" | while IFS=': ' read -r key value; do if [ -n "$key" ] && [ -n "$value" ]; then echo " \"$key\": \"$value\"," >> "$OUTPUT_FILE" fi done | sed '$s/,$//' # Remove last comma echo " }," >> "$OUTPUT_FILE" else echo " \"metadata\": null," >> "$OUTPUT_FILE" echo " \"metadata_error\": true," >> "$OUTPUT_FILE" fi # File analysis if file_info=$(epub-utils "$epub" files --format raw 2>/dev/null); then file_count=$(echo "$file_info" | wc -l) echo " \"file_count\": $file_count," >> "$OUTPUT_FILE" else echo " \"file_count\": null," >> "$OUTPUT_FILE" fi # File size size=$(stat -f%z "$epub" 2>/dev/null || stat -c%s "$epub" 2>/dev/null || echo "0") echo " \"size_bytes\": $size," >> "$OUTPUT_FILE" # Validation check if epub-utils "$epub" container >/dev/null 2>&1 && \ epub-utils "$epub" package >/dev/null 2>&1; then echo " \"is_valid\": true" >> "$OUTPUT_FILE" else echo " \"is_valid\": false" >> "$OUTPUT_FILE" fi echo " }" >> "$OUTPUT_FILE" done # Close JSON echo "]}" >> "$OUTPUT_FILE" echo "Analysis complete! Results in $OUTPUT_FILE" **Batch processing with parallel execution**: .. code-block:: bash #!/bin/bash # parallel-epub-check.sh EPUB_DIR="${1:-./}" MAX_JOBS=4 export -f check_single_epub check_single_epub() { epub="$1" base=$(basename "$epub") echo "[$base] Starting check..." # Quick validation if ! 
epub-utils "$epub" container >/dev/null 2>&1; then echo "[$base] ❌ Invalid container" return 1 fi if ! epub-utils "$epub" package >/dev/null 2>&1; then echo "[$base] ❌ Invalid package" return 1 fi # Check for required metadata metadata=$(epub-utils "$epub" metadata --format kv 2>/dev/null) if ! echo "$metadata" | grep -q "^title:"; then echo "[$base] ⚠️ Missing title" fi if ! echo "$metadata" | grep -q "^creator:"; then echo "[$base] ⚠️ Missing author" fi echo "[$base] ✅ Check complete" } # Run parallel checks find "$EPUB_DIR" -name "*.epub" -type f | \ xargs -n 1 -P $MAX_JOBS -I {} bash -c 'check_single_epub "$@"' _ {} Navigation and Table of Contents -------------------------------- Working with EPUB Navigation Documents ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **Scenario**: Extract and analyze navigation structures from both EPUB 2 and EPUB 3 files. **CLI Approach - Version-Specific TOC Access**: .. code-block:: bash #!/bin/bash # extract-navigation.sh - Extract navigation from EPUB files EPUB_FILE="$1" if [ -z "$EPUB_FILE" ]; then echo "Usage: $0 <epub-file>" exit 1 fi echo "Analyzing navigation in: $(basename "$EPUB_FILE")" echo "========================================" # Try EPUB 3 nav document first echo "Attempting EPUB 3 nav document extraction..." if epub-utils "$EPUB_FILE" toc --nav > /tmp/nav.xml 2>/dev/null; then echo "✅ EPUB 3 nav document found" echo "Navigation structure:" # Extract navigation items with their hierarchy grep -o '<a[^>]*href="[^"]*"[^>]*>[^<]*</a>' /tmp/nav.xml | \ sed 's/<a[^>]*href="\([^"]*\)"[^>]*>\([^<]*\)<\/a>/ → \2 (\1)/' | \ head -10 # Count navigation items nav_count=$(grep -c '<a[^>]*href=' /tmp/nav.xml) echo "Total navigation items: $nav_count" else echo "❌ No EPUB 3 nav document found" fi echo "" echo "Attempting EPUB 2 NCX extraction..."
if epub-utils "$EPUB_FILE" toc --ncx > /tmp/ncx.xml 2>/dev/null; then echo "✅ EPUB 2 NCX document found" echo "Table of contents structure:" # Extract NCX navigation points grep -o '<navLabel><text>[^<]*</text></navLabel>' /tmp/ncx.xml | \ sed 's/<navLabel><text>\([^<]*\)<\/text><\/navLabel>/ → \1/' | \ head -10 # Count NCX nav points ncx_count=$(grep -c '<navPoint' /tmp/ncx.xml) echo "Total NCX navigation points: $ncx_count" else echo "❌ No EPUB 2 NCX document found" fi echo "" echo "Attempting standard TOC extraction..." standard_toc=$(epub-utils "$EPUB_FILE" toc --format raw 2>/dev/null | wc -l) echo "Standard TOC items: $standard_toc" **Python Approach - Advanced Navigation Analysis**: .. code-block:: python from epub_utils import Document import xml.etree.ElementTree as ET from pathlib import Path class NavigationAnalyzer: def __init__(self, epub_path): self.doc = Document(epub_path) self.epub_path = Path(epub_path) def analyze_navigation(self): """Comprehensive navigation analysis.""" print(f"Analyzing: {self.epub_path.name}") print("=" * 50) # Check EPUB version version = getattr(self.doc.package.metadata, 'version', 'unknown') print(f"EPUB Version: {version}") print() # Analyze EPUB 3 nav document self._analyze_nav_document() # Analyze EPUB 2 NCX document self._analyze_ncx_document() # Compare with standard TOC self._analyze_standard_toc() def _analyze_nav_document(self): """Analyze EPUB 3 navigation document.""" print("EPUB 3 Navigation Document Analysis:") print("-" * 40) try: nav_content = self.doc.nav if nav_content: print("✅ Nav document found") # Parse navigation structure nav_items = self._parse_nav_structure(nav_content) print(f"Navigation items found: {len(nav_items)}") # Show hierarchy print("\nNavigation hierarchy:") for item in nav_items[:10]: # Show first 10 indent = " " * item['level'] print(f"{indent}→ {item['title']} ({item['href']})") if len(nav_items) > 10: print(f" ...
and {len(nav_items) - 10} more items") else: print("❌ No nav document found") except Exception as e: print(f"❌ Error accessing nav document: {e}") print() def _analyze_ncx_document(self): """Analyze EPUB 2 NCX document.""" print("EPUB 2 NCX Document Analysis:") print("-" * 30) try: ncx_content = self.doc.ncx if ncx_content: print("✅ NCX document found") # Parse NCX structure ncx_items = self._parse_ncx_structure(ncx_content) print(f"NCX navigation points: {len(ncx_items)}") # Show structure print("\nNCX structure:") for item in ncx_items[:10]: # Show first 10 indent = " " * item['level'] print(f"{indent}→ {item['title']} ({item['src']})") if len(ncx_items) > 10: print(f" ... and {len(ncx_items) - 10} more items") else: print("❌ No NCX document found") except Exception as e: print(f"❌ Error accessing NCX document: {e}") print() def _analyze_standard_toc(self): """Analyze standard TOC extraction.""" print("Standard TOC Analysis:") print("-" * 22) try: toc = self.doc.get_toc() toc_items = len(toc.get_nav_items()) print(f"✅ Standard TOC items: {toc_items}") # Show some items print("\nStandard TOC items:") for i, item in enumerate(toc.get_nav_items()[:5]): print(f" → {item.title} ({item.href})") except Exception as e: print(f"❌ Error with standard TOC: {e}") print() def _parse_nav_structure(self, nav_content): """Parse EPUB 3 nav document structure.""" items = [] try: root = ET.fromstring(nav_content) # Handle namespaces namespaces = {'xhtml': 'http://www.w3.org/1999/xhtml'} def parse_nav_list(ol_element, level=0): for li in ol_element.findall('.//xhtml:li', namespaces): a_elem = li.find('.//xhtml:a', namespaces) if a_elem is not None: title = a_elem.text or "" href = a_elem.get('href', '') items.append({ 'title': title.strip(), 'href': href, 'level': level }) # Check for nested lists nested_ol = li.find('.//xhtml:ol', namespaces) if nested_ol is not None: parse_nav_list(nested_ol, level + 1) # Find main navigation nav_elem = root.find('.//xhtml:nav[@*="toc"]', 
namespaces) if nav_elem is None: nav_elem = root.find('.//xhtml:nav', namespaces) if nav_elem is not None: ol_elem = nav_elem.find('.//xhtml:ol', namespaces) if ol_elem is not None: parse_nav_list(ol_elem) except ET.ParseError as e: print(f"Warning: Could not parse nav XML: {e}") return items def _parse_ncx_structure(self, ncx_content): """Parse EPUB 2 NCX document structure.""" items = [] try: root = ET.fromstring(ncx_content) # NCX namespace namespaces = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'} def parse_nav_point(nav_point, level=0): # Get label nav_label = nav_point.find('ncx:navLabel/ncx:text', namespaces) title = nav_label.text if nav_label is not None else "" # Get content source content = nav_point.find('ncx:content', namespaces) src = content.get('src', '') if content is not None else "" items.append({ 'title': title.strip(), 'src': src, 'level': level }) # Process child nav points for child_nav_point in nav_point.findall('ncx:navPoint', namespaces): parse_nav_point(child_nav_point, level + 1) # Find all top-level navigation points nav_map = root.find('ncx:navMap', namespaces) if nav_map is not None: for nav_point in nav_map.findall('ncx:navPoint', namespaces): parse_nav_point(nav_point) except ET.ParseError as e: print(f"Warning: Could not parse NCX XML: {e}") return items # Usage examples def analyze_single_epub(epub_path): """Analyze a single EPUB file.""" analyzer = NavigationAnalyzer(epub_path) analyzer.analyze_navigation() def compare_navigation_across_epubs(epub_directory): """Compare navigation structures across multiple EPUB files.""" epub_files = list(Path(epub_directory).glob("*.epub")) print(f"Comparing navigation across {len(epub_files)} EPUB files") print("=" * 60) results = [] for epub_path in epub_files: try: doc = Document(str(epub_path)) # Check what navigation documents are available has_nav = bool(doc.nav) has_ncx = bool(doc.ncx) standard_toc_count = len(doc.get_toc().get_nav_items()) results.append({ 'file': epub_path.name, 
'has_nav': has_nav, 'has_ncx': has_ncx, 'toc_items': standard_toc_count, 'version': getattr(doc.package.metadata, 'version', 'unknown') }) except Exception as e: print(f"Error processing {epub_path.name}: {e}") # Print comparison table print(f"{'File':<30} {'Version':<8} {'Nav':<5} {'NCX':<5} {'TOC Items':<10}") print("-" * 65) for result in results: nav_mark = "✅" if result['has_nav'] else "❌" ncx_mark = "✅" if result['has_ncx'] else "❌" print(f"{result['file']:<30} {result['version']:<8} " f"{nav_mark:<5} {ncx_mark:<5} {result['toc_items']:<10}") # Example usage if __name__ == "__main__": # Analyze single file analyze_single_epub("/path/to/your/book.epub") # Compare multiple files compare_navigation_across_epubs("/path/to/epub/collection") Building Smart Reading Lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **Scenario**: Create curated reading lists based on navigation complexity and structure. .. code-block:: python from epub_utils import Document import json from pathlib import Path from collections import defaultdict class ReadingListBuilder: def __init__(self): self.books = [] def analyze_book_complexity(self, epub_path): """Analyze book's structural complexity.""" try: doc = Document(str(epub_path)) # Get navigation info toc_items = len(doc.get_toc().get_nav_items()) has_advanced_nav = bool(doc.nav) or bool(doc.ncx) # Get file structure info files_info = doc.get_files_info() html_files = [f for f in files_info if f['media_type'] == 'application/xhtml+xml'] complexity_score = self._calculate_complexity_score( toc_items, len(html_files), has_advanced_nav ) return { 'path': epub_path, 'title': getattr(doc.package.metadata, 'title', ''), 'author': getattr(doc.package.metadata, 'creator', ''), 'toc_items': toc_items, 'html_files': len(html_files), 'has_advanced_nav': has_advanced_nav, 'complexity_score': complexity_score, 'complexity_level': self._get_complexity_level(complexity_score) } except Exception as e: print(f"Error analyzing {epub_path}: {e}") return None def 
_calculate_complexity_score(self, toc_items, html_files, has_advanced_nav): """Calculate structural complexity score.""" score = 0 # TOC complexity if toc_items > 50: score += 30 elif toc_items > 20: score += 20 elif toc_items > 10: score += 10 # File structure complexity if html_files > 100: score += 25 elif html_files > 50: score += 15 elif html_files > 20: score += 10 # Advanced navigation features if has_advanced_nav: score += 15 return min(score, 100) # Cap at 100 def _get_complexity_level(self, score): """Convert score to complexity level.""" if score >= 70: return "Advanced" elif score >= 40: return "Intermediate" else: return "Beginner" def build_reading_lists(self, epub_directory, output_file="reading_lists.json"): """Build categorized reading lists.""" epub_files = list(Path(epub_directory).glob("*.epub")) print(f"Analyzing {len(epub_files)} EPUB files for reading lists...") # Analyze all books for epub_path in epub_files: book_info = self.analyze_book_complexity(epub_path) if book_info: self.books.append(book_info) # Categorize books categories = defaultdict(list) for book in self.books: # By complexity categories[f"complexity_{book['complexity_level'].lower()}"].append(book) # By navigation richness if book['toc_items'] >= 20: categories['detailed_structure'].append(book) if book['has_advanced_nav']: categories['advanced_navigation'].append(book) # Create final reading lists reading_lists = { 'beginner_friendly': { 'description': 'Books with simple structure, perfect for casual reading', 'books': sorted(categories['complexity_beginner'], key=lambda x: x['toc_items'])[:10] }, 'intermediate_reads': { 'description': 'Well-structured books with moderate complexity', 'books': sorted(categories['complexity_intermediate'], key=lambda x: x['complexity_score'])[:15] }, 'advanced_studies': { 'description': 'Complex books with rich navigation, ideal for research', 'books': sorted(categories['complexity_advanced'], key=lambda x: x['complexity_score'], 
reverse=True)[:10] }, 'detailed_references': { 'description': 'Books with detailed table of contents', 'books': sorted(categories['detailed_structure'], key=lambda x: x['toc_items'], reverse=True)[:12] }, 'enhanced_navigation': { 'description': 'Books with advanced navigation features', 'books': categories['advanced_navigation'][:10] } } # Save to file with open(output_file, 'w', encoding='utf-8') as f: json.dump(reading_lists, f, indent=2, ensure_ascii=False, default=str) # Print summary print(f"\nReading Lists Generated:") print("=" * 25) for list_name, list_data in reading_lists.items(): print(f"{list_name}: {len(list_data['books'])} books") print(f" → {list_data['description']}") print(f"\nSaved to: {output_file}") # Usage builder = ReadingListBuilder() builder.build_reading_lists("/path/to/epub/collection") These examples demonstrate the power and flexibility of ``epub-utils`` for various real-world scenarios. Whether you're managing a digital library, performing quality assurance, building automated workflows, or analyzing navigation structures, epub-utils provides the tools you need to work effectively with EPUB files. ================================================ FILE: docs/formats.rst ================================================ Output Formats Reference ======================== ``epub-utils`` supports multiple output formats to suit different use cases. This guide explains each format with examples and best practices for when to use each one. Overview -------- All commands in ``epub-utils`` support the ``--format`` option with these values: - ``xml`` - Syntax-highlighted XML (default for most commands) - ``raw`` - Unformatted, raw content - ``kv`` - Key-value pairs (where supported) - ``plain`` - Plain text with HTML tags stripped (``content`` command, and ``files`` when given a file path) - ``table`` - Formatted table (files command only) Additionally, most commands support the ``--pretty-print`` option to format XML output with proper indentation and structure.
XML Format (Default) -------------------- The XML format provides syntax-highlighted, pretty-printed XML output that's easy to read. **When to use**: Interactive inspection, debugging, learning EPUB structure **Example**: .. code-block:: bash $ epub-utils book.epub metadata --format xml **Output**: .. code-block:: xml <metadata xmlns:dc="http://purl.org/dc/elements/1.1/"> <dc:title>The Great Gatsby</dc:title> <dc:creator>F. Scott Fitzgerald</dc:creator> <dc:language>en</dc:language> <dc:identifier>urn:uuid:12345678-1234-1234-1234-123456789abc</dc:identifier> <dc:publisher>Scribner</dc:publisher> <dc:date>2021-01-01</dc:date> <dc:subject>Fiction</dc:subject> <dc:subject>Classic Literature</dc:subject> </metadata> **Features**: - Color syntax highlighting - Proper indentation - Easy to read structure - Preserves all XML attributes and namespaces Raw Format ---------- The raw format outputs unprocessed content exactly as stored in the EPUB file. **When to use**: Piping to other tools, automated processing, debugging XML issues **Example**: .. code-block:: bash $ epub-utils book.epub metadata --format raw **Output**: .. code-block:: xml <metadata xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:title>The Great Gatsby</dc:title><dc:creator>F. Scott Fitzgerald</dc:creator><dc:language>en</dc:language><dc:identifier>urn:uuid:12345678-1234-1234-1234-123456789abc</dc:identifier><dc:publisher>Scribner</dc:publisher><dc:date>2021-01-01</dc:date><dc:subject>Fiction</dc:subject><dc:subject>Classic Literature</dc:subject></metadata> **Use cases**: .. code-block:: bash # Pipe to xmllint for custom formatting $ epub-utils book.epub package --format raw | xmllint --format - # Extract specific elements with grep $ epub-utils book.epub manifest --format raw | grep 'media-type="text/css"' # Validate XML structure $ epub-utils book.epub toc --format raw | xmllint --valid - Key-Value Format ---------------- The key-value format presents metadata as simple ``key: value`` pairs, perfect for scripting. **When to use**: Shell scripting, automated data extraction, configuration files **Supported commands**: ``metadata`` **Example**: .. code-block:: bash $ epub-utils book.epub metadata --format kv **Output**: .. code-block:: text title: The Great Gatsby creator: F. Scott Fitzgerald language: en identifier: urn:uuid:12345678-1234-1234-1234-123456789abc publisher: Scribner date: 2021-01-01 subject: Fiction, Classic Literature **Scripting examples**: ..
code-block:: bash # Extract just the title title=$(epub-utils book.epub metadata --format kv | grep "^title:" | cut -d' ' -f2-) # Get all metadata into shell variables eval "$(epub-utils book.epub metadata --format kv | sed 's/^/meta_/')" echo "Book title: $meta_title" echo "Author: $meta_creator" # Create a simple database echo "filename,title,author" > books.csv for epub in *.epub; do metadata=$(epub-utils "$epub" metadata --format kv) title=$(echo "$metadata" | grep "^title:" | cut -d' ' -f2- | tr ',' ';') author=$(echo "$metadata" | grep "^creator:" | cut -d' ' -f2- | tr ',' ';') echo "$epub,$title,$author" >> books.csv done Plain Text Format ----------------- The plain text format strips HTML tags and returns readable text content. **When to use**: Content analysis, word counting, text extraction **Supported commands**: ``content``, ``files`` (with file path) **Example**: .. code-block:: bash $ epub-utils book.epub content chapter1 --format plain **Output**: .. code-block:: text Chapter 1: The Beginning In my younger and more vulnerable years my father gave me some advice that I've carried with me ever since. "Whenever you feel like criticizing anyone," he told me, "just remember that all the people in this world haven't had the advantages that you've had." **Use cases**: .. 
code-block:: bash # Count words in a chapter (using content command) word_count=$(epub-utils book.epub content chapter1 --format plain | wc -w) echo "Chapter 1 has $word_count words" # Extract all text for analysis (using files command) epub-utils book.epub files OEBPS/chapter1.xhtml --format plain > chapter1.txt # Search for specific content in any file if epub-utils book.epub files OEBPS/chapter2.xhtml --format plain | grep -q "important phrase"; then echo "Found the phrase in chapter 2" fi # Access files by path without knowing manifest IDs epub-utils book.epub files OEBPS/styles/main.css epub-utils book.epub files META-INF/container.xml Table Format ------------ The table format presents file information in a readable tabular layout. **When to use**: File analysis, human-readable file listings **Supported commands**: ``files`` **Example**: .. code-block:: bash $ epub-utils book.epub files --format table **Output**: .. code-block:: text File Information for book.epub ┌────────────────────────────────────────┬──────────┬──────────────┬─────────────────────┐ │ Path │ Size │ Compressed │ Modified │ ├────────────────────────────────────────┼──────────┼──────────────┼─────────────────────┤ │ META-INF/container.xml │ 230 B │ 140 B │ 2021-01-01 10:00:00│ │ OEBPS/content.opf │ 2.1 KB │ 856 B │ 2021-01-01 10:00:00│ │ OEBPS/toc.ncx │ 1.8 KB │ 542 B │ 2021-01-01 10:00:00│ │ OEBPS/Text/chapter01.xhtml │ 12.4 KB │ 3.2 KB │ 2021-01-01 10:00:00│ │ OEBPS/Text/chapter02.xhtml │ 15.6 KB │ 4.1 KB │ 2021-01-01 10:00:00│ │ OEBPS/Styles/stylesheet.css │ 3.2 KB │ 1.1 KB │ 2021-01-01 10:00:00│ │ OEBPS/Images/cover.jpg │ 145.2 KB │ 144.8 KB │ 2021-01-01 10:00:00│ └────────────────────────────────────────┴──────────┴──────────────┴─────────────────────┘ Command-Specific Format Support ------------------------------- Here's a quick reference for which formats each command supports: .. 
list-table:: Format Support by Command :header-rows: 1 :widths: 20 15 15 15 15 15 * - Command - XML - Raw - KV - Plain - Table * - ``container`` - ✓ - ✓ - ✗ - ✗ - ✗ * - ``package`` - ✓ - ✓ - ✗ - ✗ - ✗ * - ``toc`` - ✓ - ✓ - ✗ - ✗ - ✗ * - ``metadata`` - ✓ - ✓ - ✓ - ✗ - ✗ * - ``manifest`` - ✓ - ✓ - ✗ - ✗ - ✗ * - ``spine`` - ✓ - ✓ - ✗ - ✗ - ✗ * - ``content`` - ✓ - ✓ - ✗ - ✓ - ✗ * - ``files`` - ✓* - ✓ - ✗ - ✓* - ✓* .. note:: \* For the ``files`` command: ``xml`` and ``plain`` formats are only available when specifying a file path, and ``table`` is only available when listing files. When listing files (no path specified), only ``table`` and ``raw`` formats are supported; any other format falls back to ``table`` with a warning. Advanced Format Usage --------------------- Combining Formats with Shell Tools ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **Pretty-print with custom tools**: .. code-block:: bash # Use xmllint for custom XML formatting epub-utils book.epub package --format raw | xmllint --format --noblanks - # Convert to JSON using xq (if available) epub-utils book.epub metadata --format raw | xq '.' **Processing key-value output**: .. code-block:: bash # Convert to environment variables export $(epub-utils book.epub metadata --format kv | tr ' ' '_' | tr ':' '=') echo "Title: $title" # Create YAML-like output epub-utils book.epub metadata --format kv | sed 's/^/ /' | sed '1i metadata:' **Text analysis workflows**: ..
code-block:: bash # Analyze reading time (assuming 200 words per minute) words=$(epub-utils book.epub content chapter1 --format plain | wc -w) minutes=$((words / 200)) echo "Chapter 1 reading time: $minutes minutes" # Extract quotes (lines starting with quotation marks) epub-utils book.epub content chapter1 --format plain | grep '^".*"$' Format Selection Guidelines --------------------------- Choose the right format based on your use case: **For Human Reading**: - Use ``xml`` for inspecting EPUB structure - Use ``table`` for file listings - Use ``plain`` for content reading **For Automation**: - Use ``raw`` for piping to other XML tools - Use ``kv`` for simple scripting and data extraction - Use ``raw`` with ``files`` for getting simple file lists **For Integration**: - Use ``raw`` when feeding into other programs - Use ``kv`` for configuration file generation - Use ``plain`` for text processing workflows **Performance Considerations**: - ``raw`` format is fastest (no syntax highlighting) - ``xml`` format has slight overhead for highlighting - ``table`` format requires additional formatting computation Error Handling with Formats ---------------------------- Different formats handle errors differently: .. code-block:: bash # XML format shows formatted error messages $ epub-utils corrupted.epub metadata --format xml Error: Unable to parse metadata # Raw format may show parsing errors directly $ epub-utils corrupted.epub metadata --format raw ParseError: Invalid XML structure # KV format gracefully handles missing fields $ epub-utils incomplete.epub metadata --format kv title: creator: Unknown Author language: en Custom Format Processing ------------------------ You can create custom output formats by post-processing the raw output: .. 
code-block:: bash #!/bin/zsh # custom-json-format.sh - Convert metadata to JSON epub_file="$1" echo "{" epub-utils "$epub_file" metadata --format kv | while IFS=': ' read -r key value; do if [[ -n "$key" && -n "$value" ]]; then echo " \"$key\": \"$value\"," fi done | sed '$s/,$//' echo "}" .. code-block:: bash #!/bin/zsh # custom-markdown-format.sh - Convert metadata to Markdown epub_file="$1" echo "# Book Information" echo "" epub-utils "$epub_file" metadata --format kv | while IFS=': ' read -r key value; do if [[ -n "$key" && -n "$value" ]]; then formatted_key=$(echo "$key" | sed 's/\b\w/\U&/g') # Title case echo "**$formatted_key**: $value" fi done Pretty-Print Option ------------------- The ``--pretty-print`` (or ``-pp``) option enhances XML output by adding proper indentation and structure, making it more readable for human inspection. **When to use**: Human review, debugging XML structure, cleaner output for documentation **Supported formats**: ``xml`` and ``raw`` **Example without pretty-print**: .. code-block:: bash $ epub-utils book.epub metadata --format raw **Output**: .. code-block:: xml The Great GatsbyF. Scott Fitzgeralden **Example with pretty-print**: .. code-block:: bash $ epub-utils book.epub metadata --format raw --pretty-print **Output**: .. code-block:: xml The Great Gatsby F. Scott Fitzgerald en **Use cases**: .. code-block:: bash # Better readability for manual inspection epub-utils book.epub package --pretty-print # Clean output for documentation or examples epub-utils book.epub container --format raw --pretty-print # Pipe to file with proper formatting epub-utils book.epub toc --pretty-print > toc-formatted.xml **Note**: Pretty-print has no effect on ``kv``, ``plain``, or ``table`` formats as these are already optimized for readability. Best Practices -------------- 1. **Default to XML for interactive use** - it's the most readable 2. **Use raw for scripting** - it's the most reliable for automation 3. 
**Use kv for metadata extraction** - it's purpose-built for simple parsing 4. **Use plain for content analysis** - it removes HTML complexity 5. **Use pretty-print for human review** - it makes XML structure clearer 6. **Always handle errors** - EPUB files can be malformed 7. **Test with various EPUB files** - format output can vary with different EPUB structures These format options make epub-utils flexible enough to handle everything from quick interactive inspection to complex automated workflows. ================================================ FILE: docs/index.rst ================================================ epub-utils: EPUB Inspection and Manipulation ============================================= .. image:: https://img.shields.io/pypi/v/epub-utils.svg :target: https://pypi.org/project/epub-utils/ :alt: PyPI version .. image:: https://img.shields.io/pypi/pyversions/epub-utils.svg?logo=python&logoColor=white :target: https://pypi.org/project/epub-utils/ :alt: Python versions .. image:: https://img.shields.io/badge/license-Apache%202.0-blue.svg :target: https://github.com/ernestofgonzalez/epub-utils/blob/main/LICENSE :alt: License **epub-utils** is a comprehensive Python library and command-line tool for working with EPUB files. It provides both a programmatic API and an intuitive CLI interface for inspecting and parsing EPUB archives. .. note:: epub-utils supports **EPUB 2.0.1** and **EPUB 3.0+** specifications, ensuring compatibility with the vast majority of EPUB files in circulation. 
Key Features ------------ **Rich CLI Interface** - Syntax-highlighted XML output - Multiple output formats (XML, raw, key-value, plain text) - Comprehensive file inspection capabilities **Complete EPUB Support** - Parse container.xml and package files - Extract and display table of contents - Access manifest and spine information - Retrieve document content by ID **Metadata Extraction** - Dublin Core metadata support - EPUB-specific metadata fields - Key-value output for easy parsing **Python API** - Clean, object-oriented interface - Lazy loading for performance - Comprehensive error handling Quick Start ----------- Installation ~~~~~~~~~~~~ .. code-block:: bash $ pip install epub-utils Basic CLI Usage ~~~~~~~~~~~~~~~ Inspect an EPUB file with a simple command: .. code-block:: bash # Display metadata with beautiful syntax highlighting $ epub-utils my-book.epub metadata # Show table of contents structure $ epub-utils my-book.epub toc # Get key-value metadata for scripting $ epub-utils my-book.epub metadata --format kv Basic Python Usage ~~~~~~~~~~~~~~~~~~ .. code-block:: python from epub_utils import Document # Load an EPUB document doc = Document("path/to/book.epub") # Access metadata easily print(f"Title: {doc.package.metadata.title}") print(f"Author: {doc.package.metadata.creator}") print(f"Language: {doc.package.metadata.language}") # Get table of contents toc_xml = doc.toc.to_xml() print(toc_xml) Why epub-utils? --------------- epub-utils fills a crucial gap in the Python ecosystem for EPUB file manipulation. While there are libraries for creating EPUBs, few focus on inspection and analysis. 
This tool is perfect for: **Publishers and Authors** Validate EPUB structure and metadata before distribution **Digital Librarians** Batch process and analyze EPUB collections **Automation Scripts** Extract metadata for catalogs and databases **Debugging** Inspect malformed or problematic EPUB files **Learning** Understand EPUB structure and standards compliance Documentation Contents ---------------------- .. toctree:: :maxdepth: 2 :caption: User Guide installation cli-tutorial api-tutorial examples formats .. toctree:: :maxdepth: 2 :caption: Reference cli-reference api-reference epub-standards .. toctree:: :maxdepth: 1 :caption: Development contributing changelog Community & Support ------------------- - **Source Code**: `GitHub Repository <https://github.com/ernestofgonzalez/epub-utils>`_ - **Issues**: `Bug Reports & Feature Requests <https://github.com/ernestofgonzalez/epub-utils/issues>`_ - **PyPI**: `Package Index <https://pypi.org/project/epub-utils/>`_ License ------- ``epub-utils`` is distributed under the `Apache License 2.0 <https://github.com/ernestofgonzalez/epub-utils/blob/main/LICENSE>`_. ================================================ FILE: docs/installation.rst ================================================ Installation Guide ================== System Requirements ------------------- ``epub-utils`` requires Python 3.10 or higher and works on: - **Linux** (Ubuntu 18.04+, Debian 10+, CentOS 7+, Fedora 30+) - **macOS** (10.14+) - **Windows** (Windows 10+) Installing from PyPI --------------------- The easiest way to install ``epub-utils`` is using pip: .. code-block:: bash $ pip install epub-utils This will install the latest stable version with all required dependencies. Development Installation ------------------------ If you want to contribute to ``epub-utils`` or use the latest development version: .. code-block:: bash # Clone the repository $ git clone https://github.com/ernestofgonzalez/epub-utils.git $ cd epub-utils # Create a virtual environment $ python -m venv env $ source env/bin/activate # On Windows: env\Scripts\activate # Install in development mode $ pip install -e .
# Install development dependencies $ pip install -r requirements/requirements-testing.txt $ pip install -r requirements/requirements-linting.txt Virtual Environment Installation -------------------------------- For isolated installations, we recommend using virtual environments: Using venv (Python 3.3+) ~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: bash # Create virtual environment $ python -m venv epub-utils-env # Activate virtual environment $ source epub-utils-env/bin/activate # Linux/macOS $ epub-utils-env\Scripts\activate # Windows # Install epub-utils $ pip install epub-utils Using conda ~~~~~~~~~~~ .. code-block:: bash # Create conda environment $ conda create -n epub-utils python=3.10 # Activate environment $ conda activate epub-utils # Install epub-utils $ pip install epub-utils Verifying Installation ---------------------- After installation, verify that ``epub-utils`` is working correctly: .. code-block:: bash # Check version $ epub-utils --version # Test with a sample EPUB (if you have one) $ epub-utils sample.epub metadata If you see the version number and can run commands without errors, the installation was successful! Installing from Source ---------------------- To install from source code: .. code-block:: bash # Download and extract the source $ wget https://github.com/ernestofgonzalez/epub-utils/archive/main.zip $ unzip main.zip $ cd epub-utils-main # Install $ pip install . Upgrading --------- To upgrade to the latest version: .. code-block:: bash $ pip install --upgrade epub-utils Uninstalling ------------ To remove epub-utils: .. code-block:: bash $ pip uninstall epub-utils Performance Considerations -------------------------- Installing lxml ~~~~~~~~~~~~~~~ While not required, installing ``lxml`` can significantly improve XML parsing performance: .. code-block:: bash $ pip install lxml ``epub-utils`` will automatically use lxml if available, falling back to the standard library's ``xml.etree.ElementTree`` if not. 
def format_error_message(e: Exception) -> str:
    """Format exception messages for CLI output.

    Args:
        e: The exception whose message should be shown to the user.

    Returns:
        The string form of ``e``.
    """
    # The previous implementation branched on ``isinstance(e, EPUBError)`` but
    # both arms returned ``str(e)``, so the check was dead code. Any custom
    # formatting an exception class provides is already applied by ``str()``
    # through that class's own ``__str__``.
    return str(e)
def format_file_size(size_bytes: int) -> str:
    """Format file size in human-readable format.

    Sizes below 1024 are shown as whole bytes (``'230 B'``); larger sizes are
    scaled by powers of 1024 and shown with one decimal place
    (``'2.1 KB'``). ``GB`` is the largest unit, so anything bigger is still
    expressed in GB.

    Args:
        size_bytes: Size in bytes.

    Returns:
        A string of the form ``'<number> <unit>'``.
    """
    if size_bytes == 0:
        return '0 B'

    units = ('B', 'KB', 'MB', 'GB')
    largest = len(units) - 1

    size = float(size_bytes)
    unit_index = 0
    while size >= 1024.0 and unit_index < largest:
        size /= 1024.0
        unit_index += 1

    if unit_index == 0:
        return f'{int(size)} {units[unit_index]}'

    # A value such as 1023.999 KB would be *displayed* as "1024.0 KB" once
    # rounded to one decimal. Promote it to the next unit so the output reads
    # "1.0 MB" instead.
    if unit_index < largest and round(size, 1) >= 1024.0:
        size /= 1024.0
        unit_index += 1

    return f'{size:.1f} {units[unit_index]}'
@main.command()
@format_option()
@pretty_print_option()
@click.pass_context
def container(ctx, format, pretty_print):
    """Outputs the container information of the EPUB file."""
    # NOTE: the docstring above is kept verbatim because click shows it as the
    # command's help text.
    try:
        output_document_part(Document(ctx.obj['path']), 'container', format, pretty_print)
    except Exception as exc:
        # EPUB-specific failures get their own heading and go through
        # format_error_message; everything else is reported as unexpected.
        is_epub_error = isinstance(exc, EPUBError)
        click.secho(
            'EPUB Error:' if is_epub_error else 'Unexpected Error:',
            fg='red',
            bold=True,
            err=True,
        )
        click.secho(
            format_error_message(exc) if is_epub_error else str(exc),
            fg='red',
            err=True,
        )
        ctx.exit(1)
@main.command()
@format_option()
@pretty_print_option()
@click.option(
    '--ncx',
    is_flag=True,
    default=False,
    help='Force retrieval of NCX file (EPUB 2 navigation control file).',
)
@click.option(
    '--nav',
    is_flag=True,
    default=False,
    help='Force retrieval of Navigation Document (EPUB 3 navigation file).',
)
@click.pass_context
def toc(ctx, format, pretty_print, ncx, nav):
    """Outputs the Table of Contents (TOC) of the EPUB file."""
    # NOTE: the docstring above is kept verbatim because click shows it as the
    # command's help text.
    doc = Document(ctx.obj['path'])

    if ncx and nav:
        click.secho('Error: --ncx and --nav flags cannot be used together.', fg='red', err=True)
        ctx.exit(1)

    # Pick which Document attribute to print and the message to show when the
    # EPUB does not provide it.
    if ncx:
        part = 'ncx'
        missing_message = (
            'Error: This document does not include a Navigation Control eXtended (NCX).'
        )
    elif nav:
        part = 'nav'
        missing_message = 'Error: This document does not include an EPUB Navigation Document.'
    else:
        part = 'toc'
        missing_message = 'Error: This document does not include a table of contents.'

    # Document.toc / .ncx / .nav are all Optional. Previously only the forced
    # --ncx / --nav paths were checked, so a book with no navigation at all
    # reached output_document_part with None and failed with AttributeError.
    if getattr(doc, part) is None:
        click.secho(missing_message, fg='red', err=True)
        ctx.exit(1)

    output_document_part(doc, part, format, pretty_print)
@main.command()
@click.argument('file_path', required=False)
@click.option(
    '-fmt',
    '--format',
    type=click.Choice(['table', 'raw', 'xml', 'plain', 'kv'], case_sensitive=False),
    default=None,
    help='Output format. For file listing: table, raw. For file content: raw, xml, plain, kv. Defaults to table for listing, xml for file content.',
)
@pretty_print_option()
@click.pass_context
def files(ctx, file_path, format, pretty_print):
    """List all files in the EPUB archive with their metadata, or output content of a specific file."""
    # NOTE: the docstring above is kept verbatim because click shows it as the
    # command's help text.
    epub = Document(ctx.obj['path'])

    # The default format depends on the mode: xml for one file, table for a listing.
    if format is None:
        format = 'xml' if file_path else 'table'

    # --- Listing mode: no file path given -------------------------------
    if not file_path:
        entries = epub.get_files_info()
        if format == 'raw':
            # One archive path per line.
            for entry in entries:
                click.echo(f'{entry["path"]}')
            return
        if format in ['xml', 'plain', 'kv']:
            click.secho(
                f'{format.title()} format not supported for file listing. Using table format:\n',
                fg='yellow',
            )
        click.echo(format_files_table(entries))
        return

    # --- Single-file mode -----------------------------------------------
    try:
        payload = epub.get_file_by_path(file_path)
    except FileNotFoundError as e:
        click.secho('FileNotFoundError:', fg='red', bold=True, err=True)
        click.secho(format_error_message(e), fg='red', err=True)
        ctx.exit(1)
        return

    # Anything without to_str() is echoed as-is, whatever format was requested.
    if not hasattr(payload, 'to_str'):
        click.echo(payload)
        return

    if format == 'xml' and hasattr(payload, 'to_xml'):
        click.echo(payload.to_xml(pretty_print=pretty_print))
        return
    if format == 'plain' and hasattr(payload, 'to_plain'):
        click.echo(payload.to_plain())
        return

    # Remaining cases all end in the raw string form; kv and table first warn
    # that they are not meaningful for file content.
    if format == 'kv':
        click.secho(
            'Key-value format not supported for file content. Falling back to raw:\n',
            fg='yellow',
        )
    elif format == 'table':
        click.secho(
            'Table format not supported for file content. Falling back to raw:\n',
            fg='yellow',
        )
    elif format not in ('raw', 'xml', 'plain'):
        # Unknown format value: the original if/elif chain printed nothing.
        return
    click.echo(payload.to_str())
class Container:
    """
    Parsed view of an EPUB's container.xml document.

    Attributes:
        xml_content (str): The container.xml text exactly as it was passed in.
        rootfile_path (str): The ``full-path`` attribute of the first
            ``rootfile`` element found in the document.
    """

    NAMESPACE = 'urn:oasis:names:tc:opendocument:xmlns:container'
    ROOTFILE_XPATH = f'.//{{{NAMESPACE}}}rootfile'

    def __init__(self, xml_content: str) -> None:
        """
        Parse ``xml_content`` and record the rootfile path.

        Args:
            xml_content (str): The raw XML content of the container.xml file.
        """
        self.xml_content = xml_content
        self.rootfile_path: str = None
        self._parse(xml_content)
        # The printer is attached only once parsing has succeeded.
        self._printer = XMLPrinter(self)

    def __str__(self) -> str:
        return self.xml_content

    def to_str(self, *args, **kwargs) -> str:
        """Delegate to the XML printer's ``to_str``."""
        return self._printer.to_str(*args, **kwargs)

    def to_xml(self, *args, **kwargs) -> str:
        """Delegate to the XML printer's ``to_xml``."""
        return self._printer.to_xml(*args, **kwargs)

    def _find_rootfile_element(self, root: etree.Element) -> etree.Element:
        """
        Locate the first namespaced ``rootfile`` element under ``root``.

        Args:
            root (etree.Element): The root element of the parsed XML.

        Returns:
            etree.Element: The rootfile element, guaranteed to carry a
            'full-path' attribute.

        Raises:
            InvalidEPUBError: If no rootfile element exists, or it has no
                'full-path' attribute.
        """
        node = root.find(self.ROOTFILE_XPATH)

        # Happy path first: element present and carrying the attribute.
        if node is not None and 'full-path' in node.attrib:
            return node

        if node is None:
            raise InvalidEPUBError(
                'Invalid container.xml: Missing rootfile element',
                suggestions=[
                    'Ensure the container.xml contains a rootfile element',
                    'Check that the container structure follows EPUB specifications',
                    'Verify the EPUB was created with compliant tools',
                ],
            )

        raise InvalidEPUBError(
            "Invalid container.xml: Missing 'full-path' attribute in rootfile element",
            suggestions=[
                "Ensure the rootfile element has a 'full-path' attribute",
                'Check that the container.xml follows EPUB specifications',
                'Verify the EPUB package structure is complete',
            ],
        )

    def _parse(self, xml_content: str) -> None:
        """
        Extract the rootfile path from the container.xml data.

        Args:
            xml_content (str): The raw XML content of the container.xml file.

        Raises:
            ParseError: If the XML is invalid or cannot be parsed.
            InvalidEPUBError: If the container.xml structure is invalid.
        """
        # Text input is handed to the parser as UTF-8 bytes; other input is
        # passed through untouched.
        payload = xml_content.encode('utf-8') if isinstance(xml_content, str) else xml_content

        try:
            root = etree.fromstring(payload)
        except etree.ParseError as e:
            raise ParseError(
                f'Invalid XML in container.xml: {str(e)}',
                suggestions=[
                    'Check that the container.xml file contains valid XML',
                    'Verify the file is not corrupted',
                    'Ensure all XML tags are properly closed',
                    'Check for invalid characters in the XML',
                ],
            ) from e

        self.rootfile_path = self._find_rootfile_element(root).attrib['full-path']
        if self.rootfile_path.strip():
            return

        # Attribute present but blank or whitespace-only.
        raise InvalidEPUBError(
            "Invalid container.xml: 'full-path' attribute is empty",
            suggestions=[
                "Ensure the rootfile element has a non-empty 'full-path' attribute",
                'Check that the path points to a valid OPF file',
                'Verify the EPUB package structure is complete',
            ],
        )
""" MEDIA_TYPES = ['application/xhtml+xml', 'text/html'] def __init__(self, xml_content: str, media_type: str, href: str) -> None: self.xml_content = xml_content self._tree = None if media_type not in self.MEDIA_TYPES: raise UnsupportedFormatError( f"Media type '{media_type}' is not supported for XHTML content", suggestions=[ f'Use one of the supported media types: {", ".join(self.MEDIA_TYPES)}', 'Check that this is an XHTML content file', 'Verify the manifest declares the correct media type', ], ) super().__init__(media_type, href) self._parse(xml_content) self._printer = XMLPrinter(self) def __str__(self) -> str: return self.xml_content def to_str(self, *args, **kwargs) -> str: return self._printer.to_str(*args, **kwargs) def to_xml(self, *args, **kwargs) -> str: return self._printer.to_xml(*args, **kwargs) def to_plain(self) -> str: return self.inner_text def _parse(self, xml_content: str) -> None: try: self._tree = etree.fromstring(xml_content.encode('utf-8')) except etree.ParseError as e: raise ParseError( f'Invalid XML in XHTML content file: {str(e)}', suggestions=[ 'Check that the content file contains valid XHTML', 'Verify the file is not corrupted', 'Ensure all XML tags are properly closed', 'Check for invalid characters in the XML', ], ) from e @property def tree(self): """Lazily parse and cache the XHTML tree.""" if self._tree is None: self._parse(self.xml_content) return self._tree @property def inner_text(self) -> str: tree = self.tree body_elements = tree.xpath('//*[local-name()="body"]') if body_elements: inner_text = ''.join(body_elements[0].itertext()) else: inner_text = ''.join(tree.itertext()) # Normalize whitespace inner_text = re.sub(r'\s+', ' ', inner_text).strip() return inner_text ================================================ FILE: epub_utils/doc.py ================================================ import os import zipfile from datetime import datetime from functools import cached_property from pathlib import Path from typing import Dict, 
class Document:
    """
    Represents an EPUB document.

    The container, package and navigation documents are parsed lazily on
    first access and cached on the instance.

    Attributes:
        path (Path): The path to the EPUB file.
        _container (Container): The parsed container document.
        _package (Package): The parsed package document.
        _toc (Navigation): The preferred navigation document (nav, else NCX).
        _ncx (NCXNavigation): The parsed NCX document (EPUB 2).
        _nav (EPUBNavDocNavigation): The parsed Navigation Document (EPUB 3).
    """

    CONTAINER_FILE_PATH = 'META-INF/container.xml'

    def __init__(self, path: Union[str, Path]) -> None:
        """
        Initialize the Document from a given path.

        Args:
            path (str | Path): The path to the EPUB file.

        Raises:
            InvalidEPUBError: If the file does not exist or is not a ZIP archive.
        """
        self.path: Path = Path(path)
        if not self.path.exists():
            raise InvalidEPUBError(
                f'EPUB file does not exist: {self.path}',
                suggestions=[
                    'Check that the file path is correct',
                    'Verify the file has not been moved or deleted',
                ],
                file_path=str(self.path),
            )
        if not zipfile.is_zipfile(self.path):
            raise InvalidEPUBError(
                f'File is not a valid ZIP archive: {self.path}',
                suggestions=[
                    'Ensure the file is a valid EPUB (which is a ZIP archive)',
                    'Check that the file is not corrupted',
                    'Verify the file extension is .epub',
                ],
                file_path=str(self.path),
            )

        self._container: Container = None
        self._package: Package = None
        self._toc: Navigation = None
        self._ncx: NCXNavigation = None
        self._nav: EPUBNavDocNavigation = None

    def _read_file_from_epub(self, file_path: str) -> str:
        """
        Read and decode a file from the EPUB archive.

        Args:
            file_path (str): Path to the file within the EPUB archive.

        Returns:
            str: Decoded contents of the file.

        Raises:
            EPUBFileNotFoundError: If the file is missing from the EPUB archive.
            InvalidEPUBError: If the file cannot be decoded as UTF-8.
        """
        with zipfile.ZipFile(self.path, 'r') as epub_zip:
            # Compare normalized paths so equivalent spellings of a member
            # name resolve to the same archive entry.
            norm_namelist = {os.path.normpath(name): name for name in epub_zip.namelist()}
            norm_path = os.path.normpath(file_path)

            if norm_path not in norm_namelist:
                available_files = sorted(norm_namelist)[:10]  # Show first 10 files
                suggestions = [
                    'Check that the file path is correct',
                    'Verify the EPUB file structure is complete',
                ]
                if available_files:
                    file_list = ', '.join(available_files)
                    if len(norm_namelist) > 10:
                        file_list += f' (and {len(norm_namelist) - 10} more)'
                    suggestions.append(f'Available files include: {file_list}')
                raise EPUBFileNotFoundError(
                    file_path, epub_path=str(self.path), suggestions=suggestions
                )

            try:
                return epub_zip.read(norm_namelist[norm_path]).decode('utf-8')
            except UnicodeDecodeError as e:
                raise InvalidEPUBError(
                    f"Cannot decode file '{file_path}' as UTF-8",
                    suggestions=[
                        'Check that the file contains valid UTF-8 text',
                        'Verify the EPUB file is not corrupted',
                        'Ensure the file is a text-based format (XML, HTML, etc.)',
                    ],
                    file_path=str(self.path),
                ) from e

    @property
    def container(self) -> 'Container':
        """The parsed ``META-INF/container.xml`` document (cached)."""
        if self._container is None:
            container_xml_content = self._read_file_from_epub(self.CONTAINER_FILE_PATH)
            self._container = Container(container_xml_content)
        return self._container

    @property
    def package(self) -> 'Package':
        """The parsed package document named by the container (cached)."""
        if self._package is None:
            package_xml_content = self._read_file_from_epub(self.container.rootfile_path)
            self._package = Package(package_xml_content)
        return self._package

    @cached_property
    def package_href(self):
        """Directory of the package document; manifest hrefs are joined onto it."""
        return os.path.dirname(self.container.rootfile_path)

    @property
    def toc(self) -> Optional['Navigation']:
        """The navigation document, or None when the EPUB declares neither kind."""
        if self._toc is None:
            if self.nav is not None:
                # Default to newer EPUB3 Navigation Document when available
                self._toc = self.nav
            elif self.ncx is not None:
                self._toc = self.ncx
        return self._toc

    @property
    def ncx(self) -> Optional['NCXNavigation']:
        """Access the Navigation Control eXtended (EPUB 2)"""
        if self._ncx is None:
            toc_href = self.package.toc_href
            if not toc_href:
                return None
            toc_path = os.path.join(self.package_href, toc_href)
            self._ncx = NCXNavigation(self._read_file_from_epub(toc_path))
        return self._ncx

    @property
    def nav(self) -> Optional['EPUBNavDocNavigation']:
        """Access the Navigation Document (EPUB 3)."""
        if self._nav is None:
            nav_href = self.package.nav_href
            if not nav_href:
                return None
            nav_path = os.path.join(self.package_href, nav_href)
            self._nav = EPUBNavDocNavigation(self._read_file_from_epub(nav_path))
        return self._nav

    def _missing_item_error(self, kind, item_id, known_ids, verify_hint):
        """Build the 'item not found' error for a spine or manifest lookup.

        Args:
            kind: 'spine' or 'manifest'; used in the message and the ID listing.
            item_id: The ID that was requested.
            known_ids: IDs that do exist; the first five are listed as a hint.
            verify_hint: The kind-specific suggestion line.

        Returns:
            EPUBFileNotFoundError: The exception, ready to be raised.
        """
        suggestions = ['Check that the item ID is correct', verify_hint]
        if known_ids:
            available_ids = ', '.join(known_ids[:5])
            if len(known_ids) > 5:
                available_ids += f' (and {len(known_ids) - 5} more)'
            suggestions.append(f'Available {kind} IDs: {available_ids}')
        return EPUBFileNotFoundError(
            f"{kind} item '{item_id}'", epub_path=str(self.path), suggestions=suggestions
        )

    def _load_manifest_item(self, item_id: str) -> 'XHTMLContent':
        """Look up a manifest item by ID and wrap its file in XHTMLContent.

        Raises:
            EPUBFileNotFoundError: If the ID is not declared in the manifest
                or the referenced file is missing from the archive.
        """
        manifest_item = self.package.manifest.find_by_id(item_id)
        if not manifest_item:
            manifest_ids = [
                item.get('id') for item in self.package.manifest.items if item.get('id')
            ]
            raise self._missing_item_error(
                'manifest', item_id, manifest_ids, 'Verify the item is declared in the manifest'
            )

        content_path = os.path.join(self.package_href, manifest_item['href'])
        xml_content = self._read_file_from_epub(content_path)
        return XHTMLContent(xml_content, manifest_item['media_type'], manifest_item['href'])

    def find_content_by_id(self, item_id: str) -> 'XHTMLContent':
        """
        Find and return content by its manifest item ID.

        The item must be referenced from the spine as well as declared in
        the manifest.

        Args:
            item_id: The ID of the item in the manifest.

        Returns:
            XHTMLContent: The content object for the specified item.

        Raises:
            EPUBFileNotFoundError: If the item ID is not found in spine or manifest.
        """
        spine_item = self.package.spine.find_by_idref(item_id)
        if not spine_item:
            spine_ids = [
                item.get('idref') for item in self.package.spine.itemrefs if item.get('idref')
            ]
            raise self._missing_item_error(
                'spine', item_id, spine_ids, 'Verify the item is included in the spine'
            )
        return self._load_manifest_item(item_id)

    def find_pub_resource_by_id(self, item_id: str) -> 'XHTMLContent':
        """
        Find and return a publication resource by its manifest item ID.

        Unlike ``find_content_by_id`` the item does not have to appear in
        the spine.

        Args:
            item_id: The ID of the item in the manifest.

        Returns:
            XHTMLContent: The resource wrapped as XHTML content. The resource
                is always passed to ``XHTMLContent`` together with its
                manifest media type.

        Raises:
            EPUBFileNotFoundError: If the item ID is not found in manifest.
        """
        return self._load_manifest_item(item_id)

    def list_files(self) -> List[Dict[str, str]]:
        """
        List all files in the EPUB archive.

        Returns:
            List[Dict[str, str]]: One dictionary per archive entry with the
                keys 'filename', 'file_size', 'compress_size', 'file_mode'
                and 'last_modified', in archive order.
        """
        with zipfile.ZipFile(self.path, 'r') as epub_zip:
            return [
                {
                    'filename': zip_info.filename,
                    'file_size': zip_info.file_size,
                    'compress_size': zip_info.compress_size,
                    # The upper 16 bits of external_attr carry the file mode.
                    'file_mode': zip_info.external_attr >> 16,
                    'last_modified': datetime(*zip_info.date_time),
                }
                for zip_info in epub_zip.infolist()
            ]

    def get_files_info(self) -> List[Dict[str, Union[str, int]]]:
        """
        Get information about all files in the EPUB archive.

        Returns:
            List[Dict]: A list of dictionaries containing file information,
                sorted by path. Directory entries are skipped. Each
                dictionary contains: 'path', 'size', 'compressed_size',
                'modified'.
        """
        files_info = []
        with zipfile.ZipFile(self.path, 'r') as epub_zip:
            for zip_info in epub_zip.infolist():
                if zip_info.filename.endswith('/'):
                    continue  # directory entry
                files_info.append(
                    {
                        'path': zip_info.filename,
                        'size': zip_info.file_size,
                        'compressed_size': zip_info.compress_size,
                        'modified': datetime(*zip_info.date_time).strftime(
                            '%Y-%m-%d %H:%M:%S'
                        ),
                    }
                )
        files_info.sort(key=lambda x: x['path'])
        return files_info

    def _manifest_media_type(self, file_path: str, default: str = 'application/xhtml+xml') -> str:
        """Return the media type the manifest declares for ``file_path``.

        The lookup is best-effort: if the package cannot be read or no
        manifest item points at ``file_path``, ``default`` is returned.
        """
        try:
            wanted = os.path.normpath(file_path)
            for item in self.package.manifest.items:
                manifest_path = os.path.join(self.package_href, item['href'])
                if os.path.normpath(manifest_path) == wanted:
                    return item.get('media_type') or default
        except Exception:
            # A broken or missing package document must not prevent reading
            # a file that does exist in the archive; fall back to the default.
            pass
        return default

    def get_file_by_path(self, file_path: str):
        """
        Retrieve a file from the EPUB archive by its path.

        Args:
            file_path (str): Path to the file within the EPUB archive.

        Returns:
            XHTMLContent or str: For XHTML files (``.xhtml``, ``.html``,
                ``.htm``), returns XHTMLContent object using the media type
                declared in the manifest when available. For other files,
                returns raw content as string.

        Raises:
            ValueError: If the file is missing from the EPUB archive
                (raised as EPUBFileNotFoundError, a ValueError subclass).
        """
        file_content = self._read_file_from_epub(file_path)

        if not file_path.lower().endswith(('.xhtml', '.html', '.htm')):
            return file_content
        return XHTMLContent(file_content, self._manifest_media_type(file_path), file_path)
class InvalidEPUBError(EPUBError, ValueError):
    """Raised when the structure or content of an EPUB file is invalid."""

    def __init__(
        self,
        message: str,
        missing_files: list = None,
        suggestions: list = None,
        file_path: str = None,
    ):
        """
        Build the final message and delegate to the base error.

        Args:
            message: The error message
            missing_files: Required files that are absent; when given they
                are appended to the message
            suggestions: Hints for fixing the error; a default list is used
                when none are given
            file_path: Path to the invalid EPUB file
        """
        full_message = message
        if missing_files:
            full_message = f'{message}. Missing required files: {", ".join(missing_files)}'

        hints = suggestions or [
            'Verify the file is a valid EPUB archive',
            'Check that all required EPUB files are present',
            'Ensure the EPUB was created with a compliant tool',
        ]
        super().__init__(full_message, hints, file_path)
class NotImplementedError(EPUBError):
    """Raised when a requested feature has not been implemented yet.

    This class derives from ``EPUBError`` only; it does not inherit from the
    builtin exception of the same name.
    """

    def __init__(
        self,
        message: str,
        feature_name: str = None,
        suggestions: list = None,
        file_path: str = None,
    ):
        """
        Build the final message and delegate to the base error.

        Args:
            message: The error message
            feature_name: Name of the unimplemented feature; when given it is
                put in front of the message
            suggestions: Hints for fixing the error; a default list is used
                when none are given
            file_path: Path to the file (if applicable)
        """
        full_message = message
        if feature_name:
            full_message = f"Feature '{feature_name}' is not yet implemented: {message}"

        hints = suggestions or [
            'Check the documentation for supported features',
            'Consider contributing this feature to the project',
            'Use an alternative approach if available',
        ]
        super().__init__(full_message, hints, file_path)
@dataclass
class NavigationItem:
    """Format-independent description of one navigation entry."""

    id: str
    label: str
    target: str  # href/src
    order: Optional[int] = None
    level: int = 0
    item_type: Optional[str] = None  # semantic type
    children: List['NavigationItem'] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return this item, and recursively its children, as plain dictionaries.

        Returns:
            A dictionary with the keys 'id', 'label', 'target', 'order',
            'level', 'type' and 'children'; 'children' holds the nested
            dictionaries of the child items.
        """
        nested = []
        for child in self.children:
            nested.append(child.to_dict())

        return dict(
            id=self.id,
            label=self.label,
            target=self.target,
            order=self.order,
            level=self.level,
            type=self.item_type,
            children=nested,
        )
def get_all_items(self) -> List[NavigationItem]:
    """Return the TOC items, page-list items and landmarks as one new list.

    The three collections are concatenated in that order; only the
    top-level items of each collection are included.
    """
    return [
        *self.get_toc_items(),
        *self.get_page_list(),
        *self.get_landmarks(),
    ]
""" return [item.to_dict() for item in self.get_landmarks()] # === Format-specific Access === @property @abstractmethod def tree(self): """Get underlying XML/DOM tree for format-specific operations.""" pass # === Output Methods === @abstractmethod def to_str(self, *args, **kwargs) -> str: pass @abstractmethod def to_xml(self, *args, **kwargs) -> str: pass @abstractmethod def to_plain(self) -> str: pass ================================================ FILE: epub_utils/navigation/nav/__init__.py ================================================ import re from typing import List, Optional from lxml import etree from epub_utils.exceptions import ParseError, UnsupportedFormatError from epub_utils.navigation.base import Navigation, NavigationItem from epub_utils.printers import XMLPrinter from .dom import NavDocument, NavListItem class EPUBNavDocNavigation(Navigation): """EPUB 3 Navigation Document implementation.""" MEDIA_TYPES = ['application/xhtml+xml'] def __init__( self, xml_content: str, media_type: str = 'application/xhtml+xml', href: str = None ) -> None: self.xml_content = xml_content self._tree = None self.xmlns = None self.lang = None if media_type not in self.MEDIA_TYPES: raise UnsupportedFormatError( f"Media type '{media_type}' is not supported for EPUB Navigation Document", suggestions=[ f'Use one of the supported media types: {", ".join(self.MEDIA_TYPES)}', 'Check that this is an EPUB 3 Navigation Document', 'Verify the manifest declares the correct media type', ], ) super().__init__(media_type, href) self._parse(xml_content) self._printer = XMLPrinter(self) def __str__(self) -> str: return self.xml_content def to_str(self, *args, **kwargs) -> str: return self._printer.to_str(*args, **kwargs) def to_xml(self, *args, **kwargs) -> str: return self._printer.to_xml(*args, **kwargs) def to_plain(self) -> str: return self.inner_text def _parse(self, xml_content: str) -> None: try: self._tree = etree.fromstring(xml_content.encode('utf-8')) root = self._tree 
def get_toc_items(self) -> List[NavigationItem]:
    """Return the table of contents as a tree of NavigationItem objects.

    An empty list is returned when the document has no ``toc`` nav section
    or that section contains no ordered list.
    """
    toc_nav = NavDocument(self.tree).toc_nav
    ordered_list = toc_nav.ordered_list if toc_nav else None
    if ordered_list:
        return self._convert_list_items_recursive(ordered_list.list_items, level=0)
    return []
def remove_toc_item(self, item_id: str) -> bool:
    """Remove item from table of contents by ID.

    The ID is matched against ``li`` elements first. If no ``li`` carries
    it, ``a`` elements are checked and the parent of each matching anchor
    is removed instead. The search covers the whole document tree.

    Args:
        item_id: The ``id`` attribute value to look for.

    Returns:
        True if at least one element matched, False if nothing matched or
        the document has no ``toc`` nav section.
    """
    nav_doc = NavDocument(self.tree)
    if not nav_doc.toc_nav:
        return False

    namespaces = {'xhtml': 'http://www.w3.org/1999/xhtml'}

    # The ID is bound as an XPath variable rather than formatted into the
    # expression, so IDs containing quote characters cannot break the query.
    items_to_remove = self.tree.xpath(
        './/xhtml:li[@id=$item_id]', namespaces=namespaces, item_id=item_id
    )

    if not items_to_remove:
        # Fall back to anchors carrying the ID and remove their parent element.
        anchors = self.tree.xpath(
            './/xhtml:a[@id=$item_id]', namespaces=namespaces, item_id=item_id
        )
        items_to_remove = [
            anchor.getparent() for anchor in anchors if anchor.getparent() is not None
        ]

    if not items_to_remove:
        return False

    for element in items_to_remove:
        parent = element.getparent()
        if parent is not None:
            parent.remove(element)
    return True
def reorder_toc_items(self, new_order: List[str]) -> None:
    """Reorder top-level TOC items according to a list of IDs.

    Each item named in ``new_order`` is detached from its parent list and
    appended again at the end, in the order given. Items that are not named
    therefore stay in front, in their existing order. IDs that match no
    top-level item are ignored. An item is identified by the ``id`` of its
    ``li`` element or, when that is missing, by the ``id`` of its anchor.
    """
    toc_nav = NavDocument(self.tree).toc_nav
    ordered_list = toc_nav.ordered_list if toc_nav else None
    if not ordered_list:
        return

    # Map each identifiable top-level item to its underlying li element.
    elements_by_id = {}
    for entry in ordered_list.list_items:
        key = entry.id
        if not key:
            link = entry.anchor
            key = link.id if link else None
        if key:
            elements_by_id[key] = entry.element

    # Detach and re-append in the requested order.
    for wanted_id in new_order:
        element = elements_by_id.get(wanted_id)
        if element is None:
            continue
        parent = element.getparent()
        if parent is not None:
            parent.remove(element)
            parent.append(element)
class NavAnchor(NavElement):
    """Wrapper around an anchor (``a``) element of a navigation document."""

    # Clark-notation name of the epub:type attribute.
    _EPUB_TYPE_ATTR = '{http://www.idpf.org/2007/ops}type'

    @property
    def href(self) -> Optional[str]:
        """The ``href`` attribute, or None when it is not set."""
        return self.element.get('href')

    @href.setter
    def href(self, value: str) -> None:
        """Write the ``href`` attribute."""
        self.element.set('href', value)

    @property
    def text(self) -> str:
        """The anchor's leading text; an empty string when there is none."""
        leading_text = self.element.text
        return leading_text if leading_text else ''

    @text.setter
    def text(self, value: str) -> None:
        """Replace the anchor's leading text."""
        self.element.text = value

    @property
    def epub_type(self) -> Optional[str]:
        """The ``epub:type`` attribute, or None when it is not set."""
        return self.element.get(self._EPUB_TYPE_ATTR)

    @epub_type.setter
    def epub_type(self, value: str) -> None:
        """Write the ``epub:type`` attribute."""
        self.element.set(self._EPUB_TYPE_ATTR, value)
class NavList(NavElement):
    """Wrapper around an ordered list (``ol``) element of a navigation document."""

    @property
    def list_items(self) -> List[NavListItem]:
        """The direct ``li`` children of this list, in document order."""
        li_nodes = self.element.xpath(
            './xhtml:li', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'}
        )
        return list(map(NavListItem, li_nodes))

    def add_list_item(self) -> NavListItem:
        """Append an empty ``li`` element to this list and return its wrapper."""
        new_li = etree.SubElement(self.element, '{http://www.w3.org/1999/xhtml}li')
        return NavListItem(new_li)

    def get_all_items_recursive(self) -> List[NavListItem]:
        """Return every list item at any depth, parents before their children.

        Each item is followed directly by the items of its nested list (if
        it has one) before the next sibling is visited.
        """
        collected = []
        for entry in self.list_items:
            collected.append(entry)
            child_list = entry.nested_list
            if child_list:
                collected.extend(child_list.get_all_items_recursive())
        return collected
epub:type attribute.""" self.element.set('{http://www.idpf.org/2007/ops}type', value) @property def heading(self) -> Optional[str]: """Get the text of the heading element (h1-h6).""" for level in range(1, 7): headings = self.element.xpath( f'./xhtml:h{level}', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'} ) if headings: return headings[0].text or '' return None @property def ordered_list(self) -> Optional[NavList]: """Get the ordered list child element.""" lists = self.element.xpath( './xhtml:ol', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'} ) if lists: return NavList(lists[0]) return None def add_heading(self, level: int, text: str) -> NavElement: """Add a heading element.""" if not 1 <= level <= 6: raise ValueError('Heading level must be between 1 and 6') heading_element = etree.SubElement( self.element, f'{{http://www.w3.org/1999/xhtml}}h{level}' ) heading = NavElement(heading_element) heading.element.text = text return heading def add_ordered_list(self) -> NavList: """Add an ordered list to this nav section.""" ol_element = etree.SubElement(self.element, '{http://www.w3.org/1999/xhtml}ol') return NavList(ol_element) class NavDocument(NavElement): """Represents the root html element of a navigation document.""" @property def toc_nav(self) -> Optional[NavSection]: """Get the table of contents nav section.""" navs = self.element.xpath( './/xhtml:nav[@epub:type="toc"]', namespaces={ 'xhtml': 'http://www.w3.org/1999/xhtml', 'epub': 'http://www.idpf.org/2007/ops', }, ) if navs: return NavSection(navs[0]) return None @property def page_list_nav(self) -> Optional[NavSection]: """Get the page list nav section.""" navs = self.element.xpath( './/xhtml:nav[@epub:type="page-list"]', namespaces={ 'xhtml': 'http://www.w3.org/1999/xhtml', 'epub': 'http://www.idpf.org/2007/ops', }, ) if navs: return NavSection(navs[0]) return None @property def landmarks_nav(self) -> Optional[NavSection]: """Get the landmarks nav section.""" navs = self.element.xpath( 
'.//xhtml:nav[@epub:type="landmarks"]', namespaces={ 'xhtml': 'http://www.w3.org/1999/xhtml', 'epub': 'http://www.idpf.org/2007/ops', }, ) if navs: return NavSection(navs[0]) return None @property def all_nav_sections(self) -> List[NavSection]: """Get all nav sections.""" navs = self.element.xpath( './/xhtml:nav', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'} ) return [NavSection(nav) for nav in navs] @property def title(self) -> str: """Get the document title.""" title_elements = self.element.xpath( './/xhtml:title', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'} ) return title_elements[0].text if title_elements else '' @property def body(self) -> Optional[NavElement]: """Get the body element.""" bodies = self.element.xpath( './/xhtml:body', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'} ) if bodies: return NavElement(bodies[0]) return None def add_nav_section(self, epub_type: str) -> NavSection: """Add a new nav section to the body.""" body = self.body if not body: # Create body if it doesn't exist body_element = etree.SubElement(self.element, '{http://www.w3.org/1999/xhtml}body') body = NavElement(body_element) nav_element = etree.SubElement(body.element, '{http://www.w3.org/1999/xhtml}nav') nav_section = NavSection(nav_element) nav_section.epub_type = epub_type return nav_section ================================================ FILE: epub_utils/navigation/ncx/__init__.py ================================================ import re from typing import List, Optional from lxml import etree from epub_utils.exceptions import FileNotFoundError as EPUBFileNotFoundError from epub_utils.exceptions import ParseError, UnsupportedFormatError from epub_utils.navigation.base import Navigation, NavigationItem from epub_utils.printers import XMLPrinter from .dom import NCXDocument, NCXNavPoint, NCXNavTarget, NCXPageTarget class NCXNavigation(Navigation): MEDIA_TYPES = ['application/x-dtbncx+xml'] def __init__( self, xml_content: str, media_type: str = 
class NCXNavigation(Navigation):
    """Navigation implementation backed by an NCX document.

    The raw XML is kept in ``xml_content``; the parsed root element is
    available through ``tree``. ``xmlns``, ``version`` and ``lang`` are read
    from the root element during parsing.
    """

    MEDIA_TYPES = ['application/x-dtbncx+xml']
    _NCX_NAMESPACES = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}

    def __init__(
        self,
        xml_content: str,
        media_type: str = 'application/x-dtbncx+xml',
        href: Optional[str] = None,
    ) -> None:
        """Validate the media type, then parse ``xml_content``.

        Raises:
            UnsupportedFormatError: If ``media_type`` is not in MEDIA_TYPES.
            ParseError: If ``xml_content`` is not well-formed XML.
        """
        self.xml_content = xml_content
        self._tree = None
        self.xmlns = None
        self.version = None
        self.lang = None

        if media_type not in self.MEDIA_TYPES:
            raise UnsupportedFormatError(
                f"Media type '{media_type}' is not supported for NCX navigation",
                suggestions=[
                    f'Use one of the supported media types: {", ".join(self.MEDIA_TYPES)}',
                    'Check that this is an NCX navigation file',
                    'Verify the manifest declares the correct media type',
                ],
            )

        super().__init__(media_type, href)
        self._parse(xml_content)
        self._printer = XMLPrinter(self)

    def __str__(self) -> str:
        return self.xml_content

    def to_str(self, *args, **kwargs) -> str:
        """Delegate to the XML printer's ``to_str``."""
        return self._printer.to_str(*args, **kwargs)

    def to_xml(self, *args, **kwargs) -> str:
        """Delegate to the XML printer's ``to_xml``."""
        return self._printer.to_xml(*args, **kwargs)

    def to_plain(self) -> str:
        """Return the whitespace-normalized text of the document."""
        return self.inner_text

    def _parse(self, xml_content: str) -> None:
        """Parse the XML and cache the root plus its namespace/version/lang.

        Raises:
            ParseError: If the XML cannot be parsed.
        """
        try:
            self._tree = etree.fromstring(xml_content.encode('utf-8'))
            root = self._tree
            self.xmlns = root.nsmap.get(None, '') if root.nsmap else ''
            self.version = root.get('version', '')
            self.lang = root.get('{http://www.w3.org/XML/1998/namespace}lang', '')
        except etree.ParseError as e:
            raise ParseError(
                f'Invalid XML in NCX navigation file: {str(e)}',
                suggestions=[
                    'Check that the NCX file contains valid XML',
                    'Verify the file is not corrupted',
                    'Ensure all XML tags are properly closed',
                    'Check for invalid characters in the XML',
                ],
            ) from e

    @property
    def tree(self):
        """Return the parsed NCX root element, parsing on first use."""
        if self._tree is None:
            self._parse(self.xml_content)
        return self._tree

    @property
    def inner_text(self) -> str:
        """Concatenate the document text with runs of whitespace collapsed.

        Text is taken from the first element whose local name is ``body``
        when one exists, otherwise from the whole tree.
        """
        tree = self.tree
        body_elements = tree.xpath('//*[local-name()="body"]')
        source = body_elements[0] if body_elements else tree
        return re.sub(r'\s+', ' ', ''.join(source.itertext())).strip()

    # === Navigation Interface Implementation ===

    def get_toc_items(self) -> List[NavigationItem]:
        """Get table of contents as normalized items (empty without a navMap)."""
        nav_map = NCXDocument(self.tree).nav_map
        if nav_map is None:
            return []
        return self._convert_nav_points_recursive(nav_map.nav_points, level=0)

    def get_page_list(self) -> List[NavigationItem]:
        """Get page list/breaks as normalized items (empty without a pageList)."""
        page_list = NCXDocument(self.tree).page_list
        if page_list is None:
            return []
        return self._convert_page_targets(page_list.page_targets)

    def get_landmarks(self) -> List[NavigationItem]:
        """Get the navTargets of every navList as normalized items."""
        return [
            self._convert_nav_target(nav_target)
            for nav_list in NCXDocument(self.tree).nav_lists
            for nav_target in nav_list.nav_targets
        ]

    def add_toc_item(self, item: NavigationItem, after_id: Optional[str] = None) -> None:
        """Add item to table of contents.

        Without ``after_id`` the new navPoint is appended to the navMap.
        With ``after_id`` it is placed directly after the navPoint carrying
        that id, as its following sibling (so at that navPoint's nesting
        level).

        Raises:
            ParseError: If the document has no navMap.
            EPUBFileNotFoundError: If ``after_id`` matches no navPoint.
        """
        nav_map = NCXDocument(self.tree).nav_map
        if nav_map is None:
            raise ParseError(
                'NCX document is missing required navMap element',
                element_name='navMap',
                suggestions=[
                    'Ensure the NCX file contains a navMap element',
                    'Check that the NCX structure follows EPUB specifications',
                    'Verify the NCX file was created correctly',
                ],
            )

        anchor = None
        if after_id:
            all_nav_points = nav_map.get_all_nav_points()
            anchor = next((point for point in all_nav_points if point.id == after_id), None)
            if anchor is None:
                available_ids = [point.id for point in all_nav_points if point.id]
                suggestions = [
                    'Check that the navigation item ID is correct',
                    'Verify the item exists in the navigation structure',
                ]
                if available_ids:
                    id_list = ', '.join(available_ids[:5])
                    if len(available_ids) > 5:
                        id_list += f' (and {len(available_ids) - 5} more)'
                    suggestions.append(f'Available navigation IDs: {id_list}')
                raise EPUBFileNotFoundError(
                    f"Navigation item with ID '{after_id}' not found", suggestions=suggestions
                )

        new_point = nav_map.add_nav_point(
            item.id, item.label, item.target, class_attr=item.item_type, play_order=item.order
        )
        if anchor is not None:
            # addnext() moves an element that is already in the tree, so the
            # freshly appended navPoint is relocated right after the anchor.
            anchor.element.addnext(new_point.element)

    def _find_nav_point(self, nav_map, item_id: str) -> Optional[NCXNavPoint]:
        """Return the first navPoint under ``nav_map`` with id ``item_id``.

        The id is bound as an XPath variable rather than interpolated into
        the expression, so ids containing quotes cannot break or alter it.
        """
        matches = nav_map.element.xpath(
            './/ncx:navPoint[@id=$item_id]',
            namespaces=self._NCX_NAMESPACES,
            item_id=item_id,
        )
        return NCXNavPoint(matches[0]) if matches else None

    def remove_toc_item(self, item_id: str) -> bool:
        """Remove item from table of contents by ID.

        Returns:
            True when a navPoint was removed, False when there is no navMap
            or no navPoint with that id.
        """
        nav_map = NCXDocument(self.tree).nav_map
        if nav_map is None:
            return False
        nav_point = self._find_nav_point(nav_map, item_id)
        if nav_point is None:
            return False
        nav_point.element.getparent().remove(nav_point.element)
        return True

    def update_toc_item(self, item_id: str, **kwargs) -> bool:
        """Update existing TOC item properties.

        Recognized keyword arguments: ``label``, ``target``, ``order`` and
        ``item_type``. ``label``/``target`` are only applied when the
        navPoint already has a navLabel/content child.

        Returns:
            True when the navPoint was found, False otherwise.
        """
        nav_map = NCXDocument(self.tree).nav_map
        if nav_map is None:
            return False
        nav_point = self._find_nav_point(nav_map, item_id)
        if nav_point is None:
            return False

        if 'label' in kwargs:
            nav_label = nav_point.nav_label
            if nav_label is not None:
                nav_label.text = kwargs['label']
        if 'target' in kwargs:
            content = nav_point.content
            if content is not None:
                content.src = kwargs['target']
        if 'order' in kwargs:
            nav_point.play_order = kwargs['order']
        if 'item_type' in kwargs:
            nav_point.class_attr = kwargs['item_type']
        return True

    def reorder_toc_items(self, new_order: List[str]) -> None:
        """Reorder TOC items by list of IDs.

        Only the playOrder attributes are rewritten (1-based position in
        ``new_order``); elements are not moved in the tree. Unknown ids are
        skipped.
        """
        nav_map = NCXDocument(self.tree).nav_map
        if nav_map is None:
            return
        for position, item_id in enumerate(new_order, start=1):
            nav_point = self._find_nav_point(nav_map, item_id)
            if nav_point is not None:
                nav_point.play_order = position

    # === Helper Methods ===

    def _convert_nav_points_recursive(
        self, nav_points: List[NCXNavPoint], level: int = 0
    ) -> List[NavigationItem]:
        """Convert NCX navPoints to NavigationItems recursively."""
        items = []
        for nav_point in nav_points:
            item = NavigationItem(
                id=nav_point.id or '',
                label=nav_point.label_text,
                target=nav_point.content_src,
                order=nav_point.play_order,
                level=level,
                item_type=nav_point.class_attr,
            )
            child_nav_points = nav_point.nav_points
            if child_nav_points:
                item.children = self._convert_nav_points_recursive(child_nav_points, level + 1)
            items.append(item)
        return items

    def _convert_page_targets(self, page_targets: List[NCXPageTarget]) -> List[NavigationItem]:
        """Convert NCX pageTargets to NavigationItems."""
        return [
            NavigationItem(
                id=page_target.id or '',
                label=page_target.label_text,
                target=page_target.content_src,
                order=page_target.play_order,
                level=0,
                item_type=page_target.type_attr,
            )
            for page_target in page_targets
        ]

    def _convert_nav_target(self, nav_target: NCXNavTarget) -> NavigationItem:
        """Convert NCX navTarget to NavigationItem."""
        nav_label = nav_target.nav_label
        content = nav_target.content
        return NavigationItem(
            id=nav_target.id or '',
            label=nav_label.text if nav_label else '',
            target=content.src if content else '',
            order=nav_target.play_order,
            level=0,
            item_type=nav_target.class_attr,
        )
_NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'


def _ncx(tag: str) -> str:
    """Return ``tag`` qualified with the NCX namespace in Clark notation."""
    return f'{{{_NCX_NS}}}{tag}'


def _append_label_and_content(parent, label_text: str, src: str) -> None:
    """Append a navLabel (with text child) and a content element to ``parent``."""
    NCXNavLabel(etree.SubElement(parent, _ncx('navLabel'))).text = label_text
    NCXContent(etree.SubElement(parent, _ncx('content'))).src = src


def _append_nav_point(parent, id, label_text, src, class_attr, play_order) -> 'NCXNavPoint':
    """Append a fully populated navPoint to ``parent`` and return its wrapper."""
    nav_point = NCXNavPoint(etree.SubElement(parent, _ncx('navPoint')))
    nav_point.id = id
    if class_attr:
        nav_point.class_attr = class_attr
    if play_order is not None:
        nav_point.play_order = play_order
    _append_label_and_content(nav_point.element, label_text, src)
    return nav_point


class NCXElement:
    """Base class for NCX DOM elements.

    Wraps one XML element (stored as ``self.element``) and exposes its
    ``id`` attribute.
    """

    def __init__(self, element: etree.Element):
        self.element = element

    @property
    def id(self) -> Optional[str]:
        """Get the id attribute, or None when it is absent."""
        return self.element.get('id')

    @id.setter
    def id(self, value: str) -> None:
        """Set the id attribute."""
        self.element.set('id', value)

    def _child(self, tag: str, wrapper):
        """Wrap the first direct NCX child named ``tag``, or return None."""
        node = self.element.find(_ncx(tag))
        # Explicit None test: element truthiness reflects child count.
        return None if node is None else wrapper(node)

    def _children(self, tag: str, wrapper, recursive: bool = False) -> list:
        """Wrap all NCX children (or descendants) named ``tag`` in document order."""
        path = ('.//' if recursive else '') + _ncx(tag)
        return [wrapper(node) for node in self.element.findall(path)]

    def _play_order(self) -> Optional[int]:
        """Read playOrder as an int; None when missing or empty."""
        raw = self.element.get('playOrder')
        return int(raw) if raw else None


class NCXText(NCXElement):
    """Represents a text element."""

    @property
    def text(self) -> str:
        """Get the text content ('' when the element has none)."""
        return self.element.text or ''

    @text.setter
    def text(self, value: str) -> None:
        """Set the text content."""
        self.element.text = value


class NCXContent(NCXElement):
    """Represents a content element."""

    @property
    def src(self) -> Optional[str]:
        """Get the src attribute, or None when it is absent."""
        return self.element.get('src')

    @src.setter
    def src(self, value: str) -> None:
        """Set the src attribute."""
        self.element.set('src', value)


class NCXNavLabel(NCXElement):
    """Represents a navLabel element."""

    @property
    def text_element(self) -> Optional[NCXText]:
        """Get the first direct ``text`` child, or None."""
        return self._child('text', NCXText)

    @property
    def text(self) -> str:
        """Get the label text ('' when there is no ``text`` child)."""
        text_elem = self.text_element
        return text_elem.text if text_elem is not None else ''

    @text.setter
    def text(self, value: str) -> None:
        """Set the label text, creating the ``text`` child when missing."""
        text_elem = self.text_element
        if text_elem is None:
            text_elem = NCXText(etree.SubElement(self.element, _ncx('text')))
        text_elem.text = value


class NCXNavPoint(NCXElement):
    """Represents a navPoint element in the navigation hierarchy."""

    @property
    def class_attr(self) -> Optional[str]:
        """Get the class attribute, or None when it is absent."""
        return self.element.get('class')

    @class_attr.setter
    def class_attr(self, value: str) -> None:
        """Set the class attribute."""
        self.element.set('class', value)

    @property
    def play_order(self) -> Optional[int]:
        """Get the playOrder attribute as an int, or None when unset."""
        return self._play_order()

    @play_order.setter
    def play_order(self, value: int) -> None:
        """Set the playOrder attribute."""
        self.element.set('playOrder', str(value))

    @property
    def nav_label(self) -> Optional[NCXNavLabel]:
        """Get the navLabel child element, or None."""
        return self._child('navLabel', NCXNavLabel)

    @property
    def content(self) -> Optional[NCXContent]:
        """Get the content child element, or None."""
        return self._child('content', NCXContent)

    @property
    def nav_points(self) -> List['NCXNavPoint']:
        """Get direct child navPoint elements."""
        return self._children('navPoint', NCXNavPoint)

    def add_nav_point(
        self,
        id: str,
        label_text: str,
        src: str,
        class_attr: Optional[str] = None,
        play_order: Optional[int] = None,
    ) -> 'NCXNavPoint':
        """Add a child navPoint with navLabel and content children.

        ``class_attr`` is written only when truthy; ``play_order`` only when
        not None.
        """
        return _append_nav_point(self.element, id, label_text, src, class_attr, play_order)

    @property
    def label_text(self) -> str:
        """Get the text of the navLabel ('' without one)."""
        nav_label = self.nav_label
        return nav_label.text if nav_label is not None else ''

    @property
    def content_src(self) -> str:
        """Get the src of the content element ('' when missing or unset)."""
        content = self.content
        return (content.src or '') if content is not None else ''


class NCXNavMap(NCXElement):
    """Represents the navMap element."""

    @property
    def nav_points(self) -> List[NCXNavPoint]:
        """Get all direct child navPoint elements."""
        return self._children('navPoint', NCXNavPoint)

    def add_nav_point(
        self,
        id: str,
        label_text: str,
        src: str,
        class_attr: Optional[str] = None,
        play_order: Optional[int] = None,
    ) -> NCXNavPoint:
        """Append a top-level navPoint with navLabel and content children.

        ``class_attr`` is written only when truthy; ``play_order`` only when
        not None.
        """
        return _append_nav_point(self.element, id, label_text, src, class_attr, play_order)

    def get_all_nav_points(self) -> List[NCXNavPoint]:
        """Get all navPoint descendants in document order."""
        return self._children('navPoint', NCXNavPoint, recursive=True)


class NCXPageTarget(NCXElement):
    """Represents a pageTarget element."""

    @property
    def type_attr(self) -> Optional[str]:
        """Get the type attribute, or None when it is absent."""
        return self.element.get('type')

    @type_attr.setter
    def type_attr(self, value: str) -> None:
        """Set the type attribute."""
        self.element.set('type', value)

    @property
    def value(self) -> Optional[str]:
        """Get the value attribute, or None when it is absent."""
        return self.element.get('value')

    @value.setter
    def value(self, value: str) -> None:
        """Set the value attribute."""
        self.element.set('value', value)

    @property
    def play_order(self) -> Optional[int]:
        """Get the playOrder attribute as an int, or None when unset."""
        return self._play_order()

    @play_order.setter
    def play_order(self, value: int) -> None:
        """Set the playOrder attribute."""
        self.element.set('playOrder', str(value))

    @property
    def nav_label(self) -> Optional[NCXNavLabel]:
        """Get the navLabel child element, or None."""
        return self._child('navLabel', NCXNavLabel)

    @property
    def content(self) -> Optional[NCXContent]:
        """Get the content child element, or None."""
        return self._child('content', NCXContent)

    @property
    def label_text(self) -> str:
        """Get the text of the navLabel ('' without one)."""
        nav_label = self.nav_label
        return nav_label.text if nav_label is not None else ''

    @property
    def content_src(self) -> str:
        """Get the src of the content element ('' when missing or unset)."""
        content = self.content
        return (content.src or '') if content is not None else ''


class NCXPageList(NCXElement):
    """Represents the pageList element."""

    @property
    def page_targets(self) -> List[NCXPageTarget]:
        """Get all direct pageTarget children."""
        return self._children('pageTarget', NCXPageTarget)

    def add_page_target(
        self,
        id: str,
        type_attr: str,
        value: str,
        label_text: str,
        src: str,
        play_order: Optional[int] = None,
    ) -> NCXPageTarget:
        """Append a pageTarget with navLabel and content children.

        ``play_order`` is written only when not None.
        """
        page_target = NCXPageTarget(etree.SubElement(self.element, _ncx('pageTarget')))
        page_target.id = id
        page_target.type_attr = type_attr
        page_target.value = value
        if play_order is not None:
            page_target.play_order = play_order
        _append_label_and_content(page_target.element, label_text, src)
        return page_target


class NCXNavTarget(NCXElement):
    """Represents a navTarget element."""

    @property
    def value(self) -> Optional[str]:
        """Get the value attribute, or None when it is absent."""
        return self.element.get('value')

    @value.setter
    def value(self, value: str) -> None:
        """Set the value attribute."""
        self.element.set('value', value)

    @property
    def class_attr(self) -> Optional[str]:
        """Get the class attribute, or None when it is absent."""
        return self.element.get('class')

    @class_attr.setter
    def class_attr(self, value: str) -> None:
        """Set the class attribute."""
        self.element.set('class', value)

    @property
    def play_order(self) -> Optional[int]:
        """Get the playOrder attribute as an int, or None when unset."""
        return self._play_order()

    @play_order.setter
    def play_order(self, value: int) -> None:
        """Set the playOrder attribute."""
        self.element.set('playOrder', str(value))

    @property
    def nav_label(self) -> Optional[NCXNavLabel]:
        """Get the navLabel child element, or None."""
        return self._child('navLabel', NCXNavLabel)

    @property
    def content(self) -> Optional[NCXContent]:
        """Get the content child element, or None."""
        return self._child('content', NCXContent)


class NCXNavList(NCXElement):
    """Represents the navList element."""

    @property
    def nav_label(self) -> Optional[NCXNavLabel]:
        """Get the navLabel child element, or None."""
        return self._child('navLabel', NCXNavLabel)

    @property
    def nav_targets(self) -> List[NCXNavTarget]:
        """Get all direct navTarget children."""
        return self._children('navTarget', NCXNavTarget)

    def add_nav_target(
        self, id: str, label_text: str, src: str, play_order: Optional[int] = None
    ) -> NCXNavTarget:
        """Append a navTarget with navLabel and content children.

        ``play_order`` is written only when not None.
        """
        nav_target = NCXNavTarget(etree.SubElement(self.element, _ncx('navTarget')))
        nav_target.id = id
        if play_order is not None:
            nav_target.play_order = play_order
        _append_label_and_content(nav_target.element, label_text, src)
        return nav_target

    @property
    def label_text(self) -> str:
        """Get the text of the navLabel ('' without one)."""
        nav_label = self.nav_label
        return nav_label.text if nav_label is not None else ''


class NCXDocument(NCXElement):
    """Represents the root ncx element."""

    @property
    def nav_map(self) -> Optional[NCXNavMap]:
        """Get the navMap child element, or None."""
        return self._child('navMap', NCXNavMap)

    @property
    def page_list(self) -> Optional[NCXPageList]:
        """Get the pageList child element, or None."""
        return self._child('pageList', NCXPageList)

    @property
    def nav_lists(self) -> List[NCXNavList]:
        """Get all direct navList children."""
        return self._children('navList', NCXNavList)

    def _doc_text(self, container: str) -> str:
        """Return the text of the first ``<container>/text`` descendant, or ''."""
        node = self.element.find(f'.//{_ncx(container)}/{_ncx("text")}')
        if node is None:
            return ''
        # An empty <text/> has text None; keep the declared str contract.
        return node.text or ''

    @property
    def title(self) -> str:
        """Get the docTitle text ('' when missing or empty)."""
        return self._doc_text('docTitle')

    @property
    def author(self) -> str:
        """Get the docAuthor text ('' when missing or empty)."""
        return self._doc_text('docAuthor')

    def _meta_content(self, name: str) -> Optional[str]:
        """Return the content of the first ``meta`` named ``name`` that has one."""
        for meta in self.element.findall('.//' + _ncx('meta')):
            if meta.get('name') == name:
                content = meta.get('content')
                if content is not None:
                    return content
        return None

    def get_uid(self) -> Optional[str]:
        """Get the dtb:uid meta content."""
        return self._meta_content('dtb:uid')

    def get_depth(self) -> Optional[int]:
        """Get the dtb:depth meta content as an int."""
        raw = self._meta_content('dtb:depth')
        return int(raw) if raw is not None else None

    def get_total_page_count(self) -> Optional[int]:
        """Get the dtb:totalPageCount meta content as an int."""
        raw = self._meta_content('dtb:totalPageCount')
        return int(raw) if raw is not None else None

    def get_max_page_number(self) -> Optional[int]:
        """Get the dtb:maxPageNumber meta content as an int."""
        raw = self._meta_content('dtb:maxPageNumber')
        return int(raw) if raw is not None else None
class Package:
    """
    Represents the parsed OPF package file of an EPUB.

    Attributes:
        xml_content (str): The raw XML content of the OPF package file.
        metadata (Metadata): The parsed metadata section of the OPF file.
        manifest (Manifest): The parsed manifest section listing all resources.
        spine (Spine): The parsed spine section defining the reading order.
        guide: Initialized to None; not populated by the parser.
        cover: Initialized to None; not populated by the parser.
        toc_href (str): Href of the NCX document (set for non-3.x versions).
        nav_href (str): Href of the navigation document (set for 3.x versions).
        version (packaging.version.Version): The parsed ``version`` attribute.
    """

    NAMESPACE = 'http://www.idpf.org/2007/opf'
    DC_NAMESPACE = 'http://purl.org/dc/elements/1.1/'

    METADATA_XPATH = f'.//{{{NAMESPACE}}}metadata'
    SPINE_XPATH = f'.//{{{NAMESPACE}}}spine'
    MANIFEST_XPATH = f'.//{{{NAMESPACE}}}manifest'
    ITEM_XPATH = f'.//{{{NAMESPACE}}}item'
    NCX_MEDIA_TYPE = 'application/x-dtbncx+xml'

    TITLE_XPATH = f'.//{{{DC_NAMESPACE}}}title'
    CREATOR_XPATH = f'.//{{{DC_NAMESPACE}}}creator'
    IDENTIFIER_XPATH = f'.//{{{DC_NAMESPACE}}}identifier'

    def __init__(self, xml_content: str) -> None:
        """
        Initialize the Package by parsing the OPF package file.

        Args:
            xml_content (str): The raw XML content of the OPF package file.
        """
        self.xml_content = xml_content

        self.metadata = None
        self.manifest = None
        self.spine = None
        self.guide = None
        self.cover = None
        self.toc_href = None
        self.nav_href = None
        self.version = None

        self._parse(xml_content)

        self._printer = XMLPrinter(self)

    def __str__(self) -> str:
        return self.xml_content

    def to_str(self, *args, **kwargs) -> str:
        """Delegate to the XML printer's ``to_str``."""
        return self._printer.to_str(*args, **kwargs)

    def to_xml(self, *args, **kwargs) -> str:
        """Delegate to the XML printer's ``to_xml``."""
        return self._printer.to_xml(*args, **kwargs)

    def _parse(self, xml_content: str) -> None:
        """
        Parses the OPF package file: version, metadata, manifest, spine and
        the navigation/TOC href.

        Args:
            xml_content (str): The raw XML content of the OPF package file.

        Raises:
            ParseError: If the XML is invalid or cannot be parsed.
            InvalidEPUBError: If required OPF elements are missing.
            UnsupportedFormatError: If the EPUB version is not supported.
        """
        try:
            if isinstance(xml_content, str):
                xml_content = xml_content.encode('utf-8')
            root = etree.fromstring(xml_content)

            if 'version' not in root.attrib:
                raise InvalidEPUBError(
                    "OPF file missing required 'version' attribute",
                    suggestions=[
                        'Ensure the package element has a version attribute',
                        'Check that this is a valid EPUB OPF file',
                        'Verify the EPUB was created with compliant tools',
                    ],
                )
            self.version = self._parse_version(root.attrib['version'])

            metadata_el = root.find(self.METADATA_XPATH)
            if metadata_el is None:
                raise InvalidEPUBError(
                    'OPF file missing required metadata element',
                    suggestions=[
                        'Ensure the OPF file contains a metadata section',
                        'Check the EPUB package structure',
                        'Verify all required OPF elements are present',
                    ],
                )
            self.metadata = Metadata(etree.tostring(metadata_el, encoding='unicode'))

            manifest_el = root.find(self.MANIFEST_XPATH)
            if manifest_el is None:
                raise InvalidEPUBError(
                    'OPF file missing required manifest element',
                    suggestions=[
                        'Ensure the OPF file contains a manifest section',
                        'Check that all resources are declared in the manifest',
                        'Verify the EPUB package structure is complete',
                    ],
                )
            self.manifest = Manifest(etree.tostring(manifest_el, encoding='unicode'))

            spine_el = root.find(self.SPINE_XPATH)
            if spine_el is None:
                raise InvalidEPUBError(
                    'OPF file missing required spine element',
                    suggestions=[
                        'Ensure the OPF file contains a spine section',
                        'Check that reading order is defined in the spine',
                        'Verify the EPUB package structure is complete',
                    ],
                )
            self.spine = Spine(etree.tostring(spine_el, encoding='unicode'))

            # Version 3 packages point at a navigation document; every other
            # supported version points at an NCX file.
            if self.version.major == 3:
                self.nav_href = self._find_nav_href(root)
            else:
                self.toc_href = self._find_toc_href(root)

        except etree.ParseError as e:
            raise ParseError(
                f'Invalid XML in OPF file: {str(e)}',
                suggestions=[
                    'Check that the OPF file contains valid XML',
                    'Verify the file is not corrupted',
                    'Ensure all XML tags are properly closed',
                    'Check for invalid characters in the XML',
                ],
            ) from e

    def _get_text(self, root: etree.Element, xpath: str) -> str:
        """
        Helper method to extract text content from an XML element.

        Args:
            root (etree.Element): The root element to search within.
            xpath (str): The XPath expression to locate the element.

        Returns:
            str: The stripped text content of the element, or None if the
                element is not found or has no text.
        """
        element = root.find(xpath)
        if element is None or not element.text:
            return None
        return element.text.strip()

    def _find_toc_href(self, root: etree.Element) -> str:
        """
        Find the publication navigation control file.

        The first manifest item with the NCX media type wins; otherwise the
        item referenced by the spine's ``toc`` attribute is used.

        Args:
            root (etree.Element): The root element of the OPF document.

        Returns:
            str: The href to the NCX document, or None if not found.
        """
        items = root.findall(self.ITEM_XPATH)

        for item in items:
            if item.get('media-type') == self.NCX_MEDIA_TYPE:
                return item.get('href')

        spine = root.find(self.SPINE_XPATH)
        if spine is not None:
            toc_id = spine.get('toc')
            if toc_id:
                for item in items:
                    if item.get('id') == toc_id:
                        href = item.get('href')
                        if href:
                            # Remove fragment identifier if present
                            return href.split('#')[0]

        return None

    def _find_nav_href(self, root: etree.Element) -> str:
        """
        Find the publication navigation file.

        Args:
            root (etree.Element): The root element of the OPF document.

        Returns:
            str: The href to navigation file (fragment removed), or None if
                not found.
        """
        # ``properties`` is a whitespace-separated list, so look for the
        # ``nav`` token instead of comparing the whole attribute value.
        for item in root.findall(self.ITEM_XPATH):
            if 'nav' in (item.get('properties') or '').split():
                href = item.get('href')
                if href:
                    return href.split('#')[0]

        # Fall back to guide TOC reference
        guide = root.find(f'.//{{{self.NAMESPACE}}}guide')
        if guide is not None:
            for reference in guide.findall(f'.//{{{self.NAMESPACE}}}reference'):
                if reference.get('type') == 'toc':
                    href = reference.get('href')
                    if href:
                        return href.split('#')[0]

        return None

    def _parse_version(self, version):
        """
        Parse and validate the EPUB version.

        Args:
            version (str): Version string from the OPF file.

        Returns:
            packaging.version.Version: Parsed version object.

        Raises:
            InvalidEPUBError: If the version string cannot be parsed.
            UnsupportedFormatError: If the major version is not 1, 2 or 3.
        """
        try:
            version_obj = packaging.version.Version(version)
        except packaging.version.InvalidVersion as e:
            raise InvalidEPUBError(
                f"Invalid version format in OPF file: '{version}'",
                suggestions=[
                    "Ensure the version follows semantic versioning (e.g., '3.0', '2.0')",
                    'Check that the version attribute is correctly formatted',
                    'Verify the EPUB was created with compliant tools',
                ],
            ) from e

        if version_obj.major not in (1, 2, 3):
            supported_versions = '1.x, 2.x, 3.x'
            raise UnsupportedFormatError(
                f'EPUB version {version_obj.major}.x is not supported',
                epub_version=str(version_obj),
                suggestions=[
                    f'Use an EPUB with a supported version ({supported_versions})',
                    'Convert the EPUB to a supported version',
                    'Check the EPUB specification for version requirements',
                ],
            )

        return version_obj


class Manifest:
    """
    Represents the manifest section of an EPUB package document.
    The manifest element provides an exhaustive list of the publication resources.

    Attributes:
        xml_content (str): The raw XML of the manifest element.
        items (list): One dict per item with keys ``id``, ``href``,
            ``media_type`` and ``properties`` (a list of tokens).
    """

    NAMESPACE = 'http://www.idpf.org/2007/opf'
    ITEM_XPATH = f'.//{{{NAMESPACE}}}item'

    def __init__(self, xml_content: str):
        self.xml_content = xml_content
        self.items = []

        self._parse(xml_content)

        self._printer = XMLPrinter(self)

    def __str__(self) -> str:
        return self.xml_content

    def to_str(self, *args, **kwargs) -> str:
        """Delegate to the XML printer's ``to_str``."""
        return self._printer.to_str(*args, **kwargs)

    def to_xml(self, *args, **kwargs) -> str:
        """Delegate to the XML printer's ``to_xml``."""
        return self._printer.to_xml(*args, **kwargs)

    def _parse(self, xml_content: str) -> None:
        """
        Parses the manifest XML content into ``self.items``.

        Items missing any of ``id``, ``href`` or ``media-type`` are skipped.

        Raises:
            ParseError: If the XML cannot be parsed.
        """
        try:
            if isinstance(xml_content, str):
                xml_content = xml_content.encode('utf-8')
            root = etree.fromstring(xml_content)

            for item in root.findall(self.ITEM_XPATH):
                item_id = item.get('id')
                href = item.get('href')
                media_type = item.get('media-type')
                if item_id is None or href is None or media_type is None:
                    continue
                self.items.append(
                    {
                        'id': item_id,
                        'href': href,
                        'media_type': media_type,
                        'properties': item.get('properties', '').split(),
                    }
                )

        except etree.ParseError as e:
            raise ParseError(
                f'Invalid XML in manifest element: {str(e)}',
                element_name='manifest',
                suggestions=[
                    'Check that the manifest contains valid XML',
                    'Verify all manifest items are properly formatted',
                    'Ensure required attributes (id, href, media-type) are present',
                    'Check for invalid characters in the XML',
                ],
            ) from e

    def find_by_property(self, property_name: str) -> dict:
        """Find the first item with the given property, or None."""
        return next((item for item in self.items if property_name in item['properties']), None)

    def find_by_id(self, item_id: str) -> dict:
        """Find an item by its ID, or None."""
        return next((item for item in self.items if item['id'] == item_id), None)

    def find_by_media_type(self, media_type: str) -> list:
        """Find all items with the given media type."""
        return [item for item in self.items if item['media_type'] == media_type]
class Metadata:
    """
    Represents the metadata section of an EPUB package document.
    Handles Dublin Core (DC) and Dublin Core Terms (DCTERMS) metadata elements.

    Parsed values are stored in ``fields`` keyed by the element's local name.
    A name that occurs once maps to a string; a repeated name maps to a list
    of strings in document order. Fields are also readable as attributes
    (``metadata.title``); unknown public names read as None.
    """

    DC_NAMESPACE = 'http://purl.org/dc/elements/1.1/'
    DCTERMS_NAMESPACE = 'http://purl.org/dc/terms/'
    OPF_NAMESPACE = 'http://www.idpf.org/2007/opf'
    REQUIRED_FIELDS = ['identifier', 'title', 'creator']
    NSMAP = {'dc': DC_NAMESPACE, 'dcterms': DCTERMS_NAMESPACE}

    def __init__(self, xml_content: str):
        self.xml_content = xml_content
        self.fields = {}
        self._parse(xml_content)
        self._printer = XMLPrinter(self)

    def _parse(self, xml_content: str) -> None:
        """
        Parse the metadata XML and populate ``fields``.

        Collects every non-empty element in the DC and DCTERMS namespaces,
        then every ``<meta property="dcterms:...">`` refinement.

        Raises:
            ParseError: If the content is not well-formed XML.
        """
        try:
            # Parse from bytes so an encoding declaration in the text is accepted.
            if isinstance(xml_content, str):
                xml_content = xml_content.encode('utf-8')
            root = etree.fromstring(xml_content)
        except etree.ParseError as e:
            raise ParseError(
                f'Invalid XML in metadata element: {str(e)}',
                element_name='metadata',
                suggestions=[
                    'Check that the metadata contains valid XML',
                    'Verify all metadata elements are properly formatted',
                    'Ensure required Dublin Core elements are present',
                    'Check for invalid characters in metadata values',
                ],
            ) from e

        for ns_uri in self.NSMAP.values():
            for element in root.findall(f'.//{{{ns_uri}}}*'):
                name = element.tag.split('}')[-1]
                text = element.text.strip() if element.text else None
                if text:
                    self._add_field(name, text)

        # <meta> is un-namespaced in a bare fragment, but it lives in the OPF
        # namespace when the metadata element is serialized out of a package
        # document that declares the OPF default namespace. Match both forms.
        meta_paths = (
            './/meta[@property]',
            f'.//{{{self.OPF_NAMESPACE}}}meta[@property]',
        )
        for path in meta_paths:
            for meta in root.findall(path):
                prop = meta.get('property', '')
                if prop.startswith('dcterms:'):
                    name = prop.split(':')[1]
                    text = meta.text.strip() if meta.text else None
                    if text:
                        self._add_field(name, text)

        self._validate()

    def _add_field(self, name: str, value: str) -> None:
        """Store a value, promoting the entry to a list on the second occurrence."""
        if name not in self.fields:
            self.fields[name] = value
            return
        existing = self.fields[name]
        if isinstance(existing, list):
            existing.append(value)
        else:
            self.fields[name] = [existing, value]

    def _validate(self, raise_exception=False) -> None:
        """
        Validate all required fields and raise ValidationError if validation fails.

        Args:
            raise_exception: When False (the default) failures are collected
                but not reported; when True a ValidationError is raised if
                any field in REQUIRED_FIELDS is missing or blank.

        Raises:
            ValidationError: If ``raise_exception`` is True and validation fails.
        """
        errors = {}
        for field in self.REQUIRED_FIELDS:
            try:
                self._validate_field(field)
            except ValueError as e:
                errors[field] = str(e)

        if errors and raise_exception:
            validation_errors = [f"Missing or invalid '{field}' element" for field in errors]
            raise ValidationError(
                'EPUB metadata validation failed',
                validation_errors=validation_errors,
                suggestions=[
                    'Ensure all required Dublin Core metadata elements are present',
                    'Check that metadata values are not empty',
                    'Verify the metadata follows EPUB specification requirements',
                    'Use proper Dublin Core namespace for metadata elements',
                ],
            )

    def _validate_field(self, field_name: str) -> None:
        """
        Validate an individual field.

        Args:
            field_name: Name of the field to validate

        Raises:
            ValueError: If the field validation fails
        """
        value = self.fields.get(field_name)
        if value is None or (isinstance(value, str) and not value.strip()):
            raise ValueError('This field is required')

    def __str__(self) -> str:
        return self.xml_content

    def to_str(self, *args, **kwargs) -> str:
        """Return the metadata XML as a string (see XMLPrinter.to_str)."""
        return self._printer.to_str(*args, **kwargs)

    def to_xml(self, *args, **kwargs) -> str:
        """Return the metadata XML formatted for display (see XMLPrinter.to_xml)."""
        return self._printer.to_xml(*args, **kwargs)

    def _get_text(self, root: etree.Element, xpath: str) -> str:
        """Return the stripped text of the first match of ``xpath``, or None."""
        element = root.find(xpath)
        return element.text.strip() if element is not None and element.text else None

    def __getattr__(self, name: str) -> str:
        """
        Expose parsed fields as attributes; unknown public names read as None.

        Raises:
            AttributeError: For private/dunder names and for ``fields`` itself.
        """
        # Only reached when normal lookup fails. Private/dunder probes (copy,
        # pickle, hasattr checks) must get AttributeError, and `fields` must
        # never be resolved through itself: on an instance where __init__ has
        # not run yet that would recurse without end.
        if name.startswith('_') or name == 'fields':
            raise AttributeError(name)
        return self.fields.get(name)

    def to_kv(self) -> str:
        """Return the fields as right-aligned ``key: value`` lines ('' when empty)."""
        if not self.fields:
            return ''

        max_key_length = max(len(k) for k in self.fields)
        lines = [f'{k.rjust(max_key_length)}: {str(v)}' for k, v in self.fields.items()]
        return '\n'.join(lines)
class Spine:
    """
    Represents the spine section of an EPUB package document.

    The spine element defines the default reading order of the content.
    Each entry of ``itemrefs`` is a dict with the keys ``idref`` (str),
    ``linear`` (bool) and ``properties`` (list of tokens).
    """

    NAMESPACE = 'http://www.idpf.org/2007/opf'
    ITEMREF_XPATH = f'.//{{{NAMESPACE}}}itemref'

    def __init__(self, xml_content: str):
        self.xml_content = xml_content
        self.itemrefs = []
        self.toc = None
        self.page_progression_direction = None
        self._parse(xml_content)
        self._printer = XMLPrinter(self)

    def __str__(self) -> str:
        return self.xml_content

    def to_str(self, *args, **kwargs) -> str:
        """Return the spine XML as a string (see XMLPrinter.to_str)."""
        return self._printer.to_str(*args, **kwargs)

    def to_xml(self, *args, **kwargs) -> str:
        """Return the spine XML formatted for display (see XMLPrinter.to_xml)."""
        return self._printer.to_xml(*args, **kwargs)

    def _parse(self, xml_content: str) -> None:
        """
        Parses the spine XML content.

        Reads the ``toc`` and ``page-progression-direction`` attributes of the
        root element and appends one entry per itemref that has an idref.

        Raises:
            ParseError: If the content is not well-formed XML.
        """
        try:
            raw = xml_content.encode('utf-8') if isinstance(xml_content, str) else xml_content
            spine_el = etree.fromstring(raw)

            self.toc = spine_el.get('toc')
            self.page_progression_direction = spine_el.get(
                'page-progression-direction', 'default'
            )

            for ref in spine_el.findall(self.ITEMREF_XPATH):
                target = ref.get('idref')
                is_linear = ref.get('linear', 'yes') == 'yes'
                tokens = ref.get('properties', '').split()
                if not target:
                    # An itemref without an idref points at nothing; skip it.
                    continue
                self.itemrefs.append(
                    {'idref': target, 'linear': is_linear, 'properties': tokens}
                )

        except etree.ParseError as e:
            raise ParseError(
                f'Invalid XML in spine element: {str(e)}',
                element_name='spine',
                suggestions=[
                    'Check that the spine contains valid XML',
                    'Verify all spine items are properly formatted',
                    'Ensure required attributes (idref) are present',
                    'Check that spine defines the reading order correctly',
                ],
            ) from e

    def find_by_idref(self, itemref_idref: str) -> dict:
        """Find an itemref by its idref, or None if no itemref has that idref."""
        matches = (entry for entry in self.itemrefs if entry['idref'] == itemref_idref)
        return next(matches, None)
def highlight_xml(xml_content: str) -> str:
    """Return ``xml_content`` syntax-highlighted for a terminal via pygments."""
    return highlight(xml_content, XmlLexer(), TerminalFormatter())


def pretty_print_xml(xml_content: str) -> str:
    """
    Re-indent an XML document while keeping its XML declaration and DOCTYPE.

    Args:
        xml_content: The document as text, or as UTF-8 encoded bytes.

    Returns:
        The re-indented document as text. If the content cannot be parsed
        the original text is returned unchanged.
    """
    if isinstance(xml_content, bytes):
        xml_content_bytes = xml_content
        original_content = xml_content.decode('utf-8')
    else:
        original_content = xml_content
        # Parse from bytes so an encoding declaration in the text is accepted.
        xml_content_bytes = xml_content.encode('utf-8')

    # The parser output below carries neither the XML declaration nor the
    # DOCTYPE, so both are lifted from the original text and re-attached.
    xml_declaration = ''
    doctype_declaration = ''

    if original_content.strip().startswith('<?xml'):
        xml_decl_end = original_content.find('?>') + 2
        xml_declaration = original_content[:xml_decl_end]

    doctype_start = original_content.find('<!DOCTYPE')
    if doctype_start != -1:
        doctype_end = original_content.find('>', doctype_start) + 1
        doctype_declaration = original_content[doctype_start:doctype_end]

    try:
        parser = etree.XMLParser(remove_blank_text=True)
        root = etree.fromstring(xml_content_bytes, parser)
    except etree.ParseError:
        # Best effort: unparseable content is handed back as it came in.
        return original_content

    pretty_xml = etree.tostring(root, pretty_print=True, encoding='unicode')

    result = ''
    if xml_declaration:
        result += xml_declaration + '\n'
    if doctype_declaration:
        result += doctype_declaration + '\n'
    result += pretty_xml

    return result


def print_to_str(xml_content: str, pretty_print: bool) -> str:
    """Return ``xml_content``, re-indented first when ``pretty_print`` is true."""
    if pretty_print:
        xml_content = pretty_print_xml(xml_content)

    return xml_content


def print_to_xml(xml_content: str, pretty_print: bool, highlight_syntax: bool) -> str:
    """
    Return ``xml_content`` optionally re-indented and syntax-highlighted.

    Pretty printing is applied before highlighting.
    """
    if pretty_print:
        xml_content = pretty_print_xml(xml_content)

    if highlight_syntax:
        xml_content = highlight_xml(xml_content)

    return xml_content


class XMLPrinter:
    """Handles XML printing operations for objects with xml_content."""

    def __init__(self, xml_content_provider):
        """
        Initialize the XMLPrinter with an object that provides xml_content.

        Args:
            xml_content_provider: Object that has an xml_content attribute
        """
        # The provider is kept (not a copy of its content) so the attribute
        # is read at call time by to_str/to_xml.
        self._xml_content_provider = xml_content_provider

    def to_str(self, pretty_print: bool = False) -> str:
        """
        Get string representation of the XML content.

        Args:
            pretty_print: Whether to format the XML with proper indentation

        Returns:
            String representation of the XML content
        """
        return print_to_str(self._xml_content_provider.xml_content, pretty_print)

    def to_xml(self, pretty_print: bool = False, highlight_syntax: bool = True) -> str:
        """
        Get formatted XML representation with optional syntax highlighting.

        Args:
            pretty_print: Whether to format the XML with proper indentation
            highlight_syntax: Whether to apply syntax highlighting

        Returns:
            Formatted XML string with optional syntax highlighting
        """
        return print_to_xml(self._xml_content_provider.xml_content, pretty_print, highlight_syntax)
python_files = tests.py test_*.py *_tests.py addopts = -p no:warnings ================================================ FILE: requirements/requirements-docs.txt ================================================ sphinx==6.2.0 sphinx-copybutton==0.5.1 sphinx-issues==3.0.1 furo==2022.12.7 ================================================ FILE: requirements/requirements-linting.txt ================================================ ruff==0.11.9 ================================================ FILE: requirements/requirements-testing.txt ================================================ coverage==6.4.1 coverage-badge==1.1.0 pytest==7.2.0 pytest-cov==3.0.0 ================================================ FILE: requirements/requirements.txt ================================================ click==8.1.8 lxml==5.4.0 pygments==2.19.1 PyYAML==6.0.2 ================================================ FILE: requirements.txt ================================================ -r requirements/requirements-docs.txt -r requirements/requirements-linting.txt -r requirements/requirements-testing.txt -r requirements/requirements.txt ================================================ FILE: ruff.toml ================================================ line-length = 100 [format] quote-style = "single" indent-style = "tab" docstring-code-format = true ================================================ FILE: setup.py ================================================ import os from setuptools import find_packages, setup VERSION = '0.1.0a1' def get_long_description(): with open( os.path.join(os.path.dirname(os.path.abspath(__file__)), 'README.md'), encoding='utf8', ) as fp: return fp.read() setup( name='epub-utils', description='A Python CLI and utility library for manipulating EPUB files', long_description=get_long_description(), long_description_content_type='text/markdown', author='Ernesto González', url='https://github.com/ernestofgonzalez/epub-utils', project_urls={ 'Source code': 
'https://github.com/ernestofgonzalez/epub-utils', 'Issues': 'https://github.com/ernestofgonzalez/epub-utils/issues', 'CI': 'https://github.com/ernestofgonzalez/epub-utils/actions', 'Changelog': 'https://github.com/ernestofgonzalez/epub-utils/releases', }, license='Apache License, Version 2.0', version=VERSION, packages=find_packages(), entry_points={ 'console_scripts': [ 'epub-utils = epub_utils.cli:main', ] }, install_requires=[ 'click', 'lxml', 'packaging', 'pygments', 'PyYAML', ], extras_require={ 'test': ['pytest'], 'docs': [ 'sphinx', 'sphinx-copybutton', 'sphinx-issues', 'furo', ], }, python_requires='>=3.8', classifiers=[ 'Intended Audience :: Developers', 'Topic :: Software Development :: Libraries', 'Topic :: Utilities', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13', 'Operating System :: Microsoft :: Windows', 'Operating System :: POSIX', 'Operating System :: Unix', 'Operating System :: MacOS', ], ) ================================================ FILE: tests/conftest.py ================================================ import pytest @pytest.fixture def doc_path(): path = str('tests/assets/roads.epub') return path ================================================ FILE: tests/test_cli.py ================================================ import pytest from click.testing import CliRunner from epub_utils import cli @pytest.mark.parametrize( 'options', ( ['-h'], ['--help'], ), ) def test_help(options): result = CliRunner().invoke(cli.main, options) assert result.exit_code == 0 assert result.output.startswith('Usage: ') assert '-h, --help' in result.output @pytest.mark.parametrize( 'options', ( ['-v'], ['--version'], ), ) def test_version(options): result = CliRunner().invoke(cli.main, options) assert result.exit_code == 0 assert result.output.strip() == cli.VERSION 
# The tests below run the CLI in-process through click's CliRunner against the
# sample EPUB supplied by the `doc_path` fixture.


def test_files_command_with_file_path_xhtml_xml(doc_path):
    """Test the files command with XHTML file path in XML format."""
    result = CliRunner().invoke(
        cli.main, [str(doc_path), 'files', 'GoogleDoc/Roads.xhtml', '--format', 'xml']
    )
    assert result.exit_code == 0
    # Only non-emptiness is checked; the exact rendering is not pinned here.
    assert len(result.output) > 0


def test_files_command_with_file_path_missing_file(doc_path):
    """Test the files command with missing file path."""
    result = CliRunner().invoke(cli.main, [str(doc_path), 'files', 'nonexistent/file.xhtml'])
    # A path that is not in the archive must fail with exit code 1 and a
    # message containing 'Missing'.
    assert result.exit_code == 1
    assert 'Missing' in result.output


def test_files_command_without_file_path_table(doc_path):
    """Test the files command without file path (list files) in table format."""
    result = CliRunner().invoke(cli.main, [str(doc_path), 'files', '--format', 'table'])
    assert result.exit_code == 0
    assert len(result.output) > 0
    # The table output is expected to carry these column headers.
    assert 'Path' in result.output
    assert 'Size' in result.output


def test_files_command_without_file_path_raw(doc_path):
    """Test the files command without file path (list files) in raw format."""
    result = CliRunner().invoke(cli.main, [str(doc_path), 'files', '--format', 'raw'])
    assert result.exit_code == 0
    assert len(result.output) > 0
    # The raw listing must mention the sample book's content document.
    assert 'GoogleDoc/Roads.xhtml' in result.output


def test_toc_command_default(doc_path):
    """Test the toc command with default behavior (auto-detect)."""
    result = CliRunner().invoke(cli.main, [str(doc_path), 'toc'])
    assert result.exit_code == 0
    assert len(result.output) > 0


def test_toc_command_nav_flag(doc_path):
    """Test the toc command with --nav flag."""
    result = CliRunner().invoke(cli.main, [str(doc_path), 'toc', '--nav'])
    assert result.exit_code == 0
    assert len(result.output) > 0


def test_toc_command_mutually_exclusive_flags(doc_path):
    """Test that --ncx and --nav flags are mutually exclusive."""
    result = CliRunner().invoke(cli.main, [str(doc_path), 'toc', '--ncx', '--nav'])
    # Passing both flags is a usage error: exit code 1 plus an explanatory message.
    assert result.exit_code == 1
    assert '--ncx and --nav flags cannot be used together' in result.output
================================================ FILE: tests/test_container.py ================================================ import pytest from epub_utils.container import Container from epub_utils.exceptions import InvalidEPUBError CONTAINER_XML = """ """ def test_container_initialization(): """ Test that the Container class initializes correctly with valid XML content. """ container = Container(CONTAINER_XML) assert container is not None assert container.rootfile_path == 'OEBPS/content.opf' def test_invalid_container_xml(): """ Test that the Container class raises an error for invalid XML content. """ invalid_xml = '' with pytest.raises(InvalidEPUBError, match='Invalid container.xml: Missing rootfile element'): Container(invalid_xml) @pytest.mark.parametrize( 'xml_content,pretty_print,expected', [ ( '\n\n \n\n \n \n', False, '\n\n \n\n \n \n', ), ( '\n\n \n\n \n \n', True, '\n\n \n \n \n\n', ), ], ) def test_container_to_str_pretty_print_parameter(xml_content, pretty_print, expected): """Test XML output with and without pretty printing for Container.""" container = Container(xml_content) assert container.to_str(pretty_print=pretty_print) == expected ================================================ FILE: tests/test_doc.py ================================================ import unittest from epub_utils.container import Container from epub_utils.doc import Document from epub_utils.navigation import EPUBNavDocNavigation, Navigation from epub_utils.package import Manifest, Package def test_document_container(doc_path): """ Test that the Document class correctly parses the container.xml file. """ doc = Document(doc_path) assert isinstance(doc.container, Container) def test_document_package(doc_path): """ Test that the Document class correctly parses the package file. 
""" case = unittest.TestCase() doc = Document(doc_path) assert isinstance(doc.package, Package) assert isinstance(doc.package.manifest, Manifest) case.assertCountEqual( doc.package.manifest.items, [ { 'id': 'toc', 'href': 'nav.xhtml', 'media_type': 'application/xhtml+xml', 'properties': ['nav'], }, { 'id': 'main', 'href': 'Roads.xhtml', 'media_type': 'application/xhtml+xml', 'properties': [], }, ], ) def test_document_toc(doc_path): """ Test that the Document class correctly parses the table of contents file. """ doc = Document(doc_path) assert isinstance(doc.toc, Navigation) def test_document_find_content_by_id(doc_path): doc = Document(doc_path) content = doc.find_content_by_id('main') assert content is not None def test_document_get_file_by_path_xhtml(doc_path): """ Test that the Document class can retrieve XHTML files by path. """ doc = Document(doc_path) content = doc.get_file_by_path('GoogleDoc/Roads.xhtml') # Should return XHTMLContent object for XHTML files assert hasattr(content, 'to_str') assert hasattr(content, 'to_xml') assert hasattr(content, 'to_plain') # Content should not be empty content_str = content.to_str() assert len(content_str) > 0 assert 'xhtml' in content_str.lower() def test_document_get_file_by_path_missing_file(doc_path): """ Test that the Document class raises an error for missing files. """ doc = Document(doc_path) try: doc.get_file_by_path('nonexistent/file.xhtml') assert False, 'Expected ValueError for missing file' except ValueError as e: assert 'Missing' in str(e) def test_document_nav_property(doc_path): """ Test that the Document class correctly accesses the Navigation Document via nav property. 
""" doc = Document(doc_path) nav = doc.nav assert nav is not None assert isinstance(nav, EPUBNavDocNavigation) ================================================ FILE: tests/test_manifest.py ================================================ import pytest from epub_utils.package.manifest import Manifest VALID_MANIFEST_XML = """ """ MINIMAL_MANIFEST_XML = """ """ def test_manifest_initialization(): manifest = Manifest(VALID_MANIFEST_XML) assert len(manifest.items) == 4 assert manifest.items[0]['id'] == 'nav' assert manifest.items[0]['href'] == 'nav.xhtml' assert manifest.items[0]['media_type'] == 'application/xhtml+xml' assert manifest.items[0]['properties'] == ['nav'] assert manifest.items[2]['id'] == 'style' assert manifest.items[2]['href'] == 'style.css' assert manifest.items[2]['media_type'] == 'text/css' assert manifest.items[2]['properties'] == [] def test_minimal_manifest(): manifest = Manifest(MINIMAL_MANIFEST_XML) assert len(manifest.items) == 1 assert manifest.items[0]['id'] == 'content' assert manifest.items[0]['href'] == 'content.xhtml' assert manifest.items[0]['media_type'] == 'application/xhtml+xml' assert manifest.items[0]['properties'] == [] def test_find_by_property(): manifest = Manifest(VALID_MANIFEST_XML) nav_item = manifest.find_by_property('nav') assert nav_item['id'] == 'nav' assert nav_item['href'] == 'nav.xhtml' def test_find_by_id(): manifest = Manifest(VALID_MANIFEST_XML) chapter = manifest.find_by_id('chapter1') assert chapter['href'] == 'chapter1.xhtml' assert chapter['media_type'] == 'application/xhtml+xml' def test_find_by_media_type(): manifest = Manifest(VALID_MANIFEST_XML) xhtml_items = manifest.find_by_media_type('application/xhtml+xml') assert len(xhtml_items) == 2 assert all(item['media_type'] == 'application/xhtml+xml' for item in xhtml_items) @pytest.mark.parametrize( 'xml_content,pretty_print,expected', [ ( '\n \n\n \n', False, '\n \n\n \n', ), ( '\n \n\n \n', True, '\n \n \n\n', ), ], ) def 
test_manifest_to_str_pretty_print_parameter(xml_content, pretty_print, expected): """Test XML output with and without pretty printing for Manifest.""" manifest = Manifest(xml_content) assert manifest.to_str(pretty_print=pretty_print) == expected ================================================ FILE: tests/test_metadata.py ================================================ import pytest from epub_utils.exceptions import ValidationError from epub_utils.package.metadata import Metadata VALID_METADATA_XML = """ Test Book Test Author test-id-123 en Fiction Science Fiction 2024-01-01 Test Publisher 2023-11-28T14:50:13Z Original Source """ INVALID_METADATA_XML = """ Test Book Test Author """ def test_metadata_parse_valid_element(): """Test parsing valid metadata XML with both required and optional DC terms.""" metadata = Metadata(VALID_METADATA_XML) assert metadata.title == 'Test Book' assert metadata.creator == 'Test Author' assert metadata.identifier == 'test-id-123' assert metadata.language == 'en' assert metadata.subject == ['Fiction', 'Science Fiction'] assert metadata.date == '2024-01-01' assert metadata.publisher == 'Test Publisher' assert metadata.modified == '2023-11-28T14:50:13Z' assert metadata.source == 'Original Source' def test_metadata_validate_missing_identifier_with_raise_exception(): """Test that parsing metadata without identifier raises error.""" with pytest.raises(ValidationError): Metadata(INVALID_METADATA_XML)._validate(raise_exception=True) @pytest.mark.parametrize( 'xml_content,pretty_print,expected', [ ( '\n Test Book\n\n Test Author\n\n test-id-123\n', False, '\n Test Book\n\n Test Author\n\n test-id-123\n', ), ( '\n Test Book\n\n Test Author\n\n test-id-123\n', True, '\n Test Book\n Test Author\n test-id-123\n\n', ), ], ) def test_metadata_to_str_pretty_print_parameter(xml_content, pretty_print, expected): """Test XML output with and without pretty printing for Metadata.""" metadata = Metadata(xml_content) assert 
metadata.to_str(pretty_print=pretty_print) == expected ================================================ FILE: tests/test_nav_navigation.py ================================================ import pytest from epub_utils.navigation.nav import EPUBNavDocNavigation NAV_XML = """ Navigation Document """ def test_nav_doc_navigation_initialization(): """Test that the EPUBNavDocNavigation class initializes correctly.""" nav = EPUBNavDocNavigation(NAV_XML, 'application/xhtml+xml', 'nav.xhtml') assert nav is not None assert nav.xml_content == NAV_XML assert nav.media_type == 'application/xhtml+xml' assert nav.href == 'nav.xhtml' assert nav.xmlns == 'http://www.w3.org/1999/xhtml' assert nav.lang == 'en' def test_nav_doc_navigation_interface(): """Test the new navigation interface methods.""" nav = EPUBNavDocNavigation(NAV_XML, 'application/xhtml+xml', 'nav.xhtml') # Test get_toc_items toc_items = nav.get_toc_items() assert len(toc_items) == 1 item = toc_items[0] assert item.id == 'ch1' assert item.label == 'Chapter 1' assert item.target == 'chapter1.xhtml' assert item.order == 1 assert item.level == 0 # Test get_page_list (should be empty for this sample) page_list = nav.get_page_list() assert len(page_list) == 0 # Test get_landmarks (should be empty for this sample) landmarks = nav.get_landmarks() assert len(landmarks) == 0 # Test find_item_by_id found_item = nav.find_item_by_id('ch1') assert found_item is not None assert found_item.label == 'Chapter 1' # Test find_items_by_target found_items = nav.find_items_by_target('chapter1.xhtml') assert len(found_items) == 1 assert found_items[0].id == 'ch1' def test_nav_doc_navigation_toc_items_as_dicts(): """Test hierarchical navigation structure.""" nav_xml_hierarchical = """ Navigation Document """ nav = EPUBNavDocNavigation(nav_xml_hierarchical, 'application/xhtml+xml', 'nav.xhtml') toc_items = nav.get_toc_items_as_dicts() assert toc_items == [ { 'id': 'ch1', 'label': 'Chapter 1', 'target': 'chapter1.xhtml', 'order': 1, 'level': 
0, 'type': None, 'children': [ { 'id': 'ch1-1', 'label': 'Section 1.1', 'target': 'chapter1.xhtml#section1', 'order': 1, 'level': 1, 'type': None, 'children': [], } ], }, { 'id': 'ch2', 'label': 'Chapter 2', 'target': 'chapter2.xhtml', 'order': 2, 'level': 0, 'type': None, 'children': [], }, ] def test_nav_doc_navigation_page_list(): """Test page list functionality.""" nav_xml_with_pages = """ Navigation Document """ nav = EPUBNavDocNavigation(nav_xml_with_pages, 'application/xhtml+xml', 'nav.xhtml') # Test get_page_list page_list = nav.get_page_list() assert len(page_list) == 3 page1 = page_list[0] assert page1.id == 'page1' assert page1.label == '1' assert page1.target == 'chapter1.xhtml#page1' assert page1.order == 1 assert page1.level == 0 assert page1.item_type in [None, 'page'] # Could be None or 'page' page2 = page_list[1] assert page2.id == 'page2' assert page2.label == '2' assert page2.target == 'chapter1.xhtml#page2' page3 = page_list[2] assert page3.id == 'page3' assert page3.label == '3' assert page3.target == 'chapter2.xhtml#page3' def test_nav_doc_navigation_landmarks(): """Test landmarks functionality.""" nav_xml_with_landmarks = """ Navigation Document """ nav = EPUBNavDocNavigation(nav_xml_with_landmarks, 'application/xhtml+xml', 'nav.xhtml') # Test get_landmarks landmarks = nav.get_landmarks() assert len(landmarks) == 3 cover_landmark = landmarks[0] assert cover_landmark.id == 'cover' assert cover_landmark.label == 'Cover' assert cover_landmark.target == 'cover.xhtml' assert cover_landmark.item_type == 'cover' toc_landmark = landmarks[1] assert toc_landmark.id == 'toc-landmark' assert toc_landmark.label == 'Table of Contents' assert toc_landmark.target == 'toc.xhtml' assert toc_landmark.item_type == 'toc' start_landmark = landmarks[2] assert start_landmark.id == 'start' assert start_landmark.label == 'Start of Content' assert start_landmark.target == 'chapter1.xhtml' assert start_landmark.item_type == 'bodymatter' def 
test_nav_doc_navigation_editing(): """Test the editing capabilities of the navigation interface.""" from epub_utils.navigation.base import NavigationItem nav = EPUBNavDocNavigation(NAV_XML, 'application/xhtml+xml', 'nav.xhtml') # Test adding a new item new_item = NavigationItem(id='ch2', label='Chapter 2', target='chapter2.xhtml', order=2) nav.add_toc_item(new_item) # Verify it was added toc_items = nav.get_toc_items() assert len(toc_items) == 2 new_toc_item = nav.find_item_by_id('ch2') assert new_toc_item is not None assert new_toc_item.label == 'Chapter 2' # Test updating an item success = nav.update_toc_item( 'ch2', label='Chapter Two Updated', target='chapter2_updated.xhtml' ) assert success updated_item = nav.find_item_by_id('ch2') assert updated_item.label == 'Chapter Two Updated' assert updated_item.target == 'chapter2_updated.xhtml' # Test removing an item success = nav.remove_toc_item('ch2') assert success # Verify it was removed toc_items = nav.get_toc_items() assert len(toc_items) == 1 assert nav.find_item_by_id('ch2') is None def test_nav_doc_navigation_span_elements(): """Test navigation with span elements (non-linked text).""" nav_xml_with_spans = """ Navigation Document """ nav = EPUBNavDocNavigation(nav_xml_with_spans, 'application/xhtml+xml', 'nav.xhtml') toc_items = nav.get_toc_items() assert len(toc_items) == 1 part1_item = toc_items[0] assert part1_item.id == 'part1' assert part1_item.label == 'Part 1' assert part1_item.target == '' # span elements don't have targets assert len(part1_item.children) == 2 ch1_item = part1_item.children[0] assert ch1_item.id == 'ch1' assert ch1_item.label == 'Chapter 1' assert ch1_item.target == 'chapter1.xhtml' ch2_item = part1_item.children[1] assert ch2_item.id == 'ch2' assert ch2_item.label == 'Chapter 2' assert ch2_item.target == 'chapter2.xhtml' def test_nav_doc_navigation_item_types(): """Test navigation with epub:type attributes.""" nav_xml_with_types = """ Navigation Document """ nav = 
EPUBNavDocNavigation(nav_xml_with_types, 'application/xhtml+xml', 'nav.xhtml') toc_items = nav.get_toc_items() assert len(toc_items) == 3 preface_item = toc_items[0] assert preface_item.item_type == 'preface' chapter_item = toc_items[1] assert chapter_item.item_type == 'chapter' appendix_item = toc_items[2] assert appendix_item.item_type == 'appendix' def test_nav_doc_navigation_invalid_media_type(): """Test that invalid media types raise ValueError.""" with pytest.raises(ValueError) as excinfo: EPUBNavDocNavigation(NAV_XML, 'application/x-dtbncx+xml', 'nav.xhtml') assert ( "Media type 'application/x-dtbncx+xml' is not supported for EPUB Navigation Document" in str(excinfo.value) ) def test_nav_doc_navigation_malformed_xml(): """Test handling of malformed XML.""" import pytest from epub_utils.exceptions import ParseError malformed_xml = """ Navigation Document """ # Missing closing and with pytest.raises(ParseError): EPUBNavDocNavigation(malformed_xml, 'application/xhtml+xml', 'nav.xhtml') def test_nav_doc_navigation_output_methods(): """Test the various output methods.""" nav = EPUBNavDocNavigation(NAV_XML, 'application/xhtml+xml', 'nav.xhtml') # Test __str__ str_output = str(nav) assert str_output == NAV_XML # Test to_str (should use XMLPrinter) to_str_output = nav.to_str() assert isinstance(to_str_output, str) assert 'Chapter 1' in to_str_output # Test to_xml (may include ANSI color codes) to_xml_output = nav.to_xml() assert isinstance(to_xml_output, str) # Remove ANSI escape codes for testing import re clean_output = re.sub(r'\x1b\[[0-9;]*m', '', to_xml_output) assert 'Chapter 1' in clean_output # Test to_plain to_plain_output = nav.to_plain() assert isinstance(to_plain_output, str) assert 'Chapter 1' in to_plain_output def test_nav_doc_navigation_reorder_items(): """Test reordering TOC items.""" nav_xml_multiple = """ Navigation Document """ nav = EPUBNavDocNavigation(nav_xml_multiple, 'application/xhtml+xml', 'nav.xhtml') # Get original order original_items = 
nav.get_toc_items() assert [item.id for item in original_items] == ['ch1', 'ch2', 'ch3'] # Reorder items nav.reorder_toc_items(['ch3', 'ch1', 'ch2']) # Check that the method completed without error # Note: The actual reordering implementation may vary # and this test mainly ensures the method can be called def test_nav_doc_navigation_empty_document(): """Test handling of empty navigation document.""" empty_nav_xml = """ Navigation Document """ nav = EPUBNavDocNavigation(empty_nav_xml, 'application/xhtml+xml', 'nav.xhtml') # All lists should be empty assert len(nav.get_toc_items()) == 0 assert len(nav.get_page_list()) == 0 assert len(nav.get_landmarks()) == 0 # find methods should return None/empty assert nav.find_item_by_id('nonexistent') is None assert len(nav.find_items_by_target('nonexistent.xhtml')) == 0 ================================================ FILE: tests/test_ncx_navigation.py ================================================ from epub_utils.navigation.ncx import NCXNavigation NCX_XML = """ Sample Book Chapter 1 """ def test_ncx_navigation_initialization(): """Test that the NCXNavigation class initializes correctly.""" ncx = NCXNavigation(NCX_XML, 'application/x-dtbncx+xml', 'toc.ncx') assert ncx is not None assert ncx.xml_content == NCX_XML assert ncx.media_type == 'application/x-dtbncx+xml' assert ncx.href == 'toc.ncx' assert ncx.xmlns == 'http://www.daisy.org/z3986/2005/ncx/' assert ncx.version == '2005-1' assert ncx.lang == 'en' def test_ncx_navigation_interface(): """Test the new navigation interface methods.""" ncx = NCXNavigation(NCX_XML, 'application/x-dtbncx+xml', 'toc.ncx') # Test get_toc_items toc_items = ncx.get_toc_items() assert len(toc_items) == 1 item = toc_items[0] assert item.id == 'navpoint-1' assert item.label == 'Chapter 1' assert item.target == 'chapter1.xhtml' assert item.order == 1 assert item.level == 0 # Test get_page_list (should be empty for this sample) page_list = ncx.get_page_list() assert len(page_list) == 0 # Test 
get_landmarks (should be empty for this sample) landmarks = ncx.get_landmarks() assert len(landmarks) == 0 # Test find_item_by_id found_item = ncx.find_item_by_id('navpoint-1') assert found_item is not None assert found_item.label == 'Chapter 1' # Test find_items_by_target found_items = ncx.find_items_by_target('chapter1.xhtml') assert len(found_items) == 1 assert found_items[0].id == 'navpoint-1' def test_ncx_navigation_hierarchy(): """Test hierarchical navigation structure.""" ncx_xml_hierarchical = """ Sample Book Chapter 1 Section 1.1 Chapter 2 """ ncx = NCXNavigation(ncx_xml_hierarchical, 'application/x-dtbncx+xml', 'toc.ncx') toc_items = ncx.get_toc_items_as_dicts() assert toc_items == [ { 'id': 'ch1', 'label': 'Chapter 1', 'target': 'chapter1.xhtml', 'order': 1, 'level': 0, 'type': None, 'children': [ { 'id': 'ch1-1', 'label': 'Section 1.1', 'target': 'chapter1.xhtml#section1', 'order': 2, 'level': 1, 'type': None, 'children': [], } ], }, { 'id': 'ch2', 'label': 'Chapter 2', 'target': 'chapter2.xhtml', 'order': 3, 'level': 0, 'type': None, 'children': [], }, ] def test_ncx_navigation_editing(): """Test the editing capabilities of the navigation interface.""" from epub_utils.navigation.base import NavigationItem ncx = NCXNavigation(NCX_XML, 'application/x-dtbncx+xml', 'toc.ncx') # Test adding a new item new_item = NavigationItem(id='ch2', label='Chapter 2', target='chapter2.xhtml', order=2) ncx.add_toc_item(new_item) # Verify it was added toc_items = ncx.get_toc_items() assert len(toc_items) == 2 new_toc_item = ncx.find_item_by_id('ch2') assert new_toc_item is not None assert new_toc_item.label == 'Chapter 2' # Test updating an item success = ncx.update_toc_item( 'ch2', label='Chapter Two Updated', target='chapter2_updated.xhtml' ) assert success updated_item = ncx.find_item_by_id('ch2') assert updated_item.label == 'Chapter Two Updated' assert updated_item.target == 'chapter2_updated.xhtml' # Test removing an item success = ncx.remove_toc_item('ch2') assert 
success # Verify it was removed toc_items = ncx.get_toc_items() assert len(toc_items) == 1 assert ncx.find_item_by_id('ch2') is None ================================================ FILE: tests/test_package.py ================================================ import pytest from epub_utils.exceptions import InvalidEPUBError, UnsupportedFormatError from epub_utils.package import Package VALID_OPF_XML = """ Sample EPUB John Doe 12345 """ INVALID_OPF_XML_MISSING_METADATA = """ """ VALID_EPUB3_XML_WITHOUT_TOC = """ Sample EPUB """ VALID_EPUB2_XML = """ Sample EPUB """ VALID_EPUB2_XML_WITHOUT_TOC = """ Sample EPUB """ VALID_OEPBS1_XML_WITH_TOC = """ Sample EPUB """ INVALID_VERSION = """ """ def test_package_initialization(): """ Test that the Package class initializes correctly with valid OPF XML content. """ package = Package(VALID_OPF_XML) assert package.metadata.title == 'Sample EPUB' assert package.metadata.creator == 'John Doe' assert package.metadata.identifier == '12345' def test_package_invalid_xml(): with pytest.raises(InvalidEPUBError) as excinfo: Package(INVALID_OPF_XML_MISSING_METADATA) assert 'OPF file missing required metadata element' in str(excinfo.value) def test_epub3(): package = Package(VALID_OPF_XML) assert package.version.public == '3.0' assert package.version.major == 3 assert package.nav_href == 'nav.xhtml' def test_epub3_without_toc(): package = Package(VALID_EPUB3_XML_WITHOUT_TOC) assert package.version.public == '3.0' assert package.version.major == 3 assert not package.nav_href def test_epub2(): package = Package(VALID_EPUB2_XML) assert package.version.public == '2.0' assert package.version.major == 2 assert package.toc_href == 'toc.ncx' def test_epub2_without_toc(): package = Package(VALID_EPUB2_XML_WITHOUT_TOC) assert package.version.public == '2.0' assert package.version.major == 2 assert not package.toc_href def test_epub1(): package = Package(VALID_OEPBS1_XML_WITH_TOC) assert package.version.public == '1.0' assert package.version.major == 
1 assert package.toc_href == 'toc.ncx' def test_invalid_version(): with pytest.raises(UnsupportedFormatError) as excinfo: package = Package(INVALID_VERSION) assert 'EPUB version 4.x is not supported (EPUB 4.0 format)' in str(excinfo.value) @pytest.mark.parametrize( 'xml_content,pretty_print,expected', [ ( '\n\n\n \n\n Sample EPUB\n \n\n \n \n \n\n \n \n ', False, '\n\n\n \n\n Sample EPUB\n \n\n \n \n \n\n \n \n ', ), ( '\n\n\n \n\n Sample EPUB\n \n\n \n \n \n\n \n \n ', True, '\n\n \n Sample EPUB\n \n \n \n \n \n \n \n\n', ), ], ) def test_package_to_str_pretty_print_parameter(xml_content, pretty_print, expected): """Test XML output with and without pretty printing for Package.""" package = Package(xml_content) assert package.to_str(pretty_print=pretty_print) == expected ================================================ FILE: tests/test_spine.py ================================================ import pytest from epub_utils.package.spine import Spine VALID_SPINE_XML = """ """ MINIMAL_SPINE_XML = """ """ def test_spine_initialization(): spine = Spine(VALID_SPINE_XML) assert spine.toc == 'ncx' assert spine.page_progression_direction == 'ltr' assert len(spine.itemrefs) == 4 # Test first itemref (cover) assert spine.itemrefs[0]['idref'] == 'cover' assert spine.itemrefs[0]['linear'] == False assert spine.itemrefs[0]['properties'] == [] # Test third itemref (chapter1) assert spine.itemrefs[2]['idref'] == 'chapter1' assert spine.itemrefs[2]['linear'] == True assert spine.itemrefs[2]['properties'] == ['page-spread-left'] def test_minimal_spine(): spine = Spine(MINIMAL_SPINE_XML) assert spine.toc is None assert spine.page_progression_direction == 'default' assert len(spine.itemrefs) == 1 assert spine.itemrefs[0]['idref'] == 'content' assert spine.itemrefs[0]['linear'] == True assert spine.itemrefs[0]['properties'] == [] @pytest.mark.parametrize( 'xml_content,pretty_print,expected', [ ( '\n\n \n\n \n', False, '\n\n \n\n \n', ), ( '\n\n \n\n \n', True, '\n \n \n\n', ), ], ) def 
test_spine_to_str_pretty_print_parameter(xml_content, pretty_print, expected): """Test XML output with and without pretty printing for Spine.""" spine = Spine(xml_content) assert spine.to_str(pretty_print=pretty_print) == expected ================================================ FILE: tests/test_xhtml_content.py ================================================ import pytest from epub_utils.content.xhtml import XHTMLContent def test_simple_paragraph(): """Test extraction from a simple paragraph.""" xml_content = """

This is a simple paragraph.

""" content = XHTMLContent(xml_content, 'application/xhtml+xml', 'test.xhtml') assert content.inner_text == 'This is a simple paragraph.' @pytest.mark.parametrize( 'xml_content,pretty_print,expected', [ ( '\n\n\n \n

This is a simple paragraph.

\n\n \n', False, '\n\n\n \n

This is a simple paragraph.

\n\n \n', ), ( '\n\n\n \n

This is a simple paragraph.

\n\n \n', True, '\n\n\n \n

This is a simple paragraph.

\n \n\n', ), ( '\n\n \n

This is a simple paragraph.

\n\n \n', False, '\n\n \n

This is a simple paragraph.

\n\n \n', ), ( '\n\n \n

This is a simple paragraph.

\n\n \n', True, '\n\n \n

This is a simple paragraph.

\n \n\n', ), ( '\n \n

This is a simple paragraph.

\n\n \n', False, '\n \n

This is a simple paragraph.

\n\n \n', ), ( '\n \n

This is a simple paragraph.

\n\n \n', True, '\n \n

This is a simple paragraph.

\n \n\n', ), ], ) def test_to_str_pretty_print_parameter(xml_content, pretty_print, expected): """Test XML output with and without pretty printing.""" content = XHTMLContent(xml_content, 'application/xhtml+xml', 'test.xhtml') assert content.to_str(pretty_print=pretty_print) == expected