Repository: ernestofgonzalez/epub-utils
Branch: main
Commit: 8c5417c331f2
Files: 60
Total size: 315.4 KB

Directory structure:
gitextract_obeqz0f5/

├── .github/
│   └── workflows/
│       ├── docs.yml
│       └── test.yml
├── .gitignore
├── .vscode/
│   └── settings.json
├── LICENSE
├── Makefile
├── README.md
├── docs/
│   ├── Makefile
│   ├── api-reference.rst
│   ├── api-tutorial.rst
│   ├── changelog.rst
│   ├── cli-reference.rst
│   ├── cli-tutorial.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── epub-standards.rst
│   ├── examples.rst
│   ├── formats.rst
│   ├── index.rst
│   └── installation.rst
├── epub_utils/
│   ├── __init__.py
│   ├── __main__.py
│   ├── cli.py
│   ├── container.py
│   ├── content/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   └── xhtml.py
│   ├── doc.py
│   ├── exceptions.py
│   ├── navigation/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── nav/
│   │   │   ├── __init__.py
│   │   │   └── dom.py
│   │   └── ncx/
│   │       ├── __init__.py
│   │       └── dom.py
│   ├── package/
│   │   ├── __init__.py
│   │   ├── manifest.py
│   │   ├── metadata.py
│   │   └── spine.py
│   └── printers.py
├── pytest.ini
├── requirements/
│   ├── requirements-docs.txt
│   ├── requirements-linting.txt
│   ├── requirements-testing.txt
│   └── requirements.txt
├── requirements.txt
├── ruff.toml
├── setup.py
└── tests/
    ├── assets/
    │   └── roads.epub
    ├── conftest.py
    ├── test_cli.py
    ├── test_container.py
    ├── test_doc.py
    ├── test_manifest.py
    ├── test_metadata.py
    ├── test_nav_navigation.py
    ├── test_ncx_navigation.py
    ├── test_package.py
    ├── test_spine.py
    └── test_xhtml_content.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/docs.yml
================================================
# Builds the Sphinx documentation and publishes it to the gh-pages branch
# on every push to main.
name: Publish documentation

on:
  push:
    branches:
    - main

jobs:
  docs:
    runs-on: ubuntu-latest
    steps:
      # Action versions are kept in line with .github/workflows/test.yml.
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          # Pin to the latest Python 3 explicitly instead of relying on
          # whichever interpreter the runner image happens to ship.
          python-version: "3.x"
      - name: Install dependencies
        run: |
          pip install -r requirements/requirements-docs.txt
      - name: Sphinx build
        run: |
          sphinx-build docs _build
      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        # Guard kept so the deploy step stays safe if more triggers are added.
        if: ${{ github.ref == 'refs/heads/main' }}
        with:
          publish_branch: gh-pages
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: _build/
          # Publish a single-commit branch; history of built HTML is not kept.
          force_orphan: true

================================================
FILE: .github/workflows/test.yml
================================================
# Runs the pytest suite on every push to main and on every pull request,
# across the supported operating systems and Python versions.
name: Test

on:
  push:
    branches:
    - "main"
  pull_request:

# Cancel an in-flight run when a newer commit arrives for the same PR branch.
# Pushes to main have no head_ref, so they fall back to the unique run id and
# are never cancelled.
concurrency:
  group: ${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  test:
    name: Python ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      max-parallel: 4
      matrix:
        os:
        - ubuntu-24.04
        - windows-2022
        - macos-14
        python-version:
        - "3.8"
        - "3.9"
        - "3.10"
        - "3.11"
        - "3.12"
        - "3.13"

    steps:
    - uses: actions/checkout@v4

    # Use setup-python's built-in pip cache instead of a separate
    # actions/cache step: it locates pip's cache directory on every runner OS
    # (the previously hard-coded ~/.cache/pip is only pip's cache on Linux)
    # and includes the Python version in the cache key.
    - uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
        allow-prereleases: true
        cache: pip
        # Invalidate the cache when any requirements file changes, not only
        # the ones literally named requirements.txt.
        cache-dependency-path: |
          requirements.txt
          requirements/*.txt

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt

    - name: Run tests
      run: |
        pytest

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# MacOS
.DS_Store

================================================
FILE: .vscode/settings.json
================================================
{
    "python.testing.pytestEnabled": true
}

================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2025 Ernesto González

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: Makefile
================================================
# Developer task runner for epub-utils.
# Recipes run in make's default shell (/bin/sh); a shebang line has no effect
# in a Makefile, so none is used.

# ANSI escape sequences used to highlight progress messages.
LIGHT_CYAN=\033[1;36m
NO_COLOR=\033[0m

# None of these targets create a file with the same name. Declaring them phony
# ensures a stray file or directory called e.g. "coverage" or "docs" can never
# make a target look up to date and silently skip its recipe.
.PHONY: docs help test coverage lint format

help:
	@echo "test - run tests with pytest"
	@echo "coverage - get code coverage report"
	@echo "lint - lint the python code"
	@echo "format - format the python code"

# Run tests
# printf is used for coloured output because, unlike echo, it interprets the
# \033 escapes consistently across /bin/sh implementations.
test:
	@printf "${LIGHT_CYAN}Running tests...${NO_COLOR}\n"
	pytest

# Get code coverage report
coverage:
	@printf "${LIGHT_CYAN}Running tests and collecting coverage data...${NO_COLOR}\n"
	pytest
	coverage combine
	@printf "${LIGHT_CYAN}Reporting code coverage data...${NO_COLOR}\n"
	coverage report
	@printf "${LIGHT_CYAN}Creating HTML report...${NO_COLOR}\n"
	coverage html
	@printf "${LIGHT_CYAN}Creating coverage badge...${NO_COLOR}\n"
	@# -f: do not abort the target when no badge exists yet (e.g. first run).
	@rm -f ./coverage.svg
	coverage-badge -o coverage.svg

# Lint code
lint:
	@printf "${LIGHT_CYAN}Linting code...${NO_COLOR}\n"
	ruff check

# Format code
# First sort imports (ruff rule set "I"), then apply the formatter.
format:
	@printf "${LIGHT_CYAN}Formatting code...${NO_COLOR}\n"
	ruff check --select I --fix
	ruff format

================================================
FILE: README.md
================================================
# epub-utils

[![PyPI](https://img.shields.io/pypi/v/epub-utils.svg)](https://pypi.org/project/epub-utils/)
[![Changelog](https://img.shields.io/github/v/release/ernestofgonzalez/epub-utils?include_prereleases&label=changelog)](https://ernestofgonzalez.github.io/epub-utils/changelog)
[![Python 3.x](https://img.shields.io/pypi/pyversions/epub-utils.svg?logo=python&logoColor=white)](https://pypi.org/project/epub-utils/)
[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/ernestofgonzalez/epub-utils/blob/main/LICENSE)

A Python library and CLI tool for inspecting EPUB files from the terminal.

## Features

- **Complete EPUB Support** - Parse both EPUB 2.0.1 and EPUB 3.0+ specifications with container, package, manifest, spine, and table of contents inspection
- **Rich Metadata Extraction** - Extract Dublin Core metadata (title, author, language, publisher) with key-value, XML, and raw output formats for easy scripting
- **Content Analysis** - Access document content by manifest ID or file path, with plain text extraction for content analysis and word counting
- **File System Navigation** - Browse and extract any file within EPUB archives (XHTML, CSS, images, fonts) with detailed file information including sizes and compression ratios
- **Multiple Output Formats** - XML with syntax highlighting, raw content, key-value pairs, plain text, and formatted tables to suit different workflows
- **CLI and Python API** - Comprehensive command-line tool for terminal workflows plus a clean Python library for programmatic access
- **Standards Compliance** - Built-in validation capabilities and adherence to W3C/IDPF specifications for reliable EPUB processing
- **Performance Optimized** - Lazy loading, efficient ZIP parsing, and optional lxml support for handling large EPUB collections

## Installation

`epub-utils` is available as a [PyPI](https://pypi.org/) package:

```bash
pip install epub-utils
```

## Use as a CLI tool

The basic format is:

```bash
epub-utils EPUB_PATH COMMAND [OPTIONS]
```

### Commands

- `container` - Display the container.xml contents
    ```bash
    # Show container.xml with syntax highlighting
    epub-utils book.epub container

    # Show container.xml as raw content
    epub-utils book.epub container --format raw
    
    # Show container.xml with pretty formatting
    epub-utils book.epub container --pretty-print
    ```

- `package` - Display the package OPF file contents
    ```bash
    # Show package.opf with syntax highlighting
    epub-utils book.epub package

    # Show package.opf as raw content
    epub-utils book.epub package --format raw
    ```

- `toc` - Display the table of contents file contents
    ```bash
    # Show toc.ncx/nav.xhtml with syntax highlighting (auto-detect)
    epub-utils book.epub toc

    # Show toc.ncx/nav.xhtml as raw content
    epub-utils book.epub toc --format raw

    # Force NCX format (EPUB 2 navigation control file)
    epub-utils book.epub toc --ncx

    # Force Navigation Document (EPUB 3 navigation file)
    epub-utils book.epub toc --nav
    ```

- `metadata` - Display the metadata information from the package file
    ```bash
    # Show metadata with syntax highlighting
    epub-utils book.epub metadata

    # Show metadata as key-value pairs
    epub-utils book.epub metadata --format kv
    
    # Show metadata with pretty formatting
    epub-utils book.epub metadata --pretty-print
    ```

- `manifest` - Display the manifest information from the package file
    ```bash
    # Show manifest with syntax highlighting
    epub-utils book.epub manifest

    # Show manifest as raw content
    epub-utils book.epub manifest --format raw
    ```

- `spine` - Display the spine information from the package file
    ```bash
    # Show spine with syntax highlighting
    epub-utils book.epub spine

    # Show spine as raw content
    epub-utils book.epub spine --format raw
    ```

- `content` - Display the content of a document by its manifest item ID
    ```bash
    # Show content with syntax highlighting
    epub-utils book.epub content chapter1

    # Show raw HTML/XML content
    epub-utils book.epub content chapter1 --format raw
    
    # Show plain text content (HTML tags stripped)
    epub-utils book.epub content chapter1 --format plain
    ```

- `files` - List all files in the EPUB archive or display content of a specific file
    ```bash
    # List all files in table format (default)
    epub-utils book.epub files

    # List all files as simple paths
    epub-utils book.epub files --format raw

    # Display content of a specific file by path
    epub-utils book.epub files OEBPS/chapter1.xhtml

    # Display XHTML file content in different formats
    epub-utils book.epub files OEBPS/chapter1.xhtml --format raw
    epub-utils book.epub files OEBPS/chapter1.xhtml --format xml --pretty-print
    epub-utils book.epub files OEBPS/chapter1.xhtml --format plain

    # Display non-XHTML files (CSS, images, etc.)
    epub-utils book.epub files OEBPS/styles/main.css
    epub-utils book.epub files META-INF/container.xml
    ```

### Options

- `-h, --help` - Show help message and exit
- `-v, --version` - Show program version and exit
- `-fmt, --format` - Output format (default: xml)
    - `xml` - Display with XML syntax highlighting (default)
    - `raw` - Display raw content without formatting
    - `plain` - Display plain text content (HTML tags stripped; supported by the `content` and `files` commands)
    - `kv` - Display key-value pairs (where supported)
- `-pp, --pretty-print` - Pretty-print XML output (applies to xml and raw formats only)
    
    ```bash
    # Display as raw content
    epub-utils book.epub package --format raw
    
    # Display with XML syntax highlighting (default)
    epub-utils book.epub package --format xml
    
    # Display as key-value pairs (for supported commands)
    epub-utils book.epub metadata --format kv
    
    # Display plain text content (content and files commands only)
    epub-utils book.epub content chapter1 --format plain
    
    # Pretty-print XML with proper indentation
    epub-utils book.epub package --pretty-print
    
    # Combine format and pretty-print options
    epub-utils book.epub metadata --format raw --pretty-print
    ```

## Use as a Python library

```python
from epub_utils import Document

# Load an EPUB document
doc = Document("path/to/book.epub")
```

### Basic Document Access

Access the main components of an EPUB document:

```python
# Get container information
container = doc.container
print(container.to_xml())  # Formatted XML with syntax highlighting
print(container.to_str())  # Raw XML content

# Get package information  
package = doc.package
print(package.to_xml())    # Formatted XML with syntax highlighting
print(package.to_str())    # Raw XML content

# Get table of contents
toc = doc.toc
if toc:  # TOC might be None if not present
    print(toc.to_xml())    # Formatted XML with syntax highlighting
    print(toc.to_str())    # Raw XML content

# Access specific navigation formats
ncx = doc.ncx  # NCX format (EPUB 2 or EPUB 3 with NCX)
if ncx:
    print("NCX navigation available")
    print(ncx.to_xml())

nav = doc.nav  # Navigation Document (EPUB 3 only)
if nav:
    print("Navigation Document available")
    print(nav.to_xml())
    print(nav.to_str())    # Raw XML content
```

### Working with Metadata

Access and format metadata information:

```python
# Access package metadata
metadata = doc.package.metadata

# Basic Dublin Core elements
print(f"Title: {metadata.title}")
print(f"Creator: {metadata.creator}")
print(f"Identifier: {metadata.identifier}")
print(f"Language: {metadata.language}")
print(f"Publisher: {metadata.publisher}")
print(f"Date: {metadata.date}")

# Dynamic attribute access for any metadata field
isbn = getattr(metadata, 'isbn', 'Not available')
series = getattr(metadata, 'series', 'Not available')

# Get formatted metadata output
print(metadata.to_xml())     # Formatted XML with syntax highlighting
print(metadata.to_str())     # Raw XML content  
print(metadata.to_kv())      # Key-value format for easy parsing
```

### Working with Manifest

Access the manifest to see all files in the EPUB:

```python
# Get manifest information
manifest = doc.package.manifest

# Access all manifest items
for item in manifest.items:
    print(f"ID: {item['id']}")
    print(f"File: {item['href']}")
    print(f"Type: {item['media_type']}")
    print(f"Properties: {item['properties']}")

# Find specific items
nav_item = manifest.find_by_property('nav')
chapter = manifest.find_by_id('chapter1')
xhtml_items = manifest.find_by_media_type('application/xhtml+xml')

# Get formatted manifest output
print(manifest.to_xml())     # Formatted XML with syntax highlighting
print(manifest.to_str())     # Raw XML content
```

### Working with Spine

Access the spine to see the reading order:

```python
# Get spine information
spine = doc.package.spine

# Access spine properties
print(f"TOC reference: {spine.toc}")
print(f"Page progression: {spine.page_progression_direction}")

# Access spine items in reading order
for itemref in spine.itemrefs:
    print(f"ID: {itemref['idref']}")
    print(f"Linear: {itemref['linear']}")
    print(f"Properties: {itemref['properties']}")

# Find specific spine item
spine_item = spine.find_by_idref('chapter1')

# Get formatted spine output
print(spine.to_xml())        # Formatted XML with syntax highlighting
print(spine.to_str())        # Raw XML content
```

### Content Extraction

Extract content from specific documents within the EPUB:

```python
# Access content by manifest item ID
try:
    content = doc.find_content_by_id('chapter1')
    
    # Get content in different formats
    print(content.to_xml())      # Formatted XHTML with syntax highlighting
    print(content.to_str())      # Raw XHTML content
    print(content.to_plain())    # Plain text with HTML tags stripped
    
    # Access the parsed content tree for advanced processing
    tree = content.tree
    inner_text = content.inner_text
    
except ValueError as e:
    print(f"Content not found: {e}")

# Find publication resources by ID (for non-spine items)
try:
    resource = doc.find_pub_resource_by_id('cover-image')
except ValueError as e:
    print(f"Resource not found: {e}")
```

### File Operations

List and access files directly by their paths in the EPUB archive:

```python
# Get information about all files
files_info = doc.get_files_info()
for file_info in files_info:
    print(f"Path: {file_info['path']}")
    print(f"Size: {file_info['size']} bytes")
    print(f"Compressed: {file_info['compressed_size']} bytes")
    print(f"Modified: {file_info['modified']}")

# Access specific file by path
try:
    # For XHTML files, returns XHTMLContent object
    xhtml_content = doc.get_file_by_path('OEBPS/chapter1.xhtml')
    print(xhtml_content.to_xml())
    print(xhtml_content.to_plain())
    
    # For other files, returns raw string content
    css_content = doc.get_file_by_path('OEBPS/styles/main.css')
    print(css_content)
    
except ValueError as e:
    print(f"File not found: {e}")
```

### Output Formatting Options

All document components support flexible output formatting:

```python
# Pretty-printed XML output
print(metadata.to_str(pretty_print=True))
print(manifest.to_xml(pretty_print=True))

# Syntax highlighting can be controlled
print(package.to_xml(highlight_syntax=True))   # With highlighting (default)
print(package.to_xml(highlight_syntax=False))  # Without highlighting
```

## Industry Standards & Compliance

`epub-utils` provides comprehensive support for industry-standard ePub specifications and related technologies, ensuring broad compatibility across the digital publishing ecosystem.

### Supported EPUB Standards

- **EPUB 2.0.1** (IDPF, 2010)
  - Complete OPF 2.0 package document support
  - NCX navigation control file support
  - Dublin Core metadata extraction
  - Legacy EPUB compatibility

- **EPUB 3.0+** (IDPF/W3C, 2011-present)
  - EPUB 3.3 specification compliance
  - HTML5-based content documents
  - Navigation document (nav.xhtml) support
  - Enhanced accessibility features
  - Media overlays and scripting support

### Metadata Standards

- **Dublin Core Metadata Initiative (DCMI)**
  - Dublin Core Metadata Element Set v1.1
  - Dublin Core Metadata Terms (DCTERMS)

- **Open Packaging Format (OPF)**
  - OPF 2.0 specification (EPUB 2.0.1)
  - OPF 3.0 specification (EPUB 3.0+)

The library maintains strict adherence to published specifications while providing robust handling of real-world EPUB variations commonly found in commercial and open-source reading applications.

================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# Every target (e.g. `make html`) is forwarded to `sphinx-build -M <target>`,
# reading sources from SOURCEDIR and writing output under BUILDDIR.

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# "Makefile" is listed as phony so that the catch-all pattern rule below is
# not applied to the Makefile itself, and so that targets depending on it are
# always considered out of date and re-run.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/api-reference.rst
================================================
API Reference
=============

This section provides complete API documentation for all classes and methods in epub-utils.

Document Class
--------------

.. py:class:: Document(path)

   Main class for working with EPUB files.

   :param str path: Path to the EPUB file

   **Example**:

   .. code-block:: python

      from epub_utils import Document
      
      doc = Document("book.epub")
      print(doc.package.metadata.title)

   .. py:attribute:: container

      Access to the container information.

      :type: Container
      :returns: Container object with container.xml information

      **Example**:

      .. code-block:: python

         container = doc.container
         print(f"Package path: {container.rootfile_path}")

   .. py:attribute:: package

      Access to the package (OPF) information.

      :type: Package  
      :returns: Package object with OPF file information

      **Example**:

      .. code-block:: python

         package = doc.package
         print(f"Title: {package.metadata.title}")

   .. py:attribute:: toc

      Access to the table of contents.

      :type: TableOfContents
      :returns: Table of contents object

      **Example**:

      .. code-block:: python

         toc = doc.toc
         toc_xml = toc.to_xml()

   .. py:attribute:: ncx

      Access to the NCX (Navigation Control for XML) table of contents.

      :type: TableOfContents or None
      :returns: NCX table of contents object for EPUB 2, or for EPUB 3 if NCX is present, None otherwise

      **Example**:

      .. code-block:: python

         ncx = doc.ncx
         if ncx:
             ncx_xml = ncx.to_xml()

      **Note**: For EPUB 2, this returns the same as ``toc``. For EPUB 3, this specifically 
      accesses the NCX file if present, which provides backward compatibility.

   .. py:attribute:: nav

      Access to the Navigation Document (EPUB 3 only).

      :type: TableOfContents or None
      :returns: Navigation Document table of contents object for EPUB 3, None for EPUB 2 or if not present

      **Example**:

      .. code-block:: python

         nav = doc.nav
         if nav:
             nav_xml = nav.to_xml()

      **Note**: This property specifically accesses EPUB 3 Navigation Documents. 
      Returns None for EPUB 2 documents.

   .. py:method:: get_files_info()

      Get detailed information about all files in the EPUB.

      :returns: List of dictionaries containing file information
      :rtype: List[Dict[str, Union[str, int]]]

      Each dictionary contains:

      - ``path`` (str): File path within the EPUB
      - ``size`` (int): Uncompressed size in bytes
      - ``compressed_size`` (int): Compressed size in bytes
      - ``modified`` (str): Last modified date in ISO format

      **Example**:

      .. code-block:: python

         files = doc.get_files_info()
         for file_info in files:
             print(f"{file_info['path']}: {file_info['size']} bytes")

   .. py:method:: list_files()

      Get basic information about all files in the EPUB.

      :returns: List of dictionaries with basic file information
      :rtype: List[Dict[str, str]]

      **Example**:

      .. code-block:: python

         files = doc.list_files()
         print(f"EPUB contains {len(files)} files")

Container Class
---------------

.. py:class:: Container

   Represents the META-INF/container.xml file information.

   .. py:attribute:: rootfile_path

      Path to the main package file within the EPUB.

      :type: str

   .. py:attribute:: rootfile_media_type

      Media type of the main package file.

      :type: str

   .. py:method:: to_xml(highlight_syntax=True)

      Get formatted XML representation.

      :param bool highlight_syntax: Whether to apply syntax highlighting
      :returns: Formatted XML string
      :rtype: str

   .. py:method:: to_str()

      Get raw XML content.

      :returns: Raw XML string
      :rtype: str

Package Class
-------------

.. py:class:: Package

   Represents the main OPF package file.

   .. py:attribute:: metadata

      Package metadata information.

      :type: Metadata

   .. py:attribute:: manifest

      Package manifest information.

      :type: Manifest

   .. py:attribute:: spine

      Package spine information.

      :type: Spine

   .. py:method:: to_xml(highlight_syntax=True)

      Get formatted XML representation of the complete package.

      :param bool highlight_syntax: Whether to apply syntax highlighting
      :returns: Formatted XML string
      :rtype: str

   .. py:method:: to_str()

      Get raw XML content of the complete package.

      :returns: Raw XML string
      :rtype: str

Metadata Class
--------------

.. py:class:: Metadata

   Represents Dublin Core and EPUB-specific metadata.

   .. py:attribute:: title

      Book title from dc:title element.

      :type: str

   .. py:attribute:: creator

      Book author/creator from dc:creator element.

      :type: str

   .. py:attribute:: language

      Language code from dc:language element.

      :type: str

   .. py:attribute:: identifier

      Unique identifier from dc:identifier element.

      :type: str

   .. py:attribute:: publisher

      Publisher from dc:publisher element.

      :type: str

   .. py:attribute:: date

      Publication date from dc:date element.

      :type: str

   .. py:attribute:: subject

      Subject/keywords from dc:subject element.

      :type: str

   .. py:attribute:: description

      Description from dc:description element.

      :type: str

   .. py:attribute:: contributor

      Contributor from dc:contributor element.

      :type: str

   .. py:attribute:: type

      Resource type from dc:type element.

      :type: str

   .. py:attribute:: format

      Format from dc:format element.

      :type: str

   .. py:attribute:: source

      Source from dc:source element.

      :type: str

   .. py:attribute:: relation

      Relation from dc:relation element.

      :type: str

   .. py:attribute:: coverage

      Coverage from dc:coverage element.

      :type: str

   .. py:attribute:: rights

      Rights information from dc:rights element.

      :type: str

   .. py:method:: __getattr__(name)

      Dynamic attribute access for any metadata field.

      :param str name: Metadata field name
      :returns: Metadata value or empty string
      :rtype: str

      **Example**:

      .. code-block:: python

         # Access any metadata field
         isbn = metadata.isbn if hasattr(metadata, 'isbn') else 'Not available'
         series = getattr(metadata, 'series', 'Not available')

   .. py:method:: to_xml(highlight_syntax=True)

      Get formatted XML representation of metadata.

      :param bool highlight_syntax: Whether to apply syntax highlighting
      :returns: Formatted XML string
      :rtype: str

   .. py:method:: to_kv()

      Get metadata as key-value pairs.

      :returns: Key-value formatted string
      :rtype: str

      **Example**:

      .. code-block:: python

         kv_data = metadata.to_kv()
         print(kv_data)
         # Output:
         # title: The Great Gatsby
         # creator: F. Scott Fitzgerald
         # language: en

   .. py:method:: to_str()

      Get raw XML content of metadata.

      :returns: Raw XML string
      :rtype: str

Manifest Class
--------------

.. py:class:: Manifest

   Represents the package manifest section.

   .. py:attribute:: items

      Dictionary of manifest items.

      :type: Dict[str, Dict[str, str]]

      Each item contains:

      - ``href``: File path
      - ``media-type``: MIME type
      - Other attributes as needed

      **Example**:

      .. code-block:: python

         for item_id, item in manifest.items.items():
             print(f"ID: {item_id}")
             print(f"  File: {item['href']}")
             print(f"  Type: {item['media-type']}")

   .. py:method:: to_xml(highlight_syntax=True)

      Get formatted XML representation.

      :param bool highlight_syntax: Whether to apply syntax highlighting
      :returns: Formatted XML string
      :rtype: str

   .. py:method:: to_str()

      Get raw XML content.

      :returns: Raw XML string
      :rtype: str

Spine Class
-----------

.. py:class:: Spine

   Represents the package spine section.

   .. py:attribute:: items

      List of spine items in reading order.

      :type: List[Dict[str, str]]

      **Example**:

      .. code-block:: python

         for item in spine.items:
             print(f"Reading order item: {item}")

   .. py:method:: to_xml(highlight_syntax=True)

      Get formatted XML representation.

      :param bool highlight_syntax: Whether to apply syntax highlighting
      :returns: Formatted XML string
      :rtype: str

   .. py:method:: to_str()

      Get raw XML content.

      :returns: Raw XML string
      :rtype: str

TableOfContents Class
---------------------

.. py:class:: TableOfContents

   Represents the table of contents (NCX or Navigation Document).

   .. py:method:: to_xml(highlight_syntax=True)

      Get formatted XML representation.

      :param bool highlight_syntax: Whether to apply syntax highlighting
      :returns: Formatted XML string
      :rtype: str

   .. py:method:: to_str()

      Get raw XML content.

      :returns: Raw XML string
      :rtype: str

Content Classes
---------------

.. py:class:: Content

   Base class for EPUB content documents.

   .. py:method:: to_xml(highlight_syntax=True)

      Get formatted content.

      :param bool highlight_syntax: Whether to apply syntax highlighting
      :returns: Formatted content string
      :rtype: str

   .. py:method:: to_str()

      Get raw content.

      :returns: Raw content string
      :rtype: str

.. py:class:: XHTMLContent

   Specialized class for XHTML content documents.

   Inherits from Content with additional XHTML-specific methods.

   .. py:method:: to_plain()

      Get plain text content with HTML tags stripped.

      :returns: Plain text string
      :rtype: str

      **Example**:

      .. code-block:: python

         from epub_utils.content import XHTMLContent
         
         # This would typically be accessed through Document
         # content = XHTMLContent(raw_html)
         # plain_text = content.to_plain()

Exception Classes
-----------------

.. py:exception:: ParseError

   Raised when there's an error parsing EPUB content.

   Base class: ``Exception``

   **Example**:

   .. code-block:: python

      from epub_utils import Document
      from epub_utils.exceptions import ParseError

      try:
          doc = Document("corrupted.epub")
          title = doc.package.metadata.title
      except ParseError as e:
          print(f"Failed to parse EPUB: {e}")
      except FileNotFoundError:
          print("EPUB file not found")

Usage Examples
--------------

Basic Usage
~~~~~~~~~~~

.. code-block:: python

   from epub_utils import Document

   # Load document
   doc = Document("book.epub")

   # Access metadata
   metadata = doc.package.metadata
   print(f"Title: {metadata.title}")
   print(f"Author: {metadata.creator}")

   # Check file structure
   files = doc.get_files_info()
   print(f"Contains {len(files)} files")

   # Get formatted output
   toc_xml = doc.toc.to_xml()
   metadata_kv = metadata.to_kv()

Error Handling
~~~~~~~~~~~~~~

.. code-block:: python

   from epub_utils import Document
   from epub_utils.exceptions import ParseError

   def safe_load_epub(path):
       """Load an EPUB and describe the outcome as a status dictionary.

       :param path: Path to the EPUB file, passed straight to ``Document``.
       :returns: A dict whose ``status`` key is one of ``'success'``,
           ``'parse_error'``, ``'file_not_found'`` or ``'unknown_error'``.
           On success it also carries ``document`` and ``title``; on any
           failure it carries an ``error`` message instead.
       """
       try:
           doc = Document(path)
           # Reading the title is kept inside the try block so that errors
           # raised while accessing the package are reported, not propagated.
           title = getattr(doc.package.metadata, 'title', 'Unknown')
       except ParseError as exc:
           return {'status': 'parse_error', 'error': str(exc)}
       except FileNotFoundError:
           return {'status': 'file_not_found', 'error': 'EPUB file not found'}
       except Exception as exc:
           return {'status': 'unknown_error', 'error': str(exc)}

       return {'status': 'success', 'document': doc, 'title': title}

Batch Processing
~~~~~~~~~~~~~~~~

.. code-block:: python

   import os
   from pathlib import Path
   from epub_utils import Document

   def process_epub_directory(directory):
       """Summarise every ``*.epub`` file directly inside ``directory``.

       :param directory: Directory to scan (not recursive).
       :returns: One dict per EPUB file. A successfully read file yields
           ``file``, ``title``, ``author``, ``language``, ``file_size`` and
           ``epub_files``; a file that raised yields ``file`` and ``error``.
       """
       results = []

       for epub_path in Path(directory).glob("*.epub"):
           try:
               doc = Document(str(epub_path))
               metadata = doc.package.metadata
               entry = {
                   'file': epub_path.name,
                   'title': getattr(metadata, 'title', ''),
                   'author': getattr(metadata, 'creator', ''),
                   'language': getattr(metadata, 'language', ''),
                   'file_size': epub_path.stat().st_size,
                   'epub_files': len(doc.get_files_info()),
               }
           except Exception as exc:
               # One unreadable book must not abort the whole batch.
               entry = {'file': epub_path.name, 'error': str(exc)}

           results.append(entry)

       return results

Type Hints
----------

For better IDE support and type checking, here are the main type hints:

.. code-block:: python

   from typing import Dict, List, Union, Optional
   from epub_utils import Document

   # Function signatures for reference
   def get_files_info(self) -> List[Dict[str, Union[str, int]]]: ...
   def list_files(self) -> List[Dict[str, str]]: ...
   def to_xml(self, highlight_syntax: bool = True) -> str: ...
   def to_str(self) -> str: ...
   def to_kv(self) -> str: ...

   # Type-safe usage example
   doc: Document = Document("book.epub")
   files_info: List[Dict[str, Union[str, int]]] = doc.get_files_info()
   title: str = doc.package.metadata.title
   kv_data: str = doc.package.metadata.to_kv()

Module Structure
----------------

The ``epub-utils`` package is organized as follows:

.. code-block:: text

   epub_utils/
   ├── __init__.py          # Main exports (Document, Container)
   ├── __main__.py          # Entry point for "python -m epub_utils"
   ├── cli.py               # Command-line interface
   ├── container.py         # Container class
   ├── content/
   │   ├── __init__.py      # Content classes
   │   ├── base.py          # Base Content class
   │   └── xhtml.py         # XHTMLContent class
   ├── doc.py               # Document class
   ├── exceptions.py        # Exception classes
   ├── navigation/          # Table of contents (navigation) support
   │   ├── __init__.py
   │   ├── base.py          # Shared navigation base
   │   ├── nav/             # EPUB 3 Navigation Document
   │   └── ncx/             # EPUB 2 NCX
   ├── package/
   │   ├── __init__.py      # Package class
   │   ├── metadata.py      # Metadata class
   │   ├── manifest.py      # Manifest class
   │   └── spine.py         # Spine class
   └── printers.py          # Output printing utilities

For detailed implementation examples, see :doc:`api-tutorial` and :doc:`examples`.


================================================
FILE: docs/api-tutorial.rst
================================================
Use as a Python library
=======================

This guide covers using ``epub-utils`` as a Python library. The API is designed to be intuitive 
and follows Python best practices for ease of use and integration into your projects.

Quick Start
-----------

The main entry point is the ``Document`` class:

.. code-block:: python

   from epub_utils import Document

   # Load an EPUB file
   doc = Document("path/to/book.epub")

   # Access various components
   print(f"Title: {doc.package.metadata.title}")
   print(f"Author: {doc.package.metadata.creator}")

Core Classes
------------

Document Class
~~~~~~~~~~~~~~

The ``Document`` class is your main interface to an EPUB file:

.. code-block:: python

   from epub_utils import Document

   doc = Document("example.epub")

   # Access major components
   container = doc.container      # Container information
   package = doc.package         # Package/OPF file
   toc = doc.toc                 # Table of contents
   
   # Get file information
   files_info = doc.get_files_info()

**Key Methods**:

- ``get_files_info()``: Returns detailed information about all files in the EPUB
- ``list_files()``: Returns a simple list of files with basic metadata

Container Access
~~~~~~~~~~~~~~~~

The container provides information from the META-INF/container.xml file:

.. code-block:: python

   # Access container properties
   print(f"Package path: {doc.container.rootfile_path}")
   print(f"Media type: {doc.container.rootfile_media_type}")

   # Get raw XML
   container_xml = doc.container.to_xml()
   raw_container = doc.container.to_str()

Package and Metadata
~~~~~~~~~~~~~~~~~~~~~

The package object gives you access to the main OPF file and its metadata:

.. code-block:: python

   package = doc.package

   # Access metadata
   metadata = package.metadata
   print(f"Title: {metadata.title}")
   print(f"Author: {metadata.creator}")
   print(f"Language: {metadata.language}")
   print(f"Identifier: {metadata.identifier}")
   print(f"Publisher: {metadata.publisher}")

   # Get all metadata as key-value pairs
   kv_metadata = metadata.to_kv()
   print(kv_metadata)

   # Access manifest and spine
   manifest = package.manifest
   spine = package.spine

Working with Metadata
----------------------

Extracting Common Fields
~~~~~~~~~~~~~~~~~~~~~~~~~

The metadata object provides easy access to Dublin Core and EPUB-specific metadata:

.. code-block:: python

   metadata = doc.package.metadata

   # Basic Dublin Core elements
   title = metadata.title
   creator = metadata.creator  # Usually the author
   subject = metadata.subject  # Keywords/topics
   description = metadata.description
   publisher = metadata.publisher
   contributor = metadata.contributor
   date = metadata.date
   type = metadata.type
   format = metadata.format
   identifier = metadata.identifier
   source = metadata.source
   language = metadata.language
   relation = metadata.relation
   coverage = metadata.coverage
   rights = metadata.rights

Dynamic Attribute Access
~~~~~~~~~~~~~~~~~~~~~~~~

The metadata object supports dynamic attribute access for any metadata field:

.. code-block:: python

   # Access any metadata field by name
   isbn = getattr(metadata, 'isbn', 'Not available')
   series = getattr(metadata, 'series', 'Not available')

   # Or use the more direct approach
   try:
       custom_field = metadata.custom_metadata_field
   except AttributeError:
       custom_field = "Field not found"

Formatted Output
~~~~~~~~~~~~~~~~

Get metadata in different formats:

.. code-block:: python

   # XML format with syntax highlighting
   xml_metadata = metadata.to_xml(highlight_syntax=True)

   # Raw XML without highlighting
   raw_xml = metadata.to_xml(highlight_syntax=False)

   # Key-value format for easy parsing
   kv_format = metadata.to_kv()

Manifest and Spine
-------------------

Working with the Manifest
~~~~~~~~~~~~~~~~~~~~~~~~~~

The manifest lists all files in the EPUB package:

.. code-block:: python

   manifest = doc.package.manifest

   # Get all items
   items = manifest.items  # Dictionary of manifest items

   # Find specific items
   for item_id, item in items.items():
       print(f"ID: {item_id}")
       print(f"  File: {item['href']}")
       print(f"  Type: {item['media-type']}")

   # Get formatted output
   manifest_xml = manifest.to_xml()

Understanding the Spine
~~~~~~~~~~~~~~~~~~~~~~~~

The spine defines the reading order:

.. code-block:: python

   spine = doc.package.spine

   # Get spine items in reading order
   spine_items = spine.items

   # Get formatted output
   spine_xml = spine.to_xml()

Table of Contents
-----------------

Working with TOC
~~~~~~~~~~~~~~~~

Access the table of contents (either NCX or Navigation Document):

.. code-block:: python

   toc = doc.toc

   # Get formatted TOC
   toc_xml = toc.to_xml()
   raw_toc = toc.to_str()

Specific TOC Access
~~~~~~~~~~~~~~~~~~~

For fine-grained control over which table of contents format to access:

.. code-block:: python

   # Access NCX specifically (EPUB 2 or EPUB 3 with NCX)
   ncx = doc.ncx
   if ncx:
       ncx_xml = ncx.to_xml()
       print("NCX navigation available")
   else:
       print("No NCX navigation found")

   # Access Navigation Document specifically (EPUB 3 only)
   nav = doc.nav
   if nav:
       nav_xml = nav.to_xml()
       print("Navigation Document available")
   else:
       print("No Navigation Document found (likely EPUB 2)")

   # Handle different EPUB versions
   package = doc.package
   if package.version.major >= 3:
       # EPUB 3 - prefer Navigation Document, fallback to NCX
       nav_doc = doc.nav or doc.ncx
   else:
       # EPUB 2 - use NCX
       nav_doc = doc.ncx

   if nav_doc:
       print("Table of contents found:", nav_doc.to_str()[:100])

Content Extraction
------------------

Accessing Document Content
~~~~~~~~~~~~~~~~~~~~~~~~~~

Extract content from specific documents within the EPUB:

.. code-block:: python

   # First, find content IDs from the manifest
   manifest = doc.package.manifest
   content_items = {
       item_id: item for item_id, item in manifest.items.items()
       if item['media-type'] == 'application/xhtml+xml'
   }

   # Access content by ID
   for content_id in content_items:
       try:
           content = doc.get_content(content_id)
           # Process content as needed
           print(f"Content ID {content_id}: {len(content)} characters")
       except Exception as e:
           print(f"Could not access content {content_id}: {e}")

File Information
----------------

Detailed File Analysis
~~~~~~~~~~~~~~~~~~~~~~

Get comprehensive information about all files in the EPUB:

.. code-block:: python

   files_info = doc.get_files_info()

   for file_info in files_info:
       print(f"Path: {file_info['path']}")
       print(f"Size: {file_info['size']} bytes")
       print(f"Compressed: {file_info['compressed_size']} bytes")
       print(f"Modified: {file_info['modified']}")
       print("---")

   # Calculate total size
   total_size = sum(f['size'] for f in files_info)
   total_compressed = sum(f['compressed_size'] for f in files_info)
   compression_ratio = (1 - total_compressed / total_size) * 100
   
   print(f"Total size: {total_size} bytes")
   print(f"Compressed size: {total_compressed} bytes")
   print(f"Compression ratio: {compression_ratio:.1f}%")

Error Handling
--------------

Robust Error Handling
~~~~~~~~~~~~~~~~~~~~~~

epub-utils provides specific exception types for better error handling:

.. code-block:: python

   from epub_utils import Document
   from epub_utils.exceptions import ParseError

   try:
       doc = Document("potentially_corrupt.epub")
       
       # Try to access metadata
       title = doc.package.metadata.title
       print(f"Successfully loaded: {title}")
       
   except ParseError as e:
       print(f"EPUB parsing error: {e}")
   except FileNotFoundError:
       print("EPUB file not found")
   except Exception as e:
       print(f"Unexpected error: {e}")

Graceful Degradation
~~~~~~~~~~~~~~~~~~~~

Handle missing or malformed metadata gracefully:

.. code-block:: python

   def safe_get_metadata(doc, field_name, default="Unknown"):
       """Return one metadata field, or ``default`` if it cannot be read.

       ``default`` is used both when the field is absent and when reaching
       the metadata raises ``AttributeError`` or ``ParseError``.
       """
       try:
           metadata = doc.package.metadata
           value = getattr(metadata, field_name, default)
       except (AttributeError, ParseError):
           value = default
       return value

   # Usage
   title = safe_get_metadata(doc, 'title', 'Untitled')
   author = safe_get_metadata(doc, 'creator', 'Unknown Author')

Next Steps
----------

- Explore the complete :doc:`api-reference` for detailed class documentation
- See more :doc:`examples` for advanced use cases
- Learn about :doc:`epub-standards` to understand the underlying specifications
- Check out the :doc:`cli-reference` for command-line equivalents


================================================
FILE: docs/changelog.rst
================================================
.. _changelog:

=========
Changelog
=========

.. _v_0_1_0a1:

0.1.0a1 (2025-06-14)
--------------------

* Added ``toc`` retrieval as dictionary (:issue:`4`)
* Added comprehensive navigation reading support (`#38 <https://github.com/ernestofgonzalez/epub-utils/pull/38>`__, `#39 <https://github.com/ernestofgonzalez/epub-utils/pull/39>`__, `#42 <https://github.com/ernestofgonzalez/epub-utils/pull/42>`__)
* Added macOS test runner (`#41 <https://github.com/ernestofgonzalez/epub-utils/pull/41>`__)
* Added support for Python 3.8 and Python 3.9 (`#40 <https://github.com/ernestofgonzalez/epub-utils/pull/40>`__)

.. _v_0_0_0a5:

0.0.0a5 (2025-06-01)
--------------------

* Added file retrieval by file path. (:issue:`22`)
* Added pretty printing to XML inspection (:issue:`23`)

.. _v_0_0_0a4:

0.0.0a4 (2025-05-26)
--------------------

* Added file inspection and ``files`` CLI command. (`#20 <https://github.com/ernestofgonzalez/epub-utils/pull/20>`__)
* Added content inspection and ``content`` CLI command (:issue:`5`)
* Added manifest parsing and ``manifest`` CLI command (`#13 <https://github.com/ernestofgonzalez/epub-utils/pull/13>`__)
* Added spine parsing and ``spine`` CLI command (`#9 <https://github.com/ernestofgonzalez/epub-utils/pull/9>`__)
* Added key-value support for ``metadata`` CLI command
* Fixed table of contents parsing for OEBPS 1 (`#11 <https://github.com/ernestofgonzalez/epub-utils/pull/11>`__). Thanks, `Christian Klein <https://github.com/cklein>`__.

.. _v_0_0_0a3:

0.0.0a3 (2025-05-04)
--------------------

* Fixed ``toc`` command. (:issue:`1`)

.. _v_0_0_0a2:

0.0.0a2 (2025-05-03)
--------------------

* Added classifiers

.. _v_0_0_0a1:

0.0.0a1 (2025-05-03)
--------------------

* Initial release to PyPI

================================================
FILE: docs/cli-reference.rst
================================================
CLI Reference
=============

This reference documents all available command-line options and commands for ``epub-utils``.

Synopsis
--------

.. code-block:: text

   epub-utils [GLOBAL_OPTIONS] EPUB_FILE COMMAND [COMMAND_OPTIONS]

Global Options
--------------

``-h, --help``
   Show help message and exit

``-v, --version``
   Show program version and exit

``-pp, --pretty-print``
   Pretty-print XML output with proper indentation (applies to xml and raw formats only)

Commands
--------

All commands operate on an EPUB file and support the ``--format`` and ``--pretty-print`` options unless otherwise noted.

container
~~~~~~~~~

Display the container.xml file contents.

**Syntax**:

.. code-block:: bash

   epub-utils EPUB_FILE container [--format FORMAT] [--pretty-print]

**Description**:
The container command shows the contents of META-INF/container.xml, which defines the 
location of the main package file within the EPUB.

**Supported formats**: ``xml`` (default), ``raw``

**Examples**:

.. code-block:: bash

   # Show container with syntax highlighting
   epub-utils book.epub container

   # Show raw container XML
   epub-utils book.epub container --format raw
   
   # Show container with pretty formatting
   epub-utils book.epub container --pretty-print
   
   # Combine both options
   epub-utils book.epub container --format raw --pretty-print

**Sample output**:

.. code-block:: xml

   <?xml version="1.0" encoding="UTF-8"?>
   <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
     <rootfiles>
       <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
     </rootfiles>
   </container>

package
~~~~~~~

Display the main package (OPF) file contents.

**Syntax**:

.. code-block:: bash

   epub-utils EPUB_FILE package [--format FORMAT] [--pretty-print]

**Description**:
The package command shows the complete OPF (Open Packaging Format) file, which contains 
metadata, manifest, and spine information.

**Supported formats**: ``xml`` (default), ``raw``

**Examples**:

.. code-block:: bash

   # Show package with syntax highlighting
   epub-utils book.epub package

   # Show raw package XML for processing
   epub-utils book.epub package --format raw | xmllint --format -
   
   # Show package with pretty formatting
   epub-utils book.epub package --pretty-print

toc
~~~

Display the table of contents file.

**Syntax**:

.. code-block:: bash

   epub-utils EPUB_FILE toc [--format FORMAT] [--pretty-print] [--ncx | --nav]

**Description**:
Shows the table of contents, which can be either an NCX file (EPUB 2.x) or a 
Navigation Document (EPUB 3.x). By default, automatically detects and uses the 
appropriate format for the EPUB version.

**Options**:

``--ncx``
   Force retrieval of NCX file (EPUB 2 navigation control file). For EPUB 2, 
   this is the same as the default behavior. For EPUB 3, this specifically 
   accesses the NCX file if present for backward compatibility.

``--nav``
   Force retrieval of Navigation Document (EPUB 3 navigation file). Only works 
   with EPUB 3 documents that have a Navigation Document.

**Note**: The ``--ncx`` and ``--nav`` flags are mutually exclusive.

**Supported formats**: ``xml`` (default), ``raw``

**Examples**:

.. code-block:: bash

   # Show TOC with highlighting (auto-detect format)
   epub-utils book.epub toc

   # Extract navigation structure
   epub-utils book.epub toc --format raw
   
   # Show TOC with pretty formatting
   epub-utils book.epub toc --pretty-print

   # Force NCX format (EPUB 2 style)
   epub-utils book.epub toc --ncx

   # Force Navigation Document (EPUB 3 style)
   epub-utils book.epub toc --nav

metadata
~~~~~~~~

Display metadata information from the package file.

**Syntax**:

.. code-block:: bash

   epub-utils EPUB_FILE metadata [--format FORMAT] [--pretty-print]

**Description**:
Extracts and displays Dublin Core and EPUB-specific metadata from the package file.

**Supported formats**: ``xml`` (default), ``raw``, ``kv``

**Examples**:

.. code-block:: bash

   # Show formatted metadata
   epub-utils book.epub metadata

   # Get key-value pairs for scripting
   epub-utils book.epub metadata --format kv

   # Raw metadata XML
   epub-utils book.epub metadata --format raw
   
   # Show metadata with pretty formatting
   epub-utils book.epub metadata --pretty-print

**Key-value output format**:

.. code-block:: text

   title: The Great Gatsby
   creator: F. Scott Fitzgerald
   language: en
   identifier: urn:uuid:12345678-1234-1234-1234-123456789abc
   publisher: Scribner
   date: 2021-01-01
   subject: Fiction, Classic Literature

manifest
~~~~~~~~

Display the manifest section from the package file.

**Syntax**:

.. code-block:: bash

   epub-utils EPUB_FILE manifest [--format FORMAT] [--pretty-print]

**Description**:
Shows the manifest, which lists all files included in the EPUB package with their 
IDs, file paths, and media types.

**Supported formats**: ``xml`` (default), ``raw``

**Examples**:

.. code-block:: bash

   # Show manifest with highlighting
   epub-utils book.epub manifest

   # Find all CSS files
   epub-utils book.epub manifest --format raw | grep 'media-type="text/css"'
   
   # Show manifest with pretty formatting
   epub-utils book.epub manifest --pretty-print

   # Count content files
   epub-utils book.epub manifest --format raw | grep -c 'application/xhtml+xml'

spine
~~~~~

Display the spine section from the package file.

**Syntax**:

.. code-block:: bash

   epub-utils EPUB_FILE spine [--format FORMAT] [--pretty-print]

**Description**:
Shows the spine, which defines the default reading order of the book's content.

**Supported formats**: ``xml`` (default), ``raw``

**Examples**:

.. code-block:: bash

   # Show spine with highlighting
   epub-utils book.epub spine

   # Extract reading order
   epub-utils book.epub spine --format raw
   
   # Show spine with pretty formatting
   epub-utils book.epub spine --pretty-print

content
~~~~~~~

Display the content of a document by its manifest item ID.

**Syntax**:

.. code-block:: bash

   epub-utils EPUB_FILE content ITEM_ID [--format FORMAT] [--pretty-print]

**Description**:
Extracts and displays the content of a specific document within the EPUB, identified 
by its manifest item ID.

**Supported formats**: ``xml`` (default), ``raw``, ``plain``

**Arguments**:

- ``ITEM_ID``: The ID of the item as defined in the manifest

**Examples**:

.. code-block:: bash

   # Show content with syntax highlighting
   epub-utils book.epub content chapter1

   # Get raw HTML/XHTML
   epub-utils book.epub content intro --format raw

   # Extract plain text (no HTML tags)
   epub-utils book.epub content chapter2 --format plain
   
   # Show content with pretty formatting
   epub-utils book.epub content chapter1 --pretty-print

**Finding item IDs**:

.. code-block:: bash

   # First check the manifest for available IDs
   epub-utils book.epub manifest | grep 'id='

   # Then extract specific content
   epub-utils book.epub content found_id --format plain

files
~~~~~

List all files in the EPUB archive with metadata, or display content of a specific file.

**Syntax**:

.. code-block:: bash

   epub-utils EPUB_FILE files [FILE_PATH] [--format FORMAT] [--pretty-print]

**Description**:
When used without a file path, provides detailed information about all files contained 
within the EPUB archive, including sizes, compression ratios, and modification dates.

When used with a file path, displays the content of the specified file within the EPUB archive.

**Supported formats**: 

- For file listing: ``table`` (default), ``raw``
- For file content: ``raw``, ``xml`` (default), ``plain``, ``kv``

**Arguments**:

- ``FILE_PATH`` (optional): Path to a specific file within the EPUB archive

**Examples**:

.. code-block:: bash

   # List all files in table format (default)
   epub-utils book.epub files

   # Get simple file list
   epub-utils book.epub files --format raw

   # Count total files
   epub-utils book.epub files --format raw | wc -l

   # Display content of a specific XHTML file
   epub-utils book.epub files OEBPS/chapter1.xhtml

   # Display XHTML file in different formats
   epub-utils book.epub files OEBPS/chapter1.xhtml --format raw
   epub-utils book.epub files OEBPS/chapter1.xhtml --format xml --pretty-print
   epub-utils book.epub files OEBPS/chapter1.xhtml --format plain

   # Display non-XHTML files (CSS, etc.)
   epub-utils book.epub files OEBPS/styles/main.css

**Key differences from content command**:

- ``files`` uses file paths within the EPUB archive
- ``content`` uses manifest item IDs
- ``files`` can access any file, including CSS, XML, and image files
- ``content`` only accesses files listed in the manifest

**Sample table output**:

.. code-block:: text

   File Information for book.epub
   ┌────────────────────────────────────────┬──────────┬──────────────┬─────────────────────┐
   │ Path                                   │ Size     │ Compressed   │ Modified            │
   ├────────────────────────────────────────┼──────────┼──────────────┼─────────────────────┤
   │ META-INF/container.xml                 │ 230 B    │ 140 B        │ 2021-01-01 10:00:00 │
   │ OEBPS/content.opf                      │ 2.1 KB   │ 856 B        │ 2021-01-01 10:00:00 │
   │ OEBPS/Text/chapter01.xhtml             │ 12.4 KB  │ 3.2 KB       │ 2021-01-01 10:00:00 │
   └────────────────────────────────────────┴──────────┴──────────────┴─────────────────────┘

Format Options
--------------

Most commands support the ``--format`` and ``--pretty-print`` options to control output formatting:

``xml`` (default for most commands)
   Syntax-highlighted, formatted XML output

``raw``
   Unformatted content exactly as stored in the EPUB

``kv`` (metadata command only)
   Key-value pairs suitable for shell scripting

``plain`` (content and files commands)
   Plain text with HTML tags stripped

``table`` (files command only)
   Formatted table with aligned columns

Pretty Print Option
~~~~~~~~~~~~~~~~~~~

The ``--pretty-print`` (or ``-pp``) option formats XML output with proper indentation and structure:

.. code-block:: bash

   # Default output (with syntax highlighting but compact)
   epub-utils book.epub metadata
   
   # Pretty-printed output (with proper indentation)
   epub-utils book.epub metadata --pretty-print
   
   # Combine with raw format for clean, formatted XML
   epub-utils book.epub package --format raw --pretty-print

**Note**: The pretty-print option applies to both ``xml`` and ``raw`` formats, but has no effect on ``kv``, ``plain``, or ``table`` formats.

Exit Codes
----------

epub-utils uses standard exit codes:

- ``0``: Success
- ``1``: General error (file not found, invalid EPUB, etc.)
- ``2``: Command line usage error

Scripts can check the exit code for error handling:

.. code-block:: bash

   if epub-utils book.epub metadata >/dev/null 2>&1; then
       echo "EPUB is valid"
   else
       echo "EPUB has issues"
   fi

Environment Variables
---------------------

epub-utils respects these environment variables:

``NO_COLOR``
   Disable color output when set to any value

``FORCE_COLOR``
   Force color output even when not outputting to a terminal

**Examples**:

.. code-block:: bash

   # Disable colors
   NO_COLOR=1 epub-utils book.epub metadata

   # Force colors in pipes
   FORCE_COLOR=1 epub-utils book.epub metadata | less -R

Common Usage Patterns
---------------------

Validation Workflow
~~~~~~~~~~~~~~~~~~~

.. code-block:: bash

   #!/bin/zsh
   # validate-epub.sh - Basic EPUB validation

   epub_file="$1"

   echo "Validating: $epub_file"

   # Check container
   if ! epub-utils "$epub_file" container >/dev/null 2>&1; then
       echo "❌ Invalid container"
       exit 1
   fi

   # Check package
   if ! epub-utils "$epub_file" package >/dev/null 2>&1; then
       echo "❌ Invalid package"
       exit 1
   fi

   # Check required metadata
   metadata=$(epub-utils "$epub_file" metadata --format kv 2>/dev/null)
   if ! echo "$metadata" | grep -q "^title:"; then
       echo "⚠️  Missing title"
   fi

   if ! echo "$metadata" | grep -q "^creator:"; then
       echo "⚠️  Missing author"
   fi

   echo "✅ EPUB structure is valid"

Metadata Extraction
~~~~~~~~~~~~~~~~~~~

.. code-block:: bash

   #!/bin/zsh
   # extract-metadata.sh - Extract metadata to CSV

   echo "filename,title,author,language,publisher" > metadata.csv

   for epub in *.epub; do
       if [[ -f "$epub" ]]; then
           metadata=$(epub-utils "$epub" metadata --format kv 2>/dev/null)
           
           title=$(echo "$metadata" | grep "^title:" | cut -d' ' -f2- | tr ',' ';')
           author=$(echo "$metadata" | grep "^creator:" | cut -d' ' -f2- | tr ',' ';')
           language=$(echo "$metadata" | grep "^language:" | cut -d' ' -f2-)
           publisher=$(echo "$metadata" | grep "^publisher:" | cut -d' ' -f2- | tr ',' ';')
           
           echo "$epub,$title,$author,$language,$publisher" >> metadata.csv
       fi
   done

Content Analysis
~~~~~~~~~~~~~~~~

.. code-block:: bash

   #!/bin/zsh
   # analyze-content.sh - Analyze EPUB content structure

   epub_file="$1"

   echo "Content Analysis for: $epub_file"
   echo "=================================="

   # Get content files from manifest
   content_ids=$(epub-utils "$epub_file" manifest --format raw | \
                grep 'media-type="application/xhtml+xml"' | \
                sed 's/.*id="\([^"]*\)".*/\1/')

   total_words=0

   for content_id in $content_ids; do
       if word_count=$(epub-utils "$epub_file" content "$content_id" --format plain 2>/dev/null | wc -w); then
           echo "Content ID '$content_id': $word_count words"
           total_words=$((total_words + word_count))
       fi
   done

   echo "=================================="
   echo "Total words: $total_words"

Error Handling
--------------

Always handle errors when using epub-utils in scripts:

.. code-block:: bash

   # Check if file exists first
   if [[ ! -f "$epub_file" ]]; then
       echo "Error: File '$epub_file' not found" >&2
       exit 1
   fi

   # Capture and handle command errors
   if ! output=$(epub-utils "$epub_file" metadata --format kv 2>&1); then
       echo "Error processing EPUB: $output" >&2
       exit 1
   fi

   # Check for specific issues
   if [[ -z "$output" ]]; then
       echo "Warning: No metadata found" >&2
   fi

Performance Tips
----------------

1. **Use raw format for large-scale processing** to avoid syntax highlighting overhead
2. **Pipe efficiently** to avoid unnecessary intermediate files
3. **Process files in parallel** when handling many EPUBs
4. **Cache results** when running the same command multiple times

.. code-block:: bash

   # Efficient parallel processing
   find . -name "*.epub" -print0 | xargs -0 -P 4 -I {} \
       zsh -c 'echo "$1: $(epub-utils "$1" metadata --format kv | grep "^title:" | cut -d" " -f2-)"' _ {}

Troubleshooting
---------------

Common Issues and Solutions
~~~~~~~~~~~~~~~~~~~~~~~~~~~

**"Invalid value for 'PATH': File does not exist"**
   Check the file path and ensure the EPUB file exists.

**"ParseError: Unable to parse container.xml"**
   The EPUB file may be corrupted. Verify it's a valid ZIP file.

**"Content with id 'X' not found"**
   Check available content IDs using the manifest command first.

**No color output**
   Ensure your terminal supports colors and check the ``NO_COLOR`` environment variable.

**Large file performance**
   Use ``--format raw`` for better performance with large files.


================================================
FILE: docs/cli-tutorial.rst
================================================
Use as a command-line tool
==========================

This tutorial will guide you through using ``epub-utils`` from the command line. We'll cover all 
available commands with practical examples and tips for everyday usage.

Getting Started
---------------

The basic syntax for epub-utils is:

.. code-block:: bash

   epub-utils [OPTIONS] EPUB_FILE COMMAND [COMMAND_OPTIONS]

Let's start with a simple example:

.. code-block:: bash

   # Display help
   epub-utils --help

   # Check version
   epub-utils --version

Basic File Inspection
---------------------

Container Information
~~~~~~~~~~~~~~~~~~~~~

The container command shows the EPUB's container.xml file, which points to the main package file:

.. code-block:: bash

   # Show container with syntax highlighting (default)
   epub-utils book.epub container

   # Show raw XML without highlighting
   epub-utils book.epub container --format raw
   
   # Show container with pretty formatting
   epub-utils book.epub container --pretty-print

**Example output**:

.. code-block:: xml

   <?xml version="1.0" encoding="UTF-8"?>
   <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
     <rootfiles>
       <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
     </rootfiles>
   </container>

Package Information
~~~~~~~~~~~~~~~~~~~

The package command displays the main OPF (Open Packaging Format) file:

.. code-block:: bash

   # Show package file with highlighting
   epub-utils book.epub package

   # Show raw package content
   epub-utils book.epub package --format raw
   
   # Show package with pretty formatting
   epub-utils book.epub package --pretty-print

This reveals the complete EPUB structure including metadata, manifest, and spine.

Working with Metadata
----------------------

Extracting Metadata
~~~~~~~~~~~~~~~~~~~~

The metadata command is perfect for getting book information:

.. code-block:: bash

   # Metadata with syntax highlighting (default)
   epub-utils book.epub metadata

   # Key-value format for scripting
   epub-utils book.epub metadata --format kv
   
   # Metadata with pretty formatting
   epub-utils book.epub metadata --pretty-print

**Example key-value output**:

.. code-block:: text

   title: The Great Gatsby
   creator: F. Scott Fitzgerald
   language: en
   identifier: urn:uuid:12345678-1234-1234-1234-123456789abc
   publisher: Scribner
   date: 2021-01-01
   subject: Fiction, Classic Literature

Scripting with Metadata
~~~~~~~~~~~~~~~~~~~~~~~~

The key-value format is perfect for shell scripting:

.. code-block:: bash

   # Extract just the title
   epub-utils book.epub metadata --format kv | grep "^title:" | cut -d' ' -f2-

   # Get author name
   author=$(epub-utils book.epub metadata --format kv | grep "^creator:" | cut -d' ' -f2-)
   echo "Author: $author"

   # Batch process multiple files
   for epub in *.epub; do
       title=$(epub-utils "$epub" metadata --format kv | grep "^title:" | cut -d' ' -f2-)
       echo "$epub: $title"
   done

Understanding EPUB Structure
-----------------------------

Table of Contents
~~~~~~~~~~~~~~~~~

View the navigation structure of your EPUB:

.. code-block:: bash

   # Show table of contents with highlighting (auto-detect format)
   epub-utils book.epub toc

   # Raw TOC for processing
   epub-utils book.epub toc --format raw
   
   # TOC with pretty formatting
   epub-utils book.epub toc --pretty-print

**EPUB Version-Specific Access**:

For precise control over which navigation format to access:

.. code-block:: bash

   # Force NCX format (EPUB 2 navigation control file)
   epub-utils book.epub toc --ncx

   # Force Navigation Document (EPUB 3 navigation file)
   epub-utils book.epub toc --nav

**Use Cases**:

- Use ``--ncx`` when you specifically need the EPUB 2 style navigation or want to access backward-compatible NCX in EPUB 3
- Use ``--nav`` when you specifically need the EPUB 3 Navigation Document features
- Use the default (no flags) for general TOC access that works with any EPUB version

Manifest Inspection
~~~~~~~~~~~~~~~~~~~

The manifest lists all files contained in the EPUB:

.. code-block:: bash

   # View manifest with syntax highlighting
   epub-utils book.epub manifest

   # Raw manifest content
   epub-utils book.epub manifest --format raw
   
   # Manifest with pretty formatting
   epub-utils book.epub manifest --pretty-print

**What you'll see**: Each item in the manifest includes:

- ``id``: Unique identifier for the item
- ``href``: File path within the EPUB
- ``media-type``: MIME type of the file

Spine Information
~~~~~~~~~~~~~~~~~

The spine defines the reading order of the book:

.. code-block:: bash

   # View spine with highlighting
   epub-utils book.epub spine

   # Raw spine for processing
   epub-utils book.epub spine --format raw

Content Extraction
------------------

Viewing Document Content
~~~~~~~~~~~~~~~~~~~~~~~~

Extract content from specific documents using their manifest ID:

.. code-block:: bash

   # Show content with syntax highlighting
   epub-utils book.epub content chapter1

   # Raw HTML/XHTML content
   epub-utils book.epub content chapter1 --format raw

   # Plain text (HTML tags stripped)
   epub-utils book.epub content chapter1 --format plain

**Finding Content IDs**: Use the manifest command to see available content IDs:

.. code-block:: bash

   # First, check the manifest for available IDs
   epub-utils book.epub manifest

   # Then extract specific content
   epub-utils book.epub content intro --format plain

File Listing and Content Access
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Get detailed information about all files in the EPUB, or access specific file content:

.. code-block:: bash

   # Formatted table of files
   epub-utils book.epub files

   # Raw file list
   epub-utils book.epub files --format raw

   # Display content of a specific file by path
   epub-utils book.epub files OEBPS/chapter1.xhtml

   # Access different file types
   epub-utils book.epub files META-INF/container.xml
   epub-utils book.epub files OEBPS/styles/main.css
   epub-utils book.epub files OEBPS/images/cover.jpg

   # Different output formats for XHTML content
   epub-utils book.epub files OEBPS/chapter1.xhtml --format raw
   epub-utils book.epub files OEBPS/chapter1.xhtml --format xml --pretty-print
   epub-utils book.epub files OEBPS/chapter1.xhtml --format plain

**Key advantages of the files command**:

- Access any file in the EPUB archive by its path
- No need to know manifest item IDs
- Works with all file types (XHTML, CSS, XML, images, etc.)
- Complements the ``content`` command which uses manifest IDs

Content Analysis
~~~~~~~~~~~~~~~~

Analyze EPUB content structure:

.. code-block:: bash

   #!/bin/bash
   # analyze-content.sh - Analyze EPUB content structure

   epub_file="$1"

   echo "=== Content Analysis for $epub_file ==="

   # Get all content files from manifest
   epub-utils "$epub_file" manifest --format raw | \
   grep 'media-type="application/xhtml+xml"' | \
   sed 's/.*id="\([^"]*\)".*/\1/' | \
   while read -r content_id; do
       echo "--- Content ID: $content_id ---"
       word_count=$(epub-utils "$epub_file" content "$content_id" --format plain | wc -w)
       echo "Word count: $word_count"
       echo ""
   done

Output Format Options
---------------------

epub-utils supports multiple output formats for different use cases:

XML Format (Default)
~~~~~~~~~~~~~~~~~~~~

.. code-block:: bash

   epub-utils book.epub metadata
   # Produces syntax-highlighted, formatted XML

Raw Format
~~~~~~~~~~

.. code-block:: bash

   epub-utils book.epub metadata --format raw
   # Produces unformatted XML, perfect for piping to other tools

Key-Value Format
~~~~~~~~~~~~~~~~

.. code-block:: bash

   epub-utils book.epub metadata --format kv
   # Produces key: value pairs, ideal for scripting

Plain Text Format
~~~~~~~~~~~~~~~~~

.. code-block:: bash

   epub-utils book.epub content chapter1 --format plain
   # Strips HTML tags, produces readable text

Pretty-Print Option
~~~~~~~~~~~~~~~~~~~

Use the ``--pretty-print`` (or ``-pp``) option to format XML output with proper indentation:

.. code-block:: bash

   # Default output (compact XML)
   epub-utils book.epub metadata --format raw
   
   # Pretty-formatted output (with indentation)
   epub-utils book.epub metadata --format raw --pretty-print
   
   # Works with syntax highlighting too
   epub-utils book.epub package --pretty-print

Next Steps
----------

Now that you're familiar with the CLI basics, you might want to:

- Explore the :doc:`api-tutorial` for programmatic access
- Check out more :doc:`examples` for real-world use cases
- Learn about :doc:`epub-standards` for deeper understanding
- Contribute to the project via :doc:`contributing`


================================================
FILE: docs/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'epub-utils'
copyright = '2025, Ernesto González'
author = 'Ernesto González'
release = '0.1.0a1'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
	'sphinx.ext.autodoc',
	'sphinx.ext.autosummary',
	'sphinx.ext.napoleon',
	'sphinx.ext.viewcode',
	'sphinx_copybutton',
	'sphinx_issues',
]

templates_path = ['_templates']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# -- Napoleon settings -------------------------------------------------------
# Accept both Google- and NumPy-style docstrings.
napoleon_google_docstring = True
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = False
napoleon_include_private_with_doc = False

# -- Autodoc settings --------------------------------------------------------
autodoc_member_order = 'bysource'
# ``autodoc_default_flags`` was deprecated in Sphinx 1.8 and later removed;
# ``autodoc_default_options`` is its replacement. This keeps ``:members:``
# enabled by default for every autodoc directive.
autodoc_default_options = {'members': True}
autosummary_generate = True

# -- Intersphinx mapping -----------------------------------------------------
# NOTE(review): this mapping only takes effect when 'sphinx.ext.intersphinx'
# is listed in ``extensions`` above; it is currently not enabled. Confirm that
# both inventories are reachable before turning it on.
intersphinx_mapping = {
	'python': ('https://docs.python.org/3', None),
	'lxml': ('https://lxml.de/', None),
}


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'furo'
html_static_path = ['_static']

# Add source link in footer
html_show_sourcelink = True
html_copy_source = True
html_show_sphinx = True

# -- Linking Github issues --------------------------------------------------
# https://github.com/sloria/sphinx-issues

issues_github_path = 'ernestofgonzalez/epub-utils'


================================================
FILE: docs/contributing.rst
================================================
============
Contributing
============

We welcome contributions to ``epub-utils``! This guide will help you get started with contributing to the project.

Getting Started
===============

Setting Up Development Environment
----------------------------------

1. **Fork the Repository**

   Fork the ``epub-utils`` repository on GitHub to your own account.

2. **Clone Your Fork**

   .. code-block:: bash

       git clone https://github.com/yourusername/epub-utils.git
       cd epub-utils

3. **Set Up Development Environment**

   .. code-block:: bash

       # Create virtual environment
       python -m venv dev-env
       source dev-env/bin/activate  # On Windows: dev-env\Scripts\activate
       
       # Install in development mode
       pip install -e ".[dev]"
       
       # Or install dependencies manually
       pip install -e .
       pip install pytest black flake8 mypy sphinx


Project Structure
-----------------

.. code-block:: text

    epub-utils/
    ├── epub_utils/
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── cli.py                  # Command-line interface
    │   ├── container.py
    │   ├── doc.py
    │   ├── exceptions.py
    │   ├── printers.py
    │   ├── content/
    │   ├── navigation/
    │   └── package/
    ├── tests/
    │   ├── conftest.py
    │   ├── test_doc.py
    │   ├── test_cli.py
    │   ├── ...
    │   └── assets/                 # Test EPUB files
    ├── docs/
    │   ├── conf.py
    │   ├── index.rst
    │   └── ...                     # Documentation files
    ├── requirements/
    ├── Makefile
    ├── setup.py
    ├── ruff.toml
    ├── pytest.ini
    └── README.md

Development Workflow
====================

Branch Strategy
---------------

- ``main`` branch: Stable, release-ready code
- ``develop`` branch: Integration branch for features
- Feature branches: ``feature/your-feature-name``
- Bug fix branches: ``fix/issue-description``

Making Changes
--------------

1. **Create a Feature Branch**

   .. code-block:: bash

       git checkout -b feature/your-feature-name

2. **Make Your Changes**

   Follow the coding standards outlined below.

3. **Write Tests**

   All new features should include comprehensive tests.

4. **Run Tests Locally**

   .. code-block:: bash

       # Run all tests
       pytest
       
       # Run with coverage
       pytest --cov=epub_utils
       
       # Run specific test file
       pytest tests/test_doc.py

5. **Check Code Quality**

   .. code-block:: bash

       # Format code
       black src/ tests/
       
       # Check linting
       flake8 src/ tests/
       
       # Type checking
       mypy src/

6. **Update Documentation**

   If your changes affect the API or add new features, update the documentation.

7. **Commit Your Changes**

   .. code-block:: bash

       git add .
       git commit -m "Add: Brief description of your changes"

8. **Push and Create Pull Request**

   .. code-block:: bash

       git push origin feature/your-feature-name

   Then create a pull request on GitHub.

Coding Standards
================

Python Style Guide
------------------

We follow PEP 8 with some modifications:

- **Line length**: 88 characters (Black's default)
- **String quotes**: Use double quotes for strings
- **Import sorting**: Use isort or similar tool
- **Docstrings**: Use Google-style docstrings

Code Formatting
---------------

We use **Black** for code formatting:

.. code-block:: bash

    # Format all Python files
    black src/ tests/
    
    # Check formatting without making changes
    black --check src/ tests/

Example of properly formatted code:

.. code-block:: python

    def extract_metadata(epub_path: str, format_type: str = "dict") -> dict:
        """Extract metadata from an EPUB file.
        
        Args:
            epub_path: Path to the EPUB file.
            format_type: Output format ('dict', 'xml', 'json').
            
        Returns:
            Dictionary containing extracted metadata.
            
        Raises:
            FileNotFoundError: If the EPUB file doesn't exist.
            ValueError: If format_type is not supported.
        """
        if not os.path.exists(epub_path):
            raise FileNotFoundError(f"EPUB file not found: {epub_path}")
        
        if format_type not in ["dict", "xml", "json"]:
            raise ValueError(f"Unsupported format: {format_type}")
        
        # Implementation here...
        return {}

Linting
-------

We use **ruff** for linting:

.. code-block:: bash

    # Check for linting errors
    make lint

Type Hints
----------

Use type hints for all function signatures:

.. code-block:: python

    from typing import Any, Dict, List, Optional, Union
    from pathlib import Path

    def process_files(
        file_paths: List[Union[str, Path]], 
        output_format: str = "table"
    ) -> Optional[Dict[str, Any]]:
        """Process multiple EPUB files."""
        pass

Documentation Standards
=======================

Docstring Format
----------------

Use Google-style docstrings:

.. code-block:: python

    def complex_function(param1: str, param2: int, param3: bool = False) -> dict:
        """Brief description of the function.
        
        Longer description if needed. Explain the purpose, behavior,
        and any important details about the function.
        
        Args:
            param1: Description of the first parameter.
            param2: Description of the second parameter.
            param3: Description of optional parameter. Defaults to False.
            
        Returns:
            Description of return value and its structure.
            
        Raises:
            ValueError: When param2 is negative.
            FileNotFoundError: When the specified file doesn't exist.
            
        Example:
            Basic usage example:
            
            >>> result = complex_function("test", 42)
            >>> print(result["status"])
            success
        """
        pass

API Documentation
-----------------

When adding new classes or functions to the public API:

1. **Add to __init__.py** exports if appropriate
2. **Update API reference** documentation
3. **Include usage examples** in docstrings
4. **Add to tutorials** if it's a major feature

RST Documentation
-----------------

When writing RST documentation:

.. code-block:: rst

    Section Title
    =============
    
    Subsection
    ----------
    
    Code examples:
    
    .. code-block:: python
    
        # Python code here
        import epub_utils
        
    Shell commands:
    
    .. code-block:: bash
    
        epub-utils book.epub metadata

Testing Guidelines
==================

Test Structure
--------------

- **Unit tests**: Test individual functions and methods
- **Integration tests**: Test component interactions
- **End-to-end tests**: Test complete workflows
- **Performance tests**: Test with large files (optional)

Writing Tests
-------------

Use pytest for all tests:

.. code-block:: python

    import pytest
    from epub_utils import Document
    from pathlib import Path


    def test_document_with_invalid_file():
        """Test error handling with invalid file."""
        with pytest.raises(FileNotFoundError):
            Document("nonexistent.epub")
            

    @pytest.mark.parametrize("format_type", ["dict", "xml", "json"])
    def test_metadata_formats(doc_path, format_type):
        """Test different metadata formats."""
        doc = Document(str(doc_path))
        metadata = doc.get_metadata(format_type=format_type)
        assert metadata is not None

Test Fixtures
-------------

Create test EPUB files in ``tests/fixtures/``:

.. code-block:: python

    # tests/conftest.py
    import pytest
    from pathlib import Path


    @pytest.fixture
    def sample_epub():
        """Provide path to sample EPUB for testing."""
        return Path(__file__).parent / "fixtures" / "sample.epub"


    @pytest.fixture
    def invalid_epub():
        """Provide path to invalid EPUB for error testing."""
        return Path(__file__).parent / "fixtures" / "invalid.epub"

Running Tests
-------------

.. code-block:: bash

    # Run all tests
    make test
    
    # Run specific test file
    pytest tests/test_doc.py

Types of Contributions
======================

Bug Reports
-----------

When reporting bugs:

1. Check existing issues first
2. Use the issue template if available
3. Provide minimal reproduction case
4. Include system information

.. code-block:: text

    **Bug Description**
    Clear description of the bug.
    
    **Steps to Reproduce**
    1. Step one
    2. Step two
    3. Step three
    
    **Expected Behavior**
    What should happen.
    
    **Actual Behavior**
    What actually happens.
    
    **Environment**
    - epub-utils version: 
    - Python version:
    - Operating system:
    
    **Sample File**
    Attach or link to EPUB file if relevant.

Feature Requests
----------------

For new features:

1. Describe the use case clearly
2. Explain why it's valuable to users
3. Suggest implementation approach if you have ideas
4. Consider backward compatibility

Documentation Improvements
--------------------------

Documentation contributions are highly valued:

- Fix typos and grammar errors
- Improve clarity of explanations
- Add more examples to existing docs
- Create new tutorials for common use cases
- Update outdated information

Code Contributions
------------------

Areas where contributions are welcome:

1. Performance improvements
2. New output formats
3. Additional EPUB validation
4. Better error handling
5. CLI usability enhancements
6. Support for EPUB 3 features

Release Process
===============

Versioning
----------

We follow `Semantic Versioning <https://semver.org/>`_:

- MAJOR: Incompatible API changes
- MINOR: New functionality (backward compatible)
- PATCH: Bug fixes (backward compatible)

Version format: ``MAJOR.MINOR.PATCH`` (e.g., ``1.2.3``)

Development versions may include additional identifiers:

- ``1.2.3-dev`` (development)
- ``1.2.3rc1`` (release candidate)

================================================
FILE: docs/epub-standards.rst
================================================
==============
EPUB Standards
==============

Understanding EPUB Specifications
=================================

EPUB (Electronic Publication) is an open standard for digital books and publications. 
This guide covers the EPUB specifications and how epub-utils ensures compliance.

EPUB 3.3 Specification
======================

Current Standard
----------------

EPUB 3.3 is the current specification, published by the W3C. It defines:

- **Package Document**: Contains metadata, manifest, and spine
- **Container Format**: ZIP-based archive structure
- **Content Documents**: XHTML5, SVG, and other media types
- **Navigation Document**: Replaces NCX for table of contents

Key Components
--------------

Container Structure
~~~~~~~~~~~~~~~~~~~

.. code-block:: text

    book.epub
    ├── META-INF/
    │   ├── container.xml          # Points to package document
    │   └── signatures.xml         # Digital signatures (optional)
    ├── OEBPS/                     # Content folder (common name)
    │   ├── package.opf            # Package document
    │   ├── nav.xhtml              # Navigation document
    │   ├── content/               # Text content
    │   ├── images/                # Images
    │   ├── styles/                # CSS files
    │   └── fonts/                 # Font files (optional)
    └── mimetype                   # Must be first file, uncompressed

Package Document (OPF)
~~~~~~~~~~~~~~~~~~~~~~

The package document defines three main sections:

**Metadata Section**:

.. code-block:: xml

    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
        <dc:title>Book Title</dc:title>
        <dc:creator>Author Name</dc:creator>
        <dc:identifier id="bookid">urn:uuid:12345</dc:identifier>
        <dc:language>en</dc:language>
        <meta property="dcterms:modified">2024-01-01T00:00:00Z</meta>
    </metadata>

**Manifest Section**:

.. code-block:: xml

    <manifest>
        <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" 
              properties="nav"/>
        <item id="chapter1" href="content/chapter1.xhtml" 
              media-type="application/xhtml+xml"/>
        <item id="cover-image" href="images/cover.jpg" 
              media-type="image/jpeg" properties="cover-image"/>
    </manifest>

**Spine Section**:

.. code-block:: xml

    <spine>
        <itemref idref="chapter1"/>
        <itemref idref="chapter2"/>
    </spine>

Navigation Document
~~~~~~~~~~~~~~~~~~~

EPUB 3 uses XHTML navigation documents instead of NCX:

.. code-block:: html

    <!DOCTYPE html>
    <html xmlns="http://www.w3.org/1999/xhtml" 
          xmlns:epub="http://www.idpf.org/2007/ops">
    <head>
        <title>Navigation</title>
    </head>
    <body>
        <nav epub:type="toc">
            <h1>Table of Contents</h1>
            <ol>
                <li><a href="content/chapter1.xhtml">Chapter 1</a></li>
                <li><a href="content/chapter2.xhtml">Chapter 2</a></li>
            </ol>
        </nav>
    </body>
    </html>

EPUB Compliance with epub-utils
===============================

Validation Capabilities
-----------------------

epub-utils helps ensure EPUB compliance by:

1. **Structure Validation**: Checks container format
2. **Metadata Validation**: Verifies required elements
3. **Manifest Validation**: Ensures all files are declared
4. **Spine Validation**: Checks reading order
5. **Content Validation**: Basic XHTML structure checks

Checking Compliance
-------------------

Use epub-utils to validate EPUB structure:

.. code-block:: bash

    # The EPUB path comes first, followed by the subcommand and its options.

    # Check basic structure
    epub-utils book.epub info

    # Detailed manifest information
    epub-utils book.epub manifest --format table

    # Extract and examine package document
    epub-utils book.epub extract --output-dir temp/
    cat temp/OEBPS/package.opf

Python API for Validation
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    from epub_utils import Document

    def validate_epub_structure(epub_path):
        """Validate basic EPUB structure.

        Prints one pass/fail line for every structural check and for every
        required metadata element.

        Args:
            epub_path: Path to the EPUB file to inspect.

        Returns:
            bool: True only when every structural check passes and all three
            required metadata elements (title, language, identifier) are
            present. False otherwise, including when opening or reading the
            document raises an exception.
        """
        try:
            doc = Document(epub_path)

            # Check required components
            checks = {
                'has_container': hasattr(doc, 'container'),
                'has_package': hasattr(doc, 'package'),
                'has_metadata': len(doc.metadata) > 0,
                'has_manifest': len(doc.manifest) > 0,
                'has_spine': len(doc.spine) > 0,
            }

            # Check required metadata
            required_metadata = ['title', 'language', 'identifier']
            metadata_present = {}

            for item in doc.metadata:
                # A missing or None name simply matches nothing
                name = (item.get('name') or '').lower()
                for req in required_metadata:
                    if req in name:
                        metadata_present[req] = True

            print("Structure Validation:")
            for check, passed in checks.items():
                status = "✓" if passed else "✗"
                print(f"  {status} {check}")

            print("\nRequired Metadata:")
            for req in required_metadata:
                status = "✓" if metadata_present.get(req) else "✗"
                print(f"  {status} {req}")

            # Every required element must be present, not just most of them
            return all(checks.values()) and all(
                metadata_present.get(req, False) for req in required_metadata
            )

        except Exception as e:
            print(f"Validation failed: {e}")
            return False

Common Compliance Issues
========================

Missing Required Elements
-------------------------

**Problem**: EPUB missing required metadata

.. code-block:: bash

    # Check metadata completeness
    epub-utils metadata book.epub --format table

**Solution**: Ensure these elements are present:

- ``dc:title``
- ``dc:language`` 
- ``dc:identifier`` (with unique ID)
- ``meta property="dcterms:modified"`` (EPUB 3)

Invalid File References
-----------------------

**Problem**: Manifest references files that don't exist

.. code-block:: python

    def check_file_references(epub_path):
        """Check if all manifest files exist in the archive.

        Manifest hrefs are relative to the package document, so each href is
        resolved against the package document's directory before it is looked
        up in the ZIP listing. Remote resources (hrefs containing ``://``) are
        not part of the archive and are skipped.

        Args:
            epub_path: Path to the EPUB file to inspect.

        Returns:
            list: The ``href`` values of manifest items that have no matching
            entry in the archive (empty when nothing is missing). The same
            list is also printed when it is non-empty.
        """
        import posixpath
        import zipfile
        from urllib.parse import unquote

        doc = Document(epub_path)

        with zipfile.ZipFile(epub_path) as archive:
            archive_names = set(archive.namelist())

        # ZIP member names always use forward slashes, hence posixpath.
        # NOTE(review): relies on doc.container.rootfile_path being the
        # archive-relative path of the package document - confirm.
        package_dir = posixpath.dirname(doc.container.rootfile_path)

        missing_files = []
        for item in doc.manifest:
            file_path = item.get('href')
            if not file_path or '://' in file_path:
                continue

            # Strip any fragment and undo percent-encoding before the lookup
            target = unquote(file_path.split('#', 1)[0])
            resolved = posixpath.normpath(posixpath.join(package_dir, target))
            if resolved not in archive_names:
                missing_files.append(file_path)

        if missing_files:
            print("Missing files referenced in manifest:")
            for file in missing_files:
                print(f"  - {file}")

        return missing_files

Incorrect MIME Types
--------------------

**Problem**: Wrong media-type attributes in manifest

Common correct MIME types:

- XHTML: ``application/xhtml+xml``
- CSS: ``text/css``
- JPEG: ``image/jpeg``
- PNG: ``image/png``
- NCX: ``application/x-dtbncx+xml``

EPUB 2 vs EPUB 3 Differences
============================

Format Evolution
-----------------

+------------------+-------------------------+-------------------------+
| Feature          | EPUB 2                  | EPUB 3                  |
+==================+=========================+=========================+
| Navigation       | NCX file required       | XHTML nav document      |
+------------------+-------------------------+-------------------------+
| Content Types    | XHTML 1.1, limited      | XHTML5, SVG, MathML     |
+------------------+-------------------------+-------------------------+
| Metadata         | Dublin Core only        | Enhanced metadata       |
+------------------+-------------------------+-------------------------+
| Accessibility    | Limited                 | Rich accessibility      |
+------------------+-------------------------+-------------------------+
| Scripting        | Not allowed             | Limited JavaScript      |
+------------------+-------------------------+-------------------------+

Migration Considerations
------------------------

When working with older EPUB 2 files:

.. code-block:: python

    def detect_epub_version(epub_path):
        """Guess the EPUB version from the manifest.

        This is a simplified heuristic: it does not read the ``version``
        attribute of the package document. A manifest item carrying the
        ``nav`` property means EPUB 3; failing that, an NCX item means
        EPUB 2.

        Args:
            epub_path: Path to the EPUB file to inspect.

        Returns:
            str: ``"EPUB 3"``, ``"EPUB 2"`` or ``"Unknown"``.
        """
        doc = Document(epub_path)

        has_ncx = False
        for item in doc.manifest:
            # ``properties`` is a space-separated token list: compare whole
            # tokens so that only the exact ``nav`` property counts, and
            # tolerate a missing/None value or an already-split list.
            properties = item.get('properties') or ''
            if isinstance(properties, str):
                properties = properties.split()
            if 'nav' in properties:
                return "EPUB 3"

            # NCX file (EPUB 2 indicator); a nav item found later still wins
            if item.get('media-type') == 'application/x-dtbncx+xml':
                has_ncx = True

        return "EPUB 2" if has_ncx else "Unknown"

Best Practices for Compliance
=============================

Metadata Best Practices
-----------------------

1. **Always include the required elements** (``dc:creator`` is optional but strongly recommended):

   .. code-block:: xml

       <dc:title>Complete Book Title</dc:title>
       <dc:creator>Author Full Name</dc:creator>
       <dc:identifier id="bookid">urn:uuid:unique-identifier</dc:identifier>
       <dc:language>en-US</dc:language>

2. **Use proper Dublin Core refinements**:

   .. code-block:: xml

       <dc:creator id="author">Jane Doe</dc:creator>
       <meta refines="#author" property="role" scheme="marc:relators">aut</meta>

3. **Include modification date for EPUB 3**:

   .. code-block:: xml

       <meta property="dcterms:modified">2024-05-25T10:30:00Z</meta>

File Organization
-----------------

1. **Use consistent folder structure**
2. **Declare all files in manifest**
3. **Use proper MIME types**
4. **Include fallbacks for specialized content**

Content Guidelines
------------------

1. **Valid XHTML**: Ensure all content files are well-formed
2. **Proper encoding**: Use UTF-8 encoding
3. **Relative links**: Use relative paths for internal references
4. **Alt text**: Include alt attributes for images

Testing and Validation Tools
============================

External Validators
-------------------

- **EPUBCheck**: Official EPUB validator
- **Ace by DAISY**: Accessibility checker
- **pagina EPUB-Checker**: Online validator

Integration with epub-utils
---------------------------

.. code-block:: bash

    # The EPUB path comes first, followed by the subcommand and its options.

    # Basic structure check
    epub-utils book.epub info

    # Export for external validation
    epub-utils book.epub extract --output-dir validation/
    # Run EPUBCheck on extracted content

    # Check specific components
    epub-utils book.epub manifest --format xml > manifest.xml
    epub-utils book.epub metadata --format xml > metadata.xml

Future Standards
================

EPUB 3.3 and Beyond
-------------------

Current developments in EPUB standards:

- **Enhanced accessibility features**
- **Better multimedia support**
- **Improved metadata vocabularies**
- **Web standards alignment**

Staying Current
---------------

- Monitor W3C EPUB Working Group
- Test with latest validators
- Follow accessibility guidelines (WCAG)
- Use semantic markup

Resources
=========

Official Specifications
-----------------------

- `EPUB 3.3 Specification <https://www.w3.org/TR/epub-33/>`_
- `EPUB Accessibility 1.1 <https://www.w3.org/TR/epub-a11y-11/>`_
- `EPUB Open Container Format 3.0.1 <https://www.w3.org/TR/epub-ocf-301/>`_

Validation Tools
----------------

- `EPUBCheck <https://github.com/w3c/epubcheck>`_
- `Ace Accessibility Checker <https://github.com/daisy/ace>`_
- `EPUB Validator <https://validator.idpf.org/>`_

Developer Resources
-------------------

- `EPUB 3 Best Practices <https://www.w3.org/TR/epub-bp/>`_
- `IDPF EPUB Resources <http://idpf.org/epub/31/spec/>`_
- `Accessibility Guidelines <https://www.w3.org/WAI/WCAG21/quickref/>`_


================================================
FILE: docs/examples.rst
================================================
Examples and Use Cases
======================

This page showcases real-world examples of using epub-utils for various tasks. Each example 
includes both CLI and Python API approaches where applicable.

Digital Library Management
--------------------------

Cataloging Your EPUB Collection
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

**Scenario**: You have a large collection of EPUB files and want to create a comprehensive catalog.

**CLI Approach**:

.. code-block:: bash

   #!/bin/bash
   # catalog-epubs.sh - Create a catalog of all EPUB files

   # Print the value of the first "<key>: value" line found in $metadata.
   # Only the first match is used so repeated keys (e.g. several creators)
   # cannot produce a multi-line CSV cell; commas are replaced for the same
   # reason.
   field() {
       printf '%s\n' "$metadata" | grep "^$1:" | head -n 1 | cut -d' ' -f2- | tr ',' ';'
   }

   echo "Creating EPUB catalog..."
   echo "File,Title,Author,Publisher,Language,Year,Files,Size" > epub_catalog.csv

   # IFS= keeps leading/trailing whitespace in file names intact
   find . -name "*.epub" -type f | while IFS= read -r epub; do
       echo "Processing: $epub"

       # File names may contain commas or quotes: emit the path as a quoted
       # CSV field with embedded quotes doubled.
       csv_path="\"${epub//\"/\"\"}\""

       # Extract metadata using epub-utils
       if metadata=$(epub-utils "$epub" metadata --format kv 2>/dev/null); then
           title=$(field title)
           author=$(field creator)
           publisher=$(field publisher)
           language=$(field language)
           year=$(field date | cut -d'-' -f1)

           # Count files and get size (tr strips the padding BSD wc adds)
           file_count=$(epub-utils "$epub" files --format raw 2>/dev/null | wc -l | tr -d ' ')
           size=$(stat -f%z "$epub" 2>/dev/null || stat -c%s "$epub" 2>/dev/null)

           echo "$csv_path,$title,$author,$publisher,$language,$year,$file_count,$size" >> epub_catalog.csv
       else
           echo "$csv_path,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR" >> epub_catalog.csv
       fi
   done

   echo "Catalog complete! See epub_catalog.csv"

**Python Approach**:

.. code-block:: python

   import csv
   import os
   from pathlib import Path
   from epub_utils import Document

   def create_epub_catalog(directory, output_file="epub_catalog.csv"):
       """Create a comprehensive catalog of EPUB files.

       Searches ``directory`` recursively and writes one CSV row per
       ``*.epub`` file. A file that cannot be processed still gets a row: the
       error message goes into the ``title`` column, the other metadata
       columns stay empty and ``file_count`` is 0. The size columns always
       reflect the file on disk.

       Args:
           directory: Root directory to search for EPUB files.
           output_file: Path of the CSV file to write (overwritten if present).
       """

       fieldnames = [
           'filepath', 'filename', 'title', 'author', 'publisher',
           'language', 'year', 'isbn', 'file_count', 'size_bytes', 'size_mb'
       ]

       # Sorted so the catalog comes out in the same order on every run
       epub_files = sorted(Path(directory).rglob("*.epub"))
       print(f"Found {len(epub_files)} EPUB files")

       with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
           writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
           writer.writeheader()

           for i, epub_path in enumerate(epub_files, 1):
               print(f"Processing {i}/{len(epub_files)}: {epub_path.name}")

               # The size comes from the file system, not from the EPUB
               # parser, so compute it once and report it for broken files too.
               size_bytes = epub_path.stat().st_size
               row = {
                   'filepath': str(epub_path),
                   'filename': epub_path.name,
                   'title': '',
                   'author': '',
                   'publisher': '',
                   'language': '',
                   'year': '',
                   'isbn': '',
                   'file_count': 0,
                   'size_bytes': size_bytes,
                   'size_mb': round(size_bytes / (1024 * 1024), 2),
               }

               try:
                   doc = Document(str(epub_path))
                   metadata = doc.package.metadata

                   # Extract date year
                   date_str = getattr(metadata, 'date', '')
                   year = date_str.split('-')[0] if date_str else ''

                   # The dict is fully built before update() runs, so a
                   # failure here leaves the error-row defaults untouched.
                   row.update({
                       'title': getattr(metadata, 'title', ''),
                       'author': getattr(metadata, 'creator', ''),
                       'publisher': getattr(metadata, 'publisher', ''),
                       'language': getattr(metadata, 'language', ''),
                       'year': year,
                       'isbn': getattr(metadata, 'identifier', ''),
                       'file_count': len(doc.get_files_info()),
                   })

               except Exception as e:
                   print(f"  Error: {e}")
                   row['title'] = f'ERROR: {str(e)}'

               writer.writerow(row)

   # Usage
   create_epub_catalog("/path/to/your/epub/collection")

Quality Assurance and Validation
---------------------------------

EPUB Health Check
~~~~~~~~~~~~~~~~~

**Scenario**: Validate EPUB files and identify potential issues.

.. code-block:: python

   from epub_utils import Document, ParseError
   import zipfile
   from pathlib import Path

   class EPUBHealthChecker:
       """Run a series of sanity checks against an EPUB file.

       Every problem found is recorded as a human-readable string in
       ``self.issues``; ``check_epub`` resets that list on each call.
       """

       def __init__(self):
           self.issues = []

       def check_epub(self, epub_path):
           """Comprehensive EPUB health check.

           Args:
               epub_path: Path to the EPUB file.

           Returns:
               dict: ``{"status": "healthy", "issues": []}`` when nothing was
               found, otherwise ``{"status": "issues_found", "issues": [...]}``.
           """
           self.issues = []
           epub_path = Path(epub_path)

           print(f"Checking EPUB: {epub_path.name}")

           # Basic file checks - nothing else can run if these fail
           if not epub_path.exists():
               self.issues.append("File does not exist")
               return self.get_report()

           if epub_path.stat().st_size == 0:
               self.issues.append("File is empty")
               return self.get_report()

           # ZIP integrity check
           try:
               with zipfile.ZipFile(epub_path, 'r') as zf:
                   # testzip() returns the name of the first bad member or None
                   corrupt_files = zf.testzip()
                   if corrupt_files:
                       self.issues.append(f"Corrupt ZIP file: {corrupt_files}")
           except zipfile.BadZipFile:
               self.issues.append("Invalid ZIP file")
               return self.get_report()

           # EPUB structure checks
           try:
               doc = Document(str(epub_path))
               self._check_container(doc)
               self._check_package(doc)
               self._check_metadata(doc)
               self._check_manifest(doc)
               self._check_files(doc)

           except ParseError as e:
               self.issues.append(f"Parse error: {e}")
           except Exception as e:
               self.issues.append(f"Unexpected error: {e}")

           return self.get_report()

       def _check_container(self, doc):
           """Check that the container names a rootfile."""
           try:
               container = doc.container
               if not container.rootfile_path:
                   self.issues.append("No rootfile specified in container")
           except Exception as e:
               self.issues.append(f"Container error: {e}")

       def _check_package(self, doc):
           """Check that the package exposes metadata, manifest and spine."""
           try:
               package = doc.package
               if not hasattr(package, 'metadata'):
                   self.issues.append("Package missing metadata")
               if not hasattr(package, 'manifest'):
                   self.issues.append("Package missing manifest")
               if not hasattr(package, 'spine'):
                   self.issues.append("Package missing spine")
           except Exception as e:
               self.issues.append(f"Package error: {e}")

       def _check_metadata(self, doc):
           """Check that title, language and identifier are present and non-blank."""
           try:
               metadata = doc.package.metadata

               # Check required fields. A field may be absent or explicitly
               # None; both count as "missing" instead of aborting the
               # remaining field checks with an AttributeError.
               for field in ('title', 'language', 'identifier'):
                   value = getattr(metadata, field, None)
                   if not str(value or '').strip():
                       self.issues.append(f"Missing or empty {field}")

           except Exception as e:
               self.issues.append(f"Metadata error: {e}")

       def _check_manifest(self, doc):
           """Check that the manifest is non-empty and lists XHTML content."""
           try:
               manifest = doc.package.manifest
               if not manifest.items:
                   self.issues.append("Empty manifest")

               # Check for common content types
               # NOTE(review): assumes manifest.items is a mapping whose values
               # are dict-like items - confirm against the Manifest API.
               has_html = any(
                   item.get('media-type') == 'application/xhtml+xml'
                   for item in manifest.items.values()
               )
               if not has_html:
                   self.issues.append("No XHTML content files found")

           except Exception as e:
               self.issues.append(f"Manifest error: {e}")

       def _check_files(self, doc):
           """Flag archives with very few files or with files above 10MB."""
           try:
               files_info = doc.get_files_info()
               if len(files_info) < 3:  # At least container, package, and one content file
                   self.issues.append("Very few files in EPUB (possibly incomplete)")

               # Check for suspiciously large files
               for file_info in files_info:
                   if file_info['size'] > 10 * 1024 * 1024:  # 10MB
                       self.issues.append(f"Large file found: {file_info['path']} ({file_info['size']} bytes)")

           except Exception as e:
               self.issues.append(f"File check error: {e}")

       def get_report(self):
           """Generate health check report from the issues collected so far."""
           if not self.issues:
               return {"status": "healthy", "issues": []}
           else:
               return {"status": "issues_found", "issues": self.issues}

   # Usage
   checker = EPUBHealthChecker()
   report = checker.check_epub("book.epub")

   if report["status"] == "healthy":
       print("✓ EPUB is healthy!")
   else:
       print("⚠ Issues found:")
       for issue in report["issues"]:
           print(f"  - {issue}")

Metadata Management
-------------------

Standardizing Metadata
~~~~~~~~~~~~~~~~~~~~~~

**Scenario**: Clean and standardize metadata across your EPUB collection.

.. code-block:: python

   import re
   from epub_utils import Document

   class MetadataStandardizer:
       """Suggest metadata clean-ups for an EPUB; the file itself is never modified."""

       # YYYY, YYYY-MM or YYYY-MM-DD, optionally followed by a time part such
       # as the one used in ``dcterms:modified`` (2024-01-01T00:00:00Z).
       _DATE_RE = re.compile(
           r'\d{4}(-\d{2}(-\d{2}(T\d{2}:\d{2}(:\d{2}(\.\d+)?)?(Z|[+-]\d{2}:\d{2})?)?)?)?'
       )

       def __init__(self):
           self.language_codes = {
               'english': 'en',
               'spanish': 'es',
               'french': 'fr',
               'german': 'de',
               'italian': 'it'
               # Add more as needed
           }

       def analyze_metadata(self, epub_path):
           """Analyze and suggest metadata improvements.

           Args:
               epub_path: Path to the EPUB file.

           Returns:
               dict: ``file`` (the path passed in), ``current_metadata`` (the
               values that were examined) and ``suggestions`` (a list of
               human-readable strings, empty when nothing was found).
           """
           doc = Document(epub_path)
           metadata = doc.package.metadata
           suggestions = []

           # Check title
           title = getattr(metadata, 'title', '')
           if not title:
               suggestions.append("Missing title")
           elif len(title) > 200:
               suggestions.append("Title is very long (>200 chars)")
           elif title.isupper():
               suggestions.append("Title is all uppercase - consider title case")

           # Check author
           creator = getattr(metadata, 'creator', '')
           if not creator:
               suggestions.append("Missing author/creator")
           elif ',' not in creator and len(creator.split()) > 2:
               suggestions.append("Author name might need reformatting (Last, First)")

           # Check language
           language = getattr(metadata, 'language', '')
           if not language:
               suggestions.append("Missing language code")
           elif len(language) > 3:
               # Might be full language name instead of code
               lang_lower = language.lower()
               if lang_lower in self.language_codes:
                   suggestions.append(f"Use language code '{self.language_codes[lang_lower]}' instead of '{language}'")

           # Check identifier
           identifier = getattr(metadata, 'identifier', '')
           if not identifier:
               suggestions.append("Missing identifier")
           elif not self._is_valid_identifier(identifier):
               suggestions.append("Identifier format might be invalid")

           # Check date format
           date = getattr(metadata, 'date', '')
           if date and not self._is_valid_date(date):
               suggestions.append("Date should be in YYYY or YYYY-MM-DD format")

           return {
               'file': epub_path,
               'current_metadata': {
                   'title': title,
                   'creator': creator,
                   'language': language,
                   'identifier': identifier,
                   'date': date
               },
               'suggestions': suggestions
           }

       def _is_valid_date(self, date):
           """Return True when the whole string is an ISO-style date.

           The complete value must match: a prefix test would accept anything
           that merely starts with four digits (``2024/05/01``, ``20240501``).
           """
           return self._DATE_RE.fullmatch(date.strip()) is not None

       def _is_valid_identifier(self, identifier):
           """Check if identifier looks valid (ISBN, UUID or DOI form)."""
           patterns = [
               r'urn:isbn:(?:\d{9}[\dx]|\d{13})',  # ISBN URN
               r'isbn:(?:\d{9}[\dx]|\d{13})',      # Simple ISBN
               r'urn:uuid:[a-f0-9-]{36}',          # UUID URN
               r'doi:10\.\d+/.+',                  # DOI
               r'urn:doi:10\.\d+/.+'               # DOI URN
           ]

           # fullmatch: trailing junk after a valid-looking prefix is invalid
           candidate = identifier.strip()
           return any(re.fullmatch(pattern, candidate, re.I) for pattern in patterns)

   # Usage
   standardizer = MetadataStandardizer()
   analysis = standardizer.analyze_metadata("book.epub")

   print(f"Analyzing: {analysis['file']}")
   if analysis['suggestions']:
       print("Suggestions for improvement:")
       for suggestion in analysis['suggestions']:
           print(f"  - {suggestion}")
   else:
       print("Metadata looks good!")

Content Analysis and Statistics
-------------------------------

Reading Level Analysis
~~~~~~~~~~~~~~~~~~~~~~

**Scenario**: Analyze EPUB content to determine reading complexity.

.. code-block:: python

   import re
   import math
   from epub_utils import Document

   class ReadingLevelAnalyzer:
       """Estimate the reading difficulty of an EPUB with the Flesch formulas."""

       def analyze_epub(self, epub_path):
           """Analyze reading level of an EPUB.

           Args:
               epub_path: Path to the EPUB file.

           Returns:
               dict: Word, sentence and syllable counts, their averages, the
               two Flesch scores and a textual ``reading_level``; or
               ``{"error": "No readable text found"}`` when the book yields
               no countable words.
           """
           doc = Document(epub_path)

           # Get all text content
           all_text = self._extract_all_text(doc)

           if not all_text.strip():
               return {"error": "No readable text found"}

           # Calculate statistics
           stats = self._calculate_text_stats(all_text)

           # Text made only of symbols has no words; every ratio below
           # divides by the word count.
           if stats['words'] == 0:
               return {"error": "No readable text found"}

           # Calculate reading level scores
           flesch_score = self._flesch_reading_ease(stats)
           flesch_grade = self._flesch_kincaid_grade(stats)

           return {
               'title': getattr(doc.package.metadata, 'title', 'Unknown'),
               'word_count': stats['words'],
               'sentence_count': stats['sentences'],
               'syllable_count': stats['syllables'],
               'avg_words_per_sentence': round(stats['words'] / stats['sentences'], 2),
               'avg_syllables_per_word': round(stats['syllables'] / stats['words'], 2),
               'flesch_reading_ease': round(flesch_score, 2),
               'flesch_kincaid_grade': round(flesch_grade, 2),
               'reading_level': self._interpret_flesch_score(flesch_score)
           }

       def _extract_all_text(self, doc):
           """Extract all readable text from EPUB.

           Concatenates the plain text of every (X)HTML file in the archive.
           Files are taken in archive order, which is sufficient for
           statistics. Returns an empty string when extraction fails.
           """
           try:
               chunks = []
               # NOTE(review): assumes get_files_info() yields dicts with a
               # 'path' key and get_file_by_path() returns an object offering
               # to_plain() for XHTML files - confirm against the Document API.
               for file_info in doc.get_files_info():
                   path = file_info['path']
                   if not path.lower().endswith(('.xhtml', '.html', '.htm')):
                       continue
                   content = doc.get_file_by_path(path)
                   if hasattr(content, 'to_plain'):
                       chunks.append(content.to_plain())
               return "\n".join(chunks)
           except Exception:
               return ""

       def _calculate_text_stats(self, text):
           """Calculate basic text statistics.

           Returns:
               dict: ``words``, ``sentences`` (never less than 1) and
               ``syllables``.
           """
           # Clean text: keep word characters, whitespace and sentence enders
           text = re.sub(r'[^\w\s\.\!\?]', '', text)

           # Count words; a stray run of punctuation is not a word
           words = sum(1 for token in text.split() if re.search(r'\w', token))

           # Count sentences
           sentences = len(re.findall(r'[.!?]+', text))
           if sentences == 0:
               sentences = 1  # Avoid division by zero

           # Count syllables (simplified)
           syllables = self._count_syllables(text)

           return {
               'words': words,
               'sentences': sentences,
               'syllables': syllables
           }

       def _count_syllables(self, text):
           """Simplified syllable counting based on vowel groups."""
           words = text.lower().split()
           syllable_count = 0

           for word in words:
               word = re.sub(r'[^a-z]', '', word)
               if word:
                   # One syllable per run of consecutive vowels
                   vowels = 'aeiouy'
                   syllables = sum(1 for i, char in enumerate(word)
                                 if char in vowels and (i == 0 or word[i-1] not in vowels))
                   # A trailing 'e' is usually silent
                   if word.endswith('e') and syllables > 1:
                       syllables -= 1
                   syllable_count += max(1, syllables)

           return syllable_count

       def _flesch_reading_ease(self, stats):
           """Calculate Flesch Reading Ease score."""
           return (206.835 -
                   (1.015 * (stats['words'] / stats['sentences'])) -
                   (84.6 * (stats['syllables'] / stats['words'])))

       def _flesch_kincaid_grade(self, stats):
           """Calculate Flesch-Kincaid Grade Level."""
           return ((0.39 * (stats['words'] / stats['sentences'])) +
                   (11.8 * (stats['syllables'] / stats['words'])) - 15.59)

       def _interpret_flesch_score(self, score):
           """Interpret Flesch Reading Ease score."""
           if score >= 90:
               return "Very Easy (5th grade)"
           elif score >= 80:
               return "Easy (6th grade)"
           elif score >= 70:
               return "Fairly Easy (7th grade)"
           elif score >= 60:
               return "Standard (8th-9th grade)"
           elif score >= 50:
               return "Fairly Difficult (10th-12th grade)"
           elif score >= 30:
               return "Difficult (College level)"
           else:
               return "Very Difficult (Graduate level)"

   # Usage
   analyzer = ReadingLevelAnalyzer()
   analysis = analyzer.analyze_epub("book.epub")

   # analyze_epub() returns {"error": ...} when no readable text is found
   if 'error' in analysis:
       print(f"Analysis failed: {analysis['error']}")
   else:
       print(f"Reading Level Analysis for: {analysis['title']}")
       print(f"Word Count: {analysis['word_count']:,}")
       print(f"Reading Level: {analysis['reading_level']}")
       print(f"Flesch-Kincaid Grade: {analysis['flesch_kincaid_grade']}")

Direct File Access and Extraction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

**Scenario**: Extract specific files from EPUB archives for processing or analysis.

**CLI Approach**:

.. code-block:: bash

   #!/bin/bash
   # extract-epub-assets.sh - Extract and process EPUB content files
   #
   # Usage: extract-epub-assets.sh <book.epub>

   # Fail early with a usage hint instead of running epub-utils on an empty path
   if [ $# -lt 1 ] || [ ! -f "$1" ]; then
       echo "Usage: $0 <book.epub>" >&2
       exit 1
   fi

   epub_file="$1"
   output_dir="extracted_content"

   mkdir -p "$output_dir"

   echo "Extracting content from: $epub_file"

   # Get list of all XHTML content files
   # (IFS= keeps leading/trailing whitespace in archive paths intact)
   epub-utils "$epub_file" files --format raw | grep '\.xhtml$' | while IFS= read -r file_path; do
       echo "Processing: $file_path"

       # Flatten the archive path into a single file name
       safe_name=$(echo "$file_path" | tr '/' '_')

       # Extract plain text content
       epub-utils "$epub_file" files "$file_path" --format plain > "$output_dir/${safe_name}.txt"

       # Extract styled HTML content
       epub-utils "$epub_file" files "$file_path" --format raw > "$output_dir/${safe_name}.html"
   done

   # Extract CSS files for styling reference
   epub-utils "$epub_file" files --format raw | grep '\.css$' | while IFS= read -r css_path; do
       echo "Extracting CSS: $css_path"
       safe_name=$(echo "$css_path" | tr '/' '_')
       epub-utils "$epub_file" files "$css_path" > "$output_dir/${safe_name}"
   done

   echo "Extraction complete! Files saved to $output_dir/"

**Comparing files vs content commands**:

.. code-block:: bash

   # Using files command (direct path access)
   epub-utils book.epub files OEBPS/chapter1.xhtml --format plain
   epub-utils book.epub files OEBPS/styles/main.css
   epub-utils book.epub files META-INF/container.xml
   
   # Using content command (requires manifest item ID)
   epub-utils book.epub manifest | grep chapter1  # Find the ID first
   epub-utils book.epub content chapter1-id --format plain

**Key advantages of the files command**:

- **Direct access**: Use actual file paths without needing manifest IDs
- **Universal file access**: Access any file type (XHTML, CSS, XML, images, etc.)
- **Simpler automation**: No need to parse manifest to find item IDs
- **Better for file-system-based workflows**: Mirrors actual EPUB structure

**Python equivalent using API**:

.. code-block:: python

   from epub_utils import Document

   def extract_file_content(epub_path, file_path):
       """Read one file out of an EPUB and return its content as a dict.

       Objects that expose ``to_plain`` are returned in three renderings
       (``raw_html``, ``plain_text``, ``formatted_xml``); anything else is
       returned untouched under ``raw_content``. A ``ValueError`` raised
       while fetching or rendering is reported as ``{'error': message}``.
       """
       document = Document(epub_path)

       try:
           member = document.get_file_by_path(file_path)

           # No to_plain(): hand the value back untouched
           if not hasattr(member, 'to_plain'):
               return {'raw_content': member}

           # Offers to_plain(): return all three renderings
           return {
               'raw_html': member.to_str(),
               'plain_text': member.to_plain(),
               'formatted_xml': member.to_xml(pretty_print=True),
           }

       except ValueError as exc:
           return {'error': str(exc)}

   # Usage

   # Extract chapter content
   chapter_content = extract_file_content("book.epub", "OEBPS/chapter1.xhtml")
   if 'plain_text' in chapter_content:
       print(f"Chapter text: {chapter_content['plain_text'][:200]}...")

   # Extract CSS for styling analysis
   css_content = extract_file_content("book.epub", "OEBPS/styles/main.css")
   if 'raw_content' in css_content:
       # Every rule block opens with one '{', so count the braces directly;
       # len(text.split('{')) would report one more than that.
       rule_count = css_content['raw_content'].count('{')
       print(f"CSS rules: {rule_count} rules found")

Automation and Workflows
-------------------------

Automated EPUB Processing Pipeline
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

**Scenario**: Set up an automated pipeline for processing new EPUB files.

.. code-block:: python

   import json
   import os
   import re
   import shutil
   from datetime import datetime
   from pathlib import Path

   from epub_utils import Document

   class EPUBProcessor:
       """Batch pipeline that catalogues EPUB files and files them by author.

       For every ``*.epub`` in ``input_dir`` the processor extracts metadata,
       runs a basic validation, copies the book to
       ``output_dir/<author>/<author> - <title> (<year>).epub`` and finally
       moves the original into ``processed_dir`` (prefixed with ``ERROR_``
       when processing failed). A JSON report of the run is written to
       ``output_dir``.
       """

       def __init__(self, input_dir, output_dir, processed_dir):
           """Store the three working directories and create the two output ones."""
           self.input_dir = Path(input_dir)
           self.output_dir = Path(output_dir)
           self.processed_dir = Path(processed_dir)

           # parents=True so nested target paths also work on a fresh machine
           self.output_dir.mkdir(parents=True, exist_ok=True)
           self.processed_dir.mkdir(parents=True, exist_ok=True)

       def process_new_files(self):
           """Process all new EPUB files in the input directory.

           Returns:
               A list with one result dict per file; empty when the input
               directory holds no ``*.epub`` files.
           """
           # Sorted so that runs (and their reports) are reproducible
           epub_files = sorted(self.input_dir.glob("*.epub"))

           if not epub_files:
               print("No EPUB files found to process")
               return []

           print(f"Found {len(epub_files)} EPUB files to process")

           results = [self.process_single_file(epub_path) for epub_path in epub_files]

           # Generate processing report
           self.generate_report(results)

           return results

       def process_single_file(self, epub_path):
           """Process a single EPUB file and return a result dict.

           The dict always has ``original_path``, ``status`` (``'success'`` or
           ``'error'``) and ``processed_at``; successful runs add ``new_path``,
           ``metadata``, ``validation`` and ``file_info``, failed runs add
           ``error``.
           """
           print(f"Processing: {epub_path.name}")

           try:
               doc = Document(str(epub_path))

               metadata = self.extract_metadata(doc)
               validation_result = self.validate_epub(doc)
               file_info = self.generate_file_info(epub_path, doc)

               # Copy into the organized library under a normalized name
               new_filename = self.create_organized_filename(metadata)
               organized_path = self.organize_file(epub_path, new_filename, metadata)

               result = {
                   'original_path': str(epub_path),
                   'new_path': str(organized_path),
                   'status': 'success',
                   'metadata': metadata,
                   'validation': validation_result,
                   'file_info': file_info,
                   'processed_at': datetime.now().isoformat()
               }

               # Move original to processed directory
               processed_path = self.processed_dir / epub_path.name
               shutil.move(str(epub_path), str(processed_path))

               return result

           except Exception as e:
               # Deliberately broad: one bad book must not stop the batch
               result = {
                   'original_path': str(epub_path),
                   'status': 'error',
                   'error': str(e),
                   'processed_at': datetime.now().isoformat()
               }

               # The failure may have happened after the original was moved,
               # so only quarantine it if it is still where we found it.
               if epub_path.exists():
                   processed_path = self.processed_dir / f"ERROR_{epub_path.name}"
                   shutil.move(str(epub_path), str(processed_path))

               return result

       @staticmethod
       def _text(metadata, field):
           """Return metadata attribute ``field`` as a stripped string.

           Missing or ``None`` values become ``''`` so callers can chain
           string operations safely.
           """
           value = getattr(metadata, field, '')
           if value is None:
               return ''
           if isinstance(value, (list, tuple)):
               # NOTE(review): handles a metadata object that exposes repeated
               # elements as a sequence; confirm against the Metadata API.
               value = ', '.join(str(v) for v in value if v)
           return str(value).strip()

       @staticmethod
       def _clean_for_filename(text):
           """Reduce ``text`` to word characters, spaces and hyphens (max 50 chars)."""
           kept = re.sub(r'[^\w\s-]', '', text or '')
           # Collapse whitespace runs (tabs/newlines included) to single spaces
           return re.sub(r'\s+', ' ', kept).strip()[:50].strip()

       def extract_metadata(self, doc):
           """Extract standardized metadata as a dict of plain strings."""
           metadata = doc.package.metadata

           return {
               'title': self._text(metadata, 'title'),
               'author': self._text(metadata, 'creator'),
               'publisher': self._text(metadata, 'publisher'),
               'language': self._text(metadata, 'language'),
               'year': self.extract_year(self._text(metadata, 'date')),
               'identifier': self._text(metadata, 'identifier'),
               'subject': self._text(metadata, 'subject')
           }

       def extract_year(self, date_str):
           """Return the first four-digit run in ``date_str``, or ``''`` if none."""
           if not date_str:
               return ''
           match = re.search(r'\d{4}', str(date_str))
           return match.group(0) if match else ''

       def validate_epub(self, doc):
           """Basic EPUB validation.

           Returns:
               ``{'is_valid': bool, 'issues': [str, ...]}``.
           """
           issues = []

           try:
               metadata = doc.package.metadata

               if not self._text(metadata, 'title'):
                   issues.append('Missing title')
               if not self._text(metadata, 'creator'):
                   issues.append('Missing author')
               if not self._text(metadata, 'language'):
                   issues.append('Missing language')

               # Check for content
               items = doc.package.manifest.items
               # NOTE(review): accepts either a mapping of id -> item or a
               # plain list of items, and either spelling of the media-type
               # key; confirm against the Manifest API and simplify.
               entries = items.values() if hasattr(items, 'values') else items
               has_content = any(
                   (item.get('media-type') or item.get('media_type')) == 'application/xhtml+xml'
                   for item in entries
               )

               if not has_content:
                   issues.append('No content files found')

           except Exception as e:
               issues.append(f'Validation error: {e}')

           return {
               'is_valid': len(issues) == 0,
               'issues': issues
           }

       def generate_file_info(self, epub_path, doc):
           """Generate file-system information about the EPUB."""
           stat = epub_path.stat()

           return {
               'filename': epub_path.name,
               'size_bytes': stat.st_size,
               'size_mb': round(stat.st_size / (1024 * 1024), 2),
               'file_count': len(doc.get_files_info()),
               'modified': datetime.fromtimestamp(stat.st_mtime).isoformat()
           }

       def create_organized_filename(self, metadata):
           """Create ``"<author> - <title> (<year>).epub"`` from metadata.

           Falls back to ``Unknown_*`` placeholders when a field is empty or
           consists only of characters that are stripped for file names.
           """
           author = self._clean_for_filename(metadata['author']) or 'Unknown_Author'
           title = self._clean_for_filename(metadata['title']) or 'Unknown_Title'
           year = metadata['year'] or 'Unknown_Year'

           return f"{author} - {title} ({year}).epub"

       def organize_file(self, epub_path, new_filename, metadata):
           """Copy the EPUB into ``output_dir/<author>/`` and return the new path."""
           # The author comes from untrusted book metadata: sanitize it so it
           # cannot contain path separators or ".." and escape output_dir.
           author = self._clean_for_filename(metadata['author']) or 'Unknown_Author'
           author_dir = self.output_dir / author
           author_dir.mkdir(parents=True, exist_ok=True)

           final_path = author_dir / new_filename

           # Copy (not move): the caller still relocates the original afterwards
           shutil.copy2(str(epub_path), str(final_path))

           return final_path

       def generate_report(self, results):
           """Write a timestamped JSON report to ``output_dir`` and print a summary."""
           report_path = self.output_dir / f"processing_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

           summary = {
               'total_files': len(results),
               'successful': len([r for r in results if r['status'] == 'success']),
               'errors': len([r for r in results if r['status'] == 'error']),
               'generated_at': datetime.now().isoformat(),
               'results': results
           }

           with open(report_path, 'w', encoding='utf-8') as f:
               json.dump(summary, f, indent=2, ensure_ascii=False)

           print("Processing complete!")
           print(f"Successfully processed: {summary['successful']}")
           print(f"Errors: {summary['errors']}")
           print(f"Report saved to: {report_path}")

   # Usage
   # input_dir is scanned for *.epub files, output_dir receives the organized
   # copies and the JSON report, processed_dir receives the originals.
   processor = EPUBProcessor(
       input_dir="/path/to/new/epubs",
       output_dir="/path/to/organized/library", 
       processed_dir="/path/to/processed/files"
   )

   # One result dict per EPUB that was found
   results = processor.process_new_files()

Command-Line Power User Examples
--------------------------------

Advanced Shell Scripts
~~~~~~~~~~~~~~~~~~~~~~

**Complex metadata extraction with error handling**:

.. code-block:: bash

   #!/bin/bash
   # advanced-epub-analysis.sh - Write a JSON report describing every EPUB
   # found under a directory.
   #
   # Usage: advanced-epub-analysis.sh [directory]   (default: ./)

   set -euo pipefail

   EPUB_DIR="${1:-./}"
   OUTPUT_FILE="detailed_analysis.json"

   # Escape backslashes and double quotes so arbitrary text (file names,
   # titles) can be embedded in a JSON string. Control characters are not
   # handled; use a real JSON tool if your metadata may contain them.
   json_escape() {
       printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
   }

   echo "Starting advanced EPUB analysis..."
   echo "Directory: $EPUB_DIR"
   echo "Output: $OUTPUT_FILE"

   # Initialize JSON output
   echo "{\"analysis_date\": \"$(date -Iseconds)\", \"epubs\": [" > "$OUTPUT_FILE"

   first=true
   # The whole loop runs in a single pipeline subshell, so $first keeps its
   # value from one iteration to the next.
   find "$EPUB_DIR" -name "*.epub" -type f | while IFS= read -r epub; do
       echo "Analyzing: $(basename "$epub")"

       if [ "$first" = true ]; then
           first=false
       else
           echo "," >> "$OUTPUT_FILE"
       fi

       # Start JSON object for this EPUB
       echo '  {' >> "$OUTPUT_FILE"
       echo "    \"file\": \"$(json_escape "$epub")\"," >> "$OUTPUT_FILE"

       # Extract metadata with error handling
       if metadata=$(epub-utils "$epub" metadata --format kv 2>/dev/null); then
           echo "    \"metadata\": {" >> "$OUTPUT_FILE"

           # Emit one JSON member per "key: value" line on stdout and let sed
           # strip the comma from the last one before appending to the report.
           # (Appending to the file inside the loop would bypass sed and leave
           # a trailing comma, which is invalid JSON.)
           echo "$metadata" | while IFS=': ' read -r key value; do
               if [ -n "$key" ] && [ -n "$value" ]; then
                   echo "      \"$(json_escape "$key")\": \"$(json_escape "$value")\","
               fi
           done | sed '$s/,$//' >> "$OUTPUT_FILE"

           echo "    }," >> "$OUTPUT_FILE"
       else
           echo "    \"metadata\": null," >> "$OUTPUT_FILE"
           echo "    \"metadata_error\": true," >> "$OUTPUT_FILE"
       fi

       # File analysis
       if file_info=$(epub-utils "$epub" files --format raw 2>/dev/null); then
           # tr removes the padding some wc implementations put before the number
           file_count=$(echo "$file_info" | wc -l | tr -d ' ')
           echo "    \"file_count\": $file_count," >> "$OUTPUT_FILE"
       else
           echo "    \"file_count\": null," >> "$OUTPUT_FILE"
       fi

       # File size (BSD stat first, then GNU stat, then give up with 0)
       size=$(stat -f%z "$epub" 2>/dev/null || stat -c%s "$epub" 2>/dev/null || echo "0")
       echo "    \"size_bytes\": $size," >> "$OUTPUT_FILE"

       # Validation check
       if epub-utils "$epub" container >/dev/null 2>&1 && \
          epub-utils "$epub" package >/dev/null 2>&1; then
           echo "    \"is_valid\": true" >> "$OUTPUT_FILE"
       else
           echo "    \"is_valid\": false" >> "$OUTPUT_FILE"
       fi

       echo "  }" >> "$OUTPUT_FILE"
   done

   # Close JSON
   echo "]}" >> "$OUTPUT_FILE"

   echo "Analysis complete! Results in $OUTPUT_FILE"

**Batch processing with parallel execution**:

.. code-block:: bash

   #!/bin/bash
   # parallel-epub-check.sh - Validate every EPUB under a directory, several
   # files at a time.
   #
   # Usage: parallel-epub-check.sh [directory]   (default: ./)

   EPUB_DIR="${1:-./}"
   MAX_JOBS=4

   # Check one EPUB: container and package must parse (otherwise return 1);
   # a missing title or creator only produces a warning.
   check_single_epub() {
       local epub="$1"
       local base metadata
       base=$(basename "$epub")

       echo "[$base] Starting check..."

       # Quick validation
       if ! epub-utils "$epub" container >/dev/null 2>&1; then
           echo "[$base] ❌ Invalid container"
           return 1
       fi

       if ! epub-utils "$epub" package >/dev/null 2>&1; then
           echo "[$base] ❌ Invalid package"
           return 1
       fi

       # Check for required metadata
       metadata=$(epub-utils "$epub" metadata --format kv 2>/dev/null)

       if ! echo "$metadata" | grep -q "^title:"; then
           echo "[$base] ⚠️  Missing title"
       fi

       if ! echo "$metadata" | grep -q "^creator:"; then
           echo "[$base] ⚠️  Missing author"
       fi

       echo "[$base] ✅ Check complete"
   }

   # Export only after the function exists: `export -f` on an undefined name
   # fails, and the bash children started by xargs would not find it.
   export -f check_single_epub

   # Run parallel checks. NUL-delimited names keep paths with spaces or
   # quotes intact; each bash child receives exactly one path as $1.
   find "$EPUB_DIR" -name "*.epub" -type f -print0 | \
   xargs -0 -n 1 -P "$MAX_JOBS" bash -c 'check_single_epub "$1"' _

Navigation and Table of Contents
--------------------------------

Working with EPUB Navigation Documents
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

**Scenario**: Extract and analyze navigation structures from both EPUB 2 and EPUB 3 files.

**CLI Approach - Version-Specific TOC Access**:

.. code-block:: bash

   #!/bin/bash
   # extract-navigation.sh - Extract navigation from EPUB files
   #
   # Usage: extract-navigation.sh <epub-file>

   EPUB_FILE="$1"

   if [ -z "$EPUB_FILE" ]; then
       echo "Usage: $0 <epub-file>"
       exit 1
   fi

   # Private scratch directory instead of fixed /tmp/nav.xml and /tmp/ncx.xml:
   # no clashes between concurrent runs, no predictable file names, and the
   # files are removed again when the script exits.
   WORK_DIR=$(mktemp -d) || exit 1
   trap 'rm -rf "$WORK_DIR"' EXIT
   NAV_XML="$WORK_DIR/nav.xml"
   NCX_XML="$WORK_DIR/ncx.xml"

   echo "Analyzing navigation in: $(basename "$EPUB_FILE")"
   echo "========================================"

   # Try EPUB 3 nav document first.
   # --format raw: the default format is meant for interactive viewing; raw
   # is the one intended for saving and piping into other tools.
   echo "Attempting EPUB 3 nav document extraction..."
   if epub-utils "$EPUB_FILE" toc --nav --format raw > "$NAV_XML" 2>/dev/null; then
       echo "✅ EPUB 3 nav document found"
       echo "Navigation structure:"
       # Extract navigation items (link text and target), first 10 only
       grep -o '<a[^>]*href="[^"]*"[^>]*>[^<]*</a>' "$NAV_XML" | \
       sed 's/<a[^>]*href="\([^"]*\)"[^>]*>\([^<]*\)<\/a>/  → \2 (\1)/' | \
       head -10

       # Count lines that contain a link
       nav_count=$(grep -c '<a[^>]*href=' "$NAV_XML")
       echo "Total navigation items: $nav_count"
   else
       echo "❌ No EPUB 3 nav document found"
   fi

   echo ""
   echo "Attempting EPUB 2 NCX extraction..."
   if epub-utils "$EPUB_FILE" toc --ncx --format raw > "$NCX_XML" 2>/dev/null; then
       echo "✅ EPUB 2 NCX document found"
       echo "Table of contents structure:"
       # Extract NCX navigation labels, first 10 only
       grep -o '<navLabel><text>[^<]*</text></navLabel>' "$NCX_XML" | \
       sed 's/<navLabel><text>\([^<]*\)<\/text><\/navLabel>/  → \1/' | \
       head -10

       # Count lines that open a navPoint
       ncx_count=$(grep -c '<navPoint' "$NCX_XML")
       echo "Total NCX navigation points: $ncx_count"
   else
       echo "❌ No EPUB 2 NCX document found"
   fi

   # Compare standard TOC with version-specific extracts
   # (this is a line count of the raw TOC output, not a parsed item count)
   echo ""
   echo "Standard TOC extraction:"
   standard_toc=$(epub-utils "$EPUB_FILE" toc --format raw 2>/dev/null | wc -l)
   echo "Standard TOC items: $standard_toc"

**Python Approach - Advanced Navigation Analysis**:

.. code-block:: python

   from epub_utils import Document
   import xml.etree.ElementTree as ET
   from pathlib import Path
   
   class NavigationAnalyzer:
       """Print a report on the navigation documents (nav, NCX, TOC) of one EPUB."""

       XHTML_NS = 'http://www.w3.org/1999/xhtml'
       EPUB_OPS_NS = 'http://www.idpf.org/2007/ops'
       NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'

       def __init__(self, epub_path):
           self.doc = Document(epub_path)
           self.epub_path = Path(epub_path)

       def analyze_navigation(self):
           """Comprehensive navigation analysis, printed to stdout."""
           print(f"Analyzing: {self.epub_path.name}")
           print("=" * 50)

           # Check EPUB version
           version = getattr(self.doc.package.metadata, 'version', 'unknown')
           print(f"EPUB Version: {version}")
           print()

           # Analyze EPUB 3 nav document
           self._analyze_nav_document()

           # Analyze EPUB 2 NCX document
           self._analyze_ncx_document()

           # Compare with standard TOC
           self._analyze_standard_toc()

       def _analyze_nav_document(self):
           """Analyze EPUB 3 navigation document."""
           print("EPUB 3 Navigation Document Analysis:")
           print("-" * 40)

           try:
               nav_content = self.doc.nav
               if nav_content:
                   print("✅ Nav document found")

                   # Parse navigation structure
                   nav_items = self._parse_nav_structure(nav_content)
                   print(f"Navigation items found: {len(nav_items)}")

                   # Show hierarchy
                   print("\nNavigation hierarchy:")
                   for item in nav_items[:10]:  # Show first 10
                       indent = "  " * item['level']
                       print(f"{indent}→ {item['title']} ({item['href']})")

                   if len(nav_items) > 10:
                       print(f"  ... and {len(nav_items) - 10} more items")

               else:
                   print("❌ No nav document found")

           except Exception as e:
               print(f"❌ Error accessing nav document: {e}")
           print()

       def _analyze_ncx_document(self):
           """Analyze EPUB 2 NCX document."""
           print("EPUB 2 NCX Document Analysis:")
           print("-" * 30)

           try:
               ncx_content = self.doc.ncx
               if ncx_content:
                   print("✅ NCX document found")

                   # Parse NCX structure
                   ncx_items = self._parse_ncx_structure(ncx_content)
                   print(f"NCX navigation points: {len(ncx_items)}")

                   # Show structure
                   print("\nNCX structure:")
                   for item in ncx_items[:10]:  # Show first 10
                       indent = "  " * item['level']
                       print(f"{indent}→ {item['title']} ({item['src']})")

                   if len(ncx_items) > 10:
                       print(f"  ... and {len(ncx_items) - 10} more items")

               else:
                   print("❌ No NCX document found")

           except Exception as e:
               print(f"❌ Error accessing NCX document: {e}")
           print()

       def _analyze_standard_toc(self):
           """Analyze standard TOC extraction."""
           print("Standard TOC Analysis:")
           print("-" * 22)

           try:
               toc = self.doc.get_toc()
               # Fetch once and reuse for both the count and the preview
               nav_items = toc.get_nav_items()
               print(f"✅ Standard TOC items: {len(nav_items)}")

               # Show some items
               print("\nStandard TOC items:")
               for item in nav_items[:5]:
                   print(f"  → {item.title} ({item.href})")

           except Exception as e:
               print(f"❌ Error with standard TOC: {e}")
           print()

       @staticmethod
       def _xml_source(content):
           """Return ``content`` in a form ``ET.fromstring`` accepts.

           ``str``/``bytes`` pass through unchanged; any other object is
           converted with its ``to_str()`` method when it has one, else with
           ``str()``.
           NOTE(review): confirm what ``Document.nav`` / ``Document.ncx``
           return and simplify accordingly.
           """
           if isinstance(content, (str, bytes)):
               return content
           if hasattr(content, 'to_str'):
               return content.to_str()
           return str(content)

       def _parse_nav_structure(self, nav_content):
           """Parse an EPUB 3 nav document into a flat, ordered list.

           Returns:
               A list of ``{'title', 'href', 'level'}`` dicts in document
               order, where ``level`` is the nesting depth (0 = top level).
               Empty when the XML cannot be parsed or has no nav list.
           """
           items = []
           namespaces = {'xhtml': self.XHTML_NS}

           try:
               root = ET.fromstring(self._xml_source(nav_content))
           except ET.ParseError as e:
               print(f"Warning: Could not parse nav XML: {e}")
               return items

           def parse_nav_list(ol_element, level=0):
               # Direct children only: deeper entries are reached through the
               # recursion below, which is what gives them the right level
               # (a './/' search here would list every descendant twice).
               for li in ol_element.findall('xhtml:li', namespaces):
                   a_elem = li.find('xhtml:a', namespaces)
                   if a_elem is not None:
                       items.append({
                           # itertext() also picks up text inside nested markup
                           'title': ''.join(a_elem.itertext()).strip(),
                           'href': a_elem.get('href', ''),
                           'level': level
                       })

                   # Descend even when this entry is an unlinked heading
                   nested_ol = li.find('xhtml:ol', namespaces)
                   if nested_ol is not None:
                       parse_nav_list(nested_ol, level + 1)

           # ElementTree's XPath subset has no attribute wildcard, so select
           # the <nav epub:type="toc"> element by inspecting the attribute.
           epub_type = f'{{{self.EPUB_OPS_NS}}}type'
           nav_elems = list(root.iter(f'{{{self.XHTML_NS}}}nav'))
           nav_elem = next(
               (nav for nav in nav_elems if 'toc' in (nav.get(epub_type) or '').split()),
               nav_elems[0] if nav_elems else None  # fall back to the first <nav>
           )

           if nav_elem is not None:
               ol_elem = nav_elem.find('.//xhtml:ol', namespaces)
               if ol_elem is not None:
                   parse_nav_list(ol_elem)

           return items

       def _parse_ncx_structure(self, ncx_content):
           """Parse an EPUB 2 NCX document into a flat, ordered list.

           Returns:
               A list of ``{'title', 'src', 'level'}`` dicts in document
               order, where ``level`` is the navPoint nesting depth.
               Empty when the XML cannot be parsed or has no navMap.
           """
           items = []
           namespaces = {'ncx': self.NCX_NS}

           try:
               root = ET.fromstring(self._xml_source(ncx_content))
           except ET.ParseError as e:
               print(f"Warning: Could not parse NCX XML: {e}")
               return items

           def parse_nav_point(nav_point, level=0):
               # Label: an empty <text/> element has text None
               nav_label = nav_point.find('ncx:navLabel/ncx:text', namespaces)
               title = (nav_label.text or "") if nav_label is not None else ""

               # Target document
               content = nav_point.find('ncx:content', namespaces)
               src = content.get('src', '') if content is not None else ""

               items.append({
                   'title': title.strip(),
                   'src': src,
                   'level': level
               })

               # Process child nav points
               for child_nav_point in nav_point.findall('ncx:navPoint', namespaces):
                   parse_nav_point(child_nav_point, level + 1)

           # Walk all top-level navigation points
           nav_map = root.find('ncx:navMap', namespaces)
           if nav_map is not None:
               for nav_point in nav_map.findall('ncx:navPoint', namespaces):
                   parse_nav_point(nav_point)

           return items
   
   # Usage examples
   def analyze_single_epub(epub_path):
       """Print the full navigation report for one EPUB file."""
       NavigationAnalyzer(epub_path).analyze_navigation()
   
   def compare_navigation_across_epubs(epub_directory):
       """Print a table comparing navigation features of every EPUB in a directory.

       Files that cannot be processed are reported on their own line and
       left out of the table.
       """
       # Sorted so the table rows come out in a stable order
       epub_files = sorted(Path(epub_directory).glob("*.epub"))

       print(f"Comparing navigation across {len(epub_files)} EPUB files")
       print("=" * 60)

       results = []

       for epub_path in epub_files:
           try:
               doc = Document(str(epub_path))

               results.append({
                   'file': epub_path.name,
                   # Which navigation documents are available
                   'has_nav': bool(doc.nav),
                   'has_ncx': bool(doc.ncx),
                   'toc_items': len(doc.get_toc().get_nav_items()),
                   # Always a string: a None version would make the ':<8'
                   # formatting below raise after all files were processed.
                   'version': str(getattr(doc.package.metadata, 'version', None) or 'unknown')
               })

           except Exception as e:
               print(f"Error processing {epub_path.name}: {e}")

       # Print comparison table
       print(f"{'File':<30} {'Version':<8} {'Nav':<5} {'NCX':<5} {'TOC Items':<10}")
       print("-" * 65)

       for result in results:
           nav_mark = "✅" if result['has_nav'] else "❌"
           ncx_mark = "✅" if result['has_ncx'] else "❌"

           print(f"{result['file']:<30} {result['version']:<8} "
                 f"{nav_mark:<5} {ncx_mark:<5} {result['toc_items']:<10}")
   
   # Example usage (replace the placeholder paths with real ones)
   if __name__ == "__main__":
       # Analyze single file: prints the nav, NCX and standard TOC sections
       analyze_single_epub("/path/to/your/book.epub")
       
       # Compare multiple files: prints one table row per *.epub in the directory
       compare_navigation_across_epubs("/path/to/epub/collection")

Building Smart Reading Lists
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

**Scenario**: Create curated reading lists based on navigation complexity and structure.

.. code-block:: python

   from epub_utils import Document
   import json
   from pathlib import Path
   from collections import defaultdict
   
   class ReadingListBuilder:
       """Group a collection of EPUBs into reading lists by structural complexity."""

       def __init__(self):
           # Analysis records collected by build_reading_lists()
           self.books = []

       def analyze_book_complexity(self, epub_path):
           """Return a complexity record for one EPUB, or None if it cannot be analyzed.

           The record carries the path, title, author, TOC item count, XHTML
           file count, navigation flag, numeric score and level name.
           """
           try:
               document = Document(str(epub_path))

               # Navigation information
               nav_entry_count = len(document.get_toc().get_nav_items())
               rich_nav = bool(document.nav) or bool(document.ncx)

               # File structure information
               xhtml_count = sum(
                   1
                   for entry in document.get_files_info()
                   if entry['media_type'] == 'application/xhtml+xml'
               )

               score = self._calculate_complexity_score(nav_entry_count, xhtml_count, rich_nav)

               return {
                   'path': epub_path,
                   'title': getattr(document.package.metadata, 'title', ''),
                   'author': getattr(document.package.metadata, 'creator', ''),
                   'toc_items': nav_entry_count,
                   'html_files': xhtml_count,
                   'has_advanced_nav': rich_nav,
                   'complexity_score': score,
                   'complexity_level': self._get_complexity_level(score),
               }

           except Exception as exc:
               print(f"Error analyzing {epub_path}: {exc}")
               return None

       def _calculate_complexity_score(self, toc_items, html_files, has_advanced_nav):
           """Score structure: TOC size + XHTML file count + navigation bonus, capped at 100."""

           def points_for(count, tiers):
               # tiers are (exclusive lower bound, points), highest bound first
               return next((points for bound, points in tiers if count > bound), 0)

           total = points_for(toc_items, ((50, 30), (20, 20), (10, 10)))
           total += points_for(html_files, ((100, 25), (50, 15), (20, 10)))
           total += 15 if has_advanced_nav else 0

           return min(total, 100)

       def _get_complexity_level(self, score):
           """Map a score to 'Advanced' (>= 70), 'Intermediate' (>= 40) or 'Beginner'."""
           if score >= 70:
               return "Advanced"
           if score >= 40:
               return "Intermediate"
           return "Beginner"

       def build_reading_lists(self, epub_directory, output_file="reading_lists.json"):
           """Analyze every *.epub in a directory, write the lists as JSON, print a summary."""
           candidates = list(Path(epub_directory).glob("*.epub"))

           print(f"Analyzing {len(candidates)} EPUB files for reading lists...")

           # Collect a record for every book that could be analyzed
           for candidate in candidates:
               record = self.analyze_book_complexity(candidate)
               if record:
                   self.books.append(record)

           # Sort the records into named buckets
           buckets = defaultdict(list)
           for record in self.books:
               buckets["complexity_" + record['complexity_level'].lower()].append(record)

               if record['toc_items'] >= 20:
                   buckets['detailed_structure'].append(record)

               if record['has_advanced_nav']:
                   buckets['advanced_navigation'].append(record)

           def by_toc_items(record):
               return record['toc_items']

           def by_score(record):
               return record['complexity_score']

           def ranked(bucket, sort_key, limit, descending=False):
               # Sorted copy of one bucket, truncated to `limit` entries
               return sorted(buckets[bucket], key=sort_key, reverse=descending)[:limit]

           # Assemble the final lists (insertion order is the output order)
           reading_lists = {
               'beginner_friendly': {
                   'description': 'Books with simple structure, perfect for casual reading',
                   'books': ranked('complexity_beginner', by_toc_items, 10),
               },
               'intermediate_reads': {
                   'description': 'Well-structured books with moderate complexity',
                   'books': ranked('complexity_intermediate', by_score, 15),
               },
               'advanced_studies': {
                   'description': 'Complex books with rich navigation, ideal for research',
                   'books': ranked('complexity_advanced', by_score, 10, descending=True),
               },
               'detailed_references': {
                   'description': 'Books with detailed table of contents',
                   'books': ranked('detailed_structure', by_toc_items, 12, descending=True),
               },
               'enhanced_navigation': {
                   'description': 'Books with advanced navigation features',
                   'books': buckets['advanced_navigation'][:10],
               },
           }

           # default=str serializes the Path stored under 'path'
           with open(output_file, 'w', encoding='utf-8') as handle:
               json.dump(reading_lists, handle, indent=2, ensure_ascii=False, default=str)

           # Summary
           print("\nReading Lists Generated:")
           print("=" * 25)
           for list_name, list_data in reading_lists.items():
               print(f"{list_name}: {len(list_data['books'])} books")
               print(f"  → {list_data['description']}")

           print(f"\nSaved to: {output_file}")
           
   # Usage: analyzes every *.epub in the directory and writes
   # reading_lists.json (the default output_file) to the current directory
   builder = ReadingListBuilder()
   builder.build_reading_lists("/path/to/epub/collection")

These examples demonstrate the power and flexibility of ``epub-utils`` for various real-world scenarios. Whether you're managing a digital library, performing quality assurance, building automated workflows, or analyzing navigation structures, epub-utils provides the tools you need to work effectively with EPUB files.


================================================
FILE: docs/formats.rst
================================================
Output Formats Reference
========================

``epub-utils`` supports multiple output formats to suit different use cases. This guide explains each 
format with examples and best practices for when to use each one.

Overview
--------

All commands in ``epub-utils`` accept the ``--format`` option. The possible values are listed below; some of them are only available for specific commands:

- ``xml`` - Syntax-highlighted XML (default for most commands)
- ``raw`` - Unformatted, raw content
- ``kv`` - Key-value pairs (where supported)
- ``plain`` - Plain text with HTML tags stripped (``content`` and ``files`` commands)
- ``table`` - Formatted table (files command only)

Additionally, most commands support the ``--pretty-print`` option to format XML output with proper indentation and structure.

XML Format (Default)
--------------------

The XML format provides syntax-highlighted, pretty-printed XML output that's easy to read.

**When to use**: Interactive inspection, debugging, learning EPUB structure

**Example**:

.. code-block:: bash

   $ epub-utils book.epub metadata --format xml

**Output**:

.. code-block:: xml

   <?xml version="1.0" encoding="UTF-8"?>
   <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" 
             xmlns:opf="http://www.idpf.org/2007/opf">
     <dc:title>The Great Gatsby</dc:title>
     <dc:creator>F. Scott Fitzgerald</dc:creator>
     <dc:language>en</dc:language>
     <dc:identifier id="bookid">urn:uuid:12345678-1234-1234-1234-123456789abc</dc:identifier>
     <dc:publisher>Scribner</dc:publisher>
     <dc:date>2021-01-01</dc:date>
     <dc:subject>Fiction</dc:subject>
     <dc:subject>Classic Literature</dc:subject>
   </metadata>

**Features**:

- Color syntax highlighting
- Proper indentation
- Easy to read structure
- Preserves all XML attributes and namespaces

Raw Format
----------

The raw format outputs unprocessed content exactly as stored in the EPUB file.

**When to use**: Piping to other tools, automated processing, debugging XML issues

**Example**:

.. code-block:: bash

   $ epub-utils book.epub metadata --format raw

**Output**:

.. code-block:: xml

   <?xml version="1.0" encoding="UTF-8"?><metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf"><dc:title>The Great Gatsby</dc:title><dc:creator>F. Scott Fitzgerald</dc:creator><dc:language>en</dc:language><dc:identifier id="bookid">urn:uuid:12345678-1234-1234-1234-123456789abc</dc:identifier><dc:publisher>Scribner</dc:publisher><dc:date>2021-01-01</dc:date><dc:subject>Fiction</dc:subject><dc:subject>Classic Literature</dc:subject></metadata>

**Use cases**:

.. code-block:: bash

   # Pipe to xmllint for custom formatting
   $ epub-utils book.epub package --format raw | xmllint --format -

   # Extract specific elements with grep
   $ epub-utils book.epub manifest --format raw | grep 'media-type="text/css"'

   # Validate XML structure
   $ epub-utils book.epub toc --format raw | xmllint --valid -

Key-Value Format
----------------

The key-value format presents metadata as simple ``key: value`` pairs, perfect for scripting.

**When to use**: Shell scripting, automated data extraction, configuration files

**Supported commands**: ``metadata``

**Example**:

.. code-block:: bash

   $ epub-utils book.epub metadata --format kv

**Output**:

.. code-block:: text

   title: The Great Gatsby
   creator: F. Scott Fitzgerald
   language: en
   identifier: urn:uuid:12345678-1234-1234-1234-123456789abc
   publisher: Scribner
   date: 2021-01-01
   subject: Fiction, Classic Literature

**Scripting examples**:

.. code-block:: bash

   # Extract just the title
   title=$(epub-utils book.epub metadata --format kv | grep "^title:" | cut -d' ' -f2-)

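   # Get all metadata into shell variables (bash or zsh; keys must be valid
   # variable names, and values are assigned literally, never evaluated)
   while IFS= read -r line; do
       [[ $line == *': '* ]] || continue
       printf -v "meta_${line%%: *}" '%s' "${line#*: }"
   done < <(epub-utils book.epub metadata --format kv)
   echo "Book title: $meta_title"
   echo "Author: $meta_creator"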

   # Create a simple database
   echo "filename,title,author" > books.csv
   for epub in *.epub; do
       metadata=$(epub-utils "$epub" metadata --format kv)
       title=$(echo "$metadata" | grep "^title:" | cut -d' ' -f2- | tr ',' ';')
       author=$(echo "$metadata" | grep "^creator:" | cut -d' ' -f2- | tr ',' ';')
       echo "$epub,$title,$author" >> books.csv
   done

Plain Text Format
-----------------

The plain text format strips HTML tags and returns readable text content.

**When to use**: Content analysis, word counting, text extraction

**Supported commands**: ``content``, ``files`` (with file path)

**Example**:

.. code-block:: bash

   $ epub-utils book.epub content chapter1 --format plain

**Output**:

.. code-block:: text

   Chapter 1: The Beginning

   In my younger and more vulnerable years my father gave me some advice 
   that I've carried with me ever since. "Whenever you feel like criticizing 
   anyone," he told me, "just remember that all the people in this world 
   haven't had the advantages that you've had."

**Use cases**:

.. code-block:: bash

   # Count words in a chapter (using content command)
   word_count=$(epub-utils book.epub content chapter1 --format plain | wc -w)
   echo "Chapter 1 has $word_count words"

   # Extract all text for analysis (using files command)
   epub-utils book.epub files OEBPS/chapter1.xhtml --format plain > chapter1.txt

   # Search for specific content in any file
   if epub-utils book.epub files OEBPS/chapter2.xhtml --format plain | grep -q "important phrase"; then
       echo "Found the phrase in chapter 2"
   fi

   # Access files by path without knowing manifest IDs
   epub-utils book.epub files OEBPS/styles/main.css
   epub-utils book.epub files META-INF/container.xml

Table Format
------------

The table format presents file information in a readable tabular layout.

**When to use**: File analysis, human-readable file listings

**Supported commands**: ``files``

**Example**:

.. code-block:: bash

   $ epub-utils book.epub files --format table

**Output**:

.. code-block:: text

   File Information for book.epub
   ┌────────────────────────────────────────┬──────────┬──────────────┬─────────────────────┐
   │ Path                                   │ Size     │ Compressed   │ Modified            │
   ├────────────────────────────────────────┼──────────┼──────────────┼─────────────────────┤
   │ META-INF/container.xml                 │ 230 B    │ 140 B        │ 2021-01-01 10:00:00│
   │ OEBPS/content.opf                      │ 2.1 KB   │ 856 B        │ 2021-01-01 10:00:00│
   │ OEBPS/toc.ncx                          │ 1.8 KB   │ 542 B        │ 2021-01-01 10:00:00│
   │ OEBPS/Text/chapter01.xhtml             │ 12.4 KB  │ 3.2 KB       │ 2021-01-01 10:00:00│
   │ OEBPS/Text/chapter02.xhtml             │ 15.6 KB  │ 4.1 KB       │ 2021-01-01 10:00:00│
   │ OEBPS/Styles/stylesheet.css            │ 3.2 KB   │ 1.1 KB       │ 2021-01-01 10:00:00│
   │ OEBPS/Images/cover.jpg                 │ 145.2 KB │ 144.8 KB     │ 2021-01-01 10:00:00│
   └────────────────────────────────────────┴──────────┴──────────────┴─────────────────────┘

Command-Specific Format Support
-------------------------------

Here's a quick reference for which formats each command supports:

.. list-table:: Format Support by Command
   :header-rows: 1
   :widths: 20 15 15 15 15 15

   * - Command
     - XML
     - Raw
     - KV
     - Plain
     - Table
   * - ``container``
     - ✓
     - ✓
     - ✗
     - ✗
     - ✗
   * - ``package``
     - ✓
     - ✓
     - ✗
     - ✗
     - ✗
   * - ``toc``
     - ✓
     - ✓
     - ✗
     - ✗
     - ✗
   * - ``metadata``
     - ✓
     - ✓
     - ✓
     - ✗
     - ✗
   * - ``manifest``
     - ✓
     - ✓
     - ✗
     - ✗
     - ✗
   * - ``spine``
     - ✓
     - ✓
     - ✗
     - ✗
     - ✗
   * - ``content``
     - ✓
     - ✓
     - ✗
     - ✓
     - ✗
   * - ``files``
     - ✓*
     - ✓
     - ✗
     - ✓*
     - ✓*

.. note::
   \* For the ``files`` command: ``xml`` and ``plain`` formats are only available when specifying a file path, and ``table`` is only available when listing files. When listing files (no path specified), only ``table`` and ``raw`` formats are supported.

Advanced Format Usage
---------------------

Combining Formats with Shell Tools
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

**Pretty-print with custom tools**:

.. code-block:: bash

   # Use xmllint for custom XML formatting
   epub-utils book.epub package --format raw | xmllint --format --noblanks -

   # Convert to JSON using xq (if available)
   epub-utils book.epub metadata --format raw | xq '.'

**Processing key-value output**:

.. code-block:: bash

   # Convert to environment variables (spaces in values become underscores)
   export $(epub-utils book.epub metadata --format kv | sed 's/: /=/' | tr ' ' '_')
   echo "Title: $title"

   # Create YAML-like output
   epub-utils book.epub metadata --format kv | sed 's/^/  /' | sed '1i metadata:'

**Text analysis workflows**:

.. code-block:: bash

   # Analyze reading time (assuming 200 words per minute)
   words=$(epub-utils book.epub content chapter1 --format plain | wc -w)
   minutes=$((words / 200))
   echo "Chapter 1 reading time: $minutes minutes"

   # Extract quotes (lines starting with quotation marks)
   epub-utils book.epub content chapter1 --format plain | grep '^".*"$'

Format Selection Guidelines
---------------------------

Choose the right format based on your use case:

**For Human Reading**:

- Use ``xml`` for inspecting EPUB structure
- Use ``table`` for file listings
- Use ``plain`` for content reading

**For Automation**:

- Use ``raw`` for piping to other XML tools
- Use ``kv`` for simple scripting and data extraction
- Use ``raw`` with ``files`` for getting simple file lists

**For Integration**:

- Use ``raw`` when feeding into other programs
- Use ``kv`` for configuration file generation
- Use ``plain`` for text processing workflows

**Performance Considerations**:

- ``raw`` format is fastest (no syntax highlighting)
- ``xml`` format has slight overhead for highlighting
- ``table`` format requires additional formatting computation

Error Handling with Formats
----------------------------

Different formats handle errors differently:

.. code-block:: bash

   # XML format shows formatted error messages
   $ epub-utils corrupted.epub metadata --format xml
   Error: Unable to parse metadata

   # Raw format may show parsing errors directly
   $ epub-utils corrupted.epub metadata --format raw
   ParseError: Invalid XML structure

   # KV format gracefully handles missing fields
   $ epub-utils incomplete.epub metadata --format kv
   title: 
   creator: Unknown Author
   language: en

Custom Format Processing
------------------------

You can create custom output formats by post-processing the raw output:

.. code-block:: bash

   #!/bin/zsh
   # custom-json-format.sh - Convert metadata to JSON

   epub_file="$1"

   echo "{"
   epub-utils "$epub_file" metadata --format kv | while IFS=': ' read -r key value; do
       if [[ -n "$key" && -n "$value" ]]; then
           echo "  \"$key\": \"$value\","
       fi
   done | sed '$s/,$//'
   echo "}"

.. code-block:: bash

   #!/bin/zsh
   # custom-markdown-format.sh - Convert metadata to Markdown

   epub_file="$1"
   
   echo "# Book Information"
   echo ""
   
   epub-utils "$epub_file" metadata --format kv | while IFS=': ' read -r key value; do
       if [[ -n "$key" && -n "$value" ]]; then
           formatted_key=$(echo "$key" | sed 's/\b\w/\U&/g')  # Title case
           echo "**$formatted_key**: $value"
       fi
   done

Pretty-Print Option
-------------------

The ``--pretty-print`` (or ``-pp``) option enhances XML output by adding proper indentation and structure, making it more readable for human inspection.

**When to use**: Human review, debugging XML structure, cleaner output for documentation

**Supported formats**: ``xml`` and ``raw``

**Example without pretty-print**:

.. code-block:: bash

   $ epub-utils book.epub metadata --format raw

**Output**:

.. code-block:: xml

   <?xml version="1.0" encoding="UTF-8"?><metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf"><dc:title>The Great Gatsby</dc:title><dc:creator>F. Scott Fitzgerald</dc:creator><dc:language>en</dc:language></metadata>

**Example with pretty-print**:

.. code-block:: bash

   $ epub-utils book.epub metadata --format raw --pretty-print

**Output**:

.. code-block:: xml

   <?xml version="1.0" encoding="UTF-8"?>
   <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" 
             xmlns:opf="http://www.idpf.org/2007/opf">
     <dc:title>The Great Gatsby</dc:title>
     <dc:creator>F. Scott Fitzgerald</dc:creator>
     <dc:language>en</dc:language>
   </metadata>

**Use cases**:

.. code-block:: bash

   # Better readability for manual inspection
   epub-utils book.epub package --pretty-print
   
   # Clean output for documentation or examples
   epub-utils book.epub container --format raw --pretty-print
   
   # Pipe to file with proper formatting
   epub-utils book.epub toc --pretty-print > toc-formatted.xml

**Note**: Pretty-print has no effect on ``kv``, ``plain``, or ``table`` formats as these are already optimized for readability.

Best Practices
--------------

1. **Default to XML for interactive use** - it's the most readable
2. **Use raw for scripting** - it's the most reliable for automation
3. **Use kv for metadata extraction** - it's purpose-built for simple parsing
4. **Use plain for content analysis** - it removes HTML complexity
5. **Use pretty-print for human review** - it makes XML structure clearer
6. **Always handle errors** - EPUB files can be malformed
7. **Test with various EPUB files** - format output can vary with different EPUB structures

These format options make epub-utils flexible enough to handle everything from quick 
interactive inspection to complex automated workflows.


================================================
FILE: docs/index.rst
================================================
epub-utils: EPUB Inspection and Manipulation
=============================================

.. image:: https://img.shields.io/pypi/v/epub-utils.svg
   :target: https://pypi.org/project/epub-utils/
   :alt: PyPI version

.. image:: https://img.shields.io/pypi/pyversions/epub-utils.svg?logo=python&logoColor=white
   :target: https://pypi.org/project/epub-utils/
   :alt: Python versions

.. image:: https://img.shields.io/badge/license-Apache%202.0-blue.svg
   :target: https://github.com/ernestofgonzalez/epub-utils/blob/main/LICENSE
   :alt: License

**epub-utils** is a comprehensive Python library and command-line tool for working with EPUB files. 
It provides both a programmatic API and an intuitive CLI interface for inspecting and parsing EPUB archives.

.. note::
   epub-utils supports **EPUB 2.0.1** and **EPUB 3.0+** specifications, ensuring compatibility 
   with the vast majority of EPUB files in circulation.

Key Features
------------

**Rich CLI Interface**
   - Syntax-highlighted XML output
   - Multiple output formats (XML, raw, key-value, plain text)
   - Comprehensive file inspection capabilities

**Complete EPUB Support**
   - Parse container.xml and package files
   - Extract and display table of contents
   - Access manifest and spine information
   - Retrieve document content by ID

**Metadata Extraction**
   - Dublin Core metadata support
   - EPUB-specific metadata fields
   - Key-value output for easy parsing

**Python API**
   - Clean, object-oriented interface
   - Lazy loading for performance
   - Comprehensive error handling

Quick Start
-----------

Installation
~~~~~~~~~~~~

.. code-block:: bash

   $ pip install epub-utils

Basic CLI Usage
~~~~~~~~~~~~~~~

Inspect an EPUB file with a simple command:

.. code-block:: bash

   # Display metadata with beautiful syntax highlighting
   $ epub-utils my-book.epub metadata

   # Show table of contents structure
   $ epub-utils my-book.epub toc

   # Get key-value metadata for scripting
   $ epub-utils my-book.epub metadata --format kv

Basic Python Usage
~~~~~~~~~~~~~~~~~~

.. code-block:: python

   from epub_utils import Document

   # Load an EPUB document
   doc = Document("path/to/book.epub")

   # Access metadata easily
   print(f"Title: {doc.package.metadata.title}")
   print(f"Author: {doc.package.metadata.creator}")
   print(f"Language: {doc.package.metadata.language}")

   # Get table of contents
   toc_xml = doc.toc.to_xml()
   print(toc_xml)

Why epub-utils?
---------------

epub-utils fills a crucial gap in the Python ecosystem for EPUB file manipulation. While there are 
libraries for creating EPUBs, few focus on inspection and analysis. This tool is perfect for:

**Publishers and Authors**
   Validate EPUB structure and metadata before distribution

**Digital Librarians**
   Batch process and analyze EPUB collections

**Automation Scripts**
   Extract metadata for catalogs and databases

**Debugging**
   Inspect malformed or problematic EPUB files

**Learning**
   Understand EPUB structure and standards compliance

Documentation Contents
----------------------

.. toctree::
   :maxdepth: 2
   :caption: User Guide

   installation
   cli-tutorial
   api-tutorial
   examples
   formats

.. toctree::
   :maxdepth: 2
   :caption: Reference

   cli-reference
   api-reference
   epub-standards

.. toctree::
   :maxdepth: 1
   :caption: Development

   contributing
   changelog

Community & Support
-------------------

- **Source Code**: `GitHub Repository <https://github.com/ernestofgonzalez/epub-utils>`_
- **Issues**: `Bug Reports & Feature Requests <https://github.com/ernestofgonzalez/epub-utils/issues>`_
- **PyPI**: `Package Index <https://pypi.org/project/epub-utils/>`_

License
-------

``epub-utils`` is distributed under the `Apache License 2.0 <https://github.com/ernestofgonzalez/epub-utils/blob/main/LICENSE>`_.


================================================
FILE: docs/installation.rst
================================================
Installation Guide
==================

System Requirements
-------------------

``epub-utils`` requires Python 3.10 or higher and works on:

- **Linux** (Ubuntu 18.04+, Debian 10+, CentOS 7+, Fedora 30+)
- **macOS** (10.14+)
- **Windows** (Windows 10+)

Installing from PyPI
---------------------

The easiest way to install ``epub-utils`` is using pip:

.. code-block:: bash

   $ pip install epub-utils

This will install the latest stable version with all required dependencies.

Development Installation
------------------------

If you want to contribute to ``epub-utils`` or use the latest development version:

.. code-block:: bash

   # Clone the repository
   $ git clone https://github.com/ernestofgonzalez/epub-utils.git
   $ cd epub-utils

   # Create a virtual environment
   $ python -m venv env
   $ source env/bin/activate  # On Windows: env\Scripts\activate

   # Install in development mode
   $ pip install -e .

   # Install development dependencies
   $ pip install -r requirements/requirements-testing.txt
   $ pip install -r requirements/requirements-linting.txt

Virtual Environment Installation
--------------------------------

For isolated installations, we recommend using virtual environments:

Using venv (Python 3.3+)
~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: bash

   # Create virtual environment
   $ python -m venv epub-utils-env

   # Activate virtual environment
   $ source epub-utils-env/bin/activate  # Linux/macOS
   $ epub-utils-env\Scripts\activate     # Windows

   # Install epub-utils
   $ pip install epub-utils

Using conda
~~~~~~~~~~~

.. code-block:: bash

   # Create conda environment
   $ conda create -n epub-utils python=3.10

   # Activate environment
   $ conda activate epub-utils

   # Install epub-utils
   $ pip install epub-utils

Verifying Installation
----------------------

After installation, verify that ``epub-utils`` is working correctly:

.. code-block:: bash

   # Check version
   $ epub-utils --version

   # Test with a sample EPUB (if you have one)
   $ epub-utils sample.epub metadata

If you see the version number and can run commands without errors, the installation was successful!

Installing from Source
----------------------

To install from source code:

.. code-block:: bash

   # Download and extract the source
   $ wget https://github.com/ernestofgonzalez/epub-utils/archive/main.zip
   $ unzip main.zip
   $ cd epub-utils-main

   # Install
   $ pip install .

Upgrading
---------

To upgrade to the latest version:

.. code-block:: bash

   $ pip install --upgrade epub-utils

Uninstalling
------------

To remove epub-utils:

.. code-block:: bash

   $ pip uninstall epub-utils

Performance Considerations
--------------------------

Installing lxml
~~~~~~~~~~~~~~~

While not required, installing ``lxml`` can significantly improve XML parsing performance:

.. code-block:: bash

   $ pip install lxml

``epub-utils`` will automatically use lxml if available, falling back to the standard library's 
``xml.etree.ElementTree`` if not.


================================================
FILE: epub_utils/__init__.py
================================================
from epub_utils.container import Container
from epub_utils.doc import Document

__all__ = ['Document', 'Container']


================================================
FILE: epub_utils/__main__.py
================================================
from epub_utils.cli import main

# Executed when the package is run as a script (``python -m epub_utils``).
# prog_name is passed so the CLI identifies itself as "epub-utils".
if __name__ == '__main__':
	main(prog_name='epub-utils')


================================================
FILE: epub_utils/cli.py
================================================
import click

from epub_utils.doc import Document
from epub_utils.exceptions import (
	EPUBError,
	FileNotFoundError,
)

VERSION = '0.1.0a1'


def format_error_message(e: Exception) -> str:
	"""
	Build the text shown on the CLI for a failed operation.

	Args:
	    e (Exception): The exception to report.

	Returns:
	    str: ``str(e)``. EPUBError instances and all other exceptions are
	    rendered the same way here; any EPUB-specific formatting is whatever
	    the exception's own string conversion produces.
	"""
	return str(e)


def print_version(ctx, param, value):
	"""
	Callback for the ``--version`` flag: print VERSION, then exit the context.

	Nothing happens when the flag value is falsy or when the context reports
	``resilient_parsing``. ``param`` is accepted because click passes it to
	option callbacks; it is not used.
	"""
	flag_given = bool(value)
	if flag_given and not ctx.resilient_parsing:
		click.echo(VERSION)
		ctx.exit()


# Root command group of the CLI. The EPUB path is an argument of the group
# itself, so it is given before the subcommand name on the command line.
# NOTE: no docstring is added here on purpose; click would show it as help text.
@click.group(
	context_settings=dict(help_option_names=['-h', '--help']),
)
@click.option(
	'-v',
	'--version',
	is_flag=True,
	callback=print_version,
	expose_value=False,
	is_eager=True,
	help='Print epub-utils version.',
)
@click.argument(
	'path',
	type=click.Path(exists=True, file_okay=True),
	required=True,
)
@click.pass_context
def main(ctx, path):
	# Share the EPUB path with the subcommands through the context object;
	# each subcommand reads it back as ctx.obj['path'].
	ctx.ensure_object(dict)
	ctx.obj['path'] = path


def format_option(default='xml'):
	"""
	Build the shared ``-fmt/--format`` click option.

	Args:
	    default (str): Format used when the option is omitted; it is also
	        quoted in the option's help text.

	Returns:
	    The decorator produced by ``click.option``.
	"""
	allowed_formats = click.Choice(['raw', 'xml', 'plain', 'kv'], case_sensitive=False)
	help_text = f'Output format, defaults to {default}.'
	return click.option(
		'-fmt',
		'--format',
		type=allowed_formats,
		default=default,
		help=help_text,
	)


def pretty_print_option():
	"""
	Build the shared ``-pp/--pretty-print`` click flag.

	Returns:
	    The decorator produced by ``click.option``. The flag defaults to False.
	"""
	return click.option(
		'-pp',
		'--pretty-print',
		is_flag=True,
		default=False,
		# The format is called "raw" on the command line; there is no "str" format.
		help='Pretty-print XML output (only applies to raw and xml formats).',
	)


def output_document_part(doc, part_name, format, pretty_print=False):
	"""
	Print one part of a document (or package) in the requested format.

	Args:
	    doc: Object that exposes the part as an attribute.
	    part_name (str): Attribute name of the part, e.g. 'container' or 'metadata'.
	    format (str): Requested output format ('raw', 'xml', 'kv' or 'plain').
	    pretty_print (bool): Forwarded to the part's to_str()/to_xml() for the
	        'raw' and 'xml' formats.

	A format the part cannot produce ('kv' without a callable to_kv, 'plain'
	without a callable to_plain, or any other value) falls back to the raw
	output, preceded by a yellow notice, instead of printing nothing.
	"""
	part = getattr(doc, part_name)

	if format == 'raw':
		click.echo(part.to_str(pretty_print=pretty_print))
		return
	if format == 'xml':
		click.echo(part.to_xml(pretty_print=pretty_print))
		return

	# Optional renderers: use them only when the part actually provides them.
	if format == 'kv' and callable(getattr(part, 'to_kv', None)):
		click.echo(part.to_kv())
		return
	if format == 'plain' and callable(getattr(part, 'to_plain', None)):
		click.echo(part.to_plain())
		return

	# Unsupported combination: say so, then emit raw output so the command
	# never finishes silently with no output at all.
	label = {'kv': 'Key-value', 'plain': 'Plain text'}.get(format, str(format))
	click.secho(
		f'{label} format not supported for this document part. Falling back to raw:\n',
		fg='yellow',
	)
	click.echo(part.to_str())


def format_file_size(size_bytes: int) -> str:
	"""
	Format a byte count as a short human-readable string.

	Args:
	    size_bytes (int): Size in bytes.

	Returns:
	    str: Whole bytes as '<n> B'; larger sizes with one decimal place in
	    KB, MB or GB (1024-based). GB is the largest unit used.
	"""
	if size_bytes == 0:
		return '0 B'

	size_names = ['B', 'KB', 'MB', 'GB']
	last_unit = len(size_names) - 1
	i = 0
	size = float(size_bytes)

	# Compare the value as it will be displayed (one decimal place) so a size
	# just under a unit boundary is promoted to the next unit instead of being
	# printed as e.g. "1024.0 KB".
	while round(size, 1) >= 1024.0 and i < last_unit:
		size /= 1024.0
		i += 1

	if i == 0:
		return f'{int(size)} {size_names[i]}'
	return f'{size:.1f} {size_names[i]}'


def format_files_table(files_info: list) -> str:
	"""
	Format file information as a plain-text table.

	Args:
	    files_info (list): One dict per file with the keys 'path', 'size',
	        'compressed_size' and 'modified'.

	Returns:
	    str: A header line, a dashed separator and one row per file, joined by
	    newlines; or a fixed message when files_info is empty.
	"""
	if not files_info:
		return 'No files found in EPUB archive.'

	# Format each size once; the strings are needed for both widths and rows.
	sizes = [format_file_size(info['size']) for info in files_info]
	compressed_sizes = [format_file_size(info['compressed_size']) for info in files_info]

	# Every column is at least as wide as its header.
	path_width = max(len('Path'), *(len(info['path']) for info in files_info))
	size_width = max(len('Size'), *(len(text) for text in sizes))
	compressed_width = max(len('Compressed'), *(len(text) for text in compressed_sizes))
	modified_width = len('Modified')  # Last column: minimum width only

	header = f'{"Path":<{path_width}} | {"Size":>{size_width}} | {"Compressed":>{compressed_width}} | {"Modified":<{modified_width}}'

	rows = [
		f'{info["path"]:<{path_width}} | {size:>{size_width}} | {compressed:>{compressed_width}} | {info["modified"]:<{modified_width}}'
		for info, size, compressed in zip(files_info, sizes, compressed_sizes)
	]

	# 'modified' values can be wider than the 'Modified' header, so size the
	# separator from the widest line rather than from the header alone.
	separator = '-' * max(len(line) for line in [header, *rows])

	return '\n'.join([header, separator, *rows])


@main.command()
@format_option()
@pretty_print_option()
@click.pass_context
def container(ctx, format, pretty_print):
	"""Outputs the container information of the EPUB file."""

	def report_failure(heading, details):
		# Both lines go to stderr in red; only the heading is bold.
		click.secho(heading, fg='red', bold=True, err=True)
		click.secho(details, fg='red', err=True)

	try:
		epub = Document(ctx.obj['path'])
		output_document_part(epub, 'container', format, pretty_print)
	except EPUBError as epub_error:
		report_failure('EPUB Error:', format_error_message(epub_error))
		ctx.exit(1)
	except Exception as unexpected:
		report_failure('Unexpected Error:', str(unexpected))
		ctx.exit(1)


@main.command()
@format_option()
@pretty_print_option()
@click.pass_context
def package(ctx, format, pretty_print):
	"""Outputs the package information of the EPUB file."""
	# Report EPUB problems the same way the `container` command does (red
	# message on stderr, exit status 1) instead of letting them escape as a
	# traceback. Other exceptions still propagate unchanged.
	try:
		doc = Document(ctx.obj['path'])
		output_document_part(doc, 'package', format, pretty_print)
	except EPUBError as e:
		click.secho('EPUB Error:', fg='red', bold=True, err=True)
		click.secho(format_error_message(e), fg='red', err=True)
		ctx.exit(1)


@main.command()
@format_option()
@pretty_print_option()
@click.option(
	'--ncx',
	is_flag=True,
	default=False,
	help='Force retrieval of NCX file (EPUB 2 navigation control file).',
)
@click.option(
	'--nav',
	is_flag=True,
	default=False,
	help='Force retrieval of Navigation Document (EPUB 3 navigation file).',
)
@click.pass_context
def toc(ctx, format, pretty_print, ncx, nav):
	"""Outputs the Table of Contents (TOC) of the EPUB file."""
	# Reject the contradictory flag combination before touching the EPUB, so
	# the usage error is reported even when the file itself cannot be read.
	if ncx and nav:
		click.secho('Error: --ncx and --nav flags cannot be used together.', fg='red', err=True)
		ctx.exit(1)

	# EPUB problems are reported the same way the `container` command does.
	# ctx.exit() inside this block is not an EPUBError, so it passes through.
	try:
		doc = Document(ctx.obj['path'])

		if ncx:
			part = 'ncx'
			if doc.ncx is None:
				click.secho(
					'Error: This document does not include a Navigation Control eXtended (NCX).',
					fg='red',
					err=True,
				)
				ctx.exit(1)
		elif nav:
			part = 'nav'
			if doc.nav is None:
				click.secho(
					'Error: This document does not include an EPUB Navigation Document.',
					fg='red',
					err=True,
				)
				ctx.exit(1)
		else:
			# No flag given: use the document's own `toc` attribute.
			part = 'toc'

		output_document_part(doc, part, format, pretty_print)
	except EPUBError as e:
		click.secho('EPUB Error:', fg='red', bold=True, err=True)
		click.secho(format_error_message(e), fg='red', err=True)
		ctx.exit(1)


@main.command()
@format_option()
@pretty_print_option()
@click.pass_context
def metadata(ctx, format, pretty_print):
	"""Outputs the metadata information from the package file."""
	# Report EPUB problems the same way the `container` command does (red
	# message on stderr, exit status 1) instead of letting them escape as a
	# traceback. Other exceptions still propagate unchanged.
	try:
		doc = Document(ctx.obj['path'])
		package = doc.package
		output_document_part(package, 'metadata', format, pretty_print)
	except EPUBError as e:
		click.secho('EPUB Error:', fg='red', bold=True, err=True)
		click.secho(format_error_message(e), fg='red', err=True)
		ctx.exit(1)


@main.command()
@format_option()
@pretty_print_option()
@click.pass_context
def manifest(ctx, format, pretty_print):
	"""Outputs the manifest information from the package file."""
	# Report EPUB problems the same way the `container` command does (red
	# message on stderr, exit status 1) instead of letting them escape as a
	# traceback. Other exceptions still propagate unchanged.
	try:
		doc = Document(ctx.obj['path'])
		package = doc.package
		output_document_part(package, 'manifest', format, pretty_print)
	except EPUBError as e:
		click.secho('EPUB Error:', fg='red', bold=True, err=True)
		click.secho(format_error_message(e), fg='red', err=True)
		ctx.exit(1)


@main.command()
@format_option()
@pretty_print_option()
@click.pass_context
def spine(ctx, format, pretty_print):
	"""Outputs the spine information from the package file."""
	# Report EPUB problems the same way the `container` command does (red
	# message on stderr, exit status 1) instead of letting them escape as a
	# traceback. Other exceptions still propagate unchanged.
	try:
		doc = Document(ctx.obj['path'])
		package = doc.package
		output_document_part(package, 'spine', format, pretty_print)
	except EPUBError as e:
		click.secho('EPUB Error:', fg='red', bold=True, err=True)
		click.secho(format_error_message(e), fg='red', err=True)
		ctx.exit(1)


@main.command()
@click.argument('item_id', required=True)
@format_option()
@pretty_print_option()
@click.pass_context
def content(ctx, item_id, format, pretty_print):
	"""Outputs the content of a document by its manifest item ID."""
	# EPUB problems are reported the same way the `container` command does
	# (red message on stderr, exit status 1). Other exceptions still propagate.
	try:
		doc = Document(ctx.obj['path'])
		content = doc.find_content_by_id(item_id)

		if format == 'raw':
			click.echo(content.to_str())
		elif format == 'xml':
			if hasattr(content, 'to_xml'):
				click.echo(content.to_xml(pretty_print=pretty_print))
			else:
				click.echo(content.to_str())
		elif format == 'plain':
			# Guarded like the xml branch (and like the `files` command): a
			# content object without to_plain() falls back to its raw text
			# instead of raising AttributeError.
			if hasattr(content, 'to_plain'):
				click.echo(content.to_plain())
			else:
				click.echo(content.to_str())
		elif format == 'kv':
			click.secho(
				'Key-value format not supported for content documents. Falling back to raw:\n',
				fg='yellow',
			)
			click.echo(content.to_str())
	except EPUBError as e:
		click.secho('EPUB Error:', fg='red', bold=True, err=True)
		click.secho(format_error_message(e), fg='red', err=True)
		ctx.exit(1)


@main.command()
@click.argument('file_path', required=False)
@click.option(
	'-fmt',
	'--format',
	type=click.Choice(['table', 'raw', 'xml', 'plain', 'kv'], case_sensitive=False),
	default=None,
	help='Output format. For file listing: table, raw. For file content: raw, xml, plain, kv. Defaults to table for listing, xml for file content.',
)
@pretty_print_option()
@click.pass_context
def files(ctx, file_path, format, pretty_print):
	"""List all files in the EPUB archive with their metadata, or output content of a specific file."""
	doc = Document(ctx.obj['path'])

	# --format omitted: xml when a file was requested, table for the listing.
	if format is None:
		format = 'table' if not file_path else 'xml'

	if not file_path:
		# Listing mode: bare paths for 'raw', the table for everything else.
		entries = doc.get_files_info()
		if format == 'raw':
			for entry in entries:
				click.echo(f'{entry["path"]}')
			return
		if format in ['xml', 'plain', 'kv']:
			click.secho(
				f'{format.title()} format not supported for file listing. Using table format:\n',
				fg='yellow',
			)
		click.echo(format_files_table(entries))
		return

	# Single-file mode.
	try:
		content = doc.get_file_by_path(file_path)
	except FileNotFoundError as e:
		click.secho('FileNotFoundError:', fg='red', bold=True, err=True)
		click.secho(format_error_message(e), fg='red', err=True)
		ctx.exit(1)
		return

	if not hasattr(content, 'to_str'):
		# Content without to_str() is echoed as-is; the format is not applied.
		click.echo(content)
		return

	# Formats that cannot be applied to file content: notice, then raw output.
	fallback_notices = {
		'kv': 'Key-value format not supported for file content. Falling back to raw:\n',
		'table': 'Table format not supported for file content. Falling back to raw:\n',
	}

	if format in fallback_notices:
		click.secho(fallback_notices[format], fg='yellow')
		click.echo(content.to_str())
	elif format == 'xml' and hasattr(content, 'to_xml'):
		click.echo(content.to_xml(pretty_print=pretty_print))
	elif format == 'plain' and hasattr(content, 'to_plain'):
		click.echo(content.to_plain())
	elif format in ('raw', 'xml', 'plain'):
		# 'raw', or an xml/plain request the content object cannot serve.
		click.echo(content.to_str())


================================================
FILE: epub_utils/container.py
================================================
"""
Open Container Format: https://www.w3.org/TR/epub/#sec-ocf

This file includes the `Container` class, which is responsible for parsing the `container.xml` file
of an EPUB archive. The `container.xml` file is a required component of the EPUB Open Container
Format (OCF) and is located in the `META-INF` directory of the EPUB archive.

The `container.xml` file serves as the entry point for identifying the package document(s)
within the EPUB container. It must conform to the following structure as defined in the EPUB
specification:

- The root element is `<container>` and must include the `version` attribute with the value "1.0".
- The `<container>` element must contain exactly one `<rootfiles>` child element.
- The `<rootfiles>` element must contain one or more `<rootfile>` child elements.
- Each `<rootfile>` element must include a `full-path` attribute that specifies the location of
  the package document relative to the root of the EPUB container.

Namespace:
- All elements in the `container.xml` file are in the namespace
  `urn:oasis:names:tc:opendocument:xmlns:container`.

For more details on the structure and requirements of the `container.xml` file, refer to the
EPUB specification: https://www.w3.org/TR/epub/#sec-ocf
"""

try:
	from lxml import etree
except ImportError:
	import xml.etree.ElementTree as etree

from epub_utils.exceptions import InvalidEPUBError, ParseError
from epub_utils.printers import XMLPrinter


class Container:
	"""
	Parsed view of an EPUB's ``META-INF/container.xml`` document.

	The XML handed to the constructor is parsed immediately; the location of the
	package document is taken from the first ``rootfile`` element that is found.

	Attributes:
	    xml_content (str): The container.xml text exactly as it was passed in.
	    rootfile_path (str): Value of the ``full-path`` attribute of the first
	        ``rootfile`` element in the document.
	"""

	NAMESPACE = 'urn:oasis:names:tc:opendocument:xmlns:container'
	# Path expression matching any namespaced ``rootfile`` descendant.
	ROOTFILE_XPATH = './/{' + NAMESPACE + '}rootfile'

	def __init__(self, xml_content: str) -> None:
		"""
		Store the raw XML and parse it right away.

		Args:
		    xml_content (str): The raw XML content of the container.xml file.
		"""
		self.xml_content = xml_content
		self.rootfile_path: str = None

		# Parsing comes before the printer is attached, so a failed parse
		# propagates without a printer ever being created.
		self._parse(xml_content)

		self._printer = XMLPrinter(self)

	def __str__(self) -> str:
		"""Return the raw container.xml text unchanged."""
		return self.xml_content

	def to_str(self, *args, **kwargs) -> str:
		"""Forward all arguments to the printer's ``to_str``."""
		return self._printer.to_str(*args, **kwargs)

	def to_xml(self, *args, **kwargs) -> str:
		"""Forward all arguments to the printer's ``to_xml``."""
		return self._printer.to_xml(*args, **kwargs)

	def _find_rootfile_element(self, root: etree.Element) -> etree.Element:
		"""
		Locate the first ``rootfile`` element below ``root``.

		Args:
		    root (etree.Element): The root element of the parsed XML.

		Returns:
		    etree.Element: The first matching rootfile element; it is guaranteed
		    to carry a 'full-path' attribute.

		Raises:
		    InvalidEPUBError: If no rootfile element exists or it lacks the
		        'full-path' attribute.
		"""
		match = root.find(self.ROOTFILE_XPATH)

		if match is None:
			raise InvalidEPUBError(
				'Invalid container.xml: Missing rootfile element',
				suggestions=[
					'Ensure the container.xml contains a rootfile element',
					'Check that the container structure follows EPUB specifications',
					'Verify the EPUB was created with compliant tools',
				],
			)

		if 'full-path' in match.attrib:
			return match

		raise InvalidEPUBError(
			"Invalid container.xml: Missing 'full-path' attribute in rootfile element",
			suggestions=[
				"Ensure the rootfile element has a 'full-path' attribute",
				'Check that the container.xml follows EPUB specifications',
				'Verify the EPUB package structure is complete',
			],
		)

	def _parse(self, xml_content: str) -> None:
		"""
		Parse the container.xml data and record the rootfile path.

		Args:
		    xml_content (str): The raw XML content of the container.xml file.
		        Text is encoded to UTF-8 before parsing; any other value is
		        handed to the parser untouched.

		Raises:
		    ParseError: If the XML is invalid or cannot be parsed.
		    InvalidEPUBError: If the container.xml structure is invalid.
		"""
		payload = xml_content.encode('utf-8') if isinstance(xml_content, str) else xml_content

		try:
			document_root = etree.fromstring(payload)
		except etree.ParseError as e:
			raise ParseError(
				f'Invalid XML in container.xml: {str(e)}',
				suggestions=[
					'Check that the container.xml file contains valid XML',
					'Verify the file is not corrupted',
					'Ensure all XML tags are properly closed',
					'Check for invalid characters in the XML',
				],
			) from e

		full_path = self._find_rootfile_element(document_root).attrib['full-path']
		# The attribute is stored before validation, matching the order in
		# which the value is read and then checked for emptiness.
		self.rootfile_path = full_path

		if full_path.strip():
			return

		raise InvalidEPUBError(
			"Invalid container.xml: 'full-path' attribute is empty",
			suggestions=[
				"Ensure the rootfile element has a non-empty 'full-path' attribute",
				'Check that the path points to a valid OPF file',
				'Verify the EPUB package structure is complete',
			],
		)


================================================
FILE: epub_utils/content/__init__.py
================================================
from epub_utils.content.base import Content
from epub_utils.content.xhtml import XHTMLContent

__all__ = ['Content', 'XHTMLContent']


================================================
FILE: epub_utils/content/base.py
================================================
class Content:
	"""
	Common ancestor of the EPUB content document classes.

	It only records where a content file lives and what kind of file it is.

	Attributes:
	    media_type (str): The MIME type of the content.
	    href (str): The path to the content file within the EPUB.
	"""

	def __init__(self, media_type: str, href: str) -> None:
		"""Remember the media type and href supplied by the caller."""
		self.media_type, self.href = media_type, href


================================================
FILE: epub_utils/content/xhtml.py
================================================
import re

from lxml import etree

from epub_utils.content.base import Content
from epub_utils.exceptions import ParseError, UnsupportedFormatError
from epub_utils.printers import XMLPrinter


class XHTMLContent(Content):
	"""
	An XHTML content document taken from an EPUB file.

	The markup is parsed when the object is constructed; the parsed tree is
	available through ``tree`` and a whitespace-normalised text rendering
	through ``inner_text`` / ``to_plain``.
	"""

	# Media types accepted by the constructor.
	MEDIA_TYPES = ['application/xhtml+xml', 'text/html']

	def __init__(self, xml_content: str, media_type: str, href: str) -> None:
		"""
		Validate the media type, then parse the markup.

		Args:
		    xml_content (str): The markup of the content document.
		    media_type (str): Must be one of ``MEDIA_TYPES``.
		    href (str): The path to the content file within the EPUB.

		Raises:
		    UnsupportedFormatError: If ``media_type`` is not in ``MEDIA_TYPES``.
		    ParseError: If the markup cannot be parsed.
		"""
		self.xml_content = xml_content
		self._tree = None

		if media_type not in self.MEDIA_TYPES:
			supported = ', '.join(self.MEDIA_TYPES)
			raise UnsupportedFormatError(
				f"Media type '{media_type}' is not supported for XHTML content",
				suggestions=[
					f'Use one of the supported media types: {supported}',
					'Check that this is an XHTML content file',
					'Verify the manifest declares the correct media type',
				],
			)

		super().__init__(media_type, href)
		self._parse(xml_content)
		self._printer = XMLPrinter(self)

	def __str__(self) -> str:
		"""Return the markup exactly as it was supplied."""
		return self.xml_content

	def to_str(self, *args, **kwargs) -> str:
		"""Forward all arguments to the printer's ``to_str``."""
		return self._printer.to_str(*args, **kwargs)

	def to_xml(self, *args, **kwargs) -> str:
		"""Forward all arguments to the printer's ``to_xml``."""
		return self._printer.to_xml(*args, **kwargs)

	def to_plain(self) -> str:
		"""Return the same text as ``inner_text``."""
		return self.inner_text

	def _parse(self, xml_content: str) -> None:
		"""
		Parse ``xml_content`` (encoded as UTF-8) and cache the resulting tree.

		Raises:
		    ParseError: If the parser rejects the markup.
		"""
		encoded = xml_content.encode('utf-8')
		try:
			parsed = etree.fromstring(encoded)
		except etree.ParseError as e:
			raise ParseError(
				f'Invalid XML in XHTML content file: {str(e)}',
				suggestions=[
					'Check that the content file contains valid XHTML',
					'Verify the file is not corrupted',
					'Ensure all XML tags are properly closed',
					'Check for invalid characters in the XML',
				],
			) from e
		self._tree = parsed

	@property
	def tree(self):
		"""Return the cached tree, parsing ``xml_content`` first if none is cached."""
		if self._tree is not None:
			return self._tree
		self._parse(self.xml_content)
		return self._tree

	@property
	def inner_text(self) -> str:
		"""
		Text content of the first ``body`` element, or of the whole tree when
		no ``body`` element exists, with every whitespace run collapsed to a
		single space and the ends stripped.
		"""
		root = self.tree

		# local-name() makes the lookup independent of the namespace prefix.
		bodies = root.xpath('//*[local-name()="body"]')
		scope = bodies[0] if bodies else root

		joined = ''.join(scope.itertext())
		return re.sub(r'\s+', ' ', joined).strip()


================================================
FILE: epub_utils/doc.py
================================================
import os
import zipfile
from datetime import datetime
from functools import cached_property
from pathlib import Path
from typing import Dict, List, Optional, Union

from epub_utils.container import Container
from epub_utils.content import XHTMLContent
from epub_utils.exceptions import FileNotFoundError as EPUBFileNotFoundError
from epub_utils.exceptions import InvalidEPUBError
from epub_utils.navigation import EPUBNavDocNavigation, Navigation, NCXNavigation
from epub_utils.package import Package


class Document:
	"""
	Represents an EPUB document.

	The container, package and navigation documents are read from the archive
	lazily, the first time the corresponding property is accessed.

	Attributes:
	    path (Path): The path to the EPUB file.
	    _container (Container): The parsed container document (None until loaded).
	    _package (Package): The parsed package document (None until loaded).
	    _toc (Navigation): The navigation document chosen by ``toc`` (None until resolved).
	    _ncx (NCXNavigation): The parsed EPUB 2 NCX document (None until loaded).
	    _nav (EPUBNavDocNavigation): The parsed EPUB 3 navigation document (None until loaded).
	"""

	CONTAINER_FILE_PATH = 'META-INF/container.xml'

	def __init__(self, path: Union[str, Path]) -> None:
		"""
		Initialize the Document from a given path.

		Args:
		    path (str | Path): The path to the EPUB file.

		Raises:
		    InvalidEPUBError: If the file does not exist or is not a ZIP archive.
		"""
		self.path: Path = Path(path)

		if not self.path.exists():
			raise InvalidEPUBError(
				f'EPUB file does not exist: {self.path}',
				suggestions=[
					'Check that the file path is correct',
					'Verify the file has not been moved or deleted',
				],
				file_path=str(self.path),
			)

		if not zipfile.is_zipfile(self.path):
			raise InvalidEPUBError(
				f'File is not a valid ZIP archive: {self.path}',
				suggestions=[
					'Ensure the file is a valid EPUB (which is a ZIP archive)',
					'Check that the file is not corrupted',
					'Verify the file extension is .epub',
				],
				file_path=str(self.path),
			)

		self._container: Container = None
		self._package: Package = None

		self._toc: Navigation = None
		self._ncx: NCXNavigation = None
		self._nav: EPUBNavDocNavigation = None

	def _read_file_from_epub(self, file_path: str) -> str:
		"""
		Read and decode a file from the EPUB archive.

		Archive member names and the requested path are both normalised with
		``os.path.normpath`` before comparison, so paths such as
		``OEBPS/../OEBPS/a.xhtml`` still resolve.

		Args:
		    file_path (str): Path to the file within the EPUB archive.

		Returns:
		    str: Contents of the file decoded as UTF-8.

		Raises:
		    EPUBFileNotFoundError: If the file is missing from the EPUB archive.
		    InvalidEPUBError: If the file cannot be decoded as UTF-8.
		"""
		with zipfile.ZipFile(self.path, 'r') as epub_zip:
			norm_namelist = {os.path.normpath(name): name for name in epub_zip.namelist()}
			norm_path = os.path.normpath(file_path)

			if norm_path not in norm_namelist:
				available_files = sorted(norm_namelist.keys())[:10]  # Show first 10 files
				suggestions = [
					'Check that the file path is correct',
					'Verify the EPUB file structure is complete',
				]
				if available_files:
					file_list = ', '.join(available_files)
					if len(norm_namelist) > 10:
						file_list += f' (and {len(norm_namelist) - 10} more)'
					suggestions.append(f'Available files include: {file_list}')

				raise EPUBFileNotFoundError(
					file_path, epub_path=str(self.path), suggestions=suggestions
				)

			try:
				return epub_zip.read(norm_namelist[norm_path]).decode('utf-8')
			except UnicodeDecodeError as e:
				raise InvalidEPUBError(
					f"Cannot decode file '{file_path}' as UTF-8",
					suggestions=[
						'Check that the file contains valid UTF-8 text',
						'Verify the EPUB file is not corrupted',
						'Ensure the file is a text-based format (XML, HTML, etc.)',
					],
					file_path=str(self.path),
				) from e

	@property
	def container(self) -> Container:
		"""The parsed ``META-INF/container.xml``, loaded on first access."""
		if self._container is None:
			container_xml_content = self._read_file_from_epub(self.CONTAINER_FILE_PATH)
			self._container = Container(container_xml_content)
		return self._container

	@property
	def package(self) -> Package:
		"""The parsed package document named by the container, loaded on first access."""
		if self._package is None:
			package_xml_content = self._read_file_from_epub(self.container.rootfile_path)
			self._package = Package(package_xml_content)
		return self._package

	@cached_property
	def package_href(self):
		"""Directory part of the package document path; manifest hrefs are joined onto it."""
		return os.path.dirname(self.container.rootfile_path)

	@property
	def toc(self) -> Optional[Navigation]:
		"""
		The preferred navigation document.

		Returns the EPUB 3 navigation document when there is one, otherwise the
		NCX document, otherwise None.
		"""
		if self._toc is None:
			if self.nav is not None:
				# Default to newer EPUB3 Navigation Document when available
				self._toc = self.nav
			elif self.ncx is not None:
				self._toc = self.ncx

		return self._toc

	@property
	def ncx(self) -> Optional[NCXNavigation]:
		"""Access the Navigation Control eXtended (EPUB 2); None if the package has no toc_href."""
		if self._ncx is None:
			package = self.package

			if not package.toc_href:
				return None

			toc_href = package.toc_href
			toc_path = os.path.join(self.package_href, toc_href)
			toc_xml_content = self._read_file_from_epub(toc_path)

			self._ncx = NCXNavigation(toc_xml_content)

		return self._ncx

	@property
	def nav(self) -> Optional[EPUBNavDocNavigation]:
		"""Access the Navigation Document (EPUB 3); None if the package has no nav_href."""
		if self._nav is None:
			package = self.package

			if not package.nav_href:
				return None

			nav_href = package.nav_href
			nav_path = os.path.join(self.package_href, nav_href)
			nav_xml_content = self._read_file_from_epub(nav_path)

			self._nav = EPUBNavDocNavigation(nav_xml_content)

		return self._nav

	def find_content_by_id(self, item_id: str) -> XHTMLContent:
		"""
		Find and return content by its manifest item ID.

		The ID must be referenced from the spine and declared in the manifest.

		Args:
		    item_id: The ID of the item in the manifest.

		Returns:
		    XHTMLContent: The content object for the specified item.

		Raises:
		    EPUBFileNotFoundError: If the item ID is not found in spine or manifest,
		        or the referenced file is missing from the archive.
		"""
		spine_item = self.package.spine.find_by_idref(item_id)
		if not spine_item:
			spine_ids = [
				item.get('idref') for item in self.package.spine.itemrefs if item.get('idref')
			]
			suggestions = [
				'Check that the item ID is correct',
				'Verify the item is included in the spine',
			]
			if spine_ids:
				available_ids = ', '.join(spine_ids[:5])
				if len(spine_ids) > 5:
					available_ids += f' (and {len(spine_ids) - 5} more)'
				suggestions.append(f'Available spine IDs: {available_ids}')

			raise EPUBFileNotFoundError(
				f"spine item '{item_id}'", epub_path=str(self.path), suggestions=suggestions
			)

		manifest_item = self.package.manifest.find_by_id(item_id)
		if not manifest_item:
			manifest_ids = [
				item.get('id') for item in self.package.manifest.items if item.get('id')
			]
			suggestions = [
				'Check that the item ID is correct',
				'Verify the item is declared in the manifest',
			]
			if manifest_ids:
				available_ids = ', '.join(manifest_ids[:5])
				if len(manifest_ids) > 5:
					available_ids += f' (and {len(manifest_ids) - 5} more)'
				suggestions.append(f'Available manifest IDs: {available_ids}')

			raise EPUBFileNotFoundError(
				f"manifest item '{item_id}'", epub_path=str(self.path), suggestions=suggestions
			)

		content_path = os.path.join(self.package_href, manifest_item['href'])
		xml_content = self._read_file_from_epub(content_path)

		content = XHTMLContent(xml_content, manifest_item['media_type'], manifest_item['href'])

		return content

	def find_pub_resource_by_id(self, item_id: str) -> XHTMLContent:
		"""
		Find and return a publication resource by its manifest item ID.

		Unlike ``find_content_by_id`` the item does not need to be in the spine.
		The resource is wrapped in ``XHTMLContent`` using the media type declared
		in the manifest, so the wrapper's own media-type check applies.

		Args:
		    item_id: The ID of the item in the manifest.

		Returns:
		    XHTMLContent: The content object for the specified item.

		Raises:
		    EPUBFileNotFoundError: If the item ID is not found in manifest,
		        or the referenced file is missing from the archive.
		"""
		manifest_item = self.package.manifest.find_by_id(item_id)
		if not manifest_item:
			manifest_ids = [
				item.get('id') for item in self.package.manifest.items if item.get('id')
			]
			suggestions = [
				'Check that the item ID is correct',
				'Verify the item is declared in the manifest',
			]
			if manifest_ids:
				available_ids = ', '.join(manifest_ids[:5])
				if len(manifest_ids) > 5:
					available_ids += f' (and {len(manifest_ids) - 5} more)'
				suggestions.append(f'Available manifest IDs: {available_ids}')

			raise EPUBFileNotFoundError(
				f"manifest item '{item_id}'", epub_path=str(self.path), suggestions=suggestions
			)

		content_path = os.path.join(self.package_href, manifest_item['href'])
		xml_content = self._read_file_from_epub(content_path)

		content = XHTMLContent(xml_content, manifest_item['media_type'], manifest_item['href'])

		return content

	def list_files(self) -> List[Dict[str, str]]:
		"""
		List all entries in the EPUB archive, in archive order.

		Returns:
		    List[Dict[str, str]]: One dictionary per archive entry with the keys
		        'filename', 'file_size', 'compress_size', 'file_mode' and
		        'last_modified' (a ``datetime``).
		"""
		with zipfile.ZipFile(self.path, 'r') as epub_zip:
			file_list = []
			for zip_info in epub_zip.infolist():
				file_info = {
					'filename': zip_info.filename,
					'file_size': zip_info.file_size,
					'compress_size': zip_info.compress_size,
					# The upper 16 bits of external_attr are exposed as the file mode.
					'file_mode': zip_info.external_attr >> 16,
					'last_modified': datetime(*zip_info.date_time),
				}
				file_list.append(file_info)
			return file_list

	def get_files_info(self) -> List[Dict[str, Union[str, int]]]:
		"""
		Get information about all files in the EPUB archive.

		Entries whose name ends with '/' (directories) are skipped and the
		result is sorted by path.

		Returns:
		    List[Dict]: A list of dictionaries containing file information.
		        Each dictionary contains: 'path', 'size', 'compressed_size', 'modified'
		        ('modified' is formatted as 'YYYY-MM-DD HH:MM:SS').
		"""
		files_info = []

		with zipfile.ZipFile(self.path, 'r') as epub_zip:
			for zip_info in epub_zip.infolist():
				if zip_info.filename.endswith('/'):
					continue

				modified_time = datetime(*zip_info.date_time).strftime('%Y-%m-%d %H:%M:%S')

				file_info = {
					'path': zip_info.filename,
					'size': zip_info.file_size,
					'compressed_size': zip_info.compress_size,
					'modified': modified_time,
				}
				files_info.append(file_info)

		files_info.sort(key=lambda x: x['path'])
		return files_info

	def get_file_by_path(self, file_path: str):
		"""
		Retrieve a file from the EPUB archive by its path.

		For files ending in .xhtml/.html/.htm the manifest is consulted for the
		declared media type. The lookup is best-effort: if the package cannot be
		read, the file is not listed, or the declared type is not one that
		``XHTMLContent`` accepts, 'application/xhtml+xml' is used.

		Args:
		    file_path (str): Path to the file within the EPUB archive.

		Returns:
		    XHTMLContent or str: For XHTML files, returns XHTMLContent object.
		                        For other files, returns raw content as string.

		Raises:
		    EPUBFileNotFoundError: If the file is missing from the EPUB archive
		        (this exception is also a ``ValueError``).
		"""
		file_content = self._read_file_from_epub(file_path)

		if not file_path.lower().endswith(('.xhtml', '.html', '.htm')):
			return file_content

		default_media_type = 'application/xhtml+xml'
		media_type = default_media_type

		try:
			wanted_path = os.path.normpath(file_path)
			for item in self.package.manifest.items:
				manifest_path = os.path.join(self.package_href, item['href'])
				if os.path.normpath(manifest_path) == wanted_path:
					declared = item.get('media_type', default_media_type)
					# Only adopt a type XHTMLContent can handle; anything else
					# keeps the default so the file can still be returned.
					if declared in XHTMLContent.MEDIA_TYPES:
						media_type = declared
					break
		except Exception:
			# Manifest lookup is optional: any failure while reading the
			# package leaves the default media type in place.
			pass

		return XHTMLContent(file_content, media_type, file_path)


================================================
FILE: epub_utils/exceptions.py
================================================
"""
Global epub-utils exception classes.

This module defines custom exceptions for the epub-utils library that provide
more descriptive error messages to help users understand what went wrong and
how to fix it.
"""


class EPUBError(Exception):
	"""Root of the epub-utils exception hierarchy."""

	def __init__(self, message: str, suggestions: list = None, file_path: str = None):
		"""
		Build the error.

		Args:
			message: Description of what went wrong
			suggestions: Optional hints on how to fix the problem; a falsy value
				is stored as an empty list
			file_path: Optional path of the file the error relates to
		"""
		super().__init__(message)
		self.file_path = file_path
		self.suggestions = suggestions if suggestions else []

	def __str__(self):
		"""
		Render the message, then the file line and the bulleted suggestions
		when they are present, one item per line.
		"""
		rendered = [super().__str__()]

		if self.file_path:
			rendered.append(f'File: {self.file_path}')

		if self.suggestions:
			rendered.append('Suggestions:')
			rendered.extend(f'  • {hint}' for hint in self.suggestions)

		return '\n'.join(rendered)


class ParseError(EPUBError, ValueError):
	"""Raised when EPUB content is malformed and cannot be parsed."""

	def __init__(
		self,
		message: str,
		element_name: str = None,
		line_number: int = None,
		suggestions: list = None,
		file_path: str = None,
	):
		"""
		Build the error, decorating the message with the optional context.

		Args:
			message: The error message
			element_name: XML element being parsed when the error occurred;
				when truthy it is prefixed to the message
			line_number: Line where the error occurred; when truthy it is
				appended to the message
			suggestions: Hints for fixing the error; generic parsing hints are
				used when none are given
			file_path: Path to the file with the parsing error
		"""
		if element_name:
			message = 'Error parsing {}: {}'.format(element_name, message)
		if line_number:
			message = '{} (line {})'.format(message, line_number)

		suggestions = suggestions or [
			'Verify the EPUB file is not corrupted',
			'Check that the XML is well-formed',
			'Ensure all required elements are present',
		]

		super().__init__(message, suggestions, file_path)


class InvalidEPUBError(EPUBError, ValueError):
	"""Raised when the structure or content of an EPUB file is invalid."""

	def __init__(
		self,
		message: str,
		missing_files: list = None,
		suggestions: list = None,
		file_path: str = None,
	):
		"""
		Build the error, listing any missing files in the message.

		Args:
			message: The error message
			missing_files: Names of required files that are missing; when
				non-empty they are appended to the message, comma separated
			suggestions: Hints for fixing the error; generic EPUB hints are
				used when none are given
			file_path: Path to the invalid EPUB file
		"""
		if missing_files:
			joined_names = ', '.join(missing_files)
			message = '{}. Missing required files: {}'.format(message, joined_names)

		suggestions = suggestions or [
			'Verify the file is a valid EPUB archive',
			'Check that all required EPUB files are present',
			'Ensure the EPUB was created with a compliant tool',
		]

		super().__init__(message, suggestions, file_path)


class UnsupportedFormatError(EPUBError, ValueError):
	"""Raised when an operation is not supported for the EPUB version/format."""

	def __init__(
		self,
		message: str,
		epub_version: str = None,
		required_version: str = None,
		suggestions: list = None,
		file_path: str = None,
	):
		"""
		Build the error, appending version details to the message.

		Args:
			message: The error message
			epub_version: The version of the EPUB file; the message is only
				extended when this is given
			required_version: The minimum required version for the operation
			suggestions: Hints for fixing the error; when none are given,
				generic hints are used, led by a conversion hint if
				required_version is set
			file_path: Path to the EPUB file
		"""
		if epub_version:
			if required_version:
				detail = f'(EPUB {epub_version} detected, requires EPUB {required_version})'
			else:
				detail = f'(EPUB {epub_version} format)'
			message = f'{message} {detail}'

		if not suggestions:
			conversion_hint = (
				[f'Convert the EPUB to version {required_version} or higher']
				if required_version
				else []
			)
			suggestions = conversion_hint + [
				'Try using an EPUB file with a compatible version',
				'Check the EPUB specification for version requirements',
			]

		super().__init__(message, suggestions, file_path)


class NotImplementedError(EPUBError):
	"""Raised when functionality that is not implemented yet is requested."""

	def __init__(
		self,
		message: str,
		feature_name: str = None,
		suggestions: list = None,
		file_path: str = None,
	):
		"""
		Build the error, naming the feature in the message when it is known.

		Args:
			message: The error message
			feature_name: Name of the unimplemented feature; when truthy it is
				prefixed to the message
			suggestions: Hints for fixing the error; generic hints are used
				when none are given
			file_path: Path to the file (if applicable)
		"""
		if feature_name:
			message = "Feature '{}' is not yet implemented: {}".format(feature_name, message)

		suggestions = suggestions or [
			'Check the documentation for supported features',
			'Consider contributing this feature to the project',
			'Use an alternative approach if available',
		]

		super().__init__(message, suggestions, file_path)


class FileNotFoundError(EPUBError, ValueError):
	"""Raised when a required file cannot be found inside the EPUB archive."""

	def __init__(self, file_path: str, epub_path: str = None, suggestions: list = None):
		"""
		Build the error; the message is derived from the missing path.

		Args:
			file_path: Path to the missing file within the EPUB
			epub_path: Path to the EPUB file; this is what ends up in the
				error's ``file_path`` attribute
			suggestions: Hints for fixing the error; generic hints are used
				when none are given
		"""
		fallback_hints = [
			'Verify the file path is correct',
			'Check that the EPUB file is complete and not corrupted',
			'Ensure the file was included when the EPUB was created',
		]

		super().__init__(
			"Missing '{}' in EPUB archive".format(file_path),
			suggestions if suggestions else fallback_hints,
			epub_path,
		)


class ValidationError(EPUBError, ValueError):
	"""Raised when EPUB content does not pass validation."""

	def __init__(
		self,
		message: str,
		validation_errors: list = None,
		suggestions: list = None,
		file_path: str = None,
	):
		"""
		Build the error, appending the individual validation errors as bullets.

		Args:
			message: The error message
			validation_errors: Specific validation errors; when given they are
				listed below the message, one bullet per line
			suggestions: Hints for fixing the error; generic hints are used
				when none are given
			file_path: Path to the file with validation errors
		"""
		if validation_errors:
			bullets = '\n'.join(map('  • {}'.format, validation_errors))
			message = '\n'.join((f'{message}', 'Validation errors:', bullets))

		suggestions = suggestions or [
			'Fix the validation errors listed above',
			'Use an EPUB validator to check for additional issues',
			'Consult the EPUB specification for requirements',
		]

		super().__init__(message, suggestions, file_path)


================================================
FILE: epub_utils/navigation/__init__.py
================================================
"""EPUB Navigation module."""

from .base import Navigation, NavigationItem
from .nav import EPUBNavDocNavigation
from .ncx import NCXNavigation

__all__ = [
	'Navigation',
	'NavigationItem',
	'NCXNavigation',
	'EPUBNavDocNavigation',
]


================================================
FILE: epub_utils/navigation/base.py
================================================
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class NavigationItem:
	"""Format-independent description of a single navigation entry."""

	id: str
	label: str
	target: str  # href/src
	order: Optional[int] = None
	level: int = 0
	item_type: Optional[str] = None  # semantic type
	children: List['NavigationItem'] = field(default_factory=list)

	def to_dict(self) -> Dict[str, Any]:
		"""Serialise this item, and recursively its children, to plain dictionaries.

		Returns:
			Dictionary with the keys 'id', 'label', 'target', 'order', 'level',
			'type' (taken from ``item_type``) and 'children' (a list of nested
			dictionaries of the same shape).
		"""
		nested = []
		for child in self.children:
			nested.append(child.to_dict())

		return {
			'id': self.id,
			'label': self.label,
			'target': self.target,
			'order': self.order,
			'level': self.level,
			'type': self.item_type,
			'children': nested,
		}


class Navigation(ABC):
	"""
	Abstract interface implemented by the navigation document classes.

	Subclasses supply the three item collections (table of contents, page list,
	landmarks), the editing operations and the output methods; the query and
	dictionary helpers defined here are built on top of those collections.

	Attributes:
	    media_type (str): The MIME type of the content.
	    href (str): The path to the content file within the EPUB.
	"""

	def __init__(self, media_type: str, href: str) -> None:
		"""Remember the media type and href supplied by the caller."""
		self.media_type, self.href = media_type, href

	# === Core Abstract Methods ===
	@abstractmethod
	def get_toc_items(self) -> List[NavigationItem]:
		"""Get table of contents as normalized items."""

	@abstractmethod
	def get_page_list(self) -> List[NavigationItem]:
		"""Get page list/breaks as normalized items."""

	@abstractmethod
	def get_landmarks(self) -> List[NavigationItem]:
		"""Get landmarks/guide references as normalized items."""

	# === Editing Interface ===
	@abstractmethod
	def add_toc_item(self, item: NavigationItem, after_id: Optional[str] = None) -> None:
		"""Add item to table of contents."""

	@abstractmethod
	def remove_toc_item(self, item_id: str) -> bool:
		"""Remove item from table of contents by ID."""

	@abstractmethod
	def update_toc_item(self, item_id: str, **kwargs) -> bool:
		"""Update existing TOC item properties."""

	@abstractmethod
	def reorder_toc_items(self, new_order: List[str]) -> None:
		"""Reorder TOC items by list of IDs."""

	# === Query Interface ===
	def find_item_by_id(self, item_id: str) -> Optional[NavigationItem]:
		"""Return the first item from ``get_all_items`` whose id equals ``item_id``, else None.

		Only the items returned by ``get_all_items`` are examined; their
		``children`` are not descended into.
		"""
		matches = (entry for entry in self.get_all_items() if entry.id == item_id)
		return next(matches, None)

	def find_items_by_target(self, target: str) -> List[NavigationItem]:
		"""Return every item from ``get_all_items`` whose target equals ``target``."""
		hits = []
		for entry in self.get_all_items():
			if entry.target == target:
				hits.append(entry)
		return hits

	def get_all_items(self) -> List[NavigationItem]:
		"""Concatenate the TOC items, the page list and the landmarks, in that order."""
		return [
			*self.get_toc_items(),
			*self.get_page_list(),
			*self.get_landmarks(),
		]

	def get_toc_items_as_dicts(self) -> List[Dict[str, Any]]:
		"""Get TOC items as list of dictionaries with recursive children conversion.

		Returns:
			One dictionary per TOC item, produced by the item's ``to_dict``.
		"""
		toc_entries = self.get_toc_items()
		return [entry.to_dict() for entry in toc_entries]

	def get_page_list_as_dicts(self) -> List[Dict[str, Any]]:
		"""Get page list items as list of dictionaries.

		Returns:
			One dictionary per page-list item, produced by the item's ``to_dict``.
		"""
		page_entries = self.get_page_list()
		return [entry.to_dict() for entry in page_entries]

	def get_landmarks_as_dicts(self) -> List[Dict[str, Any]]:
		"""Get landmarks as list of dictionaries.

		Returns:
			One dictionary per landmark, produced by the item's ``to_dict``.
		"""
		landmark_entries = self.get_landmarks()
		return [entry.to_dict() for entry in landmark_entries]

	# === Format-specific Access ===
	@property
	@abstractmethod
	def tree(self):
		"""Get underlying XML/DOM tree for format-specific operations."""

	# === Output Methods ===
	@abstractmethod
	def to_str(self, *args, **kwargs) -> str:
		"""Return a string rendering of the document; defined by subclasses."""

	@abstractmethod
	def to_xml(self, *args, **kwargs) -> str:
		"""Return an XML rendering of the document; defined by subclasses."""

	@abstractmethod
	def to_plain(self) -> str:
		"""Return a plain-text rendering of the document; defined by subclasses."""


================================================
FILE: epub_utils/navigation/nav/__init__.py
================================================
import re
from typing import List, Optional

from lxml import etree

from epub_utils.exceptions import ParseError, UnsupportedFormatError
from epub_utils.navigation.base import Navigation, NavigationItem
from epub_utils.printers import XMLPrinter

from .dom import NavDocument, NavListItem


class EPUBNavDocNavigation(Navigation):
	"""EPUB 3 Navigation Document implementation.

	Parses an XHTML navigation document and exposes its ``toc``,
	``page-list`` and ``landmarks`` ``nav`` sections through the
	``Navigation`` interface, together with basic editing of the TOC.
	"""

	MEDIA_TYPES = ['application/xhtml+xml']

	# Namespace bindings shared by the XPath queries issued from this class.
	_XHTML_NS = {'xhtml': 'http://www.w3.org/1999/xhtml'}

	def __init__(
		self,
		xml_content: str,
		media_type: str = 'application/xhtml+xml',
		href: Optional[str] = None,
	) -> None:
		"""Validate the media type and parse the navigation document.

		Args:
			xml_content: Source text of the navigation document.
			media_type: Declared media type; must be one of ``MEDIA_TYPES``.
			href: Location of the document, forwarded to the base class.

		Raises:
			UnsupportedFormatError: If ``media_type`` is not supported.
			ParseError: If ``xml_content`` is not well-formed XML.
		"""
		self.xml_content = xml_content

		self._tree = None

		self.xmlns = None
		self.lang = None

		if media_type not in self.MEDIA_TYPES:
			raise UnsupportedFormatError(
				f"Media type '{media_type}' is not supported for EPUB Navigation Document",
				suggestions=[
					f'Use one of the supported media types: {", ".join(self.MEDIA_TYPES)}',
					'Check that this is an EPUB 3 Navigation Document',
					'Verify the manifest declares the correct media type',
				],
			)
		super().__init__(media_type, href)

		self._parse(xml_content)

		self._printer = XMLPrinter(self)

	def __str__(self) -> str:
		"""Return the original, unmodified source text."""
		return self.xml_content

	def to_str(self, *args, **kwargs) -> str:
		"""Render the document through the XML printer's ``to_str``."""
		return self._printer.to_str(*args, **kwargs)

	def to_xml(self, *args, **kwargs) -> str:
		"""Render the document through the XML printer's ``to_xml``."""
		return self._printer.to_xml(*args, **kwargs)

	def to_plain(self) -> str:
		"""Return the whitespace-normalized text content (see ``inner_text``)."""
		return self.inner_text

	def _parse(self, xml_content: str) -> None:
		"""Parse ``xml_content`` and cache the tree, default namespace and language.

		Raises:
			ParseError: If lxml rejects the content as malformed XML.
		"""
		try:
			# Parse from bytes so that an XML encoding declaration in the text
			# is accepted by lxml.
			self._tree = etree.fromstring(xml_content.encode('utf-8'))

			root = self._tree

			self.xmlns = root.nsmap.get(None, '') if root.nsmap else ''
			self.lang = root.get('{http://www.w3.org/XML/1998/namespace}lang', '')

		except etree.ParseError as e:
			raise ParseError(
				f'Invalid XML in EPUB Navigation Document: {str(e)}',
				suggestions=[
					'Check that the navigation document contains valid XHTML',
					'Verify the file is not corrupted',
					'Ensure all XML tags are properly closed',
					'Check for invalid characters in the XML',
				],
			) from e

	@property
	def tree(self):
		"""Return the parsed XHTML tree, parsing ``xml_content`` if no tree is cached."""
		if self._tree is None:
			self._parse(self.xml_content)
		return self._tree

	@property
	def inner_text(self) -> str:
		"""Text content of the ``body`` element (or of the whole tree when
		there is no body), with every run of whitespace collapsed to a single
		space and the ends stripped."""
		tree = self.tree

		# local-name() matches the element regardless of its namespace, so no
		# namespace bindings are needed for this query.
		body_elements = tree.xpath('//*[local-name()="body"]')

		source = body_elements[0] if body_elements else tree
		inner_text = ''.join(source.itertext())

		# Normalize whitespace
		return re.sub(r'\s+', ' ', inner_text).strip()

	# === Navigation Interface Implementation ===

	def _section_list_items(self, section_attr: str) -> List[NavListItem]:
		"""Return the top-level ``li`` wrappers of one nav section.

		``section_attr`` names a ``NavDocument`` property (for example
		``'toc_nav'``). An empty list is returned when the section or its
		ordered list is missing.
		"""
		section = getattr(NavDocument(self.tree), section_attr)
		if not section:
			return []

		ordered_list = section.ordered_list
		if not ordered_list:
			return []

		return ordered_list.list_items

	def get_toc_items(self) -> List[NavigationItem]:
		"""Get table of contents as normalized items (nested entries become children)."""
		return self._convert_list_items_recursive(self._section_list_items('toc_nav'), level=0)

	def get_page_list(self) -> List[NavigationItem]:
		"""Get page list/breaks as normalized items."""
		return self._convert_list_items_to_pages(self._section_list_items('page_list_nav'))

	def get_landmarks(self) -> List[NavigationItem]:
		"""Get landmarks/guide references as normalized items."""
		return self._convert_list_items_to_landmarks(self._section_list_items('landmarks_nav'))

	# === Editing Interface ===

	def add_toc_item(self, item: NavigationItem, after_id: Optional[str] = None) -> None:
		"""Append ``item`` to the end of the top-level table of contents.

		The ``toc`` nav section (with a "Table of Contents" heading) and its
		ordered list are created when they do not exist yet.

		Args:
			item: Item to add. Its ``id``, ``label``, ``target`` and
				``item_type`` are written; its children are not.
			after_id: Accepted for interface compatibility; positioning is
				not implemented, the item is always appended.
		"""
		nav_doc = NavDocument(self.tree)
		toc_nav = nav_doc.toc_nav

		if not toc_nav:
			# Create TOC nav if it doesn't exist
			toc_nav = nav_doc.add_nav_section('toc')
			toc_nav.add_heading(1, 'Table of Contents')
			ordered_list = toc_nav.add_ordered_list()
		else:
			ordered_list = toc_nav.ordered_list or toc_nav.add_ordered_list()

		# Create new list item
		new_li = ordered_list.add_list_item()
		if item.id:
			new_li.id = item.id

		# Add anchor or span based on whether target is provided
		if item.target:
			anchor = new_li.add_anchor(item.target, item.label)
			if item.item_type:
				anchor.epub_type = item.item_type
		else:
			# The id is carried by the <li> only: repeating it on the <span>
			# would put two identical IDs into the document. Readers of the
			# item fall back to the <li> id when the span has none.
			new_li.add_span(item.label)

		# TODO: Handle after_id positioning and children

	def _find_in_toc(self, toc_element, expression: str, item_id: str) -> list:
		"""Run ``expression`` relative to the TOC nav element.

		The id is bound as the XPath variable ``$item_id`` instead of being
		pasted into the expression, so quotes or XPath syntax inside the id
		cannot break or alter the query.
		"""
		return toc_element.xpath(expression, namespaces=self._XHTML_NS, item_id=str(item_id))

	def remove_toc_item(self, item_id: str) -> bool:
		"""Remove item from table of contents by ID.

		The id is looked up on ``li`` elements first and, failing that, on
		``a`` elements (whose parent element is then removed). Only the
		``toc`` nav section is searched.

		Returns:
			``True`` if at least one element matched, ``False`` otherwise.
		"""
		toc_nav = NavDocument(self.tree).toc_nav
		if not toc_nav:
			return False

		items_to_remove = self._find_in_toc(
			toc_nav.element, './/xhtml:li[@id=$item_id]', item_id
		)

		# Also check for anchors with the ID
		if not items_to_remove:
			anchors = self._find_in_toc(toc_nav.element, './/xhtml:a[@id=$item_id]', item_id)
			# Remove the parent li element if found
			items_to_remove = [
				anchor.getparent() for anchor in anchors if anchor.getparent() is not None
			]

		if not items_to_remove:
			return False

		for element in items_to_remove:
			parent = element.getparent()
			if parent is not None:
				parent.remove(element)
		return True

	def update_toc_item(self, item_id: str, **kwargs) -> bool:
		"""Update existing TOC item properties.

		Args:
			item_id: Id carried by the item's ``li`` or ``a`` element inside
				the ``toc`` nav section.
			**kwargs: Any of ``label`` (new text of the anchor, or of the span
				when the item has no anchor), ``target`` (new ``href``) and
				``item_type`` (new ``epub:type``). ``target`` and ``item_type``
				are ignored for items without an anchor.

		Returns:
			``True`` if the item was found, ``False`` otherwise.
		"""
		toc_nav = NavDocument(self.tree).toc_nav
		if not toc_nav:
			return False

		# Find the item by ID (could be on li or a element)
		target_items = self._find_in_toc(
			toc_nav.element,
			'.//xhtml:li[@id=$item_id] | .//xhtml:a[@id=$item_id]',
			item_id,
		)

		if not target_items:
			return False

		target_element = target_items[0]

		# If we found an anchor, work with it; if we found a li, find its anchor
		if target_element.tag.endswith('}a'):
			anchor_element = target_element
			li_element = target_element.getparent()
		else:
			li_element = target_element
			anchors = li_element.xpath('./xhtml:a', namespaces=self._XHTML_NS)
			anchor_element = anchors[0] if anchors else None

		# Update properties
		if 'label' in kwargs:
			if anchor_element is not None:
				anchor_element.text = kwargs['label']
			else:
				# Unlinked entries keep their label in a span
				spans = li_element.xpath('./xhtml:span', namespaces=self._XHTML_NS)
				if spans:
					spans[0].text = kwargs['label']

		if anchor_element is not None:
			if 'target' in kwargs:
				anchor_element.set('href', kwargs['target'])

			if 'item_type' in kwargs:
				anchor_element.set('{http://www.idpf.org/2007/ops}type', kwargs['item_type'])

		return True

	def reorder_toc_items(self, new_order: List[str]) -> None:
		"""Reorder top-level TOC items by list of IDs.

		Each listed item is moved to the end of its list, in the order given.
		Items whose id is not listed therefore stay in front, in their
		original relative order. Unknown ids are ignored, and nested items
		are not considered.
		"""
		toc_nav = NavDocument(self.tree).toc_nav
		if not toc_nav:
			return

		ordered_list = toc_nav.ordered_list
		if not ordered_list:
			return

		# Collect all items with their IDs (id of the li, else of its anchor)
		items_map = {}
		for li_item in ordered_list.list_items:
			if li_item.id:
				items_map[li_item.id] = li_item.element
			elif li_item.anchor and li_item.anchor.id:
				items_map[li_item.anchor.id] = li_item.element

		# Reorder by removing and re-adding in new order
		for item_id in new_order:
			element = items_map.get(item_id)
			if element is None:
				continue
			parent = element.getparent()
			if parent is not None:
				parent.remove(element)
				parent.append(element)

	# === Helper Methods ===

	def _convert_list_items_recursive(
		self, list_items: List[NavListItem], level: int = 0
	) -> List[NavigationItem]:
		"""Convert navigation list items to NavigationItems recursively.

		Linked entries take id, label, target and type from their anchor;
		unlinked entries take their label from the span and get an empty
		target. Entries with neither are skipped. ``order`` is the 1-based
		position within ``list_items`` and ``level`` the nesting depth.
		"""
		items = []

		for position, list_item in enumerate(list_items, start=1):
			anchor = list_item.anchor
			span = list_item.span

			if anchor:
				item = NavigationItem(
					id=anchor.id or list_item.id or '',
					label=anchor.text,
					target=anchor.href or '',
					order=position,
					level=level,
					item_type=anchor.epub_type,
				)
			elif span:
				item = NavigationItem(
					id=span.id or list_item.id or '',
					label=span.element.text or '',
					target='',
					order=position,
					level=level,
					item_type=None,
				)
			else:
				# Fallback for items without anchor or span
				continue

			# Convert nested items
			nested_list = list_item.nested_list
			if nested_list:
				item.children = self._convert_list_items_recursive(
					nested_list.list_items, level + 1
				)

			items.append(item)

		return items

	def _convert_flat_list_items(
		self, list_items: List[NavListItem], default_type: str
	) -> List[NavigationItem]:
		"""Convert linked list items to level-0 NavigationItems.

		Entries without an anchor are skipped. ``default_type`` is used as
		``item_type`` when the anchor carries no ``epub:type``.
		"""
		items = []

		for position, list_item in enumerate(list_items, start=1):
			anchor = list_item.anchor
			if not anchor:
				continue

			items.append(
				NavigationItem(
					id=anchor.id or list_item.id or '',
					label=anchor.text,
					target=anchor.href or '',
					order=position,
					level=0,
					item_type=anchor.epub_type or default_type,
				)
			)

		return items

	def _convert_list_items_to_pages(self, list_items: List[NavListItem]) -> List[NavigationItem]:
		"""Convert navigation list items to page NavigationItems (default type ``'page'``)."""
		return self._convert_flat_list_items(list_items, 'page')

	def _convert_list_items_to_landmarks(
		self, list_items: List[NavListItem]
	) -> List[NavigationItem]:
		"""Convert navigation list items to landmark NavigationItems (default type ``'landmark'``)."""
		return self._convert_flat_list_items(list_items, 'landmark')


================================================
FILE: epub_utils/navigation/nav/dom.py
================================================
"""DOM classes for structured access to EPUB 3 Navigation Documents."""

from typing import List, Optional

from lxml import etree


class NavElement:
	"""Thin wrapper around one element of a navigation document.

	The wrapped node is kept in ``element``; every accessor reads from or
	writes to it directly.
	"""

	def __init__(self, element: etree.Element) -> None:
		self.element = element

	@property
	def id(self) -> Optional[str]:
		"""Value of the ``id`` attribute, or ``None`` when it is absent."""
		node = self.element
		return node.get('id')

	@id.setter
	def id(self, value: str) -> None:
		"""Store ``value`` in the ``id`` attribute of the wrapped element."""
		node = self.element
		node.set('id', value)


class NavAnchor(NavElement):
	"""Represents an anchor element (a) in navigation."""

	@property
	def href(self) -> Optional[str]:
		"""Get the href attribute (``None`` when absent)."""
		return self.element.get('href')

	@href.setter
	def href(self, value: str) -> None:
		"""Set the href attribute."""
		self.element.set('href', value)

	@property
	def text(self) -> str:
		"""Get the text content of the anchor.

		The text of all descendants is included, so a label wrapped in inline
		markup (e.g. ``<a><span>Part</span> One</a>``) is returned in full
		rather than being cut at the first child element. Whitespace is
		returned as found; for an anchor holding only plain text the result is
		exactly that text. An anchor without text yields ``''``.
		"""
		return ''.join(self.element.itertext())

	@text.setter
	def text(self, value: str) -> None:
		"""Set the leading text of the anchor; child elements are left untouched."""
		self.element.text = value

	@property
	def epub_type(self) -> Optional[str]:
		"""Get the epub:type attribute (``None`` when absent)."""
		return self.element.get('{http://www.idpf.org/2007/ops}type')

	@epub_type.setter
	def epub_type(self, value: str) -> None:
		"""Set the epub:type attribute."""
		self.element.set('{http://www.idpf.org/2007/ops}type', value)


class NavListItem(NavElement):
	"""Wrapper for one ``li`` entry of a navigation list."""

	def _direct_children(self, tag: str) -> list:
		"""Return the direct XHTML child elements named ``tag``."""
		return self.element.xpath(
			f'./xhtml:{tag}', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'}
		)

	@property
	def anchor(self) -> Optional[NavAnchor]:
		"""First direct ``a`` child wrapped as ``NavAnchor``, or ``None``."""
		found = self._direct_children('a')
		return NavAnchor(found[0]) if found else None

	@property
	def nested_list(self) -> Optional['NavList']:
		"""First direct ``ol`` child wrapped as ``NavList``, or ``None``."""
		found = self._direct_children('ol')
		return NavList(found[0]) if found else None

	@property
	def span(self) -> Optional[NavElement]:
		"""First direct ``span`` child (label of an unlinked entry), or ``None``."""
		found = self._direct_children('span')
		return NavElement(found[0]) if found else None

	def add_anchor(self, href: str, text: str, epub_type: Optional[str] = None) -> NavAnchor:
		"""Append an ``a`` child and return its wrapper.

		``epub:type`` is written only when ``epub_type`` is truthy.
		"""
		link = NavAnchor(etree.SubElement(self.element, '{http://www.w3.org/1999/xhtml}a'))
		link.href = href
		link.text = text
		if epub_type:
			link.epub_type = epub_type
		return link

	def add_span(self, text: str) -> NavElement:
		"""Append a ``span`` child holding ``text`` and return its wrapper."""
		node = etree.SubElement(self.element, '{http://www.w3.org/1999/xhtml}span')
		node.text = text
		return NavElement(node)

	def add_nested_list(self) -> 'NavList':
		"""Append an empty ``ol`` child and return its wrapper."""
		return NavList(etree.SubElement(self.element, '{http://www.w3.org/1999/xhtml}ol'))


class NavList(NavElement):
	"""Wrapper for an ``ol`` element of a navigation document."""

	@property
	def list_items(self) -> List[NavListItem]:
		"""Direct ``li`` children, each wrapped as ``NavListItem``."""
		wrapped = []
		for li_element in self.element.xpath(
			'./xhtml:li', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'}
		):
			wrapped.append(NavListItem(li_element))
		return wrapped

	def add_list_item(self) -> NavListItem:
		"""Append an empty ``li`` child and return its wrapper."""
		return NavListItem(etree.SubElement(self.element, '{http://www.w3.org/1999/xhtml}li'))

	def get_all_items_recursive(self) -> List[NavListItem]:
		"""Return every list item of this list and of all nested lists.

		Items come out depth-first: each item is followed by the items of its
		nested list before the next sibling is visited.
		"""
		flattened = []
		for entry in self.list_items:
			flattened.append(entry)
			sublist = entry.nested_list
			if sublist:
				flattened.extend(sublist.get_all_items_recursive())
		return flattened


class NavSection(NavElement):
	"""Wrapper for a ``nav`` element identified by its ``epub:type``."""

	@property
	def epub_type(self) -> Optional[str]:
		"""Value of the ``epub:type`` attribute, or ``None`` when absent."""
		node = self.element
		return node.get('{http://www.idpf.org/2007/ops}type')

	@epub_type.setter
	def epub_type(self, value: str) -> None:
		"""Store ``value`` in the ``epub:type`` attribute."""
		node = self.element
		node.set('{http://www.idpf.org/2007/ops}type', value)

	@property
	def heading(self) -> Optional[str]:
		"""Text of the section heading, or ``None`` when there is none.

		Direct children are probed from ``h1`` to ``h6``; the first level that
		is present wins, and an empty heading yields ``''``.
		"""
		for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
			found = self.element.xpath(
				f'./xhtml:{tag}', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'}
			)
			if not found:
				continue
			return found[0].text or ''
		return None

	@property
	def ordered_list(self) -> Optional[NavList]:
		"""First direct ``ol`` child wrapped as ``NavList``, or ``None``."""
		found = self.element.xpath(
			'./xhtml:ol', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'}
		)
		return NavList(found[0]) if found else None

	def add_heading(self, level: int, text: str) -> NavElement:
		"""Append an ``h<level>`` child holding ``text`` and return its wrapper.

		Raises:
			ValueError: If ``level`` is outside 1..6.
		"""
		if level < 1 or level > 6:
			raise ValueError('Heading level must be between 1 and 6')

		node = etree.SubElement(self.element, f'{{http://www.w3.org/1999/xhtml}}h{level}')
		node.text = text
		return NavElement(node)

	def add_ordered_list(self) -> NavList:
		"""Append an empty ``ol`` child and return its wrapper."""
		return NavList(etree.SubElement(self.element, '{http://www.w3.org/1999/xhtml}ol'))


class NavDocument(NavElement):
	"""Represents the root html element of a navigation document."""

	def _find_nav(self, epub_type: str) -> Optional[NavSection]:
		"""Return the first ``nav`` whose ``epub:type`` contains ``epub_type``.

		``epub:type`` holds a whitespace-separated list of tokens, so the
		attribute is matched token-wise: ``epub:type="toc"`` and
		``epub:type="toc other"`` both match ``'toc'``, while a token that
		merely contains the name does not. The token is passed as an XPath
		variable rather than being pasted into the expression.
		"""
		navs = self.element.xpath(
			'.//xhtml:nav[contains(concat(" ", normalize-space(@epub:type), " "), $token)]',
			namespaces={
				'xhtml': 'http://www.w3.org/1999/xhtml',
				'epub': 'http://www.idpf.org/2007/ops',
			},
			token=f' {epub_type} ',
		)
		if navs:
			return NavSection(navs[0])
		return None

	@property
	def toc_nav(self) -> Optional[NavSection]:
		"""Get the table of contents nav section (``None`` when absent)."""
		return self._find_nav('toc')

	@property
	def page_list_nav(self) -> Optional[NavSection]:
		"""Get the page list nav section (``None`` when absent)."""
		return self._find_nav('page-list')

	@property
	def landmarks_nav(self) -> Optional[NavSection]:
		"""Get the landmarks nav section (``None`` when absent)."""
		return self._find_nav('landmarks')

	@property
	def all_nav_sections(self) -> List[NavSection]:
		"""Get all nav sections, in document order."""
		navs = self.element.xpath(
			'.//xhtml:nav', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'}
		)
		return [NavSection(nav) for nav in navs]

	@property
	def title(self) -> str:
		"""Get the document title.

		Returns ``''`` both when there is no ``title`` element and when the
		element is empty, so the result is always a string.
		"""
		title_elements = self.element.xpath(
			'.//xhtml:title', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'}
		)
		if not title_elements:
			return ''
		# .text is None for an empty <title/>.
		return title_elements[0].text or ''

	@property
	def body(self) -> Optional[NavElement]:
		"""Get the body element (``None`` when absent)."""
		bodies = self.element.xpath(
			'.//xhtml:body', namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'}
		)
		if bodies:
			return NavElement(bodies[0])
		return None

	def add_nav_section(self, epub_type: str) -> NavSection:
		"""Append a new nav section with the given ``epub:type`` to the body.

		A ``body`` element is created under the root when none exists.
		"""
		body = self.body
		if not body:
			# Create body if it doesn't exist
			body_element = etree.SubElement(self.element, '{http://www.w3.org/1999/xhtml}body')
			body = NavElement(body_element)

		nav_element = etree.SubElement(body.element, '{http://www.w3.org/1999/xhtml}nav')
		nav_section = NavSection(nav_element)
		nav_section.epub_type = epub_type
		return nav_section


================================================
FILE: epub_utils/navigation/ncx/__init__.py
================================================
import re
from typing import List, Optional

from lxml import etree

from epub_utils.exceptions import FileNotFoundError as EPUBFileNotFoundError
from epub_utils.exceptions import ParseError, UnsupportedFormatError
from epub_utils.navigation.base import Navigation, NavigationItem
from epub_utils.printers import XMLPrinter

from .dom import NCXDocument, NCXNavPoint, NCXNavTarget, NCXPageTarget


class NCXNavigation(Navigation):
	"""NCX navigation document implementation.

	Parses an NCX document and exposes its ``navMap``, ``pageList`` and
	``navList`` contents through the ``Navigation`` interface, together with
	basic editing of the ``navMap``.
	"""

	MEDIA_TYPES = ['application/x-dtbncx+xml']

	# Namespace bindings shared by the XPath queries issued from this class.
	_NCX_NS = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}

	def __init__(
		self,
		xml_content: str,
		media_type: str = 'application/x-dtbncx+xml',
		href: Optional[str] = None,
	) -> None:
		"""Validate the media type and parse the NCX document.

		Args:
			xml_content: Source text of the NCX document.
			media_type: Declared media type; must be one of ``MEDIA_TYPES``.
			href: Location of the document, forwarded to the base class.

		Raises:
			UnsupportedFormatError: If ``media_type`` is not supported.
			ParseError: If ``xml_content`` is not well-formed XML.
		"""
		self.xml_content = xml_content

		self._tree = None

		self.xmlns = None
		self.version = None
		self.lang = None

		if media_type not in self.MEDIA_TYPES:
			raise UnsupportedFormatError(
				f"Media type '{media_type}' is not supported for NCX navigation",
				suggestions=[
					f'Use one of the supported media types: {", ".join(self.MEDIA_TYPES)}',
					'Check that this is an NCX navigation file',
					'Verify the manifest declares the correct media type',
				],
			)
		super().__init__(media_type, href)

		self._parse(xml_content)

		self._printer = XMLPrinter(self)

	def __str__(self) -> str:
		"""Return the original, unmodified source text."""
		return self.xml_content

	def to_str(self, *args, **kwargs) -> str:
		"""Render the document through the XML printer's ``to_str``."""
		return self._printer.to_str(*args, **kwargs)

	def to_xml(self, *args, **kwargs) -> str:
		"""Render the document through the XML printer's ``to_xml``."""
		return self._printer.to_xml(*args, **kwargs)

	def to_plain(self) -> str:
		"""Return the whitespace-normalized text content (see ``inner_text``)."""
		return self.inner_text

	def _parse(self, xml_content: str) -> None:
		"""Parse ``xml_content`` and cache the tree, default namespace, version and language.

		Raises:
			ParseError: If lxml rejects the content as malformed XML.
		"""
		try:
			# Parse from bytes so that an XML encoding declaration in the text
			# is accepted by lxml.
			self._tree = etree.fromstring(xml_content.encode('utf-8'))

			root = self._tree

			self.xmlns = root.nsmap.get(None, '') if root.nsmap else ''
			self.version = root.get('version', '')
			self.lang = root.get('{http://www.w3.org/XML/1998/namespace}lang', '')

		except etree.ParseError as e:
			raise ParseError(
				f'Invalid XML in NCX navigation file: {str(e)}',
				suggestions=[
					'Check that the NCX file contains valid XML',
					'Verify the file is not corrupted',
					'Ensure all XML tags are properly closed',
					'Check for invalid characters in the XML',
				],
			) from e

	@property
	def tree(self):
		"""Return the parsed NCX tree, parsing ``xml_content`` if no tree is cached."""
		if self._tree is None:
			self._parse(self.xml_content)
		return self._tree

	@property
	def inner_text(self) -> str:
		"""Text content of the first element named ``body`` (or of the whole
		tree when there is none), with every run of whitespace collapsed to a
		single space and the ends stripped."""
		tree = self.tree

		body_elements = tree.xpath('//*[local-name()="body"]')

		source = body_elements[0] if body_elements else tree
		inner_text = ''.join(source.itertext())

		# Normalize whitespace
		return re.sub(r'\s+', ' ', inner_text).strip()

	# === Navigation Interface Implementation ===

	def get_toc_items(self) -> List[NavigationItem]:
		"""Get table of contents as normalized items (nested navPoints become children)."""
		nav_map = NCXDocument(self.tree).nav_map
		if not nav_map:
			return []

		return self._convert_nav_points_recursive(nav_map.nav_points, level=0)

	def get_page_list(self) -> List[NavigationItem]:
		"""Get page list/breaks as normalized items."""
		page_list = NCXDocument(self.tree).page_list
		if not page_list:
			return []

		return self._convert_page_targets(page_list.page_targets)

	def get_landmarks(self) -> List[NavigationItem]:
		"""Get landmarks/guide references as normalized items.

		The navTargets of every navList are returned in one flat list.
		"""
		return [
			self._convert_nav_target(nav_target)
			for nav_list in NCXDocument(self.tree).nav_lists
			for nav_target in nav_list.nav_targets
		]

	def add_toc_item(self, item: NavigationItem, after_id: Optional[str] = None) -> None:
		"""Append ``item`` to the end of the navMap.

		Args:
			item: Item to add; its ``id``, ``label``, ``target``,
				``item_type`` and ``order`` are written.
			after_id: When given, a navPoint with this id must exist. The id
				is only validated: positioning is not implemented and the
				item is always appended as the last top-level navPoint.

		Raises:
			ParseError: If the document has no navMap.
			EPUBFileNotFoundError: If ``after_id`` is given but no navPoint
				carries that id.
		"""
		nav_map = NCXDocument(self.tree).nav_map
		if not nav_map:
			raise ParseError(
				'NCX document is missing required navMap element',
				element_name='navMap',
				suggestions=[
					'Ensure the NCX file contains a navMap element',
					'Check that the NCX structure follows EPUB specifications',
					'Verify the NCX file was created correctly',
				],
			)

		if after_id:
			all_nav_points = nav_map.get_all_nav_points()

			if not any(nav_point.id == after_id for nav_point in all_nav_points):
				available_ids = [nav_point.id for nav_point in all_nav_points if nav_point.id]
				suggestions = [
					'Check that the navigation item ID is correct',
					'Verify the item exists in the navigation structure',
				]
				if available_ids:
					# Show at most five ids to keep the message readable
					id_list = ', '.join(available_ids[:5])
					if len(available_ids) > 5:
						id_list += f' (and {len(available_ids) - 5} more)'
					suggestions.append(f'Available navigation IDs: {id_list}')

				raise EPUBFileNotFoundError(
					f"Navigation item with ID '{after_id}' not found", suggestions=suggestions
				)

		# Exact placement after ``after_id`` would require tree manipulation;
		# for now the new navPoint always goes to the end of the navMap.
		nav_map.add_nav_point(
			item.id, item.label, item.target, class_attr=item.item_type, play_order=item.order
		)

	def _find_nav_points_by_id(self, nav_map, item_id: str) -> list:
		"""Return the navPoint elements below ``nav_map`` whose id equals ``item_id``.

		The id is bound as the XPath variable ``$item_id`` instead of being
		pasted into the expression, so quotes or XPath syntax inside the id
		cannot break or alter the query.
		"""
		return nav_map.element.xpath(
			'.//ncx:navPoint[@id=$item_id]', namespaces=self._NCX_NS, item_id=str(item_id)
		)

	def remove_toc_item(self, item_id: str) -> bool:
		"""Remove item from table of contents by ID.

		Only the first matching navPoint is removed.

		Returns:
			``True`` if a navPoint was removed, ``False`` otherwise.
		"""
		nav_map = NCXDocument(self.tree).nav_map
		if not nav_map:
			return False

		nav_points = self._find_nav_points_by_id(nav_map, item_id)
		if not nav_points:
			return False

		doomed = nav_points[0]
		doomed.getparent().remove(doomed)
		return True

	def update_toc_item(self, item_id: str, **kwargs) -> bool:
		"""Update existing TOC item properties.

		Args:
			item_id: Id of the navPoint to update.
			**kwargs: Any of ``label`` (navLabel text), ``target`` (content
				``src``), ``order`` (``playOrder``) and ``item_type``
				(``class``). ``label`` and ``target`` are ignored when the
				navPoint lacks the corresponding child element.

		Returns:
			``True`` if the navPoint was found, ``False`` otherwise.
		"""
		nav_map = NCXDocument(self.tree).nav_map
		if not nav_map:
			return False

		nav_points = self._find_nav_points_by_id(nav_map, item_id)
		if not nav_points:
			return False

		nav_point = NCXNavPoint(nav_points[0])

		# Update properties
		if 'label' in kwargs:
			nav_label = nav_point.nav_label
			if nav_label:
				nav_label.text = kwargs['label']

		if 'target' in kwargs:
			content = nav_point.content
			if content:
				content.src = kwargs['target']

		if 'order' in kwargs:
			nav_point.play_order = kwargs['order']

		if 'item_type' in kwargs:
			nav_point.class_attr = kwargs['item_type']

		return True

	def reorder_toc_items(self, new_order: List[str]) -> None:
		"""Reorder TOC items by list of IDs.

		Elements are not moved: the navPoint with the n-th id of
		``new_order`` gets ``playOrder`` n (1-based). Unknown ids are ignored.
		"""
		nav_map = NCXDocument(self.tree).nav_map
		if not nav_map:
			return

		for position, item_id in enumerate(new_order, start=1):
			nav_points = self._find_nav_points_by_id(nav_map, item_id)
			if nav_points:
				NCXNavPoint(nav_points[0]).play_order = position

	# === Helper Methods ===

	def _convert_nav_points_recursive(
		self, nav_points: List[NCXNavPoint], level: int = 0
	) -> List[NavigationItem]:
		"""Convert NCX navPoints to NavigationItems recursively.

		``level`` is the nesting depth of ``nav_points``; child navPoints are
		attached as ``children`` with ``level + 1``.
		"""
		items = []

		for nav_point in nav_points:
			item = NavigationItem(
				id=nav_point.id or '',
				label=nav_point.label_text,
				# Guard against a content element without src
				target=nav_point.content_src or '',
				order=nav_point.play_order,
				level=level,
				item_type=nav_point.class_attr,
			)

			# Convert child nav points
			child_nav_points = nav_point.nav_points
			if child_nav_points:
				item.children = self._convert_nav_points_recursive(child_nav_points, level + 1)

			items.append(item)

		return items

	def _convert_page_targets(self, page_targets: List[NCXPageTarget]) -> List[NavigationItem]:
		"""Convert NCX pageTargets to level-0 NavigationItems."""
		return [
			NavigationItem(
				id=page_target.id or '',
				label=page_target.label_text,
				# Guard against a content element without src
				target=page_target.content_src or '',
				order=page_target.play_order,
				level=0,
				item_type=page_target.type_attr,
			)
			for page_target in page_targets
		]

	def _convert_nav_target(self, nav_target: NCXNavTarget) -> NavigationItem:
		"""Convert NCX navTarget to a level-0 NavigationItem.

		A missing navLabel gives an empty label; a missing content element or
		``src`` attribute gives an empty target.
		"""
		nav_label = nav_target.nav_label
		content = nav_target.content
		return NavigationItem(
			id=nav_target.id or '',
			label=nav_label.text if nav_label else '',
			target=(content.src or '') if content else '',
			order=nav_target.play_order,
			level=0,
			item_type=nav_target.class_attr,
		)


================================================
FILE: epub_utils/navigation/ncx/dom.py
================================================
"""NCX DOM classes for structured access to NCX navigation documents."""

from typing import List, Optional

from lxml import etree


class NCXElement:
	"""Thin wrapper around one element of an NCX document.

	The wrapped node is kept in ``element``; every accessor reads from or
	writes to it directly.
	"""

	def __init__(self, element: etree.Element):
		self.element = element

	@property
	def id(self) -> Optional[str]:
		"""Value of the ``id`` attribute, or ``None`` when it is absent."""
		node = self.element
		return node.get('id')

	@id.setter
	def id(self, value: str) -> None:
		"""Store ``value`` in the ``id`` attribute of the wrapped element."""
		node = self.element
		node.set('id', value)


class NCXText(NCXElement):
	"""Wrapper for a ``text`` element."""

	@property
	def text(self) -> str:
		"""Text held by the element; ``''`` when the element is empty."""
		content = self.element.text
		if not content:
			return ''
		return content

	@text.setter
	def text(self, value: str) -> None:
		"""Replace the element's text with ``value``."""
		node = self.element
		node.text = value


class NCXContent(NCXElement):
	"""Wrapper for a ``content`` element."""

	@property
	def src(self) -> Optional[str]:
		"""Value of the ``src`` attribute, or ``None`` when it is absent."""
		node = self.element
		return node.get('src')

	@src.setter
	def src(self, value: str) -> None:
		"""Store ``value`` in the ``src`` attribute."""
		node = self.element
		node.set('src', value)


class NCXNavLabel(NCXElement):
	"""Wrapper for a ``navLabel`` element."""

	@property
	def text_element(self) -> Optional[NCXText]:
		"""First direct ``text`` child wrapped as ``NCXText``, or ``None``."""
		found = self.element.xpath(
			'./ncx:text', namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		)
		return NCXText(found[0]) if found else None

	@property
	def text(self) -> str:
		"""Label text; ``''`` when there is no ``text`` child."""
		holder = self.text_element
		if not holder:
			return ''
		return holder.text

	@text.setter
	def text(self, value: str) -> None:
		"""Write ``value`` into the ``text`` child, creating the child if needed."""
		holder = self.text_element
		if not holder:
			# No text child yet: append one and fill it.
			created = etree.SubElement(self.element, '{http://www.daisy.org/z3986/2005/ncx/}text')
			created.text = value
			return
		holder.text = value


class NCXNavPoint(NCXElement):
	"""Represents a navPoint element in the navigation hierarchy."""

	@property
	def class_attr(self) -> Optional[str]:
		"""Get the class attribute (``None`` when absent)."""
		return self.element.get('class')

	@class_attr.setter
	def class_attr(self, value: str) -> None:
		"""Set the class attribute."""
		self.element.set('class', value)

	@property
	def play_order(self) -> Optional[int]:
		"""Get the playOrder attribute as an integer.

		Returns ``None`` when the attribute is missing, empty, or not a valid
		integer, so that one malformed navPoint does not abort reading the
		whole navigation.
		"""
		play_order = self.element.get('playOrder')
		if not play_order:
			return None
		try:
			return int(play_order)
		except ValueError:
			return None

	@play_order.setter
	def play_order(self, value: int) -> None:
		"""Set the playOrder attribute (stored as its string form)."""
		self.element.set('playOrder', str(value))

	@property
	def nav_label(self) -> Optional[NCXNavLabel]:
		"""Get the navLabel child element (``None`` when absent)."""
		nav_labels = self.element.xpath(
			'./ncx:navLabel', namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		)
		if nav_labels:
			return NCXNavLabel(nav_labels[0])
		return None

	@property
	def content(self) -> Optional[NCXContent]:
		"""Get the content child element (``None`` when absent)."""
		content_elements = self.element.xpath(
			'./ncx:content', namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		)
		if content_elements:
			return NCXContent(content_elements[0])
		return None

	@property
	def nav_points(self) -> List['NCXNavPoint']:
		"""Get the direct child navPoint elements."""
		nav_point_elements = self.element.xpath(
			'./ncx:navPoint', namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		)
		return [NCXNavPoint(point) for point in nav_point_elements]

	def add_nav_point(
		self,
		id: str,
		label_text: str,
		src: str,
		class_attr: Optional[str] = None,
		play_order: Optional[int] = None,
	) -> 'NCXNavPoint':
		"""Append a child navPoint with a navLabel and a content element.

		Args:
			id: Value of the new navPoint's ``id`` attribute.
			label_text: Text of the navLabel.
			src: Value of the content element's ``src`` attribute.
			class_attr: Optional ``class`` attribute; skipped when falsy.
			play_order: Optional ``playOrder``; skipped when ``None``.

		Returns:
			The wrapper of the newly created navPoint.
		"""
		nav_point_element = etree.SubElement(
			self.element, '{http://www.daisy.org/z3986/2005/ncx/}navPoint'
		)
		nav_point = NCXNavPoint(nav_point_element)
		nav_point.id = id

		if class_attr:
			nav_point.class_attr = class_attr
		if play_order is not None:
			nav_point.play_order = play_order

		# Add navLabel
		nav_label_element = etree.SubElement(
			nav_point_element, '{http://www.daisy.org/z3986/2005/ncx/}navLabel'
		)
		nav_label = NCXNavLabel(nav_label_element)
		nav_label.text = label_text

		# Add content
		content_element = etree.SubElement(
			nav_point_element, '{http://www.daisy.org/z3986/2005/ncx/}content'
		)
		content = NCXContent(content_element)
		content.src = src

		return nav_point

	@property
	def label_text(self) -> str:
		"""Get the text of the navLabel (``''`` when there is no navLabel)."""
		nav_label = self.nav_label
		return nav_label.text if nav_label else ''

	@property
	def content_src(self) -> str:
		"""Get the src of the content element.

		Returns ``''`` both when there is no content element and when the
		element has no ``src`` attribute, so the result is always a string.
		"""
		content = self.content
		if not content:
			return ''
		return content.src or ''


class NCXNavMap(NCXElement):
	"""Wrapper for a navMap element, giving access to the navPoints it contains."""

	@property
	def nav_points(self) -> List[NCXNavPoint]:
		"""Return wrappers for the navPoint elements that are direct children of the map."""
		ncx_ns = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		children = self.element.xpath('./ncx:navPoint', namespaces=ncx_ns)
		return [NCXNavPoint(child) for child in children]

	def add_nav_point(
		self,
		id: str,
		label_text: str,
		src: str,
		class_attr: Optional[str] = None,
		play_order: Optional[int] = None,
	) -> NCXNavPoint:
		"""
		Append a top-level navPoint, together with its navLabel and content children.

		Args:
		    id: Value assigned to the new navPoint wrapper's ``id``.
		    label_text: Value assigned to the new navLabel wrapper's ``text``.
		    src: Value assigned to the new content wrapper's ``src``.
		    class_attr: Optional class value; only applied when truthy.
		    play_order: Optional playOrder value; only skipped when it is None.

		Returns:
		    NCXNavPoint: Wrapper around the newly created navPoint element.
		"""
		point_el = etree.SubElement(self.element, '{http://www.daisy.org/z3986/2005/ncx/}navPoint')
		new_point = NCXNavPoint(point_el)
		new_point.id = id
		if class_attr:
			new_point.class_attr = class_attr
		if play_order is not None:
			new_point.play_order = play_order

		# The label child is created before the content child.
		label_el = etree.SubElement(point_el, '{http://www.daisy.org/z3986/2005/ncx/}navLabel')
		NCXNavLabel(label_el).text = label_text

		content_el = etree.SubElement(point_el, '{http://www.daisy.org/z3986/2005/ncx/}content')
		NCXContent(content_el).src = src

		return new_point

	def get_all_nav_points(self) -> List[NCXNavPoint]:
		"""Return wrappers for every navPoint below the map, at any nesting depth."""
		ncx_ns = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		descendants = self.element.xpath('.//ncx:navPoint', namespaces=ncx_ns)
		return [NCXNavPoint(descendant) for descendant in descendants]


class NCXPageTarget(NCXElement):
	"""Wrapper for a pageTarget element and its attributes, label and content."""

	@property
	def type_attr(self) -> Optional[str]:
		"""Return the ``type`` attribute, or None when it is not set."""
		return self.element.get('type')

	@type_attr.setter
	def type_attr(self, value: str) -> None:
		"""Store ``value`` in the ``type`` attribute."""
		self.element.set('type', value)

	@property
	def value(self) -> Optional[str]:
		"""Return the ``value`` attribute, or None when it is not set."""
		return self.element.get('value')

	@value.setter
	def value(self, value: str) -> None:
		"""Store ``value`` in the ``value`` attribute."""
		self.element.set('value', value)

	@property
	def play_order(self) -> Optional[int]:
		"""Return ``playOrder`` converted with int(), or None when missing or empty."""
		raw = self.element.get('playOrder')
		if not raw:
			return None
		return int(raw)

	@play_order.setter
	def play_order(self, value: int) -> None:
		"""Store ``str(value)`` in the ``playOrder`` attribute."""
		self.element.set('playOrder', str(value))

	@property
	def nav_label(self) -> Optional[NCXNavLabel]:
		"""Return the first direct navLabel child wrapped as NCXNavLabel, or None."""
		ncx_ns = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		matches = self.element.xpath('./ncx:navLabel', namespaces=ncx_ns)
		return NCXNavLabel(matches[0]) if matches else None

	@property
	def content(self) -> Optional[NCXContent]:
		"""Return the first direct content child wrapped as NCXContent, or None."""
		ncx_ns = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		matches = self.element.xpath('./ncx:content', namespaces=ncx_ns)
		return NCXContent(matches[0]) if matches else None

	@property
	def label_text(self) -> str:
		"""Return the ``text`` of the navLabel, or '' when the label is falsy."""
		label = self.nav_label
		if label:
			return label.text
		return ''

	@property
	def content_src(self) -> str:
		"""Return the ``src`` of the content child, or '' when it is falsy."""
		target = self.content
		if target:
			return target.src
		return ''


class NCXPageList(NCXElement):
	"""Wrapper for a pageList element, giving access to its pageTarget children."""

	@property
	def page_targets(self) -> List[NCXPageTarget]:
		"""Return wrappers for the pageTarget elements that are direct children."""
		ncx_ns = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		children = self.element.xpath('./ncx:pageTarget', namespaces=ncx_ns)
		return [NCXPageTarget(child) for child in children]

	def add_page_target(
		self,
		id: str,
		type_attr: str,
		value: str,
		label_text: str,
		src: str,
		play_order: Optional[int] = None,
	) -> NCXPageTarget:
		"""
		Append a pageTarget, together with its navLabel and content children.

		Args:
		    id: Value assigned to the new pageTarget wrapper's ``id``.
		    type_attr: Value stored in the ``type`` attribute.
		    value: Value stored in the ``value`` attribute.
		    label_text: Value assigned to the new navLabel wrapper's ``text``.
		    src: Value assigned to the new content wrapper's ``src``.
		    play_order: Optional playOrder value; only skipped when it is None.

		Returns:
		    NCXPageTarget: Wrapper around the newly created pageTarget element.
		"""
		target_el = etree.SubElement(
			self.element, '{http://www.daisy.org/z3986/2005/ncx/}pageTarget'
		)
		new_target = NCXPageTarget(target_el)
		new_target.id = id
		new_target.type_attr = type_attr
		new_target.value = value
		if play_order is not None:
			new_target.play_order = play_order

		# The label child is created before the content child.
		label_el = etree.SubElement(target_el, '{http://www.daisy.org/z3986/2005/ncx/}navLabel')
		NCXNavLabel(label_el).text = label_text

		content_el = etree.SubElement(target_el, '{http://www.daisy.org/z3986/2005/ncx/}content')
		NCXContent(content_el).src = src

		return new_target


class NCXNavTarget(NCXElement):
	"""Wrapper for a navTarget element and its attributes, label and content."""

	@property
	def value(self) -> Optional[str]:
		"""Return the ``value`` attribute, or None when it is not set."""
		return self.element.get('value')

	@value.setter
	def value(self, value: str) -> None:
		"""Store ``value`` in the ``value`` attribute."""
		self.element.set('value', value)

	@property
	def class_attr(self) -> Optional[str]:
		"""Return the ``class`` attribute, or None when it is not set."""
		return self.element.get('class')

	@class_attr.setter
	def class_attr(self, value: str) -> None:
		"""Store ``value`` in the ``class`` attribute."""
		self.element.set('class', value)

	@property
	def play_order(self) -> Optional[int]:
		"""Return ``playOrder`` converted with int(), or None when missing or empty."""
		raw = self.element.get('playOrder')
		if not raw:
			return None
		return int(raw)

	@play_order.setter
	def play_order(self, value: int) -> None:
		"""Store ``str(value)`` in the ``playOrder`` attribute."""
		self.element.set('playOrder', str(value))

	@property
	def nav_label(self) -> Optional[NCXNavLabel]:
		"""Return the first direct navLabel child wrapped as NCXNavLabel, or None."""
		ncx_ns = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		matches = self.element.xpath('./ncx:navLabel', namespaces=ncx_ns)
		return NCXNavLabel(matches[0]) if matches else None

	@property
	def content(self) -> Optional[NCXContent]:
		"""Return the first direct content child wrapped as NCXContent, or None."""
		ncx_ns = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		matches = self.element.xpath('./ncx:content', namespaces=ncx_ns)
		return NCXContent(matches[0]) if matches else None


class NCXNavList(NCXElement):
	"""Wrapper for a navList element: an optional label plus navTarget children."""

	@property
	def nav_label(self) -> Optional[NCXNavLabel]:
		"""Return the first direct navLabel child wrapped as NCXNavLabel, or None."""
		ncx_ns = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		matches = self.element.xpath('./ncx:navLabel', namespaces=ncx_ns)
		return NCXNavLabel(matches[0]) if matches else None

	@property
	def nav_targets(self) -> List[NCXNavTarget]:
		"""Return wrappers for the navTarget elements that are direct children."""
		ncx_ns = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		children = self.element.xpath('./ncx:navTarget', namespaces=ncx_ns)
		return [NCXNavTarget(child) for child in children]

	def add_nav_target(
		self, id: str, label_text: str, src: str, play_order: Optional[int] = None
	) -> NCXNavTarget:
		"""
		Append a navTarget, together with its navLabel and content children.

		Args:
		    id: Value assigned to the new navTarget wrapper's ``id``.
		    label_text: Value assigned to the new navLabel wrapper's ``text``.
		    src: Value assigned to the new content wrapper's ``src``.
		    play_order: Optional playOrder value; only skipped when it is None.

		Returns:
		    NCXNavTarget: Wrapper around the newly created navTarget element.
		"""
		target_el = etree.SubElement(
			self.element, '{http://www.daisy.org/z3986/2005/ncx/}navTarget'
		)
		new_target = NCXNavTarget(target_el)
		new_target.id = id
		if play_order is not None:
			new_target.play_order = play_order

		# The label child is created before the content child.
		label_el = etree.SubElement(target_el, '{http://www.daisy.org/z3986/2005/ncx/}navLabel')
		NCXNavLabel(label_el).text = label_text

		content_el = etree.SubElement(target_el, '{http://www.daisy.org/z3986/2005/ncx/}content')
		NCXContent(content_el).src = src

		return new_target

	@property
	def label_text(self) -> str:
		"""Return the ``text`` of the list's navLabel, or '' when the label is falsy."""
		label = self.nav_label
		if label:
			return label.text
		return ''


class NCXDocument(NCXElement):
	"""Represents the root ncx element."""

	@property
	def nav_map(self) -> Optional[NCXNavMap]:
		"""Return the first direct navMap child wrapped as NCXNavMap, or None if absent."""
		nav_map_elements = self.element.xpath(
			'./ncx:navMap', namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		)
		if nav_map_elements:
			return NCXNavMap(nav_map_elements[0])
		return None

	@property
	def page_list(self) -> Optional[NCXPageList]:
		"""Return the first direct pageList child wrapped as NCXPageList, or None if absent."""
		page_list_elements = self.element.xpath(
			'./ncx:pageList', namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		)
		if page_list_elements:
			return NCXPageList(page_list_elements[0])
		return None

	@property
	def nav_lists(self) -> List[NCXNavList]:
		"""Return wrappers for every direct navList child (empty list when there are none)."""
		nav_list_elements = self.element.xpath(
			'./ncx:navList', namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		)
		return [NCXNavList(nav_list) for nav_list in nav_list_elements]

	@property
	def title(self) -> str:
		"""
		Return the text of the first ``docTitle/text`` element.

		Returns:
		    str: The title text, or '' when the element is missing or has no text.
		"""
		title_elements = self.element.xpath(
			'.//ncx:docTitle/ncx:text', namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		)
		# An element without character data has ``.text`` of None; normalise that to ''
		# so the property always honours its ``str`` return type.
		return (title_elements[0].text or '') if title_elements else ''

	@property
	def author(self) -> str:
		"""
		Return the text of the first ``docAuthor/text`` element.

		Returns:
		    str: The author text, or '' when the element is missing or has no text.
		"""
		author_elements = self.element.xpath(
			'.//ncx:docAuthor/ncx:text', namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'}
		)
		# Same None -> '' normalisation as ``title``.
		return (author_elements[0].text or '') if author_elements else ''

	def get_uid(self) -> Optional[str]:
		"""Return the ``content`` of the first ``dtb:uid`` meta element, or None if absent."""
		uid_elements = self.element.xpath(
			'.//ncx:meta[@name="dtb:uid"]/@content',
			namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'},
		)
		return uid_elements[0] if uid_elements else None

	def get_depth(self) -> Optional[int]:
		"""
		Return the ``dtb:depth`` meta content as an int, or None if the meta is absent.

		Raises:
		    ValueError: If the content is present but int() cannot convert it.
		"""
		depth_elements = self.element.xpath(
			'.//ncx:meta[@name="dtb:depth"]/@content',
			namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'},
		)
		return int(depth_elements[0]) if depth_elements else None

	def get_total_page_count(self) -> Optional[int]:
		"""
		Return the ``dtb:totalPageCount`` meta content as an int, or None if absent.

		Raises:
		    ValueError: If the content is present but int() cannot convert it.
		"""
		count_elements = self.element.xpath(
			'.//ncx:meta[@name="dtb:totalPageCount"]/@content',
			namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'},
		)
		return int(count_elements[0]) if count_elements else None

	def get_max_page_number(self) -> Optional[int]:
		"""
		Return the ``dtb:maxPageNumber`` meta content as an int, or None if absent.

		Raises:
		    ValueError: If the content is present but int() cannot convert it.
		"""
		max_elements = self.element.xpath(
			'.//ncx:meta[@name="dtb:maxPageNumber"]/@content',
			namespaces={'ncx': 'http://www.daisy.org/z3986/2005/ncx/'},
		)
		return int(max_elements[0]) if max_elements else None


================================================
FILE: epub_utils/package/__init__.py
================================================
"""
Open Packaging Format (OPF): https://www.w3.org/TR/epub/#sec-package-doc

This file includes the `Package` class, which is responsible for parsing the OPF package file
of an EPUB archive. The OPF file contains metadata, manifest, spine, and guide information
about the EPUB content.

Namespace:
- The OPF file uses the namespace `http://www.idpf.org/2007/opf`.

For more details on the structure and requirements of the OPF file, refer to the
EPUB specification: https://www.w3.org/TR/epub/#sec-package-doc
"""

try:
	from lxml import etree
except ImportError:
	import xml.etree.ElementTree as etree

import packaging.version

from epub_utils.exceptions import InvalidEPUBError, ParseError, UnsupportedFormatError
from epub_utils.package.manifest import Manifest
from epub_utils.package.metadata import Metadata
from epub_utils.package.spine import Spine
from epub_utils.printers import XMLPrinter


class Package:
	"""
	Represents the parsed OPF package file of an EPUB.

	Attributes:
	    xml_content (str): The raw XML content of the OPF package file.
	    metadata (Metadata): The parsed metadata section of the OPF file.
	    manifest (Manifest): The parsed manifest section listing all resources.
	    spine (Spine): The parsed spine section defining the reading order.
	    guide: Initialised to None and not populated by ``_parse``.
	    cover: Initialised to None and not populated by ``_parse``.
	    toc_href (str): href of the NCX document; only looked up when the package
	        major version is not 3, otherwise None.
	    nav_href (str): href of the navigation document; only looked up when the
	        package major version is 3, otherwise None.
	    version (packaging.version.Version): The parsed ``version`` attribute.
	"""

	NAMESPACE = 'http://www.idpf.org/2007/opf'
	DC_NAMESPACE = 'http://purl.org/dc/elements/1.1/'
	METADATA_XPATH = f'.//{{{NAMESPACE}}}metadata'
	SPINE_XPATH = f'.//{{{NAMESPACE}}}spine'
	MANIFEST_XPATH = f'.//{{{NAMESPACE}}}manifest'
	ITEM_XPATH = f'.//{{{NAMESPACE}}}item'
	NCX_MEDIA_TYPE = 'application/x-dtbncx+xml'
	TITLE_XPATH = f'.//{{{DC_NAMESPACE}}}title'
	CREATOR_XPATH = f'.//{{{DC_NAMESPACE}}}creator'
	IDENTIFIER_XPATH = f'.//{{{DC_NAMESPACE}}}identifier'

	def __init__(self, xml_content: str) -> None:
		"""
		Initialize the Package by parsing the OPF package file.

		Args:
		    xml_content (str): The raw XML content of the OPF package file.

		Raises:
		    ParseError: If the XML is invalid or cannot be parsed.
		    InvalidEPUBError: If required OPF elements are missing or the version
		        string cannot be parsed.
		    UnsupportedFormatError: If the EPUB major version is not 1, 2 or 3.
		"""
		self.xml_content = xml_content

		self.metadata = None
		self.manifest = None
		self.spine = None
		self.guide = None
		self.cover = None
		self.toc_href = None
		self.nav_href = None
		self.version = None

		self._parse(xml_content)

		self._printer = XMLPrinter(self)

	def __str__(self) -> str:
		"""Return the raw OPF XML exactly as it was passed in."""
		return self.xml_content

	def to_str(self, *args, **kwargs) -> str:
		"""Delegate to the XMLPrinter's ``to_str`` with the given arguments."""
		return self._printer.to_str(*args, **kwargs)

	def to_xml(self, *args, **kwargs) -> str:
		"""Delegate to the XMLPrinter's ``to_xml`` with the given arguments."""
		return self._printer.to_xml(*args, **kwargs)

	def _parse(self, xml_content: str) -> None:
		"""
		Parses the OPF package file and populates version, metadata, manifest,
		spine and the navigation href.

		Args:
		    xml_content (str): The raw XML content of the OPF package file.

		Raises:
		    ParseError: If the XML is invalid or cannot be parsed.
		    InvalidEPUBError: If required OPF elements are missing.
		    UnsupportedFormatError: If the EPUB major version is not 1, 2 or 3.
		"""
		try:
			# Parse from bytes so an XML declaration carrying an encoding is accepted.
			if isinstance(xml_content, str):
				xml_content = xml_content.encode('utf-8')
			root = etree.fromstring(xml_content)

			# Check for version attribute
			if 'version' not in root.attrib:
				raise InvalidEPUBError(
					"OPF file missing required 'version' attribute",
					suggestions=[
						'Ensure the package element has a version attribute',
						'Check that this is a valid EPUB OPF file',
						'Verify the EPUB was created with compliant tools',
					],
				)

			self.version = self._parse_version(root.attrib['version'])

			# Parse metadata
			metadata_el = root.find(self.METADATA_XPATH)
			if metadata_el is None:
				raise InvalidEPUBError(
					'OPF file missing required metadata element',
					suggestions=[
						'Ensure the OPF file contains a metadata section',
						'Check the EPUB package structure',
						'Verify all required OPF elements are present',
					],
				)
			metadata_xml = etree.tostring(metadata_el, encoding='unicode')
			self.metadata = Metadata(metadata_xml)

			# Parse manifest
			manifest_el = root.find(self.MANIFEST_XPATH)
			if manifest_el is None:
				raise InvalidEPUBError(
					'OPF file missing required manifest element',
					suggestions=[
						'Ensure the OPF file contains a manifest section',
						'Check that all resources are declared in the manifest',
						'Verify the EPUB package structure is complete',
					],
				)
			manifest_xml = etree.tostring(manifest_el, encoding='unicode')
			self.manifest = Manifest(manifest_xml)

			# Parse spine
			spine_el = root.find(self.SPINE_XPATH)
			if spine_el is None:
				raise InvalidEPUBError(
					'OPF file missing required spine element',
					suggestions=[
						'Ensure the OPF file contains a spine section',
						'Check that reading order is defined in the spine',
						'Verify the EPUB package structure is complete',
					],
				)
			spine_xml = etree.tostring(spine_el, encoding='unicode')
			self.spine = Spine(spine_xml)

			# Parse TOC references: version 3 packages use the navigation document,
			# every other supported version uses the NCX.
			if self.version.major == 3:
				self.nav_href = self._find_nav_href(root)
			else:
				self.toc_href = self._find_toc_href(root)

		except etree.ParseError as e:
			raise ParseError(
				f'Invalid XML in OPF file: {str(e)}',
				suggestions=[
					'Check that the OPF file contains valid XML',
					'Verify the file is not corrupted',
					'Ensure all XML tags are properly closed',
					'Check for invalid characters in the XML',
				],
			) from e

	def _get_text(self, root: etree.Element, xpath: str) -> str:
		"""
		Helper method to extract text content from an XML element.

		Args:
		    root (etree.Element): The root element to search within.
		    xpath (str): The XPath expression to locate the element.

		Returns:
		    str: The stripped text content of the element, or None if the element
		    is not found or has no text.
		"""
		element = root.find(xpath)
		return element.text.strip() if element is not None and element.text else None

	def _find_toc_href(self, root: etree.Element) -> str:
		"""
		Find the publication navigation control file.

		Args:
		    root (etree.Element): The root element of the OPF document.

		Returns:
		    str: The href to the NCX document, or None if not found.
		"""
		# First check for NCX media-type in manifest
		for item in root.findall(self.ITEM_XPATH):
			if item.get('media-type') == self.NCX_MEDIA_TYPE:
				return item.get('href')

		# Then check spine toc attribute
		spine = root.find(self.SPINE_XPATH)
		if spine is not None:
			toc_id = spine.get('toc')
			if toc_id:
				for item in root.findall(self.ITEM_XPATH):
					if item.get('id') == toc_id:
						href = item.get('href')
						if href:
							# Remove fragment identifier if present
							return href.split('#')[0]

		return None

	def _find_nav_href(self, root: etree.Element) -> str:
		"""
		Find the publication navigation file.

		Args:
		    root (etree.Element): The root element of the OPF document.

		Returns:
		    str: The href to navigation file (fragment removed), or None if not found.
		"""
		# Check for item with nav properties. ``properties`` is a whitespace-separated
		# list of tokens, so test membership instead of comparing the whole attribute;
		# otherwise an item declared as properties="nav scripted" would be missed.
		for item in root.findall(self.ITEM_XPATH):
			if 'nav' in (item.get('properties') or '').split():
				href = item.get('href')
				if href:
					return href.split('#')[0]

		# Fall back to guide TOC reference
		guide = root.find(f'.//{{{self.NAMESPACE}}}guide')
		if guide is not None:
			for reference in guide.findall(f'.//{{{self.NAMESPACE}}}reference'):
				if reference.get('type') == 'toc':
					href = reference.get('href')
					if href:
						return href.split('#')[0]

		return None

	def _parse_version(self, version):
		"""
		Parse and validate the EPUB version.

		Args:
		    version (str): Version string from the OPF file.

		Returns:
		    packaging.version.Version: Parsed version object.

		Raises:
		    InvalidEPUBError: If the version string cannot be parsed.
		    UnsupportedFormatError: If the EPUB version is not supported.
		"""
		try:
			version_obj = packaging.version.Version(version)
		except packaging.version.InvalidVersion as e:
			raise InvalidEPUBError(
				f"Invalid version format in OPF file: '{version}'",
				suggestions=[
					"Ensure the version follows semantic versioning (e.g., '3.0', '2.0')",
					'Check that the version attribute is correctly formatted',
					'Verify the EPUB was created with compliant tools',
				],
			) from e

		if version_obj.major not in (1, 2, 3):
			supported_versions = '1.x, 2.x, 3.x'
			raise UnsupportedFormatError(
				f'EPUB version {version_obj.major}.x is not supported',
				epub_version=str(version_obj),
				suggestions=[
					f'Use an EPUB with a supported version ({supported_versions})',
					'Convert the EPUB to a supported version',
					'Check the EPUB specification for version requirements',
				],
			)

		return version_obj


================================================
FILE: epub_utils/package/manifest.py
================================================
try:
	from lxml import etree
except ImportError:
	import xml.etree.ElementTree as etree

from epub_utils.exceptions import ParseError
from epub_utils.printers import XMLPrinter


class Manifest:
	"""
	Parsed view of the manifest element of an EPUB package document.

	``items`` holds one dict per manifest item that carries an id, an href and a
	media-type; each dict has the keys ``id``, ``href``, ``media_type`` and
	``properties`` (a list of whitespace-separated tokens).
	"""

	NAMESPACE = 'http://www.idpf.org/2007/opf'
	ITEM_XPATH = f'.//{{{NAMESPACE}}}item'

	def __init__(self, xml_content: str):
		"""Store the raw manifest XML, parse it into ``items`` and attach a printer."""
		self.xml_content = xml_content
		self.items = []

		self._parse(xml_content)

		self._printer = XMLPrinter(self)

	def __str__(self) -> str:
		"""Return the raw manifest XML exactly as it was passed in."""
		return self.xml_content

	def to_str(self, *args, **kwargs) -> str:
		"""Delegate to the XMLPrinter's ``to_str`` with the given arguments."""
		return self._printer.to_str(*args, **kwargs)

	def to_xml(self, *args, **kwargs) -> str:
		"""Delegate to the XMLPrinter's ``to_xml`` with the given arguments."""
		return self._printer.to_xml(*args, **kwargs)

	def _parse(self, xml_content: str) -> None:
		"""
		Parse the manifest XML and append one dict per complete item to ``items``.

		Items lacking any of id, href or media-type are silently skipped.

		Raises:
		    ParseError: If the XML cannot be parsed.
		"""
		try:
			raw = xml_content.encode('utf-8') if isinstance(xml_content, str) else xml_content
			manifest_root = etree.fromstring(raw)

			for element in manifest_root.findall(self.ITEM_XPATH):
				item_id = element.get('id')
				href = element.get('href')
				media_type = element.get('media-type')
				if item_id is None or href is None or media_type is None:
					continue
				self.items.append(
					{
						'id': item_id,
						'href': href,
						'media_type': media_type,
						'properties': element.get('properties', '').split(),
					}
				)

		except etree.ParseError as e:
			raise ParseError(
				f'Invalid XML in manifest element: {str(e)}',
				element_name='manifest',
				suggestions=[
					'Check that the manifest contains valid XML',
					'Verify all manifest items are properly formatted',
					'Ensure required attributes (id, href, media-type) are present',
					'Check for invalid characters in the XML',
				],
			) from e

	def find_by_property(self, property_name: str) -> dict:
		"""Return the first item whose properties include ``property_name``, else None."""
		return next((entry for entry in self.items if property_name in entry['properties']), None)

	def find_by_id(self, item_id: str) -> dict:
		"""Return the first item whose id equals ``item_id``, else None."""
		return next((entry for entry in self.items if entry['id'] == item_id), None)

	def find_by_media_type(self, media_type: str) -> list:
		"""Return every item whose media type equals ``media_type`` (possibly empty)."""
		matches = []
		for entry in self.items:
			if entry['media_type'] == media_type:
				matches.append(entry)
		return matches


================================================
FILE: epub_utils/package/metadata.py
================================================
try:
	from lxml import etree
except ImportError:
	import xml.etree.ElementTree as etree

from epub_utils.exceptions import ParseError, ValidationError
from epub_utils.printers import XMLPrinter


class Metadata:
	"""
	Represents the metadata section of an EPUB package document.
	Handles Dublin Core (DC) and Dublin Core Terms (DCTERMS) metadata elements.

	Parsed values are stored in ``fields`` keyed by local element/property name;
	a name that occurs more than once maps to a list of its values. Unknown
	attribute access on an instance falls back to ``fields`` and yields None
	for names that were not parsed.
	"""

	DC_NAMESPACE = 'http://purl.org/dc/elements/1.1/'
	DCTERMS_NAMESPACE = 'http://purl.org/dc/terms/'
	OPF_NAMESPACE = 'http://www.idpf.org/2007/opf'
	REQUIRED_FIELDS = ['identifier', 'title', 'creator']

	NSMAP = {'dc': DC_NAMESPACE, 'dcterms': DCTERMS_NAMESPACE}

	def __init__(self, xml_content: str):
		"""Store the raw metadata XML, parse it into ``fields`` and attach a printer."""
		self.xml_content = xml_content
		self.fields = {}

		self._parse(xml_content)

		self._printer = XMLPrinter(self)

	def _parse(self, xml_content: str) -> None:
		"""
		Parse the metadata XML and populate ``fields``.

		Collects the stripped text of every DC / DCTERMS element, then the text of
		every ``meta`` element whose ``property`` starts with ``dcterms:``.

		Raises:
		    ParseError: If the XML cannot be parsed.
		"""
		try:
			if isinstance(xml_content, str):
				xml_content = xml_content.encode('utf-8')
			root = etree.fromstring(xml_content)

			for ns_uri in self.NSMAP.values():
				for element in root.findall(f'.//{{{ns_uri}}}*'):
					name = element.tag.split('}')[-1]
					text = element.text.strip() if element.text else None
					if text:
						self._add_field(name, text)

			# ``meta`` elements inherit the OPF default namespace when the metadata
			# block was serialised from a package document, so look for both the
			# un-namespaced and the OPF-namespaced tag.
			meta_elements = list(root.findall('.//meta[@property]'))
			meta_elements += root.findall(f'.//{{{self.OPF_NAMESPACE}}}meta[@property]')
			for meta in meta_elements:
				prop = meta.get('property', '')
				if prop.startswith('dcterms:'):
					name = prop.split(':')[1]
					text = meta.text.strip() if meta.text else None
					if text:
						self._add_field(name, text)

			self._validate()

		except etree.ParseError as e:
			raise ParseError(
				f'Invalid XML in metadata element: {str(e)}',
				element_name='metadata',
				suggestions=[
					'Check that the metadata contains valid XML',
					'Verify all metadata elements are properly formatted',
					'Ensure required Dublin Core elements are present',
					'Check for invalid characters in metadata values',
				],
			) from e

	def _add_field(self, name: str, value: str) -> None:
		"""Record ``value`` under ``name``, promoting to a list on repeated names."""
		if name in self.fields:
			if isinstance(self.fields[name], list):
				self.fields[name].append(value)
			else:
				self.fields[name] = [self.fields[name], value]
		else:
			self.fields[name] = value

	def _validate(self, raise_exception=False) -> None:
		"""
		Check every entry of ``REQUIRED_FIELDS``.

		Args:
		    raise_exception: When False (the default) failures are collected but
		        ignored; when True a ValidationError is raised if any field fails.

		Raises:
		    ValidationError: If ``raise_exception`` is True and a required field is
		        missing or blank.
		"""
		errors = {}

		for field in self.REQUIRED_FIELDS:
			try:
				self._validate_field(field)
			except ValueError as e:
				errors[field] = str(e)

		if errors and raise_exception:
			validation_errors = [f"Missing or invalid '{field}' element" for field in errors.keys()]

			raise ValidationError(
				'EPUB metadata validation failed',
				validation_errors=validation_errors,
				suggestions=[
					'Ensure all required Dublin Core metadata elements are present',
					'Check that metadata values are not empty',
					'Verify the metadata follows EPUB specification requirements',
					'Use proper Dublin Core namespace for metadata elements',
				],
			)

	def _validate_field(self, field_name: str) -> None:
		"""
		Validate an individual field.

		Args:
		    field_name: Name of the field to validate

		Raises:
		    ValueError: If the field validation fails
		"""
		value = self.fields.get(field_name)
		if value is None or (isinstance(value, str) and not value.strip()):
			raise ValueError('This field is required')

	def __str__(self) -> str:
		"""Return the raw metadata XML exactly as it was passed in."""
		return self.xml_content

	def to_str(self, *args, **kwargs) -> str:
		"""Delegate to the XMLPrinter's ``to_str`` with the given arguments."""
		return self._printer.to_str(*args, **kwargs)

	def to_xml(self, *args, **kwargs) -> str:
		"""Delegate to the XMLPrinter's ``to_xml`` with the given arguments."""
		return self._printer.to_xml(*args, **kwargs)

	def _get_text(self, root: etree.Element, xpath: str) -> str:
		"""Return the stripped text of the first match of ``xpath``, or None."""
		element = root.find(xpath)
		return element.text.strip() if element is not None and element.text else None

	def __getattr__(self, name: str) -> str:
		"""
		Expose parsed fields as attributes; unknown field names yield None.

		Raises:
		    AttributeError: For dunder names, and for ``fields`` itself when it has
		        not been set on the instance yet.
		"""
		# Protocol probes such as __setstate__/__getstate__ (copy, pickle) must see
		# AttributeError rather than None, and looking up ``fields`` on an instance
		# that has not been initialised must not recurse back into this method.
		if name == 'fields' or (name.startswith('__') and name.endswith('__')):
			raise AttributeError(name)
		return self.fields.get(name)

	def to_kv(self) -> str:
		"""Render ``fields`` as right-aligned ``key: value`` lines ('' when empty)."""
		if not self.fields:
			return ''

		max_key_length = max(len(k) for k in self.fields.keys())

		lines = [f'{k.rjust(max_key_length)}: {str(v)}' for k, v in self.fields.items()]

		return '\n'.join(lines)


================================================
FILE: epub_utils/package/spine.py
================================================
try:
	from lxml import etree
except ImportError:
	import xml.etree.ElementTree as etree

from epub_utils.exceptions import ParseError
from epub_utils.printers import XMLPrinter


class Spine:
	"""
	Parsed view of the spine element of an EPUB package document.

	``itemrefs`` holds one dict per itemref that has an idref, with the keys
	``idref``, ``linear`` (bool) and ``properties`` (list of tokens).
	"""

	NAMESPACE = 'http://www.idpf.org/2007/opf'
	ITEMREF_XPATH = f'.//{{{NAMESPACE}}}itemref'

	def __init__(self, xml_content: str):
		"""Store the raw spine XML, parse it and attach a printer."""
		self.xml_content = xml_content

		self.itemrefs = []
		self.toc = None
		self.page_progression_direction = None

		self._parse(xml_content)

		self._printer = XMLPrinter(self)

	def __str__(self) -> str:
		"""Return the raw spine XML exactly as it was passed in."""
		return self.xml_content

	def to_str(self, *args, **kwargs) -> str:
		"""Delegate to the XMLPrinter's ``to_str`` with the given arguments."""
		return self._printer.to_str(*args, **kwargs)

	def to_xml(self, *args, **kwargs) -> str:
		"""Delegate to the XMLPrinter's ``to_xml`` with the given arguments."""
		return self._printer.to_xml(*args, **kwargs)

	def _parse(self, xml_content: str) -> None:
		"""
		Parse the spine XML into ``toc``, ``page_progression_direction`` and ``itemrefs``.

		``page_progression_direction`` falls back to 'default' and ``linear`` to 'yes'
		when the attributes are absent; itemrefs without an idref are skipped.

		Raises:
		    ParseError: If the XML cannot be parsed.
		"""
		try:
			raw = xml_content.encode('utf-8') if isinstance(xml_content, str) else xml_content
			spine_root = etree.fromstring(raw)

			self.toc = spine_root.get('toc')
			self.page_progression_direction = spine_root.get(
				'page-progression-direction', 'default'
			)

			for ref in spine_root.findall(self.ITEMREF_XPATH):
				idref = ref.get('idref')
				if not idref:
					continue
				self.itemrefs.append(
					{
						'idref': idref,
						'linear': ref.get('linear', 'yes') == 'yes',
						'properties': ref.get('properties', '').split(),
					}
				)

		except etree.ParseError as e:
			raise ParseError(
				f'Invalid XML in spine element: {str(e)}',
				element_name='spine',
				suggestions=[
					'Check that the spine contains valid XML',
					'Verify all spine items are properly formatted',
					'Ensure required attributes (idref) are present',
					'Check that spine defines the reading order correctly',
				],
			) from e

	def find_by_idref(self, itemref_idref: str) -> dict:
		"""Return the first itemref whose idref equals ``itemref_idref``, else None."""
		return next((entry for entry in self.itemrefs if entry['idref'] == itemref_idref), None)


================================================
FILE: epub_utils/printers.py
================================================
try:
	from lxml import etree
except ImportError:
	import xml.etree.ElementTree as etree

from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import XmlLexer


def highlight_xml(xml_content: str) -> str:
	"""Return ``xml_content`` rendered by pygments with the XML lexer and terminal formatter."""
	lexer = XmlLexer()
	formatter = TerminalFormatter()
	return highlight(xml_content, lexer, formatter)


def pretty_print_xml(xml_content: str) -> str:
	"""
	Re-indent an XML document, keeping its XML declaration and DOCTYPE.

	The declaration and DOCTYPE are copied textually from the input and written
	in front of the re-serialised root element, each on its own line.

	Args:
	    xml_content: The XML as str or bytes (bytes are decoded as UTF-8 for the
	        textual prolog extraction).

	Returns:
	    The pretty-printed XML, or the input unchanged when it cannot be parsed.
	"""
	try:
		original_content = xml_content
		if isinstance(xml_content, str):
			xml_content_bytes = xml_content.encode('utf-8')
		else:
			xml_content_bytes = xml_content
			if isinstance(xml_content, bytes):
				original_content = xml_content.decode('utf-8')

		# Textual copy of the XML declaration: everything up to and including '?>'.
		xml_declaration = ''
		if original_content.strip().startswith('<?xml'):
			xml_declaration = original_content[: original_content.find('?>') + 2]

		# Textual copy of the DOCTYPE: from '<!DOCTYPE' to the next '>'.
		doctype_declaration = ''
		doctype_at = original_content.find('<!DOCTYPE')
		if doctype_at != -1:
			doctype_declaration = original_content[
				doctype_at : original_content.find('>', doctype_at) + 1
			]

		blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
		tree_root = etree.fromstring(xml_content_bytes, blank_stripping_parser)
		pretty_xml = etree.tostring(tree_root, pretty_print=True, encoding='unicode')

		# Only non-empty prolog pieces are emitted, each followed by a newline.
		prolog = ''.join(
			piece + '\n' for piece in (xml_declaration, doctype_declaration) if piece
		)
		return prolog + pretty_xml
	except etree.ParseError:
		return original_content if isinstance(original_content, str) else xml_content


def print_to_str(xml_content: str, pretty_print: bool) -> str:
	"""
	Return the XML content, optionally re-indented.

	Args:
	    xml_content: The XML text to return (previously mis-annotated as ``bool``).
	    pretty_print: When true, the content is passed through ``pretty_print_xml``.

	Returns:
	    The XML text, pretty-printed if requested, otherwise unchanged.
	"""
	if pretty_print:
		xml_content = pretty_print_xml(xml_content)

	return xml_content


def print_to_xml(xml_content: str, pretty_print: bool, highlight_syntax: bool) -> str:
	"""
	Return the XML content, optionally re-indented and then syntax highlighted.

	Args:
	    xml_content: The XML text to render.
	    pretty_print: When true, the content is passed through ``pretty_print_xml``.
	    highlight_syntax: When true, the (possibly re-indented) content is passed
	        through ``highlight_xml``.

	Returns:
	    The rendered XML text.
	"""
	rendered = pretty_print_xml(xml_content) if pretty_print else xml_content
	return highlight_xml(rendered) if highlight_syntax else rendered


class XMLPrinter:
	"""Renders the ``xml_content`` of a wrapped object as plain or highlighted text."""

	def __init__(self, xml_content_provider):
		"""
		Remember the object whose XML will be printed.

		Args:
			xml_content_provider: Object that has an xml_content attribute
		"""
		self._xml_content_provider = xml_content_provider

	def to_str(self, pretty_print: bool = False) -> str:
		"""
		Return the provider's XML as a string.

		Args:
			pretty_print: Whether to re-indent the XML first

		Returns:
			The XML text, re-indented when requested
		"""
		# Read at call time so the provider's current xml_content is always used.
		source_xml = self._xml_content_provider.xml_content
		return print_to_str(source_xml, pretty_print)

	def to_xml(self, pretty_print: bool = False, highlight_syntax: bool = True) -> str:
		"""
		Return the provider's XML, optionally re-indented and syntax highlighted.

		Args:
			pretty_print: Whether to re-indent the XML first
			highlight_syntax: Whether to apply syntax highlighting (on by default)

		Returns:
			The rendered XML text
		"""
		source_xml = self._xml_content_provider.xml_content
		return print_to_xml(source_xml, pretty_print, highlight_syntax)


================================================
FILE: pytest.ini
================================================
[pytest]
pythonpath = .
python_files = tests.py test_*.py *_tests.py
addopts = -p no:warnings

================================================
FILE: requirements/requirements-docs.txt
================================================
sphinx==6.2.0
sphinx-copybutton==0.5.1
sphinx-issues==3.0.1
furo==2022.12.7

================================================
FILE: requirements/requirements-linting.txt
================================================
ruff==0.11.9

================================================
FILE: requirements/requirements-testing.txt
================================================
coverage==6.4.1
coverage-badge==1.1.0
pytest==7.2.0
pytest-cov==3.0.0

================================================
FILE: requirements/requirements.txt
================================================
click==8.1.8
lxml==5.4.0
pygments==2.19.1
PyYAML==6.0.2

================================================
FILE: requirements.txt
================================================
-r requirements/requirements-docs.txt
-r requirements/requirements-linting.txt
-r requirements/requirements-testing.txt
-r requirements/requirements.txt

================================================
FILE: ruff.toml
================================================
line-length = 100

[format]
quote-style = "single"
indent-style = "tab"
docstring-code-format = true

================================================
FILE: setup.py
================================================
import os

from setuptools import find_packages, setup

# Distribution version string; passed to setup() below as `version`.
VERSION = '0.1.0a1'


def get_long_description():
	"""Return the UTF-8 text of the README.md that sits next to this file."""
	here = os.path.dirname(os.path.abspath(__file__))
	readme_path = os.path.join(here, 'README.md')
	with open(readme_path, encoding='utf8') as readme:
		return readme.read()


# Package metadata and build configuration for the `epub-utils` distribution.
setup(
	name='epub-utils',
	description='A Python CLI and utility library for manipulating EPUB files',
	# The README is reused verbatim as the long description (declared as Markdown below).
	long_description=get_long_description(),
	long_description_content_type='text/markdown',
	author='Ernesto González',
	url='https://github.com/ernestofgonzalez/epub-utils',
	project_urls={
		'Source code': 'https://github.com/ernestofgonzalez/epub-utils',
		'Issues': 'https://github.com/ernestofgonzalez/epub-utils/issues',
		'CI': 'https://github.com/ernestofgonzalez/epub-utils/actions',
		'Changelog': 'https://github.com/ernestofgonzalez/epub-utils/releases',
	},
	license='Apache License, Version 2.0',
	version=VERSION,
	packages=find_packages(),
	# Installs an `epub-utils` console command that invokes `epub_utils.cli:main`.
	entry_points={
		'console_scripts': [
			'epub-utils = epub_utils.cli:main',
		]
	},
	# Runtime dependencies; versions are left unpinned here.
	install_requires=[
		'click',
		'lxml',
		'packaging',
		'pygments',
		'PyYAML',
	],
	# Optional dependency groups, selectable as extras (`test`, `docs`).
	extras_require={
		'test': ['pytest'],
		'docs': [
			'sphinx',
			'sphinx-copybutton',
			'sphinx-issues',
			'furo',
		],
	},
	python_requires='>=3.8',
	classifiers=[
		'Intended Audience :: Developers',
		'Topic :: Software Development :: Libraries',
		'Topic :: Utilities',
		'Programming Language :: Python :: 3.8',
		'Programming Language :: Python :: 3.9',
		'Programming Language :: Python :: 3.10',
		'Programming Language :: Python :: 3.11',
		'Programming Language :: Python :: 3.12',
		'Programming Language :: Python :: 3.13',
		'Operating System :: Microsoft :: Windows',
		'Operating System :: POSIX',
		'Operating System :: Unix',
		'Operating System :: MacOS',
	],
)


================================================
FILE: tests/conftest.py
================================================
import pytest


@pytest.fixture
def doc_path():
	"""
	Path to the sample EPUB used by the test suite.

	The path is anchored to the directory containing this conftest file rather than
	to the current working directory, so it resolves correctly regardless of where
	pytest is invoked from.

	Returns:
		Absolute path to ``assets/roads.epub`` next to this file, as a string.
	"""
	import os

	tests_dir = os.path.dirname(os.path.abspath(__file__))
	return os.path.join(tests_dir, 'assets', 'roads.epub')


================================================
FILE: tests/test_cli.py
================================================
import pytest
from click.testing import CliRunner

from epub_utils import cli


@pytest.mark.parametrize(
	'options',
	(
		['-h'],
		['--help'],
	),
)
def test_help(options):
	"""Both help flags exit with status 0 and print usage text that lists ``-h, --help``."""
	runner = CliRunner()
	outcome = runner.invoke(cli.main, options)
	assert outcome.exit_code == 0
	printed = outcome.output
	assert printed.startswith('Usage: ')
	assert '-h, --help' in printed


@pytest.mark.parametrize(
	'options',
	(
		['-v'],
		['--version'],
	),
)
def test_version(options):
	"""Both version flags exit with status 0 and print exactly ``cli.VERSION``."""
	runner = CliRunner()
	outcome = runner.invoke(cli.main, options)
	assert outcome.exit_code == 0
	printed_version = outcome.output.strip()
	assert printed_version == cli.VERSION


def test_files_command_with_file_path_xhtml_xml(doc_path):
	"""`files <path> --format xml` on an XHTML entry succeeds and prints something."""
	argv = [str(doc_path), 'files', 'GoogleDoc/Roads.xhtml', '--format', 'xml']
	outcome = CliRunner().invoke(cli.main, argv)
	assert outcome.exit_code == 0
	assert len(outcome.output) > 0


def test_files_command_with_file_path_missing_file(doc_path):
	"""`files` with a path that is not in the archive exits with 1 and reports 'Missing'."""
	argv = [str(doc_path), 'files', 'nonexistent/file.xhtml']
	outcome = CliRunner().invoke(cli.main, argv)
	assert outcome.exit_code == 1
	assert 'Missing' in outcome.output


def test_files_command_without_file_path_table(doc_path):
	"""`files --format table` without a path prints a listing with Path and Size headers."""
	argv = [str(doc_path), 'files', '--format', 'table']
	outcome = CliRunner().invoke(cli.main, argv)
	assert outcome.exit_code == 0
	listing = outcome.output
	assert len(listing) > 0
	assert 'Path' in listing
	assert 'Size' in listing


def test_files_command_without_file_path_raw(doc_path):
	"""`files --format raw` without a path prints a listing that includes the XHTML entry."""
	argv = [str(doc_path), 'files', '--format', 'raw']
	outcome = CliRunner().invoke(cli.main, argv)
	assert outcome.exit_code == 0
	listing = outcome.output
	assert len(listing) > 0
	assert 'GoogleDoc/Roads.xhtml' in listing


def test_toc_command_default(doc_path):
	"""`toc` with no flags succeeds and prints non-empty output."""
	argv = [str(doc_path), 'toc']
	outcome = CliRunner().invoke(cli.main, argv)
	assert outcome.exit_code == 0
	assert len(outcome.output) > 0


def test_toc_command_nav_flag(doc_path):
	"""`toc --nav` succeeds and prints non-empty output."""
	argv = [str(doc_path), 'toc', '--nav']
	outcome = CliRunner().invoke(cli.main, argv)
	assert outcome.exit_code == 0
	assert len(outcome.output) > 0


def test_toc_command_mutually_exclusive_flags(doc_path):
	"""Passing both `--ncx` and `--nav` to `toc` exits with 1 and explains the conflict."""
	argv = [str(doc_path), 'toc', '--ncx', '--nav']
	outcome = CliRunner().invoke(cli.main, argv)
	assert outcome.exit_code == 1
	assert '--ncx and --nav flags cannot be used together' in outcome.output


================================================
FILE: tests/test_container.py
================================================
import pytest

from epub_utils.container import Container
from epub_utils.exceptions import InvalidEPUBError

# Minimal container.xml fixture declaring a single rootfile at OEBPS/content.opf.
CONTAINER_XML = """<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
    <rootfiles>
        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
    </rootfiles>
</container>
"""


def test_container_initialization():
	"""
	A Container built from valid container.xml exposes the declared rootfile path.
	"""
	parsed = Container(CONTAINER_XML)
	assert parsed is not None
	assert parsed.rootfile_path == 'OEBPS/content.opf'


def test_invalid_container_xml():
	"""
	Constructing a Container from XML without a rootfile element raises InvalidEPUBError.
	"""
	expected_message = 'Invalid container.xml: Missing rootfile element'
	with pytest.raises(InvalidEPUBError, match=expected_message):
		Container('<invalid></invalid>')


@pytest.mark.parametrize(
	'xml_content,pretty_print,expected',
	[
		# pretty_print=False: the expected output is the input, unchanged.
		(
			'<?xml version="1.0"?>\n<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n    <rootfiles>\n\n        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>\n    </rootfiles>\n</container>',
			False,
			'<?xml version="1.0"?>\n<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n    <rootfiles>\n\n        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>\n    </rootfiles>\n</container>',
		),
		# pretty_print=True: two-space indentation, blank line removed, trailing newline added.
		(
			'<?xml version="1.0"?>\n<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n    <rootfiles>\n\n        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>\n    </rootfiles>\n</container>',
			True,
			'<?xml version="1.0"?>\n<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">\n  <rootfiles>\n    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>\n  </rootfiles>\n</container>\n',
		),
	],
)
def test_container_to_str_pretty_print_parameter(xml_content, pretty_print, expected):
	"""Container.to_str output matches the expected text for each pretty_print setting."""
	rendered = Container(xml_content).to_str(pretty_print=pretty_print)

	assert rendered == expected


================================================
FILE: tests/test_doc.py
================================================
import unittest

from epub_utils.container import Container
from epub_utils.doc import Document
from epub_utils.navigation import EPUBNavDocNavigation, Navigation
from epub_utils.package import Manifest, Package


def test_document_container(doc_path):
	"""
	Document.container is a Container instance for the sample EPUB.
	"""
	container = Document(doc_path).container
	assert isinstance(container, Container)


def test_document_package(doc_path):
	"""
	Document.package is a Package whose manifest lists the two expected items.
	"""
	expected_items = [
		{
			'id': 'toc',
			'href': 'nav.xhtml',
			'media_type': 'application/xhtml+xml',
			'properties': ['nav'],
		},
		{
			'id': 'main',
			'href': 'Roads.xhtml',
			'media_type': 'application/xhtml+xml',
			'properties': [],
		},
	]
	checker = unittest.TestCase()

	doc = Document(doc_path)
	assert isinstance(doc.package, Package)
	assert isinstance(doc.package.manifest, Manifest)
	# Order-insensitive comparison of the manifest entries.
	checker.assertCountEqual(doc.package.manifest.items, expected_items)


def test_document_toc(doc_path):
	"""
	Document.toc is a Navigation instance for the sample EPUB.
	"""
	toc = Document(doc_path).toc
	assert isinstance(toc, Navigation)


def test_document_find_content_by_id(doc_path):
	"""Looking up the manifest id 'main' returns a non-None content object."""
	found = Document(doc_path).find_content_by_id('main')
	assert found is not None


def test_document_get_file_by_path_xhtml(doc_path):
	"""
	get_file_by_path on an XHTML entry returns an object with the content output methods.
	"""
	content = Document(doc_path).get_file_by_path('GoogleDoc/Roads.xhtml')

	# The returned object must offer the three rendering methods.
	for method_name in ('to_str', 'to_xml', 'to_plain'):
		assert hasattr(content, method_name)

	# The rendered text is non-empty and mentions xhtml (case-insensitive).
	rendered = content.to_str()
	assert len(rendered) > 0
	assert 'xhtml' in rendered.lower()


def test_document_get_file_by_path_missing_file(doc_path):
	"""
	Test that the Document class raises an error for missing files.

	The lookup must raise ValueError with a message containing 'Missing'. The test
	fails if no exception is raised or if the message does not match.
	"""
	doc = Document(doc_path)

	# Context manager instead of try/except with `assert False`: it fails on its own
	# when nothing is raised and checks the message with a regex search.
	with unittest.TestCase().assertRaisesRegex(ValueError, 'Missing'):
		doc.get_file_by_path('nonexistent/file.xhtml')


def test_document_nav_property(doc_path):
	"""
	Document.nav yields an EPUBNavDocNavigation instance for the sample EPUB.
	"""
	navigation_doc = Document(doc_path).nav

	assert navigation_doc is not None
	assert isinstance(navigation_doc, EPUBNavDocNavigation)


================================================
FILE: tests/test_manifest.py
================================================
import pytest

from epub_utils.package.manifest import Manifest

# Manifest fixture with four items: a nav document (properties="nav"), a chapter,
# a stylesheet and an image.
VALID_MANIFEST_XML = """
<manifest xmlns="http://www.idpf.org/2007/opf">
    <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
    <item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
    <item id="style" href="style.css" media-type="text/css"/>
    <item id="image1" href="image1.jpg" media-type="image/jpeg"/>
</manifest>
"""

# Manifest fixture with a single item and no `properties` attribute.
MINIMAL_MANIFEST_XML = """
<manifest xmlns="http://www.idpf.org/2007/opf">
    <item id="content" href="content.xhtml" media-type="application/xhtml+xml"/>
</manifest>
"""


def test_manifest_initialization():
	"""Parsing the four-item fixture exposes each item's id, href, media type and properties."""
	items = Manifest(VALID_MANIFEST_XML).items

	assert len(items) == 4

	# First item: the nav document, which carries a `properties` value.
	nav_entry = items[0]
	assert nav_entry['id'] == 'nav'
	assert nav_entry['href'] == 'nav.xhtml'
	assert nav_entry['media_type'] == 'application/xhtml+xml'
	assert nav_entry['properties'] == ['nav']

	# Third item: the stylesheet, which has no `properties` attribute.
	style_entry = items[2]
	assert style_entry['id'] == 'style'
	assert style_entry['href'] == 'style.css'
	assert style_entry['media_type'] == 'text/css'
	assert style_entry['properties'] == []


def test_minimal_manifest():
	"""A one-item manifest parses to a single entry with an empty properties list."""
	items = Manifest(MINIMAL_MANIFEST_XML).items

	assert len(items) == 1
	only_entry = items[0]
	assert only_entry['id'] == 'content'
	assert only_entry['href'] == 'content.xhtml'
	assert only_entry['media_type'] == 'application/xhtml+xml'
	assert only_entry['properties'] == []


def test_find_by_property():
	"""find_by_property('nav') returns the item declared with properties="nav"."""
	match = Manifest(VALID_MANIFEST_XML).find_by_property('nav')
	assert match['id'] == 'nav'
	assert match['href'] == 'nav.xhtml'


def test_find_by_id():
	"""find_by_id('chapter1') returns the chapter item with its href and media type."""
	match = Manifest(VALID_MANIFEST_XML).find_by_id('chapter1')
	assert match['href'] == 'chapter1.xhtml'
	assert match['media_type'] == 'application/xhtml+xml'


def test_find_by_media_type():
	"""find_by_media_type returns exactly the two XHTML items from the fixture."""
	wanted = 'application/xhtml+xml'
	matches = Manifest(VALID_MANIFEST_XML).find_by_media_type(wanted)
	assert len(matches) == 2
	for entry in matches:
		assert entry['media_type'] == 'application/xhtml+xml'


@pytest.mark.parametrize(
	'xml_content,pretty_print,expected',
	[
		# pretty_print=False: the expected output is the input, unchanged.
		(
			'<manifest xmlns="http://www.idpf.org/2007/opf">\n    <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>\n\n    <item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>\n</manifest>',
			False,
			'<manifest xmlns="http://www.idpf.org/2007/opf">\n    <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>\n\n    <item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>\n</manifest>',
		),
		# pretty_print=True: two-space indentation, blank line removed, trailing newline added.
		(
			'<manifest xmlns="http://www.idpf.org/2007/opf">\n    <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>\n\n    <item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>\n</manifest>',
			True,
			'<manifest xmlns="http://www.idpf.org/2007/opf">\n  <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>\n  <item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>\n</manifest>\n',
		),
	],
)
def test_manifest_to_str_pretty_print_parameter(xml_content, pretty_print, expected):
	"""Manifest.to_str output matches the expected text for each pretty_print setting."""
	rendered = Manifest(xml_content).to_str(pretty_print=pretty_print)

	assert rendered == expected


================================================
FILE: tests/test_metadata.py
================================================
import pytest

from epub_utils.exceptions import ValidationError
from epub_utils.package.metadata import Metadata

# Metadata fixture with Dublin Core elements (including two dc:subject entries)
# plus two <meta property="dcterms:..."> entries.
VALID_METADATA_XML = """
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/">
    <dc:title>Test Book</dc:title>
    <dc:creator>Test Author</dc:creator>
    <dc:identifier>test-id-123</dc:identifier>
    <dc:language>en</dc:language>
    <dc:subject>Fiction</dc:subject>
    <dc:subject>Science Fiction</dc:subject>
    <dc:date>2024-01-01</dc:date>
    <dc:publisher>Test Publisher</dc:publisher>
    <meta property="dcterms:modified">2023-11-28T14:50:13Z</meta>
    <meta property="dcterms:source">Original Source</meta>
</metadata>
"""

# Metadata fixture that has a title and creator but no dc:identifier.
INVALID_METADATA_XML = """
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
    <dc:title>Test Book</dc:title>
    <dc:creator>Test Author</dc:creator>
</metadata>
"""


def test_metadata_parse_valid_element():
	"""Every Dublin Core and dcterms value in the valid fixture is exposed as an attribute."""
	parsed = Metadata(VALID_METADATA_XML)

	# Core identification fields.
	assert parsed.title == 'Test Book'
	assert parsed.creator == 'Test Author'
	assert parsed.identifier == 'test-id-123'

	# Remaining dc:* fields; the repeated dc:subject comes back as a list.
	assert parsed.language == 'en'
	assert parsed.subject == ['Fiction', 'Science Fiction']
	assert parsed.date == '2024-01-01'
	assert parsed.publisher == 'Test Publisher'

	# Values supplied through <meta property="dcterms:..."> elements.
	assert parsed.modified == '2023-11-28T14:50:13Z'
	assert parsed.source == 'Original Source'


def test_metadata_validate_missing_identifier_with_raise_exception():
	"""Building and validating metadata that lacks an identifier raises ValidationError."""
	with pytest.raises(ValidationError):
		incomplete = Metadata(INVALID_METADATA_XML)
		incomplete._validate(raise_exception=True)


@pytest.mark.parametrize(
	'xml_content,pretty_print,expected',
	[
		# pretty_print=False: the expected output is the input, unchanged.
		(
			'<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">\n    <dc:title>Test Book</dc:title>\n\n    <dc:creator>Test Author</dc:creator>\n\n    <dc:identifier>test-id-123</dc:identifier>\n</metadata>',
			False,
			'<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">\n    <dc:title>Test Book</dc:title>\n\n    <dc:creator>Test Author</dc:creator>\n\n    <dc:identifier>test-id-123</dc:identifier>\n</metadata>',
		),
		# pretty_print=True: two-space indentation, blank lines removed, trailing newline added.
		(
			'<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">\n    <dc:title>Test Book</dc:title>\n\n    <dc:creator>Test Author</dc:creator>\n\n    <dc:identifier>test-id-123</dc:identifier>\n</metadata>',
			True,
			'<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">\n  <dc:title>Test Book</dc:title>\n  <dc:creator>Test Author</dc:creator>\n  <dc:identifier>test-id-123</dc:identifier>\n</metadata>\n',
		),
	],
)
def test_metadata_to_str_pretty_print_parameter(xml_content, pretty_print, expected):
	"""Metadata.to_str output matches the expected text for each pretty_print setting."""
	rendered = Metadata(xml_content).to_str(pretty_print=pretty_print)

	assert rendered == expected


================================================
FILE: tests/test_nav_navigation.py
================================================
import pytest

from epub_utils.navigation.nav import EPUBNavDocNavigation

# Navigation Document fixture: a single toc <nav> containing one linked entry (id "ch1").
NAV_XML = """<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en">
<head>
    <title>Navigation Document</title>
</head>
<body>
    <nav epub:type="toc" id="toc">
        <h1>Table of Contents</h1>
        <ol>
            <li id="ch1-li">
                <a href="chapter1.xhtml" id="ch1">Chapter 1</a>
            </li>
        </ol>
    </nav>
</body>
</html>"""


def test_nav_doc_navigation_initialization():
	"""Constructor arguments and root-element attributes are exposed on the instance."""
	nav_doc = EPUBNavDocNavigation(NAV_XML, 'application/xhtml+xml', 'nav.xhtml')
	assert nav_doc is not None

	# Values passed to the constructor are stored as given.
	assert nav_doc.xml_content == NAV_XML
	assert nav_doc.media_type == 'application/xhtml+xml'
	assert nav_doc.href == 'nav.xhtml'

	# Values read from the <html> root element of the fixture.
	assert nav_doc.xmlns == 'http://www.w3.org/1999/xhtml'
	assert nav_doc.lang == 'en'


def test_nav_doc_navigation_interface():
	"""Exercise the read accessors and lookup helpers against the single-entry fixture."""
	nav_doc = EPUBNavDocNavigation(NAV_XML, 'application/xhtml+xml', 'nav.xhtml')

	# get_toc_items: exactly one entry with the fixture's values.
	entries = nav_doc.get_toc_items()
	assert len(entries) == 1

	first = entries[0]
	assert first.id == 'ch1'
	assert first.label == 'Chapter 1'
	assert first.target == 'chapter1.xhtml'
	assert first.order == 1
	assert first.level == 0

	# get_page_list: the fixture has no page-list nav.
	pages = nav_doc.get_page_list()
	assert len(pages) == 0

	# get_landmarks: the fixture has no landmarks nav.
	marks = nav_doc.get_landmarks()
	assert len(marks) == 0

	# find_item_by_id: lookup by the anchor's id.
	by_id = nav_doc.find_item_by_id('ch1')
	assert by_id is not None
	assert by_id.label == 'Chapter 1'

	# find_items_by_target: lookup by href.
	by_target = nav_doc.find_items_by_target('chapter1.xhtml')
	assert len(by_target) == 1
	assert by_target[0].id == 'ch1'


def test_nav_doc_navigation_toc_items_as_dicts():
	"""A nested TOC is returned by get_toc_items_as_dicts as nested dictionaries."""
	nav_xml_hierarchical = """<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en">
<head>
    <title>Navigation Document</title>
</head>
<body>
    <nav epub:type="toc" id="toc">
        <h1>Table of Contents</h1>
        <ol>
            <li id="ch1-li">
                <a href="chapter1.xhtml" id="ch1">Chapter 1</a>
                <ol>
                    <li id="ch1-1-li">
                        <a href="chapter1.xhtml#section1" id="ch1-1">Section 1.1</a>
                    </li>
                </ol>
            </li>
            <li id="ch2-li">
                <a href="chapter2.xhtml" id="ch2">Chapter 2</a>
            </li>
        </ol>
    </nav>
</body>
</html>"""

	# Expected shape: the nested entry is at level 1 and its order restarts at 1.
	section_1_1 = {
		'id': 'ch1-1',
		'label': 'Section 1.1',
		'target': 'chapter1.xhtml#section1',
		'order': 1,
		'level': 1,
		'type': None,
		'children': [],
	}
	chapter_1 = {
		'id': 'ch1',
		'label': 'Chapter 1',
		'target': 'chapter1.xhtml',
		'order': 1,
		'level': 0,
		'type': None,
		'children': [section_1_1],
	}
	chapter_2 = {
		'id': 'ch2',
		'label': 'Chapter 2',
		'target': 'chapter2.xhtml',
		'order': 2,
		'level': 0,
		'type': None,
		'children': [],
	}

	nav_doc = EPUBNavDocNavigation(nav_xml_hierarchical, 'application/xhtml+xml', 'nav.xhtml')

	assert nav_doc.get_toc_items_as_dicts() == [chapter_1, chapter_2]


def test_nav_doc_navigation_page_list():
	"""Entries of a page-list nav are returned in document order by get_page_list."""
	nav_xml_with_pages = """<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en">
<head>
    <title>Navigation Document</title>
</head>
<body>
    <nav epub:type="toc" id="toc">
        <h1>Table of Contents</h1>
        <ol>
            <li><a href="chapter1.xhtml" id="ch1">Chapter 1</a></li>
        </ol>
    </nav>
    <nav epub:type="page-list" id="page-list">
        <h1>List of Pages</h1>
        <ol>
            <li><a href="chapter1.xhtml#page1" id="page1">1</a></li>
            <li><a href="chapter1.xhtml#page2" id="page2">2</a></li>
            <li><a href="chapter2.xhtml#page3" id="page3">3</a></li>
        </ol>
    </nav>
</body>
</html>"""

	nav_doc = EPUBNavDocNavigation(nav_xml_with_pages, 'application/xhtml+xml', 'nav.xhtml')

	pages = nav_doc.get_page_list()
	assert len(pages) == 3

	# First page: every field is checked.
	first = pages[0]
	assert first.id == 'page1'
	assert first.label == '1'
	assert first.target == 'chapter1.xhtml#page1'
	assert first.order == 1
	assert first.level == 0
	assert first.item_type in [None, 'page']  # either value is accepted

	# Remaining pages: id, label and target only.
	second = pages[1]
	assert second.id == 'page2'
	assert second.label == '2'
	assert second.target == 'chapter1.xhtml#page2'

	third = pages[2]
	assert third.id == 'page3'
	assert third.label == '3'
	assert third.target == 'chapter2.xhtml#page3'


def test_nav_doc_navigation_landmarks():
	"""Entries of a landmarks nav carry their epub:type as item_type."""
	nav_xml_with_landmarks = """<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en">
<head>
    <title>Navigation Document</title>
</head>
<body>
    <nav epub:type="toc" id="toc">
        <h1>Table of Contents</h1>
        <ol>
            <li><a href="chapter1.xhtml" id="ch1">Chapter 1</a></li>
        </ol>
    </nav>
    <nav epub:type="landmarks" id="landmarks">
        <h1>Landmarks</h1>
        <ol>
            <li><a href="cover.xhtml" epub:type="cover" id="cover">Cover</a></li>
            <li><a href="toc.xhtml" epub:type="toc" id="toc-landmark">Table of Contents</a></li>
            <li><a href="chapter1.xhtml" epub:type="bodymatter" id="start">Start of Content</a></li>
        </ol>
    </nav>
</body>
</html>"""

	nav_doc = EPUBNavDocNavigation(nav_xml_with_landmarks, 'application/xhtml+xml', 'nav.xhtml')

	marks = nav_doc.get_landmarks()
	assert len(marks) == 3

	# Cover landmark.
	cover = marks[0]
	assert cover.id == 'cover'
	assert cover.label == 'Cover'
	assert cover.target == 'cover.xhtml'
	assert cover.item_type == 'cover'

	# Table-of-contents landmark.
	contents = marks[1]
	assert contents.id == 'toc-landmark'
	assert contents.label == 'Table of Contents'
	assert contents.target == 'toc.xhtml'
	assert contents.item_type == 'toc'

	# Start-of-content landmark.
	start = marks[2]
	assert start.id == 'start'
	assert start.label == 'Start of Content'
	assert start.target == 'chapter1.xhtml'
	assert start.item_type == 'bodymatter'


def test_nav_doc_navigation_editing():
	"""Add, update and then remove a TOC entry, checking the TOC after each step."""
	from epub_utils.navigation.base import NavigationItem

	nav_doc = EPUBNavDocNavigation(NAV_XML, 'application/xhtml+xml', 'nav.xhtml')

	# Step 1: add a second chapter.
	chapter_two = NavigationItem(id='ch2', label='Chapter 2', target='chapter2.xhtml', order=2)
	nav_doc.add_toc_item(chapter_two)

	entries = nav_doc.get_toc_items()
	assert len(entries) == 2

	added = nav_doc.find_item_by_id('ch2')
	assert added is not None
	assert added.label == 'Chapter 2'

	# Step 2: change its label and target.
	updated_ok = nav_doc.update_toc_item(
		'ch2', label='Chapter Two Updated', target='chapter2_updated.xhtml'
	)
	assert updated_ok

	changed = nav_doc.find_item_by_id('ch2')
	assert changed.label == 'Chapter Two Updated'
	assert changed.target == 'chapter2_updated.xhtml'

	# Step 3: remove it again.
	removed_ok = nav_doc.remove_toc_item('ch2')
	assert removed_ok

	entries = nav_doc.get_toc_items()
	assert len(entries) == 1
	assert nav_doc.find_item_by_id('ch2') is None


def test_nav_doc_navigation_span_elements():
	"""A <span> heading becomes a TOC entry with an empty target and its nested children."""
	nav_xml_with_spans = """<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en">
<head>
    <title>Navigation Document</title>
</head>
<body>
    <nav epub:type="toc" id="toc">
        <h1>Table of Contents</h1>
        <ol>
            <li id="part1-li">
                <span id="part1">Part 1</span>
                <ol>
                    <li><a href="chapter1.xhtml" id="ch1">Chapter 1</a></li>
                    <li><a href="chapter2.xhtml" id="ch2">Chapter 2</a></li>
                </ol>
            </li>
        </ol>
    </nav>
</body>
</html>"""

	nav_doc = EPUBNavDocNavigation(nav_xml_with_spans, 'application/xhtml+xml', 'nav.xhtml')

	entries = nav_doc.get_toc_items()
	assert len(entries) == 1

	# The span-based heading: no link, so the target is the empty string.
	part = entries[0]
	assert part.id == 'part1'
	assert part.label == 'Part 1'
	assert part.target == ''
	assert len(part.children) == 2

	# Its two linked children, in document order.
	first_child = part.children[0]
	assert first_child.id == 'ch1'
	assert first_child.label == 'Chapter 1'
	assert first_child.target == 'chapter1.xhtml'

	second_child = part.children[1]
	assert second_child.id == 'ch2'
	assert second_child.label == 'Chapter 2'
	assert second_child.target == 'chapter2.xhtml'


def test_nav_doc_navigation_item_types():
	"""The epub:type on each TOC anchor is reported as the entry's item_type."""
	nav_xml_with_types = """<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en">
<head>
    <title>Navigation Document</title>
</head>
<body>
    <nav epub:type="toc" id="toc">
        <h1>Table of Contents</h1>
        <ol>
            <li><a href="preface.xhtml" epub:type="preface" id="preface">Preface</a></li>
            <li><a href="chapter1.xhtml" epub:type="chapter" id="ch1">Chapter 1</a></li>
            <li><a href="appendix.xhtml" epub:type="appendix" id="appendix">Appendix</a></li>
        </ol>
    </nav>
</body>
</html>"""

	nav_doc = EPUBNavDocNavigation(nav_xml_with_types, 'application/xhtml+xml', 'nav.xhtml')

	entries = nav_doc.get_toc_items()
	assert len(entries) == 3

	# Entries come back in document order: preface, chapter, appendix.
	assert entries[0].item_type == 'preface'
	assert entries[1].item_type == 'chapter'
	assert entries[2].item_type == 'appendix'


def test_nav_doc_navigation_invalid_media_type():
	"""Constructing with the NCX media type raises ValueError naming the rejected type."""
	expected_fragment = (
		"Media type 'application/x-dtbncx+xml' is not supported for EPUB Navigation Document"
	)
	with pytest.raises(ValueError) as excinfo:
		EPUBNavDocNavigation(NAV_XML, 'application/x-dtbncx+xml', 'nav.xhtml')
	assert expected_fragment in str(excinfo.value)


def test_nav_doc_navigation_malformed_xml():
	"""
	Test handling of malformed XML.

	The fixture declares the ``epub`` namespace so that the only well-formedness
	errors are the missing closing tags. Without that declaration the document is
	already rejected for an unbound prefix, and the test would pass even if the
	closing tags were present.
	"""
	from epub_utils.exceptions import ParseError

	malformed_xml = """<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
    <title>Navigation Document</title>
</head>
<body>
    <nav epub:type="toc">
        <ol>
            <li><a href="chapter1.xhtml">Chapter 1</a>
        </ol>
    </nav>
</body>
"""  # Missing closing </li> and </html>

	with pytest.raises(ParseError):
		EPUBNavDocNavigation(malformed_xml, 'application/xhtml+xml', 'nav.xhtml')


def test_nav_doc_navigation_output_methods():
	"""Each output method returns text that still contains the chapter label."""
	import re

	nav_doc = EPUBNavDocNavigation(NAV_XML, 'application/xhtml+xml', 'nav.xhtml')

	# __str__ returns the original XML unchanged.
	assert str(nav_doc) == NAV_XML

	# to_str
	as_str = nav_doc.to_str()
	assert isinstance(as_str, str)
	assert 'Chapter 1' in as_str

	# to_xml may include ANSI colour codes, so strip them before searching.
	as_xml = nav_doc.to_xml()
	assert isinstance(as_xml, str)
	without_ansi = re.sub(r'\x1b\[[0-9;]*m', '', as_xml)
	assert 'Chapter 1' in without_ansi

	# to_plain
	as_plain = nav_doc.to_plain()
	assert isinstance(as_plain, str)
	assert 'Chapter 1' in as_plain


def test_nav_doc_navigation_reorder_items():
	"""reorder_toc_items can be called on a three-entry TOC without raising."""
	nav_xml_multiple = """<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en">
<head>
    <title>Navigation Document</title>
</head>
<body>
    <nav epub:type="toc" id="toc">
        <h1>Table of Contents</h1>
        <ol>
            <li><a href="chapter1.xhtml" id="ch1">Chapter 1</a></li>
            <li><a href="chapter2.xhtml" id="ch2">Chapter 2</a></li>
            <li><a href="chapter3.xhtml" id="ch3">Chapter 3</a></li>
        </ol>
    </nav>
</body>
</html>"""

	nav_doc = EPUBNavDocNavigation(nav_xml_multiple, 'application/xhtml+xml', 'nav.xhtml')

	# Before reordering, the entries are in document order.
	ids_before = [entry.id for entry in nav_doc.get_toc_items()]
	assert ids_before == ['ch1', 'ch2', 'ch3']

	new_order = ['ch3', 'ch1', 'ch2']
	nav_doc.reorder_toc_items(new_order)

	# Only successful completion of the call is checked here; the resulting
	# order is not asserted because the reordering implementation may vary.


def test_nav_doc_navigation_empty_document():
	"""A navigation document with an empty body yields empty lists and failed lookups."""
	empty_nav_xml = """<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en">
<head>
    <title>Navigation Document</title>
</head>
<body>
</body>
</html>"""

	nav_doc = EPUBNavDocNavigation(empty_nav_xml, 'application/xhtml+xml', 'nav.xhtml')

	# With no <nav> elements, every accessor returns an empty collection.
	toc_entries = nav_doc.get_toc_items()
	assert len(toc_entries) == 0
	pages = nav_doc.get_page_list()
	assert len(pages) == 0
	marks = nav_doc.get_landmarks()
	assert len(marks) == 0

	# Lookups find nothing.
	assert nav_doc.find_item_by_id('nonexistent') is None
	by_target = nav_doc.find_items_by_target('nonexistent.xhtml')
	assert len(by_target) == 0


================================================
FILE: tests/test_ncx_navigation.py
================================================
from epub_utils.navigation.ncx import NCXNavigation

# NCX fixture: head metadata, a document title and a navMap with one navPoint.
NCX_XML = """<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="en">
    <head>
        <meta name="dtb:uid" content="urn:uuid:12345"/>
        <meta name="dtb:depth" content="1"/>
        <meta name="dtb:totalPageCount" content="0"/>
        <meta name="dtb:maxPageNumber" content="0"/>
    </head>
    <docTitle>
        <text>Sample Book</text>
    </docTitle>
    <navMap>
        <navPoint id="navpoint-1" playOrder="1">
            <navLabel>
                <text>Chapter 1</text>
            </navLabel>
            <content src="chapter1.xhtml"/>
        </navPoint>
    </navMap>
</ncx>"""


def test_ncx_navigation_initialization():
	"""Constructor arguments and root-element attributes are exposed on the instance."""
	ncx_doc = NCXNavigation(NCX_XML, 'application/x-dtbncx+xml', 'toc.ncx')
	assert ncx_doc is not None

	# Values passed to the constructor are stored as given.
	assert ncx_doc.xml_content == NCX_XML
	assert ncx_doc.media_type == 'application/x-dtbncx+xml'
	assert ncx_doc.href == 'toc.ncx'

	# Values read from the <ncx> root element of the fixture.
	assert ncx_doc.xmlns == 'http://www.daisy.org/z3986/2005/ncx/'
	assert ncx_doc.version == '2005-1'
	assert ncx_doc.lang == 'en'


def test_ncx_navigation_interface():
	"""Exercise the read accessors and lookup helpers against the single-navPoint fixture."""
	ncx_doc = NCXNavigation(NCX_XML, 'application/x-dtbncx+xml', 'toc.ncx')

	# get_toc_items: exactly one entry with the fixture's values.
	entries = ncx_doc.get_toc_items()
	assert len(entries) == 1

	first = entries[0]
	assert first.id == 'navpoint-1'
	assert first.label == 'Chapter 1'
	assert first.target == 'chapter1.xhtml'
	assert first.order == 1
	assert first.level == 0

	# get_page_list: empty for this fixture.
	pages = ncx_doc.get_page_list()
	assert len(pages) == 0

	# get_landmarks: empty for this fixture.
	marks = ncx_doc.get_landmarks()
	assert len(marks) == 0

	# find_item_by_id: lookup by navPoint id.
	by_id = ncx_doc.find_item_by_id('navpoint-1')
	assert by_id is not None
	assert by_id.label == 'Chapter 1'

	# find_items_by_target: lookup by content src.
	by_target = ncx_doc.find_items_by_target('chapter1.xhtml')
	assert len(by_target) == 1
	assert by_target[0].id == 'navpoint-1'


def test_ncx_navigation_hierarchy():
	"""Nested navPoints are reported as nested dicts with increasing levels."""
	ncx_xml_hierarchical = """<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="en">
    <head>
        <meta name="dtb:uid" content="urn:uuid:12345"/>
        <meta name="dtb:depth" content="2"/>
        <meta name="dtb:totalPageCount" content="0"/>
        <meta name="dtb:maxPageNumber" content="0"/>
    </head>
    <docTitle>
        <text>Sample Book</text>
    </docTitle>
    <navMap>
        <navPoint id="ch1" playOrder="1">
            <navLabel>
                <text>Chapter 1</text>
            </navLabel>
            <content src="chapter1.xhtml"/>
            <navPoint id="ch1-1" playOrder="2">
                <navLabel>
                    <text>Section 1.1</text>
                </navLabel>
                <content src="chapter1.xhtml#section1"/>
            </navPoint>
        </navPoint>
        <navPoint id="ch2" playOrder="3">
            <navLabel>
                <text>Chapter 2</text>
            </navLabel>
            <content src="chapter2.xhtml"/>
        </navPoint>
    </navMap>
</ncx>"""

	navigation = NCXNavigation(ncx_xml_hierarchical, 'application/x-dtbncx+xml', 'toc.ncx')
	actual_tree = navigation.get_toc_items_as_dicts()

	# Expected tree, built bottom-up: the section nested under chapter 1 first.
	section_1_1 = {
		'id': 'ch1-1',
		'label': 'Section 1.1',
		'target': 'chapter1.xhtml#section1',
		'order': 2,
		'level': 1,
		'type': None,
		'children': [],
	}
	chapter_1 = {
		'id': 'ch1',
		'label': 'Chapter 1',
		'target': 'chapter1.xhtml',
		'order': 1,
		'level': 0,
		'type': None,
		'children': [section_1_1],
	}
	chapter_2 = {
		'id': 'ch2',
		'label': 'Chapter 2',
		'target': 'chapter2.xhtml',
		'order': 3,
		'level': 0,
		'type': None,
		'children': [],
	}

	assert actual_tree == [chapter_1, chapter_2]


def test_ncx_navigation_editing():
	"""Add, update, and then remove a TOC entry through the editing methods."""
	from epub_utils.navigation.base import NavigationItem

	navigation = NCXNavigation(NCX_XML, 'application/x-dtbncx+xml', 'toc.ncx')

	# --- add -----------------------------------------------------------
	chapter_two = NavigationItem(id='ch2', label='Chapter 2', target='chapter2.xhtml', order=2)
	navigation.add_toc_item(chapter_two)

	# The fixture started with one entry, so there are now two.
	assert len(navigation.get_toc_items()) == 2

	added = navigation.find_item_by_id('ch2')
	assert added is not None
	assert added.label == 'Chapter 2'

	# --- update --------------------------------------------------------
	changes = {'label': 'Chapter Two Updated', 'target': 'chapter2_updated.xhtml'}
	was_updated = navigation.update_toc_item('ch2', **changes)
	assert was_updated

	refreshed = navigation.find_item_by_id('ch2')
	assert refreshed.label == 'Chapter Two Updated'
	assert refreshed.target == 'chapter2_updated.xhtml'

	# --- remove --------------------------------------------------------
	was_removed = navigation.remove_toc_item('ch2')
	assert was_removed

	# Back to the single original entry; the removed id no longer resolves.
	assert len(navigation.get_toc_items()) == 1
	assert navigation.find_item_by_id('ch2') is None


================================================
FILE: tests/test_package.py
================================================
import pytest

from epub_utils.exceptions import InvalidEPUBError, UnsupportedFormatError
from epub_utils.package import Package

# EPUB 3 package document (version="3.0") with title, creator and identifier
# metadata, a manifest whose single item carries properties="nav", and a
# spine that references that item.
VALID_OPF_XML = """<?xml version="1.0"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0">
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
        <dc:title>Sample EPUB</dc:title>
        <dc:creator>John Doe</dc:creator>
        <dc:identifier>12345</dc:identifier>
    </metadata>
    <manifest>
        <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
    </manifest>
	<spine>
		<itemref idref="nav" />
	</spine>
</package>
"""

# Package element with no children at all; used to provoke the
# "missing required metadata element" error.
INVALID_OPF_XML_MISSING_METADATA = """<?xml version="1.0"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0">
</package>
"""

# EPUB 3 package document whose manifest has no item with properties="nav".
VALID_EPUB3_XML_WITHOUT_TOC = """<?xml version="1.0"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0">
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
        <dc:title>Sample EPUB</dc:title>
    </metadata>
	<manifest>
        <item id="roads" href="roads.xhtml" media-type="application/xhtml+xml"/>
    </manifest>
	<spine>
		<itemref idref="roads" />
	</spine>
</package>
"""

# EPUB 2 package document (version="2.0"): the spine's toc="ncx" attribute
# points at the manifest item whose href is toc.ncx.
VALID_EPUB2_XML = """<?xml version="1.0"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0">
	<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
		<dc:title>Sample EPUB</dc:title>
	</metadata>
	<manifest>
		<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
		<item id="roads" href="roads.xhtml" media-type="application/xhtml+xml"/>
	</manifest>
	<spine toc="ncx">
		<itemref idref="roads" />
	</spine>
</package>
"""

# EPUB 2 package document with neither an NCX manifest item nor a toc
# attribute on the spine.
VALID_EPUB2_XML_WITHOUT_TOC = """<?xml version="1.0"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0">
	<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
		<dc:title>Sample EPUB</dc:title>
	</metadata>
	<manifest>
		<item id="roads" href="roads.xhtml" media-type="application/xhtml+xml"/>
	</manifest>
	<spine>
		<itemref idref="roads" />
	</spine>
</package>
"""

# Same structure as VALID_EPUB2_XML but declaring version="1.0".
# NOTE(review): "OEPBS" in the name looks like a transposition of "OEBPS";
# the name is kept because test_epub1 references it.
VALID_OEPBS1_XML_WITH_TOC = """<?xml version="1.0"?>
<package xmlns="http://www.idpf.org/2007/opf" version="1.0">
	<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
		<dc:title>Sample EPUB</dc:title>
	</metadata>
	<manifest>
		<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
		<item id="roads" href="roads.xhtml" media-type="application/xhtml+xml"/>
	</manifest>
	<spine toc="ncx">
		<itemref idref="roads" />
	</spine>
</package>
"""

# Package document declaring version="4.0", which test_invalid_version
# expects to be rejected as unsupported.
INVALID_VERSION = """<?xml version="1.0"?>
<package xmlns="http://www.idpf.org/2007/opf" version="4.0">
	<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" />
</package>
"""


def test_package_initialization():
	"""
	A Package built from valid OPF XML exposes the title, creator and
	identifier declared in its metadata element.
	"""
	pkg = Package(VALID_OPF_XML)

	observed = (pkg.metadata.title, pkg.metadata.creator, pkg.metadata.identifier)
	assert observed == ('Sample EPUB', 'John Doe', '12345')


def test_package_invalid_xml():
	"""A package document without a metadata element raises InvalidEPUBError."""
	with pytest.raises(InvalidEPUBError) as raised:
		Package(INVALID_OPF_XML_MISSING_METADATA)

	message = str(raised.value)
	assert 'OPF file missing required metadata element' in message


def test_epub3():
	"""A 3.0 package reports its version and the href of its nav manifest item."""
	pkg = Package(VALID_OPF_XML)

	assert (pkg.version.public, pkg.version.major) == ('3.0', 3)
	assert pkg.nav_href == 'nav.xhtml'


def test_epub3_without_toc():
	"""A 3.0 package with no nav manifest item yields a falsy nav_href."""
	pkg = Package(VALID_EPUB3_XML_WITHOUT_TOC)

	assert (pkg.version.public, pkg.version.major) == ('3.0', 3)
	assert not pkg.nav_href


def test_epub2():
	"""A 2.0 package resolves toc_href from the spine's toc attribute."""
	pkg = Package(VALID_EPUB2_XML)

	assert (pkg.version.public, pkg.version.major) == ('2.0', 2)
	assert pkg.toc_href == 'toc.ncx'


def test_epub2_without_toc():
	"""A 2.0 package whose spine has no toc attribute yields a falsy toc_href."""
	pkg = Package(VALID_EPUB2_XML_WITHOUT_TOC)

	assert (pkg.version.public, pkg.version.major) == ('2.0', 2)
	assert not pkg.toc_href


def test_epub1():
	"""A 1.0 package with an NCX referenced by the spine resolves toc_href."""
	pkg = Package(VALID_OEPBS1_XML_WITH_TOC)

	assert (pkg.version.public, pkg.version.major) == ('1.0', 1)
	assert pkg.toc_href == 'toc.ncx'


def test_invalid_version():
	"""A package declaring version 4.0 is rejected with UnsupportedFormatError.

	The constructor is called only for the exception it raises, so its result
	is deliberately not bound to a name (the previous unused local triggered
	an "assigned but never used" lint warning).
	"""
	with pytest.raises(UnsupportedFormatError) as excinfo:
		Package(INVALID_VERSION)
	assert 'EPUB version 4.x is not supported (EPUB 4.0 format)' in str(excinfo.value)


# Package document with irregular blank lines and no newline before the
# closing tag; to_str(pretty_print=False) is expected to return it unchanged.
_RAW_PACKAGE_XML = (
	'<?xml version="1.0"?>\n'
	'<package xmlns="http://www.idpf.org/2007/opf" version="3.0">\n'
	'\n'
	'    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">\n'
	'\n'
	'        <dc:title>Sample EPUB</dc:title>\n'
	'    </metadata>\n'
	'\n'
	'    <manifest>\n'
	'        <item id="roads" href="roads.xhtml" media-type="application/xhtml+xml"/>\n'
	'    </manifest>\n'
	'\n'
	'    <spine>\n'
	'        <itemref idref="roads"/>\n'
	'    </spine></package>'
)

# The same document as expected from to_str(pretty_print=True): two-space
# indentation, no blank lines, trailing newline.
_PRETTY_PACKAGE_XML = (
	'<?xml version="1.0"?>\n'
	'<package xmlns="http://www.idpf.org/2007/opf" version="3.0">\n'
	'  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">\n'
	'    <dc:title>Sample EPUB</dc:title>\n'
	'  </metadata>\n'
	'  <manifest>\n'
	'    <item id="roads" href="roads.xhtml" media-type="application/xhtml+xml"/>\n'
	'  </manifest>\n'
	'  <spine>\n'
	'    <itemref idref="roads"/>\n'
	'  </spine>\n'
	'</package>\n'
)


@pytest.mark.parametrize(
	'xml_content,pretty_print,expected',
	[
		(_RAW_PACKAGE_XML, False, _RAW_PACKAGE_XML),
		(_RAW_PACKAGE_XML, True, _PRETTY_PACKAGE_XML),
	],
)
def test_package_to_str_pretty_print_parameter(xml_content, pretty_print, expected):
	"""Package.to_str returns the expected text with and without pretty printing."""
	rendered = Package(xml_content).to_str(pretty_print=pretty_print)

	assert rendered == expected


================================================
FILE: tests/test_spine.py
================================================
import pytest

from epub_utils.package.spine import Spine

# Spine with a toc reference, an explicit page-progression-direction, and four
# itemrefs covering linear="no", linear="yes", a properties value, and an
# itemref with only an idref.
VALID_SPINE_XML = """
<spine xmlns="http://www.idpf.org/2007/opf" toc="ncx" page-progression-direction="ltr">
    <itemref idref="cover" linear="no"/>
    <itemref idref="nav" linear="yes"/>
    <itemref idref="chapter1" properties="page-spread-left"/>
    <itemref idref="chapter2"/>
</spine>
"""

# Spine with no optional attributes and a single itemref; used to check the
# values reported when toc, direction, linear and properties are all omitted.
MINIMAL_SPINE_XML = """
<spine xmlns="http://www.idpf.org/2007/opf">
    <itemref idref="content"/>
</spine>
"""


def test_spine_initialization():
	"""Spine parses its own attributes and every itemref of the full fixture.

	The ``linear`` flags are checked with ``is`` rather than ``==`` so the
	assertions require real booleans instead of any value that merely
	compares equal to ``True``/``False`` (such as ``1`` or ``0``).
	"""
	spine = Spine(VALID_SPINE_XML)

	assert spine.toc == 'ncx'
	assert spine.page_progression_direction == 'ltr'
	assert len(spine.itemrefs) == 4

	# Test first itemref (cover): declared linear="no", no properties attribute
	assert spine.itemrefs[0]['idref'] == 'cover'
	assert spine.itemrefs[0]['linear'] is False
	assert spine.itemrefs[0]['properties'] == []

	# Test third itemref (chapter1): linear omitted, one property declared
	assert spine.itemrefs[2]['idref'] == 'chapter1'
	assert spine.itemrefs[2]['linear'] is True
	assert spine.itemrefs[2]['properties'] == ['page-spread-left']


def test_minimal_spine():
	"""Spine reports the expected values when all optional attributes are omitted.

	``linear`` is checked with ``is True`` rather than ``== True`` so the
	assertion requires a real boolean.
	"""
	spine = Spine(MINIMAL_SPINE_XML)

	# No toc attribute and no page-progression-direction on the fixture.
	assert spine.toc is None
	assert spine.page_progression_direction == 'default'

	# The single itemref declares only an idref.
	assert len(spine.itemrefs) == 1
	assert spine.itemrefs[0]['idref'] == 'content'
	assert spine.itemrefs[0]['linear'] is True
	assert spine.itemrefs[0]['properties'] == []


# Spine markup with blank lines between itemrefs; to_str(pretty_print=False)
# is expected to return it unchanged.
_RAW_SPINE_XML = (
	'<spine xmlns="http://www.idpf.org/2007/opf" toc="ncx">\n'
	'\n'
	'    <itemref idref="cover" linear="no"/>\n'
	'\n'
	'    <itemref idref="chapter1"/>\n'
	'</spine>'
)

# The same markup as expected from to_str(pretty_print=True): two-space
# indentation, no blank lines, trailing newline.
_PRETTY_SPINE_XML = (
	'<spine xmlns="http://www.idpf.org/2007/opf" toc="ncx">\n'
	'  <itemref idref="cover" linear="no"/>\n'
	'  <itemref idref="chapter1"/>\n'
	'</spine>\n'
)


@pytest.mark.parametrize(
	'xml_content,pretty_print,expected',
	[
		(_RAW_SPINE_XML, False, _RAW_SPINE_XML),
		(_RAW_SPINE_XML, True, _PRETTY_SPINE_XML),
	],
)
def test_spine_to_str_pretty_print_parameter(xml_content, pretty_print, expected):
	"""Spine.to_str returns the expected text with and without pretty printing."""
	rendered = Spine(xml_content).to_str(pretty_print=pretty_print)

	assert rendered == expected


================================================
FILE: tests/test_xhtml_content.py
================================================
import pytest

from epub_utils.content.xhtml import XHTMLContent


def test_simple_paragraph():
	"""inner_text of a document with one paragraph is that paragraph's text."""
	document = """<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
    <body>
        <p>This is a simple paragraph.</p>

    </body>
</html>"""

	parsed = XHTMLContent(document, 'application/xhtml+xml', 'test.xhtml')
	extracted = parsed.inner_text

	# Surrounding whitespace and the blank line in <body> must not leak through.
	assert extracted == 'This is a simple paragraph.'


# Optional document prolog pieces that may precede the <html> element.
_XML_DECLARATION = '<?xml version="1.0" encoding="UTF-8"?>\n'
_XHTML_DOCTYPE = (
	'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
	'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
)

# <html> element as supplied: four-space indentation, a blank line inside
# <body>, no trailing newline.
_RAW_HTML = (
	'<html xmlns="http://www.w3.org/1999/xhtml">\n'
	'    <body>\n'
	'        <p>This is a simple paragraph.</p>\n'
	'\n'
	'    </body>\n'
	'</html>'
)

# <html> element as expected from pretty printing: two-space indentation,
# no blank line, trailing newline.
_PRETTY_HTML = (
	'<html xmlns="http://www.w3.org/1999/xhtml">\n'
	'  <body>\n'
	'    <p>This is a simple paragraph.</p>\n'
	'  </body>\n'
	'</html>\n'
)

# Six cases: each prolog variant (declaration + doctype, declaration only,
# none) is rendered first without and then with pretty printing. The prolog
# is expected to be carried over verbatim in both modes.
_TO_STR_CASES = [
	(prolog + _RAW_HTML, pretty, prolog + (_PRETTY_HTML if pretty else _RAW_HTML))
	for prolog in (_XML_DECLARATION + _XHTML_DOCTYPE, _XML_DECLARATION, '')
	for pretty in (False, True)
]


@pytest.mark.parametrize('xml_content,pretty_print,expected', _TO_STR_CASES)
def test_to_str_pretty_print_parameter(xml_content, pretty_print, expected):
	"""XHTMLContent.to_str returns the expected text with and without pretty printing."""
	document = XHTMLContent(xml_content, 'application/xhtml+xml', 'test.xhtml')
	rendered = document.to_str(pretty_print=pretty_print)

	assert rendered == expected