Full Code of run-llama/liteparse for AI

main 0f56290816b9 cached
322 files
4.0 MB
1.0M tokens
5040 symbols
1 requests
Download .txt
Showing preview only (4,188K chars total). Download the full file or copy to clipboard to get everything.
Repository: run-llama/liteparse
Branch: main
Commit: 0f56290816b9
Files: 322
Total size: 4.0 MB

Directory structure:
gitextract_a5not0fy/

├── .changeset/
│   ├── README.md
│   └── config.json
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.yml
│   │   ├── config.yml
│   │   ├── feature_request.yml
│   │   └── parsing_issue.yml
│   └── workflows/
│       ├── ci.yml
│       ├── e2e-output.yml
│       ├── homebrew_release.yml
│       ├── ocr_servers.yml
│       ├── release.yml
│       └── sync-docs.yml
├── .gitignore
├── .prettierignore
├── .prettierrc
├── AGENTS.md
├── CHANGELOG.md
├── CLAUDE.md
├── CONTRIBUTING.md
├── LICENSE
├── OCR_API_SPEC.md
├── README.md
├── SECURITY.md
├── cli/
│   ├── README.md
│   └── parse.ts
├── dataset_eval_utils/
│   ├── README.md
│   ├── pyproject.toml
│   └── src/
│       └── liteparse_eval/
│           ├── __init__.py
│           ├── benchmark.py
│           ├── evaluation.py
│           ├── processing.py
│           ├── providers/
│           │   ├── __init__.py
│           │   ├── llm/
│           │   │   ├── __init__.py
│           │   │   ├── anthropic.py
│           │   │   └── base.py
│           │   └── parsers/
│           │       ├── __init__.py
│           │       ├── base.py
│           │       ├── liteparse.py
│           │       ├── markitdown.py
│           │       ├── pymupdf.py
│           │       └── pypdf.py
│           └── report.py
├── docs/
│   └── src/
│       └── content/
│           └── docs/
│               └── liteparse/
│                   ├── _meta.yml
│                   ├── cli-reference.md
│                   ├── getting_started.md
│                   ├── guides/
│                   │   ├── _meta.yml
│                   │   ├── agent-skill.md
│                   │   ├── library-usage.md
│                   │   ├── multi-format.md
│                   │   ├── ocr.md
│                   │   ├── parsing-urls.md
│                   │   └── visual-citations.md
│                   └── index.md
├── docs.config.mjs
├── eslint.config.js
├── ocr/
│   ├── README.md
│   ├── easyocr/
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   ├── server.py
│   │   └── test_server.py
│   └── paddleocr/
│       ├── Dockerfile
│       ├── README.md
│       ├── pyproject.toml
│       ├── server.py
│       └── test_server.py
├── package.json
├── packages/
│   └── python/
│       ├── README.md
│       ├── liteparse/
│       │   ├── __init__.py
│       │   ├── parser.py
│       │   ├── py.typed
│       │   └── types.py
│       ├── pyproject.toml
│       └── tests/
│           ├── __init__.py
│           ├── conftest.py
│           ├── test_batch_e2e.py
│           ├── test_parse_e2e.py
│           └── test_screenshot_e2e.py
├── scripts/
│   ├── compare-dataset.ts
│   ├── compare-outputs.sh
│   ├── create-dataset.ts
│   ├── generate-api-docs.sh
│   ├── publish-to-homebrew-repo.sh
│   ├── sync-docs-to-developer-hub.sh
│   └── upload-dataset.ts
├── src/
│   ├── conversion/
│   │   ├── README.md
│   │   ├── convertToPdf.test.ts
│   │   └── convertToPdf.ts
│   ├── core/
│   │   ├── README.md
│   │   ├── config.test.ts
│   │   ├── config.ts
│   │   ├── parser.test.ts
│   │   ├── parser.ts
│   │   └── types.ts
│   ├── engines/
│   │   ├── README.md
│   │   ├── ocr/
│   │   │   ├── README.md
│   │   │   ├── http-simple.test.ts
│   │   │   ├── http-simple.ts
│   │   │   ├── interface.ts
│   │   │   ├── tesseract.test.ts
│   │   │   └── tesseract.ts
│   │   └── pdf/
│   │       ├── README.md
│   │       ├── interface.ts
│   │       ├── pdfium-renderer.test.ts
│   │       ├── pdfium-renderer.ts
│   │       ├── pdfjs.test.ts
│   │       ├── pdfjs.ts
│   │       └── pdfjsImporter.ts
│   ├── index.ts
│   ├── lib.ts
│   ├── output/
│   │   ├── README.md
│   │   ├── json.test.ts
│   │   ├── json.ts
│   │   ├── text.test.ts
│   │   └── text.ts
│   ├── processing/
│   │   ├── README.md
│   │   ├── bbox.test.ts
│   │   ├── bbox.ts
│   │   ├── cleanText.test.ts
│   │   ├── cleanText.ts
│   │   ├── grid.ts
│   │   ├── gridDebugLogger.ts
│   │   ├── gridProjection.test.ts
│   │   ├── gridProjection.ts
│   │   ├── gridVisualizer.ts
│   │   ├── markupUtils.test.ts
│   │   ├── markupUtils.ts
│   │   ├── ocrUtils.ts
│   │   ├── octUtils.test.ts
│   │   ├── searchItems.test.ts
│   │   ├── searchItems.ts
│   │   ├── textUtils.test.ts
│   │   └── textUtils.ts
│   └── vendor/
│       └── pdfjs/
│           ├── LICENSE
│           ├── README.md
│           ├── cmaps/
│           │   ├── 78-EUC-H.bcmap
│           │   ├── 78-EUC-V.bcmap
│           │   ├── 78-H.bcmap
│           │   ├── 78-RKSJ-H.bcmap
│           │   ├── 78-RKSJ-V.bcmap
│           │   ├── 78-V.bcmap
│           │   ├── 78ms-RKSJ-H.bcmap
│           │   ├── 78ms-RKSJ-V.bcmap
│           │   ├── 83pv-RKSJ-H.bcmap
│           │   ├── 90ms-RKSJ-H.bcmap
│           │   ├── 90ms-RKSJ-V.bcmap
│           │   ├── 90msp-RKSJ-H.bcmap
│           │   ├── 90msp-RKSJ-V.bcmap
│           │   ├── 90pv-RKSJ-H.bcmap
│           │   ├── 90pv-RKSJ-V.bcmap
│           │   ├── Add-H.bcmap
│           │   ├── Add-RKSJ-H.bcmap
│           │   ├── Add-RKSJ-V.bcmap
│           │   ├── Add-V.bcmap
│           │   ├── Adobe-CNS1-0.bcmap
│           │   ├── Adobe-CNS1-1.bcmap
│           │   ├── Adobe-CNS1-2.bcmap
│           │   ├── Adobe-CNS1-3.bcmap
│           │   ├── Adobe-CNS1-4.bcmap
│           │   ├── Adobe-CNS1-5.bcmap
│           │   ├── Adobe-CNS1-6.bcmap
│           │   ├── Adobe-CNS1-UCS2.bcmap
│           │   ├── Adobe-GB1-0.bcmap
│           │   ├── Adobe-GB1-1.bcmap
│           │   ├── Adobe-GB1-2.bcmap
│           │   ├── Adobe-GB1-3.bcmap
│           │   ├── Adobe-GB1-4.bcmap
│           │   ├── Adobe-GB1-5.bcmap
│           │   ├── Adobe-GB1-UCS2.bcmap
│           │   ├── Adobe-Japan1-0.bcmap
│           │   ├── Adobe-Japan1-1.bcmap
│           │   ├── Adobe-Japan1-2.bcmap
│           │   ├── Adobe-Japan1-3.bcmap
│           │   ├── Adobe-Japan1-4.bcmap
│           │   ├── Adobe-Japan1-5.bcmap
│           │   ├── Adobe-Japan1-6.bcmap
│           │   ├── Adobe-Japan1-UCS2.bcmap
│           │   ├── Adobe-Korea1-0.bcmap
│           │   ├── Adobe-Korea1-1.bcmap
│           │   ├── Adobe-Korea1-2.bcmap
│           │   ├── Adobe-Korea1-UCS2.bcmap
│           │   ├── B5-H.bcmap
│           │   ├── B5-V.bcmap
│           │   ├── B5pc-H.bcmap
│           │   ├── B5pc-V.bcmap
│           │   ├── CNS-EUC-H.bcmap
│           │   ├── CNS-EUC-V.bcmap
│           │   ├── CNS1-H.bcmap
│           │   ├── CNS1-V.bcmap
│           │   ├── CNS2-H.bcmap
│           │   ├── CNS2-V.bcmap
│           │   ├── ETHK-B5-H.bcmap
│           │   ├── ETHK-B5-V.bcmap
│           │   ├── ETen-B5-H.bcmap
│           │   ├── ETen-B5-V.bcmap
│           │   ├── ETenms-B5-H.bcmap
│           │   ├── ETenms-B5-V.bcmap
│           │   ├── EUC-H.bcmap
│           │   ├── EUC-V.bcmap
│           │   ├── Ext-H.bcmap
│           │   ├── Ext-RKSJ-H.bcmap
│           │   ├── Ext-RKSJ-V.bcmap
│           │   ├── Ext-V.bcmap
│           │   ├── GB-EUC-H.bcmap
│           │   ├── GB-EUC-V.bcmap
│           │   ├── GB-H.bcmap
│           │   ├── GB-V.bcmap
│           │   ├── GBK-EUC-H.bcmap
│           │   ├── GBK-EUC-V.bcmap
│           │   ├── GBK2K-H.bcmap
│           │   ├── GBK2K-V.bcmap
│           │   ├── GBKp-EUC-H.bcmap
│           │   ├── GBKp-EUC-V.bcmap
│           │   ├── GBT-EUC-H.bcmap
│           │   ├── GBT-EUC-V.bcmap
│           │   ├── GBT-H.bcmap
│           │   ├── GBT-V.bcmap
│           │   ├── GBTpc-EUC-H.bcmap
│           │   ├── GBTpc-EUC-V.bcmap
│           │   ├── GBpc-EUC-H.bcmap
│           │   ├── GBpc-EUC-V.bcmap
│           │   ├── H.bcmap
│           │   ├── HKdla-B5-H.bcmap
│           │   ├── HKdla-B5-V.bcmap
│           │   ├── HKdlb-B5-H.bcmap
│           │   ├── HKdlb-B5-V.bcmap
│           │   ├── HKgccs-B5-H.bcmap
│           │   ├── HKgccs-B5-V.bcmap
│           │   ├── HKm314-B5-H.bcmap
│           │   ├── HKm314-B5-V.bcmap
│           │   ├── HKm471-B5-H.bcmap
│           │   ├── HKm471-B5-V.bcmap
│           │   ├── HKscs-B5-H.bcmap
│           │   ├── HKscs-B5-V.bcmap
│           │   ├── Hankaku.bcmap
│           │   ├── Hiragana.bcmap
│           │   ├── KSC-EUC-H.bcmap
│           │   ├── KSC-EUC-V.bcmap
│           │   ├── KSC-H.bcmap
│           │   ├── KSC-Johab-H.bcmap
│           │   ├── KSC-Johab-V.bcmap
│           │   ├── KSC-V.bcmap
│           │   ├── KSCms-UHC-H.bcmap
│           │   ├── KSCms-UHC-HW-H.bcmap
│           │   ├── KSCms-UHC-HW-V.bcmap
│           │   ├── KSCms-UHC-V.bcmap
│           │   ├── KSCpc-EUC-H.bcmap
│           │   ├── KSCpc-EUC-V.bcmap
│           │   ├── Katakana.bcmap
│           │   ├── LICENSE
│           │   ├── NWP-H.bcmap
│           │   ├── NWP-V.bcmap
│           │   ├── RKSJ-H.bcmap
│           │   ├── RKSJ-V.bcmap
│           │   ├── Roman.bcmap
│           │   ├── UniCNS-UCS2-H.bcmap
│           │   ├── UniCNS-UCS2-V.bcmap
│           │   ├── UniCNS-UTF16-H.bcmap
│           │   ├── UniCNS-UTF16-V.bcmap
│           │   ├── UniCNS-UTF32-H.bcmap
│           │   ├── UniCNS-UTF32-V.bcmap
│           │   ├── UniCNS-UTF8-H.bcmap
│           │   ├── UniCNS-UTF8-V.bcmap
│           │   ├── UniGB-UCS2-H.bcmap
│           │   ├── UniGB-UCS2-V.bcmap
│           │   ├── UniGB-UTF16-H.bcmap
│           │   ├── UniGB-UTF16-V.bcmap
│           │   ├── UniGB-UTF32-H.bcmap
│           │   ├── UniGB-UTF32-V.bcmap
│           │   ├── UniGB-UTF8-H.bcmap
│           │   ├── UniGB-UTF8-V.bcmap
│           │   ├── UniJIS-UCS2-H.bcmap
│           │   ├── UniJIS-UCS2-HW-H.bcmap
│           │   ├── UniJIS-UCS2-HW-V.bcmap
│           │   ├── UniJIS-UCS2-V.bcmap
│           │   ├── UniJIS-UTF16-H.bcmap
│           │   ├── UniJIS-UTF16-V.bcmap
│           │   ├── UniJIS-UTF32-H.bcmap
│           │   ├── UniJIS-UTF32-V.bcmap
│           │   ├── UniJIS-UTF8-H.bcmap
│           │   ├── UniJIS-UTF8-V.bcmap
│           │   ├── UniJIS2004-UTF16-H.bcmap
│           │   ├── UniJIS2004-UTF16-V.bcmap
│           │   ├── UniJIS2004-UTF32-H.bcmap
│           │   ├── UniJIS2004-UTF32-V.bcmap
│           │   ├── UniJIS2004-UTF8-H.bcmap
│           │   ├── UniJIS2004-UTF8-V.bcmap
│           │   ├── UniJISPro-UCS2-HW-V.bcmap
│           │   ├── UniJISPro-UCS2-V.bcmap
│           │   ├── UniJISPro-UTF8-V.bcmap
│           │   ├── UniJISX0213-UTF32-H.bcmap
│           │   ├── UniJISX0213-UTF32-V.bcmap
│           │   ├── UniJISX02132004-UTF32-H.bcmap
│           │   ├── UniJISX02132004-UTF32-V.bcmap
│           │   ├── UniKS-UCS2-H.bcmap
│           │   ├── UniKS-UCS2-V.bcmap
│           │   ├── UniKS-UTF16-H.bcmap
│           │   ├── UniKS-UTF16-V.bcmap
│           │   ├── UniKS-UTF32-H.bcmap
│           │   ├── UniKS-UTF32-V.bcmap
│           │   ├── UniKS-UTF8-H.bcmap
│           │   ├── UniKS-UTF8-V.bcmap
│           │   ├── V.bcmap
│           │   └── WP-Symbol.bcmap
│           ├── pdf.mjs
│           ├── pdf.sandbox.mjs
│           ├── pdf.worker.mjs
│           └── standard_fonts/
│               ├── FoxitDingbats.pfb
│               ├── FoxitFixed.pfb
│               ├── FoxitFixedBold.pfb
│               ├── FoxitFixedBoldItalic.pfb
│               ├── FoxitFixedItalic.pfb
│               ├── FoxitSerif.pfb
│               ├── FoxitSerifBold.pfb
│               ├── FoxitSerifBoldItalic.pfb
│               ├── FoxitSerifItalic.pfb
│               ├── FoxitSymbol.pfb
│               ├── LICENSE_FOXIT
│               └── LICENSE_LIBERATION
├── tsconfig.json
├── typedoc.json
└── vitest.config.ts

================================================
FILE CONTENTS
================================================

================================================
FILE: .changeset/README.md
================================================
# Changesets

Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works
with multi-package or single-package repos to help you version and publish your code. You can
find the full documentation for it [in our repository](https://github.com/changesets/changesets).

We have a quick list of common questions to get you started engaging with this project in
[our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md).


================================================
FILE: .changeset/config.json
================================================
{
  "$schema": "https://unpkg.com/@changesets/config@3.1.3/schema.json",
  "changelog": ["@changesets/changelog-github", { "repo": "run-llama/liteparse" }],
  "commit": false,
  "fixed": [],
  "linked": [],
  "access": "public",
  "baseBranch": "main",
  "updateInternalDependencies": "patch",
  "ignore": []
}


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.yml
================================================
name: Bug Report
description: Report a bug (crashes, errors, unexpected behavior)
title: "[Bug] "
labels: ["bug"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to report a bug!

        **Note:** If this is a parsing quality issue (incorrect text, bad formatting), please use the "Parsing Issue" template instead.

  - type: textarea
    id: description
    attributes:
      label: Description
      description: A clear description of the bug
    validations:
      required: true

  - type: textarea
    id: reproduce
    attributes:
      label: Steps to Reproduce
      description: How can we reproduce this issue?
      placeholder: |
        1. Run `lit parse ...`
        2. See error
    validations:
      required: true

  - type: textarea
    id: error
    attributes:
      label: Error Message
      description: The full error message or stack trace
      render: text
    validations:
      required: false

  - type: input
    id: version
    attributes:
      label: LiteParse Version
      description: Run `lit --version` to get this
      placeholder: "0.1.0"
    validations:
      required: true

  - type: dropdown
    id: os
    attributes:
      label: Operating System
      options:
        - macOS (Apple Silicon)
        - macOS (Intel)
        - Linux
        - Windows
        - Other
    validations:
      required: true

  - type: input
    id: node
    attributes:
      label: Node.js Version
      description: Run `node --version` to get this
      placeholder: "v20.0.0"
    validations:
      required: true

  - type: textarea
    id: additional
    attributes:
      label: Additional Context
      description: Any other relevant information


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
  - name: Documentation
    url: https://github.com/run-llama/liteparse#readme
    about: Check the README for usage documentation
  - name: Discussions
    url: https://github.com/run-llama/liteparse/discussions
    about: Ask questions and share ideas


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.yml
================================================
name: Feature Request
description: Suggest a new feature or improvement
title: "[Feature] "
labels: ["enhancement"]
body:
  - type: textarea
    id: problem
    attributes:
      label: Problem Statement
      description: What problem would this feature solve?
      placeholder: "I'm always frustrated when..."
    validations:
      required: true

  - type: textarea
    id: solution
    attributes:
      label: Proposed Solution
      description: How do you think this should work?
    validations:
      required: true

  - type: textarea
    id: alternatives
    attributes:
      label: Alternatives Considered
      description: Have you considered any alternative solutions or workarounds?
    validations:
      required: false

  - type: textarea
    id: additional
    attributes:
      label: Additional Context
      description: Any other context, mockups, or examples


================================================
FILE: .github/ISSUE_TEMPLATE/parsing_issue.yml
================================================
name: Parsing Issue
description: Report an issue with document parsing (incorrect output, missing text, etc.)
title: "[Parsing] "
labels: ["parsing", "bug"]
body:
  - type: markdown
    attributes:
      value: |
        ## Important: Document Required

        **Parsing issues without a reproducible example will be closed.**

        To investigate parsing problems, we need to see the actual document. Please either:
        - Attach the document to this issue (drag and drop below)
        - Provide a public link to the document
        - If the document is confidential, provide a minimal reproduction with a similar document

  - type: textarea
    id: description
    attributes:
      label: Description
      description: Describe the parsing issue you're experiencing
      placeholder: "The text on page 3 is missing / The table formatting is incorrect / etc."
    validations:
      required: true

  - type: textarea
    id: document
    attributes:
      label: Document
      description: |
        **Required:** Attach the document or provide a link.
        Drag and drop your file here, or paste a public URL.
      placeholder: "Drag and drop your document here, or paste a URL"
    validations:
      required: true

  - type: textarea
    id: expected
    attributes:
      label: Expected Output
      description: What did you expect the output to look like?
      placeholder: "I expected the table to have 3 columns with aligned text..."
    validations:
      required: true

  - type: textarea
    id: actual
    attributes:
      label: Actual Output
      description: What did you actually get? (paste relevant portions)
      render: text
    validations:
      required: true

  - type: textarea
    id: command
    attributes:
      label: Command Used
      description: The exact command or code you ran
      render: bash
      placeholder: "lit parse document.pdf --format json"
    validations:
      required: true

  - type: input
    id: version
    attributes:
      label: LiteParse Version
      description: Run `lit --version` to get this
      placeholder: "0.1.0"
    validations:
      required: true

  - type: dropdown
    id: os
    attributes:
      label: Operating System
      options:
        - macOS (Apple Silicon)
        - macOS (Intel)
        - Linux
        - Windows
        - Other
    validations:
      required: true

  - type: textarea
    id: additional
    attributes:
      label: Additional Context
      description: Any other relevant information (OCR server used, config file, etc.)


================================================
FILE: .github/workflows/ci.yml
================================================
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  lint-and-build:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        node-version: [18, 20, 22]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
          cache: "npm"

      - name: Install dependencies
        run: npm ci

      - name: Check formatting
        run: npm run format:check

      - name: Run linter
        run: npm run lint

      - name: Build
        run: npm run build

  test:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        node-version: [18, 20, 22]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
          cache: "npm"

      - name: Install dependencies
        run: npm ci

      - name: Run tests
        run: npm run test


================================================
FILE: .github/workflows/e2e-output.yml
================================================
name: E2E Output Validation

on:
  pull_request:
    branches: [main]
  push:
    branches: [main]

# For PRs, we need write permissions to add labels
permissions:
  contents: read
  pull-requests: write

jobs:
  compare-outputs:
    runs-on: ubuntu-latest
    outputs:
      has_changes: ${{ steps.compare.outputs.has_changes }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libreoffice imagemagick ghostscript tesseract-ocr

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 20
          cache: "npm"

      - name: Install dependencies
        run: npm ci

      - name: Build
        run: npm run build

      - name: Download from HuggingFace
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          pip install huggingface_hub
          hf download 'llamaindex/liteparse_cicd_data' --local-dir expected-dataset --repo-type dataset

      - name: Compare outputs
        id: compare
        run: ./scripts/compare-outputs.sh expected-dataset

      - name: Upload comparison results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: comparison-results
          path: comparison-output.txt

      - name: Add label for changed outputs (PR only)
        if: github.event_name == 'pull_request' && steps.compare.outputs.has_changes == 'true'
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.addLabels({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              labels: ['output-changed']
            })

      - name: Check for approval label (PR only)
        id: check-approved
        if: github.event_name == 'pull_request' && steps.compare.outputs.has_changes == 'true'
        uses: actions/github-script@v7
        with:
          script: |
            const { data: labels } = await github.rest.issues.listLabelsOnIssue({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number
            });
            const hasApproval = labels.some(label => label.name === 'output-approved');
            core.setOutput('has_approval', hasApproval.toString());
            console.log(`Has output-approved label: ${hasApproval}`);

      - name: Require approval for output changes
        if: steps.compare.outputs.has_changes == 'true' && github.event_name == 'pull_request' && steps.check-approved.outputs.has_approval != 'true'
        run: |
          echo "::warning::This PR changes liteparse output. Please review the comparison results."
          echo ""
          echo "To approve these changes:"
          echo "1. Review the comparison output in the artifacts"
          echo "2. Add the 'output-approved' label to this PR"
          echo "3. Re-run this workflow"
          exit 1

  # This job only runs on main branch after merge when outputs changed
  upload-dataset:
    runs-on: ubuntu-latest
    needs: compare-outputs
    if: github.event_name == 'push' && github.ref == 'refs/heads/main' && needs.compare-outputs.outputs.has_changes == 'true'

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libreoffice imagemagick ghostscript tesseract-ocr

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 20
          cache: "npm"

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install dependencies
        run: |
          npm ci
          pip install huggingface_hub

      - name: Build
        run: npm run build

      - name: Download existing dataset from HuggingFace
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          hf download 'llamaindex/liteparse_cicd_data' --local-dir expected-dataset --repo-type dataset

      - name: Generate and upload dataset
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          if [ -z "$HF_TOKEN" ]; then
            echo "Warning: HF_TOKEN not set, skipping upload"
            exit 0
          fi
          npx tsx scripts/upload-dataset.ts expected-dataset ${{ vars.HF_DATASET_REPO || 'llamaindex/liteparse_cicd_data' }}


================================================
FILE: .github/workflows/homebrew_release.yml
================================================
name: Push Release to HomeBrew repository

on:
  workflow_dispatch:

jobs:
  push-release-homebrew:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Install uv
        uses: astral-sh/setup-uv@v6

      - name: Set up Python
        run: uv python install 3.13
        shell: bash

      - name: Push Release to HomeBrew repository
        run: ./scripts/publish-to-homebrew-repo.sh
        shell: bash
        env:
          GITHUB_TOKEN: ${{ secrets.HOMEBREW_GITHUB_TOKEN }} # PAT with permissions to push to run-llama/homebrew-liteparse


================================================
FILE: .github/workflows/ocr_servers.yml
================================================
name: Validate OCR Servers

on:
  pull_request:

jobs:
  testing_paddleocr:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true

      - name: Run Tests on Main Package
        run: uv run pytest test_server.py
        working-directory: ocr/paddleocr/

  testing_easyocr:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true

      - name: Run Tests on Main Package
        run: uv run pytest test_server.py
        working-directory: ocr/easyocr/


================================================
FILE: .github/workflows/release.yml
================================================
name: Release

on:
  push:
    branches:
      - main

concurrency: ${{ github.workflow }}-${{ github.ref }}

jobs:
  release:
    name: Release
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
      id-token: write
      actions: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 20
          cache: "npm"
          registry-url: "https://registry.npmjs.org"

      - name: Install dependencies
        run: npm ci

      - name: Build
        run: npm run build

      - name: Create Release Pull Request or Publish
        id: changesets
        uses: changesets/action@v1
        with:
          publish: npm run release
          title: "chore: version packages"
          commit: "chore: version packages"
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

      - name: Dispatch post-publish workflow
        if: steps.changesets.outputs.published == 'true'
        uses: actions/github-script@v7
        with:
          script: |
            await github.rest.actions.createWorkflowDispatch({
              owner: context.repo.owner,
              repo: context.repo.repo,
              workflow_id: 'homebrew_release.yml',
              ref: 'main',
            })


================================================
FILE: .github/workflows/sync-docs.yml
================================================
name: Sync Docs to Developer Hub

on:
  push:
    branches: [main]
    paths:
      - "docs/**"
  workflow_dispatch:

jobs:
  sync-docs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout source repo
        uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Checkout docs repo
        uses: actions/checkout@v4
        with:
          repository: run-llama/developers
          token: ${{ secrets.DEVELOPER_HUB_TOKEN }}
          path: developer-hub

      - name: Sync docs
        run: |
          npm install
          npm run docs:api
          ./scripts/sync-docs-to-developer-hub.sh developer-hub

      - name: Commit and push
        working-directory: developer-hub
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

          git add src/content/docs/liteparse

          if git diff --staged --quiet; then
            echo "No docs changes to sync."
            exit 0
          fi

          SOURCE_SHA="${GITHUB_SHA::8}"
          git commit -m "sync: liteparse docs from run-llama/liteparse@${SOURCE_SHA}"
          git push


================================================
FILE: .gitignore
================================================
# Dependencies
node_modules/
pnpm-lock.yaml

# Build output
dist/
bin/
sea-prep.blob
sea-config.json

# Environment
*claude*
.env
.env.local

# TypeScript
*.tsbuildinfo

# IDE
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Logs
*.log
npm-debug.log*
pnpm-debug.log*

# Test output
screenshots/
*.pdf
!tests/fixtures/*.pdf

# Temporary files
tmp/
temp/
*.txt
*.png
deno.json

# Python files
*venv*
__pycache__
*.pyc
*cache*

# Test/dev directories (local testing)
e2e-output/
parsed_dataset/
hard_docs/
hard_docs_parsed/
test-docs/
test-docs-output/
arxiv_screenshots/
more_hard_docs/
nytimes_imgs/
ac/
ltt/

# OCR data files
*.traineddata

# Docs
.docs-preview/
docs/src/content/docs/liteparse/api.md
docs/src/content/docs/liteparse/.api-tmp/


================================================
FILE: .prettierignore
================================================
dist
node_modules
src/vendor
*.md
pnpm-lock.yaml
package-lock.json


================================================
FILE: .prettierrc
================================================
{
  "semi": true,
  "singleQuote": false,
  "tabWidth": 2,
  "trailingComma": "es5",
  "printWidth": 100,
  "bracketSpacing": true
}


================================================
FILE: AGENTS.md
================================================
# LiteParse - Agent Documentation

> This file provides comprehensive context for AI coding agents working on this codebase. Each subdirectory contains its own README with file-specific documentation.

## Project Overview

**LiteParse** is an open-source PDF parsing library focused on fast, lightweight document processing with spatial text extraction. It runs entirely locally with zero cloud dependencies by default.

### Key Capabilities
- **Spatial text extraction** with precise bounding boxes
- **Flexible OCR** (built-in Tesseract.js or pluggable HTTP servers)
- **Multi-format support** (PDFs, DOCX, XLSX, PPTX, images via conversion)
- **TypeScript/Node.js** with both library and CLI interfaces

## Directory Structure

```
liteparse/
├── src/
│   ├── core/           # Configuration, types, main orchestrator
│   ├── engines/        # Pluggable PDF and OCR engines
│   │   ├── pdf/        # PDF parsing engines (PDF.js, PDFium)
│   │   └── ocr/        # OCR engines (Tesseract, HTTP)
│   ├── processing/     # Text extraction and spatial analysis
│   ├── output/         # Output formatters (JSON, text)
│   ├── conversion/     # Multi-format conversion to PDF
│   ├── vendor/         # Bundled dependencies (PDF.js)
│   ├── index.ts        # CLI entry point
│   └── lib.ts          # Library public API
├── cli/                # CLI implementation
├── ocr/                # Example OCR server implementations
│   ├── easyocr/        # EasyOCR wrapper server
│   └── paddleocr/      # PaddleOCR wrapper server
└── dist/               # Compiled JavaScript output
```

## Data Flow

1. **Input**: File path received (any supported format)
2. **Conversion** (if dependencies installed): Non-PDF formats converted to PDF via LibreOffice/ImageMagick
3. **PDF Loading**: PDF.js extracts text items, images, metadata
4. **OCR** (if enabled): Images rendered and OCR'd for text-sparse areas
5. **Grid Projection**: Spatial reconstruction of text layout using anchor system
6. **Post-processing**: Bounding boxes, text cleanup
7. **Output**: Formatted as JSON or plain text

## Key Design Decisions

### 1. Engine Abstraction Pattern
Both PDF and OCR functionality use interface-based abstraction (`PdfEngine`, `OcrEngine`). This allows:
- Swapping implementations without changing core logic
- Auto-detection: HTTP OCR if URL provided, otherwise Tesseract.js
- Future extensibility for new engines
- Future possibility of custom conversion engines for non-PDF formats

### 2. Spatial Grid Projection
The most complex (and important!) part of the codebase (`src/processing/gridProjection.ts`, ~1650 lines). Uses:
- **Anchor-based layout**: Tracks text alignment (left, right, center, floating)
- **Forward anchors**: Carry alignment information between lines
- **Column detection**: Identifies multi-column layouts
- **Rotation handling**: Transforms 90°, 180°, 270° rotated text to correct reading order
- **OCR merging**: Combines native PDF text with OCR results, preserving confidence scores and source flags in output

### 3. Selective OCR
OCR only runs on embedded images where text extraction failed, not the entire document. This balances accuracy with performance.

### 4. Configuration Merging
Uses a default-first approach where users only override what they need. Configuration flows: defaults → file config → CLI options.

### 5. Format Conversion via External Tools
Rather than implementing format parsers, LiteParse converts non-PDF formats using system tools (LibreOffice, ImageMagick) into a single format (PDF). This provides broad format support with minimal code.

## Common Tasks

### Adding a New Output Format
1. Create new file in `src/output/` implementing the formatter
2. Add format option to `cli/parse.ts`
3. Update `src/core/parser.ts` to use new formatter

### Adding a New OCR Engine
1. Implement `OcrEngine` interface in `src/engines/ocr/`
2. Add initialization logic in `src/core/parser.ts`
3. Add configuration options in `src/core/types.ts`

### Modifying Text Extraction Logic
The processing pipeline is in `src/processing/`. Key files:
- `gridProjection.ts` - Layout reconstruction (most complex)
- `bbox.ts` - Bounding box calculation
- `cleanText.ts` - Text cleanup

### Adding CLI Options
1. Update `cli/parse.ts` with new Commander.js option
2. Add corresponding config field in `src/core/types.ts`
3. Update `src/core/config.ts` with default value
4. Use the option in `src/core/parser.ts`

## Testing Approach

Currently tested via manual verification with sample documents. The project would benefit from:
- Unit tests for processing utilities
- Integration tests with known PDFs
- Snapshot tests for output formats

## Key Dependencies

| Dependency | Purpose |
|------------|---------|
| `pdfjs-dist` | PDF parsing and text extraction |
| `@hyzyla/pdfium` | High-quality PDF rendering for screenshots |
| `tesseract.js` | In-process OCR (zero setup) |
| `sharp` | Image processing |
| `commander` | CLI framework |
| `zod` | Schema validation |

## Entry Points

- **CLI**: `src/index.ts` → `cli/parse.ts`
- **Library**: `src/lib.ts` exports `LiteParse` class and types
- **Main Class**: `src/core/parser.ts` contains `LiteParse` orchestrator

## Related Documentation

These files are key to understanding the codebase and should be referenced for specific implementation details.

If changes to the codebase are being made, please update the relevant documentation files to reflect those changes and keep them up to date.

- [User-facing documentation](README.md)
- [src/conversion/README.md](src/conversion/README.md) - Format conversion details
- [src/core/README.md](src/core/README.md) - Core architecture and configuration
- [src/engines/README.md](src/engines/README.md) - Engine abstraction and implementations
  - [src/engines/pdf/README.md](src/engines/pdf/README.md) - PDF engines (PDF.js, PDFium)
  - [src/engines/ocr/README.md](src/engines/ocr/README.md) - OCR engines (Tesseract, HTTP)
- [src/output/README.md](src/output/README.md) - Output formatters
- [src/processing/README.md](src/processing/README.md) - Text extraction and spatial processing
- [ocr/README.md](ocr/README.md) - OCR server implementations (EasyOCR, PaddleOCR)
- [cli/README.md](cli/README.md) - CLI usage and options


================================================
FILE: CHANGELOG.md
================================================
# @llamaindex/liteparse

## 1.4.6

### Patch Changes

- [#120](https://github.com/run-llama/liteparse/pull/120) [`9cde441`](https://github.com/run-llama/liteparse/commit/9cde441b106b9dbee055b228bf011c197126bef5) Thanks [@apprakash](https://github.com/apprakash)! - fix(gridProjection): merge sub-pixel overlapping text runs in canMerge

## 1.4.5

### Patch Changes

- [#118](https://github.com/run-llama/liteparse/pull/118) [`4fdf3c9`](https://github.com/run-llama/liteparse/commit/4fdf3c9358c22e50e742c2b7d866ae35b83a63cd) Thanks [@mkh09353](https://github.com/mkh09353)! - Honor the OCR rotation-correction option in the built-in Tesseract engine by mapping it to Tesseract.js auto-rotation.

## 1.4.4

### Patch Changes

- [#112](https://github.com/run-llama/liteparse/pull/112) [`0eda8fc`](https://github.com/run-llama/liteparse/commit/0eda8fc27d6ad2cf835894efecc22c5239025447) Thanks [@logan-markewich](https://github.com/logan-markewich)! - Improve buggy-font handling resolution

## 1.4.3

### Patch Changes

- [#108](https://github.com/run-llama/liteparse/pull/108) [`f4ee121`](https://github.com/run-llama/liteparse/commit/f4ee121d800b01f007a1d970d8197eda6673b392) Thanks [@Winds-AI](https://github.com/Winds-AI)! - Fix OCR bullet line spacing inflation

## 1.4.2

### Patch Changes

- [#91](https://github.com/run-llama/liteparse/pull/91) [`5bb3a3b`](https://github.com/run-llama/liteparse/commit/5bb3a3b214b148bec86aaf979ea561611a7df763) Thanks [@AdemBoukhris457](https://github.com/AdemBoukhris457)! - fix: use path.join for screenshot output filepath

- [#97](https://github.com/run-llama/liteparse/pull/97) [`1100bdb`](https://github.com/run-llama/liteparse/commit/1100bdbcb7293abb63d3eb38ff295669618265e0) Thanks [@AdemBoukhris457](https://github.com/AdemBoukhris457)! - fix: return null from extension detection for unrecognizable formats

- [#89](https://github.com/run-llama/liteparse/pull/89) [`71f6621`](https://github.com/run-llama/liteparse/commit/71f6621dd413195b3634b747c6cf7cde90966035) Thanks [@AdemBoukhris457](https://github.com/AdemBoukhris457)! - perf: cache PDFium document across page operations

- [#99](https://github.com/run-llama/liteparse/pull/99) [`b7a3080`](https://github.com/run-llama/liteparse/commit/b7a3080d89dccc4fd3cec65f559d4ebeff12e9bc) Thanks [@Winds-AI](https://github.com/Winds-AI)! - fix: validate ImageMagick executables before using convert

- [#95](https://github.com/run-llama/liteparse/pull/95) [`2718912`](https://github.com/run-llama/liteparse/commit/2718912b520ffc475d8e2541f5430b0823bd0acd) Thanks [@AdemBoukhris457](https://github.com/AdemBoukhris457)! - fix: guard indexOf before splice in grid anchor resolution

## 1.4.1

### Patch Changes

- [#84](https://github.com/run-llama/liteparse/pull/84) [`53e02df`](https://github.com/run-llama/liteparse/commit/53e02dff71d8f83cb2539b3a856889a5c3a38b52) Thanks [@AdemBoukhris457](https://github.com/AdemBoukhris457)! - Ensure parse cleans up temp files

- [#86](https://github.com/run-llama/liteparse/pull/86) [`48d86f1`](https://github.com/run-llama/liteparse/commit/48d86f1f2a0fc9d0cad29d3d30476f5cd0844d85) Thanks [@AdemBoukhris457](https://github.com/AdemBoukhris457)! - Ensure screenshot converts formats when possible

## 1.4.0

### Minor Changes

- [#64](https://github.com/run-llama/liteparse/pull/64) [`ab3df58`](https://github.com/run-llama/liteparse/commit/ab3df583fcbf6f0333a0649f7b4bd7331e5d547a) Thanks [@llrightll](https://github.com/llrightll)! - Add confidence scores to TextItems

- [#71](https://github.com/run-llama/liteparse/pull/71) [`57adda1`](https://github.com/run-llama/liteparse/commit/57adda15e6a45832e7f3a1311fb475c7221c1dc8) Thanks [@saravananravi08](https://github.com/saravananravi08)! - Add internal image detection for OCR

### Patch Changes

- [#78](https://github.com/run-llama/liteparse/pull/78) [`d341371`](https://github.com/run-llama/liteparse/commit/d341371eae7c2fa8feb234af732cf30e978230b3) Thanks [@logan-markewich](https://github.com/logan-markewich)! - Improve searchItems output on complex text

## 1.3.2

### Patch Changes

- [#55](https://github.com/run-llama/liteparse/pull/55) [`b57cb61`](https://github.com/run-llama/liteparse/commit/b57cb61de9371cbc1cf91f01aafc7e1fe912e520) Thanks [@hexapode](https://github.com/hexapode)! - Improve text projection on justified text

## 1.3.1

### Patch Changes

- [#70](https://github.com/run-llama/liteparse/pull/70) [`243dc05`](https://github.com/run-llama/liteparse/commit/243dc0556769a59cf59e6565a5657b7d2630fc97) Thanks [@saravananravi08](https://github.com/saravananravi08)! - fix: resolve standard font loading failure in Node.js

## 1.3.0

### Minor Changes

- [#67](https://github.com/run-llama/liteparse/pull/67) [`0542758`](https://github.com/run-llama/liteparse/commit/0542758f6239a1897d7553727ce3ec58c61ea7fe) Thanks [@logan-markewich](https://github.com/logan-markewich)! - Bbox utils and tesseract error handling

## 1.2.0

### Minor Changes

- [#56](https://github.com/run-llama/liteparse/pull/56) [`31b43f9`](https://github.com/run-llama/liteparse/commit/31b43f9666ce6df85e90a44be1e859c615bda757) Thanks [@logan-markewich](https://github.com/logan-markewich)! - Add CLI Stdin support for files, urls, etc.

## 1.1.0

### Minor Changes

- [#51](https://github.com/run-llama/liteparse/pull/51) [`7b421c6`](https://github.com/run-llama/liteparse/commit/7b421c61f2e2ffa04e68bb2bbe02dbf18e261507) Thanks [@logan-markewich](https://github.com/logan-markewich)! - Support for password protected PDFs

## 1.0.1

### Patch Changes

- [#40](https://github.com/run-llama/liteparse/pull/40) [`bb863c4`](https://github.com/run-llama/liteparse/commit/bb863c46f568c5c192e7c6ec608e350303668bba) Thanks [@logan-markewich](https://github.com/logan-markewich)! - Add support for TESSDATA_PREFIX and better error messaging on tesseract network errors

## 1.0.0

### Major Changes

- [#31](https://github.com/run-llama/liteparse/pull/31) [`56ba21c`](https://github.com/run-llama/liteparse/commit/56ba21cb63e8223440b039f49eab710ba089e375) Thanks [@logan-markewich](https://github.com/logan-markewich)! - LiteParse v1.0 launch


================================================
FILE: CLAUDE.md
================================================
@AGENTS.md

================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to LiteParse

Thank you for your interest in contributing to LiteParse! This document provides guidelines and information for contributors.

## Getting Started

1. Fork the repository
2. Clone your fork:
   ```bash
   git clone https://github.com/YOUR_USERNAME/liteparse.git
   cd liteparse
   ```
3. Install dependencies:
   ```bash
   npm install
   ```
4. Build the project:
   ```bash
   npm run build
   ```

## What to Contribute?

In this project, we welcome a wide range of contributions, but we do want to maintain the spirit of the project. We are primarily focused on:

- Core algorithms for PDF parsing and text extraction
- OCR integrations and improvements
- Different types or modifications to output formats

We are less interested in:

- Markdown output
- Any LLM integration or agent code
- Anything that doesn't directly relate to improving the core parsing and extraction capabilities

While the project is in TypeScript today, I'm pretty open to porting to Rust if someone wanted to take that on as a contribution. The core algorithms and logic would be the same, just implemented in Rust instead of TypeScript.

## Development Workflow

### Building

```bash
npm run build      # Build TypeScript
npm run dev        # Watch mode for development
```

### Testing

```bash
npm test           # Run tests
npm run test:watch # Run tests in watch mode
```

### Linting & Formatting

```bash
npm run lint       # Check for linting issues
npm run lint:fix   # Fix linting issues
npm run format     # Format code with Prettier
```

### Testing Local Changes

You can test your changes locally:

```bash
# Parse a document
./dist/src/index.js parse document.pdf

# Generate screenshots
./dist/src/index.js screenshot document.pdf -o ./screenshots
```

### Debugging Grid Projection

When working on the grid projection algorithm (`src/processing/gridProjection.ts`), you can enable built-in debug logging and visual output instead of adding ad-hoc `console.log` statements.

**Debug logging** traces every decision the projection makes — block detection, anchor extraction, snap assignment, rendering, and flowing text classification:

```bash
# Log all projection decisions to stderr
./dist/src/index.js parse document.pdf --debug

# Filter to a specific page
./dist/src/index.js parse document.pdf --debug --debug-page 3

# Filter to elements containing specific text
./dist/src/index.js parse document.pdf --debug --debug-text-filter "Total" "Revenue"

# Filter to a bounding region (x1,y1,x2,y2 in PDF points)
./dist/src/index.js parse document.pdf --debug --debug-region "0,100,300,200"

# Write debug log to a file
./dist/src/index.js parse document.pdf --debug --debug-output ./debug-output
```

**Visual grid export** generates PNG images showing text boxes color-coded by snap type (blue=left, red=right, green=center, gray=floating, yellow=flowing) with anchor lines overlaid. This is useful for comparing against page screenshots to spot projection issues:

```bash
# Generate visualization PNGs (one per page)
./dist/src/index.js parse document.pdf --debug-visualize

# Specify output directory
./dist/src/index.js parse document.pdf --debug-visualize --debug-output ./my-debug
```

These options are also available via the library API:

```typescript
const parser = new LiteParse({
  debug: {
    enabled: true,
    textFilter: ["Total"],
    pageFilter: 2,
    visualize: true,
    visualizePath: "./debug-output",
  }
});
```

See `src/processing/gridDebugLogger.ts` for the full `GridDebugConfig` interface and `src/processing/gridVisualizer.ts` for the visualization renderer.

## Making Changes

### Versioning & Changelogs

We use [Changesets](https://github.com/changesets/changesets) to manage versioning and changelogs. When you make a change to source code that should be released:

1. Run `npm run changeset`
2. Select the type of change (patch, minor, major)
3. Write a description of your changes
4. Commit the generated changeset file with your PR

## Pull Requests

1. Fork and create a feature branch from `main`
2. Make your changes
3. Add a changeset if needed (`npm run changeset`)
4. Ensure all tests pass (`npm test`)
5. Ensure linting passes (`npm run lint:fix` and `npm run format`)
6. Submit a pull request

When you submit a PR, a number of CI/CD checks will run. Among these, your code will be tested against a regression suite of documents to ensure that your changes don't break existing parsing capabilities. It will be up to the maintainers' discretion to determine if any changes to the regression set are expected/positive or unexpected/negative.

### PR Guidelines

- Keep PRs focused on a single change
- Update documentation if needed
- Add tests for new functionality
- For parsing issues, include a test document if possible

## Reporting Issues

### Parsing Issues

If you're reporting a problem with document parsing:

1. **You must attach the document** or provide a way to reproduce the issue
2. Include the command you ran
3. Show the expected vs actual output
4. Include your LiteParse version (`lit --version`)

Issues without reproducible examples will be closed.

### Bug Reports

For other bugs:
1. Describe what you expected vs what happened
2. Include steps to reproduce
3. Include error messages/stack traces
4. Include version information

## Project Structure

See [AGENTS.md](AGENTS.md) for detailed documentation about the codebase structure and architecture.

Key directories:
- `src/core/` - Main orchestrator and configuration
- `src/engines/` - PDF and OCR engine implementations
- `src/processing/` - Text extraction and spatial analysis
- `src/output/` - Output formatters
- `cli/` - CLI implementation

## Questions?

- Open a [Discussion](https://github.com/run-llama/liteparse/discussions) for questions
- Check existing issues before opening new ones
- Read the [README](README.md) for usage documentation

## License

By contributing, you agree that your contributions will be licensed under the Apache 2.0 License.


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: OCR_API_SPEC.md
================================================
# LiteParse OCR API Specification

This document defines the standard HTTP API that OCR servers must implement to work with LiteParse.

## Overview

LiteParse expects a simple HTTP endpoint that accepts an image and returns text with bounding boxes. Your OCR server can internally use any OCR engine (EasyOCR, PaddleOCR, Tesseract, Cloud APIs, etc.) as long as it conforms to this API.

## Endpoint

```
POST /ocr
```

## Request Format

**Content-Type:** `multipart/form-data`

**Fields:**

| Field | Type | Required | Description |
|-------|------|----------|-------------|
| `file` | binary | Yes | Image file (PNG, JPG, etc.) |
| `language` | string | No | Language code (default: `en`) |

### Language Codes

Use ISO 639-1 two-letter codes:
- `en` - English
- `zh` - Chinese
- `ja` - Japanese
- `ko` - Korean
- `fr` - French
- `de` - German
- `es` - Spanish
- `ar` - Arabic
- etc.

Your server should map these to whatever format your underlying OCR engine expects.

## Response Format

**Content-Type:** `application/json`

**Structure:**

```json
{
  "results": [
    {
      "text": "recognized text",
      "bbox": [x1, y1, x2, y2],
      "confidence": 0.95
    }
  ]
}
```

**Fields:**

| Field | Type | Description |
|-------|------|-------------|
| `results` | array | Array of text detection results |
| `results[].text` | string | Recognized text content |
| `results[].bbox` | [number, number, number, number] | Bounding box `[x1, y1, x2, y2]` where (x1,y1) is top-left and (x2,y2) is bottom-right |
| `results[].confidence` | number | Confidence score between 0.0 and 1.0 |

## Example

### Request

```bash
curl -X POST http://localhost:8080/ocr \
  -F "file=@document.png" \
  -F "language=en"
```

### Response

```json
{
  "results": [
    {
      "text": "Hello",
      "bbox": [10, 20, 60, 40],
      "confidence": 0.98
    },
    {
      "text": "World",
      "bbox": [70, 20, 130, 40],
      "confidence": 0.97
    }
  ]
}
```

## Error Handling

Return appropriate HTTP status codes:

- `200 OK` - Success
- `400 Bad Request` - Invalid request (missing file, invalid language, etc.)
- `500 Internal Server Error` - OCR processing failed

Error response format:

```json
{
  "error": "Description of the error"
}
```

## Implementation Notes

### Coordinate System

- Origin (0,0) is at the **top-left** of the image
- X increases to the right
- Y increases downward
- All coordinates are in pixels

### Bounding Box Format

Always return axis-aligned bounding boxes as `[x1, y1, x2, y2]`:
- `x1, y1` = top-left corner
- `x2, y2` = bottom-right corner
- `x2 > x1` and `y2 > y1`

If your OCR engine returns rotated boxes or polygon coordinates, convert them to axis-aligned boxes by taking min/max coordinates.

### Confidence Scores

- Normalize to range 0.0 to 1.0
- 1.0 = 100% confident
- 0.0 = 0% confident
- If your OCR engine doesn't provide confidence, use `1.0`

### Text Ordering

Results should be ordered by reading order (top-to-bottom, left-to-right for most languages).

## Example Implementations

See the `/ocr` directory for reference implementations:

- `ocr/easyocr/` - Wrapper for EasyOCR
- `ocr/paddleocr/` - Wrapper for PaddleOCR

## Testing Your Server

Quick test:

```bash
# 1. Start your server
python server.py

# 2. Test with curl
curl -X POST http://localhost:8080/ocr \
  -F "file=@test.png" \
  -F "language=en" \
  | jq .

# 3. Expected output:
# {
#   "results": [
#     {
#       "text": "...",
#       "bbox": [x1, y1, x2, y2],
#       "confidence": 0.xx
#     }
#   ]
# }
```

Use with LiteParse:

```bash
lit parse document.pdf --ocr-server-url http://localhost:8080/ocr
```

## FAQ

### Q: What if my OCR returns rotated bounding boxes?

Convert to axis-aligned boxes:

```python
def polygon_to_bbox(polygon):
    """Convert polygon [[x1,y1], [x2,y2], ...] to [x1, y1, x2, y2]"""
    xs = [point[0] for point in polygon]
    ys = [point[1] for point in polygon]
    return [min(xs), min(ys), max(xs), max(ys)]
```

### Q: What if my OCR doesn't return confidence scores?

Just return `1.0` for all results.

### Q: Can I return empty results?

Yes, return `{"results": []}` if no text is detected.

### Q: Should I filter low-confidence results?

You can, but LiteParse will also handle filtering based on its own thresholds.

### Q: What image formats should I accept?

At minimum: PNG, JPG. Optionally: TIFF, WebP, BMP, GIF.

### Q: Should I handle rotation correction?

Optional. If your OCR engine supports it, you can auto-correct rotation before processing.

### Q: What about multi-page documents?

LiteParse handles page splitting. Your server only needs to process single images.

### Q: Performance considerations?

- Keep server response time under 10 seconds per image
- Support concurrent requests
- Consider GPU acceleration for better performance
- Cache OCR models in memory (don't reload per request)

## Compliance Checklist

- [ ] Accepts `POST /ocr` endpoint
- [ ] Accepts `file` and `language` form fields
- [ ] Returns JSON with `results` array
- [ ] Each result has `text`, `bbox`, and `confidence`
- [ ] Bounding boxes in `[x1, y1, x2, y2]` format
- [ ] Confidence normalized to 0.0-1.0 range
- [ ] Returns 200 status on success
- [ ] Returns appropriate error codes and messages
- [ ] Handles common image formats (PNG, JPG)
- [ ] Processes images in under 10 seconds

## Support

Questions? Open an issue on GitHub or refer to the example implementations in `/ocr`.


================================================
FILE: README.md
================================================
# LiteParse

[![CI](https://github.com/run-llama/liteparse/actions/workflows/ci.yml/badge.svg)](https://github.com/run-llama/liteparse/actions/workflows/ci.yml)
|
[![npm version](https://img.shields.io/npm/v/@llamaindex/liteparse.svg)](https://www.npmjs.com/package/@llamaindex/liteparse)
|
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
|
[Docs](https://developers.llamaindex.ai/liteparse/)

<img src="https://github.com/user-attachments/assets/07ba6a82-6bb1-4dea-b0ef-cad7df7d1622" alt="out" width="600">

LiteParse is a standalone OSS PDF parsing tool focused exclusively on **fast and light** parsing. It provides high-quality spatial text parsing with bounding boxes, without proprietary LLM features or cloud dependencies. Everything runs locally on your machine. 

**Hitting the limits of local parsing?**
For complex documents (dense tables, multi-column layouts, charts, handwritten text, or 
scanned PDFs), you'll get significantly better results with [LlamaParse](https://developers.llamaindex.ai/python/cloud/llamaparse/?utm_source=github&utm_medium=liteparse), 
our cloud-based document parser built for production document pipelines. LlamaParse handles the 
hard stuff so your models see clean, structured data and markdown.

>  👉 [Sign up for LlamaParse free](https://cloud.llamaindex.ai?utm_source=github&utm_medium=liteparse)

## Overview

- **Fast Text Parsing**: Spatial text parsing using PDF.js
- **Flexible OCR System**:
  - **Built-in**: Tesseract.js (zero setup, works out of the box!)
  - **HTTP Servers**: Plug in any OCR server (EasyOCR, PaddleOCR, custom)
  - **Standard API**: Simple, well-defined OCR API specification
- **Screenshot Generation**: Generate high-quality page screenshots for LLM agents
- **Multiple Output Formats**: JSON and Text
- **Bounding Boxes**: Precise text positioning information
- **Standalone Binary**: No cloud dependencies, runs entirely locally
- **Multi-platform**: Linux, macOS (Intel/ARM), Windows

## Installation

### CLI Tool

#### Option 1: Global Install (Recommended)

Install globally via npm to use the `lit` command anywhere:

```bash
npm i -g @llamaindex/liteparse
```

Then use it:

```bash
lit parse document.pdf
lit screenshot document.pdf
```

For macOS and Linux users, `liteparse` can be also installed via `brew`:

```bash
brew tap run-llama/liteparse
brew install llamaindex-liteparse
```

#### Option 2: Install from Source

You can clone the repo and install the CLI globally from source:

```
git clone https://github.com/run-llama/liteparse.git
cd liteparse
npm run build
npm pack
npm install -g ./liteparse-*.tgz
```

### Agent Skill

You can use `liteparse` as an agent skill, downloading it with the `skills` CLI tool:

```bash
npx skills add run-llama/llamaparse-agent-skills --skill liteparse
```

Or copy-pasting the [`SKILL.md`](https://github.com/run-llama/llamaparse-agent-skills/blob/main/skills/liteparse/SKILL.md) file to your own skills setup.

## Usage

### Parse Files

```bash
# Basic parsing
lit parse document.pdf

# Parse with a specific output format
lit parse document.pdf --format json -o output.json

# Parse specific pages
lit parse document.pdf --target-pages "1-5,10,15-20"

# Parse without OCR
lit parse document.pdf --no-ocr

# Parse a remote PDF
curl -sL https://example.com/report.pdf | lit parse -
```

### Batch Parsing

You can also parse an entire directory of documents:

```bash
lit batch-parse ./input-directory ./output-directory
```

### Generate Screenshots

Screenshots are essential for LLM agents to extract visual information that text alone cannot capture.

```bash
# Screenshot all pages
lit screenshot document.pdf -o ./screenshots

# Screenshot specific pages
lit screenshot document.pdf --target-pages "1,3,5" -o ./screenshots

# Custom DPI
lit screenshot document.pdf --dpi 300 -o ./screenshots

# Screenshot page range
lit screenshot document.pdf --target-pages "1-10" -o ./screenshots
```

### Library Usage

Install as a dependency in your project:

```bash
npm install @llamaindex/liteparse
# or
pnpm add @llamaindex/liteparse
```

```typescript
import { LiteParse } from '@llamaindex/liteparse';

const parser = new LiteParse({ ocrEnabled: true });
const result = await parser.parse('document.pdf');
console.log(result.text);
```

#### Buffer / Uint8Array Input

You can pass raw bytes directly instead of a file path, which is useful for remote files:

```typescript
import { LiteParse } from '@llamaindex/liteparse';
import { readFile } from 'fs/promises';

const parser = new LiteParse();

// From a file read
const pdfBytes = await readFile('document.pdf');
const result = await parser.parse(pdfBytes);

// From an HTTP response
const response = await fetch('https://example.com/document.pdf');
const buffer = Buffer.from(await response.arrayBuffer());
const result2 = await parser.parse(buffer);
```

Non-PDF buffers (images, Office documents) are written to a temp directory for format conversion. Screenshots also work with buffer input:

```typescript
const screenshots = await parser.screenshot(pdfBytes, [1, 2, 3]);
```

### CLI Options

#### Parse Command

```
$ lit parse --help
Usage: lit parse [options] <file>

Parse a document file (PDF, DOCX, XLSX, PPTX, images, etc.)

Options:
  -o, --output <file>     Output file path
  --format <format>       Output format: json|text (default: "text")
  --ocr-server-url <url>  HTTP OCR server URL (uses Tesseract if not provided)
  --no-ocr                Disable OCR
  --ocr-language <lang>   OCR language(s) (default: "en")
  --num-workers <n>       Number of pages to OCR in parallel (default: CPU cores - 1)
  --max-pages <n>         Max pages to parse (default: "10000")
  --target-pages <pages>  Target pages (e.g., "1-5,10,15-20")
  --dpi <dpi>             DPI for rendering (default: "150")
  --no-precise-bbox       Disable precise bounding boxes
  --preserve-small-text   Preserve very small text
  --password <password>   Password for encrypted/protected documents
  --config <file>         Config file (JSON)
  -q, --quiet             Suppress progress output
  -h, --help              display help for command
```

#### Batch Parse Command

```
$ lit batch-parse --help
Usage: lit batch-parse [options] <input-dir> <output-dir>

Parse multiple documents in batch mode (reuses PDF engine for efficiency)

Options:
  --format <format>       Output format: json|text (default: "text")
  --ocr-server-url <url>  HTTP OCR server URL (uses Tesseract if not provided)
  --no-ocr                Disable OCR
  --ocr-language <lang>   OCR language(s) (default: "en")
  --num-workers <n>       Number of pages to OCR in parallel (default: CPU cores - 1)
  --max-pages <n>         Max pages to parse per file (default: "10000")
  --dpi <dpi>             DPI for rendering (default: "150")
  --no-precise-bbox       Disable precise bounding boxes
  --recursive             Recursively search input directory
  --extension <ext>       Only process files with this extension (e.g., ".pdf")
  --password <password>   Password for encrypted/protected documents (applied to all files)
  --config <file>         Config file (JSON)
  -q, --quiet             Suppress progress output
  -h, --help              display help for command
```

#### Screenshot Command

```
$ lit screenshot --help
Usage: lit screenshot [options] <file>

Generate screenshots of PDF pages

Options:
  -o, --output-dir <dir>  Output directory for screenshots (default: "./screenshots")
  --target-pages <pages>  Page numbers to screenshot (e.g., "1,3,5" or "1-5")
  --dpi <dpi>             DPI for rendering (default: "150")
  --format <format>       Image format: png|jpg (default: "png")
  --password <password>   Password for encrypted/protected documents
  --config <file>         Config file (JSON)
  -q, --quiet             Suppress progress output
  -h, --help              display help for command
```

## OCR Setup

### Default: Tesseract.js

```bash
# Tesseract is enabled by default
lit parse document.pdf

# Specify language
lit parse document.pdf --ocr-language fra

# Disable OCR
lit parse document.pdf --no-ocr
```

By default, Tesseract.js downloads language data from the internet on first use. For offline or air-gapped environments, set the `TESSDATA_PREFIX` environment variable to a directory containing pre-downloaded `.traineddata` files:

```bash
export TESSDATA_PREFIX=/path/to/tessdata
lit parse document.pdf --ocr-language eng
```

You can also pass `tessdataPath` in the library config:

```typescript
const parser = new LiteParse({ tessdataPath: '/path/to/tessdata' });
```

### Optional: HTTP OCR Servers

For higher accuracy or better performance, you can use an HTTP OCR server. We provide ready-to-use example wrappers for popular OCR engines:

- [EasyOCR](ocr/easyocr/README.md)
- [PaddleOCR](ocr/paddleocr/README.md)

You can integrate any OCR service by implementing the simple LiteParse OCR API specification (see [`OCR_API_SPEC.md`](OCR_API_SPEC.md)).

The API requires:
- POST `/ocr` endpoint
- Accepts `file` and `language` parameters
- Returns JSON: `{ results: [{ text, bbox: [x1,y1,x2,y2], confidence }] }`

See the example servers in `ocr/easyocr/` and `ocr/paddleocr/` as templates.

For the complete OCR API specification, see [`OCR_API_SPEC.md`](OCR_API_SPEC.md).

## Multi-Format Input Support

LiteParse supports **automatic conversion** of various document formats to PDF before parsing. This makes it unique compared to other PDF-only parsing tools!

### Supported Input Formats

#### Office Documents (via LibreOffice)
- **Word**: `.doc`, `.docx`, `.docm`, `.odt`, `.rtf`
- **PowerPoint**: `.ppt`, `.pptx`, `.pptm`, `.odp`
- **Spreadsheets**: `.xls`, `.xlsx`, `.xlsm`, `.ods`, `.csv`, `.tsv`

Just install the dependency and LiteParse will automatically convert these formats to PDF for parsing:

```bash
# macOS
brew install --cask libreoffice

# Ubuntu/Debian
apt-get install libreoffice

# Windows
choco install libreoffice-fresh # might require admin permissions
```

> _For Windows, you might need to add the path to the directory containing LibreOffice CLI executable (generally `C:\Program Files\LibreOffice\program`) to the environment variables and re-start the machine._

#### Images (via ImageMagick)
- **Formats**: `.jpg`, `.jpeg`, `.png`, `.gif`, `.bmp`, `.tiff`, `.webp`, `.svg`

Just install ImageMagick and LiteParse will convert images to PDF for parsing (with OCR):

```bash
# macOS
brew install imagemagick

# Ubuntu/Debian
apt-get install imagemagick

# Windows
choco install imagemagick.app # might require admin permissions
```

## Environment Variables

| Variable | Description |
|----------|-------------|
| `TESSDATA_PREFIX` | Path to a directory containing Tesseract `.traineddata` files. Used for offline/air-gapped environments where Tesseract.js cannot download language data from the internet. |
| `LITEPARSE_TMPDIR` | Override the temp directory used for format conversion and intermediate files. Defaults to the OS temp directory (`os.tmpdir()`). Useful in containerized or read-only filesystem environments. |

## Configuration

You can configure parsing options via CLI flags or a JSON config file. The config file allows you to set sensible defaults and override as needed.

### Config File Example

Create a `liteparse.config.json` file:

```json
{
  "ocrLanguage": "en",
  "ocrEnabled": true,
  "maxPages": 1000,
  "dpi": 150,
  "outputFormat": "json",
  "preciseBoundingBox": true,
  "preserveVerySmallText": false,
  "password": "optional_password"
}
```

For HTTP OCR servers, just add `ocrServerUrl`:

```json
{
  "ocrServerUrl": "http://localhost:8828/ocr",
  "ocrLanguage": "en",
  "outputFormat": "json"
}
```

Use with:

```bash
lit parse document.pdf --config liteparse.config.json
```

## Development

We provide a fairly rich `AGENTS.md`/`CLAUDE.md` that we recommend using to help with development + coding agents.

```bash
# Install dependencies
npm install

# Build TypeScript (Linux/macOS)
npm run build

# Build TypeScript (Windows)
npm run build:windows

# Watch mode
npm run dev

# Test parsing
npm test
```

## License

Apache 2.0

## Credits

Built on top of:

- [PDF.js](https://github.com/mozilla/pdf.js) - PDF parsing engine
- [Tesseract.js](https://github.com/naptha/tesseract.js) - In-process OCR engine
- [EasyOCR](https://github.com/JaidedAI/EasyOCR) - HTTP OCR server (optional)
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) - HTTP OCR server (optional)
- [Sharp](https://github.com/lovell/sharp) - Image processing


================================================
FILE: SECURITY.md
================================================
# Security Policy

## Reporting a Vulnerability

If you discover a security vulnerability in LiteParse, please report it responsibly:

1. **Do NOT open a public issue** for security vulnerabilities
2. Email security concerns to: security@llamaindex.ai
3. Include as much detail as possible:
   - Description of the vulnerability
   - Steps to reproduce
   - Potential impact
   - Suggested fix (if any)

## Scope

### In Scope

Security issues we will address:

- Remote code execution in the CLI tool or library
- Vulnerabilities in LiteParse's own code that could be exploited
- Dependency vulnerabilities with known, exploitable CVEs

### Out of Scope

LiteParse is intended to be a local CLI tool and library, designed to process documents you provide. The following are not security vulnerabilities we will address:

- **Malicious input files** - Processing untrusted documents (zip bombs, malformed PDFs, path traversal in filenames, etc.) is the user's responsibility. If you're building a service that accepts untrusted uploads, you must implement your own validation, sandboxing, and resource limits.
- **Denial of service via large/complex files** - Documents that cause high memory usage, long processing times, or crashes are not security issues. Use `--max-pages`, timeouts, and resource limits in your deployment.
- **Issues requiring a server setup** - LiteParse does not include or recommend any specific production server deployment. Security of web services built on top of LiteParse is the deployer's responsibility.
- **Theoretical attacks without proof of concept** - Please include a working demonstration.

### Building Secure Services

If you're exposing LiteParse through a web service or API:

1. Validate uploads:  Check file types, sizes, and origins before processing
2. Use sandboxing:  Run parsing in isolated containers with resource limits
3. Set timeouts:  Don't allow unbounded processing time
4. Limit concurrency:  Prevent resource exhaustion from parallel requests
5. Don't trust filenames:  Sanitize any paths derived from user input

These concerns are standard for any document processing service and are outside LiteParse's scope.


================================================
FILE: cli/README.md
================================================
# cli/

Command-line interface for LiteParse using Commander.js.

## Files

### parse.ts
**CLI entry point with two main commands: `parse` and `screenshot`.**

---

## Commands

### `lit parse <file>`

Parse documents and extract text.

---

### `lit screenshot <file> -o <output_dir>`

Generate page screenshots.

---

## Configuration File

Both commands accept `--config <file>` to load settings from JSON:

```json
{
  "ocrEnabled": true,
  "ocrLanguage": "en",
  "ocrServerUrl": "http://localhost:5000/ocr",
  "maxPages": 100,
  "dpi": 200,
  "outputFormat": "json",
  "preciseBoundingBox": true,
  "password": "optional_password"
}
```

CLI options override config file values.

---

## Adding CLI Options

1. Add `.option()` call in the command definition
2. Read option in action handler from `options` object
3. Add to config object that's passed to `LiteParse`
4. If new config field, update `src/core/types.ts` and `src/core/config.ts`


================================================
FILE: cli/parse.ts
================================================
import { Command, Option } from "commander";
import fs from "fs/promises";
import { existsSync, readdirSync, statSync } from "fs";
import os from "os";
import path from "path";
import { LiteParse } from "../src/core/parser.js";
import { LiteParseConfig, OutputFormat } from "../src/core/types.js";
import { performance } from "perf_hooks";

// CLI-wide default values. These are echoed in the commander option help
// strings below, so keep the two in sync when changing one.
const DEFAULT_MAX_PAGES = 10000;
const DEFAULT_DPI = 150;
const DEFAULT_LANGUAGE = "en";
const DEFAULT_OUTPUT_FORMAT = "text";
const DEFAULT_SCREENSHOT_FORMAT = "png";
const DEFAULT_SCREENSHOT_DIR = "./screenshots";

/**
 * CLI flags for `lit parse` as populated by commander.
 * Value-taking options arrive as raw strings (numeric ones are converted
 * with parseInt in the action handler); options the user did not pass are
 * left undefined unless they have a declared default.
 */
interface ParseCommandOptions {
  output?: string;
  format?: string;
  ocrServerUrl?: string;
  // Set to false by `--no-ocr`; the handler checks `options.ocr !== false`.
  ocr?: boolean;
  ocrLanguage?: string;
  numWorkers?: string;
  maxPages?: string;
  targetPages?: string;
  dpi?: string;
  // Set to false by `--no-precise-bbox`.
  preciseBbox?: boolean;
  preserveSmallText?: boolean;
  password?: string;
  config?: string;
  quiet?: boolean;
  // Hidden debug flags (registered below with .hideHelp()).
  debug?: boolean;
  debugVisualize?: boolean;
  debugOutput?: string;
  debugTextFilter?: string[];
  debugPage?: string;
  debugRegion?: string;
}

/** CLI flags for `lit screenshot`. */
interface ScreenshotCommandOptions {
  outputDir?: string;
  targetPages?: string;
  dpi?: string;
  format?: string;
  password?: string;
  config?: string;
  quiet?: boolean;
}

/** CLI flags for `lit batch-parse`. */
interface BatchParseCommandOptions {
  format?: string;
  ocrServerUrl?: string;
  // Set to false by `--no-ocr`.
  ocr?: boolean;
  ocrLanguage?: string;
  numWorkers?: string;
  maxPages?: string;
  dpi?: string;
  // Set to false by `--no-precise-bbox`.
  preciseBbox?: boolean;
  recursive?: boolean;
  extension?: string;
  password?: string;
  config?: string;
  quiet?: boolean;
}

// Root commander instance for the `lit` CLI. Subcommands (parse, screenshot,
// batch-parse) are registered on it below; it is exported at the bottom of
// this file rather than executed here.
const program = new Command();

program
  .name("lit")
  .description("OSS document parsing tool (supports PDF, DOCX, XLSX, images, and more)")
  .version("0.1.0");

program
  .command("parse <file>")
  .description("Parse a document file (PDF, DOCX, XLSX, PPTX, images, etc.)")
  .option("-o, --output <file>", "Output file path")
  .option("--format <format>", "Output format: json|text", DEFAULT_OUTPUT_FORMAT)
  .option("--ocr-server-url <url>", "HTTP OCR server URL (uses Tesseract if not provided)")
  .option("--no-ocr", "Disable OCR")
  .option("--ocr-language <lang>", "OCR language(s)", DEFAULT_LANGUAGE)
  .option(
    "--num-workers <n>",
    "Number of pages to OCR in parallel. Defaults to number of CPU cores minus one."
  )
  .option("--max-pages <n>", "Max pages to parse", DEFAULT_MAX_PAGES.toString())
  .option("--target-pages <pages>", 'Target pages (e.g., "1-5,10,15-20")')
  .option("--dpi <dpi>", "DPI for rendering", DEFAULT_DPI.toString())
  .option("--no-precise-bbox", "Disable precise bounding boxes")
  .option("--preserve-small-text", "Preserve very small text")
  .option("--password <password>", "Password for encrypted/protected documents")
  .option("--config <file>", "Config file (JSON)")
  .option("-q, --quiet", "Suppress progress output")
  .addOption(new Option("--debug", "Enable grid projection debug logging").hideHelp())
  .addOption(new Option("--debug-visualize", "Generate grid visualization PNGs").hideHelp())
  .addOption(new Option("--debug-output <path>", "Output directory for debug files").hideHelp())
  .addOption(
    new Option("--debug-text-filter <texts...>", "Filter debug output by text content").hideHelp()
  )
  .addOption(
    new Option("--debug-page <num>", "Filter debug output to specific page number").hideHelp()
  )
  .addOption(
    new Option("--debug-region <coords>", 'Filter to bounding region "x1,y1,x2,y2"').hideHelp()
  )
  .action(async (file: string, options: ParseCommandOptions) => {
    try {
      const quiet = options.quiet || false;
      // "-" means read the document bytes from stdin instead of a path.
      const isStdin = file === "-";

      // Check if file exists (skip for stdin)
      if (!isStdin && !existsSync(file)) {
        console.error(`Error: File not found: ${file}`);
        process.exit(1);
      }

      let config: Partial<LiteParseConfig> = {};

      // Load config file if provided
      if (options.config) {
        if (!existsSync(options.config)) {
          console.error(`Error: Config file not found: ${options.config}`);
          process.exit(1);
        }
        const configData = await fs.readFile(options.config, "utf-8");
        config = JSON.parse(configData);
      }

      // Fallback worker count: CPU cores minus one, floored at 1.
      let calculatedNumWorkers = os.cpus().length - 1;
      if (calculatedNumWorkers < 1) {
        calculatedNumWorkers = 1;
      }

      // Override config-file values with CLI options.
      // BUGFIX: options commander leaves undefined (no declared default) must
      // not clobber values loaded from the config file, so those use `??` to
      // fall back to the config-file value.
      config = {
        ...config,
        outputFormat: options.format as OutputFormat,
        ocrEnabled: options.ocr !== false,
        ocrServerUrl: options.ocrServerUrl ?? config.ocrServerUrl,
        ocrLanguage: options.ocrLanguage,
        numWorkers:
          options.numWorkers !== undefined
            ? parseInt(options.numWorkers, 10)
            : (config.numWorkers ?? calculatedNumWorkers),
        maxPages: parseInt(options.maxPages ?? DEFAULT_MAX_PAGES.toString(), 10),
        targetPages: options.targetPages ?? config.targetPages,
        dpi: parseInt(options.dpi ?? DEFAULT_DPI.toString(), 10),
        preciseBoundingBox: options.preciseBbox !== false,
        preserveVerySmallText: options.preserveSmallText ?? config.preserveVerySmallText ?? false,
        password: options.password ?? config.password,
      };

      // Build debug config if any debug flags are set
      if (options.debug || options.debugVisualize) {
        let regionFilter: { x1: number; y1: number; x2: number; y2: number } | undefined;
        if (options.debugRegion) {
          const [x1, y1, x2, y2] = options.debugRegion.split(",").map(Number);
          regionFilter = { x1, y1, x2, y2 };
        }
        config.debug = {
          enabled: true,
          visualize: options.debugVisualize,
          visualizePath: options.debugOutput ?? "./debug-output",
          outputPath: options.debugOutput ? `${options.debugOutput}/debug.log` : undefined,
          textFilter: options.debugTextFilter,
          pageFilter: options.debugPage ? parseInt(options.debugPage, 10) : undefined,
          regionFilter,
        };
      }

      // Create parser
      const parser = new LiteParse(config);

      // Read from stdin or pass the path through for the parser to open.
      let input: string | Buffer;
      if (isStdin) {
        const chunks: Buffer[] = [];
        for await (const chunk of process.stdin) {
          chunks.push(chunk);
        }
        input = Buffer.concat(chunks);
        if (input.length === 0) {
          console.error("Error: No data received from stdin");
          process.exit(1);
        }
      } else {
        input = file;
      }

      // Parse document (quiet flag controls progress output)
      const result = await parser.parse(input, quiet);

      // Format output based on format
      let output: string;
      switch (config.outputFormat) {
        case "json":
          output = JSON.stringify(result.json, null, 2);
          break;
        case "text":
        default:
          output = result.text;
          break;
      }

      // Write to file or stdout; progress/status goes to stderr so stdout
      // stays pipeable.
      if (options.output) {
        await fs.writeFile(options.output, output);
        if (!quiet) {
          console.error(`\n✓ Parsed ${result.pages.length} pages → ${options.output}`);
        }
      } else {
        // Output result to stdout (can be piped)
        console.log(output);
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      const stack = error instanceof Error ? error.stack : undefined;
      console.error(`\nError: ${message}`);
      if (stack) {
        console.error(stack);
      }
      process.exit(1);
    }
  });

program
  .command("screenshot <file>")
  .description("Generate screenshots of PDF pages")
  .option("-o, --output-dir <dir>", "Output directory for screenshots", DEFAULT_SCREENSHOT_DIR)
  .option("--target-pages <pages>", 'Page numbers to screenshot (e.g., "1,3,5" or "1-5")')
  .option("--dpi <dpi>", "DPI for rendering", DEFAULT_DPI.toString())
  .option("--format <format>", "Image format: png|jpg", DEFAULT_SCREENSHOT_FORMAT)
  .option("--password <password>", "Password for encrypted/protected documents")
  .option("--config <file>", "Config file (JSON)")
  .option("-q, --quiet", "Suppress progress output")
  .action(async (file: string, options: ScreenshotCommandOptions) => {
    try {
      const quiet = options.quiet || false;

      // Check if file exists
      if (!existsSync(file)) {
        console.error(`Error: File not found: ${file}`);
        process.exit(1);
      }

      let config: Partial<LiteParseConfig> = {};

      // Load config file if provided
      if (options.config) {
        if (!existsSync(options.config)) {
          console.error(`Error: Config file not found: ${options.config}`);
          process.exit(1);
        }
        const configData = await fs.readFile(options.config, "utf-8");
        config = JSON.parse(configData);
      }

      // Override with CLI options.
      // BUGFIX: `--password` has no commander default, so omitting it must
      // not wipe out a password supplied via the config file.
      config = {
        ...config,
        dpi: parseInt(options.dpi ?? DEFAULT_DPI.toString(), 10),
        password: options.password ?? config.password,
      };

      // Parse target pages (e.g. "1,3,5" or "1-5")
      let pageNumbers: number[] | undefined;
      if (options.targetPages) {
        pageNumbers = parsePageNumbers(options.targetPages);
      }

      const outputDir = options.outputDir || DEFAULT_SCREENSHOT_DIR;

      // Create output directory
      if (!existsSync(outputDir)) {
        await fs.mkdir(outputDir, { recursive: true });
      }

      // Create parser
      const parser = new LiteParse(config);

      // Generate screenshots
      const results = await parser.screenshot(file, pageNumbers, quiet);

      // Save screenshots; status goes to stderr so stdout stays clean.
      for (const result of results) {
        const filename = `page_${result.pageNum}.${options.format || DEFAULT_SCREENSHOT_FORMAT}`;
        const filepath = path.join(outputDir, filename);
        await fs.writeFile(filepath, result.imageBuffer);
        if (!quiet) {
          console.error(`✓ ${filepath} (${result.width}x${result.height})`);
        }
      }

      if (!quiet) {
        console.error(`\n✓ Generated ${results.length} screenshots → ${outputDir}`);
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      const stack = error instanceof Error ? error.stack : undefined;
      console.error(`\nError: ${message}`);
      if (stack) {
        console.error(stack);
      }
      process.exit(1);
    }
  });

// File extensions the batch-parse command will pick up while scanning the
// input directory, grouped by document family. Anything else is skipped.
const SUPPORTED_EXTENSIONS = new Set([
  // PDF
  ".pdf",
  // Word-processing documents and templates
  ".doc", ".docx", ".docm", ".dot", ".dotm", ".dotx", ".odt", ".ott", ".rtf",
  // Presentations and templates
  ".ppt", ".pptx", ".pptm", ".pot", ".potm", ".potx", ".odp", ".otp",
  // Spreadsheets and delimited text
  ".xls", ".xlsx", ".xlsm", ".xlsb", ".ods", ".ots", ".csv", ".tsv",
  // Images
  ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp", ".svg",
  // Apple iWork
  ".pages", ".key", ".numbers",
]);

program
  .command("batch-parse <input-dir> <output-dir>")
  // Description kept in sync with the README's documented help output.
  .description("Parse multiple documents in batch mode (reuses PDF engine for efficiency)")
  .option("--format <format>", "Output format: json|text", DEFAULT_OUTPUT_FORMAT)
  .option("--ocr-server-url <url>", "HTTP OCR server URL (uses Tesseract if not provided)")
  .option("--no-ocr", "Disable OCR")
  .option("--ocr-language <lang>", "OCR language(s)", DEFAULT_LANGUAGE)
  .option(
    "--num-workers <n>",
    "Number of pages to OCR in parallel. Defaults to number of CPU cores minus one."
  )
  .option("--max-pages <n>", "Max pages to parse per file", DEFAULT_MAX_PAGES.toString())
  .option("--dpi <dpi>", "DPI for rendering", DEFAULT_DPI.toString())
  .option("--no-precise-bbox", "Disable precise bounding boxes")
  .option("--recursive", "Recursively search input directory")
  .option("--extension <ext>", 'Only process files with this extension (e.g., ".pdf")')
  .option(
    "--password <password>",
    "Password for encrypted/protected documents (applied to all files)"
  )
  .option("--config <file>", "Config file (JSON)")
  .option("-q, --quiet", "Suppress progress output")
  .action(async (inputDir: string, outputDir: string, options: BatchParseCommandOptions) => {
    try {
      const quiet = options.quiet || false;
      const startTime = performance.now();

      // Validate input directory
      if (!existsSync(inputDir)) {
        console.error(`Error: Input directory not found: ${inputDir}`);
        process.exit(1);
      }

      const inputStat = statSync(inputDir);
      if (!inputStat.isDirectory()) {
        console.error(`Error: Input path is not a directory: ${inputDir}`);
        process.exit(1);
      }

      // Create output directory
      if (!existsSync(outputDir)) {
        await fs.mkdir(outputDir, { recursive: true });
      }

      // Find all files to process
      const files = findFiles(inputDir, options.recursive || false, options.extension);

      if (files.length === 0) {
        console.error("No supported files found in input directory");
        process.exit(1);
      }

      if (!quiet) {
        console.error(`Found ${files.length} files to process`);
      }

      // Load config
      let config: Partial<LiteParseConfig> = {};
      if (options.config) {
        if (!existsSync(options.config)) {
          console.error(`Error: Config file not found: ${options.config}`);
          process.exit(1);
        }
        const configData = await fs.readFile(options.config, "utf-8");
        config = JSON.parse(configData);
      }

      // Fallback worker count: CPU cores minus one, floored at 1.
      let calculatedNumWorkers = os.cpus().length - 1;
      if (calculatedNumWorkers < 1) {
        calculatedNumWorkers = 1;
      }

      // Apply CLI options.
      // BUGFIX: options without a commander default (`--ocr-server-url`,
      // `--num-workers`, `--password`) use `??` so that omitting the flag
      // does not clobber values from the config file with undefined.
      config = {
        ...config,
        outputFormat: options.format as OutputFormat,
        ocrEnabled: options.ocr !== false,
        ocrServerUrl: options.ocrServerUrl ?? config.ocrServerUrl,
        ocrLanguage: options.ocrLanguage,
        numWorkers:
          options.numWorkers !== undefined
            ? parseInt(options.numWorkers, 10)
            : (config.numWorkers ?? calculatedNumWorkers),
        maxPages: parseInt(options.maxPages ?? DEFAULT_MAX_PAGES.toString(), 10),
        dpi: parseInt(options.dpi ?? DEFAULT_DPI.toString(), 10),
        preciseBoundingBox: options.preciseBbox !== false,
        password: options.password ?? config.password,
      };

      // Create a SINGLE parser instance for all files (key for batch efficiency)
      const parser = new LiteParse(config);

      // Process files
      let successCount = 0;
      let errorCount = 0;
      const outputExt = options.format === "json" ? ".json" : ".txt";

      for (let i = 0; i < files.length; i++) {
        const file = files[i];
        const relativePath = path.relative(inputDir, file);
        // Mirror the input tree under outputDir, swapping the extension.
        const outputPath = path.join(outputDir, relativePath.replace(/\.[^.]+$/, outputExt));

        // Create output subdirectory if needed
        const outputSubdir = path.dirname(outputPath);
        if (!existsSync(outputSubdir)) {
          await fs.mkdir(outputSubdir, { recursive: true });
        }

        try {
          const fileStart = performance.now();
          const result = await parser.parse(file, true); // Always quiet for individual files

          // Format output
          let output: string;
          if (options.format === "json") {
            output = JSON.stringify(result.json, null, 2);
          } else {
            output = result.text;
          }

          await fs.writeFile(outputPath, output);
          successCount++;

          if (!quiet) {
            const fileTime = (performance.now() - fileStart).toFixed(0);
            console.error(
              `[${i + 1}/${files.length}] ✓ ${relativePath} (${result.pages.length} pages, ${fileTime}ms)`
            );
          }
        } catch (error: unknown) {
          // A failing file does not abort the batch; it is counted and reported.
          errorCount++;
          if (!quiet) {
            const message = error instanceof Error ? error.message : String(error);
            console.error(`[${i + 1}/${files.length}] ✗ ${relativePath}: ${message}`);
          }
        }
      }

      const totalTime = ((performance.now() - startTime) / 1000).toFixed(2);
      const avgTime =
        files.length > 0 ? ((performance.now() - startTime) / files.length).toFixed(0) : 0;

      if (!quiet) {
        console.error("");
        console.error(`Batch complete: ${successCount} succeeded, ${errorCount} failed`);
        console.error(`Total time: ${totalTime}s (avg ${avgTime}ms/file)`);
        console.error(`Output: ${outputDir}`);
      }

      // Non-zero exit if anything failed, so scripts can detect partial failure.
      if (errorCount > 0) {
        process.exit(1);
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      const stack = error instanceof Error ? error.stack : undefined;
      console.error(`\nError: ${message}`);
      if (stack) {
        console.error(stack);
      }
      process.exit(1);
    }
  });

/**
 * Find all supported files in a directory
 */
/**
 * Find all supported files in a directory.
 *
 * Uses `withFileTypes` so most entries are classified without an extra
 * `stat()` syscall per entry; symlinked entries are resolved with `statSync`
 * to preserve the previous follow-symlink behavior.
 *
 * @param dir - Directory to scan
 * @param recursive - Whether to descend into subdirectories
 * @param filterExt - Optional extension filter (e.g. ".pdf"), case-insensitive
 * @returns Sorted list of matching file paths
 */
function findFiles(dir: string, recursive: boolean, filterExt?: string): string[] {
  const files: string[] = [];
  // Hoist case normalization out of the scan loop.
  const wantedExt = filterExt?.toLowerCase();

  function scan(currentDir: string) {
    const entries = readdirSync(currentDir, { withFileTypes: true });

    for (const entry of entries) {
      const fullPath = path.join(currentDir, entry.name);

      // Dirent type checks do not follow symlinks; stat() does. Resolve
      // symlinks explicitly so linked files/directories are still handled.
      let isDirectory = entry.isDirectory();
      let isFile = entry.isFile();
      if (entry.isSymbolicLink()) {
        const stat = statSync(fullPath);
        isDirectory = stat.isDirectory();
        isFile = stat.isFile();
      }

      if (isDirectory) {
        if (recursive) {
          scan(fullPath);
        }
      } else if (isFile) {
        const ext = path.extname(entry.name).toLowerCase();

        // Filter by extension if specified
        if (wantedExt && ext !== wantedExt) {
          continue;
        }

        // Only collect file types the parser supports
        if (SUPPORTED_EXTENSIONS.has(ext)) {
          files.push(fullPath);
        }
      }
    }
  }

  scan(dir);
  return files.sort();
}

/**
 * Parse page numbers from string like "1,3,5" or "1-5,10"
 */
/**
 * Parse page numbers from a string like "1,3,5" or "1-5,10".
 *
 * Duplicates are removed and the result is sorted ascending. Empty segments
 * (e.g. a trailing comma) are ignored.
 *
 * @param pagesStr - Comma-separated page numbers and/or dash ranges
 * @returns Sorted, de-duplicated page numbers
 * @throws {Error} If a segment is not a valid number or range
 */
function parsePageNumbers(pagesStr: string): number[] {
  const pages: number[] = [];

  for (const part of pagesStr.split(",")) {
    const trimmed = part.trim();
    if (trimmed === "") {
      continue; // tolerate "1,,3" and trailing commas
    }

    if (trimmed.includes("-")) {
      const [startStr, endStr] = trimmed.split("-");
      // Always pass the radix; bare parseInt would accept "0x" prefixes etc.
      const start = Number.parseInt(startStr.trim(), 10);
      const end = Number.parseInt(endStr.trim(), 10);
      if (Number.isNaN(start) || Number.isNaN(end)) {
        throw new Error(`Invalid page range: "${trimmed}"`);
      }
      for (let i = start; i <= end; i++) {
        pages.push(i);
      }
    } else {
      const page = Number.parseInt(trimmed, 10);
      if (Number.isNaN(page)) {
        throw new Error(`Invalid page number: "${trimmed}"`);
      }
      pages.push(page);
    }
  }

  return [...new Set(pages)].sort((a, b) => a - b);
}

export { program };


================================================
FILE: dataset_eval_utils/README.md
================================================
# LiteParse Eval Utils

Utilities for generating and evaluating datasets for PDF parsing performance. Compares text extraction quality across multiple PDF parsers using LLM-based QA evaluation.

## Setup

Requires Python 3.12+.

```bash
# Install the package (from the dataset_eval_utils directory)
pip install -e .
```

You'll need an `ANTHROPIC_API_KEY` environment variable set for the LLM-based evaluation and dataset processing tools.

## Dataset

An existing dataset that was generated and evaluated using this framework can be found on [Hugging Face](https://huggingface.co/datasets/run-llama/liteparse-eval-dataset).

You can download the dataset using the Hugging Face CLI:

```bash
hf download run-llama/liteparse-eval-dataset --repo-type dataset --local-dir ./liteparse-eval-dataset
```

## CLI Tools

### `lp-process` — Generate Ground Truth Datasets

Processes PDF and image files using Claude's vision capabilities to generate structured QA ground truth data.

```bash
lp-process /path/to/documents --output-dir ./ground_truth
```

Options:
- `--output-dir` — Directory to save output JSON files (default: `./output`)
- `--model` — Claude model to use (default: `claude-sonnet-4-5-20250929`)
- `--api-key` — Anthropic API key (or set `ANTHROPIC_API_KEY` env var)

Each output JSON file contains document metadata and QA pairs extracted from the document pages.

### `lp-evaluate` — Run QA Evaluation

Evaluates parser text extraction quality by having an LLM answer questions from extracted text and judging correctness against ground truth answers.

```bash
lp-evaluate \
  --data-dir ./documents \
  --ground-truth-dir ./ground_truth \
  --parse-provider liteparse \
  --output ./results/run1
```

Options:
- `--data-dir` — Directory containing source PDF documents (required)
- `--ground-truth-dir` — Directory containing ground truth JSON files (required)
- `--output` — Path to save results (JSON + HTML report)
- `--parse-provider` — Parser to evaluate: `liteparse`, `pymupdf`, `pypdf`, `markitdown` (default: `liteparse`)
- `--llm-provider` — LLM for answering questions: `anthropic` (default: `anthropic`)

Outputs:
- `<output>.json` — Aggregate results with pass rates
- `<output>_detailed.json` — Per-document results with extracted text and individual QA results
- `<output>_report.html` — Interactive HTML report with PDF previews and QA breakdowns

### `lp-benchmark` — Performance Benchmarking

Measures parse latency and memory usage across providers.

```bash
lp-benchmark document.pdf --providers pymupdf liteparse --runs 20
```

Options:
- `--providers` — Providers to benchmark (default: all local providers)
- `--runs` — Number of benchmark runs per provider (default: 10)
- `--warmup` — Number of warmup runs (default: 1)
- `--output` — Path to save JSON results

## Parser Providers

| Provider | Library | Notes |
|----------|---------|-------|
| `liteparse` | [liteparse](https://github.com/run-llama/liteparse) | Spatial text extraction with OCR support |
| `pymupdf` | [PyMuPDF](https://pymupdf.readthedocs.io/) | Fast, mature PDF library |
| `pypdf` | [pypdf](https://pypdf.readthedocs.io/) | Pure-Python PDF library |
| `markitdown` | [MarkItDown](https://github.com/microsoft/markitdown) | Microsoft's document-to-markdown converter |

## Evaluation Pipeline

1. **Extract text** from PDF using the selected parser provider
2. **Answer questions** — LLM reads the extracted text and answers ground truth questions
3. **Judge answers** — A separate LLM judge evaluates whether predicted answers are semantically equivalent to expected answers
4. **Aggregate** — Pass rates are computed per-document and overall


================================================
FILE: dataset_eval_utils/pyproject.toml
================================================
[project]
name = "liteparse-eval"
version = "0.1.0"
description = "Utilities for generating and evaluating datasets for PDF parsing performance."
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "anthropic>=0.76.0",
    "pillow>=10.0.0",
    "pymupdf>=1.26.7",
    "pypdf>=6.6.2",
    "markitdown[all]",
    "rapidfuzz>=3.14.3",
    "liteparse @ file:///${PROJECT_ROOT}/../packages/python",
]

[project.scripts]
lp-evaluate = "liteparse_eval.evaluation:main"
lp-process = "liteparse_eval.processing:main"
lp-benchmark = "liteparse_eval.benchmark:main"

[tool.setuptools]
package-dir = {"" = "src"}

[tool.setuptools.packages.find]
where = ["src"]


================================================
FILE: dataset_eval_utils/src/liteparse_eval/__init__.py
================================================
"""LiteParse Eval - Document parsing evaluation and benchmarking toolkit."""

from liteparse_eval.providers import (
    LLMProvider,
    AnthropicProvider,
    ParserProvider,
    LiteparseProvider,
    MarkItDownProvider,
    PyMuPDFProvider,
    PyPDFProvider,
)

__version__ = "0.1.0"
__all__ = [
    "LLMProvider",
    "AnthropicProvider",
    "ParserProvider",
    "LiteparseProvider",
    "MarkItDownProvider",
    "PyMuPDFProvider",
    "PyPDFProvider",
]


================================================
FILE: dataset_eval_utils/src/liteparse_eval/benchmark.py
================================================
"""
Performance benchmarking tool for parser providers.

Measures latency and resource usage across multiple runs for a given document.
"""

import argparse
import gc
import json
import statistics
import sys
import time
import tracemalloc
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

from liteparse_eval.providers import (
    ParserProvider,
    LiteparseProvider,
    MarkItDownProvider,
    PyMuPDFProvider,
    PyPDFProvider,
)


@dataclass
class BenchmarkMetrics:
    """Latency and peak-memory samples aggregated over repeated benchmark runs."""
    latencies: list[float] = field(default_factory=list)  # per-run wall time, seconds
    memory_peaks: list[float] = field(default_factory=list)  # per-run peak memory, MB

    # --- shared reducers (empty sample sets report 0.0 instead of raising) ---

    @staticmethod
    def _mean(samples: list[float]) -> float:
        return statistics.mean(samples) if samples else 0.0

    @staticmethod
    def _median(samples: list[float]) -> float:
        return statistics.median(samples) if samples else 0.0

    @staticmethod
    def _stddev(samples: list[float]) -> float:
        # Sample standard deviation needs at least two observations.
        return statistics.stdev(samples) if len(samples) >= 2 else 0.0

    @staticmethod
    def _minimum(samples: list[float]) -> float:
        return min(samples) if samples else 0.0

    @staticmethod
    def _maximum(samples: list[float]) -> float:
        return max(samples) if samples else 0.0

    @property
    def count(self) -> int:
        """Number of recorded benchmark runs."""
        return len(self.latencies)

    @property
    def latency_avg(self) -> float:
        return self._mean(self.latencies)

    @property
    def latency_median(self) -> float:
        return self._median(self.latencies)

    @property
    def latency_stddev(self) -> float:
        return self._stddev(self.latencies)

    @property
    def latency_min(self) -> float:
        return self._minimum(self.latencies)

    @property
    def latency_max(self) -> float:
        return self._maximum(self.latencies)

    @property
    def memory_avg(self) -> float:
        return self._mean(self.memory_peaks)

    @property
    def memory_median(self) -> float:
        return self._median(self.memory_peaks)

    @property
    def memory_stddev(self) -> float:
        return self._stddev(self.memory_peaks)

    @property
    def memory_min(self) -> float:
        return self._minimum(self.memory_peaks)

    @property
    def memory_max(self) -> float:
        return self._maximum(self.memory_peaks)

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "runs": self.count,
            "latency": {
                "avg_seconds": round(self.latency_avg, 4),
                "median_seconds": round(self.latency_median, 4),
                "stddev_seconds": round(self.latency_stddev, 4),
                "min_seconds": round(self.latency_min, 4),
                "max_seconds": round(self.latency_max, 4),
                "all_runs": [round(value, 4) for value in self.latencies],
            },
            "memory": {
                "avg_mb": round(self.memory_avg, 2),
                "median_mb": round(self.memory_median, 2),
                "stddev_mb": round(self.memory_stddev, 2),
                "min_mb": round(self.memory_min, 2),
                "max_mb": round(self.memory_max, 2),
                "all_runs": [round(value, 2) for value in self.memory_peaks],
            },
        }


@dataclass
class ProviderBenchmarkResult:
    """Outcome of benchmarking a single parser provider (metrics or failure)."""
    provider_name: str
    metrics: BenchmarkMetrics
    success: bool = True
    error: Optional[str] = None
    extracted_text_length: Optional[int] = None

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; metrics are null on failure."""
        payload = {
            "provider": self.provider_name,
            "success": self.success,
            "metrics": self.metrics.to_dict() if self.success else None,
        }
        # Optional fields are only included when present.
        if self.error:
            payload["error"] = self.error
        if self.extracted_text_length is not None:
            payload["extracted_text_length"] = self.extracted_text_length
        return payload


def get_provider_instance(provider_name: str) -> ParserProvider:
    """Create a fresh provider instance by name."""
    # Registry mapping CLI names to provider classes.
    registry = {
        "pymupdf": PyMuPDFProvider,
        "pypdf": PyPDFProvider,
        "markitdown": MarkItDownProvider,
        "liteparse": LiteparseProvider,
    }
    provider_cls = registry.get(provider_name)
    if provider_cls is None:
        raise ValueError(f"Unknown provider: {provider_name}")
    return provider_cls()


def benchmark_provider(
    provider: ParserProvider,
    file_path: Path,
    num_runs: int = 10,
    warmup_runs: int = 1,
) -> BenchmarkMetrics:
    """
    Benchmark a parser provider on a document.

    Performs ``warmup_runs`` unrecorded extractions first, then ``num_runs``
    timed extractions, recording wall-clock latency and peak Python heap
    allocation (via tracemalloc) for each run.

    Args:
        provider: The parser provider to benchmark
        file_path: Path to the document to parse
        num_runs: Number of benchmark runs (default: 10)
        warmup_runs: Number of warmup runs before benchmarking (default: 1)

    Returns:
        BenchmarkMetrics with latency and memory measurements
    """
    collected = BenchmarkMetrics()

    # Warmup passes prime caches and lazy imports; timings are discarded.
    for _ in range(warmup_runs):
        provider.extract_text(file_path)
        gc.collect()

    for _ in range(num_runs):
        # Collect garbage up front so leftover allocations don't skew peaks.
        gc.collect()

        tracemalloc.start()

        begin = time.perf_counter()
        provider.extract_text(file_path)
        elapsed_seconds = time.perf_counter() - begin

        _, peak_bytes = tracemalloc.get_traced_memory()
        tracemalloc.stop()

        collected.latencies.append(elapsed_seconds)
        collected.memory_peaks.append(peak_bytes / (1024 * 1024))  # bytes -> MB

        gc.collect()

    return collected


def run_benchmark(
    file_path: Path,
    providers: list[str],
    num_runs: int = 10,
    warmup_runs: int = 1,
    output_path: Optional[Path] = None,
) -> dict:
    """
    Run benchmark across multiple providers.

    Prints per-provider progress and a summary table to stdout. A provider
    that raises during setup or extraction is recorded as failed; the
    remaining providers still run.

    Args:
        file_path: Path to the document to benchmark
        providers: List of provider names to benchmark
        num_runs: Number of runs per provider
        warmup_runs: Number of warmup runs per provider
        output_path: Optional path to save JSON results

    Returns:
        Dictionary with benchmark results for all providers
    """
    results: list[ProviderBenchmarkResult] = []

    print(f"Benchmarking: {file_path}")
    print(f"Runs per provider: {num_runs} (+ {warmup_runs} warmup)")
    print("=" * 60)

    for provider_name in providers:
        print(f"\n{provider_name}:")
        print("-" * 40)

        try:
            provider = get_provider_instance(provider_name)

            # One extra (untimed) extraction to report the extracted text size.
            text = provider.extract_text(file_path)
            text_length = len(text)

            metrics = benchmark_provider(
                provider=provider,
                file_path=file_path,
                num_runs=num_runs,
                warmup_runs=warmup_runs,
            )

            result = ProviderBenchmarkResult(
                provider_name=provider_name,
                metrics=metrics,
                extracted_text_length=text_length,
            )

            print(f"  Latency:  avg={metrics.latency_avg:.3f}s  median={metrics.latency_median:.3f}s  stddev={metrics.latency_stddev:.3f}s")
            print(f"  Memory:   avg={metrics.memory_avg:.1f}MB  median={metrics.memory_median:.1f}MB  stddev={metrics.memory_stddev:.1f}MB")
            print(f"  Text length: {text_length:,} chars")

        except Exception as e:
            # Record the failure but keep benchmarking the other providers.
            result = ProviderBenchmarkResult(
                provider_name=provider_name,
                metrics=BenchmarkMetrics(),
                success=False,
                error=str(e),
            )
            print(f"  ERROR: {e}")

        results.append(result)

    # Build JSON-serializable output for all providers.
    output = {
        "file": str(file_path),
        "num_runs": num_runs,
        "warmup_runs": warmup_runs,
        "providers": [r.to_dict() for r in results],
    }

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"{'Provider':<15} {'Avg Latency':<12} {'Median':<12} {'Avg Memory':<12}")
    print("-" * 60)
    for r in results:
        if r.success:
            print(f"{r.provider_name:<15} {r.metrics.latency_avg:<12.3f} {r.metrics.latency_median:<12.3f} {r.metrics.memory_avg:<12.1f}")
        else:
            print(f"{r.provider_name:<15} {'FAILED':<12} {'':<12} {'':<12}")

    # Save results
    if output_path:
        with open(output_path, "w") as f:
            json.dump(output, f, indent=2)
        print(f"\nResults saved to: {output_path}")

    return output


def main():
    """CLI entry point for the benchmark tool.

    Parses command-line arguments, validates the input file, and runs the
    benchmark across the requested providers.

    Returns:
        Process exit code: 0 on success, 1 if the input file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark parse providers on a document for latency and resource usage"
    )
    parser.add_argument(
        "file",
        type=Path,
        help="Path to the document to benchmark"
    )
    parser.add_argument(
        "--providers",
        type=str,
        nargs="+",
        choices=["pymupdf", "pypdf", "markitdown", "liteparse"],
        default=["pymupdf", "pypdf", "markitdown", "liteparse"],
        help="Parse providers to benchmark (default: all local providers)"
    )
    parser.add_argument(
        "--runs",
        type=int,
        default=10,
        help="Number of benchmark runs per provider (default: 10)"
    )
    parser.add_argument(
        "--warmup",
        type=int,
        default=1,
        help="Number of warmup runs before benchmarking (default: 1)"
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Path to save JSON results"
    )

    args = parser.parse_args()

    if not args.file.exists():
        # Diagnostics belong on stderr so stdout stays usable in pipelines.
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    run_benchmark(
        file_path=args.file,
        providers=args.providers,
        num_runs=args.runs,
        warmup_runs=args.warmup,
        output_path=args.output,
    )

    return 0


if __name__ == "__main__":
    exit(main())


================================================
FILE: dataset_eval_utils/src/liteparse_eval/evaluation.py
================================================
"""
Evaluation and benchmarking script for text extraction and LLM-based document understanding.

This script provides:
1. LLM QA evaluation using an LLM judge for pass/fail evaluation
2. Latency tracking for LLM and parse operations
"""

import argparse
import json
import statistics
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional

from liteparse_eval.providers import (
    ParserProvider,
    LLMProvider,
    AnthropicProvider,
    LiteparseProvider,
    MarkItDownProvider,
    PyMuPDFProvider,
    PyPDFProvider,
)


@dataclass
class LatencyMetrics:
    """Latency metrics for provider calls."""
    # Individual call latencies in seconds, in call order.
    latencies: List[float] = field(default_factory=list)

    @property
    def count(self) -> int:
        """Number of calls."""
        return len(self.latencies)

    @property
    def average(self) -> float:
        """Average latency in seconds (0.0 when no calls were recorded)."""
        return statistics.mean(self.latencies) if self.latencies else 0.0

    @property
    def min(self) -> float:
        """Minimum latency in seconds (0.0 when no calls were recorded)."""
        return min(self.latencies) if self.latencies else 0.0

    @property
    def max(self) -> float:
        """Maximum latency in seconds (0.0 when no calls were recorded)."""
        return max(self.latencies) if self.latencies else 0.0

    @property
    def stddev(self) -> float:
        """Population standard deviation of latency in seconds.

        Returns 0.0 when fewer than two samples are available.
        """
        if len(self.latencies) < 2:
            return 0.0
        # statistics.pstdev matches the original population formula (divide
        # by n) while being numerically more robust than the naive sum.
        return statistics.pstdev(self.latencies)

    @property
    def total(self) -> float:
        """Total latency in seconds."""
        return sum(self.latencies) if self.latencies else 0.0

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "count": self.count,
            "total_seconds": round(self.total, 3),
            "average_seconds": round(self.average, 3),
            "min_seconds": round(self.min, 3),
            "max_seconds": round(self.max, 3),
            "stddev_seconds": round(self.stddev, 3),
            "individual_latencies": [round(lat, 3) for lat in self.latencies]
        }


@dataclass
class QAResult:
    """Result for a single QA pair evaluation."""
    # The ground-truth question posed to the answering LLM.
    question: str
    # Expected answer from the ground-truth dataset.
    expected_answer: str
    # Answer produced by the LLM from the extracted document text.
    predicted_answer: str
    # True if the LLM judge deemed the predicted answer equivalent to the expected one.
    llm_judge_pass: bool


@dataclass
class QAEvalResult:
    """Results for QA evaluation on a single document."""
    # Path to the source document that was evaluated.
    file_path: Path
    # Number of ground-truth questions asked for this document.
    total_questions: int
    # Fraction (0.0-1.0) of questions the LLM judge marked as passing.
    llm_judge_pass_rate: float
    # Per-question results, in ground-truth order.
    qa_results: List[QAResult]
    # Latency stats for the answering-LLM calls, if tracked.
    llm_latency_metrics: Optional[LatencyMetrics] = None
    # Wall-clock time of the text extraction step, if measured.
    parse_latency_seconds: Optional[float] = None


class Benchmark:
    """Main benchmark runner for text extraction and QA evaluation.

    Orchestrates three providers: a parser (text extraction), an LLM that
    answers ground-truth questions from the extracted text, and a judge LLM
    that scores predicted answers against expected answers.
    """

    def __init__(
        self,
        parser_provider: Optional[ParserProvider] = None,
        llm_provider: Optional[LLMProvider] = None,
        llm_judge_provider: Optional[LLMProvider] = None,
    ):
        """
        Initialize the benchmark.

        Args:
            parser_provider: Parser provider to use for text extraction
            llm_provider: LLM provider to use for answering questions
            llm_judge_provider: LLM provider for judge-based evaluation
        """
        self.parser_provider = parser_provider
        self.llm_provider = llm_provider
        self.llm_judge_provider = llm_judge_provider

    def run_qa_eval(
        self,
        extracted_text: str,
        doc_path: Path,
        ground_truth_path: Path,
        parse_latency: Optional[float] = None
    ) -> QAEvalResult:
        """
        Run QA evaluation on a single document.

        Args:
            extracted_text: Extracted text from the document
            doc_path: Path to the source document
            ground_truth_path: Path to the ground truth JSON file
            parse_latency: Time taken for text extraction in seconds

        Returns:
            QAEvalResult with evaluation metrics

        Raises:
            ValueError: If the LLM or judge provider is not configured.
        """
        if not self.llm_provider or not self.llm_judge_provider:
            raise ValueError("LLM provider and judge provider must be configured")

        # Load ground truth (expects a top-level "qa_pairs" list of
        # {"question", "answer"} objects).
        with open(ground_truth_path, "r") as f:
            ground_truth = json.load(f)

        # Get predicted answers with per-call latency tracking.
        predicted_answers = []
        llm_latency_metrics = LatencyMetrics()
        for qa_pair in ground_truth["qa_pairs"]:
            start_time = time.perf_counter()
            answer = self.llm_provider.answer_question(extracted_text, qa_pair["question"])
            latency = time.perf_counter() - start_time
            llm_latency_metrics.latencies.append(latency)
            predicted_answers.append(answer)

        # Evaluate with LLM judge; a judge failure counts as a fail rather
        # than aborting the whole document.
        qa_results = []
        judge_passes = 0

        for predicted, gt_pair in zip(predicted_answers, ground_truth["qa_pairs"]):
            question = gt_pair["question"]
            expected = gt_pair["answer"]

            try:
                llm_judge_pass = self.llm_judge_provider.evaluate_answer(question, expected, predicted)
            except Exception as e:
                print(f"  Warning: LLM judge evaluation failed: {e}")
                llm_judge_pass = False

            if llm_judge_pass:
                judge_passes += 1

            qa_results.append(QAResult(
                question=question,
                expected_answer=expected,
                predicted_answer=predicted,
                llm_judge_pass=llm_judge_pass,
            ))

        total = len(ground_truth["qa_pairs"])
        llm_judge_pass_rate = judge_passes / total if total > 0 else 0.0

        result = QAEvalResult(
            file_path=doc_path,
            total_questions=total,
            llm_judge_pass_rate=llm_judge_pass_rate,
            qa_results=qa_results,
            llm_latency_metrics=llm_latency_metrics,
            parse_latency_seconds=parse_latency,
        )
        return result

    def run_full_benchmark(
        self,
        data_dir: Path,
        ground_truth_dir: Path,
        output_path: Optional[Path] = None
    ) -> dict:
        """
        Run full benchmark across all documents using batch extraction.

        Ground truth JSON files are matched to PDFs in ``data_dir`` by file
        stem. Documents without a matching PDF are skipped with a warning.

        Args:
            data_dir: Directory containing input documents
            ground_truth_dir: Directory containing ground truth JSON files
            output_path: Optional path to save detailed results; the files
                actually written are ``<output_path>.json``,
                ``<output_path>_detailed.json`` and
                ``<output_path>_report.html``

        Returns:
            Dictionary with aggregated benchmark results (empty dict when no
            documents could be matched)
        """
        qa_results: list[QAEvalResult] = []
        extracted_texts: dict[str, str] = {}  # Store extracted text for each document

        # Find all ground truth files
        gt_files = sorted(ground_truth_dir.glob("*.json"))

        print(f"Running benchmark on {len(gt_files)} documents...")

        # Find all source documents and match them to ground truth files by stem
        source_docs = sorted(data_dir.glob("*.pdf"))
        doc_gt_pairs: list[tuple[Path, Path]] = []

        for gt_path in gt_files:
            source_doc = next((
                doc for doc in source_docs if doc.stem == gt_path.stem
            ), None)

            if not source_doc:
                print(f"  Warning: Could not find source document for {gt_path.name}")
                continue

            doc_gt_pairs.append((source_doc, gt_path))

        if not doc_gt_pairs:
            print("No documents found to process.")
            return {}

        # Extract text for every matched document up front
        parse_latency_per_doc: dict[Path, float] = {}
        if self.parser_provider:
            docs_to_extract = [doc for doc, _ in doc_gt_pairs]

            for doc_path in docs_to_extract:
                start_time = time.perf_counter()
                try:
                    parse_result = self.parser_provider.extract_text(doc_path)
                    total_time = time.perf_counter() - start_time

                    extracted_texts[str(doc_path)] = parse_result
                    parse_latency_per_doc[doc_path] = total_time
                except Exception as e:
                    print(f"  Error: extraction failed: {e}")
                    # Record an empty extraction so downstream QA evaluation
                    # still runs (and scores 0) instead of silently skipping.
                    extracted_texts[str(doc_path)] = ""

        # Run QA evaluation for each document
        for i, (source_doc, gt_path) in enumerate(doc_gt_pairs, 1):
            print(f"\n[{i}/{len(doc_gt_pairs)}] Evaluating: {gt_path.name}")

            extracted_text = extracted_texts.get(str(source_doc), "")
            parse_latency = parse_latency_per_doc.get(source_doc)

            # A failure on one document should not abort the whole run.
            try:
                qa_result = self.run_qa_eval(extracted_text, source_doc, gt_path, parse_latency)
                qa_results.append(qa_result)
                latency_str = ""
                if qa_result.llm_latency_metrics:
                    avg_lat = qa_result.llm_latency_metrics.average
                    latency_str = f" [avg LLM: {avg_lat:.2f}s]"

                print(f"  QA: LLM judge pass: {qa_result.llm_judge_pass_rate:.1%}{latency_str}")
            except Exception as e:
                print(f"  Error: QA evaluation failed: {e}")

        # Aggregate results
        aggregate = {}

        if qa_results:
            total_questions = sum(r.total_questions for r in qa_results)
            # Weighted by question count so each question counts equally overall.
            total_llm_judge_passes = sum(
                r.llm_judge_pass_rate * r.total_questions for r in qa_results
            )

            # Aggregate parse latency metrics
            parse_latencies = [r.parse_latency_seconds for r in qa_results if r.parse_latency_seconds is not None]
            parse_latency_metrics = LatencyMetrics(latencies=parse_latencies) if parse_latencies else None

            # Aggregate LLM latency metrics across all documents
            all_llm_latencies = []
            for r in qa_results:
                if r.llm_latency_metrics:
                    all_llm_latencies.extend(r.llm_latency_metrics.latencies)
            llm_latency_metrics = LatencyMetrics(latencies=all_llm_latencies) if all_llm_latencies else None

            aggregate["qa"] = {
                "total_documents": len(qa_results),
                "total_questions": total_questions,
                "overall_llm_judge_pass_rate": total_llm_judge_passes / total_questions if total_questions > 0 else 0.0,
                "per_document_results": [
                    {
                        "file": str(r.file_path),
                        "llm_judge_pass_rate": r.llm_judge_pass_rate,
                        "total_questions": r.total_questions,
                        "parse_latency_seconds": r.parse_latency_seconds,
                        "llm_latency_metrics": r.llm_latency_metrics.to_dict() if r.llm_latency_metrics else None
                    }
                    for r in qa_results
                ]
            }

            if parse_latency_metrics:
                aggregate["qa"]["parse_latency_metrics"] = parse_latency_metrics.to_dict()

            if llm_latency_metrics:
                aggregate["qa"]["llm_latency_metrics"] = llm_latency_metrics.to_dict()

        # Save results if requested
        if output_path:
            # Save aggregate results (note: ".json" is appended to the path)
            with open(f"{output_path}.json", "w") as f:
                json.dump(aggregate, f, indent=2)
            print(f"\nAggregate results saved to: {output_path}.json")

            # Save detailed results with extracted text for debugging
            detailed_output_path = output_path.parent / f"{output_path.stem}_detailed{output_path.suffix}"
            detailed_results = self._build_detailed_results(qa_results, extracted_texts)
            with open(f"{detailed_output_path}.json", "w") as f:
                json.dump(detailed_results, f, indent=2)
            print(f"Detailed results saved to: {detailed_output_path}.json")

            # Generate HTML report
            try:
                from liteparse_eval.report import HTMLReportGenerator

                html_report_path = output_path.parent / f"{output_path.stem}_report.html"
                generator = HTMLReportGenerator(
                    detailed_results=detailed_results,
                    ground_truth_dir=ground_truth_dir
                )
                generator.generate_report(html_report_path)
                print(f"HTML report saved to: {html_report_path}")
            except Exception as e:
                print(f"Warning: HTML report generation failed: {e}")
                # Don't fail the entire benchmark if HTML generation fails

        return aggregate

    def _build_detailed_results(
        self,
        qa_results: list[QAEvalResult],
        extracted_texts: dict[str, str]
    ) -> dict:
        """
        Build detailed results including extracted text and individual test results.

        Args:
            qa_results: List of QA evaluation results
            extracted_texts: Dictionary mapping file paths to extracted text

        Returns:
            Dictionary with detailed results for debugging
        """
        detailed = {"documents": []}

        # Create a mapping of file paths to results
        qa_map = {str(r.file_path): r for r in qa_results}

        # Combine results for each document
        all_files = set(qa_map.keys())

        for file_path in sorted(all_files):
            doc_result = {
                "file": file_path,
                "extracted_text": extracted_texts.get(file_path, "")
            }

            # Add QA evaluation details
            if file_path in qa_map:
                qa_result = qa_map[file_path]
                doc_result["qa_evaluation"] = {
                    "llm_judge_pass_rate": qa_result.llm_judge_pass_rate,
                    "total_questions": qa_result.total_questions,
                    "parse_latency_seconds": qa_result.parse_latency_seconds,
                    "llm_latency_metrics": qa_result.llm_latency_metrics.to_dict() if qa_result.llm_latency_metrics else None,
                    "qa_pairs": [
                        {
                            "question": qa.question,
                            "expected_answer": qa.expected_answer,
                            "predicted_answer": qa.predicted_answer,
                            "llm_judge_pass": qa.llm_judge_pass,
                        }
                        for qa in qa_result.qa_results
                    ]
                }

            detailed["documents"].append(doc_result)

        return detailed


def main():
    """Entry point of the benchmark framework.

    Parses CLI arguments, instantiates the selected parse/LLM providers,
    runs the full benchmark, and prints a summary of the QA results.

    Returns:
        0 on success (implicitly, via the final print path).
    """

    parser = argparse.ArgumentParser(
        description="Benchmark text extraction and LLM providers on document understanding tasks"
    )
    parser.add_argument(
        "--data-dir",
        type=Path,
        required=True,
        help="Directory containing source documents"
    )
    parser.add_argument(
        "--ground-truth-dir",
        type=Path,
        required=True,
        help="Directory containing ground truth JSON files"
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Path to save detailed benchmark results"
    )
    parser.add_argument(
        "--parse-provider",
        type=str,
        choices=["pymupdf", "pypdf", "markitdown", "liteparse"],
        default="liteparse",
        help="Parse provider to use for text extraction. (default: liteparse)"
    )
    parser.add_argument(
        "--llm-provider",
        type=str,
        choices=["anthropic"],
        default="anthropic",
        help="LLM provider to use. (default: anthropic)"
    )

    args = parser.parse_args()

    # Map choice names to provider classes and instantiate lazily so only the
    # selected provider is constructed. argparse's `choices` guarantees the
    # lookups below cannot fail, so no fallback error branch is needed.
    parse_provider_classes = {
        "pymupdf": PyMuPDFProvider,
        "pypdf": PyPDFProvider,
        "markitdown": MarkItDownProvider,
        "liteparse": LiteparseProvider,
    }
    parser_provider = parse_provider_classes[args.parse_provider]()

    llm_provider_classes = {
        "anthropic": AnthropicProvider,
    }
    llm_provider = llm_provider_classes[args.llm_provider]()

    # Use a separate (cheaper) model as the LLM judge
    llm_judge_provider = AnthropicProvider(model="claude-haiku-4-5-20251001")

    benchmark = Benchmark(
        parser_provider=parser_provider,
        llm_provider=llm_provider,
        llm_judge_provider=llm_judge_provider,
    )

    results = benchmark.run_full_benchmark(
        data_dir=args.data_dir,
        ground_truth_dir=args.ground_truth_dir,
        output_path=args.output
    )

    print("\n" + "="*60)
    print("BENCHMARK RESULTS")
    print("="*60)

    if "qa" in results:
        print("\nQA Evaluation:")
        print(f"  Overall LLM Judge Pass Rate: {results['qa']['overall_llm_judge_pass_rate']:.1%}")
        print(f"  Total Questions: {results['qa']['total_questions']}")

    return 0


if __name__ == "__main__":
    exit(main())


================================================
FILE: dataset_eval_utils/src/liteparse_eval/processing.py
================================================
"""
Process PDFs and images to create a structured dataset using Anthropic's Claude with vision.
"""

import base64
import json
import random
from pathlib import Path
from typing import List, Literal

from anthropic import Anthropic
from liteparse import LiteParse
from pydantic import BaseModel, Field


# Define the output schema using Pydantic-like structure
class QAPair(BaseModel):
    """One question/answer pair grounded in the content of a single page."""

    question: str = Field(..., description="A question that can only be answered using information from the page.")
    # Expected answer; a bare Field() means required but with no extra metadata.
    answer: str = Field()

class PageAnnotation(BaseModel):
    """Structured annotation for one document page, produced by Claude."""

    has_text: bool = Field(..., description="Whether the document contains readable text")
    document_type: Literal["academic_paper", "form", "invoice", "newspaper", "other"] = Field(
        ..., description="The type of document"
    )
    layout_complexity: Literal["simple", "multi_column", "complex"] = Field(
        ..., description="The complexity of the document layout"
    )
    # NOTE(review): `example=` is not a documented pydantic v2 Field kwarg
    # (v2 uses `examples=`); confirm this still ends up in the JSON schema.
    qa_pairs: List[QAPair] = Field(
        ...,
        description="Question-answer pairs about the document",
        example=[{"question": "What is the main topic?", "answer": "Sample Answer"}]
    )


def pdf_to_images(pdf_path: Path, dpi: int = 150) -> List[Path]:
    """
    Render every page of a PDF to an image file using liteparse.

    Args:
        pdf_path: Path to the PDF file
        dpi: Rendering resolution (default: 150)

    Returns:
        Paths of the generated images, one per page
    """
    screenshots = LiteParse().screenshot(pdf_path, dpi=dpi).screenshots
    return [Path(shot.image_path) for shot in screenshots]

    
def encode_image(image_path: Path) -> tuple[str, str]:
    """
    Read an image file, base64-encode it, and infer its MIME media type.

    Args:
        image_path: Path to the image file

    Returns:
        Tuple of (base64_encoded_data, media_type); file extensions outside
        the known set fall back to "image/jpeg"
    """
    encoded = base64.standard_b64encode(image_path.read_bytes()).decode("utf-8")

    # Media type is derived purely from the (case-insensitive) file extension.
    known_media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp"
    }
    media_type = known_media_types.get(image_path.suffix.lower(), "image/jpeg")

    return encoded, media_type


def analyze_image_with_claude(
    client: Anthropic,
    image_path: Path,
    model: str = "claude-sonnet-4-5-20250929"
) -> PageAnnotation | None:
    """
    Ask Claude to produce a structured PageAnnotation for a document image.

    Args:
        client: Anthropic client
        image_path: Path to the image to analyze
        model: Model to use for analysis

    Returns:
        Parsed PageAnnotation, or None when no structured output was returned
    """
    image_data, media_type = encode_image(image_path)

    prompt = """Analyze this document image and provide a structured analysis.

Extract:
1. Whether it contains readable text
2. The document type (academic_paper, form, invoice, newspaper, or other)
3. Layout complexity (simple, multi_column, or complex)
4. Generate 3-5 question-answer pairs about the document content

Be thorough and accurate in your analysis. This data will be used in a document parsing benchmark (LLM-as-a-judge on QA responses), so extracted data should be interesting, diverse, and sometimes challenging."""

    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": media_type,
            "data": image_data,
        },
    }
    text_block = {"type": "text", "text": prompt}

    # The beta .parse() endpoint accepts a Pydantic model as the output format.
    response = client.beta.messages.parse(
        model=model,
        max_tokens=8192,
        betas=["structured-outputs-2025-11-13"],
        messages=[{"role": "user", "content": [image_block, text_block]}],
        output_format=PageAnnotation,
    )

    return response.parsed_output


def process_file(
    client: Anthropic,
    file_path: Path,
    output_dir: Path,
    model: str = "claude-sonnet-4-5-20250929"
) -> None:
    """
    Analyze one document (PDF or image) and write one JSON file per page.

    Args:
        client: Anthropic client
        file_path: Path to the file to process
        output_dir: Directory to save output JSON files
        model: Model to use for analysis
    """
    print(f"Processing: {file_path}")

    # PDFs are rendered page-by-page; anything else is treated as one image.
    if file_path.suffix.lower() == ".pdf":
        try:
            image_paths = pdf_to_images(file_path)
        except NotImplementedError:
            print(f"  Skipping PDF (conversion not implemented): {file_path}")
            return
    else:
        image_paths = [file_path]

    multi_page = len(image_paths) > 1
    for page_num, image_path in enumerate(image_paths, start=1):
        try:
            print(f"  Analyzing page {page_num}/{len(image_paths)}...")
            result = analyze_image_with_claude(client, image_path, model)

            if result is None:
                print(f"  ✗ No results for page {page_num} in {file_path}")
                continue

            # Only multi-page PDFs get a zero-padded page suffix.
            if multi_page:
                output_filename = f"{file_path.stem}_page_{page_num:03d}.json"
            else:
                output_filename = f"{file_path.stem}.json"
            output_path = output_dir / output_filename

            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(result.model_dump(), f, indent=2, ensure_ascii=False)

            print(f"  ✓ Saved: {output_path}")

        except Exception as e:
            # A failed page should not abort the remaining pages.
            print(f"  ✗ Error processing page {page_num}: {e}")


def find_documents(input_dir: Path) -> List[Path]:
    """
    Find all PDF and image files in a directory (recursively).

    Extension matching is case-insensitive, so mixed-case names such as
    "scan.Pdf" are found too (the previous lower/upper double-glob missed
    them and walked the tree once per extension).

    Args:
        input_dir: Directory to search

    Returns:
        Sorted list of unique paths to document files
    """
    supported_extensions = {".pdf", ".jpg", ".jpeg", ".png", ".gif", ".webp"}

    # Single recursive walk; dedupe via a set, return in deterministic order.
    documents = {
        path for path in input_dir.rglob("*")
        if path.suffix.lower() in supported_extensions
    }
    return sorted(documents)


def main():
    """Main entry point.

    Parses CLI arguments, discovers documents, samples a subset, and runs
    each one through Claude, saving per-page JSON annotations.

    Returns:
        Process exit code: 0 on success, 1 on missing input or no documents.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Process PDFs and images to create a structured dataset"
    )
    parser.add_argument(
        "input_dir",
        type=Path,
        help="Directory containing PDFs and/or images to process"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("output"),
        help="Directory to save output JSON files (default: ./output)"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="claude-sonnet-4-5-20250929",
        help="Claude model to use (default: claude-sonnet-4-5-20250929)"
    )
    parser.add_argument(
        "--api-key",
        type=str,
        help="Anthropic API key (or set ANTHROPIC_API_KEY environment variable)"
    )
    parser.add_argument(
        "--sample-size",
        type=int,
        default=50,
        help="Maximum number of documents to sample for processing (default: 50)"
    )

    args = parser.parse_args()

    # Validate input directory
    if not args.input_dir.exists():
        print(f"Error: Input directory does not exist: {args.input_dir}")
        return 1

    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Initialize Anthropic client
    client = Anthropic(api_key=args.api_key) if args.api_key else Anthropic()

    # Find all documents
    documents = find_documents(args.input_dir)

    if not documents:
        print(f"No documents found in {args.input_dir}")
        return 1

    # Bug fix: random.sample raises ValueError when the population is smaller
    # than the requested sample, so cap the sample at the number of documents.
    sample_size = min(args.sample_size, len(documents))
    documents = random.sample(documents, sample_size)
    print(f"Found {len(documents)} document(s) to process\n")

    # Process each document
    for i, doc_path in enumerate(documents, start=1):
        print(f"\n[{i}/{len(documents)}]")
        process_file(client, doc_path, args.output_dir, args.model)

    print(f"\n✓ Complete! Results saved to: {args.output_dir}")
    return 0


if __name__ == "__main__":
    exit(main())


================================================
FILE: dataset_eval_utils/src/liteparse_eval/providers/__init__.py
================================================
# Re-export the LLM and parser provider interfaces and implementations so
# callers can import everything from `liteparse_eval.providers` directly.
from .llm import LLMProvider, AnthropicProvider, QA_PROMPT
from .parsers import (
    ParserProvider,
    LiteparseProvider,
    MarkItDownProvider,
    PyMuPDFProvider,
    PyPDFProvider,
)

# Public API of this package.
__all__ = [
    "LLMProvider",
    "AnthropicProvider",
    "QA_PROMPT",
    "ParserProvider",
    "LiteparseProvider",
    "MarkItDownProvider",
    "PyMuPDFProvider",
    "PyPDFProvider",
]


================================================
FILE: dataset_eval_utils/src/liteparse_eval/providers/llm/__init__.py
================================================
# Re-export the LLM provider interface, its Anthropic implementation, and the
# shared QA prompt template.
from .base import LLMProvider, QA_PROMPT
from .anthropic import AnthropicProvider

# Public API of this subpackage.
__all__ = ["LLMProvider", "AnthropicProvider", "QA_PROMPT"]


================================================
FILE: dataset_eval_utils/src/liteparse_eval/providers/llm/anthropic.py
================================================
from anthropic import Anthropic

from .base import LLMProvider, QA_PROMPT, JUDGE_PROMPT


class AnthropicProvider(LLMProvider):
    """
    LLM provider using Anthropic for QA.

    Answers questions over OCR-extracted document text and judges predicted
    answers against expected ones using the same client.
    """

    def __init__(self, api_key: str = None, model: str = "claude-sonnet-4-5-20250929"):
        """
        Initialize Anthropic QA provider.

        Args:
            api_key: Anthropic API key (or use ANTHROPIC_API_KEY env var)
            model: Claude model to use
        """
        # NOTE(review): the SDK's timeout is in seconds, so 10000 is ~2.8
        # hours — confirm this is intentional rather than milliseconds.
        self.client = Anthropic(api_key=api_key, max_retries=100, timeout=10000)
        self.model = model

    def answer_question(self, ocr_text: str, question: str) -> str:
        """Answer a question using the OCR-extracted text of a document."""
        user_message = {
            "role": "user",
            "content": QA_PROMPT.format(ocr_text=ocr_text, question=question),
        }
        response = self.client.messages.create(
            model=self.model,
            max_tokens=1024,
            messages=[user_message],
            timeout=10000,
        )

        if not response.content or len(response.content) == 0:
            raise ValueError("No content returned from Anthropic response")

        return response.content[0].text

    def evaluate_answer(self, question: str, expected_answer: str, predicted_answer: str) -> bool:
        """
        Judge whether the predicted answer is semantically equivalent to the
        expected answer, using an LLM judge.

        Returns True when the judge emits a <pass> tag and no <fail> tag; an
        empty judge response is treated leniently as a pass.
        """
        judge_prompt = JUDGE_PROMPT.format(
            question=question,
            expected_answer=expected_answer,
            predicted_answer=predicted_answer,
        )

        response = self.client.messages.create(
            model=self.model,
            max_tokens=256,
            messages=[{"role": "user", "content": judge_prompt}],
            timeout=10000,
        )

        if not response.content or len(response.content) == 0:
            # Judge gave no output: be lenient and count the answer as correct.
            return True

        # The judge responds with "<pass>...</pass>" or "<fail>...</fail>".
        verdict = response.content[0].text.strip().lower()
        return "<pass" in verdict and "<fail" not in verdict


================================================
FILE: dataset_eval_utils/src/liteparse_eval/providers/llm/base.py
================================================
from abc import ABC, abstractmethod
from pathlib import Path

# Prompt used to answer a question from OCR-extracted document text.
# Typo fixes: "accurate and possible" -> "accurate as possible",
# "response with" -> "respond with".
QA_PROMPT = "<document>{ocr_text}</document>\n\nAnswer the following question about the document. Be as concise and accurate as possible, pulling from the exact text. If the document does not contain the answer, respond with 'not found'.\n\nQuestion: {question}"

# Prompt for the LLM judge that compares expected vs. predicted answers; it
# must reply with a <pass>...</pass> or <fail>...</fail> tag.
JUDGE_PROMPT = """You are evaluating whether two answers to a question are semantically equivalent.

<question>{question}</question>

<expected_answer>{expected_answer}</expected_answer>

<predicted_answer>{predicted_answer}</predicted_answer>

Do these answers convey the same information? Consider:
1. Are the answers semantically equivalent, even if wording differs?
2. Is the meaning preserved even if wording differs?
3. Some answers rely on the question for context, so ensure that the predicted answer is correct in the context of the question, even if it is not verbatim the same as the expected answer.
4. If the predicted answer says "not found" or similar, it should only pass if the expected answer also indicates the information is not present.

Respond with "<pass>{{short explanation}}</pass>" if the answers are semantically equivalent, or "<fail>{{short explanation}}</fail>" if they are not. The explanation should be concise, ideally one sentence, and should justify the pass/fail decision based on the criteria above."""


class LLMProvider(ABC):
    """Abstract base class for LLM providers used in the QA benchmark."""

    @abstractmethod
    def answer_question(self, ocr_text: str, question: str) -> str:
        """
        Answer a question about a document given its extracted text.

        (Previously declared as `image_path: Path`, but implementations and
        QA_PROMPT take the OCR text string — signature fixed to match.)

        Args:
            ocr_text: Text extracted from the document
            question: Question to answer

        Returns:
            Answer as a string
        """
        pass

    @abstractmethod
    def evaluate_answer(self, question: str, expected_answer: str, predicted_answer: str) -> bool:
        """
        Evaluate whether the predicted answer is correct compared to the expected answer.

        Args:
            question: The question both answers respond to
            expected_answer: The ground truth answer
            predicted_answer: The answer generated by the LLM

        Returns:
            True if the predicted answer is correct, False otherwise
        """
        pass


================================================
FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/__init__.py
================================================
# Re-export the parser provider interface and all concrete implementations.
from .base import ParserProvider
from .liteparse import LiteparseProvider
from .markitdown import MarkItDownProvider
from .pymupdf import PyMuPDFProvider
from .pypdf import PyPDFProvider

# Public API of this subpackage.
__all__ = [
    "ParserProvider",
    "LiteparseProvider",
    "MarkItDownProvider",
    "PyMuPDFProvider",
    "PyPDFProvider",
]


================================================
FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/base.py
================================================
from abc import ABC, abstractmethod
from pathlib import Path


class ParserProvider(ABC):
    """Abstract base class for text extraction/parsing providers."""

    @abstractmethod
    def extract_text(self, file_path: Path) -> str:
        """
        Extract the full text of one document.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text as a single string
        """
        pass

    def extract_text_batch(self, file_paths: list[Path]) -> dict[Path, str]:
        """
        Extract text from multiple documents in batch.

        Providers with native batch support should override this; the default
        simply calls extract_text once per file, in input order.

        Args:
            file_paths: List of paths to document files

        Returns:
            Dictionary mapping each file path to its extracted text
        """
        return {path: self.extract_text(path) for path in file_paths}


================================================
FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/liteparse.py
================================================
from pathlib import Path
from typing import Optional

from liteparse import LiteParse

from .base import ParserProvider


class LiteparseProvider(ParserProvider):
    """
    Parser provider using the liteparse Python wrapper.

    This provider uses the liteparse library for PDF text extraction.
    """

    def __init__(
        self,
        ocr_enabled: bool = False,
        ocr_server_url: Optional[str] = None,
        ocr_language: str = "en",
        max_pages: int = 1000,
        dpi: int = 150,
        precise_bounding_box: bool = True,
        skip_diagonal_text: bool = False,
        preserve_very_small_text: bool = False,
        cli_path: Optional[str] = None,
    ):
        """
        Initialize the liteparse provider.

        Args:
            ocr_enabled: Whether to enable OCR for scanned documents
            ocr_server_url: URL of HTTP OCR server (uses Tesseract if not provided)
            ocr_language: Language code for OCR (e.g., "en", "fr", "de")
            max_pages: Maximum number of pages to parse
            dpi: DPI for rendering (affects OCR quality)
            precise_bounding_box: Whether to compute precise bounding boxes
            skip_diagonal_text: Whether to skip diagonal text
            preserve_very_small_text: Whether to preserve very small text
            cli_path: Custom path to liteparse CLI (auto-detected if not provided)
        """
        self.parser = LiteParse(cli_path=cli_path)
        # Parse options, forwarded verbatim on every extract_text call.
        self.ocr_enabled = ocr_enabled
        self.ocr_server_url = ocr_server_url
        self.ocr_language = ocr_language
        self.max_pages = max_pages
        self.dpi = dpi
        self.precise_bounding_box = precise_bounding_box
        self.skip_diagonal_text = skip_diagonal_text
        self.preserve_very_small_text = preserve_very_small_text

    def extract_text(self, file_path: Path) -> str:
        """Extract text from a document using liteparse."""
        parse_options = dict(
            ocr_enabled=self.ocr_enabled,
            ocr_server_url=self.ocr_server_url,
            ocr_language=self.ocr_language,
            max_pages=self.max_pages,
            dpi=self.dpi,
            precise_bounding_box=self.precise_bounding_box,
            skip_diagonal_text=self.skip_diagonal_text,
            preserve_very_small_text=self.preserve_very_small_text,
        )
        return self.parser.parse(file_path, **parse_options).text


================================================
FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/markitdown.py
================================================
from pathlib import Path

from markitdown import MarkItDown

from .base import ParserProvider


class MarkItDownProvider(ParserProvider):
    """
    Parse provider using MarkItDown.

    Install with: pip install markitdown
    """

    def __init__(self, config: dict | None = None):
        """
        Initialize the parse provider.

        Args:
            config: Keyword arguments forwarded to the MarkItDown constructor
        """
        self.config = config or {}
        # The converter is built once and reused for every extract_text call.
        self.markitdown = MarkItDown(**self.config)

    def extract_text(self, file_path: Path) -> str:
        """Extract text from a document using MarkItDown."""
        conversion = self.markitdown.convert(str(file_path))
        return conversion.text_content


================================================
FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/pymupdf.py
================================================
from pathlib import Path

import fitz  # PyMuPDF

from .base import ParserProvider


class PyMuPDFProvider(ParserProvider):
    """
    Parse provider using PyMuPDF.

    Install with: pip install pymupdf
    """

    def __init__(self):
        """Initialize the parse provider."""
        pass

    def extract_text(self, file_path: Path) -> str:
        """
        Extract text from a document using PyMuPDF.

        Args:
            file_path: Path to the document

        Returns:
            Per-page text joined with blank lines
        """
        # Use a context manager so the document handle is closed
        # deterministically (the previous version leaked it until GC).
        with fitz.open(str(file_path)) as doc:
            return "\n\n".join(page.get_text() for page in doc)


================================================
FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/pypdf.py
================================================
from pathlib import Path

import pypdf

from .base import ParserProvider


class PyPDFProvider(ParserProvider):
    """
    Parse provider using PyPDF.

    Install with: pip install pypdf
    """

    def __init__(self, config: dict | None = None):
        """
        Initialize the parse provider.

        Args:
            config: Configuration dict parameters for PyPDF
        """
        self.config = config or {}

    def extract_text(self, file_path: Path) -> str:
        """Extract text from a document using PyPDF."""
        reader = pypdf.PdfReader(str(file_path), **self.config)
        pages_text = [page.extract_text() for page in reader.pages]
        return "\n\n".join(pages_text)


================================================
FILE: dataset_eval_utils/src/liteparse_eval/report.py
================================================
"""HTML report generation for text extraction evaluation results."""

import base64
import html
import io
from datetime import datetime
from pathlib import Path

import fitz  # PyMuPDF

try:
    from PIL import Image
    HAS_PIL = True
except ImportError:
    HAS_PIL = False


class HTMLReportGenerator:
    """Generate HTML reports from text extraction evaluation results."""

    def __init__(self, detailed_results: dict, ground_truth_dir: Path):
        """
        Initialize the HTML report generator.

        Args:
            detailed_results: Dictionary containing detailed evaluation results
                (expects a "documents" list; see the benchmark's detailed output)
            ground_truth_dir: Path to the directory containing ground truth JSON files
        """
        self.detailed_results = detailed_results
        self.ground_truth_dir = ground_truth_dir
        # A missing "documents" key degrades gracefully to an empty report.
        self.documents = detailed_results.get("documents", [])

    def generate_report(self, output_path: Path) -> None:
        """
        Render the full HTML report and write it to disk as UTF-8.

        Args:
            output_path: Path where the HTML report will be saved
        """
        report_html = self._build_html()
        output_path.write_text(report_html, encoding="utf-8")

    def _build_html(self) -> str:
        """Build the complete HTML document."""
        css = self._generate_css()
        summary = self._generate_summary_html()
        navigation = self._generate_navigation_html()
        documents_html = self._generate_all_documents_html()

        return f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Parsing Evaluation Report - {datetime.now().strftime('%Y-%m-%d %H:%M')}</title>
    <style>
{css}
    </style>
</head>
<body>
    {navigation}
    {summary}
    {documents_html}
</body>
</html>"""

    def _generate_css(self) -> str:
        """Generate CSS styles for the report.

        Status colors used throughout: green (#22c55e) = good,
        amber (#f59e0b) = warning, red (#ef4444) = poor.
        """
        # NOTE: the body below is a runtime string literal embedded in the
        # report <style> tag; edit with care.
        return """        body {
            font-family: system-ui, -apple-system, sans-serif;
            margin: 0;
            padding: 0;
            background: #f5f5f5;
        }
        .container {
            max-width: 1400px;
            margin: 0 auto;
            padding: 20px;
        }
        nav {
            position: sticky;
            top: 0;
            background: #fff;
            padding: 15px 20px;
            border-bottom: 2px solid #ddd;
            z-index: 100;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        nav h2 {
            margin: 0 0 10px 0;
            font-size: 1.25em;
        }
        .nav-list {
            display: grid;
            grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
            gap: 10px;
            max-height: 0;
            overflow: hidden;
            transition: max-height 0.3s ease;
        }
        .nav-list.expanded {
            max-height: 500px;
            overflow-y: auto;
        }
        .toggle-nav {
            background: #3b82f6;
            color: white;
            border: none;
            padding: 8px 16px;
            border-radius: 4px;
            cursor: pointer;
            font-size: 0.9em;
        }
        .toggle-nav:hover {
            background: #2563eb;
        }
        .nav-item {
            padding: 8px 12px;
            background: #f9fafb;
            border-radius: 4px;
            text-decoration: none;
            color: #1f2937;
            display: flex;
            justify-content: space-between;
            align-items: center;
            border-left: 4px solid #d1d5db;
        }
        .nav-item:hover {
            background: #e5e7eb;
        }
        .nav-item.good {
            border-left-color: #22c55e;
        }
        .nav-item.warning {
            border-left-color: #f59e0b;
        }
        .nav-item.poor {
            border-left-color: #ef4444;
        }
        .nav-item-metrics {
            font-size: 0.85em;
            color: #6b7280;
        }
        .summary {
            background: #fff;
            padding: 30px;
            margin: 20px 0;
            border-radius: 8px;
            box-shadow: 0 1px 3px rgba(0,0,0,0.1);
        }
        .summary h1 {
            margin: 0 0 20px 0;
            color: #1f2937;
        }
        .metrics {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin: 20px 0;
        }
        .metric {
            background: #f9fafb;
            border: 1px solid #e5e7eb;
            padding: 20px;
            border-radius: 8px;
        }
        .metric-label {
            font-size: 0.9em;
            color: #6b7280;
            margin-bottom: 8px;
        }
        .metric-value {
            font-size: 2em;
            font-weight: bold;
            margin-bottom: 5px;
        }
        .metric-value.good {
            color: #22c55e;
        }
        .metric-value.warning {
            color: #f59e0b;
        }
        .metric-value.poor {
            color: #ef4444;
        }
        .document {
            background: #fff;
            border: 1px solid #e5e7eb;
            margin: 20px 0;
            padding: 30px;
            border-radius: 8px;
            box-shadow: 0 1px 3px rgba(0,0,0,0.1);
        }
        .document-header {
            display: flex;
            justify-content: space-between;
            align-items: flex-start;
            margin-bottom: 20px;
            padding-bottom: 15px;
            border-bottom: 2px solid #e5e7eb;
        }
        .document-title {
            font-size: 1.5em;
            font-weight: bold;
            color: #1f2937;
            word-break: break-all;
        }
        .edit-link {
            display: inline-block;
            padding: 8px 16px;
            background: #3b82f6;
            color: white;
            text-decoration: none;
            border-radius: 4px;
            font-size: 0.9em;
            white-space: nowrap;
        }
        .edit-link:hover {
            background: #2563eb;
        }
        .section {
            margin: 30px 0;
        }
        .section-title {
            font-size: 1.25em;
            font-weight: bold;
            margin: 0 0 15px 0;
            color: #374151;
        }
        .pdf-preview {
            max-width: 100%;
            border: 1px solid #d1d5db;
            border-radius: 4px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        .pdf-error {
            background: #fee2e2;
            border: 1px solid #fecaca;
            padding: 20px;
            border-radius: 4px;
            color: #991b1b;
        }
        .doc-text {
            background: #f9fafb;
            border: 1px solid #e5e7eb;
            padding: 20px;
            border-radius: 4px;
            white-space: pre-wrap;
            font-family: 'Courier New', monospace;
            font-size: 0.9em;
            max-height: 400px;
            overflow-y: auto;
            line-height: 1.5;
        }
        .qa-pairs {
            display: flex;
            flex-direction: column;
            gap: 15px;
        }
        .qa-item {
            padding: 20px;
            background: #f9fafb;
            border-radius: 4px;
            border-left: 4px solid;
        }
        .qa-item.pass {
            border-left-color: #22c55e;
        }
        .qa-item.fail {
            border-left-color: #ef4444;
        }
        .qa-question {
            font-weight: bold;
            margin-bottom: 12px;
            color: #1f2937;
        }
        .qa-answer {
            margin: 8px 0;
            padding: 10px;
            background: white;
            border-radius: 4px;
        }
        .qa-label {
            font-size: 0.85em;
            color: #6b7280;
            margin-bottom: 4px;
        }
        .qa-score {
            font-weight: bold;
            font-size: 1.1em;
            margin-top: 10px;
        }
        .score-badge {
            display: inline-block;
            padding: 4px 8px;
            border-radius: 4px;
            font-size: 0.85em;
            font-weight: bold;
        }
        .score-badge.good {
            background: #dcfce7;
            color: #166534;
        }
        .score-badge.warning {
            background: #fef3c7;
            color: #92400e;
        }
        .score-badge.poor {
            background: #fee2e2;
            color: #991b1b;
        }"""

    def _generate_summary_html(self) -> str:
        """Generate aggregate metrics summary section."""
        total_docs = len(self.documents)

        # Calculate aggregate QA metrics
        total_llm_judge_pass = 0
        qa_count = 0
        parse_latencies = []
        all_llm_latencies = []

        for doc in self.documents:
            qa_eval = doc.get("qa_evaluation", {})
            if qa_eval:
                llm_judge_rate = qa_eval.get("llm_judge_pass_rate", 0)
                total_llm_judge_pass += llm_judge_rate
                qa_count += 1

                parse_lat = qa_eval.get("parse_latency_seconds")
                if parse_lat is not None:
                    parse_latencies.append(parse_lat)

                llm_metrics = qa_eval.get("llm_latency_metrics")
                if llm_metrics:
                    all_llm_latencies.extend(llm_metrics.get("individual_latencies", []))

        avg_llm_judge_pass = total_llm_judge_pass / qa_count if qa_count > 0 else 0
        judge_class = self._get_metric_class(avg_llm_judge_pass)

        # Generate latency metrics HTML
        latency_metrics_html = ""
        if parse_latencies:
            avg_parse_lat = sum(parse_latencies) / len(parse_latencies)
            latency_metrics_html += f"""                <div class="metric">
                    <div class="metric-label">Parse Latency (Avg)</div>
                    <div class="metric-value">{avg_parse_lat:.2f}s</div>
                    <div class="metric-label">min: {min(parse_latencies):.2f}s, max: {max(parse_latencies):.2f}s</div>
                </div>"""

        if all_llm_latencies:
            avg_llm_lat = sum(all_llm_latencies) / len(all_llm_latencies)
            latency_metrics_html += f"""                <div class="metric">
                    <div class="metric-label">LLM Latency (Avg per call)</div>
                    <div class="metric-value">{avg_llm_lat:.2f}s</div>
                    <div class="metric-label">min: {min(all_llm_latencies):.2f}s, max: {max(all_llm_latencies):.2f}s</div>
                </div>"""

        return f"""    <div class="container">
        <div class="summary">
            <h1>QA Evaluation Report</h1>
            <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            <div class="metrics">
                <div class="metric">
                    <div class="metric-label">Total Documents</div>
                    <div class="metric-value">{total_docs}</div>
                </div>
                <div class="metric">
                    <div class="metric-label">LLM Judge Pass Rate</div>
                    <div class="metric-value {judge_class}">{avg_llm_judge_pass:.1%}</div>
                    <div class="metric-label">Across {qa_count} documents</div>
                </div>
{latency_metrics_html}
            </div>
        </div>
    </div>"""

    def _generate_navigation_html(self) -> str:
        """Generate navigation with anchors to each document."""
        nav_items = []
        for idx, doc in enumerate(self.documents):
            filename = Path(doc["file"]).name
            qa_eval = doc.get("qa_evaluation", {})

            judge_pass = qa_eval.get("llm_judge_pass_rate", 0)
            nav_class = self._get_metric_class(judge_pass)

            metrics_text = f"Judge Pass: {judge_pass:.0%}"

            nav_items.append(
                f'            <a href="#doc-{idx}" class="nav-item {nav_class}">'
                f'<span>{html.escape(filename)}</span>'
                f'<span class="nav-item-metrics">{metrics_text}</span>'
                f'</a>'
            )

        nav_list = "\n".join(nav_items)

        return f"""    <nav>
        <div class="container">
            <h2>Document Navigation ({len(self.documents)} documents)</h2>
            <button class="toggle-nav" onclick="this.nextElementSibling.classList.toggle('expanded')">
                Toggle Navigation
            </button>
            <div class="nav-list">
{nav_list}
            </div>
        </div>
    </nav>"""

    def _generate_all_documents_html(self) -> str:
        """Generate HTML for all documents."""
        docs_html = []
        for idx, doc in enumerate(self.documents):
            docs_html.append(self._generate_document_html(doc, idx))

        return f"""    <div class="container">
{chr(10).join(docs_html)}
    </div>"""

    def _generate_document_html(self, doc: dict, index: int) -> str:
        """Generate HTML for a single document with all of its evaluations.

        Args:
            doc: Result record for one document. Must contain a "file" path;
                may contain "extracted_text" and "qa_evaluation".
            index: Zero-based position, used for the section anchor id
                (``doc-{index}``) that the navigation links target.

        Returns:
            HTML fragment with the document header, PDF preview, extracted
            text, and QA evaluation sections.
        """
        filename = Path(doc["file"]).name
        pdf_path = Path(doc["file"])

        # Link to the ground-truth JSON in VS Code, when that file exists.
        vscode_link = self._generate_vscode_link(doc["file"])
        edit_button = f'<a href="{vscode_link}" class="edit-link">Edit Ground Truth</a>' if vscode_link else ''

        # Inline base64 preview image (or an error box on failure).
        pdf_preview_html = self._generate_pdf_preview_html(pdf_path)

        # Default to "" — html.escape(None) raises TypeError when the parser
        # produced no "extracted_text" for this document.
        extracted_text = doc.get("extracted_text") or ""
        extracted_text_html = f'<pre class="doc-text">{html.escape(extracted_text)}</pre>'

        # Generate QA results
        qa_html = self._generate_qa_html(doc.get("qa_evaluation", {}))

        return f"""        <div class="document" id="doc-{index}">
            <div class="document-header">
                <div class="document-title">{html.escape(filename)}</div>
                {edit_button}
            </div>

            <div class="section">
                <h3 class="section-title">PDF Preview</h3>
                {pdf_preview_html}
            </div>

            <div class="section">
                <h3 class="section-title">Extracted Text</h3>
                {extracted_text_html}
            </div>

            <div class="section">
                <h3 class="section-title">QA Evaluation</h3>
                {qa_html}
            </div>
        </div>"""

    def _generate_pdf_preview_html(self, pdf_path: Path) -> str:
        """Render an inline PDF preview image, or an error box if rendering fails."""
        try:
            data_url = self._pdf_to_base64_image(pdf_path)
        except Exception as exc:
            return f'<div class="pdf-error">Failed to load PDF preview: {html.escape(str(exc))}</div>'
        return f'<img src="{data_url}" alt="PDF Preview" class="pdf-preview">'

    def _generate_qa_html(self, qa_eval: dict) -> str:
        """Generate HTML for QA evaluation results.

        Args:
            qa_eval: The "qa_evaluation" record for one document. Read keys:
                "llm_judge_pass_rate", "total_questions",
                "parse_latency_seconds", "llm_latency_metrics", "qa_pairs".
                An empty dict yields a placeholder paragraph.

        Returns:
            HTML fragment: a summary line (pass-rate badge plus optional
            latency stats), followed by one styled card per QA pair.
        """
        if not qa_eval:
            return '<p>No QA evaluation data available.</p>'

        llm_judge_pass_rate = qa_eval.get("llm_judge_pass_rate", 0)
        total_questions = qa_eval.get("total_questions", 0)
        parse_latency = qa_eval.get("parse_latency_seconds")
        llm_metrics = qa_eval.get("llm_latency_metrics")

        # Badge color class ('good'/'warning'/'poor') for the overall rate.
        judge_class = self._get_metric_class(llm_judge_pass_rate)

        # Latency lines are optional; only render what was actually recorded.
        latency_text = ""
        if parse_latency is not None:
            latency_text += f'<p><strong>Parse Latency:</strong> {parse_latency:.2f}s</p>'

        if llm_metrics:
            avg_lat = llm_metrics.get("average_seconds", 0)
            min_lat = llm_metrics.get("min_seconds", 0)
            max_lat = llm_metrics.get("max_seconds", 0)
            stddev_lat = llm_metrics.get("stddev_seconds", 0)
            latency_text += f'<p><strong>LLM Latency:</strong> avg: {avg_lat:.2f}s, min: {min_lat:.2f}s, max: {max_lat:.2f}s, stddev: {stddev_lat:.2f}s</p>'

        summary = f'<p><strong>LLM Judge Pass Rate:</strong> <span class="score-badge {judge_class}">{llm_judge_pass_rate:.1%}</span> (across {total_questions} questions)</p>{latency_text}'

        qa_pairs = qa_eval.get("qa_pairs", [])
        if not qa_pairs:
            return summary + '<p>No QA pairs to display.</p>'

        # One card per question; left border and badge reflect pass/fail.
        qa_items = []
        for qa in qa_pairs:
            llm_judge_pass = qa.get("llm_judge_pass", False)

            item_class = "pass" if llm_judge_pass else "fail"
            badge_class = "good" if llm_judge_pass else "poor"
            badge_text = "PASS" if llm_judge_pass else "FAIL"

            # Escape question/answer text so it renders literally in the report.
            question = html.escape(qa.get("question", ""))
            expected = html.escape(qa.get("expected_answer", ""))
            predicted = html.escape(qa.get("predicted_answer", ""))

            qa_items.append(
                f'                <div class="qa-item {item_class}">'
                f'<div class="qa-question">Q: {question}</div>'
                f'<div class="qa-answer">'
                f'<div class="qa-label">Expected Answer:</div>'
                f'{expected}'
                f'</div>'
                f'<div class="qa-answer">'
                f'<div class="qa-label">Predicted Answer:</div>'
                f'{predicted}'
                f'</div>'
                f'<div class="qa-score">'
                f'<span class="score-badge {badge_class}">LLM Judge: {badge_text}</span>'
                f'</div>'
                f'</div>'
            )

        qa_html = "\n".join(qa_items)

        return f"""{summary}
            <div class="qa-pairs">
{qa_html}
            </div>"""

    def _pdf_to_base64_image(self, pdf_path: Path, dpi: int = 72) -> str:
        """
        Render the first page of a PDF as a base64-encoded data URL.

        JPEG (quality 85) is used when PIL is available; otherwise the raw
        PNG bytes from PyMuPDF are encoded directly.

        Args:
            pdf_path: Path to the PDF file
            dpi: Resolution for rendering (default 72)

        Returns:
            Base64-encoded image as data URL

        Raises:
            Exception: If PDF cannot be opened or converted
        """
        document = fitz.open(str(pdf_path))
        try:
            first_page = document[0]  # First page only

            # Scale factor relative to PDF's native 72 dpi.
            zoom = fitz.Matrix(dpi / 72, dpi / 72)
            pixmap = first_page.get_pixmap(matrix=zoom)

            if not HAS_PIL:
                # No PIL available: fall back to the PNG bytes as-is.
                png_bytes = pixmap.tobytes("png")
                encoded = base64.b64encode(png_bytes).decode("utf-8")
                return f"data:image/png;base64,{encoded}"

            # PIL available: re-encode as JPEG to keep the report smaller.
            image = Image.open(io.BytesIO(pixmap.tobytes("png")))

            # JPEG has no alpha channel; composite onto a white background.
            if image.mode in ('RGBA', 'LA', 'P'):
                background = Image.new('RGB', image.size, (255, 255, 255))
                if image.mode == 'P':
                    image = image.convert('RGBA')
                background.paste(image, mask=image.split()[-1] if image.mode in ('RGBA', 'LA') else None)
                image = background

            out = io.BytesIO()
            image.save(out, format='JPEG', quality=85, optimize=True)
            encoded = base64.b64encode(out.getvalue()).decode("utf-8")
            return f"data:image/jpeg;base64,{encoded}"
        finally:
            document.close()

    def _generate_vscode_link(self, pdf_path: str) -> str:
        """
        Build a vscode:// deep link to the matching ground truth JSON file.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            vscode:// URL or empty string if ground truth file doesn't exist
        """
        # Ground truth file shares the PDF's stem, with a .json extension.
        candidate = self.ground_truth_dir / f"{Path(pdf_path).stem}.json"

        if not candidate.exists():
            return ""

        # VS Code links require an absolute path.
        return f"vscode://file/{candidate.resolve()}"

    def _get_metric_class(self, score: float) -> str:
        """
        Get CSS class based on metric score.

        Args:
            score: Score value between 0 and 1

        Returns:
            CSS class name: 'good', 'warning', or 'poor'
        """
        if score >= 0.9:
            return "good"
        elif score >= 0.7:
            return "warning"
        else:
            return "poor"


================================================
FILE: docs/src/content/docs/liteparse/_meta.yml
================================================
label: LiteParse
order: 1
collapsed: false


================================================
FILE: docs/src/content/docs/liteparse/cli-reference.md
================================================
---
title: CLI Reference
description: Complete reference for all LiteParse CLI commands and options.
sidebar:
  order: 5
---

LiteParse provides the `lit` CLI with three commands: `parse`, `batch-parse`, and `screenshot`.

## `lit parse`

Parse a single document.

```
lit parse [options] <file>
```

### Arguments

| Argument | Description |
|----------|-------------|
| `file` | Path to the document file, or `-` to read from stdin |

### Options

| Option | Description | Default |
|--------|-------------|---------|
| `-o, --output <file>` | Write output to a file instead of stdout | — |
| `--format <format>` | Output format: `json` or `text` | `text` |
| `--ocr-server-url <url>` | HTTP OCR server URL | — (uses Tesseract) |
| `--no-ocr` | Disable OCR entirely | — |
| `--ocr-language <lang>` | OCR language code | `en` |
| `--num-workers <n>` | Pages to OCR in parallel | CPU cores - 1 |
| `--max-pages <n>` | Maximum pages to parse | `10000` |
| `--target-pages <pages>` | Pages to parse (e.g., `"1-5,10"`) | — (all pages) |
| `--dpi <dpi>` | Rendering DPI | `150` |
| `--no-precise-bbox` | **Deprecated:** Disable populating the output boundingBoxes array. Will be removed in v2.0. Text item coordinates (`x`, `y`, `width`, `height`) are always present regardless. | — |
| `--preserve-small-text` | Keep very small text | — |
| `--password <password>` | Password for encrypted/protected documents | — |
| `--config <file>` | JSON config file path | — |
| `-q, --quiet` | Suppress progress output | — |

### Examples

```bash
# Basic text parsing
lit parse report.pdf

# JSON output with bounding boxes
lit parse report.pdf --format json -o report.json

# Parse pages 1-5 only, no OCR
lit parse report.pdf --target-pages "1-5" --no-ocr

# High-DPI rendering with French OCR
lit parse report.pdf --dpi 300 --ocr-language fra

# Use an external OCR server
lit parse report.pdf --ocr-server-url http://localhost:8828/ocr

# Pipe output to another tool
lit parse report.pdf -q | wc -l

# Parse a remote file via stdin
curl -sL https://example.com/report.pdf | lit parse --no-ocr -
```

---

## `lit batch-parse`

Parse multiple documents in a directory.

```
lit batch-parse [options] <input-dir> <output-dir>
```

### Arguments

| Argument | Description |
|----------|-------------|
| `input-dir` | Directory containing documents to parse |
| `output-dir` | Directory for output files |

### Options

| Option | Description | Default |
|--------|-------------|---------|
| `--format <format>` | Output format: `json` or `text` | `text` |
| `--ocr-server-url <url>` | HTTP OCR server URL | — (uses Tesseract) |
| `--no-ocr` | Disable OCR entirely | — |
| `--ocr-language <lang>` | OCR language code | `en` |
| `--num-workers <n>` | Pages to OCR in parallel | CPU cores - 1 |
| `--max-pages <n>` | Maximum pages per file | `10000` |
| `--dpi <dpi>` | Rendering DPI | `150` |
| `--no-precise-bbox` | **Deprecated:** Disable populating the output boundingBoxes array. Will be removed in v2.0. Text item coordinates (`x`, `y`, `width`, `height`) are always present regardless. | — |
| `--recursive` | Search subdirectories | — |
| `--extension <ext>` | Only process this extension (e.g., `".pdf"`) | — (all supported) |
| `--password <password>` | Password for encrypted/protected documents (applied to all files) | — |
| `--config <file>` | JSON config file path | — |
| `-q, --quiet` | Suppress progress output | — |

### Examples

```bash
# Parse all supported files in a directory
lit batch-parse ./documents ./output

# Recursively parse only PDFs
lit batch-parse ./documents ./output --recursive --extension ".pdf"

# Batch parse with JSON output and no OCR
lit batch-parse ./documents ./output --format json --no-ocr

# Use a config file for consistent settings
lit batch-parse ./documents ./output --config liteparse.config.json
```

---

## `lit screenshot`

Generate page images from a PDF.

```
lit screenshot [options] <file>
```

### Arguments

| Argument | Description |
|----------|-------------|
| `file` | Path to the PDF file |

### Options

| Option | Description | Default |
|--------|-------------|---------|
| `-o, --output-dir <dir>` | Output directory | `./screenshots` |
| `--target-pages <pages>` | Pages to screenshot (e.g., `"1,3,5"` or `"1-5"`) | — (all pages) |
| `--dpi <dpi>` | Rendering DPI | `150` |
| `--format <format>` | Image format: `png` or `jpg` | `png` |
| `--password <password>` | Password for encrypted/protected documents | — |
| `--config <file>` | JSON config file path | — |
| `-q, --quiet` | Suppress progress output | — |

### Examples

```bash
# Screenshot all pages
lit screenshot document.pdf -o ./pages

# First 5 pages at high DPI
lit screenshot document.pdf --target-pages "1-5" --dpi 300 -o ./pages

# JPG format for smaller files
lit screenshot document.pdf --format jpg -o ./pages

# Specific pages only
lit screenshot document.pdf --target-pages "1,5,10" -o ./pages
```

---

## Global options

These options are available on all commands:

| Option | Description |
|--------|-------------|
| `-h, --help` | Show help for a command |
| `-V, --version` | Show version number |


================================================
FILE: docs/src/content/docs/liteparse/getting_started.md
================================================
---
title: Getting Started
description: Install LiteParse and parse your first document in under a minute.
sidebar:
  order: 1
---

## Installation

Install LiteParse globally via npm to use the `lit` command anywhere:

```bash
npm i -g @llamaindex/liteparse
```

For macOS and Linux users, LiteParse can also be installed via `brew`:

```bash
brew tap run-llama/liteparse
brew install llamaindex-liteparse
```

## Quick start

Once installed, you can start parsing from the command line:

```bash
# Parse a PDF and print text to stdout
lit parse document.pdf

# Save output to a file
lit parse document.pdf -o output.txt

# Get structured JSON with bounding boxes
lit parse document.pdf --format json -o output.json

# Parse only specific pages
lit parse document.pdf --target-pages "1-5,10,15-20"
```

### Batch parsing

Parse an entire directory of documents at once:

```bash
lit batch-parse ./pdfs ./outputs
```

### Screenshots

Generate page images from a PDF for LLM agents or visual workflows:

```bash
lit screenshot document.pdf -o ./screenshots
```

## Next steps

- [Library usage](/liteparse/guides/library-usage/): Use LiteParse programmatically from TypeScript or Python.
- [OCR configuration](/liteparse/guides/ocr/): Configure Tesseract, use an external OCR server, or bring your own.
- [Multi-format support](/liteparse/guides/multi-format/): Parse DOCX, XLSX, PPTX, images, and more.
- [Agent skill](/liteparse/guides/agent-skill/): Add LiteParse as a skill for coding agents.
- [CLI reference](/liteparse/cli-reference/): Complete command and option reference.


================================================
FILE: docs/src/content/docs/liteparse/guides/_meta.yml
================================================
label: Guides
order: 2
collapsed: false


================================================
FILE: docs/src/content/docs/liteparse/guides/agent-skill.md
================================================
---
title: Agent Skill
description: Add LiteParse as a skill for coding agents like Claude Code, Cursor, and others.
sidebar:
  order: 6
---

LiteParse can be installed as a **coding agent skill** using Vercel's [skills](https://github.com/vercel-labs/skills) utility. This gives your coding agent the ability to process documents, generate screenshots, and parse text from files, all locally.

## Installation

Add the LiteParse skill to your project:

```bash
npx skills add run-llama/llamaparse-agent-skills --skill liteparse
```

This downloads a skill file that compatible coding agents (Claude Code, Cursor, etc.) will automatically pick up.

Once configured, your agent will be able to call the LiteParse CLI commands directly from its code execution environment. This means you can have your agent parse PDFs, pull out the text, and generate screenshots on the fly as part of its reasoning process.

## Example prompts

Once the skill is installed, you can ask your coding agent things like:

- "Parse this PDF and extract the text as JSON"
- "Extract text from all the DOCX files in the `./contracts` folder"
- "Screenshot pages 1-5 of this PDF at 300 DPI"
- "Parse this scanned document using the PaddleOCR server on localhost:8828"
- "Get the bounding boxes for all text on page 3"


## Configuring Defaults

You might want to configure some defaults so that your agent doesn't have to specify them in every prompt. You can create a `liteparse.config.json` file in the root of your project with settings like:

```json
{
  "ocrLanguage": "en",
  "ocrEnabled": true,
  "maxPages": 1000,
  "dpi": 150,
  "outputFormat": "json",
  "preserveVerySmallText": false
}
```

This is especially useful for custom OCR servers. Just add the `ocrServerUrl` to your config:

```json
{
  "ocrServerUrl": "http://localhost:8828/ocr",
  "ocrLanguage": "en",
  "outputFormat": "json"
}
```


================================================
FILE: docs/src/content/docs/liteparse/guides/library-usage.md
================================================
---
title: Library Usage
description: Use LiteParse programmatically from TypeScript or Python.
sidebar:
  order: 1
---

LiteParse can be used as a library in your own code, not just from the CLI. There are packages for both TypeScript and Python.

## TypeScript

Install as a project dependency:

```bash
npm install @llamaindex/liteparse
# or
pnpm add @llamaindex/liteparse
```

### Parsing a document

```typescript
import { LiteParse } from "@llamaindex/liteparse";

const parser = new LiteParse({ ocrEnabled: true });
const result = await parser.parse("document.pdf");

// Full document text with layout preserved
console.log(result.text);

// Per-page data
for (const page of result.pages) {
  console.log(`Page ${page.pageNum}: ${page.textItems.length} text items`);
}
```

### JSON output with bounding boxes

```typescript
const parser = new LiteParse({ outputFormat: "json" });
const result = await parser.parse("document.pdf");

for (const page of result.json?.pages || []) {
  for (const item of page.textItems) {
    console.log(`[${item.x}, ${item.y}] → [${item.x + item.width}, ${item.y + item.height}] ${item.text}`);
  }
}
```

### Configuration

Pass any config options to the constructor. You only need to specify what you want to override:

```typescript
const parser = new LiteParse({
  ocrEnabled: true,
  ocrServerUrl: "http://localhost:8828/ocr",
  ocrLanguage: "fra",
  dpi: 300,
  outputFormat: "json",
  targetPages: "1-10",
  password: "secret",        // for encrypted/protected documents
});
```

### Buffer / Uint8Array input

You can pass raw bytes directly instead of a file path. PDF buffers are parsed with **zero disk I/O** — no temp files are written:

```typescript
import { readFile } from "fs/promises";

const parser = new LiteParse();

// From a file read
const pdfBytes = await readFile("document.pdf");
const result = await parser.parse(pdfBytes);

// From an HTTP response
const response = await fetch("https://example.com/document.pdf");
const buffer = Buffer.from(await response.arrayBuffer());
const result2 = await parser.parse(buffer);
```

Non-PDF buffers (images, Office documents) are written to a temp directory for format conversion. You can control the temp directory with the `LITEPARSE_TMPDIR` environment variable.

### Screenshots

Generate page images as buffers — useful for sending to LLMs or saving to disk. Accepts file paths, `Buffer`, or `Uint8Array`:

```typescript
const parser = new LiteParse();
const screenshots = await parser.screenshot("document.pdf");

for (const shot of screenshots) {
  console.log(`Page ${shot.pageNum}: ${shot.width}x${shot.height}`);
  // shot.imageBuffer contains the raw PNG/JPG data
}

// Also works with buffer input
const pdfBytes = await readFile("document.pdf");
const shots = await parser.screenshot(pdfBytes, [1, 2, 3]);
```

### Environment variables

| Variable | Description |
|----------|-------------|
| `TESSDATA_PREFIX` | Path to a directory containing Tesseract `.traineddata` files. For offline/air-gapped environments. Also available as the `tessdataPath` config option. |
| `LITEPARSE_TMPDIR` | Override the temp directory for format conversion. Defaults to `os.tmpdir()`. |

See the [API reference](/liteparse/api/) for full type details.

---

## Python

The Python package is a wrapper around the LiteParse Node.js CLI. **Node.js (>= 18) must be installed** on your system.

### Installation

First, install the LiteParse CLI:

```bash
npm install -g @llamaindex/liteparse
```

Then install the Python package:

```bash
pip install liteparse
```

<Aside type="caution">
  The Python package calls the LiteParse CLI under the hood via subprocess. Node.js (>= 18) is required. While the package can auto-install the CLI via `npm install -g @llamaindex/liteparse` on first use, it is recommended to install it separately beforehand.
</Aside>

### Parsing a document

```python
from liteparse import LiteParse

parser = LiteParse()
result = parser.parse("document.pdf")

# Full document text
print(result.text)

# Per-page data
for page in result.pages:
    print(f"Page {page.pageNum}: {len(page.textItems)} text items")
```

### Configuration

The `LiteParse` constructor accepts `cli_path` (to override CLI auto-detection) and `install_if_not_available` (to control auto-install behavior). All parsing options are passed per-call:

```python
parser = LiteParse()

result = parser.parse(
    "document.pdf",
    ocr_enabled=True,
    ocr_server_url="http://localhost:8828/ocr",
    ocr_language="fra",
    dpi=300,
    target_pages="1-5",
    password="secret",  # for encrypted/protected documents
)
```

### Parsing from bytes

If you already have file contents in memory (e.g. from a web upload), pass them directly to `parse()`:

```python
with open("document.pdf", "rb") as f:
    pdf_bytes = f.read()

result = parser.parse(pdf_bytes)
print(result.text)
```

### Batch parsing

For multiple files, batch mode reuses the PDF engine and is significantly faster:

```python
result = parser.batch_parse(
    input_dir="./documents",
    output_dir="./output",
    recursive=True,
    extension_filter=".pdf",
)

print(f"Output written to: {result.output_dir}")
```


================================================
FILE: docs/src/content/docs/liteparse/guides/multi-format.md
================================================
---
title: Multi-Format Support
description: Parse Word documents, spreadsheets, presentations, and images with LiteParse.
sidebar:
  order: 3
---

LiteParse automatically converts non-PDF formats to PDF before parsing. This lets you use the same parsing pipeline for Office documents, images, and more.

## Supported formats

### Office documents (via LibreOffice)

| Category | Extensions |
|----------|-----------|
| Word | `.doc`, `.docx`, `.docm`, `.odt`, `.rtf` |
| PowerPoint | `.ppt`, `.pptx`, `.pptm`, `.odp` |
| Spreadsheets | `.xls`, `.xlsx`, `.xlsm`, `.ods`, `.csv`, `.tsv` |

### Images (via ImageMagick)

`.jpg`, `.jpeg`, `.png`, `.gif`, `.bmp`, `.tiff`, `.webp`, `.svg`

Images are converted to PDF and then parsed with OCR to extract text.

## Installing dependencies

Format conversion uses standard system tools. Install the ones you need:

### LibreOffice (for Office documents)

```bash
# macOS
brew install --cask libreoffice

# Ubuntu/Debian
apt-get install libreoffice

# Windows
choco install libreoffice-fresh
```

> On Windows, you may need to add the LibreOffice CLI directory (typically `C:\Program Files\LibreOffice\program`) to your PATH and restart.

### ImageMagick (for images)

```bash
# macOS
brew install imagemagick

# Ubuntu/Debian
apt-get install imagemagick

# Windows
choco install imagemagick.app
```

## Usage

Once the dependencies are installed, just pass any supported file to `lit parse`:

```bash
lit parse report.docx
lit parse slides.pptx --format json
lit parse spreadsheet.xlsx -o output.txt
lit parse scan.png
```

Batch mode also handles mixed formats:

```bash
lit batch-parse ./documents ./output --recursive
```

## How it works

1. LiteParse detects the file extension
2. If it's not a PDF, it converts to PDF using the appropriate tool (LibreOffice or ImageMagick)
3. The resulting PDF is parsed normally
4. Temporary conversion files are cleaned up automatically

If the required conversion tool isn't installed, LiteParse will return an error explaining which dependency is needed.


================================================
FILE: docs/src/content/docs/liteparse/guides/ocr.md
================================================
---
title: OCR Configuration
description: Configure OCR in LiteParse — built-in Tesseract, or bring your own via HTTP servers.
sidebar:
  order: 2
---

LiteParse uses OCR selectively — only on embedded images or pages where native text extraction didn't find text. This keeps parsing fast while still capturing text from scanned pages and embedded images.

## Built-in Tesseract (default)

Tesseract.js is bundled with LiteParse. The only setup is the automatic download of the Tesseract model files on first use. Just run:

```bash
lit parse document.pdf
```

If bundling LiteParse into a Docker container or server environment, you might want to pre-download the Tesseract model files (for example, by running the above command during the image build) to avoid network calls at runtime.

### Language support

Specify the OCR language for better accuracy on non-English documents:

```bash
lit parse document.pdf --ocr-language fra    # French
lit parse document.pdf --ocr-language deu    # German
lit parse document.pdf --ocr-language jpn    # Japanese
```

Tesseract uses [ISO 639-3](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html) language codes (`eng`, `fra`, `deu`, etc.).

### Disabling OCR

If you don't need OCR (pure native-text PDFs, or you don't care about images), disable it for faster parsing:

```bash
lit parse document.pdf --no-ocr
```

## HTTP OCR servers

For higher accuracy or GPU-accelerated OCR, you can point LiteParse at an HTTP OCR server. LiteParse ships with ready-to-use examples for popular OCR engines.

### EasyOCR

```bash
# Start the EasyOCR server (requires Python)
git clone https://github.com/run-llama/liteparse.git
cd liteparse/ocr/easyocr
pip install -r requirements.txt
python server.py

# Parse with EasyOCR in another terminal
lit parse document.pdf --ocr-server-url http://localhost:8828/ocr
```

### PaddleOCR

```bash
# Start the PaddleOCR server (requires Python)
git clone https://github.com/run-llama/liteparse.git
cd liteparse/ocr/paddleocr
pip install -r requirements.txt
python server.py

# Parse with PaddleOCR in another terminal
lit parse document.pdf --ocr-server-url http://localhost:8828/ocr
```

### Parallel OCR workers

LiteParse OCRs multiple pages in parallel. By default, it uses one fewer worker than your CPU core count. Override this with:

```bash
lit parse document.pdf --num-workers 8
```

This is useful if you need to slow down OCR requests to an external server or if your OCR engine is GPU-accelerated and can handle more concurrency.

## Custom OCR servers

You can integrate any OCR engine by implementing the LiteParse OCR API. Your server needs a single endpoint:

```
POST /ocr
Content-Type: multipart/form-data
```

**Request fields:**

| Field | Type | Required | Description |
|-------|------|----------|-------------|
| `file` | binary | Yes | Image file (PNG, JPG, etc.) |
| `language` | string | No | ISO 639-1 language code (default: `en`) |

**Response format:**

```json
{
  "results": [
    {
      "text": "recognized text",
      "bbox": [x1, y1, x2, y2],
      "confidence": 0.95
    }
  ]
}
```

Each result contains:

| Field | Type | Description |
|-------|------|-------------|
| `text` | string | Recognized text |
| `bbox` | `[x1, y1, x2, y2]` | Bounding box in pixels. Origin is top-left, x goes right, y goes down |
| `confidence` | number | Score from 0.0 to 1.0 |

### Testing your server

```bash
# Quick test with curl
curl -X POST http://localhost:8080/ocr \
  -F "file=@test.png" \
  -F "language=en" | jq .

# Use with LiteParse
lit parse document.pdf --ocr-server-url http://localhost:8080/ocr
```

### Common Gotchas

- Return `{"results": []}` if no text is detected
- Bounding boxes must be axis-aligned (`[x1, y1, x2, y2]` where top-left to bottom-right)
- If your engine returns rotated boxes, convert to axis-aligned by taking min/max coordinates
- If your engine doesn't provide confidence scores, return `1.0`
- Results should be in reading order (top-to-bottom, left-to-right)
- Cache OCR models in memory rather than reloading per request

### A note on OCR approaches

These days, it's common to apply the term "OCR" to both traditional approaches and newer LLM-based document understanding models.

The LiteParse OCR API is designed specifically for approaches that return text with bounding boxes. 

If you are trying to integrate a method that doesn't return bounding boxes, you will have to generate dummy bounding boxes.


================================================
FILE: docs/src/content/docs/liteparse/guides/parsing-urls.md
================================================
---
title: Parsing URLs
description: Parse remote documents by reading URLs.
sidebar:
  order: 5
---

To parse remote files, LiteParse supports reading bytes and streams in both CLI and library usage. On the CLI, you can download the file with any tool you like and pipe the bytes to `lit parse` using `-` as the file argument; with the libraries, you can fetch the bytes directly from the URL and pass them to the parser.

## CLI usage

```bash
# Parse a remote PDF
curl -sL https://example.com/report.pdf | lit parse -

# With options
curl -sL https://example.com/report.pdf | lit parse --no-ocr --format json -

# Save to a file
curl -sL https://example.com/report.pdf | lit parse -o report.txt -
```

The `-` argument tells LiteParse to read from stdin instead of a file path. Any tool that writes to stdout works — `curl`, `wget`, `aws s3 cp - -`, etc.

## Library usage

The TypeScript library accepts `Buffer`/`Uint8Array` directly, so you can handle the download however you like:

```typescript
import { LiteParse } from "@llamaindex/liteparse";

const response = await fetch("https://example.com/report.pdf");
const buffer = Buffer.from(await response.arrayBuffer());

const parser = new LiteParse({ ocrEnabled: false });
const result = await parser.parse(buffer);
console.log(result.text);
```


================================================
FILE: docs/src/content/docs/liteparse/guides/visual-citations.md
================================================
---
title: Visual Citations with Bounding Boxes
description: Use bounding boxes and screenshots to show exactly where information was found in a document.
sidebar:
  order: 4
---

When building agents or RAG workflows, it is often not enough to parse text and call it done. Frequently, users and applications will require you to show _where_ that text came from. 

LiteParse gives you spatial coordinates for every text item, plus page screenshots, so you can highlight exact regions on the rendered page.

## How bounding boxes work

When you parse a document with JSON output, each page includes a key data source for visual citations: **`textItems`** — every extracted text element with its position (`x`, `y`, `width`, `height`) and content.

Output of `lit parse document.pdf --format json`:

```json
{
  "pages": [{
    "page": 1,
    "width": 612,
    "height": 792,
    "text": "...",
    "textItems": [
      { "text": "Revenue grew 15%", "x": 72, "y": 200, "width": 150, "height": 12, ... }
    ]
  }]
}
```

Coordinates are in **PDF points** (1 point = 1/72 inch). Origin is the top-left corner of the page, with X increasing right and Y increasing down.

## Library usage

The library lets you do both in a single script, parse for bboxes and generate screenshots. For example, you might be looking for specific information like "Revenue" and want to show exactly where it appears on the page:

```typescript
import { LiteParse } from "@llamaindex/liteparse";

const parser = new LiteParse({ outputFormat: "json", dpi: 150 });

const result = await parser.parse("report.pdf");
const screenshots = await parser.screenshot("report.pdf");

// Find a text item by its content
for (const page of result.json?.pages || []) {
  for (const item of page.textItems) {
    if (item.text.includes("Revenue")) {
      console.log(`Found on page ${page.page}: (${item.x}, ${item.y}) ${item.width}×${item.height}`);
    }
  }
}
```

## Converting coordinates to image pixels

Text item coordinates are in PDF points, but screenshots are in pixels. To draw highlights on a screenshot, you need to scale the coordinates:

```typescript
// PDF points → pixels at your chosen DPI: scale = dpi / 72

function itemToPixels(item, dpi = 150) {
  const scale = dpi / 72;
  return {
    x: item.x * scale,
    y: item.y * scale,
    width: item.width * scale,
    height: item.height * scale,
  };
}
```

For example, at the default 150 DPI the scale factor is `150 / 72 ≈ 2.08`, so a text item at `(72, 200)` maps to pixel `(150, 416)`.

## Searching for phrases with `searchItems`

A single text item often contains just one word or fragment. A phrase like `"0°C to 70°C"` may span several adjacent items. The `searchItems` utility handles this — it concatenates consecutive items, finds matches, and returns merged text items with combined bounding boxes:

```typescript
import { LiteParse, searchItems } from "@llamaindex/liteparse";

const parser = new LiteParse({ outputFormat: "json" });
const result = await parser.parse("report.pdf");

for (const page of result.json.pages) {
  const matches = searchItems(page.textItems, { phrase: "0°C to 70°C" });
  for (const match of matches) {
    console.log(`Found "${match.text}" at (${match.x}, ${match.y}) ${match.width}×${match.height}`);
  }
}
```

Each returned item has the same shape as a regular text item, with merged coordinates spanning all the items that contributed to the match.

For single-word searches, iterating `textItems` individually (as shown in the library usage section above) is simpler and works fine.

## Full example: highlighting citations with sharp

Here's a complete workflow that parses a PDF, searches for a phrase, and draws yellow highlight boxes on the page screenshot:

```typescript
import { LiteParse, searchItems } from "@llamaindex/liteparse";
import sharp from "sharp";

const DPI = 150;
const SCALE = DPI / 72;

async function main() {
  const parser = new LiteParse({ outputFormat: "json", dpi: DPI });

  const result = await parser.parse("manual.pdf");
  const screenshots = await parser.screenshot("manual.pdf");

  // Search for a phrase, grouped by page
  const query = "0°C to 70°C";
  const hitsByPage = new Map<number, Array<{ x: number; y: number; width: number; height: number }>>();

  for (const page of result.json?.pages || []) {
    const matches = searchItems(page.textItems, { phrase: query });
    if (matches.length) hitsByPage.set(page.page, matches);
  }

  // Draw all highlights per page into a single image
  for (const [pageNum, rects] of hitsByPage) {
    const shot = screenshots.find((s) => s.pageNum === pageNum);
    if (!shot) continue;

    const composites = await Promise.all(
      rects.map(async (rect) => {
        const pixel = {
          left: Math.round(rect.x * SCALE),
          top: Math.round(rect.y * SCALE),
          width: Math.round(rect.width * SCALE),
          height: Math.round(rect.height * SCALE),
        };

        const overlay = await sharp({
          create: {
            width: pixel.width,
            height: pixel.height,
            channels: 4,
            background: { r: 255, g: 255, b: 0, alpha: 0.3 },
          },
        })
          .png()
          .toBuffer();

        return { input: overlay, left: pixel.left, top: pixel.top };
      })
    );

    const highlighted = await sharp(shot.imageBuffer)
      .composite(composites)
      .png()
      .toBuffer();

    await sharp(highlighted).toFile(`citation_page${pageNum}.png`);
    console.log(`Saved citation_page${pageNum}.png (${rects.length} highlights)`);
  }
}

main().catch(console.error);
```

Running this script on a PDF will produce new images with the search phrase highlighted, showing exactly where the information was found on the page.

![Example output showing highlighted search results on a PDF page](visual_citation.png)

## CLI usage

Parse to JSON to get bounding boxes:

```bash
lit parse document.pdf --format json -o result.json
```

Generate page screenshots alongside:

```bash
lit screenshot document.pdf -o ./screenshots
```

From there, you (or an agent) can process the resulting JSON and screenshots as needed using any tools available.

## Deprecated: `boundingBoxes`

The `boundingBoxes` array in JSON output is **deprecated** and will be removed in **v2.0**. It is a redundant representation of the same spatial data already available on each text item (`x`, `y`, `width`, `height`). Use `textItems` directly instead — it has the same coordinates plus text content, font metadata, and consistent indexing.

## Tips

- Use the same `dpi` value for both `parse()` and `screenshot()`. The default is `150` for both.
- Page `width` and `height` in the JSON are in PDF points, matching the coordinate space. Use these if you need to normalize coordinates to percentages.


================================================
FILE: docs/src/content/docs/liteparse/index.md
================================================
---
title: What is LiteParse?
description: Fast, local PDF parsing with spatial text parsing, OCR, and bounding boxes.
sidebar:
  order: 0
---

LiteParse is an open-source document parsing library that parses text with spatial layout information and bounding boxes. It runs entirely on your machine, with no cloud dependencies, no LLMs, no API keys.

LiteParse is designed specifically for use cases that require fast, accurate text parsing: real-time applications, coding agents, and local workflows. It provides a simple CLI and library API for parsing PDFs, Office documents, and images, with built-in OCR support.

## What can LiteParse do?

- **Parse PDFs** with precise spatial layout. Text comes back positioned where it appears on the page
- **Extract bounding boxes** for every text line, ready for downstream processing or visualization
- **OCR scanned documents** using built-in Tesseract.js or plug in your own OCR server
- **Parse Office files and images** with support for DOCX, XLSX, PPTX, PNG, JPG, and more via automatic conversion
- **Screenshot PDF pages** as high-quality images for LLM-based workflows
- **Use from TypeScript, Python, or the CLI** — whatever fits your stack

## Get started

- [Getting started](/liteparse/getting_started/): Install LiteParse and parse your first document.
- [Library usage](/liteparse/guides/library-usage/): Use LiteParse from TypeScript or Python code.
- [CLI reference](/liteparse/cli-reference/): Complete command and option reference.
- [API reference](/liteparse/api/): TypeScript library types and methods.


================================================
FILE: docs.config.mjs
================================================
// Documentation sync configuration for the developer hub.
// Declares which local doc sources are published, the destination
// section they land in, and how the sidebar entry is generated.
const SECTION = "liteparse";

export default {
  section: SECTION,
  label: "LiteParse",
  // Each entry maps a local docs directory to a destination section.
  content: [{ src: "./docs/src/content/docs/liteparse", dest: SECTION }],
  // Sidebar is auto-generated from the published section's directory.
  sidebar: [
    {
      label: "LiteParse",
      content: { type: "autogenerate", directory: SECTION, collapsed: true },
    },
  ],
};


================================================
FILE: eslint.config.js
================================================
import eslint from "@eslint/js";
import tseslint from "typescript-eslint";
import eslintConfigPrettier from "eslint-config-prettier";

// Flat ESLint configuration: base JS recommended + TypeScript recommended
// presets, with Prettier last among the presets so it disables any
// stylistic rules that would conflict with formatting.

// Enable type-aware linting via the project service rooted at this file.
const typeAwareParsing = {
  languageOptions: {
    parserOptions: {
      projectService: true,
      tsconfigRootDir: import.meta.dirname,
    },
  },
};

// Global ignores: build output, dependencies, vendored code, examples.
const ignoredPaths = {
  ignores: ["dist/**", "node_modules/**", "src/vendor/**", "examples/**"],
};

// Project-specific rule overrides.
const projectRules = {
  rules: {
    // Allow unused vars that start with underscore
    "@typescript-eslint/no-unused-vars": [
      "error",
      {
        argsIgnorePattern: "^_",
        varsIgnorePattern: "^_",
      },
    ],
    // Allow explicit any in some cases (can be tightened later)
    "@typescript-eslint/no-explicit-any": "warn",
  },
};

export default tseslint.config(
  eslint.configs.recommended,
  ...tseslint.configs.recommended,
  eslintConfigPrettier,
  typeAwareParsing,
  ignoredPaths,
  projectRules
);


================================================
FILE: ocr/README.md
================================================
# ocr/

Example OCR server implementations that conform to the LiteParse OCR API specification.

These servers allow you to use alternative OCR engines instead of the built-in Tesseract.js.

## Why Use an External OCR Server?

| Feature | Tesseract.js (built-in) | EasyOCR | PaddleOCR |
|---------|-------------------------|---------|-----------|
| Setup | Zero (included) | uv | uv |
| Speed | Moderate | Moderate | Fast (2-3x) |
| Accuracy (Latin) | Good | Good | Good |
| Accuracy (CJK) | Fair | Good | Excellent |
| Languages | 100+ | 80+ | 80+ |
| Memory | In-process | Separate | Separate |

**Recommendations:**
- **Quick start**: Use built-in Tesseract (no setup)
- **Asian languages**: Use PaddleOCR (best CJK support)
- **General use**: EasyOCR (good balance)

## Available Servers

### [easyocr/](./easyocr/)
Flask server wrapping EasyOCR library.
- Port: **8828**
- Good general-purpose OCR
- 80+ languages

### [paddleocr/](./paddleocr/)
Flask server wrapping PaddleOCR library.
- Port: **8829**
- Excellent for Chinese, Japanese, Korean
- 2-3x faster than EasyOCR

## Quick Start

```bash
# Start EasyOCR server
cd ocr/easyocr
uv run server.py

# OR start PaddleOCR server
cd ocr/paddleocr
uv run server.py
```

Then use with LiteParse:

```bash
# CLI
lit parse document.pdf --ocr-server-url http://localhost:8828/ocr

# Code
const parser = new LiteParse({
  ocrServerUrl: 'http://localhost:8828/ocr',
  ocrLanguage: 'en',
});
```

## API Specification

All servers implement the same API (defined in `OCR_API_SPEC.md`):

**Endpoint:** `POST /ocr`

**Request:**
- Content-Type: `multipart/form-data`
- Fields:
  - `file` - Image file
  - `language` - Language code (e.g., 'en', 'zh', 'ja')

**Response:**
```json
{
  "results": [
    {
      "text": "recognized text",
      "bbox": [x1, y1, x2, y2],
      "confidence": 0.95
    }
  ]
}
```

## Creating a Custom OCR Server

To implement your own OCR server:

1. Create a Flask/FastAPI/Express server
2. Accept `POST /ocr` with multipart form data
3. Return JSON with `results` array containing:
   - `text` - Recognized text string
   - `bbox` - Bounding box as `[x1, y1, x2, y2]`
   - `confidence` - Confidence score (0-1)

4. (Optional) Implement `GET /health` endpoint

See the existing servers as reference implementations.

## Language Codes

Most servers accept ISO 639-1 codes (e.g., 'en', 'zh', 'ja') and map them internally:

| ISO Code | Language | Notes |
|----------|----------|-------|
| en | English | |
| zh | Chinese (Simplified) | |
| zh-tw | Chinese (Traditional) | |
| ja | Japanese | |
| ko | Korean | |
| fr | French | |
| de | German | |
| es | Spanish | |
| ar | Arabic | |
| hi | Hindi | |


================================================
FILE: ocr/easyocr/Dockerfile
================================================
FROM ghcr.io/astral-sh/uv:python3.12-trixie

# Install system dependencies
RUN apt-get update && apt-get install -y \
    libgomp1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgl1 \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy pyproject.toml and server code
COPY ./*py* .

# install necessary dependencies
RUN uv sync

# Expose port
EXPOSE 8828

# Run serv
Download .txt
gitextract_a5not0fy/

├── .changeset/
│   ├── README.md
│   └── config.json
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.yml
│   │   ├── config.yml
│   │   ├── feature_request.yml
│   │   └── parsing_issue.yml
│   └── workflows/
│       ├── ci.yml
│       ├── e2e-output.yml
│       ├── homebrew_release.yml
│       ├── ocr_servers.yml
│       ├── release.yml
│       └── sync-docs.yml
├── .gitignore
├── .prettierignore
├── .prettierrc
├── AGENTS.md
├── CHANGELOG.md
├── CLAUDE.md
├── CONTRIBUTING.md
├── LICENSE
├── OCR_API_SPEC.md
├── README.md
├── SECURITY.md
├── cli/
│   ├── README.md
│   └── parse.ts
├── dataset_eval_utils/
│   ├── README.md
│   ├── pyproject.toml
│   └── src/
│       └── liteparse_eval/
│           ├── __init__.py
│           ├── benchmark.py
│           ├── evaluation.py
│           ├── processing.py
│           ├── providers/
│           │   ├── __init__.py
│           │   ├── llm/
│           │   │   ├── __init__.py
│           │   │   ├── anthropic.py
│           │   │   └── base.py
│           │   └── parsers/
│           │       ├── __init__.py
│           │       ├── base.py
│           │       ├── liteparse.py
│           │       ├── markitdown.py
│           │       ├── pymupdf.py
│           │       └── pypdf.py
│           └── report.py
├── docs/
│   └── src/
│       └── content/
│           └── docs/
│               └── liteparse/
│                   ├── _meta.yml
│                   ├── cli-reference.md
│                   ├── getting_started.md
│                   ├── guides/
│                   │   ├── _meta.yml
│                   │   ├── agent-skill.md
│                   │   ├── library-usage.md
│                   │   ├── multi-format.md
│                   │   ├── ocr.md
│                   │   ├── parsing-urls.md
│                   │   └── visual-citations.md
│                   └── index.md
├── docs.config.mjs
├── eslint.config.js
├── ocr/
│   ├── README.md
│   ├── easyocr/
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   ├── server.py
│   │   └── test_server.py
│   └── paddleocr/
│       ├── Dockerfile
│       ├── README.md
│       ├── pyproject.toml
│       ├── server.py
│       └── test_server.py
├── package.json
├── packages/
│   └── python/
│       ├── README.md
│       ├── liteparse/
│       │   ├── __init__.py
│       │   ├── parser.py
│       │   ├── py.typed
│       │   └── types.py
│       ├── pyproject.toml
│       └── tests/
│           ├── __init__.py
│           ├── conftest.py
│           ├── test_batch_e2e.py
│           ├── test_parse_e2e.py
│           └── test_screenshot_e2e.py
├── scripts/
│   ├── compare-dataset.ts
│   ├── compare-outputs.sh
│   ├── create-dataset.ts
│   ├── generate-api-docs.sh
│   ├── publish-to-homebrew-repo.sh
│   ├── sync-docs-to-developer-hub.sh
│   └── upload-dataset.ts
├── src/
│   ├── conversion/
│   │   ├── README.md
│   │   ├── convertToPdf.test.ts
│   │   └── convertToPdf.ts
│   ├── core/
│   │   ├── README.md
│   │   ├── config.test.ts
│   │   ├── config.ts
│   │   ├── parser.test.ts
│   │   ├── parser.ts
│   │   └── types.ts
│   ├── engines/
│   │   ├── README.md
│   │   ├── ocr/
│   │   │   ├── README.md
│   │   │   ├── http-simple.test.ts
│   │   │   ├── http-simple.ts
│   │   │   ├── interface.ts
│   │   │   ├── tesseract.test.ts
│   │   │   └── tesseract.ts
│   │   └── pdf/
│   │       ├── README.md
│   │       ├── interface.ts
│   │       ├── pdfium-renderer.test.ts
│   │       ├── pdfium-renderer.ts
│   │       ├── pdfjs.test.ts
│   │       ├── pdfjs.ts
│   │       └── pdfjsImporter.ts
│   ├── index.ts
│   ├── lib.ts
│   ├── output/
│   │   ├── README.md
│   │   ├── json.test.ts
│   │   ├── json.ts
│   │   ├── text.test.ts
│   │   └── text.ts
│   ├── processing/
│   │   ├── README.md
│   │   ├── bbox.test.ts
│   │   ├── bbox.ts
│   │   ├── cleanText.test.ts
│   │   ├── cleanText.ts
│   │   ├── grid.ts
│   │   ├── gridDebugLogger.ts
│   │   ├── gridProjection.test.ts
│   │   ├── gridProjection.ts
│   │   ├── gridVisualizer.ts
│   │   ├── markupUtils.test.ts
│   │   ├── markupUtils.ts
│   │   ├── ocrUtils.ts
│   │   ├── octUtils.test.ts
│   │   ├── searchItems.test.ts
│   │   ├── searchItems.ts
│   │   ├── textUtils.test.ts
│   │   └── textUtils.ts
│   └── vendor/
│       └── pdfjs/
│           ├── LICENSE
│           ├── README.md
│           ├── cmaps/
│           │   ├── 78-EUC-H.bcmap
│           │   ├── 78-EUC-V.bcmap
│           │   ├── 78-H.bcmap
│           │   ├── 78-RKSJ-H.bcmap
│           │   ├── 78-RKSJ-V.bcmap
│           │   ├── 78-V.bcmap
│           │   ├── 78ms-RKSJ-H.bcmap
│           │   ├── 78ms-RKSJ-V.bcmap
│           │   ├── 83pv-RKSJ-H.bcmap
│           │   ├── 90ms-RKSJ-H.bcmap
│           │   ├── 90ms-RKSJ-V.bcmap
│           │   ├── 90msp-RKSJ-H.bcmap
│           │   ├── 90msp-RKSJ-V.bcmap
│           │   ├── 90pv-RKSJ-H.bcmap
│           │   ├── 90pv-RKSJ-V.bcmap
│           │   ├── Add-H.bcmap
│           │   ├── Add-RKSJ-H.bcmap
│           │   ├── Add-RKSJ-V.bcmap
│           │   ├── Add-V.bcmap
│           │   ├── Adobe-CNS1-0.bcmap
│           │   ├── Adobe-CNS1-1.bcmap
│           │   ├── Adobe-CNS1-2.bcmap
│           │   ├── Adobe-CNS1-3.bcmap
│           │   ├── Adobe-CNS1-4.bcmap
│           │   ├── Adobe-CNS1-5.bcmap
│           │   ├── Adobe-CNS1-6.bcmap
│           │   ├── Adobe-CNS1-UCS2.bcmap
│           │   ├── Adobe-GB1-0.bcmap
│           │   ├── Adobe-GB1-1.bcmap
│           │   ├── Adobe-GB1-2.bcmap
│           │   ├── Adobe-GB1-3.bcmap
│           │   ├── Adobe-GB1-4.bcmap
│           │   ├── Adobe-GB1-5.bcmap
│           │   ├── Adobe-GB1-UCS2.bcmap
│           │   ├── Adobe-Japan1-0.bcmap
│           │   ├── Adobe-Japan1-1.bcmap
│           │   ├── Adobe-Japan1-2.bcmap
│           │   ├── Adobe-Japan1-3.bcmap
│           │   ├── Adobe-Japan1-4.bcmap
│           │   ├── Adobe-Japan1-5.bcmap
│           │   ├── Adobe-Japan1-6.bcmap
│           │   ├── Adobe-Japan1-UCS2.bcmap
│           │   ├── Adobe-Korea1-0.bcmap
│           │   ├── Adobe-Korea1-1.bcmap
│           │   ├── Adobe-Korea1-2.bcmap
│           │   ├── Adobe-Korea1-UCS2.bcmap
│           │   ├── B5-H.bcmap
│           │   ├── B5-V.bcmap
│           │   ├── B5pc-H.bcmap
│           │   ├── B5pc-V.bcmap
│           │   ├── CNS-EUC-H.bcmap
│           │   ├── CNS-EUC-V.bcmap
│           │   ├── CNS1-H.bcmap
│           │   ├── CNS1-V.bcmap
│           │   ├── CNS2-H.bcmap
│           │   ├── CNS2-V.bcmap
│           │   ├── ETHK-B5-H.bcmap
│           │   ├── ETHK-B5-V.bcmap
│           │   ├── ETen-B5-H.bcmap
│           │   ├── ETen-B5-V.bcmap
│           │   ├── ETenms-B5-H.bcmap
│           │   ├── ETenms-B5-V.bcmap
│           │   ├── EUC-H.bcmap
│           │   ├── EUC-V.bcmap
│           │   ├── Ext-H.bcmap
│           │   ├── Ext-RKSJ-H.bcmap
│           │   ├── Ext-RKSJ-V.bcmap
│           │   ├── Ext-V.bcmap
│           │   ├── GB-EUC-H.bcmap
│           │   ├── GB-EUC-V.bcmap
│           │   ├── GB-H.bcmap
│           │   ├── GB-V.bcmap
│           │   ├── GBK-EUC-H.bcmap
│           │   ├── GBK-EUC-V.bcmap
│           │   ├── GBK2K-H.bcmap
│           │   ├── GBK2K-V.bcmap
│           │   ├── GBKp-EUC-H.bcmap
│           │   ├── GBKp-EUC-V.bcmap
│           │   ├── GBT-EUC-H.bcmap
│           │   ├── GBT-EUC-V.bcmap
│           │   ├── GBT-H.bcmap
│           │   ├── GBT-V.bcmap
│           │   ├── GBTpc-EUC-H.bcmap
│           │   ├── GBTpc-EUC-V.bcmap
│           │   ├── GBpc-EUC-H.bcmap
│           │   ├── GBpc-EUC-V.bcmap
│           │   ├── H.bcmap
│           │   ├── HKdla-B5-H.bcmap
│           │   ├── HKdla-B5-V.bcmap
│           │   ├── HKdlb-B5-H.bcmap
│           │   ├── HKdlb-B5-V.bcmap
│           │   ├── HKgccs-B5-H.bcmap
│           │   ├── HKgccs-B5-V.bcmap
│           │   ├── HKm314-B5-H.bcmap
│           │   ├── HKm314-B5-V.bcmap
│           │   ├── HKm471-B5-H.bcmap
│           │   ├── HKm471-B5-V.bcmap
│           │   ├── HKscs-B5-H.bcmap
│           │   ├── HKscs-B5-V.bcmap
│           │   ├── Hankaku.bcmap
│           │   ├── Hiragana.bcmap
│           │   ├── KSC-EUC-H.bcmap
│           │   ├── KSC-EUC-V.bcmap
│           │   ├── KSC-H.bcmap
│           │   ├── KSC-Johab-H.bcmap
│           │   ├── KSC-Johab-V.bcmap
│           │   ├── KSC-V.bcmap
│           │   ├── KSCms-UHC-H.bcmap
│           │   ├── KSCms-UHC-HW-H.bcmap
│           │   ├── KSCms-UHC-HW-V.bcmap
│           │   ├── KSCms-UHC-V.bcmap
│           │   ├── KSCpc-EUC-H.bcmap
│           │   ├── KSCpc-EUC-V.bcmap
│           │   ├── Katakana.bcmap
│           │   ├── LICENSE
│           │   ├── NWP-H.bcmap
│           │   ├── NWP-V.bcmap
│           │   ├── RKSJ-H.bcmap
│           │   ├── RKSJ-V.bcmap
│           │   ├── Roman.bcmap
│           │   ├── UniCNS-UCS2-H.bcmap
│           │   ├── UniCNS-UCS2-V.bcmap
│           │   ├── UniCNS-UTF16-H.bcmap
│           │   ├── UniCNS-UTF16-V.bcmap
│           │   ├── UniCNS-UTF32-H.bcmap
│           │   ├── UniCNS-UTF32-V.bcmap
│           │   ├── UniCNS-UTF8-H.bcmap
│           │   ├── UniCNS-UTF8-V.bcmap
│           │   ├── UniGB-UCS2-H.bcmap
│           │   ├── UniGB-UCS2-V.bcmap
│           │   ├── UniGB-UTF16-H.bcmap
│           │   ├── UniGB-UTF16-V.bcmap
│           │   ├── UniGB-UTF32-H.bcmap
│           │   ├── UniGB-UTF32-V.bcmap
│           │   ├── UniGB-UTF8-H.bcmap
│           │   ├── UniGB-UTF8-V.bcmap
│           │   ├── UniJIS-UCS2-H.bcmap
│           │   ├── UniJIS-UCS2-HW-H.bcmap
│           │   ├── UniJIS-UCS2-HW-V.bcmap
│           │   ├── UniJIS-UCS2-V.bcmap
│           │   ├── UniJIS-UTF16-H.bcmap
│           │   ├── UniJIS-UTF16-V.bcmap
│           │   ├── UniJIS-UTF32-H.bcmap
│           │   ├── UniJIS-UTF32-V.bcmap
│           │   ├── UniJIS-UTF8-H.bcmap
│           │   ├── UniJIS-UTF8-V.bcmap
│           │   ├── UniJIS2004-UTF16-H.bcmap
│           │   ├── UniJIS2004-UTF16-V.bcmap
│           │   ├── UniJIS2004-UTF32-H.bcmap
│           │   ├── UniJIS2004-UTF32-V.bcmap
│           │   ├── UniJIS2004-UTF8-H.bcmap
│           │   ├── UniJIS2004-UTF8-V.bcmap
│           │   ├── UniJISPro-UCS2-HW-V.bcmap
│           │   ├── UniJISPro-UCS2-V.bcmap
│           │   ├── UniJISPro-UTF8-V.bcmap
│           │   ├── UniJISX0213-UTF32-H.bcmap
│           │   ├── UniJISX0213-UTF32-V.bcmap
│           │   ├── UniJISX02132004-UTF32-H.bcmap
│           │   ├── UniJISX02132004-UTF32-V.bcmap
│           │   ├── UniKS-UCS2-H.bcmap
│           │   ├── UniKS-UCS2-V.bcmap
│           │   ├── UniKS-UTF16-H.bcmap
│           │   ├── UniKS-UTF16-V.bcmap
│           │   ├── UniKS-UTF32-H.bcmap
│           │   ├── UniKS-UTF32-V.bcmap
│           │   ├── UniKS-UTF8-H.bcmap
│           │   ├── UniKS-UTF8-V.bcmap
│           │   ├── V.bcmap
│           │   └── WP-Symbol.bcmap
│           ├── pdf.mjs
│           ├── pdf.sandbox.mjs
│           ├── pdf.worker.mjs
│           └── standard_fonts/
│               ├── FoxitDingbats.pfb
│               ├── FoxitFixed.pfb
│               ├── FoxitFixedBold.pfb
│               ├── FoxitFixedBoldItalic.pfb
│               ├── FoxitFixedItalic.pfb
│               ├── FoxitSerif.pfb
│               ├── FoxitSerifBold.pfb
│               ├── FoxitSerifBoldItalic.pfb
│               ├── FoxitSerifItalic.pfb
│               ├── FoxitSymbol.pfb
│               ├── LICENSE_FOXIT
│               └── LICENSE_LIBERATION
├── tsconfig.json
├── typedoc.json
└── vitest.config.ts
Download .txt
Showing preview only (348K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (5040 symbols across 56 files)

FILE: cli/parse.ts
  constant DEFAULT_MAX_PAGES (line 10) | const DEFAULT_MAX_PAGES = 10000;
  constant DEFAULT_DPI (line 11) | const DEFAULT_DPI = 150;
  constant DEFAULT_LANGUAGE (line 12) | const DEFAULT_LANGUAGE = "en";
  constant DEFAULT_OUTPUT_FORMAT (line 13) | const DEFAULT_OUTPUT_FORMAT = "text";
  constant DEFAULT_SCREENSHOT_FORMAT (line 14) | const DEFAULT_SCREENSHOT_FORMAT = "png";
  constant DEFAULT_SCREENSHOT_DIR (line 15) | const DEFAULT_SCREENSHOT_DIR = "./screenshots";
  type ParseCommandOptions (line 17) | interface ParseCommandOptions {
  type ScreenshotCommandOptions (line 40) | interface ScreenshotCommandOptions {
  type BatchParseCommandOptions (line 50) | interface BatchParseCommandOptions {
  constant SUPPORTED_EXTENSIONS (line 305) | const SUPPORTED_EXTENSIONS = new Set([
  function findFiles (line 509) | function findFiles(dir: string, recursive: boolean, filterExt?: string):...
  function parsePageNumbers (line 546) | function parsePageNumbers(pagesStr: string): number[] {

FILE: dataset_eval_utils/src/liteparse_eval/benchmark.py
  class BenchmarkMetrics (line 27) | class BenchmarkMetrics:
    method count (line 33) | def count(self) -> int:
    method latency_avg (line 37) | def latency_avg(self) -> float:
    method latency_median (line 41) | def latency_median(self) -> float:
    method latency_stddev (line 45) | def latency_stddev(self) -> float:
    method latency_min (line 49) | def latency_min(self) -> float:
    method latency_max (line 53) | def latency_max(self) -> float:
    method memory_avg (line 57) | def memory_avg(self) -> float:
    method memory_median (line 61) | def memory_median(self) -> float:
    method memory_stddev (line 65) | def memory_stddev(self) -> float:
    method memory_min (line 69) | def memory_min(self) -> float:
    method memory_max (line 73) | def memory_max(self) -> float:
    method to_dict (line 76) | def to_dict(self) -> dict:
  class ProviderBenchmarkResult (line 100) | class ProviderBenchmarkResult:
    method to_dict (line 108) | def to_dict(self) -> dict:
  function get_provider_instance (line 121) | def get_provider_instance(provider_name: str) -> ParserProvider:
  function benchmark_provider (line 134) | def benchmark_provider(
  function run_benchmark (line 183) | def run_benchmark(
  function main (line 277) | def main():

FILE: dataset_eval_utils/src/liteparse_eval/evaluation.py
  class LatencyMetrics (line 28) | class LatencyMetrics:
    method count (line 33) | def count(self) -> int:
    method average (line 38) | def average(self) -> float:
    method min (line 43) | def min(self) -> float:
    method max (line 48) | def max(self) -> float:
    method stddev (line 53) | def stddev(self) -> float:
    method total (line 62) | def total(self) -> float:
    method to_dict (line 66) | def to_dict(self) -> dict:
  class QAResult (line 80) | class QAResult:
  class QAEvalResult (line 89) | class QAEvalResult:
  class Benchmark (line 99) | class Benchmark:
    method __init__ (line 102) | def __init__(
    method run_qa_eval (line 120) | def run_qa_eval(
    method run_full_benchmark (line 193) | def run_full_benchmark(
    method _build_detailed_results (line 347) | def _build_detailed_results(
  function main (line 400) | def main():

FILE: dataset_eval_utils/src/liteparse_eval/processing.py
  class QAPair (line 17) | class QAPair(BaseModel):
  class PageAnnotation (line 21) | class PageAnnotation(BaseModel):
  function pdf_to_images (line 36) | def pdf_to_images(pdf_path: Path, dpi: int = 150) -> List[Path]:
  function encode_image (line 53) | def encode_image(image_path: Path) -> tuple[str, str]:
  function analyze_image_with_claude (line 80) | def analyze_image_with_claude(
  function process_file (line 138) | def process_file(
  function find_documents (line 196) | def find_documents(input_dir: Path) -> List[Path]:
  function main (line 216) | def main():

FILE: dataset_eval_utils/src/liteparse_eval/providers/llm/anthropic.py
  class AnthropicProvider (line 6) | class AnthropicProvider(LLMProvider):
    method __init__ (line 11) | def __init__(self, api_key: str = None, model: str = "claude-sonnet-4-...
    method answer_question (line 23) | def answer_question(self, ocr_text: str, question: str) -> str:
    method evaluate_answer (line 43) | def evaluate_answer(self, question: str, expected_answer: str, predict...

FILE: dataset_eval_utils/src/liteparse_eval/providers/llm/base.py
  class LLMProvider (line 22) | class LLMProvider(ABC):
    method answer_question (line 26) | def answer_question(self, image_path: Path, question: str) -> str:
    method evaluate_answer (line 40) | def evaluate_answer(self, question: str, expected_answer: str, predict...

FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/base.py
  class ParserProvider (line 5) | class ParserProvider(ABC):
    method extract_text (line 9) | def extract_text(self, file_path: Path) -> str:
    method extract_text_batch (line 21) | def extract_text_batch(self, file_paths: list[Path]) -> dict[Path, str]:

FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/liteparse.py
  class LiteparseProvider (line 9) | class LiteparseProvider(ParserProvider):
    method __init__ (line 16) | def __init__(
    method extract_text (line 52) | def extract_text(self, file_path: Path) -> str:

FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/markitdown.py
  class MarkItDownProvider (line 8) | class MarkItDownProvider(ParserProvider):
    method __init__ (line 15) | def __init__(self, config: dict | None = None):
    method extract_text (line 25) | def extract_text(self, file_path: Path) -> str:

FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/pymupdf.py
  class PyMuPDFProvider (line 8) | class PyMuPDFProvider(ParserProvider):
    method __init__ (line 15) | def __init__(self):
    method extract_text (line 19) | def extract_text(self, file_path: Path) -> str:

FILE: dataset_eval_utils/src/liteparse_eval/providers/parsers/pypdf.py
  class PyPDFProvider (line 8) | class PyPDFProvider(ParserProvider):
    method __init__ (line 15) | def __init__(self, config: dict | None = None):
    method extract_text (line 24) | def extract_text(self, file_path: Path) -> str:

FILE: dataset_eval_utils/src/liteparse_eval/report.py
  class HTMLReportGenerator (line 18) | class HTMLReportGenerator:
    method __init__ (line 21) | def __init__(self, detailed_results: dict, ground_truth_dir: Path):
    method generate_report (line 33) | def generate_report(self, output_path: Path) -> None:
    method _build_html (line 43) | def _build_html(self) -> str:
    method _generate_css (line 67) | def _generate_css(self) -> str:
    method _generate_summary_html (line 313) | def _generate_summary_html(self) -> str:
    method _generate_navigation_html (line 378) | def _generate_navigation_html(self) -> str:
    method _generate_all_documents_html (line 411) | def _generate_all_documents_html(self) -> str:
    method _generate_document_html (line 421) | def _generate_document_html(self, doc: dict, index: int) -> str:
    method _generate_pdf_preview_html (line 461) | def _generate_pdf_preview_html(self, pdf_path: Path) -> str:
    method _generate_qa_html (line 469) | def _generate_qa_html(self, qa_eval: dict) -> str:
    method _pdf_to_base64_image (line 534) | def _pdf_to_base64_image(self, pdf_path: Path, dpi: int = 72) -> str:
    method _generate_vscode_link (line 588) | def _generate_vscode_link(self, pdf_path: str) -> str:
    method _get_metric_class (line 611) | def _get_metric_class(self, score: float) -> str:

FILE: ocr/easyocr/server.py
  class OcrResponse (line 15) | class OcrResponse(BaseModel):
  class StatusResponse (line 19) | class StatusResponse(BaseModel):
  class EasyOCRServer (line 23) | class EasyOCRServer:
    method __init__ (line 24) | def __init__(self) -> None:
    method _create_ocr_server (line 28) | def _create_ocr_server(self) -> FastAPI:
    method serve (line 81) | def serve(self) -> None:

FILE: ocr/easyocr/test_server.py
  function server (line 12) | def server() -> EasyOCRServer:
  class MockEasyOCRReader (line 16) | class MockEasyOCRReader:
    method __init__ (line 17) | def __init__(self, *args, **kwargs) -> None:
    method readtext (line 29) | def readtext(self, *args, **kwargs) -> list[Any]:
  function test_server_init (line 33) | def test_server_init(server: EasyOCRServer) -> None:
  function test_server_health_endpoint (line 38) | def test_server_health_endpoint(server: EasyOCRServer) -> None:
  function test_server_ocr_endpoint (line 46) | def test_server_ocr_endpoint(server: EasyOCRServer) -> None:

FILE: ocr/paddleocr/server.py
  class OcrResponse (line 15) | class OcrResponse(BaseModel):
  class StatusResponse (line 19) | class StatusResponse(BaseModel):
  class PaddleOCRServer (line 23) | class PaddleOCRServer:
    method __init__ (line 24) | def __init__(self) -> None:
    method normalize_language (line 34) | def normalize_language(language: str) -> str:
    method _create_ocr_server (line 47) | def _create_ocr_server(
    method serve (line 151) | def serve(self) -> None:

FILE: ocr/paddleocr/test_server.py
  function server (line 14) | def server() -> PaddleOCRServer:
  class MockPaddleOcr (line 18) | class MockPaddleOcr:
    method __init__ (line 19) | def __init__(self, *args, **kwargs) -> None:
    method predict (line 39) | def predict(self, *args, **kwargs) -> list[Any]:
  function test_server_init (line 43) | def test_server_init(server: PaddleOCRServer) -> None:
  function test_server_health_endpoint (line 48) | def test_server_health_endpoint(server: PaddleOCRServer) -> None:
  function test_server_ocr_endpoint (line 56) | def test_server_ocr_endpoint(server: PaddleOCRServer) -> None:
  function test_server_normalizes_documented_language_aliases (line 77) | def test_server_normalizes_documented_language_aliases(

FILE: packages/python/liteparse/parser.py
  function _find_cli (line 28) | def _find_cli(install_if_not_available: bool) -> str:
  function _parse_json_result (line 83) | def _parse_json_result(json_data: dict) -> ParseResult:
  function _build_parse_cli_args (line 137) | def _build_parse_cli_args(
  function _build_batch_cli_args (line 182) | def _build_batch_cli_args(
  class LiteParse (line 227) | class LiteParse:
    method __init__ (line 241) | def __init__(
    method cli_path (line 255) | def cli_path(self) -> str:
    method _prepare_command (line 261) | def _prepare_command(
    method _extract_path_and_bytes (line 276) | def _extract_path_and_bytes(
    method _extract_batch_params (line 291) | def _extract_batch_params(
    method _extract_screenshot_params (line 308) | def _extract_screenshot_params(
    method _get_parse_result (line 329) | def _get_parse_result(
    method _get_screenshot_result (line 346) | def _get_screenshot_result(
    method parse (line 388) | def parse(
    method parse_async (line 465) | async def parse_async(
    method batch_parse (line 542) | def batch_parse(
    method batch_parse_async (line 626) | async def batch_parse_async(
    method screenshot (line 707) | def screenshot(
    method screenshot_async (line 784) | async def screenshot_async(
    method __repr__ (line 857) | def __repr__(self) -> str:

FILE: packages/python/liteparse/types.py
  class OutputFormat (line 8) | class OutputFormat(str, Enum):
  class ImageFormat (line 14) | class ImageFormat(str, Enum):
  class BoundingBox (line 21) | class BoundingBox:
  class TextItem (line 30) | class TextItem:
  class ParsedPage (line 43) | class ParsedPage:
  class ParseResult (line 54) | class ParseResult:
    method num_pages (line 61) | def num_pages(self) -> int:
    method get_page (line 65) | def get_page(self, page_num: int) -> Optional[ParsedPage]:
  class BatchResult (line 74) | class BatchResult:
  class ScreenshotResult (line 80) | class ScreenshotResult:
  class ScreenshotBatchResult (line 88) | class ScreenshotBatchResult:
    method __len__ (line 93) | def __len__(self) -> int:
    method __iter__ (line 96) | def __iter__(self) -> Iterator[ScreenshotResult]:
    method get_page (line 99) | def get_page(self, page_num: int) -> Optional[ScreenshotResult]:
  class ParseError (line 107) | class ParseError(Exception):
    method __init__ (line 109) | def __init__(self, message: str, stderr: Optional[str] = None):
  class CLINotFoundError (line 114) | class CLINotFoundError(Exception):

FILE: packages/python/tests/conftest.py
  function parser (line 16) | def parser() -> LiteParse:
  function invoice_pdf (line 22) | def invoice_pdf() -> Path:
  function two_page_pdf (line 31) | def two_page_pdf() -> Path:
  function empty_pdf (line 40) | def empty_pdf() -> Path:

FILE: packages/python/tests/test_batch_e2e.py
  class TestBatchParseBasic (line 15) | class TestBatchParseBasic:
    method test_batch_parse_returns_batch_result (line 18) | def test_batch_parse_returns_batch_result(
    method test_batch_parse_creates_output_files (line 32) | def test_batch_parse_creates_output_files(
    method test_batch_parse_json_format (line 48) | def test_batch_parse_json_format(self, parser: LiteParse, invoice_pdf:...
    method test_batch_parse_async (line 62) | async def test_batch_parse_async(self, parser: LiteParse, invoice_pdf:...
  class TestBatchParseErrors (line 76) | class TestBatchParseErrors:
    method test_input_dir_not_found (line 79) | def test_input_dir_not_found(self, parser: LiteParse):
    method test_input_dir_not_found_async (line 85) | async def test_input_dir_not_found_async(self, parser: LiteParse):

FILE: packages/python/tests/test_parse_e2e.py
  class TestParseBasic (line 17) | class TestParseBasic:
    method test_parse_returns_parse_result (line 20) | def test_parse_returns_parse_result(self, parser: LiteParse, invoice_p...
    method test_parse_result_has_pages (line 24) | def test_parse_result_has_pages(self, parser: LiteParse, invoice_pdf: ...
    method test_parse_result_has_text (line 29) | def test_parse_result_has_text(self, parser: LiteParse, invoice_pdf: P...
    method test_parse_result_has_json (line 34) | def test_parse_result_has_json(self, parser: LiteParse, invoice_pdf: P...
    method test_parse_bytest_input (line 39) | def test_parse_bytest_input(self, parser: LiteParse, invoice_pdf: Path):
    method test_parse_async (line 46) | async def test_parse_async(self, parser: LiteParse, invoice_pdf: Path):
    method test_parse_async_bytes_input (line 52) | async def test_parse_async_bytes_input(self, parser: LiteParse, invoic...
  class TestParsedPageStructure (line 59) | class TestParsedPageStructure:
    method test_page_fields (line 62) | def test_page_fields(self, parser: LiteParse, invoice_pdf: Path):
    method test_page_has_text_items (line 74) | def test_page_has_text_items(self, parser: LiteParse, invoice_pdf: Path):
    method test_text_item_fields (line 79) | def test_text_item_fields(self, parser: LiteParse, invoice_pdf: Path):
    method test_bounding_box_fields (line 94) | def test_bounding_box_fields(self, parser: LiteParse, invoice_pdf: Path):
  class TestParseOptions (line 106) | class TestParseOptions:
    method test_target_pages (line 109) | def test_target_pages(self, parser: LiteParse, invoice_pdf: Path):
    method test_max_pages (line 115) | def test_max_pages(self, parser: LiteParse, invoice_pdf: Path):
    method test_no_precise_bbox (line 119) | def test_no_precise_bbox(self, parser: LiteParse, invoice_pdf: Path):
    method test_get_page_helper (line 128) | def test_get_page_helper(self, parser: LiteParse, invoice_pdf: Path):
    method test_multi_page_text_joined (line 135) | def test_multi_page_text_joined(self, parser: LiteParse, invoice_pdf: ...
  class TestParseErrors (line 143) | class TestParseErrors:
    method test_file_not_found (line 146) | def test_file_not_found(self, parser: LiteParse):
    method test_cli_not_found (line 150) | def test_cli_not_found(self):
    method test_timeout (line 156) | def test_timeout(self, parser: LiteParse, invoice_pdf: Path):
    method test_file_not_found_async (line 162) | async def test_file_not_found_async(self, parser: LiteParse):
    method test_cli_not_found_async (line 167) | async def test_cli_not_found_async(self):
    method test_timeout_async (line 174) | async def test_timeout_async(self, parser: LiteParse, invoice_pdf: Path):

FILE: packages/python/tests/test_screenshot_e2e.py
  class TestScreenshotBasic (line 16) | class TestScreenshotBasic:
    method test_screenshot_returns_batch_result (line 19) | def test_screenshot_returns_batch_result(
    method test_screenshot_has_screenshots (line 25) | def test_screenshot_has_screenshots(self, parser: LiteParse, invoice_p...
    method test_screenshot_result_fields (line 30) | def test_screenshot_result_fields(self, parser: LiteParse, invoice_pdf...
    method test_screenshot_output_dir (line 39) | def test_screenshot_output_dir(self, parser: LiteParse, invoice_pdf: P...
    method test_screenshot_png_format (line 47) | def test_screenshot_png_format(self, parser: LiteParse, invoice_pdf: P...
    method test_screenshot_jpg_format (line 52) | def test_screenshot_jpg_format(self, parser: LiteParse, invoice_pdf: P...
    method test_screensho_async_basic (line 58) | async def test_screensho_async_basic(self, parser: LiteParse, invoice_...
  class TestScreenshotOptions (line 64) | class TestScreenshotOptions:
    method test_target_pages (line 67) | def test_target_pages(self, parser: LiteParse, invoice_pdf: Path):
    method test_load_bytes (line 73) | def test_load_bytes(self, parser: LiteParse, invoice_pdf: Path):
    method test_no_load_bytes (line 79) | def test_no_load_bytes(self, parser: LiteParse, invoice_pdf: Path):
    method test_get_page_helper (line 84) | def test_get_page_helper(self, parser: LiteParse, invoice_pdf: Path):
    method test_iterable (line 91) | def test_iterable(self, parser: LiteParse, invoice_pdf: Path):
  class TestScreenshotErrors (line 97) | class TestScreenshotErrors:
    method test_file_not_found (line 100) | def test_file_not_found(self, parser: LiteParse):

FILE: scripts/compare-dataset.ts
  constant DEFAULT_DATASET_DIR (line 24) | const DEFAULT_DATASET_DIR = path.join(import.meta.dirname, "..", "datase...
  type DatasetRow (line 26) | interface DatasetRow {
  type DiffResult (line 34) | interface DiffResult {
  function loadDataset (line 43) | async function loadDataset(datasetDir: string): Promise<Map<string, Data...
  function normalizeForComparison (line 64) | function normalizeForComparison(text: string): string {
  type DiffHunk (line 72) | interface DiffHunk {
  function computeTextDiff (line 77) | function computeTextDiff(expected: string, actual: string, maxHunks = 5,...
  function getCurrentOutput (line 179) | async function getCurrentOutput(
  function main (line 207) | async function main() {

FILE: scripts/create-dataset.ts
  constant DEFAULT_SOURCE_DIR (line 26) | const DEFAULT_SOURCE_DIR = path.join(import.meta.dirname, "..", "e2e-tes...
  constant DEFAULT_OUTPUT_DIR (line 27) | const DEFAULT_OUTPUT_DIR = path.join(import.meta.dirname, "..", "dataset");
  type DatasetRow (line 29) | interface DatasetRow {
  function findFiles (line 37) | async function findFiles(dir: string, baseDir: string = dir): Promise<st...
  function processFile (line 56) | async function processFile(filePath: string, baseDocDir: string): Promis...
  function main (line 117) | async function main() {

FILE: scripts/upload-dataset.ts
  constant DEFAULT_OUTPUT_DIR (line 23) | const DEFAULT_OUTPUT_DIR = path.join(import.meta.dirname, "..", "dataset");
  constant DEFAULT_REPO (line 24) | const DEFAULT_REPO = "llamaindex/liteparse_cicd_data";
  function main (line 26) | async function main() {

FILE: src/conversion/convertToPdf.test.ts
  type SpawnPlan (line 5) | interface SpawnPlan {
  class MockEmitter (line 57) | class MockEmitter {
    method on (line 60) | on(event: string, cb: (...args: unknown[]) => void) {
    method emit (line 67) | emit(event: string, ...args: unknown[]) {
  function enqueuePlan (line 76) | function enqueuePlan(plan: SpawnPlan) {
  function enqueueSpawnPlans (line 114) | function enqueueSpawnPlans(...plans: SpawnPlan[]): void {
  function enqueueMissingCommandLookups (line 120) | function enqueueMissingCommandLookups(count: number): void {
  function enqueueLibreOfficeLookup (line 126) | function enqueueLibreOfficeLookup(): void {
  function enqueueImageMagickLookup (line 130) | function enqueueImageMagickLookup(): void {

FILE: src/conversion/convertToPdf.ts
  function getTmpDir (line 11) | function getTmpDir(): string {
  type ConversionResult (line 15) | interface ConversionResult {
  type ConversionError (line 20) | interface ConversionError {
  type ConversionPassthrough (line 25) | interface ConversionPassthrough {
  type ResolvedCommand (line 29) | interface ResolvedCommand {
  function guessFileExtension (line 88) | async function guessFileExtension(filePath: string): Promise<string | nu...
  function executeCommand (line 105) | async function executeCommand(command: string, args: string[], timeoutMs...
  function executePowerShell (line 144) | async function executePowerShell(command: string, timeoutMs = 60000) {
  function getResolvedPathFromOutput (line 148) | function getResolvedPathFromOutput(output: string, useLastLine = false):...
  function resolveCommandPath (line 164) | async function resolveCommandPath(command: string): Promise<string | nul...
  function isCommandAvailable (line 184) | async function isCommandAvailable(command: string): Promise<boolean> {
  function isCommandAvailableWindows (line 193) | async function isCommandAvailableWindows(command: string): Promise<boole...
  function isPathExecutable (line 205) | async function isPathExecutable(filePath: string): Promise<boolean> {
  function isWindowsSystemConvert (line 214) | function isWindowsSystemConvert(filePath: string): boolean {
  function isImageMagickBinary (line 222) | async function isImageMagickBinary(executablePath: string, args: string[...
  function resolveImageMagickCommand (line 231) | async function resolveImageMagickCommand(
  function findLibreOfficeCommand (line 257) | async function findLibreOfficeCommand(): Promise<string | null> {
  function findImageMagickCommand (line 297) | async function findImageMagickCommand(): Promise<ResolvedCommand | null> {
  function convertOfficeDocument (line 306) | async function convertOfficeDocument(
  function convertImageToPdf (line 349) | async function convertImageToPdf(filePath: string, outputDir: string): P...
  function convertToPdf (line 411) | async function convertToPdf(
  function cleanupConversionFiles (line 475) | async function cleanupConversionFiles(pdfPath: string): Promise<void> {
  function guessExtensionFromBuffer (line 490) | async function guessExtensionFromBuffer(data: Buffer | Uint8Array): Prom...
  function convertBufferToPdf (line 502) | async function convertBufferToPdf(

FILE: src/core/config.ts
  constant DEFAULT_CONFIG (line 3) | const DEFAULT_CONFIG: LiteParseConfig = {
  function mergeConfig (line 26) | function mergeConfig(userConfig: Partial<LiteParseConfig>): LiteParseCon...

FILE: src/core/parser.test.ts
  method constructor (line 369) | constructor() {}
  method constructor (line 389) | constructor() {}
  method constructor (line 407) | constructor(url: string) {
  method constructor (line 457) | constructor() {}

FILE: src/core/parser.ts
  class LiteParse (line 58) | class LiteParse {
    method constructor (line 68) | constructor(userConfig: Partial<LiteParseConfig> = {}) {
    method parse (line 100) | async parse(input: LiteParseInput, quiet = false): Promise<ParseResult> {
    method screenshot (line 235) | async screenshot(
    method runOCR (line 339) | private async runOCR(
    method processPageOcr (line 356) | private async processPageOcr(
    method getConfig (line 492) | getConfig(): LiteParseConfig {

FILE: src/core/types.ts
  type OutputFormat (line 9) | type OutputFormat = "json" | "text";
  type LiteParseInput (line 18) | type LiteParseInput = string | Buffer | Uint8Array;
  type LiteParseConfig (line 36) | interface LiteParseConfig {
  type TextItem (line 169) | interface TextItem {
  type MarkupData (line 207) | interface MarkupData {
  type ProjectionTextBox (line 218) | interface ProjectionTextBox {
  type Coordinates (line 252) | interface Coordinates {
  type OcrData (line 263) | interface OcrData {
  type BoundingBox (line 281) | interface BoundingBox {
  type ParsedPage (line 295) | interface ParsedPage {
  type JsonTextItem (line 316) | interface JsonTextItem {
  type SearchItemsOptions (line 338) | interface SearchItemsOptions {
  type ParseResultJson (line 353) | interface ParseResultJson {
  type ParseResult (line 376) | interface ParseResult {
  type ScreenshotResult (line 388) | interface ScreenshotResult {

FILE: src/engines/ocr/http-simple.ts
  type HttpOcrResponseItem (line 6) | interface HttpOcrResponseItem {
  class HttpOcrEngine (line 22) | class HttpOcrEngine implements OcrEngine {
    method constructor (line 26) | constructor(serverUrl: string) {
    method recognize (line 30) | async recognize(image: string | Buffer, options: OcrOptions): Promise<...
    method recognizeBatch (line 77) | async recognizeBatch(images: (string | Buffer)[], options: OcrOptions)...

FILE: src/engines/ocr/interface.ts
  type OcrEngine (line 1) | interface OcrEngine {
  type OcrOptions (line 7) | interface OcrOptions {
  type OcrResult (line 12) | interface OcrResult {

FILE: src/engines/ocr/tesseract.ts
  class TesseractEngine (line 4) | class TesseractEngine implements OcrEngine {
    method constructor (line 12) | constructor(concurrency: number = 4, tessdataPath?: string) {
    method initialize (line 18) | async initialize(language: string = "eng"): Promise<void> {
    method recognize (line 101) | async recognize(image: string | Buffer, options: OcrOptions): Promise<...
    method recognizeBatch (line 157) | async recognizeBatch(images: (string | Buffer)[], options: OcrOptions)...
    method terminate (line 176) | async terminate(): Promise<void> {
    method normalizeLanguage (line 189) | private normalizeLanguage(lang: string): string {

FILE: src/engines/pdf/interface.ts
  type ExtractOptions (line 4) | interface ExtractOptions {
  type PdfEngine (line 9) | interface PdfEngine {
  type PdfDocument (line 28) | interface PdfDocument {
  type BoundingBox (line 35) | interface BoundingBox {
  type PageData (line 42) | interface PageData {
  type Path (line 53) | interface Path {
  type Image (line 60) | interface Image {
  type EasyOcrResultLine (line 82) | type EasyOcrResultLine = [
  type Annotation (line 88) | interface Annotation {

FILE: src/engines/pdf/pdfium-renderer.test.ts
  method constructor (line 51) | constructor() {}
  method init (line 53) | static init() {

FILE: src/engines/pdf/pdfium-renderer.ts
  type PdfiumWasmModule (line 10) | interface PdfiumWasmModule {
  constant MIN_IMAGE_SIZE_PT (line 26) | const MIN_IMAGE_SIZE_PT = 25;
  constant MAX_IMAGE_PAGE_COVERAGE (line 28) | const MAX_IMAGE_PAGE_COVERAGE = 0.9;
  type PdfiumPageInternal (line 30) | interface PdfiumPageInternal {
  class PdfiumRenderer (line 40) | class PdfiumRenderer {
    method init (line 44) | async init(): Promise<void> {
    method loadDocument (line 55) | async loadDocument(pdfInput: string | Buffer | Uint8Array, password?: ...
    method closeDocument (line 63) | closeDocument(): void {
    method getOrLoadDocument (line 70) | private async getOrLoadDocument(
    method renderPageToBuffer (line 84) | async renderPageToBuffer(
    method extractImageBounds (line 129) | async extractImageBounds(
    method close (line 194) | async close(): Promise<void> {

FILE: src/engines/pdf/pdfjs.test.ts
  method constructor (line 49) | constructor() {}
  function getTestData (line 62) | function getTestData() {
  function getExpectedResults (line 107) | function getExpectedResults() {

FILE: src/engines/pdf/pdfjs.ts
  type PdfJsDocument (line 16) | interface PdfJsDocument {
  type PdfJsPage (line 24) | interface PdfJsPage {
  type PdfJsViewport (line 31) | interface PdfJsViewport {
  type PdfJsTextContent (line 38) | interface PdfJsTextContent {
  type PdfJsTextItem (line 43) | interface PdfJsTextItem {
  type PdfJsExtendedDocument (line 52) | interface PdfJsExtendedDocument extends PdfDocument {
  constant CMAP_URL (line 59) | const CMAP_URL = `${PDFJS_DIR}/cmaps/`;
  constant STANDARD_FONT_DATA_URL (line 60) | const STANDARD_FONT_DATA_URL = `${PDFJS_DIR}/standard_fonts/`;
  constant CMAP_PACKED (line 61) | const CMAP_PACKED = true;
  function getRotation (line 67) | function getRotation(transform: number[]): number {
  function multiplyMatrices (line 74) | function multiplyMatrices(m1: number[], m2: number[]): number[] {
  function applyTransformation (line 88) | function applyTransformation(
  constant BUGGY_FONT_MARKER_CHECK (line 99) | const BUGGY_FONT_MARKER_CHECK = ":->|>";
  constant PIPE_PATTERN_REGEX (line 100) | const PIPE_PATTERN_REGEX = /\s*\|([^|])\|\s*/g;
  constant ADOBE_GLYPH_MAP (line 118) | const ADOBE_GLYPH_MAP: Record<string, string> = {
  function resolveGlyphName (line 390) | function resolveGlyphName(glyphName: string): string | null {
  constant BUGGY_FONT_MARKER_RE (line 424) | const BUGGY_FONT_MARKER_RE = /:->\|>_(\d+)_\d+@([^@]*)@<\|<-:/g;
  function decodeBuggyFontMarkers (line 426) | function decodeBuggyFontMarkers(str: string): string {
  constant WINDOWS_1252_TO_UNICODE (line 455) | const WINDOWS_1252_TO_UNICODE: Record<number, string> = {
  constant LIGATURE_MAP (line 490) | const LIGATURE_MAP: Record<string, string> = {
  function stripControlChars (line 505) | function stripControlChars(str: string): string {
  function isGarbledFontOutput (line 549) | function isGarbledFontOutput(str: string): boolean {
  class PdfJsEngine (line 659) | class PdfJsEngine implements PdfEngine {
    method loadDocument (line 665) | async loadDocument(input: string | Uint8Array, password?: string): Pro...
    method extractPage (line 718) | async extractPage(
    method extractAllPages (line 846) | async extractAllPages(
    method renderPageImage (line 875) | async renderPageImage(
    method close (line 894) | async close(doc: PdfDocument): Promise<void> {
    method parseTargetPages (line 909) | private parseTargetPages(targetPages: string, maxPages: number): numbe...

FILE: src/engines/pdf/pdfjsImporter.ts
  function importPdfJs (line 4) | async function importPdfJs() {

FILE: src/output/json.ts
  function buildJSON (line 6) | function buildJSON(pages: ParsedPage[]): ParseResultJson {
  function formatJSON (line 31) | function formatJSON(result: ParseResult): string {

FILE: src/output/text.ts
  function formatText (line 6) | function formatText(result: ParseResult): string {
  function formatPageText (line 18) | function formatPageText(page: ParsedPage): string {

FILE: src/processing/bbox.ts
  constant OCR_CONFIDENCE_THRESHOLD (line 12) | const OCR_CONFIDENCE_THRESHOLD = 0.1;
  constant OCR_OVERLAP_THRESHOLD (line 20) | const OCR_OVERLAP_THRESHOLD = 0.5;
  constant MAX_IMAGES_PER_PAGE (line 26) | const MAX_IMAGES_PER_PAGE = 10;
  constant MIN_IMAGE_DIMENSION (line 31) | const MIN_IMAGE_DIMENSION = 12;
  constant MIN_IMAGE_AREA (line 32) | const MIN_IMAGE_AREA = 200;
  constant MIN_RENDERED_DIMENSION (line 37) | const MIN_RENDERED_DIMENSION = 6;
  constant MIN_RENDERED_AREA (line 38) | const MIN_RENDERED_AREA = 200;
  function filterImagesForOCR (line 44) | function filterImagesForOCR(
  function getOverlapArea (line 111) | function getOverlapArea(
  function filterOcrBlocksOverlappingWithText (line 136) | function filterOcrBlocksOverlappingWithText(
  function buildBbox (line 196) | function buildBbox(pageData: PageData, config: LiteParseConfig): Project...
  function buildBoundingBoxes (line 316) | function buildBoundingBoxes(textItems: TextItem[]): BoundingBox[] {

FILE: src/processing/cleanText.ts
  function detectAndRemoveMarginOnPage (line 11) | function detectAndRemoveMarginOnPage(page: ParsedPage): void {
  function detectAndRemoveMargin (line 70) | function detectAndRemoveMargin(pages: ParsedPage[]): void {
  function cleanRawText (line 79) | function cleanRawText(pages: ParsedPage[], _config: LiteParseConfig): vo...

FILE: src/processing/grid.ts
  function projectPagesToGrid (line 9) | async function projectPagesToGrid(

FILE: src/processing/gridDebugLogger.ts
  type GridDebugConfig (line 13) | interface GridDebugConfig {
  constant DEFAULT_DEBUG_CONFIG (line 62) | const DEFAULT_DEBUG_CONFIG: GridDebugConfig = {
  type LogEntry (line 66) | type LogEntry = {
  type RenderedSegment (line 76) | interface RenderedSegment {
  type VisualizerPageData (line 85) | interface VisualizerPageData {
  class GridDebugLogger (line 109) | class GridDebugLogger {
    method constructor (line 119) | constructor(config: GridDebugConfig) {
    method enabled (line 129) | get enabled(): boolean {
    method shouldVisualize (line 133) | get shouldVisualize(): boolean {
    method visualizerPages (line 137) | get visualizerPages(): VisualizerPageData[] {
    method debugConfig (line 141) | get debugConfig(): GridDebugConfig {
    method setPage (line 145) | setPage(pageNum: number, _width: number, _height: number): void {
    method isPageFiltered (line 162) | private isPageFiltered(): boolean {
    method matchesBbox (line 167) | matchesBbox(bbox: ProjectionTextBox, lineIndex?: number): boolean {
    method logBlock (line 195) | logBlock(blockIndex: number, start: number, end: number): void {
    method logFlowingBlock (line 204) | logFlowingBlock(start: number, end: number): void {
    method logStructuredBlock (line 213) | logStructuredBlock(_start: number, _end: number): void {}
    method logFlowingLine (line 216) | logFlowingLine(lineIndex: number, reason: string): void {
    method logAnchors (line 227) | logAnchors(
    method logSnapAssignment (line 283) | logSnapAssignment(bbox: ProjectionTextBox, lineIndex: number, boxIndex...
    method captureRender (line 302) | captureRender(
    method captureRawLines (line 313) | captureRawLines(rawLines: string[]): void {
    method logRender (line 322) | logRender(bbox: ProjectionTextBox, lineIndex: number, targetX: number,...
    method logForwardAnchor (line 338) | logForwardAnchor(
    method logDuplicateResolution (line 353) | logDuplicateResolution(bbox: ProjectionTextBox, resolvedTo: string): v...
    method logLineComposition (line 363) | logLineComposition(lineIndex: number, line: ProjectionTextBox[]): void {
    method emit (line 378) | private emit(entry: LogEntry): void {
    method formatEntries (line 394) | private formatEntries(): string {
    method flushSync (line 414) | flushSync(): void {
    method flush (line 422) | async flush(): Promise<void> {
  class NoopGridDebugLogger (line 431) | class NoopGridDebugLogger extends GridDebugLogger {
    method constructor (line 432) | constructor() {
    method enabled (line 435) | override get enabled(): boolean {
    method shouldVisualize (line 438) | override get shouldVisualize(): boolean {
    method matchesBbox (line 441) | override matchesBbox(): boolean {
    method setPage (line 444) | override setPage(): void {}
    method logBlock (line 445) | override logBlock(): void {}
    method logFlowingBlock (line 446) | override logFlowingBlock(): void {}
    method logStructuredBlock (line 447) | override logStructuredBlock(): void {}
    method logFlowingLine (line 448) | override logFlowingLine(): void {}
    method logAnchors (line 449) | override logAnchors(): void {}
    method logSnapAssignment (line 450) | override logSnapAssignment(): void {}
    method captureRender (line 451) | override captureRender(): void {}
    method captureRawLines (line 452) | override captureRawLines(): void {}
    method logRender (line 453) | override logRender(): void {}
    method logForwardAnchor (line 454) | override logForwardAnchor(): void {}
    method logDuplicateResolution (line 455) | override logDuplicateResolution(): void {}
    method logLineComposition (line 456) | override logLineComposition(): void {}
    method flushSync (line 457) | override flushSync(): void {}
    method flush (line 458) | override async flush(): Promise<void> {}
  constant NOOP_LOGGER (line 462) | const NOOP_LOGGER = new NoopGridDebugLogger();
  function createGridDebugLogger (line 464) | function createGridDebugLogger(config?: GridDebugConfig): GridDebugLogger {
  function round2 (line 469) | function round2(n: number): number {

FILE: src/processing/gridProjection.ts
  constant FLOATING_SPACES (line 12) | const FLOATING_SPACES = 2;
  constant COLUMN_SPACES (line 14) | const COLUMN_SPACES = 4;
  constant FLOWING_MAX_TOTAL_ANCHORS (line 18) | const FLOWING_MAX_TOTAL_ANCHORS = 4;
  constant FLOWING_MAX_LEFT_ANCHORS (line 20) | const FLOWING_MAX_LEFT_ANCHORS = 3;
  constant FLOWING_MIN_LINES (line 22) | const FLOWING_MIN_LINES = 3;
  constant FLOWING_WIDE_LINE_RATIO (line 24) | const FLOWING_WIDE_LINE_RATIO = 0.5;
  constant FLOWING_WIDE_LINE_THRESHOLD (line 26) | const FLOWING_WIDE_LINE_THRESHOLD = 0.6;
  constant FLOWING_COLUMN_GAP_MULTIPLIER (line 28) | const FLOWING_COLUMN_GAP_MULTIPLIER = 4;
  constant FLOWING_MIN_LINE_ITEMS (line 30) | const FLOWING_MIN_LINE_ITEMS = 3;
  constant FLOWING_SPACE_HEIGHT_RATIO (line 32) | const FLOWING_SPACE_HEIGHT_RATIO = 0.15;
  constant FLOWING_SPACE_MIN_THRESHOLD (line 34) | const FLOWING_SPACE_MIN_THRESHOLD = 0.3;
  constant FLOWING_MAX_INDENT (line 36) | const FLOWING_MAX_INDENT = 8;
  type Snap (line 38) | type Snap = {
  type Anchor (line 44) | type Anchor = {
  type Anchors (line 48) | type Anchors = {
  type ForwardAnchor (line 54) | type ForwardAnchor = { [key: string]: number };
  type PrevAnchors (line 56) | type PrevAnchors = {
  type PageForwardAnchors (line 62) | type PageForwardAnchors = {
  type SnapMaps (line 69) | type SnapMaps = {
  type LineRange (line 76) | type LineRange = {
  type TextBoxSize (line 81) | type TextBoxSize = {
  type LineMetrics (line 86) | type LineMetrics = {
  function roundAnchor (line 92) | function roundAnchor(anchor: number): number {
  function getRepresentativeLineMetrics (line 97) | function getRepresentativeLineMetrics(
  constant SMALL_FONT_SIZE_THRESHOLD (line 116) | const SMALL_FONT_SIZE_THRESHOLD = 2;
  function isSmallTextLine (line 118) | function isSmallTextLine(line: ProjectionTextBox[]): boolean {
  function filterUnprojectableText (line 127) | function filterUnprojectableText(
  function canSnapLine (line 144) | function canSnapLine(config: LiteParseConfig, line: ProjectionTextBox[])...
  function fixSparseBlocks (line 161) | function fixSparseBlocks(blocks: LineRange[], rawLines: string[]) {
  function extractAnchorsPointsFromLines (line 191) | function extractAnchorsPointsFromLines(lines: ProjectionTextBox[][], pag...
  function handleRotationReadingOrder (line 593) | function handleRotationReadingOrder(textBbox: ProjectionTextBox[], pageH...
  function bboxToLine (line 765) | function bboxToLine(
  function canRenderBbox (line 1085) | function canRenderBbox(line: ProjectionTextBox[], bbox: ProjectionTextBo...
  function updateForwardAnchorRightBound (line 1097) | function updateForwardAnchorRightBound(
  function updateForwardAnchors (line 1127) | function updateForwardAnchors(
  function lineMaxGap (line 1153) | function lineMaxGap(line: ProjectionTextBox[]): number {
  function renderLineAsFlowingText (line 1166) | function renderLineAsFlowingText(
  function isFlowingTextBlock (line 1201) | function isFlowingTextBlock(
  function renderFlowingBlock (line 1240) | function renderFlowingBlock(
  function getMedianTextBoxSize (line 1266) | function getMedianTextBoxSize(lines: ProjectionTextBox[]): TextBoxSize {
  function projectToGrid (line 1287) | function projectToGrid(
  function projectPagesToGrid (line 2002) | async function projectPagesToGrid(

FILE: src/processing/gridVisualizer.ts
  constant CHAR_WIDTH (line 6) | const CHAR_WIDTH = 7;
  constant CHAR_HEIGHT (line 7) | const CHAR_HEIGHT = 14;
  constant LINE_HEIGHT (line 8) | const LINE_HEIGHT = 16;
  constant PADDING (line 9) | const PADDING = 12;
  constant LEGEND_WIDTH (line 10) | const LEGEND_WIDTH = 140;
  constant LEGEND_HEIGHT (line 11) | const LEGEND_HEIGHT = 100;
  constant COLORS (line 14) | const COLORS: Record<RenderedSegment["snap"], { bg: string; text: string...
  function escapeXml (line 22) | function escapeXml(s: string): string {
  function renderGridVisualization (line 38) | async function renderGridVisualization(
  function renderAllVisualizations (line 197) | async function renderAllVisualizations(

FILE: src/processing/markupUtils.ts
  function applyMarkupTags (line 7) | function applyMarkupTags(markup: MarkupData, text: string): string {

FILE: src/processing/ocrUtils.ts
  type OcrBlock (line 3) | interface OcrBlock {
  function parseImageOcrBlocks (line 21) | function parseImageOcrBlocks(image: Image): OcrBlock[] {
  function easyOcrResultLinesToList (line 63) | function easyOcrResultLinesToList(stdOutResult?: string): EasyOcrResultL...

FILE: src/processing/searchItems.test.ts
  function item (line 5) | function item(

FILE: src/processing/searchItems.ts
  function searchItems (line 26) | function searchItems(items: JsonTextItem[], options: SearchItemsOptions)...

FILE: src/processing/textUtils.ts
  function cleanOcrTableArtifacts (line 11) | function cleanOcrTableArtifacts(text: string): string {
  function strToSubscriptString (line 38) | function strToSubscriptString(str: string): string {
  function strToPostScript (line 82) | function strToPostScript(str: string): string {

FILE: src/vendor/pdfjs/pdf.mjs
  class FontLoader (line 34) | class FontLoader {
    method constructor (line 36) | constructor({
    method addNativeFontFace (line 46) | addNativeFontFace(nativeFontFace) {
    method removeNativeFontFace (line 50) | removeNativeFontFace(nativeFontFace) {
    method insertRule (line 54) | insertRule(rule) {
    method clear (line 62) | clear() {
    method loadSystemFont (line 73) | async loadSystemFont({
    method bind (line 101) | async bind(font) {
    method isFontLoadingAPISupported (line 136) | get isFontLoadingAPISupported() {
    method isSyncFontLoadingSupported (line 140) | get isSyncFontLoadingSupported() {
    method _queueLoadingCallback (line 149) | _queueLoadingCallback(callback) {
    method _loadTestFont (line 169) | get _loadTestFont() {
    method _prepareFontLoadEvent (line 173) | _prepareFontLoadEvent(font, request) {
  class FontFaceObject (line 238) | class FontFaceObject {
    method constructor (line 239) | constructor(translatedData, {
    method createNativeFontFace (line 254) | createNativeFontFace() {
    method createFontFaceRule (line 273) | createFontFaceRule() {
    method getPathGenerator (line 292) | getPathGenerator(objs, character) {
  class DrawLayer (line 338) | class DrawLayer {
    method constructor (line 343) | constructor({
    method setParent (line 348) | setParent(parent) {
    method _svgFactory (line 363) | static get _svgFactory() {
    method #setBox (line 366) | static #setBox(element, {
    method #createSVG (line 380) | #createSVG(box) {
    method #createClipPath (line 387) | #createClipPath(defs, pathId) {
    method highlight (line 399) | highlight(outlines, color, opacity, isPathUpdatable = false) {
    method highlightOutline (line 428) | highlightOutline(outlines) {
    method finalizeLine (line 474) | finalizeLine(id, line) {
    method updateLine (line 480) | updateLine(id, line) {
    method removeFreeHighlight (line 486) | removeFreeHighlight(id) {
    method updatePath (line 490) | updatePath(id, line) {
    method updateBox (line 493) | updateBox(id, box) {
    method show (line 496) | show(id, visible) {
    method rotate (line 499) | rotate(id, angle) {
    method changeColor (line 502) | changeColor(id, color) {
    method changeOpacity (line 505) | changeOpacity(id, opacity) {
    method addClass (line 508) | addClass(id, className) {
    method removeClass (line 511) | removeClass(id, className) {
    method remove (line 514) | remove(id) {
    method destroy (line 521) | destroy() {
  class XfaText (line 539) | class XfaText {
    method textContent (line 540) | static textContent(xfa) {
    method shouldBuildText (line 576) | static shouldBuildText(name) {
  class Outliner (line 593) | class Outliner {
    method constructor (line 597) | constructor(boxes, borderWidth = 0, innerMargin = 0, isLTR = true) {
    method getOutlines (line 642) | getOutlines() {
    method #getOutlines (line 656) | #getOutlines(outlineVerticalEdges) {
    method #binarySearch (line 703) | #binarySearch(y) {
    method #insert (line 721) | #insert([, y1, y2]) {
    method #remove (line 725) | #remove([, y1, y2]) {
    method #breakEdge (line 748) | #breakEdge(edge) {
  class Outline (line 781) | class Outline {
    method toSVGPath (line 782) | toSVGPath() {
    method box (line 785) | get box() {
    method serialize (line 788) | serialize(_bbox, _rotation) {
    method free (line 791) | get free() {
  class HighlightOutline (line 795) | class HighlightOutline extends Outline {
    method constructor (line 798) | constructor(outlines, box) {
    method toSVGPath (line 803) | toSVGPath() {
    method serialize (line 823) | serialize([blX, blY, trX, trY], _rotation) {
    method box (line 837) | get box() {
  class FreeOutliner (line 841) | class FreeOutliner {
    method constructor (line 858) | constructor({
    method free (line 872) | get free() {
    method isEmpty (line 875) | isEmpty() {
    method #getLastCoords (line 878) | #getLastCoords() {
    method add (line 884) | add({
    method toSVGPath (line 942) | toSVGPath() {
    method getOutlines (line 975) | getOutlines() {
  class FreeHighlightOutline (line 1023) | class FreeHighlightOutline extends Outline {
    method constructor (line 1031) | constructor(outline, points, box, scaleFactor, innerMargin, isLTR) {
    method toSVGPath (line 1055) | toSVGPath() {
    method serialize (line 1067) | serialize([blX, blY, trX, trY], rotation) {
    method #rescale (line 1095) | #rescale(src, tx, ty, sx, sy) {
    method #rescaleAndSwap (line 1103) | #rescaleAndSwap(src, tx, ty, sx, sy) {
    method #computeMinMax (line 1111) | #computeMinMax(isLTR) {
    method box (line 1162) | get box() {
    method getNewOutline (line 1165) | getNewOutline(thickness, innerMargin) {
  class Metadata (line 1202) | class Metadata {
    method constructor (line 1205) | constructor({
    method getRaw (line 1212) | getRaw() {
    method get (line 1215) | get(name) {
    method getAll (line 1218) | getAll() {
    method has (line 1221) | has(name) {
  function createFetchOptions (line 1239) | function createFetchOptions(headers, withCredentials, abortController) {
  function createHeaders (line 1249) | function createHeaders(httpHeaders) {
  function getArrayBuffer (line 1260) | function getArrayBuffer(val) {
  class PDFFetchStream (line 1270) | class PDFFetchStream {
    method constructor (line 1271) | constructor(source) {
    method _progressiveDataLength (line 1278) | get _progressiveDataLength() {
    method getFullReader (line 1281) | getFullReader() {
    method getRangeReader (line 1286) | getRangeReader(begin, end) {
    method cancelAllRequests (line 1294) | cancelAllRequests(reason) {
  class PDFFetchStreamReader (line 1301) | class PDFFetchStreamReader {
    method constructor (line 1302) | constructor(stream) {
    method headersReady (line 1346) | get headersReady() {
    method filename (line 1349) | get filename() {
    method contentLength (line 1352) | get contentLength() {
    method isRangeSupported (line 1355) | get isRangeSupported() {
    method isStreamingSupported (line 1358) | get isStreamingSupported() {
    method read (line 1361) | async read() {
    method cancel (line 1383) | cancel(reason) {
  class PDFFetchStreamRangeReader (line 1388) | class PDFFetchStreamRangeReader {
    method constructor (line 1389) | constructor(stream, begin, end) {
    method isStreamingSupported (line 1410) | get isStreamingSupported() {
    method read (line 1413) | async read() {
    method cancel (line 1434) | cancel(reason) {
  class GlobalWorkerOptions (line 1449) | class GlobalWorkerOptions {
    method workerPort (line 1452) | static get workerPort() {
    method workerPort (line 1455) | static set workerPort(val) {
    method workerSrc (line 1461) | static get workerSrc() {
    method workerSrc (line 1464) | static set workerSrc(val) {
  function wrapReason (line 1499) | function wrapReason(reason) {
  class MessageHandler (line 1518) | class MessageHandler {
    method constructor (line 1519) | constructor(sourceName, targetName, comObj) {
    method on (line 1590) | on(actionName, handler) {
    method send (line 1597) | send(actionName, data, transfers) {
    method sendWithPromise (line 1605) | sendWithPromise(actionName, data, transfers) {
    method sendWithStream (line 1622) | sendWithStream(actionName, data, queueingStrategy, transfers) {
    method #createStreamSink (line 1675) | #createStreamSink(data) {
    method #processStreamMessage (line 1758) | #processStreamMessage(data) {
    method #deleteStreamController (line 1875) | async #deleteStreamController(streamController, streamId) {
    method destroy (line 1879) | destroy() {
  class ColorPicker (line 1985) | class ColorPicker {
    method _keyboardManager (line 1998) | static get _keyboardManager() {
    method constructor (line 2001) | constructor({
    method renderButton (line 2017) | renderButton() {
    method renderMainDropdown (line 2032) | renderMainDropdown() {
    method #getDropdownRoot (line 2038) | #getDropdownRoot() {
    method #colorSelect (line 2064) | #colorSelect(color, event) {
    method _colorSelectFromKeyboard (line 2072) | _colorSelectFromKeyboard(event) {
    method _moveToNext (line 2083) | _moveToNext(event) {
    method _moveToPrevious (line 2094) | _moveToPrevious(event) {
    method _moveToBeginning (line 2106) | _moveToBeginning(event) {
    method _moveToEnd (line 2113) | _moveToEnd(event) {
    method #keyDown (line 2120) | #keyDown(event) {
    method #openDropdown (line 2123) | #openDropdown(event) {
    method #pointerDown (line 2137) | #pointerDown(event) {
    method hideDropdown (line 2143) | hideDropdown() {
    method #isDropdownVisible (line 2147) | get #isDropdownVisible() {
    method _hideDropdownFromKeyboard (line 2150) | _hideDropdownFromKeyboard() {
    method updateColor (line 2164) | updateColor(color) {
    method destroy (line 2176) | destroy() {
  class XfaLayer (line 2196) | class XfaLayer {
    method setupStorage (line 2197) | static setupStorage(html, id, element, storage, intent) {
    method setAttributes (line 2265) | static setAttributes({
    method render (line 2313) | static render(parameters) {
    method update (line 2398) | static update(parameters) {
  function setVerbosityLevel (line 2736) | function setVerbosityLevel(level) {
  function getVerbosityLevel (line 2741) | function getVerbosityLevel() {
  function info (line 2744) | function info(msg) {
  function warn (line 2749) | function warn(msg) {
  function unreachable (line 2754) | function unreachable(msg) {
  function assert (line 2757) | function assert(cond, msg) {
  function _isValidProtocol (line 2762) | function _isValidProtocol(url) {
  function createValidAbsoluteUrl (line 2774) | function createValidAbsoluteUrl(url, baseUrl = null, options = null) {
  function shadow (line 2799) | function shadow(obj, prop, value, nonSerializable = false) {
  function BaseException (line 2809) | function BaseException(message, name) {
  class PasswordException (line 2820) | class PasswordException extends BaseException {
    method constructor (line 2821) | constructor(msg, code) {
  class UnknownErrorException (line 2826) | class UnknownErrorException extends BaseException {
    method constructor (line 2827) | constructor(msg, details) {
  class InvalidPDFException (line 2832) | class InvalidPDFException extends BaseException {
    method constructor (line 2833) | constructor(msg) {
  class MissingPDFException (line 2837) | class MissingPDFException extends BaseException {
    method constructor (line 2838) | constructor(msg) {
  class UnexpectedResponseException (line 2842) | class UnexpectedResponseException extends BaseException {
    method constructor (line 2843) | constructor(msg, status) {
  class FormatError (line 2848) | class FormatError extends BaseException {
    method constructor (line 2849) | constructor(msg) {
  class AbortException (line 2853) | class AbortException extends BaseException {
    method constructor (line 2854) | constructor(msg) {
  function bytesToString (line 2858) | function bytesToString(bytes) {
  function stringToBytes (line 2875) | function stringToBytes(str) {
  function string32 (line 2886) | function string32(value) {
  function objectSize (line 2889) | function objectSize(obj) {
  function objectFromMap (line 2892) | function objectFromMap(map) {
  function isLittleEndian (line 2899) | function isLittleEndian() {
  function isEvalSupported (line 2905) | function isEvalSupported() {
  class FeatureTest (line 2913) | class FeatureTest {
    method isLittleEndian (line 2914) | static get isLittleEndian() {
    method isEvalSupported (line 2917) | static get isEvalSupported() {
    method isOffscreenCanvasSupported (line 2920) | static get isOffscreenCanvasSupported() {
    method platform (line 2923) | static get platform() {
    method isCSSRoundSupported (line 2933) | static get isCSSRoundSupported() {
  class Util (line 2938) | class Util {
    method makeHexColor (line 2939) | static makeHexColor(r, g, b) {
    method scaleMinMax (line 2942) | static scaleMinMax(transform, minMax) {
    method transform (line 2986) | static transform(m1, m2) {
    method applyTransform (line 2989) | static applyTransform(p, m) {
    method applyInverseTransform (line 2994) | static applyInverseTransform(p, m) {
    method getAxialAlignedBoundingBox (line 3000) | static getAxialAlignedBoundingBox(r, m) {
    method inverseTransform (line 3007) | static inverseTransform(m) {
    method getRotation (line 3011) | static getRotation(m) {
    method singularValueDecompose2dScale (line 3014) | static singularValueDecompose2dScale(m) {
    method normalizeRect (line 3026) | static normalizeRect(rect) {
    method intersect (line 3038) | static intersect(rect1, rect2) {
    method #getExtremumOnCurve (line 3051) | static #getExtremumOnCurve(x0, x1, x2, x3, y0, y1, y2, y3, t, minMax) {
    method #getExtremum (line 3065) | static #getExtremum(x0, x1, x2, x3, y0, y1, y2, y3, a, b, c, minMax) {
    method bezierBoundingBox (line 3081) | static bezierBoundingBox(x0, y0, x1, y1, x2, y2, x3, y3, minMax) {
  function stringToPDFString (line 3096) | function stringToPDFString(str) {
  function stringToUTF8String (line 3140) | function stringToUTF8String(str) {
  function utf8StringToString (line 3143) | function utf8StringToString(str) {
  function isArrayEqual (line 3146) | function isArrayEqual(arr1, arr2) {
  function getModificationDate (line 3157) | function getModificationDate(date = new Date()) {
  class PromiseCapability (line 3161) | class PromiseCapability {
    method constructor (line 3163) | constructor() {
    method settled (line 3175) | get settled() {
  function normalizeUnicode (line 3181) | function normalizeUnicode(str) {
  function getUuid (line 3188) | function getUuid() {
  class AltText (line 3224) | class AltText {
    method constructor (line 3233) | constructor(editor) {
    method initialize (line 3236) | static initialize(l10nPromise) {
    method render (line 3239) | async render() {
    method finish (line 3264) | finish() {
    method isEmpty (line 3273) | isEmpty() {
    method data (line 3276) | get data() {
    method data (line 3282) | set data({
    method toggle (line 3293) | toggle(enabled = false) {
    method destroy (line 3303) | destroy() {
    method #setState (line 3308) | async #setState() {
  class AnnotationEditor (line 3364) | class AnnotationEditor {
    method _resizerKeyboardManager (line 3393) | static get _resizerKeyboardManager() {
    method constructor (line 3415) | constructor(parameters) {
    method editorType (line 3449) | get editorType() {
    method _defaultLineColor (line 3452) | static get _defaultLineColor() {
    method deleteAnnotationElement (line 3455) | static deleteAnnotationElement(editor) {
    method initialize (line 3465) | static initialize(l10n, _uiManager, options) {
    method updateDefaultParams (line 3478) | static updateDefaultParams(_type, _value) {}
    method defaultPropertiesToUpdate (line 3479) | static get defaultPropertiesToUpdate() {
    method isHandlingMimeForPasting (line 3482) | static isHandlingMimeForPasting(mime) {
    method paste (line 3485) | static paste(item, parent) {
    method propertiesToUpdate (line 3488) | get propertiesToUpdate() {
    method _isDraggable (line 3491) | get _isDraggable() {
    method _isDraggable (line 3494) | set _isDraggable(value) {
    method isEnterHandled (line 3498) | get isEnterHandled() {
    method center (line 3501) | center() {
    method addCommands (line 3523) | addCommands(params) {
    method currentLayer (line 3526) | get currentLayer() {
    method setInBackground (line 3529) | setInBackground() {
    method setInForeground (line 3532) | setInForeground() {
    method setParent (line 3535) | setParent(parent) {
    method focusin (line 3544) | focusin(event) {
    method focusout (line 3554) | focusout(event) {
    method commitOrRemove (line 3570) | commitOrRemove() {
    method commit (line 3577) | commit() {
    method addToAnnotationStorage (line 3580) | addToAnnotationStorage() {
    method setAt (line 3583) | setAt(x, y, tx, ty) {
    method #translate (line 3590) | #translate([width, height], x, y) {
    method translate (line 3596) | translate(x, y) {
    method translateInPage (line 3599) | translateInPage(x, y) {
    method drag (line 3605) | drag(tx, ty) {
    method getBaseTranslation (line 3632) | getBaseTranslation() {
    method _mustFixPosition (line 3650) | get _mustFixPosition() {
    method fixAndSetPosition (line 3653) | fixAndSetPosition(rotation = this.rotation) {
    method #rotatePoint (line 3697) | static #rotatePoint(x, y, angle) {
    method screenToPageTranslation (line 3709) | screenToPageTranslation(x, y) {
    method pageTranslationToScreen (line 3712) | pageTranslationToScreen(x, y) {
    method #getRotationMatrix (line 3715) | #getRotationMatrix(rotation) {
    method parentScale (line 3733) | get parentScale() {
    method parentRotation (line 3736) | get parentRotation() {
    method parentDimensions (line 3739) | get parentDimensions() {
    method setDims (line 3748) | setDims(width, height) {
    method fixDims (line 3755) | fixDims() {
    method getInitialTranslation (line 3776) | getInitialTranslation() {
    method #createResizers (line 3779) | #createResizers() {
    method #resizerPointerdown (line 3797) | #resizerPointerdown(name, event) {
    method #addResizeToUndoStack (line 3836) | #addResizeToUndoStack(savedX, savedY, savedWidth, savedHeight) {
    method #resizerPointermove (line 3866) | #resizerPointermove(name, event) {
    method altTextFinish (line 3952) | altTextFinish() {
    method addEditToolbar (line 3955) | async addEditToolbar() {
    method removeEditToolbar (line 3966) | removeEditToolbar() {
    method getClientDimensions (line 3974) | getClientDimensions() {
    method addAltTextButton (line 3977) | async addAltTextButton() {
    method altTextData (line 3985) | get altTextData() {
    method altTextData (line 3988) | set altTextData(data) {
    method hasAltText (line 3994) | hasAltText() {
    method render (line 3997) | render() {
    method pointerdown (line 4019) | pointerdown(event) {
    method #selectOnPointerEvent (line 4034) | #selectOnPointerEvent(event) {
    method #setUpDragSession (line 4044) | #setUpDragSession(event) {
    method moveInDOM (line 4083) | moveInDOM() {
    method _setParentAndPosition (line 4092) | _setParentAndPosition(parent, x, y) {
    method getRect (line 4098) | getRect(tx, ty, rotation = this.rotation) {
    method getRectInCurrentCoords (line 4121) | getRectInCurrentCoords(rect, pageHeight) {
    method onceAdded (line 4138) | onceAdded() {}
    method isEmpty (line 4139) | isEmpty() {
    method enableEditMode (line 4142) | enableEditMode() {
    method disableEditMode (line 4145) | disableEditMode() {
    method isInEditMode (line 4148) | isInEditMode() {
    method shouldGetKeyboardEvents (line 4151) | shouldGetKeyboardEvents() {
    method needsToBeRebuilt (line 4154) | needsToBeRebuilt() {
    method rebuild (line 4157) | rebuild() {
    method rotate (line 4162) | rotate(_angle) {}
    method serialize (line 4163) | serialize(isForCopying = false, context = null) {
    method deserialize (line 4166) | static deserialize(data, parent, uiManager) {
    method remove (line 4181) | remove() {
    method isResizable (line 4205) | get isResizable() {
    method makeResizable (line 4208) | makeResizable() {
    method toolbarPosition (line 4215) | get toolbarPosition() {
    method keydown (line 4218) | keydown(event) {
    method #resizerKeydown (line 4277) | #resizerKeydown(event) {
    method #resizerBlur (line 4280) | #resizerBlur(event) {
    method #resizerFocus (line 4285) | #resizerFocus(name) {
    method #setResizerTabIndex (line 4288) | #setResizerTabIndex(value) {
    method _resizeWithKeyboard (line 4296) | _resizeWithKeyboard(x, y) {
    method #stopResizing (line 4305) | #stopResizing() {
    method _stopResizingWithKeyboard (line 4319) | _stopResizingWithKeyboard() {
    method select (line 4323) | select() {
    method unselect (line 4336) | unselect() {
    method updateParams (line 4346) | updateParams(type, value) {}
    method disableEditing (line 4347) | disableEditing() {}
    method enableEditing (line 4348) | enableEditing() {}
    method enterInEditMode (line 4349) | enterInEditMode() {}
    method getImageForAltText (line 4350) | getImageForAltText() {
    method contentDiv (line 4353) | get contentDiv() {
    method isEditing (line 4356) | get isEditing() {
    method isEditing (line 4359) | set isEditing(value) {
    method setAspectRatio (line 4371) | setAspectRatio(width, height) {
    method MIN_SIZE (line 4380) | static get MIN_SIZE() {
    method canCreateNewEmptyEditor (line 4383) | static canCreateNewEmptyEditor() {
    method telemetryInitialData (line 4386) | get telemetryInitialData() {
    method telemetryFinalData (line 4391) | get telemetryFinalData() {
    method _reportTelemetry (line 4394) | _reportTelemetry(data, mustWait = false) {
    method show (line 4423) | show(visible) {
  class FakeEditor (line 4428) | class FakeEditor extends AnnotationEditor {
    method constructor (line 4429) | constructor(params) {
    method serialize (line 4434) | serialize() {
  class EditorToolbar (line 4455) | class EditorToolbar {
    method constructor (line 4460) | constructor(editor) {
    method render (line 4463) | render() {
    method #pointerDown (line 4484) | static #pointerDown(e) {
    method #focusIn (line 4487) | #focusIn(e) {
    method #focusOut (line 4492) | #focusOut(e) {
    method #addListenersToElement (line 4497) | #addListenersToElement(element) {
    method hide (line 4506) | hide() {
    method show (line 4510) | show() {
    method #addDeleteButton (line 4513) | #addDeleteButton() {
    method #divider (line 4524) | get #divider() {
    method addAltTextButton (line 4529) | addAltTextButton(button) {
    method addColorPicker (line 4533) | addColorPicker(colorPicker) {
    method remove (line 4539) | remove() {
  class HighlightToolbar (line 4545) | class HighlightToolbar {
    method constructor (line 4549) | constructor(uiManager) {
    method #render (line 4552) | #render() {
    method #getLastPoint (line 4563) | #getLastPoint(boxes, isLTR) {
    method show (line 4587) | show(parent, boxes, isLTR) {
    method hide (line 4596) | hide() {
    method #addHighlightButton (line 4599) | #addHighlightButton() {
  class PixelsPerInch (line 4649) | class PixelsPerInch {
  class DOMFilterFactory (line 4654) | class DOMFilterFactory extends _base_factory_js__WEBPACK_IMPORTED_MODULE...
    method constructor (line 4661) | constructor({
    method #cache (line 4669) | get #cache() {
    method #hcmCache (line 4672) | get #hcmCache() {
    method #defs (line 4675) | get #defs() {
    method addFilter (line 4697) | addFilter(maps) {
    method addHCMFilter (line 4741) | addHCMFilter(fgColor, bgColor) {
    method addHighlightHCMFilter (line 4795) | addHighlightHCMFilter(filterName, fgColor, bgColor, newFgColor, newBgC...
    method destroy (line 4851) | destroy(keepHCM = false) {
    method #addGrayConversion (line 4865) | #addGrayConversion(filter) {
    method #createFilter (line 4871) | #createFilter(id) {
    method #appendFeFunc (line 4878) | #appendFeFunc(feComponentTransfer, func, table) {
    method #addTransferMapConversion (line 4884) | #addTransferMapConversion(rTable, gTable, bTable, filter) {
    method #getRGB (line 4891) | #getRGB(color) {
  class DOMCanvasFactory (line 4896) | class DOMCanvasFactory extends _base_factory_js__WEBPACK_IMPORTED_MODULE...
    method constructor (line 4897) | constructor({
    method _createCanvas (line 4903) | _createCanvas(width, height) {
  function fetchData (line 4910) | async function fetchData(url, type = "text") {
  class DOMCMapReaderFactory (line 4950) | class DOMCMapReaderFactory extends _base_factory_js__WEBPACK_IMPORTED_MO...
    method _fetchData (line 4951) | _fetchData(url, compressionType) {
  class DOMStandardFontDataFactory (line 4958) | class DOMStandardFontDataFactory extends _base_factory_js__WEBPACK_IMPOR...
    method _fetchData (line 4959) | _fetchData(url) {
  class DOMSVGFactory (line 4963) | class DOMSVGFactory extends _base_factory_js__WEBPACK_IMPORTED_MODULE_0_...
    method _createSVG (line 4964) | _createSVG(type) {
  class PageViewport (line 4968) | class PageViewport {
    method constructor (line 4969) | constructor({
    method rawDims (line 5038) | get rawDims() {
    method clone (line 5049) | clone({
    method convertToViewportPoint (line 5065) | convertToViewportPoint(x, y) {
    method convertToViewportRectangle (line 5068) | convertToViewportRectangle(rect) {
    method convertToPdfPoint (line 5073) | convertToPdfPoint(x, y) {
  class RenderingCancelledException (line 5077) | class RenderingCancelledException extends _shared_util_js__WEBPACK_IMPOR...
    method constructor (line 5078) | constructor(msg, extraDelay = 0) {
  function isDataScheme (line 5083) | function isDataScheme(url) {
  function isPdfFile (line 5091) | function isPdfFile(filename) {
  function getFilenameFromUrl (line 5094) | function getFilenameFromUrl(url, onlyStripPath = false) {
  function getPdfFilenameFromUrl (line 5100) | function getPdfFilenameFromUrl(url, defaultFilename = "document.pdf") {
  class StatTimer (line 5122) | class StatTimer {
    method time (line 5125) | time(name) {
    method timeEnd (line 5131) | timeEnd(name) {
    method toString (line 5142) | toString() {
  function isValidFetchUrl (line 5160) | function isValidFetchUrl(url, baseUrl) {
  function noContextMenu (line 5170) | function noContextMenu(e) {
  function deprecated (line 5173) | function deprecated(details) {
  class PDFDateString (line 5177) | class PDFDateString {
    method toDateObject (line 5178) | static toDateObject(input) {
  function getXfaPageViewport (line 5213) | function getXfaPageViewport(xfaPage, {
  function getRGB (line 5228) | function getRGB(color) {
  function getColorValues (line 5242) | function getColorValues(colors) {
  function getCurrentTransform (line 5253) | function getCurrentTransform(ctx) {
  function getCurrentTransformInverse (line 5264) | function getCurrentTransformInverse(ctx) {
  function setLayerDimensions (line 5275) | function setLayerDimensions(div, viewport, mustFlip = false, mustRotate ...
  function getArrayBuffer (line 5317) | function getArrayBuffer(xhr) {
  class NetworkManager (line 5324) | class NetworkManager {
    method constructor (line 5325) | constructor(url, args = {}) {
    method requestRange (line 5333) | requestRange(begin, end, listeners) {
    method requestFull (line 5343) | requestFull(listeners) {
    method request (line 5346) | request(args) {
    method onProgress (line 5382) | onProgress(xhrId, evt) {
    method onStateChange (line 5389) | onStateChange(xhrId, evt) {
    method getRequestXhr (line 5433) | getRequestXhr(xhrId) {
    method isPendingRequest (line 5436) | isPendingRequest(xhrId) {
    method abortRequest (line 5439) | abortRequest(xhrId) {
  class PDFNetworkStream (line 5445) | class PDFNetworkStream {
    method constructor (line 5446) | constructor(source) {
    method _onRangeRequestReaderClosed (line 5456) | _onRangeRequestReaderClosed(reader) {
    method getFullReader (line 5462) | getFullReader() {
    method getRangeReader (line 5467) | getRangeReader(begin, end) {
    method cancelAllRequests (line 5473) | cancelAllRequests(reason) {
  class PDFNetworkStreamFullRequestReader (line 5480) | class PDFNetworkStreamFullRequestReader {
    method constructor (line 5481) | constructor(manager, source) {
    method _onHeadersReceived (line 5507) | _onHeadersReceived() {
    method _onDone (line 5530) | _onDone(data) {
    method _onError (line 5554) | _onError(status) {
    method _onProgress (line 5563) | _onProgress(evt) {
    method filename (line 5569) | get filename() {
    method isRangeSupported (line 5572) | get isRangeSupported() {
    method isStreamingSupported (line 5575) | get isStreamingSupported() {
    method contentLength (line 5578) | get contentLength() {
    method headersReady (line 5581) | get headersReady() {
    method read (line 5584) | async read() {
    method cancel (line 5605) | cancel(reason) {
  class PDFNetworkStreamRangeRequestReader (line 5621) | class PDFNetworkStreamRangeRequestReader {
    method constructor (line 5622) | constructor(manager, begin, end) {
    method _close (line 5638) | _close() {
    method _onDone (line 5641) | _onDone(data) {
    method _onError (line 5662) | _onError(status) {
    method _onProgress (line 5670) | _onProgress(evt) {
    method isStreamingSupported (line 5677) | get isStreamingSupported() {
    method read (line 5680) | async read() {
    method cancel (line 5702) | cancel(reason) {
  function getFilenameFromContentDispositionHeader (line 5737) | function getFilenameFromContentDispositionHeader(contentDisposition) {
  function validateRangeRequestCapabilities (line 5872) | function validateRangeRequestCapabilities({
  function extractFilenameFromHeader (line 5903) | function extractFilenameFromHeader(getResponseHeader) {
  function createResponseStatusError (line 5918) | function createResponseStatusError(status, url) {
  function validateResponseStatus (line 5924) | function validateResponseStatus(status) {
  class NodeFilterFactory (line 5958) | class NodeFilterFactory extends _base_factory_js__WEBPACK_IMPORTED_MODUL...
  class NodeCanvasFactory (line 5959) | class NodeCanvasFactory extends _base_factory_js__WEBPACK_IMPORTED_MODUL...
    method _createCanvas (line 5960) | _createCanvas(width, height) {
  class NodeCMapReaderFactory (line 5964) | class NodeCMapReaderFactory extends _base_factory_js__WEBPACK_IMPORTED_M...
    method _fetchData (line 5965) | _fetchData(url, compressionType) {
  class NodeStandardFontDataFactory (line 5972) | class NodeStandardFontDataFactory extends _base_factory_js__WEBPACK_IMPO...
    method _fetchData (line 5973) | _fetchData(url) {
  class BaseFilterFactory (line 5995) | class BaseFilterFactory {
    method constructor (line 5996) | constructor() {
    method addFilter (line 6001) | addFilter(maps) {
    method addHCMFilter (line 6004) | addHCMFilter(fgColor, bgColor) {
    method addHighlightHCMFilter (line 6007) | addHighlightHCMFilter(filterName, fgColor, bgColor, newFgColor, newBgC...
    method destroy (line 6010) | destroy(keepHCM = false) {}
  class BaseCanvasFactory (line 6012) | class BaseCanvasFactory {
    method constructor (line 6013) | constructor() {
    method create (line 6018) | create(width, height) {
    method reset (line 6028) | reset(canvasAndContext, width, height) {
    method destroy (line 6038) | destroy(canvasAndContext) {
    method _createCanvas (line 6047) | _createCanvas(width, height) {
  class BaseCMapReaderFactory (line 6051) | class BaseCMapReaderFactory {
    method constructor (line 6052) | constructor({
    method fetch (line 6062) | async fetch({
    method _fetchData (line 6077) | _fetchData(url, compressionType) {
  class BaseStandardFontDataFactory (line 6081) | class BaseStandardFontDataFactory {
    method constructor (line 6082) | constructor({
    method fetch (line 6090) | async fetch({
    method _fetchData (line 6104) | _fetchData(url) {
  class BaseSVGFactory (line 6108) | class BaseSVGFactory {
    method constructor (line 6109) | constructor() {
    method create (line 6114) | create(width, height, skipDimensions = false) {
    method createElement (line 6128) | createElement(type) {
    method _createSVG (line 6134) | _createSVG(type) {
  class PDFDataTransportStream (line 6152) | class PDFDataTransportStream {
    method constructor (line 6153) | constructor(pdfDataRangeTransport, {
    method _onReceiveData (line 6199) | _onReceiveData({
    method _progressiveDataLength (line 6221) | get _progressiveDataLength() {
    method _onProgress (line 6224) | _onProgress(evt) {
    method _onProgressiveDone (line 6236) | _onProgressiveDone() {
    method _removeRangeReader (line 6240) | _removeRangeReader(reader) {
    method getFullReader (line 6246) | getFullReader() {
    method getRangeReader (line 6252) | getRangeReader(begin, end) {
    method cancelAllRequests (line 6261) | cancelAllRequests(reason) {
  class PDFDataTransportStreamReader (line 6269) | class PDFDataTransportStreamReader {
    method constructor (line 6270) | constructor(stream, queuedChunks, progressiveDone = false, contentDisp...
    method _enqueue (line 6284) | _enqueue(chunk) {
    method headersReady (line 6299) | get headersReady() {
    method filename (line 6302) | get filename() {
    method isRangeSupported (line 6305) | get isRangeSupported() {
    method isStreamingSupported (line 6308) | get isStreamingSupported() {
    method contentLength (line 6311) | get contentLength() {
    method read (line 6314) | async read() {
    method cancel (line 6332) | cancel(reason) {
    method progressiveDone (line 6342) | progressiveDone() {
  class PDFDataTransportStreamRangeReader (line 6349) | class PDFDataTransportStreamRangeReader {
    method constructor (line 6350) | constructor(stream, begin, end) {
    method _enqueue (line 6359) | _enqueue(chunk) {
    method isStreamingSupported (line 6382) | get isStreamingSupported() {
    method read (line 6385) | async read() {
    method cancel (line 6404) | cancel(reason) {
  class OptionalContentGroup (line 6431) | class OptionalContentGroup {
    method constructor (line 6436) | constructor(renderingIntent, {
    method visible (line 6447) | get visible() {
    method _setVisible (line 6465) | _setVisible(internal, visible, userSet = false) {
  class OptionalContentConfig (line 6473) | class OptionalContentConfig {
    method constructor (line 6478) | constructor(data, renderingIntent = _shared_util_js__WEBPACK_IMPORTED_...
    method #evaluateVisibilityExpression (line 6504) | #evaluateVisibilityExpression(array) {
    method isVisible (line 6540) | isVisible(group) {
    method setVisibility (line 6609) | setVisibility(id, visible = true) {
    method setOCGState (line 6618) | setOCGState({
    method hasInitialVisibility (line 6649) | get hasInitialVisibility() {
    method getOrder (line 6652) | getOrder() {
    method getGroups (line 6661) | getGroups() {
    method getGroup (line 6664) | getGroup(id) {
    method getHash (line 6667) | getHash() {
  class MurmurHash3_64 (line 6691) | class MurmurHash3_64 {
    method constructor (line 6692) | constructor(seed) {
    method update (line 6696) | update(input) {
    method hexdigest (line 6766) | hexdigest() {
  class FreeTextEditor (line 6805) | class FreeTextEditor extends editor_editor.AnnotationEditor {
    method _keyboardManager (line 6819) | static get _keyboardManager() {
    method constructor (line 6854) | constructor(params) {
    method initialize (line 6862) | static initialize(l10n, uiManager) {
    method updateDefaultParams (line 6869) | static updateDefaultParams(type, value) {
    method updateParams (line 6879) | updateParams(type, value) {
    method defaultPropertiesToUpdate (line 6889) | static get defaultPropertiesToUpdate() {
    method propertiesToUpdate (line 6892) | get propertiesToUpdate() {
    method #updateFontSize (line 6895) | #updateFontSize(fontSize) {
    method #updateColor (line 6913) | #updateColor(color) {
    method _translateEmpty (line 6928) | _translateEmpty(x, y) {
    method getInitialTranslation (line 6931) | getInitialTranslation() {
    method rebuild (line 6935) | rebuild() {
    method enableEditMode (line 6947) | enableEditMode() {
    method disableEditMode (line 6963) | disableEditMode() {
    method focusin (line 6983) | focusin(event) {
    method onceAdded (line 6992) | onceAdded() {
    method isEmpty (line 7004) | isEmpty() {
    method remove (line 7007) | remove() {
    method #extractText (line 7015) | #extractText() {
    method #setEditorDimensions (line 7025) | #setEditorDimensions() {
    method commit (line 7051) | commit() {
    method shouldGetKeyboardEvents (line 7083) | shouldGetKeyboardEvents() {
    method enterInEditMode (line 7086) | enterInEditMode() {
    method dblclick (line 7090) | dblclick(event) {
    method keydown (line 7093) | keydown(event) {
    method editorDivKeydown (line 7099) | editorDivKeydown(event) {
    method editorDivFocus (line 7102) | editorDivFocus(event) {
    method editorDivBlur (line 7105) | editorDivBlur(event) {
    method editorDivInput (line 7108) | editorDivInput(event) {
    method disableEditing (line 7111) | disableEditing() {
    method enableEditing (line 7115) | enableEditing() {
    method render (line 7119) | render() {
    method #setContent (line 7191) | #setContent() {
    method #serializeContent (line 7202) | #serializeContent() {
    method #deserializeContent (line 7205) | static #deserializeContent(content) {
    method contentDiv (line 7208) | get contentDiv() {
    method deserialize (line 7211) | static deserialize(data, parent, uiManager) {
    method serialize (line 7256) | serialize(isForCopying = false) {
    method #hasElementChanged (line 7289) | #hasElementChanged(serialized) {
    method #cheatInitialRect (line 7299) | #cheatInitialRect(delayed = false) {
  class HighlightEditor (line 7326) | class HighlightEditor extends editor_editor.AnnotationEditor {
    method _keyboardManager (line 7355) | static get _keyboardManager() {
    method constructor (line 7367) | constructor(params) {
    method telemetryInitialData (line 7393) | get telemetryInitialData() {
    method telemetryFinalData (line 7402) | get telemetryFinalData() {
    method computeTelemetryFinalData (line 7408) | static computeTelemetryFinalData(data) {
    method #createOutlines (line 7413) | #createOutlines() {
    method #createFreeOutlines (line 7429) | #createFreeOutlines({
    method initialize (line 7492) | static initialize(l10n, uiManager) {
    method updateDefaultParams (line 7496) | static updateDefaultParams(type, value) {
    method translateInPage (line 7506) | translateInPage(x, y) {}
    method toolbarPosition (line 7507) | get toolbarPosition() {
    method updateParams (line 7510) | updateParams(type, value) {
    method defaultPropertiesToUpdate (line 7520) | static get defaultPropertiesToUpdate() {
    method propertiesToUpdate (line 7523) | get propertiesToUpdate() {
    method #updateColor (line 7526) | #updateColor(color) {
    method #updateThickness (line 7547) | #updateThickness(thickness) {
    method addEditToolbar (line 7567) | async addEditToolbar() {
    method disableEditing (line 7580) | disableEditing() {
    method enableEditing (line 7584) | enableEditing() {
    method fixAndSetPosition (line 7588) | fixAndSetPosition() {
    method getBaseTranslation (line 7591) | getBaseTranslation() {
    method getRect (line 7594) | getRect(tx, ty) {
    method onceAdded (line 7597) | onceAdded() {
    method remove (line 7601) | remove() {
    method rebuild (line 7608) | rebuild() {
    method setParent (line 7621) | setParent(parent) {
    method #changeThickness (line 7635) | #changeThickness(thickness) {
    method #cleanDrawLayer (line 7646) | #cleanDrawLayer() {
    method #addToDrawLayer (line 7655) | #addToDrawLayer(parent = this.parent) {
    method #rotateBbox (line 7668) | static #rotateBbox({
    method rotate (line 7704) | rotate(angle) {
    method render (line 7720) | render() {
    method pointerover (line 7747) | pointerover() {
    method pointerleave (line 7750) | pointerleave() {
    method #keydown (line 7753) | #keydown(event) {
    method _moveCaret (line 7756) | _moveCaret(direction) {
    method #setCaret (line 7769) | #setCaret(start) {
    method select (line 7780) | select() {
    method unselect (line 7785) | unselect() {
    method _mustFixPosition (line 7792) | get _mustFixPosition() {
    method show (line 7795) | show(visible) {
    method #getRotation (line 7802) | #getRotation() {
    method #serializeBoxes (line 7805) | #serializeBoxes() {
    method #serializeOutlines (line 7829) | #serializeOutlines(rect) {
    method startHighlighting (line 7832) | static startHighlighting(parent, isLTR, {
    method #highlightMove (line 7876) | static #highlightMove(parent, event) {
    method #endHighlight (line 7881) | static #endHighlight(parent, event) {
    method deserialize (line 7896) | static deserialize(data, parent, uiManager) {
    method serialize (line 7920) | serialize(isForCopying = false) {
    method canCreateNewEmptyEditor (line 7939) | static canCreateNewEmptyEditor() {
  class InkEditor (line 7950) | class InkEditor extends editor_editor.AnnotationEditor {
    method constructor (line 7971) | constructor(params) {
    method initialize (line 7989) | static initialize(l10n, uiManager) {
    method updateDefaultParams (line 7992) | static updateDefaultParams(type, value) {
    method updateParams (line 8005) | updateParams(type, value) {
    method defaultPropertiesToUpdate (line 8018) | static get defaultPropertiesToUpdate() {
    method propertiesToUpdate (line 8021) | get propertiesToUpdate() {
    method #updateThickness (line 8024) | #updateThickness(thickness) {
    method #updateColor (line 8040) | #updateColor(color) {
    method #updateOpacity (line 8056) | #updateOpacity(opacity) {
    method rebuild (line 8073) | rebuild() {
    method remove (line 8091) | remove() {
    method setParent (line 8109) | setParent(parent) {
    method onScaleChanging (line 8117) | onScaleChanging() {
    method enableEditMode (line 8123) | enableEditMode() {
    method disableEditMode (line 8131) | disableEditMode() {
    method onceAdded (line 8140) | onceAdded() {
    method isEmpty (line 8143) | isEmpty() {
    method #getInitialBBox (line 8146) | #getInitialBBox() {
    method #setStroke (line 8162) | #setStroke() {
    method #startDrawing (line 8177) | #startDrawing(x, y) {
    method #draw (line 8202) | #draw(x, y) {
    method #endPath (line 8222) | #endPath() {
    method #stopDrawing (line 8229) | #stopDrawing(x, y) {
    method #drawPoints (line 8272) | #drawPoints() {
    method #makeBezierCurve (line 8296) | #makeBezierCurve(path2D, x0, y0, x1, y1, x2, y2) {
    method #generateBezierPoints (line 8303) | #generateBezierPoints() {
    method #redraw (line 8328) | #redraw() {
    method commit (line 8345) | commit() {
    method focusin (line 8363) | focusin(event) {
    method canvasPointerdown (line 8370) | canvasPointerdown(event) {
    method canvasPointermove (line 8383) | canvasPointermove(event) {
    method canvasPointerup (line 8387) | canvasPointerup(event) {
    method canvasPointerleave (line 8391) | canvasPointerleave(event) {
    method #endDrawing (line 8394) | #endDrawing(event) {
    method #createCanvas (line 8410) | #createCanvas() {
    method #createObserver (line 8418) | #createObserver() {
    method isResizable (line 8427) | get isResizable() {
    method render (line 8430) | render() {
    method #setCanvasDims (line 8461) | #setCanvasDims() {
    method setDimensions (line 8470) | setDimensions(width, height) {
    method #setScaleFactor (line 8491) | #setScaleFactor(width, height) {
    method #updateTransform (line 8497) | #updateTransform() {
    method #buildPath2D (line 8501) | static #buildPath2D(bezier) {
    method #toPDFCoordinates (line 8512) | static #toPDFCoordinates(points, rect, rotation) {
    method #fromPDFCoordinates (line 8546) | static #fromPDFCoordinates(points, rect, rotation) {
    method #serializePaths (line 8580) | #serializePaths(s, tx, ty, rect) {
    method #getBbox (line 8622) | #getBbox() {
    method #getPadding (line 8638) | #getPadding() {
    method #fitToContent (line 8641) | #fitToContent(firstTime = false) {
    method deserialize (line 8671) | static deserialize(data, parent, uiManager) {
    method serialize (line 8720) | serialize() {
  class StampEditor (line 8745) | class StampEditor extends editor_editor.AnnotationEditor {
    method constructor (line 8759) | constructor(params) {
    method initialize (line 8767) | static initialize(l10n, uiManager) {
    method supportedTypes (line 8770) | static get supportedTypes() {
    method supportedTypesStr (line 8774) | static get supportedTypesStr() {
    method isHandlingMimeForPasting (line 8777) | static isHandlingMimeForPasting(mime) {
    method paste (line 8780) | static paste(item, parent) {
    method #getBitmapFetched (line 8785) | #getBitmapFetched(data, fromId = false) {
    method #getBitmapDone (line 8800) | #getBitmapDone() {
    method #getBitmap (line 8807) | #getBitmap() {
    method remove (line 8848) | remove() {
    method rebuild (line 8863) | rebuild() {
    method onceAdded (line 8881) | onceAdded() {
    method isEmpty (line 8885) | isEmpty() {
    method isResizable (line 8888) | get isResizable() {
    method render (line 8891) | render() {
    method #createCanvas (line 8914) | #createCanvas() {
    method #setDimensions (line 8951) | #setDimensions(width, height) {
    method #scaleBitmap (line 8971) | #scaleBitmap(width, height) {
    method #drawBitmap (line 8995) | #drawBitmap(width, height) {
    method getImageForAltText (line 9035) | getImageForAltText() {
    method #serializeBitmap (line 9038) | #serializeBitmap(toUrl) {
    method #createObserver (line 9066) | #createObserver() {
    method deserialize (line 9075) | static deserialize(data, parent, uiManager) {
    method serialize (line 9101) | serialize(isForCopying = false, context = null) {
  class AnnotationEditorLayer (line 9160) | class AnnotationEditorLayer {
    method constructor (line 9176) | constructor({
    method isEmpty (line 9205) | get isEmpty() {
    method updateToolbar (line 9208) | updateToolbar(mode) {
    method updateMode (line 9211) | updateMode(mode = this.#uiManager.getMode()) {
    method hasTextLayer (line 9245) | hasTextLayer(textLayer) {
    method addInkEditorIfNeeded (line 9248) | addInkEditorIfNeeded(isCommitting) {
    method setEditingState (line 9266) | setEditingState(isEditing) {
    method addCommands (line 9269) | addCommands(params) {
    method togglePointerEvents (line 9272) | togglePointerEvents(enabled = false) {
    method toggleAnnotationLayerPointerEvents (line 9275) | toggleAnnotationLayerPointerEvents(enabled = false) {
    method enable (line 9278) | enable() {
    method disable (line 9307) | disable() {
    method getEditableAnnotation (line 9346) | getEditableAnnotation(id) {
    method setActiveEditor (line 9349) | setActiveEditor(editor) {
    method enableTextSelection (line 9356) | enableTextSelection() {
    method disableTextSelection (line 9362) | disableTextSelection() {
    method #textLayerPointerDown (line 9368) | #textLayerPointerDown(event) {
    method enableClick (line 9388) | enableClick() {
    method disableClick (line 9392) | disableClick() {
    method attach (line 9396) | attach(editor) {
    method detach (line 9405) | detach(editor) {
    method remove (line 9412) | remove(editor) {
    method changeParent (line 9421) | changeParent(editor) {
    method add (line 9438) | add(editor) {
    method moveEditorInDOM (line 9452) | moveEditorInDOM(editor) {
    method addOrRebuild (line 9477) | addOrRebuild(editor) {
    method addUndoableEditor (line 9485) | addUndoableEditor(editor) {
    method getNextId (line 9496) | getNextId() {
    method #currentEditorType (line 9499) | get #currentEditorType() {
    method #createNewEditor (line 9502) | #createNewEditor(params) {
    method canCreateNewEmptyEditor (line 9506) | canCreateNewEmptyEditor() {
    method pasteEditor (line 9509) | pasteEditor(mode, params) {
    method deserialize (line 9530) | deserialize(data) {
    method createAndAddNewEditor (line 9533) | createAndAddNewEditor(event, isCentered, data = {}) {
    method #getCenterPoint (line 9549) | #getCenterPoint() {
    method addNewEditor (line 9568) | addNewEditor() {
    method setSelected (line 9571) | setSelected(editor) {
    method toggleSelected (line 9574) | toggleSelected(editor) {
    method isSelected (line 9577) | isSelected(editor) {
    method unselect (line 9580) | unselect(editor) {
    method pointerup (line 9583) | pointerup(event) {
    method pointerdown (line 9607) | pointerdown(event) {
    method findNewParent (line 9628) | findNewParent(editor, x, y) {
    method destroy (line 9636) | destroy() {
    method #cleanup (line 9655) | #cleanup() {
    method render (line 9664) | render({
    method update (line 9674) | update({
    method pageDimensions (line 9691) | get pageDimensions() {
    method scale (line 9698) | get scale() {
  function parseUrl (line 9725) | function parseUrl(sourceUrl) {
  class PDFNodeStream (line 9738) | class PDFNodeStream {
    method constructor (line 9739) | constructor(source) {
    method _progressiveDataLength (line 9748) | get _progressiveDataLength() {
    method getFullReader (line 9751) | getFullReader() {
    method getRangeReader (line 9756) | getRangeReader(start, end) {
    method cancelAllRequests (line 9764) | cancelAllRequests(reason) {
  class BaseFullReader (line 9771) | class BaseFullReader {
    method constructor (line 9772) | constructor(stream) {
    method headersReady (line 9792) | get headersReady() {
    method filename (line 9795) | get filename() {
    method contentLength (line 9798) | get contentLength() {
    method isRangeSupported (line 9801) | get isRangeSupported() {
    method isStreamingSupported (line 9804) | get isStreamingSupported() {
    method read (line 9807) | async read() {
    method cancel (line 9834) | cancel(reason) {
    method _error (line 9841) | _error(reason) {
    method _setReadableStream (line 9845) | _setReadableStream(readableStream) {
  class BaseRangeReader (line 9866) | class BaseRangeReader {
    method constructor (line 9867) | constructor(stream) {
    method isStreamingSupported (line 9878) | get isStreamingSupported() {
    method read (line 9881) | async read() {
    method cancel (line 9907) | cancel(reason) {
    method _error (line 9914) | _error(reason) {
    method _setReadableStream (line 9918) | _setReadableStream(readableStream) {
  function createRequestOptions (line 9936) | function createRequestOptions(parsedUrl, headers) {
  class PDFNodeStreamFullReader (line 9947) | class PDFNodeStreamFullReader extends BaseFullReader {
    method constructor (line 9948) | constructor(stream) {
  class PDFNodeStreamRangeReader (line 9986) | class PDFNodeStreamRangeReader extends BaseRangeReader {
    method constructor (line 9987) | constructor(stream, start, end) {
  class PDFNodeStreamFsFullReader (line 10018) | class PDFNodeStreamFsFullReader extends BaseFullReader {
    method constructor (line 10019) | constructor(stream) {
  class PDFNodeStreamFsRangeReader (line 10038) | class PDFNodeStreamFsRangeReader extends BaseRangeReader {
    method constructor (line 10039) | constructor(stream, start, end) {
  class AnnotationStorage (line 10076) | class AnnotationStorage {
    method constructor (line 10079) | constructor() {
    method getValue (line 10084) | getValue(key, defaultValue) {
    method getRawValue (line 10091) | getRawValue(key) {
    method remove (line 10094) | remove(key) {
    method setValue (line 10108) | setValue(key, value) {
    method has (line 10129) | has(key) {
    method getAll (line 10132) | getAll() {
    method setAll (line 10135) | setAll(obj) {
    method size (line 10140) | get size() {
    method #setModified (line 10143) | #setModified() {
    method resetModified (line 10151) | resetModified() {
    method print (line 10159) | get print() {
    method serializable (line 10162) | get serializable() {
    method editorStats (line 10192) | get editorStats() {
  class PrintAnnotationStorage (line 10230) | class PrintAnnotationStorage extends AnnotationStorage {
    method constructor (line 10232) | constructor(parent) {
    method print (line 10248) | get print() {
    method serializable (line 10251) | get serializable() {
  function getCtx (line 10277) | function getCtx() {
  function cleanupTextLayer (line 10288) | function cleanupTextLayer() {
  function getAscent (line 10292) | function getAscent(fontFamily) {
  function appendText (line 10342) | function appendText(task, geom, styles) {
  function layout (line 10408) | function layout(params) {
  function render (line 10448) | function render(task) {
  class TextLayerRenderTask (line 10466) | class TextLayerRenderTask {
    method constructor (line 10467) | constructor({
    method promise (line 10507) | get promise() {
    method cancel (line 10510) | cancel() {
    method _processItems (line 10518) | _processItems(items, styleCache) {
    method _layoutText (line 10538) | _layoutText(textDiv) {
    method _render (line 10551) | _render() {
  function renderTextLayer (line 10587) | function renderTextLayer(params) {
  function updateTextLayer (line 10592) | function updateTextLayer({
  function bindEvents (line 10644) | function bindEvents(obj, element, names) {
  function opacityToHex (line 10649) | function opacityToHex(opacity) {
  class IdManager (line 10652) | class IdManager {
    method constructor (line 10654) | constructor() {}
    method id (line 10655) | get id() {
  class ImageManager (line 10659) | class ImageManager {
    method _isSVGFittingCanvas (line 10663) | static get _isSVGFittingCanvas() {
    method #get (line 10675) | async #get(key, rawData) {
    method getFromFile (line 10731) | async getFromFile(file) {
    method getFromUrl (line 10740) | async getFromUrl(url) {
    method getFromId (line 10743) | async getFromId(id) {
    method getSvgUrl (line 10758) | getSvgUrl(id) {
    method deleteId (line 10765) | deleteId(id) {
    method isValidId (line 10777) | isValidId(id) {
  class CommandManager (line 10781) | class CommandManager {
    method constructor (line 10786) | constructor(maxSize = 128) {
    method add (line 10789) | add({
    method undo (line 10836) | undo() {
    method redo (line 10850) | redo() {
    method hasSomethingToUndo (line 10863) | hasSomethingToUndo() {
    method hasSomethingToRedo (line 10866) | hasSomethingToRedo() {
    method destroy (line 10869) | destroy() {
  class KeyboardManager (line 10873) | class KeyboardManager {
    method constructor (line 10874) | constructor(callbacks) {
    method #serialize (line 10900) | #serialize(event) {
    method exec (line 10918) | exec(self, event) {
  class ColorManager (line 10944) | class ColorManager {
    method _colors (line 10946) | get _colors() {
    method convert (line 10951) | convert(color) {
    method getHexCode (line 10963) | getHexCode(name) {
  class AnnotationEditorUIManager (line 10971) | class AnnotationEditorUIManager {
    method _keyboardManager (line 11026) | static get _keyboardManager() {
    method constructor (line 11084) | constructor(container, viewer, altTextManager, eventBus, pdfDocument, ...
    method destroy (line 11107) | destroy() {
    method mlGuess (line 11136) | async mlGuess(data) {
    method hasMLManager (line 11139) | get hasMLManager() {
    method hcmFilter (line 11142) | get hcmFilter() {
    method direction (line 11145) | get direction() {
    method highlightColors (line 11148) | get highlightColors() {
    method highlightColorNames (line 11151) | get highlightColorNames() {
    method setMainHighlightColorPicker (line 11154) | setMainHighlightColorPicker(colorPicker) {
    method editAltText (line 11157) | editAltText(editor) {
    method onPageChanging (line 11160) | onPageChanging({
    method focusMainContainer (line 11165) | focusMainContainer() {
    method findParent (line 11168) | findParent(x, y) {
    method disableUserSelect (line 11182) | disableUserSelect(value = false) {
    method addShouldRescale (line 11185) | addShouldRescale(editor) {
    method removeShouldRescale (line 11188) | removeShouldRescale(editor) {
    method onScaleChanging (line 11191) | onScaleChanging({
    method onRotationChanging (line 11200) | onRotationChanging({
    method #getAnchorElementForSelection (line 11206) | #getAnchorElementForSelection({
    method highlightSelection (line 11211) | highlightSelection(methodOfCreation = "") {
    method #displayHighlightToolbar (line 11255) | #displayHighlightToolbar() {
    method addToAnnotationStorage (line 11269) | addToAnnotationStorage(editor) {
    method #selectionChange (line 11274) | #selectionChange() {
    method #onSelectEnd (line 11331) | #onSelectEnd(methodOfCreation = "") {
    method #addSelectionListener (line 11338) | #addSelectionListener() {
    method #removeSelectionListener (line 11341) | #removeSelectionListener() {
    method #addFocusManager (line 11344) | #addFocusManager() {
    method #removeFocusManager (line 11348) | #removeFocusManager() {
    method blur (line 11352) | blur() {
    method focus (line 11372) | focus() {
    method #addKeyboardManager (line 11385) | #addKeyboardManager() {
    method #removeKeyboardManager (line 11389) | #removeKeyboardManager() {
    method #addCopyPasteListeners (line 11393) | #addCopyPasteListeners() {
    method #removeCopyPasteListeners (line 11398) | #removeCopyPasteListeners() {
    method addEditListeners (line 11403) | addEditListeners() {
    method removeEditListeners (line 11407) | removeEditListeners() {
    method copy (line 11411) | copy(event) {
    method cut (line 11429) | cut(event) {
    method paste (line 11433) | paste(event) {
    method keydown (line 11490) | keydown(event) {
    method keyup (line 11498) | keyup(event) {
    method onEditingAction (line 11507) | onEditingAction({
    method #dispatchUpdateStates (line 11522) | #dispatchUpdateStates(details) {
    method #dispatchUpdateUI (line 11534) | #dispatchUpdateUI(details) {
    method setEditingState (line 11540) | setEditingState(isEditing) {
    method registerEditorTypes (line 11560) | registerEditorTypes(types) {
    method getId (line 11569) | getId() {
    method currentLayer (line 11572) | get currentLayer() {
    method getLayer (line 11575) | getLayer(pageIndex) {
    method currentPageIndex (line 11578) | get currentPageIndex() {
    method addLayer (line 11581) | addLayer(layer) {
    method removeLayer (line 11589) | removeLayer(layer) {
    method updateMode (line 11592) | updateMode(mode, editId = null, isFromKeyboard = false) {
    method addNewEditorFromKeyboard (line 11623) | addNewEditorFromKeyboard() {
    method updateToolbar (line 11628) | updateToolbar(mode) {
    method updateParams (line 11637) | updateParams(type, value) {
    method showAllEditors (line 11670) | showAllEditors(type, visible, updateButton = false) {
    method enableWaiting (line 11681) | enableWaiting(mustWait = false) {
    method #enableAll (line 11695) | #enableAll() {
    method #disableAll (line 11703) | #disableAll() {
    method getEditors (line 11712) | getEditors(pageIndex) {
    method getEditor (line 11721) | getEditor(id) {
    method addEditor (line 11724) | addEditor(editor) {
    method removeEditor (line 11727) | removeEditor(editor) {
    method addDeletedAnnotationElement (line 11743) | addDeletedAnnotationElement(editor) {
    method isDeletedAnnotationElement (line 11747) | isDeletedAnnotationElement(annotationElementId) {
    method removeDeletedAnnotationElement (line 11750) | removeDeletedAnnotationElement(editor) {
    method #addEditorToLayer (line 11754) | #addEditorToLayer(editor) {
    method setActiveEditor (line 11762) | setActiveEditor(editor) {
    method #lastSelectedEditor (line 11771) | get #lastSelectedEditor() {
    method updateUI (line 11776) | updateUI(editor) {
    method toggleSelected (line 11781) | toggleSelected(editor) {
    method setSelected (line 11797) | setSelected(editor) {
    method isSelected (line 11811) | isSelected(editor) {
    method firstSelectedEditor (line 11814) | get firstSelectedEditor() {
    method unselect (line 11817) | unselect(editor) {
    method hasSelection (line 11824) | get hasSelection() {
    method isEnterHandled (line 11827) | get isEnterHandled() {
    method undo (line 11830) | undo() {
    method redo (line 11838) | redo() {
    method addCommands (line 11846) | addCommands(params) {
    method #isEmpty (line 11854) | #isEmpty() {
    method delete (line 11865) | delete() {
    method commitOrRemove (line 11887) | commitOrRemove() {
    method hasSomethingToControl (line 11890) | hasSomethingToControl() {
    method #selectEditors (line 11893) | #selectEditors(editors) {
    method selectAll (line 11909) | selectAll() {
    method unselectAll (line 11915) | unselectAll() {
    method translateSelectedEditors (line 11933) | translateSelectedEditors(x, y, noCommit = false) {
    method setUpDragSession (line 11973) | setUpDragSession() {
    method endDragSession (line 11990) | endDragSession() {
    method dragSelectedEditors (line 12046) | dragSelectedEditors(tx, ty) {
    method rebuild (line 12054) | rebuild(editor) {
    method isEditorHandlingKeyboard (line 12069) | get isEditorHandlingKeyboard() {
    method isActive (line 12072) | isActive(editor) {
    method getActive (line 12075) | getActive() {
    method getMode (line 12078) | getMode() {
    method imageManager (line 12081) | get imageManager() {
    method getSelectionBoxes (line 12084) | getSelectionBoxes(textLayer) {
  function getDocument (line 12213) | function getDocument(src) {
  function _fetchDocument (line 12371) | async function _fetchDocument(worker, source) {
  function getUrlProp (line 12381) | function getUrlProp(val) {
  function getDataProp (line 12394) | function getDataProp(val) {
  class PDFDocumentLoadingTask (line 12409) | class PDFDocumentLoadingTask {
    method constructor (line 12411) | constructor() {
    method promise (line 12420) | get promise() {
    method destroy (line 12423) | async destroy() {
  class PDFDataRangeTransport (line 12443) | class PDFDataRangeTransport {
    method constructor (line 12444) | constructor(length, initialData, progressiveDone = false, contentDispo...
    method addRangeListener (line 12455) | addRangeListener(listener) {
    method addProgressListener (line 12458) | addProgressListener(listener) {
    method addProgressiveReadListener (line 12461) | addProgressiveReadListener(listener) {
    method addProgressiveDoneListener (line 12464) | addProgressiveDoneListener(listener) {
    method onDataRange (line 12467) | onDataRange(begin, chunk) {
    method onDataProgress (line 12472) | onDataProgress(loaded, total) {
    method onDataProgressiveRead (line 12479) | onDataProgressiveRead(chunk) {
    method onDataProgressiveDone (line 12486) | onDataProgressiveDone() {
    method transportReady (line 12493) | transportReady() {
    method requestDataRange (line 12496) | requestDataRange(begin, end) {
    method abort (line 12499) | abort() {}
  class PDFDocumentProxy (line 12501) | class PDFDocumentProxy {
    method constructor (line 12502) | constructor(pdfInfo, transport) {
    method annotationStorage (line 12506) | get annotationStorage() {
    method filterFactory (line 12509) | get filterFactory() {
    method numPages (line 12512) | get numPages() {
    method fingerprints (line 12515) | get fingerprints() {
    method isPureXfa (line 12518) | get isPureXfa() {
    method allXfaHtml (line 12521) | get allXfaHtml() {
    method getPage (line 12524) | getPage(pageNumber) {
    method getPageIndex (line 12527) | getPageIndex(ref) {
    method getDestinations (line 12530) | getDestinations() {
    method getDestination (line 12533) | getDestination(id) {
    method getPageLabels (line 12536) | getPageLabels() {
    method getPageLayout (line 12539) | getPageLayout() {
    method getPageMode (line 12542) | getPageMode() {
    method getViewerPreferences (line 12545) | getViewerPreferences() {
    method getOpenAction (line 12548) | getOpenAction() {
    method getAttachments (line 12551) | getAttachments() {
    method getJSActions (line 12554) | getJSActions() {
    method getOutline (line 12557) | getOutline() {
    method getOptionalContentConfig (line 12560) | getOptionalContentConfig({
    method getPermissions (line 12568) | getPermissions() {
    method getMetadata (line 12571) | getMetadata() {
    method getMarkInfo (line 12574) | getMarkInfo() {
    method getData (line 12577) | getData() {
    method saveDocument (line 12580) | saveDocument() {
    method getDownloadInfo (line 12583) | getDownloadInfo() {
    method cleanup (line 12586) | cleanup(keepLoadedFonts = false) {
    method destroy (line 12589) | destroy() {
    method loadingParams (line 12592) | get loadingParams() {
    method loadingTask (line 12595) | get loadingTask() {
    method getFieldObjects (line 12598) | getFieldObjects() {
    method hasJSActions (line 12601) | hasJSActions() {
    method getCalculationOrderIds (line 12604) | getCalculationOrderIds() {
  class PDFPageProxy (line 12608) | class PDFPageProxy {
    method constructor (line 12611) | constructor(pageIndex, pageInfo, transport, pdfBug = false) {
    method pageNumber (line 12623) | get pageNumber() {
    method rotate (line 12626) | get rotate() {
    method ref (line 12629) | get ref() {
    method userUnit (line 12632) | get userUnit() {
    method view (line 12635) | get view() {
    method getViewport (line 12638) | getViewport({
    method getAnnotations (line 12654) | getAnnotations({
    method getJSActions (line 12662) | getJSActions() {
    method filterFactory (line 12665) | get filterFactory() {
    method isPureXfa (line 12668) | get isPureXfa() {
    method getXfa (line 12671) | async getXfa() {
    method render (line 12674) | render({
    method getOperatorList (line 12772) | getOperatorList({
    method streamTextContent (line 12810) | streamTextContent({
    method getTextContent (line 12826) | getTextContent(params = {}) {
    method getStructTree (line 12856) | getStructTree() {
    method _destroy (line 12859) | _destroy() {
    method cleanup (line 12881) | cleanup(resetStats = false) {
    method #tryCleanup (line 12889) | #tryCleanup(delayed = false) {
    method #abortDelayedCleanup (line 12914) | #abortDelayedCleanup() {
    method _startRenderPage (line 12920) | _startRenderPage(transparency, cacheKey) {
    method _renderPageChunk (line 12928) | _renderPageChunk(operatorListChunk, intentState) {
    method _pumpOperatorList (line 12942) | _pumpOperatorList({
    method _abortOperatorList (line 13001) | _abortOperatorList({
    method stats (line 13046) | get stats() {
  class LoopbackPort (line 13050) | class LoopbackPort {
    method postMessage (line 13053) | postMessage(obj, transfer) {
    method addEventListener (line 13072) | addEventListener(name, listener) {
    method removeEventListener (line 13075) | removeEventListener(name, listener) {
    method terminate (line 13078) | terminate() {
  class PDFWorker (line 13111) | class PDFWorker {
    method constructor (line 13113) | constructor({
    method promise (line 13135) | get promise() {
    method port (line 13138) | get port() {
    method messageHandler (line 13141) | get messageHandler() {
    method _initializeFromPort (line 13144) | _initializeFromPort(port) {
    method _initialize (line 13153) | _initialize() {
    method _setupFakeWorker (line 13226) | _setupFakeWorker() {
    method destroy (line 13251) | destroy() {
    method fromPort (line 13264) | static fromPort(params) {
    method workerSrc (line 13277) | static get workerSrc() {
    method #mainThreadWorkerMessageHandler (line 13283) | static get #mainThreadWorkerMessageHandler() {
    method _setupFakeWorkerGlobal (line 13290) | static get _setupFakeWorkerGlobal() {
  class WorkerTransport (line 13301) | class WorkerTransport {
    method constructor (line 13306) | constructor(messageHandler, loadingTask, networkStream, params, factor...
    method #cacheSimpleMethod (line 13327) | #cacheSimpleMethod(name, data = null) {
    method annotationStorage (line 13336) | get annotationStorage() {
    method getRenderingIntent (line 13339) | getRenderingIntent(intent, annotationMode = _shared_util_js__WEBPACK_I...
    method destroy (line 13380) | destroy() {
    method setupMessageHandler (line 13413) | setupMessageHandler() {
    method getData (line 13688) | getData() {
    method saveDocument (line 13691) | saveDocument() {
    method getPage (line 13708) | getPage(pageNumber) {
    method getPageIndex (line 13730) | getPageIndex(ref) {
    method getAnnotations (line 13739) | getAnnotations(pageIndex, intent) {
    method getFieldObjects (line 13745) | getFieldObjects() {
    method hasJSActions (line 13748) | hasJSActions() {
    method getCalculationOrderIds (line 13751) | getCalculationOrderIds() {
    method getDestinations (line 13754) | getDestinations() {
    method getDestination (line 13757) | getDestination(id) {
    method getPageLabels (line 13765) | getPageLabels() {
    method getPageLayout (line 13768) | getPageLayout() {
    method getPageMode (line 13771) | getPageMode() {
    method getViewerPreferences (line 13774) | getViewerPreferences() {
    method getOpenAction (line 13777) | getOpenAction() {
    method getAttachments (line 13780) | getAttachments() {
    method getDocJSActions (line 13783) | getDocJSActions() {
    method getPageJSActions (line 13786) | getPageJSActions(pageIndex) {
    method getStructTree (line 13791) | getStructTree(pageIndex) {
    method getOutline (line 13796) | getOutline() {
    method getOptionalContentConfig (line 13799) | getOptionalContentConfig(renderingIntent) {
    method getPermissions (line 13802) | getPermissions() {
    method getMetadata (line 13805) | getMetadata() {
    method getMarkInfo (line 13820) | getMarkInfo() {
    method startCleanup (line 13823) | async startCleanup(keepLoadedFonts = false) {
    method loadingParams (line 13842) | get loadingParams() {
  class PDFObjects (line 13853) | class PDFObjects {
    method #ensureObj (line 13855) | #ensureObj(objId) {
    method get (line 13861) | get(objId, callback = null) {
    method getAll (line 13873) | getAll() {
    method has (line 13876) | has(objId) {
    method resolve (line 13880) | resolve(objId, data = null) {
    method clear (line 13885) | clear() {
  method [Symbol.iterator] (line 13894) | *[Symbol.iterator]() {
  class RenderTask (line 13907) | class RenderTask {
    method constructor (line 13909) | constructor(internalRenderTask) {
    method promise (line 13913) | get promise() {
    method cancel (line 13916) | cancel(extraDelay = 0) {
    method separateAnnots (line 13919) | get separateAnnots() {
  class InternalRenderTask (line 13932) | class InternalRenderTask {
    method constructor (line 13934) | constructor({
    method completed (line 13973) | get completed() {
    method initializeGraphics (line 13976) | initializeGraphics({
    method cancel (line 14013) | cancel(error = null, extraDelay = 0) {
    method operatorListChanged (line 14020) | operatorListChanged() {
    method _continue (line 14031) | _continue() {
    method _scheduleNext (line 14042) | _scheduleNext() {
    method _next (line 14051) | async _next() {
  function applyBoundingBox (line 14095) | function applyBoundingBox(ctx, bbox) {
  class BaseShadingPattern (line 14105) | class BaseShadingPattern {
    method constructor (line 14106) | constructor() {
    method getPattern (line 14111) | getPattern() {
  class RadialAxialShadingPattern (line 14115) | class RadialAxialShadingPattern extends BaseShadingPattern {
    method constructor (line 14116) | constructor(IR) {
    method _createGradient (line 14127) | _createGradient(ctx) {
    method getPattern (line 14139) | getPattern(ctx, owner, inverse, pathType) {
  function drawTriangle (line 14169) | function drawTriangle(data, context, p1, p2, p3, c1, c2, c3) {
  function drawFigure (line 14271) | function drawFigure(data, figure, context) {
  class MeshShadingPattern (line 14297) | class MeshShadingPattern extends BaseShadingPattern {
    method constructor (line 14298) | constructor(IR) {
    method _createMeshCanvas (line 14308) | _createMeshCanvas(combinedScale, backgroundColor, cachedCanvases) {
    method getPattern (line 14355) | getPattern(ctx, owner, inverse, pathType) {
  class DummyShadingPattern (line 14379) | class DummyShadingPattern extends BaseShadingPattern {
    method getPattern (line 14380) | getPattern() {
  function getShadingPattern (line 14384) | function getShadingPattern(IR) {
  class TilingPattern (line 14399) | class TilingPattern {
    method constructor (line 14401) | constructor(IR, color, ctx, canvasGraphicsFactory, baseTransform) {
    method createPatternCanvas (line 14414) | createPatternCanvas(owner) {
    method getSizeAndScale (line 14465) | getSizeAndScale(step, realOutputSize, scale) {
    method clipBbox (line 14479) | clipBbox(graphics, x0, y0, x1, y1) {
    method setFillAndStrokeStyleToContext (line 14487) | setFillAndStrokeStyleToContext(graphics, paintType, color) {
    method getPattern (line 14509) | getPattern(ctx, owner, inverse, pathType) {
  function convertToRGBA (line 14529) | function convertToRGBA(params) {
  function convertBlackAndWhiteToRGBA (line 14538) | function convertBlackAndWhiteToRGBA({
  function convertRGBToRGBA (line 14579) | function convertRGBToRGBA({
  function grayToRGBA (line 14622) | function grayToRGBA(src, dest) {
  function mirrorContextOperations (line 14646) | function mirrorContextOperations(ctx, destCtx) {
  class CachedCanvases (line 14744) | class CachedCanvases {
    method constructor (line 14745) | constructor(canvasFactory) {
    method getCanvas (line 14749) | getCanvas(id, width, height) {
    method delete (line 14760) | delete(id) {
    method clear (line 14763) | clear() {
  function drawImageAtIntegerCoords (line 14771) | function drawImageAtIntegerCoords(ctx, srcImg, srcX, srcY, srcW, srcH, d...
  function compileType3Glyph (line 14806) | function compileType3Glyph(imgData) {
  class CanvasExtraState (line 14936) | class CanvasExtraState {
    method constructor (line 14937) | constructor(width, height) {
    method clone (line 14964) | clone() {
    method setCurrentPoint (line 14969) | setCurrentPoint(x, y) {
    method updatePathMinMax (line 14973) | updatePathMinMax(transform, x, y) {
    method updateRectMinMax (line 14980) | updateRectMinMax(transform, rect) {
    method updateScalingPathMinMax (line 14990) | updateScalingPathMinMax(transform, minMax) {
    method updateCurvePathMinMax (line 14997) | updateCurvePathMinMax(transform, x0, y0, x1, y1, x2, y2, x3, y3, minMa...
    method getPathBoundingBox (line 15004) | getPathBoundingBox(pathType = PathType.FILL, transform = null) {
    method updateClipFromPath (line 15020) | updateClipFromPath() {
    method isEmptyClip (line 15024) | isEmptyClip() {
    method startNewPathAndClipBox (line 15027) | startNewPathAndClipBox(box) {
    method getClippedPathBoundingBox (line 15034) | getClippedPathBoundingBox(pathType = PathType.FILL, transform = null) {
  function putBinaryImageData (line 15038) | function putBinaryImageData(ctx, imgData) {
  function putBinaryImageMask (line 15131) | function putBinaryImageMask(ctx, imgData) {
  function copyCtxState (line 15160) | function copyCtxState(sourceCtx, destCtx) {
  function resetCtxToDefault (line 15172) | function resetCtxToDefault(ctx) {
  function composeSMaskBackdrop (line 15195) | function composeSMaskBackdrop(bytes, r0, g0, b0) {
  function composeSMaskAlpha (line 15211) | function composeSMaskAlpha(maskData, layerData, transferMap) {
  function composeSMaskLuminosity (line 15219) | function composeSMaskLuminosity(maskData, layerData, transferMap) {
  function genericComposeSMask (line 15226) | function genericComposeSMask(maskCtx, layerCtx, width, height, subtype, ...
  function composeSMask (line 15245) | function composeSMask(ctx, smask, layerCtx, layerBox) {
  function getImageSmoothingEnabled (line 15261) | function getImageSmoothingEnabled(transform, interpolate) {
  class CanvasGraphics (line 15277) | class CanvasGraphics {
    method constructor (line 15278) | constructor(canvasCtx, commonObjs, objs, canvasFactory, filterFactory, {
    method getObject (line 15316) | getObject(data, fallback = null) {
    method beginDrawing (line 15322) | beginDrawing({
    method executeOperatorList (line 15353) | executeOperatorList(operatorList, executionStartIdx, continueCallback,...
    method #restoreInitialState (line 15397) | #restoreInitialState() {
    method endDrawing (line 15411) | endDrawing() {
    method #drawFilter (line 15426) | #drawFilter() {
    method _scaleImage (line 15437) | _scaleImage(img, inverseTransform) {
    method _createMaskCanvas (line 15472) | _createMaskCanvas(img) {
    method setLineWidth (line 15540) | setLineWidth(width) {
    method setLineCap (line 15547) | setLineCap(style) {
    method setLineJoin (line 15550) | setLineJoin(style) {
    method setMiterLimit (line 15553) | setMiterLimit(limit) {
    method setDash (line 15556) | setDash(dashArray, dashPhase) {
    method setRenderingIntent (line 15563) | setRenderingIntent(intent) {}
    method setFlatness (line 15564) | setFlatness(flatness) {}
    method setGState (line 15565) | setGState(states) {
    method inSMaskMode (line 15613) | get inSMaskMode() {
    method checkSMaskState (line 15616) | checkSMaskState() {
    method beginSMaskMode (line 15624) | beginSMaskMode() {
    method endSMaskMode (line 15640) | endSMaskMode() {
    method compose (line 15649) | compose(dirtyBox) {
    method save (line 15669) | save() {
    method restore (line 15680) | restore() {
    method transform (line 15698) | transform(a, b, c, d, e, f) {
    method constructPath (line 15703) | constructPath(ops, args, minMax) {
    method closePath (line 15787) | closePath() {
    method stroke (line 15790) | stroke(consumePath = true) {
    method closeStroke (line 15809) | closeStroke() {
    method fill (line 15813) | fill(consumePath = true) {
    method eoFill (line 15839) | eoFill() {
    method fillStroke (line 15843) | fillStroke() {
    method eoFillStroke (line 15848) | eoFillStroke() {
    method closeFillStroke (line 15852) | closeFillStroke() {
    method closeEOFillStroke (line 15856) | closeEOFillStroke() {
    method endPath (line 15861) | endPath() {
    method clip (line 15864) | clip() {
    method eoClip (line 15867) | eoClip() {
    method beginText (line 15870) | beginText() {
    method endText (line 15876) | endText() {
    method setCharSpacing (line 15895) | setCharSpacing(spacing) {
    method setWordSpacing (line 15898) | setWordSpacing(spacing) {
    method setHScale (line 15901) | setHScale(scale) {
    method setLeading (line 15904) | setLeading(leading) {
    method setFont (line 15907) | setFont(fontRefName, size) {
    method setTextRenderingMode (line 15946) | setTextRenderingMode(mode) {
    method setTextRise (line 15949) | setTextRise(rise) {
    method moveText (line 15952) | moveText(x, y) {
    method setLeadingMoveText (line 15956) | setLeadingMoveText(x, y) {
    method setTextMatrix (line 15960) | setTextMatrix(a, b, c, d, e, f) {
    method nextLine (line 15966) | nextLine() {
    method paintChar (line 15969) | paintChar(character, x, y, patternTransform) {
    method isFontSubpixelAAEnabled (line 16016) | get isFontSubpixelAAEnabled() {
    method showText (line 16032) | showText(glyphs) {
    method showType3Text (line 16162) | showType3Text(glyphs) {
    method setCharWidth (line 16215) | setCharWidth(xWidth, yWidth) {}
    method setCharWidthAndBounds (line 16216) | setCharWidthAndBounds(xWidth, yWidth, llx, lly, urx, ury) {
    method getColorN_Pattern (line 16221) | getColorN_Pattern(IR) {
    method setStrokeColorN (line 16238) | setStrokeColorN() {
    method setFillColorN (line 16241) | setFillColorN() {
    method setStrokeRGBColor (line 16245) | setStrokeRGBColor(r, g, b) {
    method setFillRGBColor (line 16250) | setFillRGBColor(r, g, b) {
    method _getPattern (line 16256) | _getPattern(objId, matrix = null) {
    method shadingFill (line 16269) | shadingFill(objId) {
    method beginInlineImage (line 16291) | beginInlineImage() {
    method beginImageData (line 16294) | beginImageData() {
    method paintFormXObjectBegin (line 16297) | paintFormXObjectBegin(matrix, bbox) {
    method paintFormXObjectEnd (line 16316) | paintFormXObjectEnd() {
    method beginGroup (line 16323) | beginGroup(group) {
    method endGroup (line 16398) | endGroup(group) {
    method beginAnnotation (line 16422) | beginAnnotation(id, rect, transform, matrix, hasOwnCanvas) {
    method endAnnotation (line 16469) | endAnnotation() {
    method paintImageMaskXObject (line 16478) | paintImageMaskXObject(img) {
    method paintImageMaskXObjectRepeat (line 16504) | paintImageMaskXObjectRepeat(img, scaleX, skewX = 0, skewY = 0, scaleY,...
    method paintImageMaskXObjectGroup (line 16523) | paintImageMaskXObjectGroup(images) {
    method paintImageXObject (line 16554) | paintImageXObject(objId) {
    method paintImageXObjectRepeat (line 16565) | paintImageXObjectRepeat(objId, scaleX, scaleY, positions) {
    method applyTransferMapsToCanvas (line 16588) | applyTransferMapsToCanvas(ctx) {
    method applyTransferMapsToBitmap (line 16596) | applyTransferMapsToBitmap(imgData) {
    method paintInlineImageXObject (line 16612) | paintInlineImageXObject(imgData) {
    method paintInlineImageXObjectGroup (line 16646) | paintInlineImageXObjectGroup(imgData, map) {
    method paintSolidColorImageMask (line 16671) | paintSolidColorImageMask() {
    method markPoint (line 16678) | markPoint(tag) {}
    method markPointProps (line 16679) | markPointProps(tag, properties) {}
    method beginMarkedContent (line 16680) | beginMarkedContent(tag) {
    method beginMarkedContentProps (line 16685) | beginMarkedContentProps(tag, properties) {
    method endMarkedContent (line 16697) | endMarkedContent() {
    method beginCompat (line 16701) | beginCompat() {}
    method endCompat (line 16702) | endCompat() {}
    method consumePath (line 16703) | consumePath(clipBox) {
    method getSinglePixelWidth (line 16725) | getSinglePixelWidth() {
    method getScaleForStroking (line 16739) | getScaleForStroking() {
    method rescaleAndStroke (line 16788) | rescaleAndStroke(saveRestore) {
    method isContentVisible (line 16816) | isContentVisible() {
  function makeColorComp (line 16853) | function makeColorComp(n) {
  function scaleAndClamp (line 16856) | function scaleAndClamp(x) {
  class ColorConverters (line 16859) | class ColorConverters {
    method CMYK_G (line 16860) | static CMYK_G([c, y, m, k]) {
    method G_CMYK (line 16863) | static G_CMYK([g]) {
    method G_RGB (line 16866) | static G_RGB([g]) {
    method G_rgb (line 16869) | static G_rgb([g]) {
    method G_HTML (line 16873) | static G_HTML([g]) {
    method RGB_G (line 16877) | static RGB_G([r, g, b]) {
    method RGB_rgb (line 16880) | static RGB_rgb(color) {
    method RGB_HTML (line 16883) | static RGB_HTML(color) {
    method T_HTML (line 16886) | static T_HTML() {
    method T_rgb (line 16889) | static T_rgb() {
    method CMYK_RGB (line 16892) | static CMYK_RGB([c, y, m, k]) {
    method CMYK_rgb (line 16895) | static CMYK_rgb([c, y, m, k]) {
    method CMYK_HTML (line 16898) | static CMYK_HTML(components) {
    method RGB_CMYK (line 16902) | static RGB_CMYK([r, g, b]) {
  function getRectDims (line 16922) | function getRectDims(rect) {
  class AnnotationElementFactory (line 16928) | class AnnotationElementFactory {
    method create (line 16929) | static create(parameters) {
  class AnnotationElement (line 16989) | class AnnotationElement {
    method constructor (line 16991) | constructor(parameters, {
    method _hasPopupData (line 17016) | static _hasPopupData({
    method hasPopupData (line 17023) | get hasPopupData() {
    method _createContainer (line 17026) | _createContainer(ignoreBorder) {
    method setRotation (line 17117) | setRotation(angle, container = this.container) {
    method _commonActions (line 17141) | get _commonActions() {
    method _dispatchEventFromSandbox (line 17219) | _dispatchEventFromSandbox(actions, jsEvent) {
    method _setDefaultPropertiesFromJS (line 17226) | _setDefaultPropertiesFromJS(element) {
    method _createQuadrilaterals (line 17249) | _createQuadrilaterals() {
    method _createPopup (line 17327) | _createPopup() {
    method render (line 17350) | render() {
    method _getElementsByName (line 17353) | _getElementsByName(name, skipId = null) {
    method show (line 17403) | show() {
    method hide (line 17409) | hide() {
    method getElementsToTriggerPopup (line 17415) | getElementsToTriggerPopup() {
    method addHighlightArea (line 17418) | addHighlightArea() {
    method _isEditable (line 17428) | get _isEditable() {
    method _editOnDoubleClick (line 17431) | _editOnDoubleClick() {
  class LinkAnnotationElement (line 17450) | class LinkAnnotationElement extends AnnotationElement {
    method constructor (line 17451) | constructor(parameters, options = null) {
    method render (line 17459) | render() {
    method #setInternalLink (line 17501) | #setInternalLink() {
    method _bindLink (line 17504) | _bindLink(link, destination) {
    method _bindNamedAction (line 17516) | _bindNamedAction(link, action) {
    method #bindAttachment (line 17524) | #bindAttachment(link, attachment, dest = null) {
    method #bindSetOCGState (line 17532) | #bindSetOCGState(link, action) {
    method _bindJSAction (line 17540) | _bindJSAction(link, data) {
    method _bindResetFormAction (line 17564) | _bindResetFormAction(link, resetForm) {
  class TextAnnotationElement (line 17667) | class TextAnnotationElement extends AnnotationElement {
    method constructor (line 17668) | constructor(parameters) {
    method render (line 17673) | render() {
  class WidgetAnnotationElement (line 17688) | class WidgetAnnotationElement extends AnnotationElement {
    method render (line 17689) | render() {
    method showElementAndHideCanvas (line 17695) | showElementAndHideCanvas(element) {
    method _getKeyModifier (line 17703) | _getKeyModifier(event) {
    method _setEventListener (line 17706) | _setEventListener(element, elementData, baseName, eventName, valueGett...
    method _setEventListeners (line 17747) | _setEventListeners(element, elementData, names, getter) {
    method _setBackgroundColor (line 17764) | _setBackgroundColor(element) {
    method _setTextStyle (line 17768) | _setTextStyle(element) {
    method _setRequired (line 17793) | _setRequired(element, isRequired) {
  class TextWidgetAnnotationElement (line 17802) | class TextWidgetAnnotationElement extends WidgetAnnotationElement {
    method constructor (line 17803) | constructor(parameters) {
    method setPropertyOnSiblings (line 17809) | setPropertyOnSiblings(base, key, value, keyInStorage) {
    method render (line 17820) | render() {
  class SignatureWidgetAnnotationElement (line 18120) | class SignatureWidgetAnnotationElement extends WidgetAnnotationElement {
    method constructor (line 18121) | constructor(parameters) {
  class CheckboxWidgetAnnotationElement (line 18127) | class CheckboxWidgetAnnotationElement extends WidgetAnnotationElement {
    method constructor (line 18128) | constructor(parameters) {
    method render (line 18133) | render() {
  class RadioButtonWidgetAnnotationElement (line 18201) | class RadioButtonWidgetAnnotationElement extends WidgetAnnotationElement {
    method constructor (line 18202) | constructor(parameters) {
    method render (line 18207) | render() {
  class PushButtonWidgetAnnotationElement (line 18284) | class PushButtonWidgetAnnotationElement extends LinkAnnotationElement {
    method constructor (line 18285) | constructor(parameters) {
    method render (line 18290) | render() {
  class ChoiceWidgetAnnotationElement (line 18306) | class ChoiceWidgetAnnotationElement extends WidgetAnnotationElement {
    method constructor (line 18307) | constructor(parameters) {
    method render (line 18312) | render() {
  class PopupAnnotationElement (line 18526) | class PopupAnnotationElement extends AnnotationElement {
    method constructor (line 18527) | constructor(parameters) {
    method render (line 18537) | render() {
  class PopupElement (line 18562) | class PopupElement {
    method constructor (line 18580) | constructor({
    method render (line 18618) | render() {
    method _formatContents (line 18699) | _formatContents({
    method #keyDown (line 18716) | #keyDown(event) {
    method #toggle (line 18724) | #toggle() {
    method #show (line 18736) | #show() {
    method #hide (line 18747) | #hide() {
    method forceHide (line 18755) | forceHide() {
    method maybeShow (line 18762) | maybeShow() {
    method isVisible (line 18769) | get isVisible() {
  class FreeTextAnnotationElement (line 18773) | class FreeTextAnnotationElement extends AnnotationElement {
    method constructor (line 18774) | constructor(parameters) {
    method render (line 18783) | render() {
    method _isEditable (line 18802) | get _isEditable() {
  class LineAnnotationElement (line 18806) | class LineAnnotationElement extends AnnotationElement {
    method constructor (line 18808) | constructor(parameters) {
    method render (line 18814) | render() {
    method getElementsToTriggerPopup (line 18837) | getElementsToTriggerPopup() {
    method addHighlightArea (line 18840) | addHighlightArea() {
  class SquareAnnotationElement (line 18844) | class SquareAnnotationElement extends AnnotationElement {
    method constructor (line 18846) | constructor(parameters) {
    method render (line 18852) | render() {
    method getElementsToTriggerPopup (line 18876) | getElementsToTriggerPopup() {
    method addHighlightArea (line 18879) | addHighlightArea() {
  class CircleAnnotationElement (line 18883) | class CircleAnnotationElement extends AnnotationElement {
    method constructor (line 18885) | constructor(parameters) {
    method render (line 18891) | render() {
    method getElementsToTriggerPopup (line 18915) | getElementsToTriggerPopup() {
    method addHighlightArea (line 18918) | addHighlightArea() {
  class PolylineAnnotationElement (line 18922) | class PolylineAnnotationElement extends AnnotationElement {
    method constructor (line 18924) | constructor(parameters) {
    method render (line 18932) | render() {
    method getElementsToTriggerPopup (line 18959) | getElementsToTriggerPopup() {
    method addHighlightArea (line 18962) | addHighlightArea() {
  class PolygonAnnotationElement (line 18966) | class PolygonAnnotationElement extends PolylineAnnotationElement {
    method constructor (line 18967) | constructor(parameters) {
  class CaretAnnotationElement (line 18973) | class CaretAnnotationElement extends AnnotationElement {
    method constructor (line 18974) | constructor(parameters) {
    method render (line 18980) | render() {
  class InkAnnotationElement (line 18988) | class InkAnnotationElement extends AnnotationElement {
    method constructor (line 18990) | constructor(parameters) {
    method render (line 18999) | render() {
    method getElementsToTriggerPopup (line 19029) | getElementsToTriggerPopup() {
    method addHighlightArea (line 19032) | addHighlightArea() {
  class HighlightAnnotationElement (line 19036) | class HighlightAnnotationElement extends AnnotationElement {
    method constructor (line 19037) | constructor(parameters) {
    method render (line 19044) | render() {
  class UnderlineAnnotationElement (line 19052) | class UnderlineAnnotationElement extends AnnotationElement {
    method constructor (line 19053) | constructor(parameters) {
    method render (line 19060) | render() {
  class SquigglyAnnotationElement (line 19068) | class SquigglyAnnotationElement extends AnnotationElement {
    method constructor (line 19069) | constructor(parameters) {
    method render (line 19076) | render() {
  class StrikeOutAnnotationElement (line 19084) | class StrikeOutAnnotationElement extends AnnotationElement {
    method constructor (line 19085) | constructor(parameters) {
    method render (line 19092) | render() {
  class StampAnnotationElement (line 19100) | class StampAnnotationElement extends AnnotationElement {
    method constructor (line 19101) | constructor(parameters) {
    method render (line 19107) | render() {
  class FileAttachmentAnnotationElement (line 19115) | class FileAttachmentAnnotationElement extends AnnotationElement {
    method constructor (line 19117) | constructor(parameters) {
    method render (line 19133) | render() {
    method getElementsToTriggerPopup (line 19167) | getElementsToTriggerPopup() {
    method addHighlightArea (line 19170) | addHighlightArea() {
    method #download (line 19173) | #download() {
  class AnnotationLayer (line 19177) | class AnnotationLayer {
    method constructor (line 19181) | constructor({
    method #appendElement (line 19195) | #appendElement(element, id) {
    method render (line 19201) | async render(params) {
    method update (line 19267) | update({
    method #setAnnotationCanvasMap (line 19278) | #setAnnotationCanvasMap() {
    method getEditableAnnotations (line 19301) | getEditableAnnotations() {
    method getEditableAnnotation (line 19304) | getEditableAnnotation(id) {
  function __webpack_require__ (line 19318) | function __webpack_require__(moduleId) {

FILE: src/vendor/pdfjs/pdf.sandbox.mjs
  function D (line 25) | function D(){var a=x.buffer;d.HEAP8=z=new Int8Array(a);d.HEAP16=new Int1...
  function ba (line 25) | function ba(){var a=d.preRun.shift();E.unshift(a);}
  function w (line 25) | function w(a){d.onAbort?.(a);a="Aborted("+a+")";u(a);y=!0;a=new WebAssem...
  function ca (line 25) | function ca(){var a=L;return Promise.resolve().then(()=>{if(a==L&&v)var ...
  function da (line 25) | function da(a,b){return ca().then(c=>WebAssembly.instantiate(c,a)).then(...
  function ea (line 25) | function ea(a,b){return da(a,b);}
  function U (line 25) | function U(){}
  function e (line 25) | function e(t){return(t=t.toTimeString().match(/\(([A-Za-z ]+)\)$/))?t[1]...
  function a (line 25) | function a(c){X=c.exports;x=X.m;D();F.unshift(X.n);H--;d.monitorRunDepen...
  function na (line 25) | function na(){function a(){if(!Z&&(Z=!0,d.calledRun=!0,!y)){N(F);k(d);if...
  class SandboxSupportBase (line 27) | class SandboxSupportBase {
    method constructor (line 28) | constructor(win) {
    method destroy (line 33) | destroy() {
    method exportValueToSandbox (line 40) | exportValueToSandbox(val) {
    method importValueFromSandbox (line 43) | importValueFromSandbox(val) {
    method createErrorForSandbox (line 46) | createErrorForSandbox(errorMessage) {
    method callSandboxFunction (line 49) | callSandboxFunction(name, args) {
    method createSandboxExternals (line 57) | createSandboxExternals() {
  class SandboxSupport (line 144) | class SandboxSupport extends SandboxSupportBase {
    method exportValueToSandbox (line 145) | exportValueToSandbox(val) {
    method importValueFromSandbox (line 148) | importValueFromSandbox(val) {
    method createErrorForSandbox (line 151) | createErrorForSandbox(errorMessage) {
  class Sandbox (line 155) | class Sandbox {
    method constructor (line 156) | constructor(win, module) {
    method create (line 162) | create(data) {
    method dispatchEvent (line 186) | dispatchEvent(event) {
    method dumpMemoryUse (line 189) | dumpMemoryUse() {
    method nukeSandbox (line 192) | nukeSandbox() {
    method evalForTesting (line 200) | evalForTesting(code, key) {
  function QuickJSSandbox (line 204) | function QuickJSSandbox() {

FILE: src/vendor/pdfjs/pdf.worker.mjs
  constant IDENTITY_MATRIX (line 27) | const IDENTITY_MATRIX = [1, 0, 0, 1, 0, 0];
  constant FONT_IDENTITY_MATRIX (line 28) | const FONT_IDENTITY_MATRIX = [0.001, 0, 0, 0.001, 0, 0];
  constant MAX_IMAGE_SIZE_TO_CACHE (line 29) | const MAX_IMAGE_SIZE_TO_CACHE = 10e6;
  constant LINE_FACTOR (line 30) | const LINE_FACTOR = 1.35;
  constant LINE_DESCENT_FACTOR (line 31) | const LINE_DESCENT_FACTOR = 0.35;
  constant BASELINE_FACTOR (line 32) | const BASELINE_FACTOR = LINE_DESCENT_FACTOR / LINE_FACTOR;
  constant OPS (line 208) | const OPS = {
  function setVerbosityLevel (line 303) | function setVerbosityLevel(level) {
  function getVerbosityLevel (line 308) | function getVerbosityLevel() {
  function info (line 311) | function info(msg) {
  function warn (line 316) | function warn(msg) {
  function unreachable (line 321) | function unreachable(msg) {
  function assert (line 324) | function assert(cond, msg) {
  function _isValidProtocol (line 329) | function _isValidProtocol(url) {
  function createValidAbsoluteUrl (line 341) | function createValidAbsoluteUrl(url, baseUrl = null, options = null) {
  function shadow (line 366) | function shadow(obj, prop, value, nonSerializable = false) {
  function BaseException (line 376) | function BaseException(message, name) {
  class PasswordException (line 387) | class PasswordException extends BaseException {
    method constructor (line 388) | constructor(msg, code) {
  class UnknownErrorException (line 393) | class UnknownErrorException extends BaseException {
    method constructor (line 394) | constructor(msg, details) {
  class InvalidPDFException (line 399) | class InvalidPDFException extends BaseException {
    method constructor (line 400) | constructor(msg) {
  class MissingPDFException (line 404) | class MissingPDFException extends BaseException {
    method constructor (line 405) | constructor(msg) {
  class UnexpectedResponseException (line 409) | class UnexpectedResponseException extends BaseException {
    method constructor (line 410) | constructor(msg, status) {
  class FormatError (line 415) | class FormatError extends BaseException {
    method constructor (line 416) | constructor(msg) {
  class AbortException (line 420) | class AbortException extends BaseException {
    method constructor (line 421) | constructor(msg) {
  function bytesToString (line 425) | function bytesToString(bytes) {
  function stringToBytes (line 442) | function stringToBytes(str) {
  function string32 (line 453) | function string32(value) {
  function objectSize (line 456) | function objectSize(obj) {
  function objectFromMap (line 459) | function objectFromMap(map) {
  function isLittleEndian (line 466) | function isLittleEndian() {
  function isEvalSupported (line 472) | function isEvalSupported() {
  class FeatureTest (line 480) | class FeatureTest {
    method isLittleEndian (line 481) | static get isLittleEndian() {
    method isEvalSupported (line 484) | static get isEvalSupported() {
    method isOffscreenCanvasSupported (line 487) | static get isOffscreenCanvasSupported() {
    method platform (line 490) | static get platform() {
    method isCSSRoundSupported (line 500) | static get isCSSRoundSupported() {
  class Util (line 505) | class Util {
    method makeHexColor (line 506) | static makeHexColor(r, g, b) {
    method scaleMinMax (line 509) | static scaleMinMax(transform, minMax) {
    method transform (line 553) | static transform(m1, m2) {
    method applyTransform (line 556) | static applyTransform(p, m) {
    method applyInverseTransform (line 561) | static applyInverseTransform(p, m) {
    method getAxialAlignedBoundingBox (line 567) | static getAxialAlignedBoundingBox(r, m) {
    method inverseTransform (line 574) | static inverseTransform(m) {
    method getRotation (line 578) | static getRotation(m) {
    method singularValueDecompose2dScale (line 581) | static singularValueDecompose2dScale(m) {
    method normalizeRect (line 593) | static normalizeRect(rect) {
    method intersect (line 605) | static intersect(rect1, rect2) {
    method #getExtremumOnCurve (line 618) | static #getExtremumOnCurve(x0, x1, x2, x3, y0, y1, y2, y3, t, minMax) {
    method #getExtremum (line 632) | static #getExtremum(x0, x1, x2, x3, y0, y1, y2, y3, a, b, c, minMax) {
    method bezierBoundingBox (line 648) | static bezierBoundingBox(x0, y0, x1, y1, x2, y2, x3, y3, minMax) {
  function stringToPDFString (line 663) | function stringToPDFString(str) {
  function stringToUTF8String (line 707) | function stringToUTF8String(str) {
  function utf8StringToString (line 710) | function utf8StringToString(str) {
  function isArrayEqual (line 713) | function isArrayEqual(arr1, arr2) {
  function getModificationDate (line 724) | function getModificationDate(date = new Date()) {
  class PromiseCapability (line 728) | class PromiseCapability {
    method constructor (line 730) | constructor() {
    method settled (line 742) | get settled() {
  function normalizeUnicode (line 748) | function normalizeUnicode(str) {
  function getUuid (line 755) | function getUuid() {
  constant CIRCULAR_REF (line 773) | const CIRCULAR_REF = Symbol("CIRCULAR_REF");
  constant EOF (line 774) | const EOF = Symbol("EOF");
  function clearPrimitiveCaches (line 778) | function clearPrimitiveCaches() {
  class Name (line 783) | class Name {
    method constructor (line 784) | constructor(name) {
    method get (line 787) | static get(name) {
  class Cmd (line 791) | class Cmd {
    method constructor (line 792) | constructor(cmd) {
    method get (line 795) | static get(cmd) {
  class Dict (line 802) | class Dict {
    method constructor (line 803) | constructor(xref = null) {
    method assignXref (line 810) | assignXref(newXref) {
    method size (line 813) | get size() {
    method get (line 816) | get(key1, key2, key3) {
    method getAsync (line 829) | async getAsync(key1, key2, key3) {
    method getArray (line 842) | getArray(key1, key2, key3) {
    method getRaw (line 863) | getRaw(key) {
    method getKeys (line 866) | getKeys() {
    method getRawValues (line 869) | getRawValues() {
    method set (line 872) | set(key, value) {
    method has (line 875) | has(key) {
    method forEach (line 878) | forEach(callback) {
    method empty (line 883) | static get empty() {
    method merge (line 890) | static merge({
    method clone (line 932) | clone() {
  class Ref (line 940) | class Ref {
    method constructor (line 941) | constructor(num, gen) {
    method toString (line 945) | toString() {
    method fromString (line 951) | static fromString(str) {
    method get (line 962) | static get(num, gen) {
  class RefSet (line 967) | class RefSet {
    method constructor (line 968) | constructor(parent = null) {
    method has (line 971) | has(ref) {
    method put (line 974) | put(ref) {
    method remove (line 977) | remove(ref) {
    method clear (line 983) | clear() {
  method [Symbol.iterator] (line 980) | [Symbol.iterator]() {
  class RefSetCache (line 987) | class RefSetCache {
    method constructor (line 988) | constructor() {
    method size (line 991) | get size() {
    method get (line 994) | get(ref) {
    method has (line 997) | has(ref) {
    method put (line 1000) | put(ref, obj) {
    method putAlias (line 1003) | putAlias(ref, aliasRef) {
    method clear (line 1009) | clear() {
  method [Symbol.iterator] (line 1006) | [Symbol.iterator]() {
  function isName (line 1013) | function isName(v, name) {
  function isCmd (line 1016) | function isCmd(v, cmd) {
  function isDict (line 1019) | function isDict(v, type) {
  function isRefsEqual (line 1022) | function isRefsEqual(v1, v2) {
  class BaseStream (line 1028) | class BaseStream {
    method constructor (line 1029) | constructor() {
    method length (line 1034) | get length() {
    method isEmpty (line 1037) | get isEmpty() {
    method isDataLoaded (line 1040) | get isDataLoaded() {
    method getByte (line 1043) | getByte() {
    method getBytes (line 1046) | getBytes(length) {
    method peekByte (line 1049) | peekByte() {
    method peekBytes (line 1056) | peekBytes(length) {
    method getUint16 (line 1061) | getUint16() {
    method getInt32 (line 1069) | getInt32() {
    method getByteRange (line 1076) | getByteRange(begin, end) {
    method getString (line 1079) | getString(length) {
    method skip (line 1082) | skip(n) {
    method reset (line 1085) | reset() {
    method moveStart (line 1088) | moveStart() {
    method makeSubStream (line 1091) | makeSubStream(start, length, dict = null) {
    method getBaseStreams (line 1094) | getBaseStreams() {
  constant PDF_VERSION_REGEXP (line 1103) | const PDF_VERSION_REGEXP = /^[1-9]\.\d$/;
  function getLookupTableFactory (line 1104) | function getLookupTableFactory(initializer) {
  class MissingDataException (line 1115) | class MissingDataException extends BaseException {
    method constructor (line 1116) | constructor(begin, end) {
  class ParserEOFException (line 1122) | class ParserEOFException extends BaseException {
    method constructor (line 1123) | constructor(msg) {
  class XRefEntryException (line 1127) | class XRefEntryException extends BaseException {
    method constructor (line 1128) | constructor(msg) {
  class XRefParseException (line 1132) | class XRefParseException extends BaseException {
    method constructor (line 1133) | constructor(msg) {
  function arrayBuffersToBytes (line 1137) | function arrayBuffersToBytes(arr) {
  function getInheritableProperty (line 1158) | function getInheritableProperty({
  constant ROMAN_NUMBER_MAP (line 1181) | const ROMAN_NUMBER_MAP = ["", "C", "CC", "CCC", "CD", "D", "DC", "DCC", ...
  function toRomanNumerals (line 1182) | function toRomanNumerals(number, lowerCase = false) {
  function log2 (line 1200) | function log2(x) {
  function readInt8 (line 1206) | function readInt8(data, offset) {
  function readUint16 (line 1209) | function readUint16(data, offset) {
  function readUint32 (line 1212) | function readUint32(data, offset) {
  function isWhiteSpace (line 1215) | function isWhiteSpace(ch) {
  function parseXFAPath (line 1218) | function parseXFAPath(path) {
  function escapePDFName (line 1234) | function escapePDFName(str) {
  function escapeString (line 1255) | function escapeString(str) {
  function _collectJS (line 1265) | function _collectJS(entry, xref, list, parents) {
  function collectActions (line 1302) | function collectActions(xref, dict, eventType) {
  function encodeToXmlString (line 1357) | function encodeToXmlString(str) {
  function validateFontName (line 1390) | function validateFontName(fontFamily, mustWarn = false) {
  function validateCSSFont (line 1412) | function validateCSSFont(cssFontInfo) {
  function recoverJsURL (line 1430) | function recoverJsURL(str) {
  function numberToString (line 1447) | function numberToString(value) {
  function getNewAnnotationsMap (line 1460) | function getNewAnnotationsMap(annotationStorage) {
  function isAscii (line 1478) | function isAscii(str) {
  function stringToUTF16HexString (line 1481) | function stringToUTF16HexString(str) {
  function stringToUTF16String (line 1489) | function stringToUTF16String(str, bigEndian = false) {
  function getRotationMatrix (line 1500) | function getRotationMatrix(rotation, width, height) {
  function getSizeInBytes (line 1512) | function getSizeInBytes(x) {
  class Stream (line 1519) | class Stream extends BaseStream {
    method constructor (line 1520) | constructor(arrayBuffer, start, length, dict) {
    method length (line 1528) | get length() {
    method isEmpty (line 1531) | get isEmpty() {
    method getByte (line 1534) | getByte() {
    method getBytes (line 1540) | getBytes(length) {
    method getByteRange (line 1554) | getByteRange(begin, end) {
    method reset (line 1563) | reset() {
    method moveStart (line 1566) | moveStart() {
    method makeSubStream (line 1569) | makeSubStream(start, length, dict = null) {
  class StringStream (line 1573) | class StringStream extends Stream {
    method constructor (line 1574) | constructor(str) {
  class NullStream (line 1578) | class NullStream extends Stream {
    method constructor (line 1579) | constructor() {
  class ChunkedStream (line 1588) | class ChunkedStream extends Stream {
    method constructor (line 1589) | constructor(length, chunkSize, manager) {
    method getMissingChunks (line 1598) | getMissingChunks() {
    method numChunksLoaded (line 1607) | get numChunksLoaded() {
    method isDataLoaded (line 1610) | get isDataLoaded() {
    method onReceiveData (line 1613) | onReceiveData(begin, chunk) {
    method onReceiveProgressiveData (line 1629) | onReceiveProgressiveData(data) {
    method ensureByte (line 1640) | ensureByte(pos) {
    method ensureRange (line 1656) | ensureRange(begin, end) {
    method nextEmptyChunk (line 1674) | nextEmptyChunk(beginChunk) {
    method hasChunk (line 1684) | hasChunk(chunk) {
    method getByte (line 1687) | getByte() {
    method getBytes (line 1697) | getBytes(length) {
    method getByteRange (line 1717) | getByteRange(begin, end) {
    method makeSubStream (line 1729) | makeSubStream(start, length, dict = null) {
    method getBaseStreams (line 1766) | getBaseStreams() {
  class ChunkedStreamManager (line 1770) | class ChunkedStreamManager {
    method constructor (line 1771) | constructor(pdfNetworkStream, args) {
    method sendRequest (line 1786) | sendRequest(begin, end) {
    method requestAllChunks (line 1828) | requestAllChunks(noFetch = false) {
    method _requestChunks (line 1835) | _requestChunks(chunks) {
    method getStream (line 1874) | getStream() {
    method requestRange (line 1877) | requestRange(begin, end) {
    method requestRanges (line 1887) | requestRanges(ranges = []) {
    method groupChunks (line 1903) | groupChunks(chunks) {
    method onProgress (line 1929) | onProgress(args) {
    method onReceiveData (line 1935) | onReceiveData(args) {
    method onError (line 1993) | onError(err) {
    method getBeginChunk (line 1996) | getBeginChunk(begin) {
    method getEndChunk (line 1999) | getEndChunk(end) {
    method abort (line 2002) | abort(reason) {
  function resizeRgbImage (line 2016) | function resizeRgbImage(src, dest, w1, h1, w2, h2, alpha01) {
  class ColorSpace (line 2039) | class ColorSpace {
    method constructor (line 2040) | constructor(name, numComps) {
    method getRgb (line 2047) | getRgb(src, srcOffset) {
    method getRgbItem (line 2052) | getRgbItem(src, srcOffset, dest, destOffset) {
    method getRgbBuffer (line 2055) | getRgbBuffer(src, srcOffset, count, dest, destOffset, bits, alpha01) {
    method getOutputLength (line 2058) | getOutputLength(inputLength, alpha01) {
    method isPassthrough (line 2061) | isPassthrough(bits) {
    method isDefaultDecode (line 2064) | isDefaultDecode(decodeMap, bpc) {
    method fillRgb (line 2067) | fillRgb(dest, originalWidth, originalHeight, width, height, actualHeig...
    method usesZeroToOneRange (line 2121) | get usesZeroToOneRange() {
    method _cache (line 2124) | static _cache(cacheKey, xref, localColorSpaceCache, parsedColorSpace) {
    method getCached (line 2143) | static getCached(cacheKey, xref, localColorSpaceCache) {
    method parseAsync (line 2168) | static async parseAsync({
    method parse (line 2179) | static parse({
    method _parse (line 2194) | static _parse(cs, xref, resources = null, pdfFunctionFactory) {
    method isDefaultDecode (line 2303) | static isDefaultDecode(decode, numComps) {
    method singletons (line 2318) | static get singletons() {
  class AlternateCS (line 2332) | class AlternateCS extends ColorSpace {
    method constructor (line 2333) | constructor(numComps, base, tintFn) {
    method getRgbItem (line 2339) | getRgbItem(src, srcOffset, dest, destOffset) {
    method getRgbBuffer (line 2344) | getRgbBuffer(src, srcOffset, count, dest, destOffset, bits, alpha01) {
    method getOutputLength (line 2375) | getOutputLength(inputLength, alpha01) {
  class PatternCS (line 2379) | class PatternCS extends ColorSpace {
    method constructor (line 2380) | constructor(baseCS) {
    method isDefaultDecode (line 2384) | isDefaultDecode(decodeMap, bpc) {
  class IndexedCS (line 2388) | class IndexedCS extends ColorSpace {
    method constructor (line 2389) | constructor(base, highVal, lookup) {
    method getRgbItem (line 2406) | getRgbItem(src, srcOffset, dest, destOffset) {
    method getRgbBuffer (line 2411) | getRgbBuffer(src, srcOffset, count, dest, destOffset, bits, alpha01) {
    method getOutputLength (line 2422) | getOutputLength(inputLength, alpha01) {
    method isDefaultDecode (line 2425) | isDefaultDecode(decodeMap, bpc) {
  class DeviceGrayCS (line 2440) | class DeviceGrayCS extends ColorSpace {
    method constructor (line 2441) | constructor() {
    method getRgbItem (line 2444) | getRgbItem(src, srcOffset, dest, destOffset) {
    method getRgbBuffer (line 2448) | getRgbBuffer(src, srcOffset, count, dest, destOffset, bits, alpha01) {
    method getOutputLength (line 2460) | getOutputLength(inputLength, alpha01) {
  class DeviceRgbCS (line 2464) | class DeviceRgbCS extends ColorSpace {
    method constructor (line 2465) | constructor() {
    method getRgbItem (line 2468) | getRgbItem(src, srcOffset, dest, destOffset) {
    method getRgbBuffer (line 2473) | getRgbBuffer(src, srcOffset, count, dest, destOffset, bits, alpha01) {
    method getOutputLength (line 2488) | getOutputLength(inputLength, alpha01) {
    method isPassthrough (line 2491) | isPassthrough(bits) {
  class DeviceCmykCS (line 2495) | class DeviceCmykCS extends ColorSpace {
    method constructor (line 2496) | constructor() {
    method #toRgb (line 2499) | #toRgb(src, srcOffset, srcScale, dest, destOffset) {
    method getRgbItem (line 2508) | getRgbItem(src, srcOffset, dest, destOffset) {
    method getRgbBuffer (line 2511) | getRgbBuffer(src, srcOffset, count, dest, destOffset, bits, alpha01) {
    method getOutputLength (line 2519) | getOutputLength(inputLength, alpha01) {
  class CalGrayCS (line 2523) | class CalGrayCS extends ColorSpace {
    method constructor (line 2524) | constructor(whitePoint, blackPoint, gamma) {
    method #toRgb (line 2547) | #toRgb(src, srcOffset, dest, destOffset, scale) {
    method getRgbItem (line 2556) | getRgbItem(src, srcOffset, dest, destOffset) {
    method getRgbBuffer (line 2559) | getRgbBuffer(src, srcOffset, count, dest, destOffset, bits, alpha01) {
    method getOutputLength (line 2567) | getOutputLength(inputLength, alpha01) {
  class CalRGBCS (line 2571) | class CalRGBCS extends ColorSpace {
    method constructor (line 2580) | constructor(whitePoint, blackPoint, gamma, matrix) {
    method #matrixProduct (line 2601) | #matrixProduct(a, b, result) {
    method #toFlat (line 2606) | #toFlat(sourceWhitePoint, LMS, result) {
    method #toD65 (line 2611) | #toD65(sourceWhitePoint, LMS, result) {
    method #sRGBTransferFunction (line 2619) | #sRGBTransferFunction(color) {
    method #adjustToRange (line 2628) | #adjustToRange(min, max, value) {
    method #decodeL (line 2631) | #decodeL(L) {
    method #compensateBlackPoint (line 2640) | #compensateBlackPoint(sourceBlackPoint, XYZ_Flat, result) {
    method #normalizeWhitePointToFlat (line 2664) | #normalizeWhitePointToFlat(sourceWhitePoint, XYZ_In, result) {
    method #normalizeWhitePointToD65 (line 2677) | #normalizeWhitePointToD65(sourceWhitePoint, XYZ_In, result) {
    method #toRgb (line 2684) | #toRgb(src, srcOffset, dest, destOffset, scale) {
    method getRgbItem (line 2710) | getRgbItem(src, srcOffset, dest, destOffset) {
    method getRgbBuffer (line 2713) | getRgbBuffer(src, srcOffset, count, dest, destOffset, bits, alpha01) {
    method getOutputLength (line 2721) | getOutputLength(inputLength, alpha01) {
  class LabCS (line 2725) | class LabCS extends ColorSpace {
    method constructor (line 2726) | constructor(whitePoint, blackPoint, range) {
    method #fn_g (line 2749) | #fn_g(x) {
    method #decode (line 2752) | #decode(value, high1, low2, high2) {
    method #toRgb (line 2755) | #toRgb(src, srcOffset, maxVal, dest, destOffset) {
    method getRgbItem (line 2794) | getRgbItem(src, srcOffset, dest, destOffset) {
    method getRgbBuffer (line 2797) | getRgbBuffer(src, srcOffset, count, dest, destOffset, bits, alpha01) {
    method getOutputLength (line 2805) | getOutputLength(inputLength, alpha01) {
    method isDefaultDecode (line 2808) | isDefaultDecode(decodeMap, bpc) {
    method usesZeroToOneRange (line 2811) | get usesZeroToOneRange() {
  function hexToInt (line 2820) | function hexToInt(a, size) {
  function hexToStr (line 2827) | function hexToStr(a, size) {
  function addHex (line 2836) | function addHex(a, b, size) {
  function incHex (line 2844) | function incHex(a, size) {
  constant MAX_NUM_SIZE (line 2852) | const MAX_NUM_SIZE = 16;
  constant MAX_ENCODED_NUM_SIZE (line 2853) | const MAX_ENCODED_NUM_SIZE = 19;
  class BinaryCMapStream (line 2854) | class BinaryCMapStream {
    method constructor (line 2855) | constructor(data) {
    method readByte (line 2861) | readByte() {
    method readNumber (line 2867) | readNumber() {
    method readSigned (line 2880) | readSigned() {
    method readHex (line 2884) | readHex(num, size) {
    method readHexNumber (line 2888) | readHexNumber(num, size) {
    method readHexSigned (line 2914) | readHexSigned(num, size) {
    method readString (line 2923) | readString() {
  class BinaryCMapReader (line 2932) | class BinaryCMapReader {
    method process (line 2933) | async process(data, cMap, extend) {
  class DecodeStream (line 3079) | class DecodeStream extends BaseStream {
    method constructor (line 3080) | constructor(maybeMinBufferLength) {
    method isEmpty (line 3094) | get isEmpty() {
    method ensureBuffer (line 3100) | ensureBuffer(requested) {
    method getByte (line 3113) | getByte() {
    method getBytes (line 3123) | getBytes(length, ignoreColorSpace = false) {
    method reset (line 3145) | reset() {
    method makeSubStream (line 3148) | makeSubStream(start, length, dict = null) {
    method getBaseStreams (line 3161) | getBaseStreams() {
  class StreamsSequenceStream (line 3165) | class StreamsSequenceStream extends DecodeStream {
    method constructor (line 3166) | constructor(streams, onError = null) {
    method readBlock (line 3175) | readBlock() {
    method getBaseStreams (line 3198) | getBaseStreams() {
  class Ascii85Stream (line 3213) | class Ascii85Stream extends DecodeStream {
    method constructor (line 3214) | constructor(str, maybeLength) {
    method readBlock (line 3223) | readBlock() {
  class AsciiHexStream (line 3279) | class AsciiHexStream extends DecodeStream {
    method constructor (line 3280) | constructor(str, maybeLength) {
    method readBlock (line 3289) | readBlock() {
  constant MAX_UINT_32 (line 3330) | const MAX_UINT_32 = 2 ** 32 - 1;
  class CCITTFaxDecoder (line 3348) | class CCITTFaxDecoder {
    method constructor (line 3349) | constructor(source, options = {}) {
    method readNextChar (line 3387) | readNextChar() {
    method _addPixels (line 3677) | _addPixels(a1, blackPixels) {
    method _addPixelsNeg (line 3693) | _addPixelsNeg(a1, blackPixels) {
    method _findTableCode (line 3719) | _findTableCode(start, end, table, limit) {
    method _getTwoDimCode (line 3739) | _getTwoDimCode() {
    method _getWhiteCode (line 3758) | _getWhiteCode() {
    method _getBlackCode (line 3785) | _getBlackCode() {
    method _lookBits (line 3821) | _lookBits(n) {
    method _eatBits (line 3835) | _eatBits(n) {
  class CCITTFaxStream (line 3846) | class CCITTFaxStream extends DecodeStream {
    method constructor (line 3847) | constructor(str, maybeLength, params) {
    method readBlock (line 3872) | readBlock() {
  class FlateStream (line 3893) | class FlateStream extends DecodeStream {
    method constructor (line 3894) | constructor(str, maybeLength) {
    method getBits (line 3915) | getBits(bits) {
    method getCode (line 3932) | getCode(table) {
    method generateHuffmanTable (line 3956) | generateHuffmanTable(lengths) {
    method #endsStreamOnError (line 3985) | #endsStreamOnError(err) {
    method readBlock (line 3989) | readBlock() {
  class ArithmeticDecoder (line 4368) | class ArithmeticDecoder {
    method constructor (line 4369) | constructor(data, start, end) {
    method byteIn (line 4381) | byteIn() {
    method readBit (line 4405) | readBit(contexts, pos) {
  class Jbig2Error (line 4462) | class Jbig2Error extends BaseException {
    method constructor (line 4463) | constructor(msg) {
  class ContextCache (line 4467) | class ContextCache {
    method getContexts (line 4468) | getContexts(id) {
  class DecodingContext (line 4475) | class DecodingContext {
    method constructor (line 4476) | constructor(data, start, end) {
    method decoder (line 4481) | get decoder() {
    method contextCache (line 4485) | get contextCache() {
  constant MAX_INT_32 (line 4490) | const MAX_INT_32 = 2 ** 31 - 1;
  constant MIN_INT_32 (line 4491) | const MIN_INT_32 = -(2 ** 31);
  function decodeInteger (line 4492) | function decodeInteger(contextCache, procedure, decoder) {
  function decodeIAID (line 4517) | function decodeIAID(contextCache, decoder, codeLength) {
  function decodeBitmapTemplate0 (line 4729) | function decodeBitmapTemplate0(width, height, decodingContext) {
  function decodeBitmap (line 4747) | function decodeBitmap(mmr, width, height, templateIndex, prediction, ski...
  function decodeRefinement (line 4855) | function decodeRefinement(width, height, templateIndex, referenceBitmap,...
  function decodeSymbolDictionary (line 4923) | function decodeSymbolDictionary(huffman, refinement, symbols, numberOfNe...
  function decodeTextRegion (line 5033) | function decodeTextRegion(huffman, refinement, width, height, defaultPix...
  function decodePatternDictionary (line 5140) | function decodePatternDictionary(mmr, patternWidth, patternHeight, maxPa...
  function decodeHalftoneRegion (line 5174) | function decodeHalftoneRegion(mmr, patterns, template, regionWidth, regi...
  function readSegmentHeader (line 5271) | function readSegmentHeader(data, start) {
  function readSegments (line 5363) | function readSegments(header, data, start, end) {
  function readRegionSegmentInformation (line 5392) | function readRegionSegmentInformation(data, start) {
  function processSegment (line 5402) | function processSegment(segment, visitor) {
  function processSegments (line 5594) | function processSegments(segments, visitor) {
  function parseJbig2Chunks (line 5599) | function parseJbig2Chunks(chunks) {
  function parseJbig2 (line 5608) | function parseJbig2(data) {
  class SimpleSegmentVisitor (line 5611) | class SimpleSegmentVisitor {
    method onPageInformation (line 5612) | onPageInformation(info) {
    method drawBitmap (line 5621) | drawBitmap(regionInfo, bitmap) {
    method onImmediateGenericRegion (line 5670) | onImmediateGenericRegion(region, data, start, end) {
    method onImmediateLosslessGenericRegion (line 5676) | onImmediateLosslessGenericRegion() {
    method onSymbolDictionary (line 5679) | onSymbolDictionary(dictionary, currentSegment, referredSegments, data,...
    method onImmediateTextRegion (line 5699) | onImmediateTextRegion(region, referredSegments, data, start, end) {
    method onImmediateLosslessTextRegion (line 5719) | onImmediateLosslessTextRegion() {
    method onPatternDictionary (line 5722) | onPatternDictionary(dictionary, currentSegment, data, start, end) {
    method onImmediateHalftoneRegion (line 5730) | onImmediateHalftoneRegion(region, referredSegments, data, start, end) {
    method onImmediateLosslessHalftoneRegion (line 5737) | onImmediateLosslessHalftoneRegion() {
    method onTables (line 5740) | onTables(currentSegment, data, start, end) {
  class HuffmanLine (line 5748) | class HuffmanLine {
    method constructor (line 5749) | constructor(lineData) {
  class HuffmanTreeNode (line 5767) | class HuffmanTreeNode {
    method constructor (line 5768) | constructor(line) {
    method buildTree (line 5780) | buildTree(line, shift) {
    method decodeNode (line 5792) | decodeNode(reader) {
  class HuffmanTable (line 5807) | class HuffmanTable {
    method constructor (line 5808) | constructor(lines, prefixCodesDone) {
    method decode (line 5820) | decode(reader) {
    method assignPrefixCodes (line 5823) | assignPrefixCodes(lines) {
  function decodeTablesSegment (line 5855) | function decodeTablesSegment(data, start, end) {
  function getStandardTable (line 5883) | function getStandardTable(number) {
  class Reader (line 5945) | class Reader {
    method constructor (line 5946) | constructor(data, start, end) {
    method readBit (line 5954) | readBit() {
    method readBits (line 5966) | readBits(numBits) {
    method byteAlign (line 5974) | byteAlign() {
    method next (line 5977) | next() {
  function getCustomHuffmanTable (line 5984) | function getCustomHuffmanTable(index, referredTo, customTables) {
  function getTextRegionHuffmanTables (line 5997) | function getTextRegionHuffmanTables(textRegion, referredTo, customTables...
  function getSymbolDictionaryHuffmanTables (line 6091) | function getSymbolDictionaryHuffmanTables(dictionary, referredTo, custom...
  function readUncompressedBitmap (line 6138) | function readUncompressedBitmap(reader, width, height) {
  function decodeMMRBitmap (line 6150) | function decodeMMRBitmap(input, width, height, endOfBlock) {
  class Jbig2Image (line 6189) | class Jbig2Image {
    method parseChunks (line 6190) | parseChunks(chunks) {
    method parse (line 6193) | parse(data) {
  class Jbig2Stream (line 6204) | class Jbig2Stream extends DecodeStream {
    method constructor (line 6205) | constructor(stream, maybeLength, params) {
    method bytes (line 6213) | get bytes() {
    method ensureBuffer (line 6216) | ensureBuffer(requested) {}
    method readBlock (line 6217) | readBlock() {
  function convertToRGBA (line 6252) | function convertToRGBA(params) {
  function convertBlackAndWhiteToRGBA (line 6261) | function convertBlackAndWhiteToRGBA({
  function convertRGBToRGBA (line 6302) | function convertRGBToRGBA({
  function grayToRGBA (line 6345) | function grayToRGBA(src, dest) {
  class JpegError (line 6361) | class JpegError extends BaseException {
    method constructor (line 6362) | constructor(msg) {
  class DNLMarkerError (line 6366) | class DNLMarkerError extends BaseException {
    method constructor (line 6367) | constructor(message, scanLines) {
  class EOIMarkerError (line 6372) | class EOIMarkerError extends BaseException {
    method constructor (line 6373) | constructor(msg) {
  function buildHuffmanTable (line 6386) | function buildHuffmanTable(codeLengths, values) {
  function getBlockBufferOffset (line 6430) | function getBlockBufferOffset(component, row, col) {
  function decodeScan (line 6433) | function decodeScan(data, offset, frame, components, resetInterval, spec...
  function quantizeAndInverse (line 6701) | function quantizeAndInverse(component, blockBufferOffset, p) {
  function buildComponentData (line 6904) | function buildComponentData(frame, component) {
  function findNextFileMarker (line 6916) | function findNextFileMarker(data, currentPos, startPos = currentPos) {
  class JpegImage (line 6943) | class JpegImage {
    method constructor (line 6944) | constructor({
    method parse (line 6951) | parse(data, {
    method _getLinearizedBlockData (line 7222) | _getLinearizedBlockData(width, height, isSourcePDF = false) {
    method _isColorConversionNeeded (line 7272) | get _isColorConversionNeeded() {
    method _convertYccToRgb (line 7289) | _convertYccToRgb(data) {
    method _convertYccToRgba (line 7301) | _convertYccToRgba(data, out) {
    method _convertYcckToRgb (line 7313) | _convertYcckToRgb(data) {
    method _convertYcckToRgba (line 7327) | _convertYcckToRgba(data) {
    method _convertYcckToCmyk (line 7340) | _convertYcckToCmyk(data) {
    method _convertCmykToRgb (line 7352) | _convertCmykToRgb(data) {
    method _convertCmykToRgba (line 7366) | _convertCmykToRgba(data) {
    method getData (line 7379) | getData({
  class JpegStream (line 7434) | class JpegStream extends DecodeStream {
    method constructor (line 7435) | constructor(stream, maybeLength, params) {
    method bytes (line 7449) | get bytes() {
    method ensureBuffer (line 7452) | ensureBuffer(requested) {}
    method readBlock (line 7453) | readBlock() {
  function intArrayFromBase64 (line 7596) | function intArrayFromBase64(s) {
  function tryParseAsDataURI (line 7604) | function tryParseAsDataURI(filename) {
  function updateMemoryViews (line 7613) | function updateMemoryViews() {
  function preRun (line 7628) | function preRun() {
  function initRuntime (line 7637) | function initRuntime() {
  function postRun (line 7641) | function postRun() {
  function addOnPreRun (line 7650) | function addOnPreRun(cb) {
  function addOnInit (line 7653) | function addOnInit(cb) {
  function addOnPostRun (line 7656) | function addOnPostRun(cb) {
  function addRunDependency (line 7662) | function addRunDependency(id) {
  function removeRunDependency (line 7666) | function removeRunDependency(id) {
  function getBinarySync (line 7685) | function getBinarySync(file) {
  function instantiateSync (line 7698) | function instantiateSync(file, info) {
  function createWasm (line 7705) | function createWasm() {
  function _jsPrintWarning (line 7874) | function _jsPrintWarning(message_ptr) {
  function _setImageData (line 7878) | function _setImageData(array_ptr, array_size) {
  function _storeErrorMessage (line 7881) | function _storeErrorMessage(message_ptr) {
  function run (line 7912) | function run() {
  class JpxError (line 7957) | class JpxError extends BaseException {
    method constructor (line 7958) | constructor(msg) {
  class JpxImage (line 7962) | class JpxImage {
    method decode (line 7964) | static decode(data, ignoreColorSpace = false) {
    method cleanup (line 7974) | static cleanup() {
    method parseImageProperties (line 7977) | static parseImageProperties(stream) {
  class JpxStream (line 8007) | class JpxStream extends DecodeStream {
    method constructor (line 8008) | constructor(stream, maybeLength, params) {
    method bytes (line 8015) | get bytes() {
    method ensureBuffer (line 8025) | ensureBuffer(requested) {}
    method readBlock (line 8026) | readBlock(ignoreColorSpace) {
  class LZWStream (line 8038) | class LZWStream extends DecodeStream {
    method constructor (line 8039) | constructor(str, maybeLength, earlyChange) {
    method readBits (line 8062) | readBits(n) {
    method readBlock (line 8079) | readBlock() {
  class PredictorStream (line 8157) | class PredictorStream extends DecodeStream {
    method constructor (line 8158) | constructor(str, maybeLength, params) {
    method readBlockTiff (line 8180) | readBlockTiff() {
    method readBlockPng (line 8252) | readBlockPng() {
  class RunLengthStream (line 8342) | class RunLengthStream extends DecodeStream {
    method constructor (line 8343) | constructor(str, maybeLength) {
    method readBlock (line 8348) | readBlock() {
  constant MAX_LENGTH_TO_CACHE (line 8392) | const MAX_LENGTH_TO_CACHE = 1000;
  function getInlineImageCacheKey (line 8393) | function getInlineImageCacheKey(bytes) {
  class Parser (line 8405) | class Parser {
    method constructor (line 8406) | constructor({
    method refill (line 8420) | refill() {
    method shift (line 8424) | shift() {
    method tryShift (line 8433) | tryShift() {
    method getObj (line 8444) | getObj(cipherTransform = null) {
    method findDefaultInlineStreamEnd (line 8511) | findDefaultInlineStreamEnd(stream) {
    method findDCTDecodeInlineStreamEnd (line 8602) | findDCTDecodeInlineStreamEnd(stream) {
    method findASCII85DecodeInlineStreamEnd (line 8679) | findASCII85DecodeInlineStreamEnd(stream) {
    method findASCIIHexDecodeInlineStreamEnd (line 8713) | findASCIIHexDecodeInlineStreamEnd(stream) {
    method inlineStreamSkipEI (line 8731) | inlineStreamSkipEI(stream) {
    method makeInlineImage (line 8746) | makeInlineImage(cipherTransform) {
    method _findStreamLength (line 8825) | _findStreamLength(startPos, signature) {
    method makeStream (line 8854) | makeStream(dict, cipherTransform) {
    method filter (line 8905) | filter(stream, dict, length) {
    method makeFilter (line 8933) | makeFilter(stream, name, maybeLength, params) {
  function toHexDigit (line 8989) | function toHexDigit(ch) {
  class Lexer (line 8998) | class Lexer {
    method constructor (line 8999) | constructor(stream, knownCommands = null) {
    method nextChar (line 9007) | nextChar() {
    method peekChar (line 9010) | peekChar() {
    method getNumber (line 9013) | getNumber() {
    method getString (line 9087) | getString() {
    method getName (line 9185) | getName() {
    method _hexStringWarn (line 9224) | _hexStringWarn(ch) {
    method getHexString (line 9235) | getHexString() {
    method getObj (line 9275) | getObj() {
    method skipToNextLine (line 9377) | skipToNextLine() {
  class Linearization (line 9394) | class Linearization {
    method create (line 9395) | static create(stream) {
  constant BUILT_IN_CMAPS (line 9451) | const BUILT_IN_CMAPS = ["Adobe-GB1-UCS2", "Adobe-CNS1-UCS2", "Adobe-Japa...
  constant MAX_MAP_RANGE (line 9452) | const MAX_MAP_RANGE = 2 ** 24 - 1;
  class CMap (line 9453) | class CMap {
    method constructor (line 9454) | constructor(builtInCMap = false) {
    method addCodespaceRange (line 9463) | addCodespaceRange(n, low, high) {
    method mapCidRange (line 9467) | mapCidRange(low, high, dstLow) {
    method mapBfRange (line 9475) | mapBfRange(low, high, dstLow) {
    method mapBfRangeToArray (line 9490) | mapBfRangeToArray(low, high, array) {
    method mapOne (line 9501) | mapOne(src, dst) {
    method lookup (line 9504) | lookup(code) {
    method contains (line 9507) | contains(code) {
    method forEach (line 9510) | forEach(callback) {
    method charCodeOf (line 9525) | charCodeOf(value) {
    method getMap (line 9537) | getMap() {
    method readCharCode (line 9540) | readCharCode(str, offset, out) {
    method getCharCodeLength (line 9559) | getCharCodeLength(charCode) {
    method length (line 9573) | get length() {
    method isIdentityCMap (line 9576) | get isIdentityCMap() {
  class IdentityCMap (line 9591) | class IdentityCMap extends CMap {
    method constructor (line 9592) | constructor(vertical, n) {
    method mapCidRange (line 9597) | mapCidRange(low, high, dstLow) {
    method mapBfRange (line 9600) | mapBfRange(low, high, dstLow) {
    method mapBfRangeToArray (line 9603) | mapBfRangeToArray(low, high, array) {
    method mapOne (line 9606) | mapOne(src, dst) {
    method lookup (line 9609) | lookup(code) {
    method contains (line 9612) | contains(code) {
    method forEach (line 9615) | forEach(callback) {
    method charCodeOf (line 9620) | charCodeOf(value) {
    method getMap (line 9623) | getMap() {
    method length (line 9630) | get length() {
    method isIdentityCMap (line 9633) | get isIdentityCMap() {
  function strToInt (line 9637) | function strToInt(str) {
  function expectString (line 9644) | function expectString(obj) {
  function expectInt (line 9649) | function expectInt(obj) {
  function parseBfChar (line 9654) | function parseBfChar(cMap, lexer) {
  function parseBfRange (line 9671) | function parseBfRange(cMap, lexer) {
  function parseCidChar (line 9703) | function parseCidChar(cMap, lexer) {
  function parseCidRange (line 9720) | function parseCidRange(cMap, lexer) {
  function parseCodespaceRange (line 9740) | function parseCodespaceRange(cMap, lexer) {
  function parseWMode (line 9762) | function parseWMode(cMap, lexer) {
  function parseCMapName (line 9768) | function parseCMapName(cMap, lexer) {
  function parseCMap (line 9774) | async function parseCMap(cMap, lexer, fetchBuiltInCMap, useCMap) {
  function extendCMap (line 9830) | async function extendCMap(cMap, fetchBuiltInCMap, useCMap) {
  function createBuiltInCMap (line 9846) | async function createBuiltInCMap(name, fetchBuiltInCMap) {
  class CMapFactory (line 9872) | class CMapFactory {
    method create (line 9873) | static async create({
  function getEncoding (line 9904) | function getEncoding(encodingName) {
  constant MAX_SUBR_NESTING (line 9929) | const MAX_SUBR_NESTING = 10;
  constant NUM_STANDARD_CFF_STRINGS (line 9931) | const NUM_STANDARD_CFF_STRINGS = 391;
  method stackFn (line 10049) | stackFn(stack, index) {
  method stackFn (line 10056) | stackFn(stack, index) {
  method stackFn (line 10063) | stackFn(stack, index) {
  method stackFn (line 10070) | stackFn(stack, index) {
  method stackFn (line 10101) | stackFn(stack, index) {
  class CFFParser (line 10141) | class CFFParser {
    method constructor (line 10142) | constructor(file, properties, seacAnalysisEnabled) {
    method parse (line 10147) | parse() {
    method parseHeader (line 10208) | parseHeader() {
    method parseDict (line 10233) | parseDict(dict) {
    method parseIndex (line 10298) | parseIndex(pos) {
    method parseNameIndex (line 10328) | parseNameIndex(index) {
    method parseStringIndex (line 10336) | parseStringIndex(index) {
    method createDict (line 10344) | createDict(Type, dict, strings) {
    method parseCharString (line 10351) | parseCharString(state, data, localSubrIndex, globalSubrIndex) {
    method parseCharStrings (line 10502) | parseCharStrings({
    method emptyPrivateDictionary (line 10569) | emptyPrivateDictionary(parentDict) {
    method parsePrivateDict (line 10574) | parsePrivateDict(parentDict) {
    method parseCharsets (line 10610) | parseCharsets(pos, length, strings, cid) {
    method parseEncoding (line 10656) | parseEncoding(pos, properties, strings, charset) {
    method parseFDSelect (line 10714) | parseFDSelect(pos, length) {
  class CFF (line 10751) | class CFF {
    method constructor (line 10752) | constructor() {
    method duplicateFirstGlyph (line 10765) | duplicateFirstGlyph() {
    method hasGlyphId (line 10776) | hasGlyphId(id) {
  class CFFHeader (line 10784) | class CFFHeader {
    method constructor (line 10785) | constructor(major, minor, hdrSize, offSize) {
  class CFFStrings (line 10792) | class CFFStrings {
    method constructor (line 10793) | constructor() {
    method get (line 10796) | get(index) {
    method getSID (line 10805) | getSID(str) {
    method add (line 10816) | add(value) {
    method count (line 10819) | get count() {
  class CFFIndex (line 10823) | class CFFIndex {
    method constructor (line 10824) | constructor() {
    method add (line 10828) | add(data) {
    method set (line 10832) | set(index, data) {
    method get (line 10836) | get(index) {
    method count (line 10839) | get count() {
  class CFFDict (line 10843) | class CFFDict {
    method constructor (line 10844) | constructor(tables, strings) {
    method setByKey (line 10854) | setByKey(key, value) {
    method setByName (line 10874) | setByName(name, value) {
    method hasName (line 10880) | hasName(name) {
    method getByName (line 10883) | getByName(name) {
    method removeByName (line 10893) | removeByName(name) {
    method createTables (line 10896) | static createTables(layout) {
  class CFFTopDict (line 10918) | class CFFTopDict extends CFFDict {
    method tables (line 10919) | static get tables() {
    method constructor (line 10922) | constructor(strings) {
  class CFFPrivateDict (line 10928) | class CFFPrivateDict extends CFFDict {
    method tables (line 10929) | static get tables() {
    method constructor (line 10932) | constructor(strings) {
  class CFFCharset (line 10942) | class CFFCharset {
    method constructor (line 10943) | constructor(predefined, format, charset, raw) {
  class CFFEncoding (line 10950) | class CFFEncoding {
    method constructor (line 10951) | constructor(predefined, format, encoding, raw) {
  class CFFFDSelect (line 10958) | class CFFFDSelect {
    method constructor (line 10959) | constructor(format, fdSelect) {
    method getFDIndex (line 10963) | getFDIndex(glyphIndex) {
  class CFFOffsetTracker (line 10970) | class CFFOffsetTracker {
    method constructor (line 10971) | constructor() {
    method isTracking (line 10974) | isTracking(key) {
    method track (line 10977) | track(key, location) {
    method offset (line 10983) | offset(value) {
    method setEntryLocation (line 10988) | setEntryLocation(key, values, output) {
  class CFFCompiler (line 11013) | class CFFCompiler {
    method constructor (line 11014) | constructor(cff) {
    method compile (line 11017) | compile() {
    method encodeNumber (line 11089) | encodeNumber(value) {
    method EncodeFloatRegExp (line 11095) | static get EncodeFloatRegExp() {
    method encodeFloat (line 11098) | encodeFloat(num) {
    method encodeInteger (line 11126) | encodeInteger(value) {
    method compileHeader (line 11143) | compileHeader(header) {
    method compileNameIndex (line 11146) | compileNameIndex(names) {
    method compileTopDicts (line 11166) | compileTopDicts(dicts, length, removeCidKeys) {
    method compilePrivateDicts (line 11189) | compilePrivateDicts(dicts, trackers, output) {
    method compileDict (line 11212) | compileDict(dict, offsetTracker) {
    method compileStringIndex (line 11259) | compileStringIndex(strings) {
    method compileCharStrings (line 11266) | compileCharStrings(charStrings) {
    method compileCharset (line 11278) | compileCharset(charset, numGlyphs, strings, isCIDFont) {
    method compileEncoding (line 11309) | compileEncoding(encoding) {
    method compileFDSelect (line 11312) | compileFDSelect(fdSelect) {
    method compileTypedArray (line 11343) | compileTypedArray(data) {
    method compileIndex (line 11346) | compileIndex(index, trackers = []) {
  function mapSpecialUnicodeValues (line 15960) | function mapSpecialUnicodeValues(code) {
  function getUnicodeForGlyph (line 15970) | function getUnicodeForGlyph(name, glyphsUnicodeMap) {
  function getUnicodeRangeFor (line 15998) | function getUnicodeRangeFor(value, lastPosition = -1) {
  function getCharUnicodeCategory (line 16019) | function getCharUnicodeCategory(char) {
  function clearUnicodeCaches (line 16033) | function clearUnicodeCaches() {
  constant SEAC_ANALYSIS_ENABLED (line 16042) | const SEAC_ANALYSIS_ENABLED = true;
  function recoverGlyphName (line 16055) | function recoverGlyphName(name, glyphsUnicodeMap) {
  function type1FontGlyphMapping (line 16070) | function type1FontGlyphMapping(properties, builtInEncoding, glyphNames) {
  function normalizeFontName (line 16117) | function normalizeFontName(name) {
  function getStandardFontName (line 16962) | function getStandardFontName(name) {
  function isKnownFontName (line 16967) | function isKnownFontName(name) {
  class ToUnicodeMap (line 16974) | class ToUnicodeMap {
    method constructor (line 16975) | constructor(cmap = []) {
    method length (line 16978) | get length() {
    method forEach (line 16981) | forEach(callback) {
    method has (line 16986) | has(i) {
    method get (line 16989) | get(i) {
    method charCodeOf (line 16992) | charCodeOf(value) {
    method amend (line 17004) | amend(map) {
  class IdentityToUnicodeMap (line 17010) | class IdentityToUnicodeMap {
    method constructor (line 17011) | constructor(firstChar, lastChar) {
    method length (line 17015) | get length() {
    method forEach (line 17018) | forEach(callback) {
    method has (line 17023) | has(i) {
    method get (line 17026) | get(i) {
    method charCodeOf (line 17032) | charCodeOf(v) {
    method amend (line 17035) | amend(map) {
  class CFFFont (line 17044) | class CFFFont {
    method constructor (line 17045) | constructor(file, properties) {
    method numGlyphs (line 17060) | get numGlyphs() {
    method getCharset (line 17063) | getCharset() {
    method getGlyphMapping (line 17066) | getGlyphMapping() {
    method hasGlyphId (line 17113) | hasGlyphId(id) {
    method _createBuiltInEncoding (line 17116) | _createBuiltInEncoding() {
  function getUint32 (line 17148) | function getUint32(data, offset) {
  function getUint16 (line 17151) | function getUint16(data, offset) {
  function getInt16 (line 17154) | function getInt16(data, offset) {
  function getInt8 (line 17157) | function getInt8(data, offset) {
  function getFloat214 (line 17160) | function getFloat214(data, offset) {
  function getSubroutineBias (line 17163) | function getSubroutineBias(subrs) {
  function parseCmap (line 17173) | function parseCmap(data, start, end) {
  function parseCff (line 17223) | function parseCff(data, start, end, seacAnalysisEnabled) {
  function parseGlyfTable (line 17236) | function parseGlyfTable(glyf, loca, isGlyphLocationsLong) {
  function lookupCmap (line 17254) | function lookupCmap(ranges, unicode) {
  function compileGlyf (line 17275) | function compileGlyf(code, cmds, font) {
  function compileCharString (line 17447) | function compileCharString(charStringCode, cmds, font, glyphId) {
  constant NOOP (line 17810) | const NOOP = [];
  class CompiledFont (line 17811) | class CompiledFont {
    method constructor (line 17812) | constructor(fontMatrix) {
    method getPathJs (line 17820) | getPathJs(unicode) {
    method compileGlyph (line 17843) | compileGlyph(code, glyphId) {
    method compileGlyphImpl (line 17872) | compileGlyphImpl() {
    method hasBuiltPath (line 17875) | hasBuiltPath(unicode) {
  class TrueTypeCompiled (line 17883) | class TrueTypeCompiled extends CompiledFont {
    method constructor (line 17884) | constructor(glyphs, cmap, fontMatrix) {
    method compileGlyphImpl (line 17889) | compileGlyphImpl(code, cmds) {
  class Type2Compiled (line 17893) | class Type2Compiled extends CompiledFont {
    method constructor (line 17894) | constructor(cffInfo, cmap, fontMatrix, glyphNameMap) {
    method compileGlyphImpl (line 17907) | compileGlyphImpl(code, cmds, glyphId) {
  class FontRendererFactory (line 17911) | class FontRendererFactory {
    method create (line 17912) | static create(font, seacAnalysisEnabled) {
  constant ON_CURVE_POINT (line 20978) | const ON_CURVE_POINT = 1 << 0;
  constant X_SHORT_VECTOR (line 20979) | const X_SHORT_VECTOR = 1 << 1;
  constant Y_SHORT_VECTOR (line 20980) | const Y_SHORT_VECTOR = 1 << 2;
  constant REPEAT_FLAG (line 20981) | const REPEAT_FLAG = 1 << 3;
  constant X_IS_SAME_OR_POSITIVE_X_SHORT_VECTOR (line 20982) | const X_IS_SAME_OR_POSITIVE_X_SHORT_VECTOR = 1 << 4;
  constant Y_IS_SAME_OR_POSITIVE_Y_SHORT_VECTOR (line 20983) | const Y_IS_SAME_OR_POSITIVE_Y_SHORT_VECTOR = 1 << 5;
  constant OVERLAP_SIMPLE (line 20984) | const OVERLAP_SIMPLE = 1 << 6;
  constant ARG_1_AND_2_ARE_WORDS (line 20985) | const ARG_1_AND_2_ARE_WORDS = 1 << 0;
  constant ARGS_ARE_XY_VALUES (line 20986) | const ARGS_ARE_XY_VALUES = 1 << 1;
  constant WE_HAVE_A_SCALE (line 20987) | const WE_HAVE_A_SCALE = 1 << 3;
  constant MORE_COMPONENTS (line 20988) | const MORE_COMPONENTS = 1 << 5;
  constant WE_HAVE_AN_X_AND_Y_SCALE (line 20989) | const WE_HAVE_AN_X_AND_Y_SCALE = 1 << 6;
  constant WE_HAVE_A_TWO_BY_TWO (line 20990) | const WE_HAVE_A_TWO_BY_TWO = 1 << 7;
  constant WE_HAVE_INSTRUCTIONS (line 20991) | const WE_HAVE_INSTRUCTIONS = 1 << 8;
  class GlyfTable (line 20992) | class GlyfTable {
    method constructor (line 20993) | constructor({
    method getSize (line 21017) | getSize() {
    method write (line 21023) | write() {
    method scale (line 21052) | scale(factors) {
  class Glyph (line 21058) | class Glyph {
    method constructor (line 21059) | constructor({
    method parse (line 21068) | static parse(pos, glyf) {
    method getSize (line 21092) | getSize() {
    method write (line 21099) | write(pos, buf) {
    method scale (line 21114) | scale(factor) {
  class GlyphHeader (line 21129) | class GlyphHeader {
    method constructor (line 21130) | constructor({
    method parse (line 21143) | static parse(pos, glyf) {
    method getSize (line 21152) | getSize() {
    method write (line 21155) | write(pos, buf) {
    method scale (line 21163) | scale(x, factor) {
  class Contour (line 21168) | class Contour {
    method constructor (line 21169) | constructor({
  class SimpleGlyph (line 21179) | class SimpleGlyph {
    method constructor (line 21180) | constructor({
    method parse (line 21187) | static parse(pos, glyf, numberOfContours) {
    method getSize (line 21271) | getSize() {
    method write (line 21298) | write(pos, buf) {
    method scale (line 21374) | scale(x, factor) {
  class CompositeGlyph (line 21385) | class CompositeGlyph {
    method constructor (line 21386) | constructor({
    method parse (line 21401) | static parse(pos, glyf) {
    method getSize (line 21454) | getSize() {
    method write (line 21469) | write(pos, buf) {
    method scale (line 21505) | scale(x, factor) {}
  function writeInt16 (line 21511) | function writeInt16(dest, offset, num) {
  function writeInt32 (line 21515) | function writeInt32(dest, offset, num) {
  function writeData (line 21521) | function writeData(dest, offset, data) {
  constant OTF_HEADER_SIZE (line 21534) | const OTF_HEADER_SIZE = 12;
  constant OTF_TABLE_ENTRY_SIZE (line 21535) | const OTF_TABLE_ENTRY_SIZE = 16;
  class OpenTypeFileBuilder (line 21536) | class OpenTypeFileBuilder {
    method constructor (line 21537) | constructor(sfnt) {
    method getSearchParams (line 21541) | static getSearchParams(entriesCount, entrySize) {
    method toArray (line 21555) | toArray() {
    method addTable (line 21606) | addTable(tag, data) {
  constant HINTING_ENABLED (line 21619) | const HINTING_ENABLED = false;
  constant COMMAND_MAP (line 21620) | const COMMAND_MAP = {
  class Type1CharString (line 21637) | class Type1CharString {
    method constructor (line 21638) | constructor() {
    method convert (line 21645) | convert(encoded, subrs, seacAnalysisEnabled) {
    method executeCommand (line 21838) | executeCommand(howManyArgs, command, keepStack) {
  constant EEXEC_ENCRYPT_KEY (line 21862) | const EEXEC_ENCRYPT_KEY = 55665;
  constant CHAR_STRS_ENCRYPT_KEY (line 21863) | const CHAR_STRS_ENCRYPT_KEY = 4330;
  function isHexDigit (line 21864) | function isHexDigit(code) {
  function decrypt (line 21867) | function decrypt(data, key, discardNumber) {
  function decryptAscii (line 21888) | function decryptAscii(data, key, discardNumber) {
  function isSpecial (line 21914) | function isSpecial(c) {
  class Type1Parser (line 21917) | class Type1Parser {
    method constructor (line 21918) | constructor(stream, encrypted, seacAnalysisEnabled) {
    method readNumberArray (line 21928) | readNumberArray() {
    method readNumber (line 21940) | readNumber() {
    method readInt (line 21944) | readInt() {
    method readBoolean (line 21948) | readBoolean() {
    method nextChar (line 21952) | nextChar() {
    method prevChar (line 21955) | prevChar() {
    method getToken (line 21959) | getToken() {
    method readCharStrings (line 21988) | readCharStrings(bytes, lenIV) {
    method extractFontProgram (line 21994) | extractFontProgram(properties) {
    method extractFontHeader (line 22127) | extractFontHeader(properties) {
  function findBlock (line 22186) | function findBlock(streamBytes, signature, startIndex) {
  function getHeaderBlock (line 22212) | function getHeaderBlock(stream, suggestedLength) {
  function getEexecBlock (line 22258) | function getEexecBlock(stream, suggestedLength) {
  class Type1Font (line 22268) | class Type1Font {
    method constructor (line 22269) | constructor(name, file, properties) {
    method numGlyphs (line 22299) | get numGlyphs() {
    method getCharset (line 22302) | getCharset() {
    method getGlyphMapping (line 22311) | getGlyphMapping(properties) {
    method hasGlyphId (line 22338) | hasGlyphId(id) {
    method getSeacs (line 22348) | getSeacs(charstrings) {
    method getType2Charstrings (line 22358) | getType2Charstrings(type1Charstrings) {
    method getType2Subrs (line 22365) | getType2Subrs(type1Subrs) {
    method wrap (line 22385) | wrap(name, glyphs, charstrings, subrs, properties) {
  constant PRIVATE_USE_AREAS (line 22473) | const PRIVATE_USE_AREAS = [[0xe000, 0xf8ff], [0x100000, 0x10fffd]];
  constant PDF_GLYPH_SPACE_UNITS (line 22474) | const PDF_GLYPH_SPACE_UNITS = 1000;
  constant EXPORT_DATA_PROPERTIES (line 22475) | const EXPORT_DATA_PROPERTIES = ["ascent", "bbox", "black", "bold", "char...
  constant EXPORT_DATA_EXTRA_PROPERTIES (line 22476) | const EXPORT_DATA_EXTRA_PROPERTIES = ["cMap", "defaultEncoding", "differ...
  function adjustWidths (line 22477) | function adjustWidths(properties) {
  function adjustTrueTypeToUnicode (line 22491) | function adjustTrueTypeToUnicode(properties, isSymbolicFont, nameRecords) {
  function adjustType1ToUnicode (line 22536) | function adjustType1ToUnicode(properties, builtInEncoding) {
  function amendFallbackToUnicode (line 22567) | function amendFallbackToUnicode(properties) {
  class fonts_Glyph (line 22585) | class fonts_Glyph {
    method constructor (line 22586) | constructor(originalCharCode, fontChar, unicode, accent, width, vmetri...
    method category (line 22597) | get category() {
  function int16 (line 22601) | function int16(b0, b1) {
  function writeSignedInt16 (line 22604) | function writeSignedInt16(bytes, index, value) {
  function signedInt16 (line 22608) | function signedInt16(b0, b1) {
  function writeUint32 (line 22612) | function writeUint32(bytes, index, value) {
  function int32 (line 22618) | function int32(b0, b1, b2, b3) {
  function string16 (line 22621) | function string16(value) {
  function safeString16 (line 22624) | function safeString16(value) {
  function isTrueTypeFile (line 22632) | function isTrueTypeFile(file) {
  function isTrueTypeCollectionFile (line 22636) | function isTrueTypeCollectionFile(file) {
  function isOpenTypeFile (line 22640) | function isOpenTypeFile(file) {
  function isType1File (line 22644) | function isType1File(file) {
  function isCFFFile (line 22654) | function isCFFFile(file) {
  function getFontFileType (line 22661) | function getFontFileType(file, {
  function applyStandardFontGlyphMap (line 22692) | function applyStandardFontGlyphMap(map, glyphMap) {
  function buildToFontChar (line 22697) | function buildToFontChar(encoding, glyphsUnicodeMap, differences) {
  function isMacNameRecord (line 22714) | function isMacNameRecord(r) {
  function isWinNameRecord (line 22717) | function isWinNameRecord(r) {
  function convertCidString (line 22720) | function convertCidString(charCode, cid, shouldThrow = false) {
  function adjustMapping (line 22734) | function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUn...
  function getRanges (line 22781) | function getRanges(glyphs, toUnicodeExtraMap, numGlyphs) {
  function createCmapTable (line 22831) | function createCmapTable(glyphs, toUnicodeExtraMap, numGlyphs) {
  function validateOS2Table (line 22914) | function validateOS2Table(os2, file) {
  function createOS2Table (line 22935) | function createOS2Table(properties, charstrings, override) {
  function createPostTable (line 22991) | function createPostTable(properties) {
  function createPostscriptName (line 22995) | function createPostscriptName(name) {
  function createNameTable (line 22998) | function createNameTable(name, proto) {
  class Font (line 23032) | class Font {
    method constructor (line 23033) | constructor(name, file, properties) {
    method renderer (line 23157) | get renderer() {
    method exportData (line 23161) | exportData(extraProperties = false) {
    method fallbackToSystemFont (line 23173) | fallbackToSystemFont(properties) {
    method checkAndRepair (line 23270) | checkAndRepair(name, font, properties) {
    method convert (line 24596) | convert(fontName, font, properties) {
    method spaceWidth (line 24698) | get spaceWidth() {
    method _charToGlyph (line 24728) | _charToGlyph(charcode, isSpace = false) {
    method charsToGlyphs (line 24783) | charsToGlyphs(chars) {
    method getCharPositions (line 24812) | getCharPositions(chars) {
    method glyphCacheValues (line 24830) | get glyphCacheValues() {
    method encodeString (line 24833) | encodeString(str) {
  class ErrorFont (line 24867) | class ErrorFont {
    method constructor (line 24868) | constructor(error) {
    method charsToGlyphs (line 24873) | charsToGlyphs() {
    method encodeString (line 24876) | encodeString(chars) {
    method exportData (line 24879) | exportData(extraProperties = false) {
  class Pattern (line 24900) | class Pattern {
    method constructor (line 24901) | constructor() {
    method parseShading (line 24904) | static parseShading(shading, xref, res, pdfFunctionFactory, localColor...
  class BaseShading (line 24929) | class BaseShading {
    method constructor (line 24931) | constructor() {
    method getIR (line 24936) | getIR() {
  class RadialAxialShading (line 24940) | class RadialAxialShading extends BaseShading {
    method constructor (line 24941) | constructor(dict, xref, resources, pdfFunctionFactory, localColorSpace...
    method getIR (line 25049) | getIR() {
  class MeshStreamReader (line 25071) | class MeshStreamReader {
    method constructor (line 25072) | constructor(stream, context) {
    method hasData (line 25082) | get hasData() {
    method readBits (line 25097) | readBits(n) {
    method align (line 25121) | align() {
    method readFlag (line 25125) | readFlag() {
    method readCoordinate (line 25128) | readCoordinate() {
    method readComponents (line 25136) | readComponents() {
  function buildB (line 25154) | function buildB(count) {
  function getB (line 25163) | function getB(count) {
  function clearPatternCaches (line 25166) | function clearPatternCaches() {
  class MeshShading (line 25169) | class MeshShading extends BaseShading {
    method constructor (line 25173) | constructor(stream, xref, resources, pdfFunctionFactory, localColorSpa...
    method _decodeType4Shading (line 25238) | _decodeType4Shading(reader) {
    method _decodeType5Shading (line 25279) | _decodeType5Shading(reader, verticesPerRow) {
    method _decodeType6Shading (line 25297) | _decodeType6Shading(reader) {
    method _decodeType7Shading (line 25414) | _decodeType7Shading(reader) {
    method _buildFigureFromPatch (line 25539) | _buildFigureFromPatch(index) {
    method _updateBounds (line 25612) | _updateBounds() {
    method _packData (line 25627) | _packData() {
    method getIR (line 25657) | getIR() {
  class D
Condensed preview — 322 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (4,299K chars).
[
  {
    "path": ".changeset/README.md",
    "chars": 512,
    "preview": "# Changesets\n\nHello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that wo"
  },
  {
    "path": ".changeset/config.json",
    "chars": 311,
    "preview": "{\n  \"$schema\": \"https://unpkg.com/@changesets/config@3.1.3/schema.json\",\n  \"changelog\": [\"@changesets/changelog-github\","
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.yml",
    "chars": 1732,
    "preview": "name: Bug Report\ndescription: Report a bug (crashes, errors, unexpected behavior)\ntitle: \"[Bug] \"\nlabels: [\"bug\"]\nbody:\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "chars": 297,
    "preview": "blank_issues_enabled: false\ncontact_links:\n  - name: Documentation\n    url: https://github.com/run-llama/liteparse#readm"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.yml",
    "chars": 887,
    "preview": "name: Feature Request\ndescription: Suggest a new feature or improvement\ntitle: \"[Feature] \"\nlabels: [\"enhancement\"]\nbody"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/parsing_issue.yml",
    "chars": 2562,
    "preview": "name: Parsing Issue\ndescription: Report an issue with document parsing (incorrect output, missing text, etc.)\ntitle: \"[P"
  },
  {
    "path": ".github/workflows/ci.yml",
    "chars": 1145,
    "preview": "name: CI\n\non:\n  push:\n    branches: [main]\n  pull_request:\n    branches: [main]\n\njobs:\n  lint-and-build:\n    runs-on: ub"
  },
  {
    "path": ".github/workflows/e2e-output.yml",
    "chars": 4608,
    "preview": "name: E2E Output Validation\n\non:\n  pull_request:\n    branches: [main]\n  push:\n    branches: [main]\n\n# For PRs, we need w"
  },
  {
    "path": ".github/workflows/homebrew_release.yml",
    "chars": 617,
    "preview": "name: Push Release to HomeBrew repository\n\non:\n  workflow_dispatch:\n\njobs:\n  push-release-homebrew:\n    runs-on: ubuntu-"
  },
  {
    "path": ".github/workflows/ocr_servers.yml",
    "chars": 1008,
    "preview": "name: Validate OCR Servers\n\non:\n  pull_request:\n\njobs:\n  testing_paddleocr:\n    runs-on: ubuntu-latest\n    strategy:\n   "
  },
  {
    "path": ".github/workflows/release.yml",
    "chars": 1423,
    "preview": "name: Release\n\non:\n  push:\n    branches:\n      - main\n\nconcurrency: ${{ github.workflow }}-${{ github.ref }}\n\njobs:\n  re"
  },
  {
    "path": ".github/workflows/sync-docs.yml",
    "chars": 1199,
    "preview": "name: Sync Docs to Developer Hub\n\non:\n  push:\n    branches: [main]\n    paths:\n      - \"docs/**\"\n  workflow_dispatch:\n\njo"
  },
  {
    "path": ".gitignore",
    "chars": 752,
    "preview": "# Dependencies\nnode_modules/\npnpm-lock.yaml\n\n# Build output\ndist/\nbin/\nsea-prep.blob\nsea-config.json\n\n# Environment\n*cla"
  },
  {
    "path": ".prettierignore",
    "chars": 67,
    "preview": "dist\nnode_modules\nsrc/vendor\n*.md\npnpm-lock.yaml\npackage-lock.json\n"
  },
  {
    "path": ".prettierrc",
    "chars": 133,
    "preview": "{\n  \"semi\": true,\n  \"singleQuote\": false,\n  \"tabWidth\": 2,\n  \"trailingComma\": \"es5\",\n  \"printWidth\": 100,\n  \"bracketSpac"
  },
  {
    "path": "AGENTS.md",
    "chars": 6255,
    "preview": "# LiteParse - Agent Documentation\n\n> This file provides comprehensive context for AI coding agents working on this codeb"
  },
  {
    "path": "CHANGELOG.md",
    "chars": 6117,
    "preview": "# @llamaindex/liteparse\n\n## 1.4.6\n\n### Patch Changes\n\n- [#120](https://github.com/run-llama/liteparse/pull/120) [`9cde44"
  },
  {
    "path": "CLAUDE.md",
    "chars": 10,
    "preview": "@AGENTS.md"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 6020,
    "preview": "# Contributing to LiteParse\n\nThank you for your interest in contributing to LiteParse! This document provides guidelines"
  },
  {
    "path": "LICENSE",
    "chars": 11357,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "OCR_API_SPEC.md",
    "chars": 5454,
    "preview": "# LiteParse OCR API Specification\n\nThis document defines the standard HTTP API that OCR servers must implement to work w"
  },
  {
    "path": "README.md",
    "chars": 12571,
    "preview": "# LiteParse\n\n[![CI](https://github.com/run-llama/liteparse/actions/workflows/ci.yml/badge.svg)](https://github.com/run-l"
  },
  {
    "path": "SECURITY.md",
    "chars": 2173,
    "preview": "# Security Policy\n\n## Reporting a Vulnerability\n\nIf you discover a security vulnerability in LiteParse, please report it"
  },
  {
    "path": "cli/README.md",
    "chars": 947,
    "preview": "# cli/\n\nCommand-line interface for LiteParse using Commander.js.\n\n## Files\n\n### parse.ts\n**CLI entry point with two main"
  },
  {
    "path": "cli/parse.ts",
    "chars": 17953,
    "preview": "import { Command, Option } from \"commander\";\nimport fs from \"fs/promises\";\nimport { existsSync, readdirSync, statSync } "
  },
  {
    "path": "dataset_eval_utils/README.md",
    "chars": 3588,
    "preview": "# LiteParse Eval Utils\n\nUtilities for generating and evaluating datasets for PDF parsing performance. Compares text extr"
  },
  {
    "path": "dataset_eval_utils/pyproject.toml",
    "chars": 667,
    "preview": "[project]\nname = \"liteparse-eval\"\nversion = \"0.1.0\"\ndescription = \"Utilities for generating and evaluating datasets for "
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/__init__.py",
    "chars": 464,
    "preview": "\"\"\"LiteParse Eval - Document parsing evaluation and benchmarking toolkit.\"\"\"\n\nfrom liteparse_eval.providers import (\n   "
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/benchmark.py",
    "chars": 9754,
    "preview": "\"\"\"\nPerformance benchmarking tool for parser providers.\n\nMeasures latency and resource usage across multiple runs for a "
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/evaluation.py",
    "chars": 17122,
    "preview": "\"\"\"\nEvaluation and benchmarking script for text extraction and LLM-based document understanding.\n\nThis script provides:\n"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/processing.py",
    "chars": 8389,
    "preview": "\"\"\"\nProcess PDFs and images to create a structured dataset using Anthropic's Claude with vision.\n\"\"\"\n\nimport base64\nimpo"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/providers/__init__.py",
    "chars": 384,
    "preview": "from .llm import LLMProvider, AnthropicProvider, QA_PROMPT\nfrom .parsers import (\n    ParserProvider,\n    LiteparseProvi"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/providers/llm/__init__.py",
    "chars": 143,
    "preview": "from .base import LLMProvider, QA_PROMPT\nfrom .anthropic import AnthropicProvider\n\n__all__ = [\"LLMProvider\", \"AnthropicP"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/providers/llm/anthropic.py",
    "chars": 2412,
    "preview": "from anthropic import Anthropic\n\nfrom .base import LLMProvider, QA_PROMPT, JUDGE_PROMPT\n\n\nclass AnthropicProvider(LLMPro"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/providers/llm/base.py",
    "chars": 2230,
    "preview": "from abc import ABC, abstractmethod\nfrom pathlib import Path\n\nQA_PROMPT = \"<document>{ocr_text}</document>\\n\\nAnswer the"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/providers/parsers/__init__.py",
    "chars": 319,
    "preview": "from .base import ParserProvider\nfrom .liteparse import LiteparseProvider\nfrom .markitdown import MarkItDownProvider\nfro"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/providers/parsers/base.py",
    "chars": 1031,
    "preview": "from abc import ABC, abstractmethod\nfrom pathlib import Path\n\n\nclass ParserProvider(ABC):\n    \"\"\"Abstract base class for"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/providers/parsers/liteparse.py",
    "chars": 2405,
    "preview": "from pathlib import Path\nfrom typing import Optional\n\nfrom liteparse import LiteParse\n\nfrom .base import ParserProvider\n"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/providers/parsers/markitdown.py",
    "chars": 722,
    "preview": "from pathlib import Path\n\nfrom markitdown import MarkItDown\n\nfrom .base import ParserProvider\n\n\nclass MarkItDownProvider"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/providers/parsers/pymupdf.py",
    "chars": 527,
    "preview": "from pathlib import Path\n\nimport fitz  # PyMuPDF\n\nfrom .base import ParserProvider\n\n\nclass PyMuPDFProvider(ParserProvide"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/providers/parsers/pypdf.py",
    "chars": 689,
    "preview": "from pathlib import Path\n\nimport pypdf\n\nfrom .base import ParserProvider\n\n\nclass PyPDFProvider(ParserProvider):\n    \"\"\"\n"
  },
  {
    "path": "dataset_eval_utils/src/liteparse_eval/report.py",
    "chars": 20590,
    "preview": "\"\"\"HTML report generation for text extraction evaluation results.\"\"\"\n\nimport base64\nimport html\nimport io\nfrom datetime "
  },
  {
    "path": "docs/src/content/docs/liteparse/_meta.yml",
    "chars": 43,
    "preview": "label: LiteParse\norder: 1\ncollapsed: false\n"
  },
  {
    "path": "docs/src/content/docs/liteparse/cli-reference.md",
    "chars": 5124,
    "preview": "---\ntitle: CLI Reference\ndescription: Complete reference for all LiteParse CLI commands and options.\nsidebar:\n  order: 5"
  },
  {
    "path": "docs/src/content/docs/liteparse/getting_started.md",
    "chars": 1582,
    "preview": "---\ntitle: Getting Started\ndescription: Install LiteParse and parse your first document in under a minute.\nsidebar:\n  or"
  },
  {
    "path": "docs/src/content/docs/liteparse/guides/_meta.yml",
    "chars": 40,
    "preview": "label: Guides\norder: 2\ncollapsed: false\n"
  },
  {
    "path": "docs/src/content/docs/liteparse/guides/agent-skill.md",
    "chars": 1883,
    "preview": "---\ntitle: Agent Skill\ndescription: Add LiteParse as a skill for coding agents like Claude Code, Cursor, and others.\nsid"
  },
  {
    "path": "docs/src/content/docs/liteparse/guides/library-usage.md",
    "chars": 5185,
    "preview": "---\ntitle: Library Usage\ndescription: Use LiteParse programmatically from TypeScript or Python.\nsidebar:\n  order: 1\n---\n"
  },
  {
    "path": "docs/src/content/docs/liteparse/guides/multi-format.md",
    "chars": 2043,
    "preview": "---\ntitle: Multi-Format Support\ndescription: Parse Word documents, spreadsheets, presentations, and images with LitePars"
  },
  {
    "path": "docs/src/content/docs/liteparse/guides/ocr.md",
    "chars": 4439,
    "preview": "---\ntitle: OCR Configuration\ndescription: Configure OCR in LiteParse — built-in Tesseract, or bring your own via HTTP se"
  },
  {
    "path": "docs/src/content/docs/liteparse/guides/parsing-urls.md",
    "chars": 1300,
    "preview": "---\ntitle: Parsing URLs\ndescription: Parse remote documents by reading URLs.\nsidebar:\n  order: 5\n---\n\nTo parse remote fi"
  },
  {
    "path": "docs/src/content/docs/liteparse/guides/visual-citations.md",
    "chars": 6820,
    "preview": "---\ntitle: Visual Citations with Bounding Boxes\ndescription: Use bounding boxes and screenshots to show exactly where in"
  },
  {
    "path": "docs/src/content/docs/liteparse/index.md",
    "chars": 1570,
    "preview": "---\ntitle: What is LiteParse?\ndescription: Fast, local PDF parsing with spatial text parsing, OCR, and bounding boxes.\ns"
  },
  {
    "path": "docs.config.mjs",
    "chars": 277,
    "preview": "export default {\n  section: \"liteparse\",\n  label: \"LiteParse\",\n  content: [\n    { src: \"./docs/src/content/docs/litepars"
  },
  {
    "path": "eslint.config.js",
    "chars": 855,
    "preview": "import eslint from \"@eslint/js\";\nimport tseslint from \"typescript-eslint\";\nimport eslintConfigPrettier from \"eslint-conf"
  },
  {
    "path": "ocr/README.md",
    "chars": 2680,
    "preview": "# ocr/\n\nExample OCR server implementations that conform to the LiteParse OCR API specification.\n\nThese servers allow you"
  },
  {
    "path": "ocr/easyocr/Dockerfile",
    "chars": 469,
    "preview": "FROM ghcr.io/astral-sh/uv:python3.12-trixie\n\n# Install system dependencies\nRUN apt-get update && apt-get install -y \\\n  "
  },
  {
    "path": "ocr/easyocr/README.md",
    "chars": 1659,
    "preview": "# EasyOCR Service\n\nThis is a simple Flask server that wraps EasyOCR to conform to the LiteParse OCR API specification (s"
  },
  {
    "path": "ocr/easyocr/pyproject.toml",
    "chars": 380,
    "preview": "[project]\nname = \"easyocr-liteparse\"\nversion = \"0.1.0\"\ndescription = \"Add your description here\"\nreadme = \"README.md\"\nre"
  },
  {
    "path": "ocr/easyocr/server.py",
    "chars": 3080,
    "preview": "import io\nimport logging\nfrom typing import Any\n\nimport easyocr\nimport numpy as np\nimport uvicorn\nfrom fastapi import Fa"
  },
  {
    "path": "ocr/easyocr/test_server.py",
    "chars": 1997,
    "preview": "import io\nfrom typing import Any\n\nimport pytest\nfrom fastapi.testclient import TestClient\nfrom PIL import Image\n\nfrom se"
  },
  {
    "path": "ocr/paddleocr/Dockerfile",
    "chars": 469,
    "preview": "FROM ghcr.io/astral-sh/uv:python3.12-trixie\n\n# Install system dependencies\nRUN apt-get update && apt-get install -y \\\n  "
  },
  {
    "path": "ocr/paddleocr/README.md",
    "chars": 2358,
    "preview": "# PaddleOCR Service\n\nThis is a simple Flask server that wraps PaddleOCR to conform to the LiteParse OCR API specificatio"
  },
  {
    "path": "ocr/paddleocr/pyproject.toml",
    "chars": 584,
    "preview": "[project]\nname = \"paddleocr-liteparse\"\nversion = \"0.1.0\"\ndescription = \"Add your description here\"\nreadme = \"README.md\"\n"
  },
  {
    "path": "ocr/paddleocr/server.py",
    "chars": 5582,
    "preview": "import io\nimport logging\nfrom typing import Any\n\nimport numpy as np\nimport uvicorn\nfrom fastapi import FastAPI, HTTPExce"
  },
  {
    "path": "ocr/paddleocr/test_server.py",
    "chars": 3196,
    "preview": "import io\nfrom typing import Any\n\nimport pytest\nfrom fastapi.testclient import TestClient\nfrom paddleocr import PaddleOC"
  },
  {
    "path": "package.json",
    "chars": 2599,
    "preview": "{\n  \"name\": \"@llamaindex/liteparse\",\n  \"version\": \"1.4.6\",\n  \"description\": \"Open-source PDF parsing with spatial text e"
  },
  {
    "path": "packages/python/README.md",
    "chars": 2920,
    "preview": "# LiteParse Python\n\nPython wrapper for [LiteParse](https://github.com/run-llama/liteparse) - fast, lightweight document "
  },
  {
    "path": "packages/python/liteparse/__init__.py",
    "chars": 630,
    "preview": "from .parser import LiteParse\nfrom .types import (\n    # Enums\n    OutputFormat,\n    ImageFormat,\n    # Results\n    Pars"
  },
  {
    "path": "packages/python/liteparse/parser.py",
    "chars": 28779,
    "preview": "\"\"\"LiteParse Python wrapper - wraps the Node.js CLI via subprocess.\"\"\"\n\nimport asyncio\nimport json\nimport os\nimport shut"
  },
  {
    "path": "packages/python/liteparse/py.typed",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "packages/python/liteparse/types.py",
    "chars": 2664,
    "preview": "from __future__ import annotations\n\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Li"
  },
  {
    "path": "packages/python/pyproject.toml",
    "chars": 1406,
    "preview": "[build-system]\nrequires = [\"hatchling\"]\nbuild-backend = \"hatchling.build\"\n\n[project]\nname = \"liteparse\"\nversion = \"1.2.1"
  },
  {
    "path": "packages/python/tests/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "packages/python/tests/conftest.py",
    "chars": 1047,
    "preview": "\"\"\"Shared fixtures for e2e tests.\"\"\"\n\nfrom pathlib import Path\n\nimport pytest\n\nfrom liteparse import LiteParse\n\n# Resolv"
  },
  {
    "path": "packages/python/tests/test_batch_e2e.py",
    "chars": 2936,
    "preview": "\"\"\"E2E tests for LiteParse.batch_parse() — validates Python types match CLI output.\"\"\"\n\nimport tempfile\nfrom pathlib imp"
  },
  {
    "path": "packages/python/tests/test_parse_e2e.py",
    "chars": 7013,
    "preview": "\"\"\"E2E tests for LiteParse.parse() — validates Python types match CLI output.\"\"\"\n\nfrom pathlib import Path\n\nimport pytes"
  },
  {
    "path": "packages/python/tests/test_screenshot_e2e.py",
    "chars": 3830,
    "preview": "\"\"\"E2E tests for LiteParse.screenshot() — validates Python types match CLI output.\"\"\"\n\nimport tempfile\nfrom pathlib impo"
  },
  {
    "path": "scripts/compare-dataset.ts",
    "chars": 10984,
    "preview": "/**\n * Compares current liteparse output against a baseline dataset\n *\n * Usage:\n *   npx tsx scripts/compare-dataset.ts"
  },
  {
    "path": "scripts/compare-outputs.sh",
    "chars": 717,
    "preview": "#!/bin/bash\n# Compare dataset outputs and set GitHub Actions output variables\n# Usage: ./compare-outputs.sh <expected-da"
  },
  {
    "path": "scripts/create-dataset.ts",
    "chars": 5948,
    "preview": "/**\n * Creates a dataset for regression testing\n *\n * Output structure:\n *   dataset/\n *     data/\n *       doc1.pdf\n * "
  },
  {
    "path": "scripts/generate-api-docs.sh",
    "chars": 664,
    "preview": "#!/usr/bin/env bash\n# Generates API reference docs from TypeDoc and outputs a single\n# Starlight-compatible markdown fil"
  },
  {
    "path": "scripts/publish-to-homebrew-repo.sh",
    "chars": 678,
    "preview": "#!/bin/bash\n\necho \"Installing homebrew-npm-noob tool\"\nuv tool install --upgrade homebrew-npm-noob\necho \"Setting up repos"
  },
  {
    "path": "scripts/sync-docs-to-developer-hub.sh",
    "chars": 641,
    "preview": "#!/usr/bin/env bash\nset -euo pipefail\n\nDOCS_REPO=\"${1:?Usage: $0 /path/to/developer-hub-repo}\"\nSCRIPT_DIR=\"$(cd \"$(dirna"
  },
  {
    "path": "scripts/upload-dataset.ts",
    "chars": 2770,
    "preview": "/**\n * Regenerates and uploads the dataset to HuggingFace\n *\n * Usage:\n *   HF_TOKEN=xxx npx tsx scripts/upload-dataset."
  },
  {
    "path": "src/conversion/README.md",
    "chars": 3623,
    "preview": "# src/conversion/\n\nMulti-format document conversion to PDF using external tools.\n\n## Files\n\n### convertToPdf.ts\n**Conver"
  },
  {
    "path": "src/conversion/convertToPdf.test.ts",
    "chars": 12671,
    "preview": "import os from \"os\";\nimport path from \"path\";\nimport { vi, describe, it, expect, afterEach } from \"vitest\";\n\ninterface S"
  },
  {
    "path": "src/conversion/convertToPdf.ts",
    "chars": 13525,
    "preview": "import { promises as fs, constants as fsConstants } from \"fs\";\nimport { spawn } from \"child_process\";\nimport path from \""
  },
  {
    "path": "src/core/README.md",
    "chars": 4231,
    "preview": "# src/core/\n\nThe core module contains the main orchestrator class, configuration management, and TypeScript type definit"
  },
  {
    "path": "src/core/config.test.ts",
    "chars": 816,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { LiteParseConfig } from \"./types\";\nimport { DEFAULT_CONFIG, merge"
  },
  {
    "path": "src/core/config.ts",
    "chars": 798,
    "preview": "import { LiteParseConfig } from \"./types.js\";\n\nexport const DEFAULT_CONFIG: LiteParseConfig = {\n  // OCR - defaults to i"
  },
  {
    "path": "src/core/parser.test.ts",
    "chars": 17986,
    "preview": "import { vi, describe, it, expect } from \"vitest\";\n\nconst {\n  mockPages,\n  mockPdfDocument,\n  mockParsedPages,\n  mockBou"
  },
  {
    "path": "src/core/parser.ts",
    "chars": 17558,
    "preview": "import pLimit from \"p-limit\";\nimport {\n  LiteParseConfig,\n  LiteParseInput,\n  ParseResult,\n  ScreenshotResult,\n  TextIte"
  },
  {
    "path": "src/core/types.ts",
    "chars": 10996,
    "preview": "/**\n * Supported output formats for parsed documents.\n *\n * - `\"json\"` — Structured JSON with per-page text items, bound"
  },
  {
    "path": "src/engines/README.md",
    "chars": 1555,
    "preview": "# src/engines/\n\nThe engines module provides pluggable abstractions for PDF parsing and OCR functionality. Each engine ty"
  },
  {
    "path": "src/engines/ocr/README.md",
    "chars": 2890,
    "preview": "# src/engines/ocr/\n\nOCR engines for extracting text from images.\n\n## Files\n\n### interface.ts\n**Defines the OcrEngine con"
  },
  {
    "path": "src/engines/ocr/http-simple.test.ts",
    "chars": 3703,
    "preview": "import { vi, describe, it, expect } from \"vitest\";\nimport { PassThrough } from \"stream\";\n\nconst mockStream = new PassThr"
  },
  {
    "path": "src/engines/ocr/http-simple.ts",
    "chars": 2621,
    "preview": "import axios from \"axios\";\nimport FormData from \"form-data\";\nimport fs from \"fs\";\nimport { OcrEngine, OcrOptions, OcrRes"
  },
  {
    "path": "src/engines/ocr/interface.ts",
    "chars": 424,
    "preview": "export interface OcrEngine {\n  name: string;\n  recognize(image: string | Buffer, options: OcrOptions): Promise<OcrResult"
  },
  {
    "path": "src/engines/ocr/tesseract.test.ts",
    "chars": 3320,
    "preview": "import { vi, describe, it, expect } from \"vitest\";\n\n// In tesseract.js v6+, words are nested in blocks → paragraphs → li"
  },
  {
    "path": "src/engines/ocr/tesseract.ts",
    "chars": 7064,
    "preview": "import { createWorker, createScheduler, Scheduler, Worker } from \"tesseract.js\";\nimport { OcrEngine, OcrOptions, OcrResu"
  },
  {
    "path": "src/engines/pdf/README.md",
    "chars": 5114,
    "preview": "# src/engines/pdf/\n\nPDF parsing engines for loading documents and extracting content.\n\n## Files\n\n### interface.ts\n**Defi"
  },
  {
    "path": "src/engines/pdf/interface.ts",
    "chars": 2111,
    "preview": "import { TextItem } from \"../../core/types.js\";\n\n/** Options for page extraction */\nexport interface ExtractOptions {\n  "
  },
  {
    "path": "src/engines/pdf/pdfium-renderer.test.ts",
    "chars": 3806,
    "preview": "import { vi, describe, it, expect, beforeEach } from \"vitest\";\nimport { PdfiumRenderer } from \"./pdfium-renderer\";\n\ncons"
  },
  {
    "path": "src/engines/pdf/pdfium-renderer.ts",
    "chars": 6005,
    "preview": "import { PDFiumLibrary, PDFiumDocument, type PDFiumPageRenderOptions } from \"@hyzyla/pdfium\";\nimport sharp from \"sharp\";"
  },
  {
    "path": "src/engines/pdf/pdfjs.test.ts",
    "chars": 7636,
    "preview": "import { dirname, join } from \"path\";\nimport { fileURLToPath } from \"url\";\nimport { vi, describe, it, expect } from \"vit"
  },
  {
    "path": "src/engines/pdf/pdfjs.ts",
    "chars": 26409,
    "preview": "import fs from \"node:fs/promises\";\nimport {\n  PdfEngine,\n  PdfDocument,\n  PageData,\n  Image,\n  Annotation,\n  BoundingBox"
  },
  {
    "path": "src/engines/pdf/pdfjsImporter.ts",
    "chars": 348,
    "preview": "import { fileURLToPath } from \"node:url\";\nimport { dirname } from \"node:path\";\n\nexport async function importPdfJs() {\n  "
  },
  {
    "path": "src/index.ts",
    "chars": 109,
    "preview": "#!/usr/bin/env node\n\nimport { program } from \"../cli/parse.js\";\n\n// Run the CLI\nprogram.parse(process.argv);\n"
  },
  {
    "path": "src/lib.ts",
    "chars": 781,
    "preview": "/**\n * @packageDocumentation\n *\n * LiteParse — open-source PDF parsing with spatial text extraction, OCR, and bounding b"
  },
  {
    "path": "src/output/README.md",
    "chars": 1703,
    "preview": "# src/output/\n\nOutput formatters that convert parsed document data into different formats.\n\n## Files\n\n### json.ts\n**Stru"
  },
  {
    "path": "src/output/json.test.ts",
    "chars": 5247,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { buildJSON, formatJSON } from \"./json\";\n\nconst results = [\n  { te"
  },
  {
    "path": "src/output/json.ts",
    "chars": 877,
    "preview": "import { ParseResult, ParsedPage, ParseResultJson } from \"../core/types.js\";\n\n/**\n * Build JSON output from parsed pages"
  },
  {
    "path": "src/output/text.test.ts",
    "chars": 1508,
    "preview": "import { describe, expect, it } from \"vitest\";\nimport { formatPageText, formatText } from \"./text\";\n\nconst pages = [\n  {"
  },
  {
    "path": "src/output/text.ts",
    "chars": 454,
    "preview": "import { ParseResult, ParsedPage } from \"../core/types.js\";\n\n/**\n * Format pages as plain text\n */\nexport function forma"
  },
  {
    "path": "src/processing/README.md",
    "chars": 10402,
    "preview": "# src/processing/\n\nThe processing module is the **heart of LiteParse** - responsible for transforming raw PDF content in"
  },
  {
    "path": "src/processing/bbox.test.ts",
    "chars": 8931,
    "preview": "import { expect, describe, it } from \"vitest\";\nimport { buildBbox, buildBoundingBoxes, filterImagesForOCR } from \"./bbox"
  },
  {
    "path": "src/processing/bbox.ts",
    "chars": 9405,
    "preview": "import {\n  TextItem,\n  BoundingBox,\n  ProjectionTextBox,\n  OcrData,\n  LiteParseConfig,\n} from \"../core/types.js\";\nimport"
  },
  {
    "path": "src/processing/cleanText.test.ts",
    "chars": 1066,
    "preview": "import { describe, expect, it } from \"vitest\";\nimport { ParsedPage } from \"../core/types.js\";\nimport { cleanRawText } fr"
  },
  {
    "path": "src/processing/cleanText.ts",
    "chars": 2276,
    "preview": "import { ParsedPage, LiteParseConfig } from \"../core/types.js\";\n\n/**\n * Detect and remove margins from a single page.\n *"
  },
  {
    "path": "src/processing/grid.ts",
    "chars": 583,
    "preview": "import { PageData } from \"../engines/pdf/interface.js\";\nimport { ParsedPage, LiteParseConfig } from \"../core/types.js\";\n"
  },
  {
    "path": "src/processing/gridDebugLogger.ts",
    "chars": 14969,
    "preview": "import { mkdirSync, writeFileSync } from \"fs\";\nimport { mkdir, writeFile } from \"fs/promises\";\nimport { dirname } from \""
  },
  {
    "path": "src/processing/gridProjection.test.ts",
    "chars": 14086,
    "preview": "import { expect, describe, it } from \"vitest\";\nimport { bboxToLine, projectPagesToGrid, projectToGrid } from \"./gridProj"
  },
  {
    "path": "src/processing/gridProjection.ts",
    "chars": 70087,
    "preview": "import { strToSubscriptString, strToPostScript } from \"./textUtils.js\";\nimport { buildBbox } from \"./bbox.js\";\nimport { "
  },
  {
    "path": "src/processing/gridVisualizer.ts",
    "chars": 7060,
    "preview": "import sharp from \"sharp\";\nimport { mkdirSync } from \"fs\";\nimport type { VisualizerPageData, RenderedSegment } from \"./g"
  },
  {
    "path": "src/processing/markupUtils.test.ts",
    "chars": 923,
    "preview": "import { describe, expect, it } from \"vitest\";\nimport { MarkupData } from \"../core/types.js\";\nimport { applyMarkupTags }"
  },
  {
    "path": "src/processing/markupUtils.ts",
    "chars": 707,
    "preview": "import { MarkupData } from \"../core/types.js\";\n\n/**\n * Apply markup tags to text based on markup data\n * Returns text wi"
  },
  {
    "path": "src/processing/ocrUtils.ts",
    "chars": 2999,
    "preview": "import { Image, EasyOcrResultLine } from \"../engines/pdf/interface.js\";\n\nexport interface OcrBlock {\n  c: string; // Con"
  },
  {
    "path": "src/processing/octUtils.test.ts",
    "chars": 1868,
    "preview": "import { describe, expect, it } from \"vitest\";\nimport { parseImageOcrBlocks, easyOcrResultLinesToList } from \"./ocrUtils"
  },
  {
    "path": "src/processing/searchItems.test.ts",
    "chars": 3891,
    "preview": "import { describe, expect, it } from \"vitest\";\nimport { searchItems } from \"./searchItems\";\nimport { JsonTextItem } from"
  },
  {
    "path": "src/processing/searchItems.ts",
    "chars": 3424,
    "preview": "import { JsonTextItem, SearchItemsOptions } from \"../core/types.js\";\n\n/**\n * Search text items for matches, returning sy"
  },
  {
    "path": "src/processing/textUtils.test.ts",
    "chars": 2003,
    "preview": "import { describe, expect, it } from \"vitest\";\nimport { strToPostScript, strToSubscriptString, cleanOcrTableArtifacts } "
  },
  {
    "path": "src/processing/textUtils.ts",
    "chars": 3066,
    "preview": "/**\n * Clean common OCR artifacts from table documents.\n * OCR often misreads vertical table border lines as bracket-lik"
  },
  {
    "path": "src/vendor/pdfjs/LICENSE",
    "chars": 10174,
    "preview": "\n                                 Apache License\n                           Version 2.0, January 2004\n                  "
  },
  {
    "path": "src/vendor/pdfjs/README.md",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "src/vendor/pdfjs/cmaps/CNS2-V.bcmap",
    "chars": 91,
    "preview": "\u0003RCopyright 1990-2009 Adobe Systems Incorporated.\nAll rights reserved.\nSee ./LICENSE\u0006CNS2-H"
  },
  {
    "path": "src/vendor/pdfjs/cmaps/ETenms-B5-H.bcmap",
    "chars": 99,
    "preview": "\u0002RCopyright 1990-2009 Adobe Systems Incorporated.\nAll rights reserved.\nSee ./LICENSE\tETen-B5-H`\u0001 ^\u0001"
  },
  {
    "path": "src/vendor/pdfjs/cmaps/GB-H.bcmap",
    "chars": 356,
    "preview": "\u0002RCopyright 1990-2009 Adobe Systems Incorporated.\nAll rights reserved.\nSee ./LICENSE\u0001\u0001!!]aX!!]`21>\u0002\tp\u0002\u000bz$]\u0006\"Rd-U7*\u0017\r\b4%+"
  },
  {
    "path": "src/vendor/pdfjs/cmaps/LICENSE",
    "chars": 2080,
    "preview": "%%Copyright: -----------------------------------------------------------\n%%Copyright: Copyright 1990-2009 Adobe Systems "
  },
  {
    "path": "src/vendor/pdfjs/pdf.mjs",
    "chars": 635853,
    "preview": "/**\n * @licstart The following is the entire license notice for the\n * JavaScript code in this page\n *\n * Copyright 2023"
  },
  {
    "path": "src/vendor/pdfjs/pdf.sandbox.mjs",
    "chars": 723741,
    "preview": "/**\n * @licstart The following is the entire license notice for the\n * JavaScript code in this page\n *\n * Copyright 2023"
  },
  {
    "path": "src/vendor/pdfjs/pdf.worker.mjs",
    "chars": 2160028,
    "preview": "/**\n * @licstart The following is the entire license notice for the\n * JavaScript code in this page\n *\n * Copyright 2023"
  },
  {
    "path": "src/vendor/pdfjs/standard_fonts/LICENSE_FOXIT",
    "chars": 1553,
    "preview": "// Copyright 2014 PDFium Authors. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or "
  },
  {
    "path": "src/vendor/pdfjs/standard_fonts/LICENSE_LIBERATION",
    "chars": 4414,
    "preview": "Digitized data copyright (c) 2010 Google Corporation\n\twith Reserved Font Arimo, Tinos and Cousine.\nCopyright (c) 2012 Re"
  },
  {
    "path": "tsconfig.json",
    "chars": 639,
    "preview": "{\n  \"compilerOptions\": {\n    \"target\": \"ES2022\",\n    \"module\": \"ESNext\",\n    \"moduleResolution\": \"node\",\n    \"lib\": [\"ES"
  },
  {
    "path": "typedoc.json",
    "chars": 456,
    "preview": "{\n  \"$schema\": \"https://typedoc.org/schema.json\",\n  \"entryPoints\": [\"./src/lib.ts\"],\n  \"plugin\": [\"typedoc-plugin-markdo"
  },
  {
    "path": "vitest.config.ts",
    "chars": 523,
    "preview": "import { defineConfig } from \"vitest/config\";\n\nexport default defineConfig({\n  test: {\n    globals: true,\n    environmen"
  }
]

// ... and 175 more files (download for full content)

About this extraction

This page contains the full source code of the run-llama/liteparse GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 322 files (4.0 MB), approximately 1.0M tokens, and a symbol index with 5040 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!