Showing preview only (2,879K chars total). Download the full file or copy to clipboard to get everything.
Repository: opendataloader-project/opendataloader-pdf
Branch: main
Commit: ad359d9701e5
Files: 287
Total size: 2.7 MB
Directory structure:
gitextract_cwg3t_9k/
├── .editorconfig
├── .gitattributes
├── .github/
│ ├── CODEOWNERS
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ ├── config.yml
│ │ ├── feature_request.md
│ │ └── question.md
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── SECURITY.md
│ └── workflows/
│ ├── release.yml
│ ├── sync-docs.yml
│ └── test-benchmark.yml
├── .gitignore
├── CHANGELOG.md
├── CLAUDE.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── LICENSE_TEMPLATE/
│ └── license.txt
├── NOTICE
├── README.md
├── SUPPORT.md
├── THIRD_PARTY/
│ ├── THIRD_PARTY_LICENSES.md
│ ├── THIRD_PARTY_NOTICES.md
│ └── licenses/
│ ├── BSD-2-Clause.txt
│ ├── BSD-3-Clause.txt
│ ├── Blue-Oak-1.0.0.txt
│ ├── CDDL-1.1.txt
│ ├── EDL-1.0.txt
│ ├── EPL-1.0.txt
│ ├── EPL-2.0.txt
│ ├── ISC.txt
│ ├── LICENSE-JJ2000.txt
│ ├── MIT.txt
│ ├── MPL-2.0.txt
│ ├── PSF-2.0.txt
│ └── Plexus Classworlds License.txt
├── build-scripts/
│ ├── fetch_shaded_jar.py
│ └── set_version.py
├── content/
│ └── docs/
│ ├── _generated/
│ │ ├── node-convert-options.mdx
│ │ └── python-convert-options.mdx
│ ├── accessibility-compliance.mdx
│ ├── accessibility-glossary.mdx
│ ├── ai-safety.mdx
│ ├── benchmark/
│ │ ├── index.mdx
│ │ ├── meta.json
│ │ ├── mhs.mdx
│ │ ├── nid.mdx
│ │ ├── speed.mdx
│ │ └── teds.mdx
│ ├── cli-options-reference.mdx
│ ├── community.mdx
│ ├── contributing.mdx
│ ├── development-workflow.mdx
│ ├── faq.mdx
│ ├── hybrid-mode.mdx
│ ├── index.mdx
│ ├── json-schema.mdx
│ ├── license.mdx
│ ├── meta.json
│ ├── quick-start-java.mdx
│ ├── quick-start-nodejs.mdx
│ ├── quick-start-python.mdx
│ ├── rag-integration.mdx
│ ├── reading-order.mdx
│ ├── tagged-pdf-collaboration.mdx
│ ├── tagged-pdf-rag.mdx
│ ├── tagged-pdf.mdx
│ ├── upcoming-roadmap.mdx
│ └── whats-new-v2.mdx
├── docs/
│ ├── hybrid/
│ │ ├── docling-speed-optimization-plan.md
│ │ ├── experiments/
│ │ │ ├── chunking_strategy/
│ │ │ │ ├── conclusion.json
│ │ │ │ ├── docling_benchmark_report.json
│ │ │ │ └── docling_page_range_benchmark.py
│ │ │ ├── speed/
│ │ │ │ ├── baseline_results.json
│ │ │ │ ├── fastapi_results.json
│ │ │ │ ├── speed-experiment-2026-01-03.md
│ │ │ │ └── subprocess_results.json
│ │ │ └── triage/
│ │ │ └── triage-experiments.md
│ │ ├── hybrid-mode-design.md
│ │ ├── hybrid-mode-tasks.md
│ │ └── research/
│ │ ├── comparison-summary.md
│ │ ├── docling-openapi.json
│ │ ├── docling-sample-response-lorem.json
│ │ ├── docling-sample-response.json
│ │ ├── documents-with-tables.txt
│ │ ├── iobject-structure.md
│ │ ├── opendataloader-sample-response.json
│ │ └── opendataloader-sample-response.md
│ └── superpowers/
│ ├── plans/
│ │ └── 2026-03-16-cid-font-detection.md
│ └── specs/
│ └── 2026-03-16-cid-font-detection-design.md
├── examples/
│ └── python/
│ ├── batch/
│ │ ├── README.md
│ │ ├── batch_processing.py
│ │ └── requirements.txt
│ └── rag/
│ ├── README.md
│ ├── basic_chunking.py
│ ├── langchain_example.py
│ └── requirements.txt
├── java/
│ ├── .run/
│ │ └── OpenDataLoaderCli.run.xml
│ ├── checkstyle.xml
│ ├── opendataloader-pdf-cli/
│ │ ├── pom.xml
│ │ └── src/
│ │ ├── main/
│ │ │ └── java/
│ │ │ └── org/
│ │ │ └── opendataloader/
│ │ │ └── pdf/
│ │ │ └── cli/
│ │ │ ├── CLIMain.java
│ │ │ └── CLIOptions.java
│ │ └── test/
│ │ └── java/
│ │ └── org/
│ │ └── opendataloader/
│ │ └── pdf/
│ │ └── cli/
│ │ ├── CLIMainTest.java
│ │ ├── CLIOptionsContentSafetyTest.java
│ │ └── CLIOptionsTest.java
│ ├── opendataloader-pdf-core/
│ │ ├── pom.xml
│ │ └── src/
│ │ ├── main/
│ │ │ └── java/
│ │ │ └── org/
│ │ │ └── opendataloader/
│ │ │ └── pdf/
│ │ │ ├── api/
│ │ │ │ ├── Config.java
│ │ │ │ ├── FilterConfig.java
│ │ │ │ └── OpenDataLoaderPDF.java
│ │ │ ├── containers/
│ │ │ │ └── StaticLayoutContainers.java
│ │ │ ├── entities/
│ │ │ │ ├── SemanticFormula.java
│ │ │ │ └── SemanticPicture.java
│ │ │ ├── html/
│ │ │ │ ├── HtmlGenerator.java
│ │ │ │ ├── HtmlGeneratorFactory.java
│ │ │ │ └── HtmlSyntax.java
│ │ │ ├── hybrid/
│ │ │ │ ├── DoclingFastServerClient.java
│ │ │ │ ├── DoclingSchemaTransformer.java
│ │ │ │ ├── HancomClient.java
│ │ │ │ ├── HancomSchemaTransformer.java
│ │ │ │ ├── HybridClient.java
│ │ │ │ ├── HybridClientFactory.java
│ │ │ │ ├── HybridConfig.java
│ │ │ │ ├── HybridSchemaTransformer.java
│ │ │ │ ├── TriageLogger.java
│ │ │ │ └── TriageProcessor.java
│ │ │ ├── json/
│ │ │ │ ├── JsonName.java
│ │ │ │ ├── JsonWriter.java
│ │ │ │ ├── ObjectMapperHolder.java
│ │ │ │ └── serializers/
│ │ │ │ ├── CaptionSerializer.java
│ │ │ │ ├── DoubleSerializer.java
│ │ │ │ ├── FormulaSerializer.java
│ │ │ │ ├── HeaderFooterSerializer.java
│ │ │ │ ├── HeadingSerializer.java
│ │ │ │ ├── ImageSerializer.java
│ │ │ │ ├── LineChunkSerializer.java
│ │ │ │ ├── ListItemSerializer.java
│ │ │ │ ├── ListSerializer.java
│ │ │ │ ├── ParagraphSerializer.java
│ │ │ │ ├── PictureSerializer.java
│ │ │ │ ├── SemanticTextNodeSerializer.java
│ │ │ │ ├── SerializerUtil.java
│ │ │ │ ├── TableCellSerializer.java
│ │ │ │ ├── TableRowSerializer.java
│ │ │ │ ├── TableSerializer.java
│ │ │ │ ├── TextChunkSerializer.java
│ │ │ │ └── TextLineSerializer.java
│ │ │ ├── markdown/
│ │ │ │ ├── MarkdownGenerator.java
│ │ │ │ ├── MarkdownGeneratorFactory.java
│ │ │ │ ├── MarkdownHTMLGenerator.java
│ │ │ │ └── MarkdownSyntax.java
│ │ │ ├── pdf/
│ │ │ │ ├── PDFLayer.java
│ │ │ │ └── PDFWriter.java
│ │ │ ├── processors/
│ │ │ │ ├── AbstractTableProcessor.java
│ │ │ │ ├── CaptionProcessor.java
│ │ │ │ ├── ClusterTableProcessor.java
│ │ │ │ ├── ContentFilterProcessor.java
│ │ │ │ ├── DocumentProcessor.java
│ │ │ │ ├── HeaderFooterProcessor.java
│ │ │ │ ├── HeadingProcessor.java
│ │ │ │ ├── HiddenTextProcessor.java
│ │ │ │ ├── HybridDocumentProcessor.java
│ │ │ │ ├── LevelProcessor.java
│ │ │ │ ├── ListProcessor.java
│ │ │ │ ├── ParagraphProcessor.java
│ │ │ │ ├── SpecialTableProcessor.java
│ │ │ │ ├── StrikethroughProcessor.java
│ │ │ │ ├── TableBorderProcessor.java
│ │ │ │ ├── TableStructureNormalizer.java
│ │ │ │ ├── TaggedDocumentProcessor.java
│ │ │ │ ├── TextLineProcessor.java
│ │ │ │ ├── TextProcessor.java
│ │ │ │ └── readingorder/
│ │ │ │ └── XYCutPlusPlusSorter.java
│ │ │ ├── text/
│ │ │ │ └── TextGenerator.java
│ │ │ └── utils/
│ │ │ ├── Base64ImageUtils.java
│ │ │ ├── BulletedParagraphUtils.java
│ │ │ ├── ContentSanitizer.java
│ │ │ ├── ImagesUtils.java
│ │ │ ├── ModeWeightStatistics.java
│ │ │ ├── SanitizationRule.java
│ │ │ ├── TextNodeStatistics.java
│ │ │ ├── TextNodeStatisticsConfig.java
│ │ │ ├── TextNodeUtils.java
│ │ │ └── levels/
│ │ │ ├── LevelInfo.java
│ │ │ ├── LineArtBulletParagraphLevelInfo.java
│ │ │ ├── ListLevelInfo.java
│ │ │ ├── TableLevelInfo.java
│ │ │ └── TextBulletParagraphLevelInfo.java
│ │ └── test/
│ │ ├── java/
│ │ │ └── org/
│ │ │ └── opendataloader/
│ │ │ └── pdf/
│ │ │ ├── EmbedImagesIntegrationTest.java
│ │ │ ├── ImageDirIntegrationTest.java
│ │ │ ├── IntegrationTest.java
│ │ │ ├── Issue336IntegrationTest.java
│ │ │ ├── PageSeparatorIntegrationTest.java
│ │ │ ├── PagesOptionIntegrationTest.java
│ │ │ ├── api/
│ │ │ │ ├── ConfigTest.java
│ │ │ │ └── FilterConfigTest.java
│ │ │ ├── containers/
│ │ │ │ └── StaticLayoutContainersTest.java
│ │ │ ├── hybrid/
│ │ │ │ ├── DoclingFastServerClientTest.java
│ │ │ │ ├── DoclingSchemaTransformerTest.java
│ │ │ │ ├── HancomClientTest.java
│ │ │ │ ├── HancomSchemaTransformerTest.java
│ │ │ │ ├── HealthCheckTest.java
│ │ │ │ ├── HybridClientFactoryTest.java
│ │ │ │ ├── TriageLoggerTest.java
│ │ │ │ ├── TriageProcessorIntegrationTest.java
│ │ │ │ └── TriageProcessorTest.java
│ │ │ ├── json/
│ │ │ │ └── serializers/
│ │ │ │ ├── ImageSerializerTest.java
│ │ │ │ └── LineArtSerializerTest.java
│ │ │ ├── markdown/
│ │ │ │ ├── MarkdownGeneratorTest.java
│ │ │ │ └── MarkdownTableTest.java
│ │ │ ├── processors/
│ │ │ │ ├── CaptionProcessorTest.java
│ │ │ │ ├── CidFontDetectionTest.java
│ │ │ │ ├── ContentFilterProcessorTest.java
│ │ │ │ ├── HeaderFooterProcessorTest.java
│ │ │ │ ├── HeadingProcessorTest.java
│ │ │ │ ├── HybridDocumentProcessorTest.java
│ │ │ │ ├── LevelProcessorTest.java
│ │ │ │ ├── ListProcessorTest.java
│ │ │ │ ├── ParagraphProcessorTest.java
│ │ │ │ ├── SpecialTableProcessorTest.java
│ │ │ │ ├── StrikethroughProcessorTest.java
│ │ │ │ ├── TableBorderProcessorTest.java
│ │ │ │ ├── TextLineProcessorTest.java
│ │ │ │ ├── TextProcessorTest.java
│ │ │ │ └── readingorder/
│ │ │ │ └── XYCutPlusPlusSorterTest.java
│ │ │ ├── regression/
│ │ │ │ └── ToUnicodeRegressionTest.java
│ │ │ └── utils/
│ │ │ ├── Base64ImageUtilsTest.java
│ │ │ ├── ContentSanitizerTest.java
│ │ │ ├── ImageFormatSupportTest.java
│ │ │ ├── ImagesUtilsTest.java
│ │ │ ├── ModeWeightStatisticsTest.java
│ │ │ └── TextNodeStatisticsTest.java
│ │ └── resources/
│ │ └── generate-cid-test-pdf.py
│ └── pom.xml
├── node/
│ └── opendataloader-pdf/
│ ├── .gitignore
│ ├── .npmrc
│ ├── .prettierrc.json
│ ├── eslint.config.js
│ ├── package.json
│ ├── scripts/
│ │ └── setup.cjs
│ ├── src/
│ │ ├── cli-options.generated.ts
│ │ ├── cli.ts
│ │ ├── convert-options.generated.ts
│ │ └── index.ts
│ ├── test/
│ │ ├── convert-options.test.ts
│ │ ├── convert.integration.test.ts
│ │ └── run.integration.test.ts
│ ├── tsconfig.json
│ ├── tsup.config.ts
│ └── vitest.config.ts
├── options.json
├── package.json
├── python/
│ └── opendataloader-pdf/
│ ├── .gitignore
│ ├── hatch_build.py
│ ├── pyproject.toml
│ ├── src/
│ │ └── opendataloader_pdf/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── cli_options_generated.py
│ │ ├── convert_generated.py
│ │ ├── hybrid_server.py
│ │ ├── runner.py
│ │ └── wrapper.py
│ └── tests/
│ ├── conftest.py
│ ├── test_cli_options.py
│ ├── test_convert_integration.py
│ ├── test_hybrid_server.py
│ ├── test_hybrid_server_nonblocking.py
│ ├── test_hybrid_server_partial_success.py
│ └── test_hybrid_server_unicode.py
├── samples/
│ └── json/
│ └── lorem.json
├── schema.json
└── scripts/
├── bench.sh
├── build-all.sh
├── build-java.sh
├── build-node.sh
├── build-python.sh
├── experiments/
│ ├── docling_baseline_bench.py
│ ├── docling_fastapi_bench.py
│ ├── docling_speed_report.py
│ └── docling_subprocess_bench.py
├── generate-options.mjs
├── generate-schema.mjs
├── run-cli.sh
├── test-java.sh
├── test-node.sh
├── test-python.sh
└── utils.mjs
================================================
FILE CONTENTS
================================================
================================================
FILE: .editorconfig
================================================
root = true
[*]
charset = utf-8
indent_style = space
indent_size = 4
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
[*.md]
trim_trailing_whitespace = false
================================================
FILE: .gitattributes
================================================
# Unify all text files to LF line endings
* text eol=lf
# Binary files should not have line ending conversions
*.exe binary
*.dll binary
*.so binary
*.dylib binary
*.class binary
*.jar binary
*.zip binary
*.png binary
*.jpg binary
*.jpeg binary
*.gif binary
*.pdf binary
================================================
FILE: .github/CODEOWNERS
================================================
# Owners for all documents
*.md @bdoubrov @hnc-sujicho
# Default owners for everything else in the repository
* @MaximPlusov @LonelyMidoriya @hyunhee-jo @bundolee
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Report an issue
title: ""
labels: bug
assignees: ""
---
### Bug
<!-- Describe the buggy behavior you have observed. -->
...
### Steps to reproduce
<!-- Describe the sequence of steps for reproducing the bug. -->
...
### Version
<!-- Copy version. -->
...
### Java version
<!-- Copy the output of `java --version`. -->
...
<!-- ⚠️ ATTENTION: When sharing screenshots, attachments, or other data make sure not to include any sensitive information. -->
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea
title: ""
labels: enhancement
assignees: ""
---
### Requested feature
<!-- Describe the feature you have in mind and the user need it addresses. -->
...
### Alternatives
<!-- Describe any alternatives you have considered. -->
...
<!-- ⚠️ ATTENTION: When sharing screenshots, attachments, or other data make sure not to include any sensitive information. -->
================================================
FILE: .github/ISSUE_TEMPLATE/question.md
================================================
---
name: Question
about: Ask a question
title: ""
labels: question
assignees: ""
---
### Question
<!-- Describe what you would like to achieve and which part you need help with. -->
...
<!-- ⚠️ ATTENTION: When sharing screenshots, attachments, or other data make sure not to include any sensitive information. -->
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
<!-- Thank you for your contribution! -->
<!-- STEPS TO FOLLOW:
1. Add a description of the changes (frequently the same as the commit description)
2. Enter the issue number next to "Resolves #" below (if there is no tracking issue resolved, **remove that section**)
3. Make sure the PR title follows the **Commit Message Formatting**: https://www.conventionalcommits.org/en/v1.0.0/#summary.
4. Follow the steps in the checklist below, starting with the **Commit Message Formatting**.
-->
<!-- Uncomment this section with the issue number if an issue is being resolved
**Issue resolved by this Pull Request:**
Resolves #
--->
**Checklist:**
- [ ] Documentation has been updated, if necessary.
- [ ] Examples have been added, if necessary.
- [ ] Tests have been added, if necessary.
================================================
FILE: .github/SECURITY.md
================================================
# Security Policy
## Reporting a Vulnerability
If you think you've identified a security issue in the project repository, please DO NOT report the issue publicly via
the GitHub issue tracker, etc.
Instead, send an email with as many details as possible to the private mailing list for the maintainer team.
Please do not create a public issue.
### Security Vulnerability Response
Each report is acknowledged and analyzed by the core maintainers within 3 working days.
Any vulnerability information shared with core maintainers stays within the project and will not be disseminated to
other projects unless it is necessary to get the issue fixed.
After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full
announcement, and may ask for additional information or guidance.
## Security Alerts
We will publish announcements of security vulnerabilities, together with the steps to remediate them, through the project announcements.
================================================
FILE: .github/workflows/release.yml
================================================
name: Release
on:
push:
tags:
- 'v*'
workflow_dispatch:
jobs:
release:
runs-on: ubuntu-latest
env:
VERSION: '0.0.0'
permissions:
contents: write
id-token: write
steps:
# =================================================================
# 1. SETUP
# =================================================================
- name: Checkout code
uses: actions/checkout@v6
- name: Initialize VERSION
run: |
if [[ "${GITHUB_REF}" == refs/tags/v* ]]; then
echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
else
echo "VERSION=0.0.0" >> $GITHUB_ENV
fi
- name: Set up Java
uses: actions/setup-java@v5
with:
java-version: '21'
distribution: 'temurin'
cache: 'maven'
server-id: central
server-username: MAVEN_CENTRAL_USERNAME
server-password: MAVEN_CENTRAL_PASSWORD
gpg-private-key: ${{ secrets.MAVEN_GPG_KEY }}
gpg-passphrase: ${{ secrets.MAVEN_GPG_PASSPHRASE }}
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.12'
- name: Install uv
uses: astral-sh/setup-uv@v7
- name: Set up Node.js and pnpm
uses: actions/setup-node@v6
with:
node-version: '20'
registry-url: 'https://registry.npmjs.org'
- name: Install pnpm
uses: pnpm/action-setup@v5
with:
version: 9
# =================================================================
# 2. BUILD & TEST
# =================================================================
- name: Build and test all packages
run: ./scripts/build-all.sh ${{ env.VERSION }}
# =================================================================
# 3. DEPLOY (only on tag push)
# =================================================================
- name: '[Java] Deploy to Maven Central'
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
run: mvn -B -pl opendataloader-pdf-core deploy -P release
working-directory: ./java
env:
MAVEN_CENTRAL_USERNAME: ${{ secrets.MAVEN_CENTRAL_USERNAME }}
MAVEN_CENTRAL_PASSWORD: ${{ secrets.MAVEN_CENTRAL_PASSWORD }}
MAVEN_GPG_KEY: ${{ secrets.MAVEN_GPG_KEY }}
MAVEN_GPG_PASSPHRASE: ${{ secrets.MAVEN_GPG_PASSPHRASE }}
- name: '[Python] Publish to PyPI'
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
uses: pypa/gh-action-pypi-publish@release/v1
with:
packages-dir: ./python/opendataloader-pdf/dist
- name: '[Node.js] Publish to npm'
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
run: pnpm publish --no-git-checks
working-directory: ./node/opendataloader-pdf
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
# =================================================================
# 4. GITHUB RELEASE (only on tag push)
# =================================================================
- name: Package CLI as ZIP
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
run: |
cd java/opendataloader-pdf-cli/target
mkdir -p release
cp "opendataloader-pdf-cli-${{ env.VERSION }}.jar" release/
cp ../../../README.md release/
cp ../../../LICENSE release/
cp ../../../NOTICE release/
cp -r ../../../THIRD_PARTY release/
cd release
zip -r "../opendataloader-pdf-cli-${{ env.VERSION }}.zip" .
cd ../..
- name: Create GitHub Release
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
uses: softprops/action-gh-release@v2
with:
tag_name: ${{ github.ref_name }}
name: Release ${{ github.ref_name }}
generate_release_notes: true
files: |
java/opendataloader-pdf-cli/target/opendataloader-pdf-cli-${{ env.VERSION }}.zip
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# =================================================================
# 5. SYNC DOCS TO HOMEPAGE (only on tag push)
# =================================================================
- name: Sync docs to homepage repo
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
# Pinned to v1.7.3 for security - verify before updating
uses: cpina/github-action-push-to-another-repository@55306faa4ed53b815ae49e564af8cfb359d32ae2 # v1.7.3
with:
source-directory: 'content/docs'
destination-github-username: 'opendataloader-project'
destination-repository-name: 'opendataloader.org'
target-directory: 'apps/v1/content/docs'
target-branch: main
env:
API_TOKEN_GITHUB: ${{ secrets.HOMEPAGE_SYNC_TOKEN }}
================================================
FILE: .github/workflows/sync-docs.yml
================================================
# Sync documentation to homepage repository on release
#
# Required Setup:
# 1. Create a GitHub Personal Access Token (PAT) with minimal scope:
# - 'contents: write' permission ONLY for opendataloader.org repository
# - Use fine-grained PAT if possible for better security
# 2. Add the token as a repository secret named 'HOMEPAGE_SYNC_TOKEN'
# Settings > Secrets and variables > Actions > New repository secret
# 3. The token owner must have write access to opendataloader.org repository
#
# Testing: This workflow runs automatically when a release is published.
# To test manually, create a pre-release or trigger it via workflow_dispatch.
#
# Security: Third-party action is pinned to a specific SHA for integrity.
name: Sync docs to homepage
on:
release:
types: [published]
workflow_dispatch:
permissions:
contents: read
jobs:
sync:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- name: Push to homepage repo
# Pinned to v1.7.3 for security - verify before updating
uses: cpina/github-action-push-to-another-repository@55306faa4ed53b815ae49e564af8cfb359d32ae2 # v1.7.3
with:
source-directory: 'content/docs'
destination-github-username: 'opendataloader-project'
destination-repository-name: 'opendataloader.org'
target-directory: 'apps/v1/content/docs'
target-branch: main
env:
API_TOKEN_GITHUB: ${{ secrets.HOMEPAGE_SYNC_TOKEN }}
================================================
FILE: .github/workflows/test-benchmark.yml
================================================
name: Test & Benchmark
on:
pull_request:
branches: [main]
paths:
- 'java/**'
- 'python/**'
- 'node/**'
- 'scripts/**'
workflow_dispatch:
concurrency:
group: ci-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
permissions:
contents: read
jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Java
uses: actions/setup-java@v5
with:
distribution: 'temurin'
java-version: '21'
- name: Setup uv
uses: astral-sh/setup-uv@v7
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: '20'
- name: Setup pnpm
run: npm install -g pnpm
- name: Build & Test All
run: ./scripts/build-all.sh
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
with:
files: java/opendataloader-pdf-core/target/site/jacoco/jacoco.xml
fail_ci_if_error: false
token: ${{ secrets.CODECOV_TOKEN }}
- name: Upload build artifacts
uses: actions/upload-artifact@v7
with:
name: java-build
path: java/opendataloader-pdf-cli/target/*.jar
retention-days: 1
benchmark:
needs: test
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Java
uses: actions/setup-java@v5
with:
distribution: 'temurin'
java-version: '21'
- name: Download build artifacts
uses: actions/download-artifact@v8
with:
name: java-build
path: java/opendataloader-pdf-cli/target/
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.13'
- name: Setup uv
uses: astral-sh/setup-uv@v7
- name: Run benchmark
run: ./scripts/bench.sh --skip-build --check-regression
- name: Upload evaluation results
uses: actions/upload-artifact@v7
if: always()
with:
name: benchmark-results
path: /tmp/opendataloader-bench/prediction/opendataloader/evaluation.json
================================================
FILE: .gitignore
================================================
*.class
# OS specific files
.DS_Store
Thumbs.db
# IDE / Editor files
**/.idea/
**/.vscode/
*.iml
*.ipr
*.iws
# Emacs
\#*\#
*~
.#*
# Vim
*.swp
*.swo
*.swn
# Java (Maven & Gradle)
**/target/
**/build/
**/bin/
**/.gradle/
**/*.jar
**/*.war
**/*.ear
**/dependency-reduced-pom.xml
**/.flattened-pom.xml
# Logs
*.log
logs/
**/logs/
**/npm-debug.log*
**/yarn-debug.log*
**/yarn-error.log*
**/pnpm-debug.log*
# Node.js
**/node_modules/
**/.next/
**/.turbo/
**/dist/
**/.cache/
*.tsbuildinfo
# Python
**/__pycache__/
**/*.py[cod]
*.pyo
*.pyd
*.so
**/.venv/
**/.env
**/.env.*
*.egg-info/
**/.eggs/
**/.mypy_cache/
**/.pytest_cache/
**/.coverage
**/htmlcov/
# Temporary / Generated
**/tmp/
**/temp/
# Git worktrees
.worktrees/
# Configuration files
.claude/settings.local.json
.claude/plans/
================================================
FILE: CHANGELOG.md
================================================
# Changelog
## 0.1.0
- Initial release.
================================================
FILE: CLAUDE.md
================================================
# CLAUDE.md
## Gotchas
After changing CLI options in Java, you **must** run `npm run sync` — this regenerates `options.json` and all Python/Node.js bindings. Forgetting this silently breaks the wrappers.
When using `--enrich-formula` or `--enrich-picture-description` on the hybrid server, the client **must** use `--hybrid-mode full`. Otherwise enrichments are silently skipped (they only run on the backend, not in Java).
## Conventions
`content/docs/` auto-syncs to opendataloader.org on release. Edits here go live.
## Benchmark
- `./scripts/bench.sh` — Run benchmark (auto-clones opendataloader-bench for PDFs and evaluation logic)
- `./scripts/bench.sh --doc-id <id>` — Debug specific document
- `./scripts/bench.sh --check-regression` — CI mode with threshold check
- Benchmark code lives in [opendataloader-bench](https://github.com/opendataloader-project/opendataloader-bench)
- Metrics: **NID** (reading order), **TEDS** (table structure), **MHS** (heading structure), **Table Detection F1**, **Speed**
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
- Demonstrating empathy and kindness toward other people
- Being respectful of differing opinions, viewpoints, and experiences
- Giving and gracefully accepting constructive feedback
- Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
- Focusing on what is best not just for us as individuals, but for the
overall community
Examples of unacceptable behavior include:
- The use of sexualized language or imagery, and sexual attention or
advances of any kind
- Trolling, insulting or derogatory comments, and personal or political attacks
- Public or private harassment
- Publishing others' private information, such as a physical or email
address, without their explicit permission
- Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement using
[open.dataloader@hancom.com](mailto:open.dataloader@hancom.com).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series
of actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within
the community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org),
version 2.0, available at
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html).
Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).
Homepage: [https://www.contributor-covenant.org](https://www.contributor-covenant.org)
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at
[https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations).
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to This Project
Thank you for your interest in contributing!
We welcome contributions from everyone. This document outlines the guidelines for how to contribute effectively and
respectfully.
---
## 📌 Types of Contributions We Welcome
We appreciate various kinds of contributions, including but not limited to:
- 🛠️ **Code contributions** (bug fixes, performance improvements, new features)
- 🐞 **Bug reports**
- 💡 **Feature suggestions**
- ❓ **Questions and discussions**
- 📚 **Improving documentation**
---
## ❓ How to Ask Questions
If you have questions:
1. Check the [README](./README.md) and
existing [issues](https://github.com/opendataloader-project/opendataloader-pdf/issues) first.
2. If your question hasn't been addressed, open a new issue using the `Question` label.
---
## 🐛 How to Report Bugs
When reporting a bug, please include the following:
- A clear and descriptive title
- Steps to reproduce the issue
- Expected vs actual behavior
- Environment info (OS, version, etc.)
- Logs or screenshots if available
Use the **Bug Report** issue template when creating the issue.
---
## 💡 How to Suggest a Feature
To suggest a new feature:
1. Search existing issues to avoid duplicates.
2. If it's new, open a new issue using the **Feature Request** template.
3. Describe your idea, use cases, and possible alternatives.
---
## 🔧 How to Contribute Code
### Step-by-Step Process
1. **Fork** the repository.
2. **Clone** your fork:
```bash
git clone https://github.com/your-username/opendataloader-pdf.git
cd opendataloader-pdf
```
3. **Create a feature branch:**
```bash
git checkout -b my-feature
```
4. **Build** the project:
**Prerequisites:** Java 11+, Maven, Python 3.10+, uv, Node.js 20+, pnpm
See the [Development Workflow guide](https://opendataloader.org/docs/development-workflow) for OS-specific install instructions.
```bash
# Build Java packages
npm run build-java
# If you changed CLI options in Java, sync bindings (regenerates options.json, Python/Node.js wrappers)
npm run sync
```
> **Important**: If you modified any CLI options in Java, you **must** run `npm run sync` before committing. This regenerates `options.json` and all Python/Node.js bindings. Forgetting this silently breaks the wrappers.
5. Make your changes and commit them.
6. **Push** your branch:
```bash
git push origin my-feature
```
7. **Open a Pull Request** (PR) against the `main` branch.
8. Respond to review comments and update your PR as needed.
---
## 🧹 Coding Style & Guidelines
- Follow existing code conventions.
- Run linters/formatters before committing.
- Write unit tests for any new or changed logic.
- Run `./scripts/bench.sh` before submitting a PR — CI will fail if benchmark scores drop below thresholds.
- Keep your changes minimal and focused.
## ✅ Commit Message Guidelines
Use the following format:
```
<type> <short summary>
```
### Common types:
- Add: New feature
- Fix: Bug fix
- Update: Code update
## 📝 CLA / DCO Requirements
Depending on your contribution, we may ask you to sign:
- CLA – Contributor License Agreement
- DCO – Developer Certificate of Origin
To sign the DCO, add `Signed-off-by` to your commit message:
```
git commit -s -m "your message"
```
Make sure your Git config contains your real name and email.
Thank you again for helping us improve this project! 🙌
If you have any questions, open an issue or join the discussion.
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: LICENSE_TEMPLATE/license.txt
================================================
Copyright 2025-2026 Hancom Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: NOTICE
================================================
OpenDataLoader PDF
Copyright 2025-2026 Hancom, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This product includes 'OpenDataLoader PDF' distributed under the Apache
License 2.0, along with various third-party software components. For the
complete source code and detailed copyright notices and license information
for each third-party component, please visit:
https://github.com/opendataloader-project/opendataloader-pdf
THIRD-PARTY LICENSES
This project includes third-party libraries and components, licensed under
their respective open source licenses. For details, see:
THIRD_PARTY/THIRD_PARTY_LICENSES.md
THIRD_PARTY/THIRD_PARTY_NOTICES.md
THIRD_PARTY/licenses/
HISTORICAL NOTE
Versions of OpenDataLoader PDF prior to 2.0 were licensed under the
Mozilla Public License 2.0 (MPL-2.0). From version 2.0 onwards, the
project is licensed under the Apache License 2.0.
================================================
FILE: README.md
================================================
<!-- AI-AGENT-SUMMARY
name: opendataloader-pdf
category: PDF data extraction, PDF accessibility automation
license: Apache-2.0
solves: [PDF to structured data for RAG/LLM pipelines, automate PDF accessibility compliance — layout analysis + auto-tagging to Tagged PDF (first open-source end-to-end)]
input: PDF files (digital, scanned, tagged)
output: Markdown, JSON (with bounding boxes), HTML, Tagged PDF, PDF/UA (enterprise)
sdk: Python, Node.js, Java
requirements: Java 11+
pricing: open-source core (data extraction, layout analysis, auto-tagging to Tagged PDF), enterprise add-on (PDF/UA export, accessibility studio)
extraction-benchmark: #1 overall extraction accuracy (0.90) in hybrid mode, 0.93 table extraction accuracy, 0.05s/page local mode
accessibility-validation: PDF Association collaboration, Well-Tagged PDF specification, veraPDF automated validation
key-differentiators: [benchmark #1 PDF parser, deterministic output, bounding boxes for every element, XY-Cut++ reading order, AI safety filters, hybrid AI mode, first open-source PDF auto-tagging to Tagged PDF, PDF Association + Dual Lab (veraPDF) collaboration, Well-Tagged PDF spec compliance]
-->
# OpenDataLoader PDF
**PDF Parser for AI-ready data. Automate PDF accessibility. Open-source.**
[License](https://github.com/opendataloader-project/opendataloader-pdf/blob/main/LICENSE)
[PyPI](https://pypi.org/project/opendataloader-pdf/)
[npm](https://www.npmjs.com/package/@opendataloader/pdf)
[Maven Central](https://search.maven.org/artifact/org.opendataloader/opendataloader-pdf-core)
[Java](https://github.com/opendataloader-project/opendataloader-pdf#java)
<a href="https://trendshift.io/repositories/21917" target="_blank"><img src="https://trendshift.io/api/badge/repositories/21917" alt="opendataloader-project%2Fopendataloader-pdf | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
🔍 **PDF parser for AI data extraction** — Extract Markdown, JSON (with bounding boxes), and HTML from any PDF. #1 in benchmarks (0.90 overall). Deterministic local mode + AI hybrid mode for complex pages.
- **How accurate is it?** — #1 in benchmarks: 0.90 overall, 0.93 table accuracy across 200 real-world PDFs including multi-column and scientific papers. Deterministic local mode + AI hybrid mode for complex pages ([benchmarks](#extraction-benchmarks))
- **Scanned PDFs and OCR?** — Yes. Built-in OCR (80+ languages) in hybrid mode. Works with poor-quality scans at 300 DPI+ ([hybrid mode](#hybrid-mode-1-accuracy-for-complex-pdfs))
- **Tables, formulas, images, charts?** — Yes. Complex/borderless tables, LaTeX formulas, and AI-generated picture/chart descriptions all via hybrid mode ([hybrid mode](#hybrid-mode-1-accuracy-for-complex-pdfs))
- **How do I use this for RAG?** — `pip install opendataloader-pdf`, convert in 3 lines. Outputs structured Markdown for chunking, JSON with bounding boxes for source citations, and HTML. LangChain integration available. Python, Node.js, Java SDKs ([quick start](#get-started-in-30-seconds) | [LangChain](#langchain-integration))
♿ **PDF accessibility automation** — The same layout analysis engine also powers auto-tagging. First open-source tool to generate Tagged PDFs end-to-end (coming Q2 2026).
- **What's the problem?** — Accessibility regulations are now enforced worldwide. Manual PDF remediation costs $50–200 per document and doesn't scale ([regulations](#pdf-accessibility--pdfua-conversion))
- **What's free?** — Layout analysis + auto-tagging (Q2 2026, Apache 2.0). Untagged PDF in → Tagged PDF out. No proprietary SDK dependency ([auto-tagging preview](#auto-tagging-preview-coming-q2-2026))
- **What about PDF/UA compliance?** — Converting Tagged PDF to PDF/UA-1 or PDF/UA-2 is an enterprise add-on. Auto-tagging generates the Tagged PDF; PDF/UA export is the final step ([pipeline](#accessibility-pipeline))
- **Why trust this?** — Built in collaboration with [PDF Association](https://pdfa.org) and [Dual Lab](https://duallab.com) ([veraPDF](https://verapdf.org) developers). Auto-tagging follows the Well-Tagged PDF specification, validated with veraPDF ([collaboration](https://opendataloader.org/docs/tagged-pdf-collaboration))
## Get Started in 30 Seconds
**Requires**: Java 11+ and Python 3.10+ ([Node.js](https://opendataloader.org/docs/quick-start-nodejs) | [Java](https://opendataloader.org/docs/quick-start-java) also available)
> Before you start: run `java -version`. If not found, install JDK 11+ from [Adoptium](https://adoptium.net/).
```bash
pip install -U opendataloader-pdf
```
```python
import opendataloader_pdf
# Batch all files in one call — each convert() spawns a JVM process, so repeated calls are slow
opendataloader_pdf.convert(
input_path=["file1.pdf", "file2.pdf", "folder/"],
output_dir="output/",
format="markdown,json"
)
```

*Annotated PDF output — each element (heading, paragraph, table, image) detected with bounding boxes and semantic type.*
## What Problems Does This Solve?
| Problem | Solution | Status |
|---------|----------|--------|
| **PDF structure lost during parsing** — wrong reading order, broken tables, no element coordinates | Deterministic local PDF to Markdown/JSON with bounding boxes, XY-Cut++ reading order | Shipped |
| **Complex tables, scanned PDFs, formulas, charts** need AI-level understanding | Hybrid mode routes complex pages to AI backend (#1 in benchmarks) | Shipped |
| **PDF accessibility compliance** — EAA, ADA, Section 508 enforced. Manual remediation $50–200/doc | Auto-tagging: layout analysis → Tagged PDF (free, Q2 2026). Built with PDF Association & veraPDF validation. PDF/UA export (enterprise add-on) | Auto-tag: Q2 2026 |
## Capability Matrix
| Capability | Supported | Tier |
|------------|-----------|------|
| **Data extraction** | | |
| Extract text with correct reading order | Yes | Free |
| Bounding boxes for every element | Yes | Free |
| Table extraction (simple borders) | Yes | Free |
| Table extraction (complex/borderless) | Yes | Free (Hybrid) |
| Heading hierarchy detection | Yes | Free |
| List detection (numbered, bulleted, nested) | Yes | Free |
| Image extraction with coordinates | Yes | Free |
| AI chart/image description | Yes | Free (Hybrid) |
| OCR for scanned PDFs | Yes | Free (Hybrid) |
| Formula extraction (LaTeX) | Yes | Free (Hybrid) |
| Tagged PDF structure extraction | Yes | Free |
| AI safety (prompt injection filtering) | Yes | Free |
| Header/footer/watermark filtering | Yes | Free |
| **Accessibility** | | |
| Auto-tagging → Tagged PDF for untagged PDFs | Coming Q2 2026 | Free (Apache 2.0) |
| PDF/UA-1, PDF/UA-2 export | 💼 Available | Enterprise |
| Accessibility studio (visual editor) | 💼 Available | Enterprise |
| **Limitations** | | |
| Process Word/Excel/PPT | No | — |
| GPU required | No | — |
## Extraction Benchmarks
**opendataloader-pdf [hybrid] ranks #1 overall (0.90)** across reading order, table, and heading extraction accuracy.
| Engine | Overall | Reading Order | Table | Heading | Speed (s/page) |
|--------|---------|---------------|-------|---------|----------------|
| **opendataloader [hybrid]** | **0.90** | **0.94** | **0.93** | **0.83** | 0.43 |
| opendataloader | 0.72 | 0.91 | 0.49 | 0.76 | 0.05 |
| docling | 0.86 | 0.90 | 0.89 | 0.80 | 0.73 |
| marker | 0.83 | 0.89 | 0.81 | 0.80 | 53.93 |
| mineru | 0.82 | 0.86 | 0.87 | 0.74 | 5.96 |
| pymupdf4llm | 0.57 | 0.89 | 0.40 | 0.41 | 0.09 |
| markitdown | 0.29 | 0.88 | 0.00 | 0.00 | **0.04** |
> Scores normalized to [0, 1]. Higher is better for accuracy; lower is better for speed. **Bold** = best. [Full benchmark details](https://github.com/opendataloader-project/opendataloader-bench)
[](https://github.com/opendataloader-project/opendataloader-bench)
## Which Mode Should I Use?
| Your Document | Mode | Install | Server Command | Client Command |
|---------------|------|---------|----------------|----------------|
| Standard digital PDF | Fast (default) | `pip install opendataloader-pdf` | None needed | `opendataloader-pdf file1.pdf file2.pdf folder/` |
| Complex or nested tables | **Hybrid** | `pip install "opendataloader-pdf[hybrid]"` | `opendataloader-pdf-hybrid --port 5002` | `opendataloader-pdf --hybrid docling-fast file1.pdf file2.pdf folder/` |
| Scanned / image-based PDF | Hybrid + OCR | `pip install "opendataloader-pdf[hybrid]"` | `opendataloader-pdf-hybrid --port 5002 --force-ocr` | `opendataloader-pdf --hybrid docling-fast file1.pdf file2.pdf folder/` |
| Non-English scanned PDF | Hybrid + OCR | `pip install "opendataloader-pdf[hybrid]"` | `opendataloader-pdf-hybrid --port 5002 --force-ocr --ocr-lang "ko,en"` | `opendataloader-pdf --hybrid docling-fast file1.pdf file2.pdf folder/` |
| Mathematical formulas | Hybrid + formula | `pip install "opendataloader-pdf[hybrid]"` | `opendataloader-pdf-hybrid --enrich-formula` | `opendataloader-pdf --hybrid docling-fast --hybrid-mode full file1.pdf file2.pdf folder/` |
| Charts needing description | Hybrid + picture | `pip install "opendataloader-pdf[hybrid]"` | `opendataloader-pdf-hybrid --enrich-picture-description` | `opendataloader-pdf --hybrid docling-fast --hybrid-mode full file1.pdf file2.pdf folder/` |
| Untagged PDFs needing accessibility | Auto-tagging → Tagged PDF | Coming Q2 2026 | — | — |
## Quick Start
### Python
```bash
pip install -U opendataloader-pdf
```
```python
import opendataloader_pdf
# Batch all files in one call — each convert() spawns a JVM process, so repeated calls are slow
opendataloader_pdf.convert(
input_path=["file1.pdf", "file2.pdf", "folder/"],
output_dir="output/",
format="markdown,json"
)
```
### Node.js
```bash
npm install @opendataloader/pdf
```
```typescript
import { convert } from '@opendataloader/pdf';
await convert(['file1.pdf', 'file2.pdf', 'folder/'], {
outputDir: 'output/',
format: 'markdown,json'
});
```
### Java
```xml
<dependency>
<groupId>org.opendataloader</groupId>
<artifactId>opendataloader-pdf-core</artifactId>
</dependency>
```
[Python Quick Start](https://opendataloader.org/docs/quick-start-python) | [Node.js Quick Start](https://opendataloader.org/docs/quick-start-nodejs) | [Java Quick Start](https://opendataloader.org/docs/quick-start-java)
## Hybrid Mode: #1 Accuracy for Complex PDFs
Hybrid mode combines fast local Java processing with AI backends. Simple pages stay local (0.05s/page); complex pages are routed to the AI backend, which raises table accuracy by about 90% (0.49 → 0.93 in the benchmark).
```bash
pip install -U "opendataloader-pdf[hybrid]"
```
**Terminal 1** — Start the backend server:
```bash
opendataloader-pdf-hybrid --port 5002
```
**Terminal 2** — Process PDFs:
```bash
# Batch all files in one call — each invocation spawns a JVM process, so repeated calls are slow
opendataloader-pdf --hybrid docling-fast file1.pdf file2.pdf folder/
```
**Python:**
```python
# Batch all files in one call — each convert() spawns a JVM process, so repeated calls are slow
opendataloader_pdf.convert(
input_path=["file1.pdf", "file2.pdf", "folder/"],
output_dir="output/",
hybrid="docling-fast"
)
```
### OCR for Scanned PDFs
Start the backend with `--force-ocr` for image-based PDFs with no selectable text:
```bash
opendataloader-pdf-hybrid --port 5002 --force-ocr
```
For non-English documents, specify the language:
```bash
opendataloader-pdf-hybrid --port 5002 --force-ocr --ocr-lang "ko,en"
```
Supported languages: `en`, `ko`, `ja`, `ch_sim`, `ch_tra`, `de`, `fr`, `ar`, and more.
### Formula Extraction (LaTeX)
Extract mathematical formulas as LaTeX from scientific PDFs:
```bash
# Server: enable formula enrichment
opendataloader-pdf-hybrid --enrich-formula
# Batch all files in one call — each invocation spawns a JVM process, so repeated calls are slow
opendataloader-pdf --hybrid docling-fast --hybrid-mode full file1.pdf file2.pdf folder/
```
Output in JSON:
```json
{
"type": "formula",
"page number": 1,
"bounding box": [226.2, 144.7, 377.1, 168.7],
"content": "\\frac{f(x+h) - f(x)}{h}"
}
```
> **Note**: Formula and picture description enrichments require `--hybrid-mode full` on the client side.
### Chart & Image Description
Generate AI descriptions for charts and images — useful for RAG search and accessibility alt text:
```bash
# Server
opendataloader-pdf-hybrid --enrich-picture-description
# Batch all files in one call — each invocation spawns a JVM process, so repeated calls are slow
opendataloader-pdf --hybrid docling-fast --hybrid-mode full file1.pdf file2.pdf folder/
```
Output in JSON:
```json
{
"type": "picture",
"page number": 1,
"bounding box": [72.0, 400.0, 540.0, 650.0],
"description": "A bar chart showing waste generation by region from 2016 to 2030..."
}
```
> Uses SmolVLM (256M), a lightweight vision model. Custom prompts supported via `--picture-description-prompt`.
### Hancom Data Loader Integration — Coming Soon
Enterprise-grade AI document analysis via [Hancom Data Loader](https://sdk.hancom.com/en/services/1?utm_source=github&utm_medium=readme&utm_campaign=opendataloader-pdf) — customer-customized models trained on your domain-specific documents. 30+ element types (tables, charts, formulas, captions, footnotes, etc.), VLM-based image/chart understanding, complex table extraction (merged cells, nested tables), SLA-backed OCR for scanned documents, and native HWP/HWPX support. Supports PDF, DOCX, XLSX, PPTX, HWP, PNG, JPG. [Live demo](https://livedemo.sdk.hancom.com/en/dataloader?utm_source=github&utm_medium=readme&utm_campaign=opendataloader-pdf)
[Hybrid Mode Guide](https://opendataloader.org/docs/hybrid-mode)
## Output Formats
| Format | Use Case |
|--------|----------|
| **JSON** | Structured data with bounding boxes, semantic types |
| **Markdown** | Clean text for LLM context, RAG chunks |
| **HTML** | Web display with styling |
| **Annotated PDF** | Visual debugging — see detected structures ([sample](https://opendataloader.org/demo/samples/01030000000000)) |
| **Text** | Plain text extraction |
Combine formats: `format="json,markdown"`
### JSON Output Example
```json
{
"type": "heading",
"id": 42,
"level": "Title",
"page number": 1,
"bounding box": [72.0, 700.0, 540.0, 730.0],
"heading level": 1,
"font": "Helvetica-Bold",
"font size": 24.0,
"text color": "[0.0]",
"content": "Introduction"
}
```
| Field | Description |
|-------|-------------|
| `type` | Element type: heading, paragraph, table, list, image, caption, formula |
| `id` | Unique identifier for cross-referencing |
| `page number` | 1-indexed page reference |
| `bounding box` | `[left, bottom, right, top]` in PDF points (72pt = 1 inch) |
| `heading level` | Heading depth (1+) |
| `content` | Extracted text |
[Full JSON Schema](https://opendataloader.org/docs/json-schema)
## Advanced Features
### Tagged PDF Support
When a PDF has structure tags, OpenDataLoader extracts the **exact layout** the author intended — no guessing, no heuristics. Headings, lists, tables, and reading order are preserved from the source.
```python
# Batch all files in one call — each convert() spawns a JVM process, so repeated calls are slow
opendataloader_pdf.convert(
input_path=["file1.pdf", "file2.pdf", "folder/"],
output_dir="output/",
use_struct_tree=True # Use native PDF structure tags
)
```
Most PDF parsers ignore structure tags entirely. [Learn more](https://opendataloader.org/docs/tagged-pdf)
### AI Safety: Prompt Injection Protection
PDFs can contain hidden prompt injection attacks. OpenDataLoader automatically filters:
- Hidden text (transparent, zero-size fonts)
- Off-page content
- Suspicious invisible layers
To sanitize sensitive data (emails, URLs, phone numbers → placeholders), enable it explicitly:
```bash
# Batch all files in one call — each invocation spawns a JVM process, so repeated calls are slow
opendataloader-pdf file1.pdf file2.pdf folder/ --sanitize
```
[AI Safety Guide](https://opendataloader.org/docs/ai-safety)
### LangChain Integration
```bash
pip install -U langchain-opendataloader-pdf
```
```python
from langchain_opendataloader_pdf import OpenDataLoaderPDFLoader
loader = OpenDataLoaderPDFLoader(
file_path=["file1.pdf", "file2.pdf", "folder/"],
format="text"
)
documents = loader.load()
```
[LangChain Docs](https://docs.langchain.com/oss/python/integrations/document_loaders/opendataloader_pdf) | [GitHub](https://github.com/opendataloader-project/langchain-opendataloader-pdf) | [PyPI](https://pypi.org/project/langchain-opendataloader-pdf/)
### Advanced Options
```python
# Batch all files in one call — each convert() spawns a JVM process, so repeated calls are slow
opendataloader_pdf.convert(
input_path=["file1.pdf", "file2.pdf", "folder/"],
output_dir="output/",
format="json,markdown,pdf",
image_output="embedded", # "off", "embedded" (Base64), or "external" (default)
image_format="jpeg", # "png" or "jpeg"
use_struct_tree=True, # Use native PDF structure
)
```
[Full CLI Options Reference](https://opendataloader.org/docs/cli-options-reference)
## PDF Accessibility & PDF/UA Conversion
**Problem**: Millions of existing PDFs lack structure tags, failing accessibility regulations (EAA, ADA/Section 508, Korea Digital Inclusion Act). Manual remediation costs $50–200 per document and doesn't scale.
**OpenDataLoader's approach**: Built in collaboration with [PDF Association](https://pdfa.org) and [Dual Lab](https://duallab.com) (developers of [veraPDF](https://verapdf.org), the industry-reference open-source PDF/A and PDF/UA validator). Auto-tagging follows the [Well-Tagged PDF specification](https://pdfa.org/resource/well-tagged-pdf/) and is validated programmatically using veraPDF — automated conformance checks against PDF accessibility standards, not manual review. No existing open-source tool generates Tagged PDFs end-to-end — most rely on proprietary SDKs for the tag-writing step. OpenDataLoader does it all under Apache 2.0. ([collaboration details](https://opendataloader.org/docs/tagged-pdf-collaboration))
| Regulation | Deadline | Requirement |
|------------|----------|-------------|
| **European Accessibility Act (EAA)** | June 28, 2025 | Accessible digital products across the EU |
| **ADA & Section 508** | In effect | U.S. federal agencies and public accommodations |
| **Digital Inclusion Act** | In effect | South Korea digital service accessibility |
### Standards & Validation
| Aspect | Detail |
|--------|--------|
| **Specification** | [Well-Tagged PDF](https://pdfa.org/resource/well-tagged-pdf/) by PDF Association |
| **Validation** | [veraPDF](https://verapdf.org) — industry-reference open-source PDF/A & PDF/UA validator |
| **Collaboration** | PDF Association + [Dual Lab](https://duallab.com) (veraPDF developers) co-develop tagging and validation |
| **License** | Auto-tagging → Tagged PDF: Apache 2.0 (free). PDF/UA export: Enterprise |
### Accessibility Pipeline
| Step | Feature | Status | Tier |
|------|---------|--------|------|
| 1. **Audit** | Read existing PDF tags, detect untagged PDFs | Shipped | Free |
| 2. **Auto-tag → Tagged PDF** | Generate structure tags for untagged PDFs | Coming Q2 2026 | Free (Apache 2.0) |
| 3. **Export PDF/UA** | Convert to PDF/UA-1 or PDF/UA-2 compliant files | 💼 Available | Enterprise |
| 4. **Visual editing** | Accessibility studio — review and fix tags | 💼 Available | Enterprise |
> **💼 Enterprise features** are available on request. [Contact us](https://opendataloader.org/contact) to get started.
### Auto-Tagging Preview (Coming Q2 2026)
```python
# API shape preview — available Q2 2026
opendataloader_pdf.convert(
input_path=["file1.pdf", "file2.pdf", "folder/"],
output_dir="output/",
auto_tag=True # Generate structure tags for untagged PDFs
)
```
### End-to-End Compliance Workflow
```
Existing PDFs (untagged)
│
▼
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ 1. Audit │───>│ 2. Auto-Tag │───>│ 3. Export │───>│ 4. Studio │
│ (check tags) │ │ (→ Tagged PDF) │ │ (PDF/UA) │ │ (visual editor) │
└─────────────────┘ └─────────────────┘ └─────────────────┘ └─────────────────┘
│ │ │ │
▼ ▼ ▼ ▼
use_struct_tree auto_tag PDF/UA export Accessibility Studio
(Available now) (Q2 2026, Apache 2.0) (Enterprise) (Enterprise)
```
[PDF Accessibility Guide](https://opendataloader.org/docs/accessibility-compliance)
## Roadmap
| Feature | Timeline | Tier |
|---------|----------|------|
| **Auto-tagging → Tagged PDF** — Generate Tagged PDFs from untagged PDFs | Q2 2026 | Free |
| **[Hancom Data Loader](https://sdk.hancom.com/en/services/1?utm_source=github&utm_medium=readme&utm_campaign=opendataloader-pdf)** — Enterprise AI document analysis, customer-customized models, VLM-based chart/image understanding, production-grade OCR | Q2-Q3 2026 | Free |
| **Structure validation** — Verify PDF tag trees | Q2 2026 | Planned |
[Full Roadmap](https://opendataloader.org/docs/upcoming-roadmap)
## Frequently Asked Questions
### What is the best PDF parser for RAG?
For RAG pipelines, you need a parser that preserves document structure, maintains correct reading order, and provides element coordinates for citations. OpenDataLoader is designed specifically for this — it outputs structured JSON with bounding boxes, handles multi-column layouts with XY-Cut++, and runs locally without GPU. In hybrid mode, it ranks #1 overall (0.90) in benchmarks.
### What is the best open-source PDF parser?
OpenDataLoader PDF is the only open-source parser that combines: rule-based deterministic extraction (no GPU), bounding boxes for every element, XY-Cut++ reading order, built-in AI safety filters, native Tagged PDF support, and hybrid AI mode for complex documents. It ranks #1 in overall accuracy (0.90) while running locally on CPU.
### How do I extract tables from PDF for LLM?
OpenDataLoader detects tables using border analysis and text clustering, preserving row/column structure. For complex tables, enable hybrid mode for a roughly 90% relative improvement in table accuracy (TEDS score from 0.49 to 0.93):
```python
# Batch all files in one call — each convert() spawns a JVM process, so repeated calls are slow
opendataloader_pdf.convert(
input_path=["file1.pdf", "file2.pdf", "folder/"],
output_dir="output/",
format="json",
hybrid="docling-fast" # For complex tables
)
```
### How does it compare to docling, marker, or pymupdf4llm?
OpenDataLoader [hybrid] ranks #1 overall (0.90) across reading order, table, and heading accuracy. Key differences: docling (0.86) is strong but lacks bounding boxes and AI safety filters. marker (0.83) requires GPU and is 100x slower (53.93s/page). pymupdf4llm (0.57) is fast but has poor table (0.40) and heading (0.41) accuracy. OpenDataLoader is the only parser that combines deterministic local extraction, bounding boxes for every element, and built-in prompt injection protection. See [full benchmark](https://github.com/opendataloader-project/opendataloader-bench).
### Can I use this without sending data to the cloud?
Yes. OpenDataLoader runs 100% locally. No API calls, no data transmission — your documents never leave your environment. The hybrid mode backend also runs locally on your machine. Ideal for legal, healthcare, and financial documents.
### Does it support OCR for scanned PDFs?
Yes, via hybrid mode. Install with `pip install "opendataloader-pdf[hybrid]"`, start the backend with `--force-ocr`, then process as usual. Supports multiple languages including Korean, Japanese, Chinese, Arabic, and more via `--ocr-lang`.
### Does it work with Korean, Japanese, or Chinese documents?
Yes. For digital PDFs, text extraction works out of the box. For scanned PDFs, use hybrid mode with `--force-ocr --ocr-lang "ko,en"` (or `ja`, `ch_sim`, `ch_tra`). Coming soon: [Hancom Data Loader](https://sdk.hancom.com/en/services/1?utm_source=github&utm_medium=readme&utm_campaign=opendataloader-pdf) integration — enterprise-grade AI document analysis with built-in production-grade OCR and customer-customized models optimized for your specific document types and workflows.
### How fast is it?
Local mode processes 20+ pages per second on CPU (0.05s/page). Hybrid mode processes 2+ pages per second (0.43s/page) with significantly higher accuracy for complex documents. No GPU required. Benchmarked on Apple M4. [Full benchmark details](https://github.com/opendataloader-project/opendataloader-bench). With multi-process batch processing, throughput exceeds 100 pages per second on 8+ core machines.
### Does it handle multi-column layouts?
Yes. OpenDataLoader uses XY-Cut++ reading order analysis to correctly sequence text across multi-column pages, sidebars, and mixed layouts. This works in both local and hybrid modes without any configuration.
### What is hybrid mode?
Hybrid mode combines fast local Java processing with an AI backend. Simple pages are processed locally (0.05s/page); complex pages (tables, scanned content, formulas, charts) are automatically routed to the AI backend for higher accuracy. The backend runs locally on your machine — no cloud required. See [Which Mode Should I Use?](#which-mode-should-i-use) and [Hybrid Mode Guide](https://opendataloader.org/docs/hybrid-mode).
### Does it work with LangChain?
Yes. Install `langchain-opendataloader-pdf` for an official LangChain document loader integration. See [LangChain docs](https://docs.langchain.com/oss/python/integrations/document_loaders/opendataloader_pdf).
### How do I chunk PDFs for RAG?
OpenDataLoader outputs structured Markdown with headings, tables, and lists preserved — ideal input for semantic chunking. Each element in JSON output includes `type`, `heading level`, and `page number`, so you can split by section or page boundary. For most RAG pipelines: parse with `format="markdown"` for text chunks, or `format="json"` when you need element-level control. Pair with LangChain's `RecursiveCharacterTextSplitter` or your own heading-based splitter for best results.
### How do I cite PDF sources in RAG answers?
Every element in JSON output includes a `bounding box` (`[left, bottom, right, top]` in PDF points) and `page number`. When your RAG pipeline returns an answer, map the source chunk back to its bounding box to highlight the exact location in the original PDF. This enables "click to source" UX — users see which paragraph, table, or figure the answer came from. No other open-source parser provides bounding boxes for every element by default.
### How do I convert PDF to Markdown for LLM?
```python
import opendataloader_pdf
# Batch all files in one call — each convert() spawns a JVM process, so repeated calls are slow
opendataloader_pdf.convert(
input_path=["file1.pdf", "file2.pdf", "folder/"],
output_dir="output/",
format="markdown"
)
```
OpenDataLoader preserves heading hierarchy, table structure, and reading order in the Markdown output. For complex documents with borderless tables or scanned pages, use hybrid mode (`hybrid="docling-fast"`) for higher accuracy. The output is clean enough to feed directly into LLM context windows or RAG chunking pipelines.
### Is there an automated PDF accessibility remediation tool?
Yes. OpenDataLoader is the first open-source tool that automates PDF accessibility end-to-end. Built in collaboration with [PDF Association](https://pdfa.org) and [Dual Lab](https://duallab.com) (veraPDF developers), auto-tagging follows the Well-Tagged PDF specification and is validated programmatically using veraPDF. The layout analysis engine detects document structure (headings, tables, lists, reading order) and generates accessibility tags automatically. Auto-tagging (Q2 2026) converts untagged PDFs into Tagged PDFs under Apache 2.0 — no proprietary SDK dependency. For organizations needing full PDF/UA compliance, enterprise add-ons provide PDF/UA export and a visual tag editor. This replaces manual remediation workflows that typically cost $50–200+ per document.
### Is this really the first open-source PDF auto-tagging tool?
Yes. Existing tools either depend on proprietary SDKs for writing structure tags, only output non-PDF formats (e.g., Docling outputs Markdown/JSON but cannot produce Tagged PDFs), or require manual intervention. OpenDataLoader is the first to do layout analysis → tag generation → Tagged PDF output entirely under an open-source license (Apache 2.0), with no proprietary dependency. Auto-tagging follows the PDF Association's Well-Tagged PDF specification and is validated using veraPDF, the industry-reference open-source PDF/A and PDF/UA validator.
### How do I convert existing PDFs to PDF/UA?
OpenDataLoader provides an end-to-end pipeline: audit existing PDFs for tags (`use_struct_tree=True`), auto-tag untagged PDFs into Tagged PDFs (Q2 2026, free under Apache 2.0), and export as PDF/UA-1 or PDF/UA-2 (enterprise add-on). Auto-tagging follows the PDF Association's Well-Tagged PDF specification and is validated using veraPDF. Auto-tagging generates the Tagged PDF; PDF/UA export is the final step. [Contact us](https://opendataloader.org/contact) for enterprise integration.
### How do I make my PDFs accessible for EAA compliance?
The European Accessibility Act requires accessible digital products by June 28, 2025. OpenDataLoader supports the full remediation workflow: audit → auto-tag → Tagged PDF → PDF/UA export. Auto-tagging follows the PDF Association's Well-Tagged PDF specification and is validated using veraPDF, ensuring standards-compliant output. Auto-tagging to Tagged PDF will be open-sourced under Apache 2.0 (Q2 2026). PDF/UA export and accessibility studio are enterprise add-ons. See our [Accessibility Guide](https://opendataloader.org/docs/accessibility-compliance).
### Is OpenDataLoader PDF free?
The core library is **open-source under Apache 2.0** — free for commercial use. This includes all extraction features (text, tables, images, OCR, formulas, charts via hybrid mode), AI safety filters, Tagged PDF support, and auto-tagging to Tagged PDF (Q2 2026). We are committed to keeping the core accessibility pipeline (layout analysis → auto-tagging → Tagged PDF) free and open-source. Enterprise add-ons (PDF/UA export, accessibility studio) are available for organizations needing end-to-end regulatory compliance.
### Why did the license change from MPL 2.0 to Apache 2.0?
MPL 2.0 requires file-level copyleft, which often triggers legal review before enterprise adoption. Apache 2.0 is fully permissive — no copyleft obligations, easier to integrate into commercial projects. If you are using a pre-2.0 version, it remains under MPL 2.0 and you can continue using it. Upgrading to 2.0+ means your project follows Apache 2.0 terms, which are strictly more permissive — no additional obligations, no action needed on your side.
## Documentation
- [Quick Start (Python)](https://opendataloader.org/docs/quick-start-python)
- [Quick Start (Node.js)](https://opendataloader.org/docs/quick-start-nodejs)
- [Quick Start (Java)](https://opendataloader.org/docs/quick-start-java)
- [JSON Schema Reference](https://opendataloader.org/docs/json-schema)
- [CLI Options](https://opendataloader.org/docs/cli-options-reference)
- [Hybrid Mode Guide](https://opendataloader.org/docs/hybrid-mode)
- [Tagged PDF Support](https://opendataloader.org/docs/tagged-pdf)
- [AI Safety Features](https://opendataloader.org/docs/ai-safety)
- [PDF Accessibility](https://opendataloader.org/docs/accessibility-compliance)
## Contributing
We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
## License
[Apache License 2.0](LICENSE)
> **Note:** Versions prior to 2.0 are licensed under the [Mozilla Public License 2.0](https://www.mozilla.org/MPL/2.0/).
---
**Found this useful?** Give us a star to help others discover OpenDataLoader.
================================================
FILE: SUPPORT.md
================================================
# Support
This project uses GitHub Issues to track bugs and feature requests. Please search the existing
issues before filing new issues to avoid duplicates. For new issues, file your bug or
feature request as a new Issue.
For help and questions about using this project, please contact our team via Teams or tag us in the issues.
## AI-Powered Issue Processing
This project uses AI to automatically process GitHub issues through a three-stage workflow:
### How It Works
1. **Triage**: Validates your issue (checks for duplicates, spam, and project scope)
2. **Analyze**: Analyzes the codebase to understand the issue and determine the best approach
3. **Fix**: Automatically creates a PR for eligible issues
### What to Expect
After submitting an issue, you may see these labels:
| Label | Meaning |
|-------|---------|
| `fix/auto-eligible` | AI can automatically fix this issue |
| `fix/manual-required` | Requires human expert review |
| `fix/comment-only` | No code change needed; resolved via comment |
### Commands (CODEOWNERS only)
- `@ai-issue analyze` - Request re-analysis of an issue
- `@ai-issue fix` - Trigger automatic fix attempt
================================================
FILE: THIRD_PARTY/THIRD_PARTY_LICENSES.md
================================================
# THIRD-PARTY LICENSES
This project includes third-party libraries and components, licensed under their respective open source licenses.
Hancom, Inc. distributes the veraPDF components under the Mozilla Public License 2.0 (MPL-2.0), chosen from dual-licensed options.
| Component | Version | License | Download URL |
| :--- | :--- | :--- | :--- |
| @esbuild/aix-ppc64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/android-arm | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/android-arm64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/android-x64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/darwin-arm64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/darwin-x64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/freebsd-arm64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/freebsd-x64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/linux-arm | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/linux-arm64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/linux-ia32 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/linux-loong64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/linux-mips64el | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/linux-ppc64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/linux-riscv64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/linux-s390x | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/linux-x64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/netbsd-arm64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/netbsd-x64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/openbsd-arm64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/openbsd-x64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/openharmony-arm64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/sunos-x64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/win32-arm64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/win32-ia32 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @esbuild/win32-x64 | 0.25.12, 0.27.0 | MIT | https://github.com/evanw/esbuild |
| @eslint-community/eslint-utils | 4.9.0, 4.9.1 | MIT | https://github.com/eslint-community/eslint-utils |
| @eslint-community/regexpp | 4.12.2 | MIT | https://github.com/eslint-community/regexpp |
| @eslint/config-array | 0.21.1 | Apache-2.0 | https://github.com/eslint/rewrite |
| @eslint/config-helpers | 0.4.2 | Apache-2.0 | https://github.com/eslint/rewrite/tree/main/packages/config-helpers |
| @eslint/core | 0.17.0 | Apache-2.0 | https://github.com/eslint/rewrite |
| @eslint/eslintrc | 3.3.3 | MIT | https://github.com/eslint/eslintrc |
| @eslint/js | 9.39.2 | MIT | https://eslint.org |
| @eslint/object-schema | 2.1.7 | Apache-2.0 | https://github.com/eslint/rewrite |
| @eslint/plugin-kit | 0.4.1 | Apache-2.0 | https://github.com/eslint/rewrite |
| @humanfs/core | 0.19.1 | Apache-2.0 | https://github.com/humanwhocodes/humanfs |
| @humanfs/node | 0.16.7 | Apache-2.0 | https://github.com/humanwhocodes/humanfs |
| @humanwhocodes/module-importer | 1.0.1 | Apache-2.0 | https://github.com/humanwhocodes/module-importer |
| @humanwhocodes/retry | 0.4.3 | Apache-2.0 | https://github.com/humanwhocodes/retrier |
| @isaacs/balanced-match | 4.0.1 | MIT | https://github.com/isaacs/balanced-match |
| @isaacs/brace-expansion | 5.0.1 | MIT | https://github.com/isaacs/brace-expansion |
| @isaacs/cliui | 8.0.2 | ISC | https://github.com/yargs/cliui |
| @jridgewell/gen-mapping | 0.3.13 | MIT | https://github.com/jridgewell/gen-mapping |
| @jridgewell/resolve-uri | 3.1.2 | MIT | https://github.com/jridgewell/resolve-uri |
| @jridgewell/sourcemap-codec | 1.5.5 | MIT | https://github.com/jridgewell/sourcemap-codec |
| @jridgewell/trace-mapping | 0.3.31 | MIT | https://github.com/jridgewell/trace-mapping |
| @pkgjs/parseargs | 0.11.0 | MIT | https://github.com/pkgjs/parseargs |
| @rollup/rollup-android-arm-eabi | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-android-arm64 | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-darwin-arm64 | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-darwin-x64 | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-freebsd-arm64 | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-freebsd-x64 | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-arm-gnueabihf | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-arm-musleabihf | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-arm64-gnu | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-arm64-musl | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-loong64-gnu | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-ppc64-gnu | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-riscv64-gnu | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-riscv64-musl | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-s390x-gnu | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-x64-gnu | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-linux-x64-musl | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-openharmony-arm64 | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-win32-arm64-msvc | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-win32-ia32-msvc | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-win32-x64-gnu | 4.53.2 | MIT | https://rollupjs.org/ |
| @rollup/rollup-win32-x64-msvc | 4.53.2 | MIT | https://rollupjs.org/ |
| @standard-schema/spec | 1.0.0 | MIT | https://github.com/standard-schema/standard-schema |
| @types/chai | 5.2.3 | MIT | https://github.com/DefinitelyTyped/DefinitelyTyped/tree/master/types/chai |
| @types/deep-eql | 4.0.2 | MIT | https://github.com/DefinitelyTyped/DefinitelyTyped/tree/master/types/deep-eql |
| @types/estree | 1.0.8 | MIT | https://github.com/DefinitelyTyped/DefinitelyTyped/tree/master/types/estree |
| @types/json-schema | 7.0.15 | MIT | https://github.com/DefinitelyTyped/DefinitelyTyped/tree/master/types/json-schema |
| @types/node | 25.2.0 | MIT | https://github.com/DefinitelyTyped/DefinitelyTyped/tree/master/types/node |
| @typescript-eslint/eslint-plugin | 8.54.0 | MIT | https://github.com/typescript-eslint/typescript-eslint |
| @typescript-eslint/parser | 8.54.0 | MIT | https://github.com/typescript-eslint/typescript-eslint |
| @typescript-eslint/project-service | 8.54.0 | MIT | https://typescript-eslint.io |
| @typescript-eslint/scope-manager | 8.54.0 | MIT | https://github.com/typescript-eslint/typescript-eslint |
| @typescript-eslint/tsconfig-utils | 8.54.0 | MIT | https://typescript-eslint.io |
| @typescript-eslint/type-utils | 8.54.0 | MIT | https://github.com/typescript-eslint/typescript-eslint |
| @typescript-eslint/types | 8.54.0 | MIT | https://github.com/typescript-eslint/typescript-eslint |
| @typescript-eslint/typescript-estree | 8.54.0 | MIT | https://github.com/typescript-eslint/typescript-eslint |
| @typescript-eslint/utils | 8.54.0 | MIT | https://github.com/typescript-eslint/typescript-eslint |
| @typescript-eslint/visitor-keys | 8.54.0 | MIT | https://github.com/typescript-eslint/typescript-eslint |
| @vitest/expect | 4.0.18 | MIT | https://github.com/vitest-dev/vitest |
| @vitest/mocker | 4.0.18 | MIT | https://github.com/vitest-dev/vitest/tree/main/packages/mocker |
| @vitest/pretty-format | 4.0.18 | MIT | https://github.com/vitest-dev/vitest/tree/main/packages/utils |
| @vitest/runner | 4.0.18 | MIT | https://github.com/vitest-dev/vitest |
| @vitest/snapshot | 4.0.18 | MIT | https://github.com/vitest-dev/vitest |
| @vitest/spy | 4.0.18 | MIT | https://github.com/vitest-dev/vitest |
| @vitest/utils | 4.0.18 | MIT | https://github.com/vitest-dev/vitest |
| Acorn | 8.15.0 | MIT | https://github.com/ternjs/Acorn |
| Acorn-JSX | 5.3.2 | MIT | https://github.com/RReverser/Acorn-JSX |
| ajv | 6.12.6 | MIT | https://github.com/ajv-validator/ajv.git |
| annotated-types | 0.7.0 | MIT | https://github.com/annotated-types/annotated-types |
| ansi-regex | 6.2.2 | MIT | https://github.com/sindresorhus/ansi-regex |
| ansi-styles | v4.3.0, 6.2.3 | MIT | https://github.com/sindresorhus/ansi-styles |
| any-promise | 1.3.0 | MIT | http://github.com/kevinbeaty/any-promise |
| anyio | 4.9.0 | MIT | https://pypi.org/project/anyio/ |
| Apache Commons Logging | 1.3.4 | Apache-2.0 | http://commons.apache.org/proper/commons-logging/ |
| Apache PDFBox | 3.0.4 | Apache-2.0 | https://pdfbox.apache.org/ |
| Apache PDFBox io | 3.0.4 | Apache-2.0 | https://repo1.maven.org/maven2/org/apache/pdfbox/pdfbox-io/ |
| API Guardian | 1.1.2 | Apache-2.0 | https://repo1.maven.org/maven2/org/apiguardian/apiguardian-api/1.1.2/ |
| assertion-error | 2.0.1 | MIT | https://github.com/chaijs/assertion-error |
| AssertJ - Fluent Assertions for Java | 3.27.7 | Apache-2.0 | https://assertj.github.io/doc/ |
| balanced-match | 1.0.2 | MIT | https://github.com/juliangruber/balanced-match |
| brace-expansion | 1.1.12, 2.0.2 | MIT | https://github.com/juliangruber/brace-expansion |
| bundle-require | 5.1.0 | MIT | https://www.npmjs.com/package/bundle-require |
| Byte Buddy | 1.18.3 | Apache-2.0 | http://bytebuddy.net |
| cac | 6.7.14 | MIT | https://github.com/egoist/cac |
| callsites | 3.1.0 | MIT | https://github.com/sindresorhus/callsites |
| Chai | 6.2.1 | MIT | http://chaijs.com/ |
| Chalk | 4.1.2 | MIT | https://github.com/sindresorhus/chalk |
| chokidar | 4.0.3 | MIT | https://github.com/paulmillr/chokidar |
| color-name | 1.1.4 | MIT | https://github.com/colorjs/color-name |
| com.sun.xml.bind:jaxb-impl | 2.3.2 | EDL-1.0 | https://mvnrepository.com/artifact/com.sun.xml.bind/jaxb-impl |
| commander | 14.0.3 | MIT | https://github.com/tj/commander.js |
| Commander.js | 4.1.1 | MIT | https://github.com/tj/commander.js |
| commons-cli | 1.10.0 | Apache-2.0 | http://commons.apache.org/cli/ |
| confbox | 0.1.8 | MIT | https://github.com/unjs/confbox |
| consola | 3.4.2 | MIT | https://github.com/nuxt/consola |
| debug-js/debug | 4.4.3 | MIT | https://github.com/debug-js/debug |
| deep-is | 0.1.4 | MIT | https://github.com/thlorenz/deep-is |
| eastasianwidth | 0.2.0 | MIT | https://github.com/komagata/eastasianwidth |
| emoji-regex | 9.2.2 | MIT | https://github.com/mathiasbynens/emoji-regex |
| es-module-lexer | 1.7.0 | MIT | https://github.com/guybedford/es-module-lexer |
| esbuild | 0.25.12, 0.27.0 | MIT | https://esbuild.github.io/ |
| escape-string-regexp | v4.0.0 | MIT | https://github.com/sindresorhus/escape-string-regexp |
| ESLint | 9.39.2 | MIT | http://eslint.org/ |
| eslint-scope | 8.4.0 | BSD-2-Clause | https://github.com/eslint/eslint-scope |
| eslint-visitor-keys | 3.4.3, 4.2.1 | Apache-2.0 | https://github.com/eslint/eslint-visitor-keys |
| espree | 10.4.0 | BSD-2-Clause | https://github.com/eslint/espree |
| esquery | 1.6.0 | BSD-3-Clause | https://github.com/jrfeenst/esquery |
| esrecurse | v4.3.0 | BSD-2-Clause | https://github.com/estools/esrecurse |
| estraverse | 5.3.0 | BSD-2-Clause | https://github.com/Constellation/estraverse |
| estree-walker | 3.0.3 | MIT | https://github.com/Rich-Harris/estree-walker |
| esutils | 2.0.3 | BSD-2-Clause | https://github.com/Constellation/esutils |
| expect-type | 1.2.2 | Apache-2.0 | https://github.com/mmkal/ts/tree/master/packages/expect-type |
| fast-deep-equal | v3.1.3 | MIT | https://github.com/epoberezkin/fast-deep-equal |
| fast-json-stable-stringify | 2.1.0 | MIT | https://github.com/epoberezkin/fast-json-stable-stringify |
| fast-levenshtein | 2.0.6 | MIT | https://github.com/hiddentao/fast-levenshtein |
| fdir | 6.5.0 | MIT | https://github.com/thecodrr/fdir |
| file-entry-cache | 8.0.0 | MIT | https://github.com/royriojas/file-entry-cache |
| find-up | v5.0.0 | MIT | https://github.com/sindresorhus/find-up |
| fix-dts-default-cjs-exports | 1.0.1 | MIT | https://github.com/userquin/fix-dts-default-cjs-exports |
| flat-cache | 4.0.1 | MIT | https://github.com/royriojas/flat-cache |
| flatted | 3.3.3 | ISC | https://github.com/WebReflection/flatted |
| foreground-child | 3.3.1 | ISC | https://github.com/isaacs/foreground-child |
| fsevents | 2.3.3 | MIT | https://github.com/fsevents/fsevents |
| get-tsconfig | 4.13.0 | MIT | https://github.com/typeslick/get-tsconfig |
| glob | 13.0.1 | Blue Oak 1.0.0 | https://github.com/isaacs/node-glob |
| glob-parent | 6.0.2 | ISC | https://github.com/es128/glob-parent |
| h11 | 0.16.0 | MIT | https://github.com/njsmith/h11 |
| Hamcrest | 1.3 | BSD-3-Clause | http://hamcrest.org/ |
| has-flag | 4.0.0 | MIT | https://github.com/sindresorhus/has-flag |
| httpcore | 1.0.9 | BSD-3-Clause | https://github.com/encode/httpcore |
| httpx | 0.27.2 | BSD-3-Clause | https://www.python-httpx.org/ |
| idna | 3.10 | BSD-3-Clause | https://github.com/kjd/idna |
| import-fresh | 3.3.1 | MIT | https://github.com/sindresorhus/import-fresh |
| imurmurhash | 0.1.4 | MIT | https://github.com/jensyt/imurmurhash-js |
| is-extglob | 2.1.1 | MIT | https://github.com/jonschlinkert/is-extglob |
| is-glob | 4.0.3 | MIT | https://www.npmjs.com/package/is-glob |
| isaacs/jackspeak | 3.4.3 | Blue Oak 1.0.0 | https://github.com/isaacs/jackspeak |
| isexe | 2.0.0 | ISC | https://github.com/isaacs/isexe |
| jackson-annotations | 2.15.0 | Apache-2.0 | https://github.com/FasterXML/jackson-annotations |
| jackson-core | 2.15.0 | Apache-2.0 | https://github.com/FasterXML/jackson-core |
| jackson-databind | 2.15.0 | Apache-2.0 | https://github.com/FasterXML/jackson-databind |
| Jakarta Activation API | 1.2.0 | CDDL-1.1 | https://eclipse-ee4j.github.io/jaf/ |
| Java Advanced Imaging Image I/O Tools API core (standalone) | 1.4.0 | BSD-3-Clause | https://github.com/jai-imageio/jai-imageio-core |
| JAXB CORE | 2.3.0.1 | CDDL-1.1 | http://jaxb.java.net/ |
| jaxb-api | 2.4.0-b180830.0359 | CDDL-1.1 | https://jakarta.ee/specifications/xml-binding |
| Jetbrains annotations | 13.0 | Apache-2.0 | http://www.jetbrains.org |
| joycon | 3.1.1 | MIT | https://github.com/egoist/joycon |
| JPEG2000 support for Java Advanced Imaging Image I/O Tools API | 1.3.0 | Sun BSD | https://repo1.maven.org/maven2/com/github/jai-imageio/jai-imageio-jpeg2000/ |
| js-yaml | 4.1.1 | MIT | https://github.com/nodeca/js-yaml |
| json-buffer | 3.0.1 | MIT | https://github.com/dominictarr/json-buffer |
| json-schema-traverse | 0.4.1 | MIT | https://github.com/epoberezkin/json-schema-traverse |
| JUnit | 4.13.2 | EPL-1.0 | https://junit.org/junit4/ |
| JUnit Jupiter (Aggregator) | 5.14.2 | EPL-2.0 | https://junit.org/ |
| keyv | 4.5.4 | MIT | https://github.com/lukechilds/keyv |
| Kotlin | 1.8.21 | Apache-2.0 | http://kotlin.jetbrains.org |
| kotlin-stdlib-common | 1.9.10 | Apache-2.0 | https://kotlinlang.org/ |
| langchain | 0.3.80 | MIT | https://github.com/langchain-ai/langchain |
| langchain-text-splitters | 0.3.9 | MIT | https://github.com/langchain-ai/langchain |
| langsmith | 0.3.45 | MIT | https://smith.langchain.com/ |
| levn | 0.4.1 | MIT | https://github.com/gkz/levn |
| libcspice-sys | 0.1.1 | MIT | https://crates.io/crates/libcspice-sys |
| lilconfig | 3.1.3 | MIT | https://github.com/antonk52/lilconfig |
| lines-and-columns | 1.2.4 | MIT | https://github.com/eventualbuddha/lines-and-columns |
| load-tsconfig | 0.2.5 | MIT | https://www.npmjs.com/package/load-tsconfig |
| locate-path | v6.0.0 | MIT | https://github.com/sindresorhus/locate-path |
| lodash.merge | 4.6.2 | MIT | https://lodash.com/ |
| mafintosh/why-is-node-running | 2.3.0 | MIT | https://github.com/mafintosh/why-is-node-running |
| magic-string | 0.30.21 | MIT | https://github.com/rich-harris/magic-string |
| mdBook | 0.4.36 | MPL-2.0 | https://github.com/rust-lang/mdBook |
| minimatch | 3.1.2, 9.0.5 | ISC | https://github.com/isaacs/minimatch |
| minimatch | 10.1.2 | Blue Oak 1.0.0 | https://github.com/isaacs/minimatch |
| minipass | 7.1.2 | ISC | https://github.com/isaacs/minipass |
| mlly | 1.8.0 | MIT | https://github.com/unjs/mlly |
| MockWebServer | 4.12.0 | Apache-2.0 | https://github.com/square/okhttp/ |
| moxystudio/node-cross-spawn | 7.0.6 | MIT | https://github.com/moxystudio/node-cross-spawn |
| Mozilla Rhino | 1.7.14.1 | MPL-2.0 | http://www.mozilla.org/rhino/ |
| ms.js | 2.1.3 | MIT | https://github.com/guille/ms.js |
| mz | 2.7.0 | MIT | https://github.com/normalize/mz |
| nanoid | 3.3.11 | MIT | https://github.com/ai/nanoid |
| natural-compare | 1.4.0 | MIT | https://github.com/litejs/natural-compare |
| nobody | 2.1.1 | MIT | https://github.com/debug-js/debug |
| node-concat-map | 0.0.1 | MIT | https://github.com/substack/node-concat-map |
| node-glob | 10.5.0 | ISC | http://github.com/isaacs/node-glob |
| node-ignore | 5.3.2, 7.0.5 | MIT | https://github.com/kaelzhang/node-ignore |
| node-lru-cache | 10.4.3, 11.2.2 | ISC | http://github.com/isaacs/node-lru-cache |
| node-semver | 7.7.3 | ISC | https://github.com/npm/node-semver |
| nodeca-argparse | 2.0.1 | PSF-2.0 | https://github.com/nodeca/argparse |
| object-assign | 4.1.1 | MIT | https://github.com/sindresorhus/object-assign |
| OkHttp | 4.12.0 | Apache-2.0 | https://github.com/square/okhttp |
| OkIO | 3.6.0 | Apache-2.0 | https://square.github.io/okio/ |
| optionator | 0.9.4 | MIT | https://github.com/gkz/optionator |
| org.apiguardian:apiguardian-api | 1.1.2 | Apache-2.0 | https://github.com/apiguardian-team/apiguardian |
| org.jetbrains.kotlin:kotlin-stdlib-jdk7 | 1.8.21 | Apache-2.0 | https://kotlinlang.org/ |
| org.jetbrains.kotlin:kotlin-stdlib-jdk8 | 1.8.21 | Apache-2.0 | https://kotlinlang.org/ |
| org.junit.jupiter:junit-jupiter-api | 5.14.2 | EPL-2.0 | https://junit.org/ |
| org.junit.jupiter:junit-jupiter-engine | 5.14.2 | EPL-2.0 | https://junit.org/ |
| org.junit.jupiter:junit-jupiter-params | 5.14.2 | EPL-2.0 | https://junit.org/ |
| org.junit.platform:junit-platform-commons | 1.14.2 | EPL-2.0 | https://junit.org/ |
| org.junit.platform:junit-platform-engine | 1.14.2 | EPL-2.0 | https://junit.org/ |
| org.opentest4j:opentest4j | 1.3.0 | Apache-2.0 | https://github.com/ota4j-team/opentest4j |
| org.verapdf:core | 1.29.56 | MPL-2.0 | https://repo1.maven.org/maven2/org/verapdf/ |
| org.verapdf:metadata-fixer | 1.29.194 | MPL-2.0 | https://repo1.maven.org/maven2/org/verapdf/ |
| org.verapdf:parser | 1.29.64 | MPL-2.0 | https://repo1.maven.org/maven2/org/verapdf/ |
| org.verapdf:pdf-model | 1.29.12 | MPL-2.0 | https://repo1.maven.org/maven2/org/verapdf/ |
| org.verapdf:validation-model | 1.29.194 | MPL-2.0 | https://repo1.maven.org/maven2/org/verapdf/ |
| org.verapdf:verapdf-xmp-core | 1.29.56 | MPL-2.0 | https://repo1.maven.org/maven2/org/verapdf/ |
| org.verapdf:wcag-algorithms | 1.29.43 | MPL-2.0 | https://repo1.maven.org/maven2/org/verapdf/ |
| org.verapdf:wcag-validation | 1.29.194 | MPL-2.0 | https://repo1.maven.org/maven2/org/verapdf/ |
| orjson | 3.10.15 | Apache-2.0 | https://github.com/ijl/orjson |
| p-limit | 3.1.0 | MIT | https://github.com/sindresorhus/p-limit |
| p-locate | v5.0.0 | MIT | https://github.com/sindresorhus/p-locate |
| package-json-from-dist | 1.0.1 | Blue Oak 1.0.0 | https://github.com/isaacs/package-json-from-dist |
| Packaging | 24.2 | BSD-2-Clause | https://github.com/pypa/packaging |
| parent-module | 1.0.1 | MIT | https://github.com/sindresorhus/parent-module |
| path-exists | 4.0.0 | MIT | https://github.com/sindresorhus/path-exists |
| path-key | 3.1.1 | MIT | https://github.com/sindresorhus/path-key |
| path-scurry | 1.11.1, 2.0.1 | Blue Oak 1.0.0 | https://github.com/isaacs/path-walker |
| pathe | 2.0.3 | MIT | https://github.com/unjs/pathe |
| PDFBox JBIG2 ImageIO plugin | 3.0.3 | Apache-2.0 | https://www.apache.org/jbig2-imageio/ |
| picocolors | 1.1.1 | ISC | https://github.com/alexeyraspopov/picocolors |
| picomatch | 4.0.3 | MIT | https://github.com/micromatch/picomatch |
| pirates | 4.0.7 | MIT | https://github.com/ariporad/pirates |
| pkg-types | 1.3.1 | MIT | https://github.com/unjs/pkg-types |
| PostCSS | 8.5.6 | MIT | http://postcss.org/ |
| postcss-load-config | 6.0.1 | MIT | https://github.com/michael-ciniawsky/postcss-load-config |
| prelude-ls | 1.2.1 | MIT | https://github.com/gkz/prelude-ls |
| Prettier IO | 3.8.1 | MIT | https://prettier.io |
| psf-requests | 2.32.5 | Apache-2.0 | http://docs.python-requests.org |
| Punycode.js | 2.3.1 | MIT | http://mths.be/punycode |
| pydantic | 2.10.6 | MIT | https://pydantic-docs.helpmanual.io/ |
| pydantic-core | 2.27.2 | MIT | https://github.com/pydantic/pydantic-core |
| python-certifi | 2024.12.14 | MPL-2.0 | https://certifiio.readthedocs.io/en/latest/ |
| python-json-patch | 1.33 | BSD-3-Clause | https://github.com/stefankoegl/python-json-patch/ |
| python-json-pointer | 3.0.0 | BSD-3-Clause | https://github.com/stefankoegl/python-json-pointer |
| python-typing-extensions | 4.15.0 | PSF-2.0 | https://tracker.debian.org/pkg/python-typing-extensions |
| python3-charset-normalizer | 3.4.1 | MIT | https://github.com/ousret/charset_normalizer |
| PyYAML | 6.0.2 | MIT | https://pyyaml.org/ |
| Qix-/color-convert | 2.0.1 | MIT | https://github.com/Qix-/color-convert |
| readdirp | 4.1.2 | MIT | https://github.com/thlorenz/readdirp |
| requests-toolbelt | 1.0.0 | Apache-2.0 | https://toolbelt.readthedocs.io |
| resolve-from | 4.0.0, 5.0.0 | MIT | https://github.com/sindresorhus/resolve-from |
| resolve-pkg-maps | 1.0.0 | MIT | https://github.com/privatenumber/resolve-pkg-maps |
| rollup/rollup | 4.53.2 | MIT | https://github.com/rollup/rollup |
| Saxon XSLT and XQuery Processor | 12.8 | MPL-2.0 | http://saxon.sourceforge.net |
| shebang-command | 2.0.0 | MIT | https://github.com/kevva/shebang-command |
| shebang-regex | 3.0.0 | MIT | https://github.com/sindresorhus/shebang-regex |
| siginfo | 2.0.0 | ISC | https://github.com/emilbayes/siginfo |
| sindresorhus/globals | 14.0.0 | MIT | https://github.com/sindresorhus/globals |
| sindresorhus/supports-color | v7.2.0 | MIT | https://github.com/sindresorhus/supports-color |
| sniffio | 1.3.1 | Apache-2.0 | https://github.com/python-trio/sniffio |
| source-map | 0.7.6 | BSD-3-Clause | https://github.com/mozilla/source-map |
| source-map-js | 1.2.1 | BSD-3-Clause | https://github.com/7rulnik/source-map |
| stable-stringify | 1.0.1 | MIT | https://github.com/samn/json-stable-stringify |
| stackback | 0.0.2 | MIT | https://github.com/defunctzombie/node-stackback |
| StAX Utilities Project | 20070216 | BSD-3-Clause | http://java.net/projects/stax-utils/ |
| std-env | 3.10.0 | MIT | https://github.com/pi0/std-env |
| string-width | 4.2.3, 5.1.2 | MIT | https://github.com/sindresorhus/string-width |
| Strip ANSI | 6.0.1, 7.1.2 | MIT | https://github.com/chalk/strip-ansi |
| strip-json-comments | 3.1.1 | MIT | https://github.com/sindresorhus/strip-json-comments |
| sucrase | 3.35.0 | MIT | https://github.com/alangpierce/sucrase |
| tapjs/signal-exit | 4.1.0 | ISC | https://github.com/tapjs/signal-exit |
| tenacity | 8.5.0 | Apache-2.0 | https://github.com/jd/tenacity |
| thenify | 3.3.1 | MIT | https://github.com/thenables/thenify |
| thenify-all | 1.6.0 | MIT | https://github.com/thenables/thenify-all |
| tinybench | 2.9.0 | MIT | https://github.com/tinylibs/tinybench |
| tinyexec | 0.3.2, 1.0.2 | MIT | https://github.com/tinylibs/tinyexec |
| tinyglobby | 0.2.15 | MIT | https://github.com/SuperchupuDev/tinyglobby |
| tinyrainbow | 3.0.3 | MIT | https://github.com/tinylibs/tinyrainbow |
| tree-kill | v1.2.2 | MIT | https://github.com/pkrumins/node-tree-kill |
| ts-api-utils | 2.4.0 | MIT | https://github.com/JoshuaKGoldberg/ts-api-utils |
| ts-interface-checker | 0.1.13 | Apache-2.0 | https://github.com/gristlabs/ts-interface-checker |
| tsup | 8.5.1 | MIT | https://github.com/egoist/tsup |
| tsx | 4.20.5 | MIT | https://github.com/privatenumber/tsx |
| type-check | 0.4.0 | MIT | https://github.com/gkz/type-check |
| TypeScript | 5.9.3 | Apache-2.0 | http://www.typescriptlang.org/ |
| ufo | 1.6.1 | MIT | https://github.com/nuxt-contrib/ufo |
| undici-types | 7.16.0 | MIT | https://undici.nodejs.org |
| upstage/dp-bench | - | MIT | https://huggingface.co/datasets/upstage/dp-bench |
| uri-js | 4.4.1 | BSD-2-Clause | https://github.com/garycourt/uri-js |
| urllib3 | 2.6.1 | MIT | https://urllib3.readthedocs.io/en/stable |
| vitejs | 7.3.1 | MIT | http://vitejs.dev/ |
| vitest | 4.0.18 | MIT | https://github.com/vitest-dev/vitest |
| which | 2.0.2 | ISC | https://github.com/isaacs/node-which |
| word-wrap | 1.2.5 | MIT | https://github.com/jonschlinkert |
| wrap-ansi | v7.0.0, 8.1.0 | MIT | https://github.com/chalk/wrap-ansi |
| XML Resolver | 5.3.3 | Apache-2.0 | https://github.com/ndw/xmlresolver |
| yocto-queue | 0.1.0 | MIT | https://github.com/sindresorhus/yocto-queue |
| zstandard | 0.23.0 | BSD-3-Clause | https://github.com/indygreg/python-zstandard |
================================================
FILE: THIRD_PARTY/THIRD_PARTY_NOTICES.md
================================================
# THIRD-PARTY NOTICES (Copyright & Attributions)
Copyright © 2025-2026 Hancom, Inc. All rights reserved.
Below are copyright and notice texts for third-party libraries and components used in this project.
Full license texts are provided in the `licenses/` directory.
See [THIRD_PARTY_LICENSES](./THIRD_PARTY_LICENSES.md) for the version, license, and project URL of each component.
---
##### Apache Software Foundation Notices
The following components include software developed at The Apache Software Foundation (https://www.apache.org/).
- Apache Commons Logging
- Apache PDFBox (FontBox, PDFBox, PDFBox-IO, JBIG2 ImageIO plugin)
- Apache Commons CLI
- Apache Maven (Artifact, Plugin API, Reporting API, Shared, Wagon, etc.)
- Apache Maven Doxia
- XML Resolver
"This product includes software developed at The Apache Software Foundation (https://www.apache.org/)."
---
##### Component Attributions
- `@esbuild/android-arm` (0.25.12, 0.27.0): Copyright (c) 2018, 2021 The Go Authors. All rights reserved.
- `@esbuild/android-x64` (0.25.12, 0.27.0): Copyright (c) 2018, 2021 The Go Authors. All rights reserved.
- `@esbuild/openharmony-arm64` (0.25.12, 0.27.0): Copyright (c) 2018, 2021 The Go Authors. All rights reserved.
- `@eslint-community/eslint-utils` (4.9.0, 4.9.1): Copyright (c) 2018 Toru Nagashima
- `@eslint-community/regexpp` (4.12.2): Copyright (c) 2018 Toru Nagashima
- `@eslint/config-array` (0.21.1): Copyright (c) 2018-2025 the Deno authors.
- `@eslint/eslintrc` (3.3.3): Copyright (c) 2015-2017 Evgeny Poberezkin
- `@humanwhocodes/retry` (0.4.3): Copyright (c) 2011-2023 Isaac Z. Schlueter, Ben Noordhuis, and Contributors
- `@isaacs/cliui` (8.0.2): Copyright (c) 2015 Contributors
- `@jridgewell/gen-mapping` (0.3.13): Copyright (c) 2024 Justin Ridgewell
- `@jridgewell/resolve-uri` (3.1.2): Copyright (c) 2019 Justin Ridgewell
- `@jridgewell/sourcemap-codec` (1.5.5): Copyright (c) 2024 Justin Ridgewell
- `@jridgewell/trace-mapping` (0.3.31): Copyright (c) 2024 Justin Ridgewell
- `@standard-schema/spec` (1.0.0): Copyright (c) 2024 Colin McDonnell
- `@typescript-eslint/eslint-plugin` (8.54.0): Copyright (c) 2019 typescript-eslint and other contributors
- `@typescript-eslint/parser` (8.54.0): Copyright (c) 2019 typescript-eslint and other contributors
- `@typescript-eslint/project-service` (8.54.0): Copyright (c) 2025 typescript-eslint and other contributors
- `@typescript-eslint/scope-manager` (8.54.0): Copyright (c) 2019 typescript-eslint and other contributors
- `@typescript-eslint/tsconfig-utils` (8.54.0): Copyright (c) 2025 typescript-eslint and other contributors
- `@typescript-eslint/type-utils` (8.54.0): Copyright (c) 2021 typescript-eslint and other contributors
- `@typescript-eslint/types` (8.54.0): Copyright (c) 2019 typescript-eslint and other contributors
- `@typescript-eslint/typescript-estree` (8.54.0): Copyright (c) 2019 typescript-eslint and other contributors
- `@typescript-eslint/utils` (8.54.0): Copyright (c) 2019 typescript-eslint and other contributors
- `@typescript-eslint/visitor-keys` (8.54.0): Copyright (c) 2019 typescript-eslint and other contributors
- `@vitest/expect` (4.0.18): Copyright (c) 2021-Present VoidZero Inc. and Vitest contributors
- `@vitest/mocker` (4.0.18): Copyright (c) 2021-Present VoidZero Inc. and Vitest contributors
- `@vitest/pretty-format` (4.0.18): Copyright (c) 2021-Present VoidZero Inc. and Vitest contributors
- `@vitest/runner` (4.0.18): Copyright (c) 2021-Present VoidZero Inc. and Vitest contributors
- `@vitest/snapshot` (4.0.18): Copyright (c) 2021-Present VoidZero Inc. and Vitest contributors
- `@vitest/spy` (4.0.18): Copyright (c) 2021-Present VoidZero Inc. and Vitest contributors
- `@vitest/utils` (4.0.18): Copyright (c) 2021-Present VoidZero Inc. and Vitest contributors, Copyright (c) 2014-2023 Simon Lydell, Copyright (c) 2018 The diff-match-patch Authors., Copyright (c) 2013 Jake Luer
- `Acorn` (8.15.0): Copyright (c) 2012-2022 by various contributors (see AUTHORS)
- `Acorn-JSX` (5.3.2): Copyright (c) 2012-2017 by Ingvar Stepanyan
- `ajv` (6.12.6): Copyright (c) 2011 Gary Court, Copyright (c) 2015-2017 Evgeny Poberezkin
- `annotated-types` (0.7.0): Copyright (c) 2022 the contributors
- `any-promise` (1.3.0): Copyright (c) 2014-2016 Kevin Beaty
- `anyio` (4.9.0): Copyright (c) 2018 Alex Grönholm
- `Apache Commons Logging` (1.3.4): Copyright (c) 1989-2024 Free Software Foundation and The Apache Software Foundation, Copyright (c) 2013-2022 Oracle and/or its affiliates.
- `Apache PDFBox` (3.0.4): Copyright (c) 1990-2024 Adobe Systems Incorporated, www.pdfbox.org, Harald Kuhr, Google Corporation, Red Hat Inc., Unicode Inc., The Apache Software Foundation, GitHub Inc.
- `Apache PDFBox FontBox` (3.0.4): Copyright (c) 2006-2024 www.fontbox.org, The Apache Software Foundation, Grzegorz Luk, Lohit Fonts Project, Unicode Inc.
- `Apache PDFBox io` (3.0.4): Copyright (c) 2002-2024 The Apache Software Foundation
- `assertion-error` (2.0.1): Copyright (c) 2013 Jake Luer
- `AssertJ - Fluent Assertions for Java` (3.27.7): Copyright (c) 1989-2026 Free Software Foundation, Bitstream Inc., Tavmjong Bah, Oracle and/or its affiliates, the original author or authors.
- `balanced-match` (1.0.2): Copyright (c) 2013 Julian Gruber
- `brace-expansion` (1.1.12, 2.0.2): Copyright (c) 2013 Julian Gruber
- `bundle-require` (5.1.0): Copyright (c) 2021 EGOIST
- `Byte Buddy` (1.18.3): Copyright (c) 2000-2011 INRIA, France Telecom, Copyright (c) 2014-Present Rafael Winterhalter
- `Chai` (6.2.1): Copyright (c) 2011-2017 Chai.js Assertion Library, Jake Luer, Sakthipriyan Vairamani
- `chokidar` (4.0.3): Copyright (c) 2012-2019 Paul Miller, Elan Shanker
- `color-name` (1.1.4): Copyright (c) 2015 Dmitry Ivanov
- `com.sun.xml.bind:jaxb-impl` (2.3.2): Copyright (c) 1995-2018 Jean-loup Gailly, Mark Adler, Stuart Knightley, David Duponchel, Vitaly Puzrin, Andrey Tupitsin, Oracle, jQuery Foundation
- `Commander.js` (4.1.1): Copyright (c) 2011 TJ Holowaychuk
- `commons-cli` (1.10.0): Copyright (c) 1989-2025 Free Software Foundation, Oracle, The Apache Software Foundation
- `confbox` (0.1.8): Copyright (c) 2011-2018 Vitaly Puzrin, Aseem Kishore
- `debug-js/debug` (4.4.3): Copyright (c) 2014-2021 TJ Holowaychuk, Josh Junon
- `deep-is` (0.1.4): Copyright (c) 2009-2013 Thomas Robinson, James Halliday, Thorsten Lorenz
- `es-module-lexer` (1.7.0): Copyright (c) 2012-2022 by various contributors, Guy Bedford
- `esbuild` (0.25.12, 0.27.0): Copyright (c) 2020 Evan Wallace
- `ESLint` (9.39.2): Copyright (c) 2013 Joel Feenstra
- `eslint-scope` (8.4.0): Copyright (c) 2012-2015 Yusuke Suzuki, Alex Seville, Thiago de Arruda
- `espree` (10.4.0): Copyright (c) 2012-2015 Acorn Contributors, Sebastian McKenzie
- `esquery` (1.6.0): Copyright (c) 2012-2013 Ariya Hidayat, Yusuke Suzuki, Joel Feenstra
- `esrecurse` (v4.3.0): Copyright (c) 2014 Yusuke Suzuki
- `estraverse` (5.3.0): Copyright (c) 2012-2016 Ariya Hidayat, Yusuke Suzuki
- `esutils` (2.0.3): Copyright (c) 2013-2014 Yusuke Suzuki, Ivan Nikulin
- `expect-type` (1.2.2): Copyright (c) 2024 Misha Kaletsky
- `fast-deep-equal` (v3.1.3): Copyright (c) 2017 Evgeny Poberezkin
- `fdir` (6.5.0): Copyright (c) 2023-2024 Abdullah Atta
- `fix-dts-default-cjs-exports` (1.0.1): Copyright (c) 2025-Present Joaquin
- `flatted` (3.3.3): Copyright (c) 2018-2025 Andrea Giammarchi
- `foreground-child` (3.3.1): Copyright (c) 2015-2023 Isaac Z. Schlueter and Contributors
- `fsevents` (2.3.3): Copyright (c) 2010-2020 Philipp Dunkel, Ben Noordhuis, Elan Shankar, Paul Miller
- `glob-parent` (6.0.2): Copyright (c) 2015-2021 Elan Shanker, Blaine Bublitz, Eric Schoffstall
- `h11` (0.16.0): Copyright (c) 2006-2016 Jonathan E. Taylor, Scipy Developers, Statsmodels Developers, Nathaniel J. Smith, Chris Wanstrath
- `Hamcrest` (1.3): Copyright (c) 2000-2010 hamcrest.org
- `httpcore` (1.0.9): Copyright (c) 2020 Encode OSS Ltd.
- `httpx` (0.27.2): Copyright (c) 2019 Encode OSS Ltd.
- `idna` (3.10): Copyright (c) 2013-2024 Kim Davies and contributors.
- `imurmurhash` (0.1.4): Copyright (c) 2013 Gary Court, Jens Taylor
- `is-extglob` (2.1.1): Copyright (c) 2014-2016 Jon Schlinkert
- `is-glob` (4.0.3): Copyright (c) 2014-2017 Jon Schlinkert
- `jackson-annotations` (2.15.0): Copyright (c) 2007 Tatu Saloranta
- `jackson-core` (2.15.0): Copyright (c) 2007-2020 Tatu Saloranta, Raffaello Giulietti
- `jackson-databind` (2.15.0): Copyright (c) 2007-2011 Tatu Saloranta, Google Inc.
- `Jakarta Activation API` (1.2.0): Copyright (c) 1989-2017 Free Software Foundation, Oracle
- `Java Advanced Imaging Image I/O Tools API core` (1.4.0): Copyright (c) 1990-2018 Wang Labs Inc., Sun Microsystems, Stian Soiland-Reyes, University of Manchester, Butch Howard, Mark Carroll, Peter Hull, Robin Stevens, Yannick De Turck, Luca Bellonda, Curtis Rueden, Ghislain Bonamy, Mykola Pavluchynskyi, Roger Leigh, Sebastien Besson, Peter Jodeleit
- `JAXB CORE` (2.3.0.1): Copyright (c) 1997-2018 Stuart Knightley, David Duponchel, Oracle, jQuery Foundation
- `jaxb-api` (2.4.0-b180830.0359): Copyright (c) 1989-2018 Free Software Foundation, Stuart Knightley, David Duponchel, Oracle, jQuery Foundation
- `Jetbrains annotations` (13.0): Copyright (c) 2000-2013 JetBrains s.r.o., Sascha Weinreuter
- `JPEG2000 support for Java Advanced Imaging Image I/O Tools API` (1.3.0): Copyright (c) 1999-2006 JJ2000 Partners, Sun Microsystems
- `js-yaml` (4.1.1): Copyright (c) 2011-2015 Vitaly Puzrin
- `json-buffer` (3.0.1): Copyright (c) 2013 Dominic Tarr
- `json-schema-traverse` (0.4.1): Copyright (c) 2017 Evgeny Poberezkin
- `JUnit Jupiter (Aggregator)` (5.14.2): Copyright (c) 2015-2026 the original author or authors.
- `Kotlin` (1.8.21): Copyright (c) 2010-2023 JetBrains s.r.o. and Kotlin Programming Language contributors.
- `kotlin-stdlib-common` (1.9.10): Copyright (c) 2007-2023 Google Inc., JetBrains s.r.o., The Guava Authors
- `langchain-opendataloader-pdf` (1.0.1): Copyright (c) 2024 LangChain, Inc.
- `libcspice-sys` (0.1.1): Copyright (c) 2025 libcspice-sys contributors
- `lilconfig` (3.1.3): Copyright (c) 2022 Anton Kastritskiy
- `lines-and-columns` (1.2.4): Copyright (c) 2015 Brian Donovan
- `load-tsconfig` (0.2.5): Copyright (c) 2021 EGOIST
- `mafintosh/why-is-node-running` (2.3.0): Copyright (c) 2016 Mathias Buus
- `magic-string` (0.30.21): Copyright (c) 2018 Rich Harris
- `mdBook` (0.4.36): Copyright (c) 2006-2020 Ivan Sagalaev, Ajax.org B.V., Oliver Nightingale, Wei Song
- `minimatch` (9.0.5): Copyright (c) 2011-2023 Isaac Z. Schlueter and Contributors
- `minipass` (7.1.2): Copyright (c) 2017-2023 npm, Inc., Isaac Z. Schlueter, and Contributors
- `MockWebServer` (4.12.0): Copyright (c) 2011-2019 Google Inc., Square, Inc.
- `moxystudio/node-cross-spawn` (7.0.6): Copyright (c) 2018 Made With MOXY Lda
- `Mozilla Rhino` (1.7.14.1): Copyright (c) 1991-2022 Lucent Technologies, Free Software Foundation, Stuart Knightley, Oracle, Vitaly Puzrin, Sun Microsystems, V8 project authors, Raffaello Giulietti
- `ms.js` (2.1.3): Copyright (c) 2020 Vercel, Inc.
- `mz` (2.7.0): Copyright (c) 2014-2016 Jonathan Ong
- `nanoid` (3.3.11): Copyright (c) 2017 Andrey Sitnik
- `natural-compare` (1.4.0): Copyright (c) 2012-2015 Lauri Rooden
- `nobody` (2.1.1): Copyright (c) 2014-2025 TJ Holowaychuk, Josh Junon, Kevin Deng
- `node-glob` (10.5.0): Copyright (c) 2009-2023 Isaac Z. Schlueter and Contributors
- `node-ignore` (5.3.2, 7.0.5): Copyright (c) 2013 Kael Zhang
- `node-lru-cache` (10.4.3, 11.2.2): Copyright (c) 2010-2023 Isaac Z. Schlueter and Contributors
- `nodeca-argparse` (2.0.1): Copyright (c) 1991-2020 Stichting Mathematisch Centrum Amsterdam, Gregory P. Ward, Python Software Foundation, argparse.js authors
- `OkHttp` (4.12.0): Copyright (c) 2010-2020 The Android Open Source Project, Square Inc., Twitter Inc.
- `OkIO` (3.6.0): Copyright (c) 2014-2023 Square, Inc.
- `org.apiguardian:apiguardian-api` (1.1.2): Copyright (c) 1995-2018 Jean-loup Gailly, Mark Adler, Stuart Knightley, Vitaly Puzrin, Oracle, jQuery Foundation
- `org.junit.jupiter:junit-jupiter-*` (5.14.2): Copyright (c) 1989-2026 Free Software Foundation, Oracle, the original author or authors.
- `org.junit.platform:junit-platform-*` (1.14.2): Copyright (c) 1989-2026 Free Software Foundation, Oracle, the original author or authors.
- `org.opentest4j:opentest4j` (1.3.0): Copyright (c) 1989-2023 Free Software Foundation, Oracle, the original author or authors.
- `orjson` (3.10.15): Copyright (c) 1991-2023 Alex Crichton, Milo Yip, Ryohei Machida, The Rust Project Developers, Andrew Gallant, Nicholas Allegra, Nikolai Vazquez, The bytecount Developers, PyO3 Project, Sergio Benitez, Ashley Mannix, The Servo Project Developers, YaoYuan, Parker Timmerman, Stephen M. Coakley, The Uuid Project Developers, Ulf Adams, Unicode Inc., winapi-rs developers
- `Packaging` (24.2): Copyright (c) 2017-Present Ofek Lev
- `pathe` (2.0.3): Copyright (c) 2023-Present Fabio Spampinato
- `PDFBox JBIG2 ImageIO plugin` (3.0.3): Copyright (c) 1995-2019 levigo holding GmbH, The Apache Software Foundation
- `picocolors` (1.1.1): Copyright (c) 2021-2024 Oleksii Raspopov, Kostiantyn Denysov, Anton Verinov
- `picomatch` (4.0.3): Copyright (c) 2017-Present Jon Schlinkert
- `pirates` (4.0.7): Copyright (c) 2015-2018 Ari Porad
- `PostCSS` (8.5.6): Copyright (c) 2013 Andrey Sitnik
- `Prettier IO` (3.8.1): Copyright (c) 2009-2026 Google LLC, Kevin Decker, Vitaly Puzrin, Woong Jun, Raynos, Ingvar Stepanyan, Aseem Kishore, Andrey Sitnik, Dominic Tarr, James Halliday, Kael Zhang, Liucw, Mikola Lysenko, Alex Bell, Stefan Thomas, Yehuda Katz, Jon Schlinkert, Teambition, Simon Lydell, Sebastian McKenzie, Tilde Inc., Titus Wormer, Elan Shanker, Matteo Collina, Denys Kniazevych, Joshua Holbrook, Mark Wubben, Pat Sissons, Thomas Watson Steen, Andrew Powell, Evgeny Poberezkin, Luke Childs, Andrea Giammarchi, KFlash, typescript-eslint, Fabio Spampinato, Jared Wray, Oleksii Raspopov, Eemeli Aro, EditorConfig Team.
- `psf-requests` (2.32.5): Copyright (c) 2012-2019 Kenneth Reitz
- `pydantic` (2.10.6): Copyright (c) 2017-Present Pydantic Services Inc.
- `pydantic-core` (2.27.2): Copyright (c) 2022 Samuel Colvin
- `python-json-patch` (1.33): Copyright (c) 2011 Stefan Kögl
- `python-json-pointer` (3.0.0): Copyright (c) 2011 Stefan Kögl
- `python-typing-extensions` (4.15.0): Copyright (c) 1991-1995 Stichting Mathematisch Centrum Amsterdam
- `python3-charset-normalizer` (3.4.1): Copyright (c) 2021-2025 Ahmed TAHRI
- `PyYAML` (6.0.2): Copyright (c) 2006-2021 Kirill Simonov, Ingy döt Net
- `Qix-/color-convert` (2.0.1): Copyright (c) 2011-2016 Heather Arthur, Josh Junon
- `readdirp` (4.1.2): Copyright (c) 2012-2019 Thorsten Lorenz, Paul Miller
- `requests-toolbelt` (1.0.0): Copyright (c) 2014 Ian Cordasco, Cory Benfield
- `rollup/rollup` (4.53.2): Copyright (c) 2012-2024 Paul Miller, Elan Shanker, Thorsten Lorenz, Jon Schlinkert, Benjamin Coe, Isaac Z. Schlueter, RollupJS Plugin Contributors, Sindre Sorhus, Oleksii Raspopov, Rich Harris, Justin Ridgewell
- `Saxon XSLT and XQuery Processor` (12.8): Copyright (c) 1998-2025 James Clark, Saxonica Limited, Michael Froh, Oracle
- `siginfo` (2.0.0): Copyright (c) 2017 Emil Bay
- `source-map` (0.7.6): Copyright (c) 2009-2014 Mozilla Foundation, The Closure Compiler Authors
- `source-map-js` (1.2.1): Copyright (c) 2009-2014 Mozilla Foundation, The Closure Compiler Authors
- `stackback` (0.0.2): Copyright (c) 2012 the V8 project authors.
- `StAX Utilities Project` (20070216): Copyright (c) 2004-2006 Christian Niles, Sun Microsystems, John Kristian
- `sucrase` (3.35.0): Copyright (c) 2012-2018 various contributors
- `tapjs/signal-exit` (4.1.0): Copyright (c) 2015-2023 Benjamin Coe, Isaac Z. Schlueter
- `tenacity` (8.5.0): Copyright (c) 2013-2018 Ray Holder, Joshua Harlow, Julien Danjou, Elisey Zanko
- `thenify` (3.3.1): Copyright (c) 2014-2016 Jonathan Ong
- `thenify-all` (1.6.0): Copyright (c) 2014 Jonathan Ong
- `tinybench` (2.9.0): Copyright (c) 2022 Tinylibs
- `tinyexec` (0.3.2, 1.0.2): Copyright (c) 2024 Tinylibs
- `tinyglobby` (0.2.15): Copyright (c) 2024 Madeline Gurriarán
- `tinyrainbow` (3.0.3): Copyright (c) 2022 Tinylibs
- `tree-kill` (v1.2.2): Copyright (c) 2018 Peter Krumins
- `tsup` (8.5.1): Copyright (c) 2021 EGOIST
- `TypeScript` (5.9.3): Copyright (c) 1991-2018 Unicode Inc., The Khronos Group Inc., WHATWG
- `uri-js` (4.4.1): Copyright (c) 2011 Gary Court
- `urllib3` (2.6.1): Copyright (c) 2008-2020 Andrey Petrov and contributors
- `vitejs` (7.3.1): Copyright (c) 2010-2025 Sencha Inc., William Stein, Einar Otto Stangvik, LearnBoost, TJ Holowaychuk, Paul Miller, Elan Shanker, Thorsten Lorenz, Arnout Kazemier, James Halliday, Troy Goode, Jonathan Ong, Jared Hanson, Ivan Nikulin, Maxime Thirouin, Nathan Rajlich, Jon Schlinkert, Douglas Christopher Wilson, Simon Lydell, Alexey Litvinov, Andreas Lubbe, Glen Maddern, Tiancheng Gu, Scott Motte, Facebook Inc., Luigi Pinca, Yuxi You, MOXY Lda, Josh Junon, Guy Bedford, Rich Harris, Sindre Sorhus, VoidZero Inc., The Preact Authors, dominikg, Anthony Fu, Anton Kastritskiy, sapphi-red, Mark Dalgleish, Alexander Madyankin, Justin Ridgewell, Kevin Deng.
- `word-wrap` (1.2.5): Copyright (c) 2014-2023 Jon Schlinkert
- `XML Resolver` (5.3.3): Copyright (c) 1989-2023 Free Software Foundation, The Open Healthcare Group, Jonathan Borden, Oracle, W3C, The Internet Society
- `zstandard` (0.23.0): Copyright (c) 1989-2021 Free Software Foundation, Yuta Mori, Gregory Szorc, Tino Reichardt
================================================
FILE: THIRD_PARTY/licenses/BSD-2-Clause.txt
================================================
BSD Two Clause License
======================
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
================================================
FILE: THIRD_PARTY/licenses/BSD-3-Clause.txt
================================================
BSD 3-clause "New" or "Revised" License
Copyright (c) <YEAR>, <OWNER>
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the <ORGANIZATION> nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: THIRD_PARTY/licenses/Blue-Oak-1.0.0.txt
================================================
Blue Oak Model License
======================
Version 1.0.0
Purpose
-------
This license gives everyone as much permission to work with this software as
possible, while protecting contributors from liability.
Acceptance
----------
In order to receive this license, you must agree to its rules. The rules of this
license are both obligations under that agreement and conditions to your license.
You must not do anything with this software that triggers a rule that you cannot
or will not follow.
Copyright
---------
Each contributor licenses you to do everything with this software that would
otherwise infringe that contributor's copyright in it.
Notices
-------
You must ensure that everyone who gets a copy of any part of this software from
you, with or without changes, also gets the text of this license or a link to
https://blueoakcouncil.org/license/1.0.0.
Excuse
------
If anyone notifies you in writing that you have not complied with Notices, you
can keep your license by taking all practical steps to comply within 30 days
after the notice. If you do not do so, your license ends immediately.
Patent
------
Each contributor licenses you to do everything with this software that would
otherwise infringe any patent claims they can license or become able to license.
Reliability
-----------
No contributor can revoke this license.
No Liability
------------
As far as the law allows, this software comes as is, without any warranty or
condition, and no contributor will be liable to anyone for any damages related to
this software or this license, under any kind of legal claim.
================================================
FILE: THIRD_PARTY/licenses/CDDL-1.1.txt
================================================
COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1
==============================================================
1. Definitions.
1.1. “Contributor” means each individual or entity that creates or contributes
to the creation of Modifications.
1.2. “Contributor Version” means the combination of the Original Software,
prior Modifications used by a Contributor (if any), and the Modifications made
by that particular Contributor.
1.3. “Covered Software” means (a) the Original Software, or (b) Modifications,
or (c) the combination of files containing Original Software with files
containing Modifications, in each case including portions thereof.
1.4. “Executable” means the Covered Software in any form other than Source
Code.
1.5. “Initial Developer” means the individual or entity that first makes
Original Software available under this License.
1.6. “Larger Work” means a work which combines Covered Software or portions
thereof with code not governed by the terms of this License.
1.7. “License” means this document.
1.8. “Licensable” means having the right to grant, to the maximum extent
possible, whether at the time of the initial grant or subsequently acquired,
any and all of the rights conveyed herein.
1.9. “Modifications” means the Source Code and Executable form of any of the
following:
A. Any file that results from an addition to, deletion from or modification
of the contents of a file containing Original Software or previous
Modifications;
B. Any new file that contains any part of the Original Software or previous
Modification; or
C. Any new file that is contributed or otherwise made available under the
terms of this License.
1.10. “Original Software” means the Source Code and Executable form of computer
software code that is originally released under this License.
1.11. “Patent Claims” means any patent claim(s), now owned or hereafter
acquired, including without limitation, method, process, and apparatus claims,
in any patent Licensable by grantor.
1.12. “Source Code” means (a) the common form of computer software code in
which modifications are made and (b) associated documentation included in or
with such code.
1.13. “You” (or “Your”) means an individual or a legal entity exercising rights
under, and complying with all of the terms of, this License. For legal
entities, “You” includes any entity which controls, is controlled by, or is
under common control with You. For purposes of this definition, “control” means
(a) the power, direct or indirect, to cause the direction or management of such
entity, whether by contract or otherwise, or (b) ownership of more than fifty
percent (50%) of the outstanding shares or beneficial ownership of such entity.
2. License Grants.
2.1. The Initial Developer Grant.
Conditioned upon Your compliance with Section 3.1 below and subject to third
party intellectual property claims, the Initial Developer hereby grants You a
world-wide, royalty-free, non-exclusive license:
(a) under intellectual property rights (other than patent or trademark)
Licensable by Initial Developer, to use, reproduce, modify, display, perform,
sublicense and distribute the Original Software (or portions thereof), with
or without Modifications, and/or as part of a Larger Work; and
(b) under Patent Claims infringed by the making, using or selling of Original
Software, to make, have made, use, practice, sell, and offer for sale, and/or
otherwise dispose of the Original Software (or portions thereof).
(c) The licenses granted in Sections 2.1(a) and (b) are effective on the date
Initial Developer first distributes or otherwise makes the Original Software
available to a third party under the terms of this License.
(d) Notwithstanding Section 2.1(b) above, no patent license is granted: (1)
for code that You delete from the Original Software, or (2) for infringements
caused by: (i) the modification of the Original Software, or (ii) the
combination of the Original Software with other software or devices.
2.2. Contributor Grant.
Conditioned upon Your compliance with Section 3.1 below and subject to third
party intellectual property claims, each Contributor hereby grants You a
world-wide, royalty-free, non-exclusive license:
(a) under intellectual property rights (other than patent or trademark)
Licensable by Contributor to use, reproduce, modify, display, perform,
sublicense and distribute the Modifications created by such Contributor (or
portions thereof), either on an unmodified basis, with other Modifications,
as Covered Software and/or as part of a Larger Work; and
(b) under Patent Claims infringed by the making, using, or selling of
Modifications made by that Contributor either alone and/or in combination
with its Contributor Version (or portions of such combination), to make, use,
sell, offer for sale, have made, and/or otherwise dispose of: (1)
Modifications made by that Contributor (or portions thereof); and (2) the
combination of Modifications made by that Contributor with its Contributor
Version (or portions of such combination).
(c) The licenses granted in Sections 2.2(a) and 2.2(b) are effective on the
date Contributor first distributes or otherwise makes the Modifications
available to a third party.
(d) Notwithstanding Section 2.2(b) above, no patent license is granted:
(1) for any code that Contributor has deleted from the Contributor Version;
(2) for infringements caused by: (i) third party modifications of
Contributor Version, or (ii) the combination of Modifications made by that
Contributor with other software (except as part of the Contributor Version)
or other devices; or
(3) under Patent Claims infringed by Covered Software in the absence of
Modifications made by that Contributor.
3. Distribution Obligations.
3.1. Availability of Source Code.
Any Covered Software that You distribute or otherwise make available in
Executable form must also be made available in Source Code form and that
Source Code form must be distributed only under the terms of this License.
You must include a copy of this License with every copy of the Source Code
form of the Covered Software You distribute or otherwise make available. You
must inform recipients of any such Covered Software in Executable form as to
how they can obtain such Covered Software in Source Code form in a reasonable
manner on or through a medium customarily used for software exchange.
3.2. Modifications.
The Modifications that You create or to which You contribute are governed by
the terms of this License. You represent that You believe Your Modifications
are Your original creation(s) and/or You have sufficient rights to grant the
rights conveyed by this License.
3.3. Required Notices.
You must include a notice in each of Your Modifications that identifies You
as the Contributor of the Modification. You may not remove or alter any
copyright, patent or trademark notices contained within the Covered Software,
or any notices of licensing or any descriptive text giving attribution to any
Contributor or the Initial Developer.
3.4. Application of Additional Terms.
You may not offer or impose any terms on any Covered Software in Source Code
form that alters or restricts the applicable version of this License or the
recipients' rights hereunder. You may choose to offer, and to charge a fee
for, warranty, support, indemnity or liability obligations to one or more
recipients of Covered Software. However, you may do so only on Your own
behalf, and not on behalf of the Initial Developer or any Contributor. You
must make it absolutely clear that any such warranty, support, indemnity or
liability obligation is offered by You alone, and You hereby agree to
indemnify the Initial Developer and every Contributor for any liability
incurred by the Initial Developer or such Contributor as a result of
warranty, support, indemnity or liability terms You offer.
3.5. Distribution of Executable Versions.
You may distribute the Executable form of the Covered Software under the
terms of this License or under the terms of a license of Your choice, which
may contain terms different from this License, provided that You are in
compliance with the terms of this License and that the license for the
Executable form does not attempt to limit or alter the recipient's rights in
the Source Code form from the rights set forth in this License. If You
distribute the Covered Software in Executable form under a different license,
You must make it absolutely clear that any terms which differ from this
License are offered by You alone, not by the Initial Developer or
Contributor. You hereby agree to indemnify the Initial Developer and every
Contributor for any liability incurred by the Initial Developer or such
Contributor as a result of any such terms You offer.
3.6. Larger Works.
You may create a Larger Work by combining Covered Software with other code
not governed by the terms of this License and distribute the Larger Work as a
single product. In such a case, You must make sure the requirements of this
License are fulfilled for the Covered Software.
4. Versions of the License.
4.1. New Versions.
Oracle is the initial license steward and may publish revised and/or new
versions of this License from time to time. Each version will be given a
distinguishing version number. Except as provided in Section 4.3, no one
other than the license steward has the right to modify this License.
4.2. Effect of New Versions.
You may always continue to use, distribute or otherwise make the Covered
Software available under the terms of the version of the License under which
You originally received the Covered Software. If the Initial Developer
includes a notice in the Original Software prohibiting it from being
distributed or otherwise made available under any subsequent version of the
License, You must distribute and make the Covered Software available under
the terms of the version of the License under which You originally received
the Covered Software. Otherwise, You may also choose to use, distribute or
otherwise make the Covered Software available under the terms of any
subsequent version of the License published by the license steward.
4.3. Modified Versions.
When You are an Initial Developer and You want to create a new license for
Your Original Software, You may create and use a modified version of this
License if You: (a) rename the license and remove any references to the name
of the license steward (except to note that the license differs from this
License); and (b) otherwise make it clear that the license contains terms
which differ from this License.
5. DISCLAIMER OF WARRANTY.
COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN “AS IS” BASIS, WITHOUT
WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, WITHOUT
LIMITATION, WARRANTIES THAT THE COVERED SOFTWARE IS FREE OF DEFECTS,
MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. THE ENTIRE RISK
AS TO THE QUALITY AND PERFORMANCE OF THE COVERED SOFTWARE IS WITH YOU. SHOULD
ANY COVERED SOFTWARE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL
DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING,
REPAIR OR CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART
OF THIS LICENSE. NO USE OF ANY COVERED SOFTWARE IS AUTHORIZED HEREUNDER EXCEPT
UNDER THIS DISCLAIMER.
6. TERMINATION.
6.1. This License and the rights granted hereunder will terminate
automatically if You fail to comply with terms herein and fail to cure such
breach within 30 days of becoming aware of the breach. Provisions which, by
their nature, must remain in effect beyond the termination of this License
shall survive.
6.2. If You assert a patent infringement claim (excluding declaratory
judgment actions) against Initial Developer or a Contributor (the Initial
Developer or Contributor against whom You assert such claim is referred to as
“Participant”) alleging that the Participant Software (meaning the
Contributor Version where the Participant is a Contributor or the Original
Software where the Participant is the Initial Developer) directly or
indirectly infringes any patent, then any and all rights granted directly or
indirectly to You by such Participant, the Initial Developer (if the Initial
Developer is not the Participant) and all Contributors under Sections 2.1
and/or 2.2 of this License shall, upon 60 days notice from Participant
terminate prospectively and automatically at the expiration of such 60 day
notice period, unless if within such 60 day period You withdraw Your claim
with respect to the Participant Software against such Participant either
unilaterally or pursuant to a written agreement with Participant.
6.3. If You assert a patent infringement claim against Participant alleging
that the Participant Software directly or indirectly infringes any patent
where such claim is resolved (such as by license or settlement) prior to the
initiation of patent infringement litigation, then the reasonable value of
the licenses granted by such Participant under Sections 2.1 or 2.2 shall be
taken into account in determining the amount or value of any payment or
license.
6.4. In the event of termination under Sections 6.1 or 6.2 above, all end
user licenses that have been validly granted by You or any distributor
hereunder prior to termination (excluding licenses granted to You by any
distributor) shall survive termination.
7. LIMITATION OF LIABILITY.
UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING
NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL DEVELOPER, ANY
OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED SOFTWARE, OR ANY SUPPLIER OF
ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL,
INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT
LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR
MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH
PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS
LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL
INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW
PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR
LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND
LIMITATION MAY NOT APPLY TO YOU.
8. U.S. GOVERNMENT END USERS.
The Covered Software is a “commercial item,” as that term is defined in 48
C.F.R. 2.101 (Oct. 1995), consisting of “commercial computer software” (as that
term is defined at 48 C.F.R. § 252.227-7014(a)(1)) and “commercial computer
software documentation” as such terms are used in 48 C.F.R. 12.212 (Sept.
1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through
227.7202-4 (June 1995), all U.S. Government End Users acquire Covered Software
with only those rights set forth herein. This U.S. Government Rights clause is
in lieu of, and supersedes, any other FAR, DFAR, or other clause or provision
that addresses Government rights in computer software under this License.
9. MISCELLANEOUS.
This License represents the complete agreement concerning subject matter
hereof. If any provision of this License is held to be unenforceable, such
provision shall be reformed only to the extent necessary to make it
enforceable. This License shall be governed by the law of the jurisdiction
specified in a notice contained within the Original Software (except to the
extent applicable law, if any, provides otherwise), excluding such
jurisdiction's conflict-of-law provisions. Any litigation relating to this
License shall be subject to the jurisdiction of the courts located in the
jurisdiction and venue specified in a notice contained within the Original
Software, with the losing party responsible for costs, including, without
limitation, court costs and reasonable attorneys' fees and expenses. The
application of the United Nations Convention on Contracts for the International
Sale of Goods is expressly excluded. Any law or regulation which provides that
the language of a contract shall be construed against the drafter shall not
apply to this License. You agree that You alone are responsible for compliance
with the United States export administration regulations (and the export
control laws and regulation of any other countries) when You use, distribute or
otherwise make available any Covered Software.
10. RESPONSIBILITY FOR CLAIMS.
As between Initial Developer and the Contributors, each party is responsible
for claims and damages arising, directly or indirectly, out of its utilization
of rights under this License and You agree to work with Initial Developer and
Contributors to distribute such responsibility on an equitable basis. Nothing
herein is intended or shall be deemed to constitute any admission of liability.
------------------------------------------------------------------------------
NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
(CDDL)
The code released under the CDDL shall be governed by the laws of the State of
California (excluding conflict-of-law provisions). Any litigation relating to
this License shall be subject to the jurisdiction of the Federal Courts of the
Northern District of California and the state courts of the State of
California, with venue lying in Santa Clara County, California.
================================================
FILE: THIRD_PARTY/licenses/EDL-1.0.txt
================================================
Eclipse Distribution License - v 1.0
====================================
Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the Eclipse Foundation, Inc. nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: THIRD_PARTY/licenses/EPL-1.0.txt
================================================
Eclipse Public License - v 1.0
==============================
THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC
LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
1. DEFINITIONS
"Contribution" means:
a) in the case of the initial Contributor, the initial code and documentation
distributed under this Agreement, and
b) in the case of each subsequent Contributor:
i) changes to the Program, and
ii) additions to the Program;
where such changes and/or additions to the Program originate from and are
distributed by that particular Contributor. A Contribution 'originates' from a
Contributor if it was added to the Program by such Contributor itself or anyone
acting on such Contributor's behalf. Contributions do not include additions to
the Program which: (i) are separate modules of software distributed in
conjunction with the Program under their own license agreement, and (ii) are not
derivative works of the Program.
"Contributor" means any person or entity that distributes the Program.
"Licensed Patents " mean patent claims licensable by a Contributor which are
necessarily infringed by the use or sale of its Contribution alone or when
combined with the Program.
"Program" means the Contributions distributed in accordance with this Agreement.
"Recipient" means anyone who receives the Program under this Agreement, including
all Contributors.
2. GRANT OF RIGHTS
a) Subject to the terms of this Agreement, each Contributor hereby grants
Recipient a non-exclusive, worldwide, royalty-free copyright license to reproduce,
prepare derivative works of, publicly display, publicly perform, distribute and
sublicense the Contribution of such Contributor, if any, and such derivative
works, in source code and object code form.
b) Subject to the terms of this Agreement, each Contributor hereby grants
Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed
Patents to make, use, sell, offer to sell, import and otherwise transfer the
Contribution of such Contributor, if any, in source code and object code form.
This patent license shall apply to the combination of the Contribution and the
Program if, at the time the Contribution is added by the Contributor, such
addition of the Contribution causes such combination to be covered by the
Licensed Patents. The patent license shall not apply to any other combinations
which include the Contribution. No hardware per se is licensed hereunder.
c) Recipient understands that although each Contributor grants the licenses to
its Contributions set forth herein, no assurances are provided by any Contributor
that the Program does not infringe the patent or other intellectual property
rights of any other entity. Each Contributor disclaims any liability to Recipient
for claims brought by any other entity based on infringement of intellectual
property rights or otherwise. As a condition to exercising the rights and
licenses granted hereunder, each Recipient hereby assumes sole responsibility to
secure any other intellectual property rights needed, if any. For example, if a
third party patent license is required to allow Recipient to distribute the
Program, it is Recipient's responsibility to acquire that license before
distributing the Program.
d) Each Contributor represents that to its knowledge it has sufficient copyright
rights in its Contribution, if any, to grant the copyright license set forth in
this Agreement.
3. REQUIREMENTS
A Contributor may choose to distribute the Program in object code form under its
own license agreement, provided that:
a) it complies with the terms and conditions of this Agreement;
and
b) its license agreement:
i) effectively disclaims on behalf of all Contributors all warranties and
conditions, express and implied, including warranties or conditions of title and
non-infringement, and implied warranties or conditions of merchantability and
fitness for a particular purpose;
ii) effectively excludes on behalf of all Contributors all liability for damages,
including direct, indirect, special, incidental and consequential damages, such
as lost profits;
iii) states that any provisions which differ from this Agreement are offered by
that Contributor alone and not by any other party;
and
iv) states that source code for the Program is available from such Contributor,
and informs licensees how to obtain it in a reasonable manner on or through a
medium customarily used for software exchange.
When the Program is made available in source code form:
a) it must be made available under this Agreement;
and
b) a copy of this Agreement must be included with each copy of the Program.
Contributors may not remove or alter any copyright notices contained within the
Program.
Each Contributor must identify itself as the originator of its Contribution, if
any, in a manner that reasonably allows subsequent Recipients to identify the
originator of the Contribution.
4. COMMERCIAL DISTRIBUTION
Commercial distributors of software may accept certain responsibilities with
respect to end users, business partners and the like. While this license is
intended to facilitate the commercial use of the Program, the Contributor who
includes the Program in a commercial product offering should do so in a manner
which does not create potential liability for other Contributors. Therefore, if a
Contributor includes the Program in a commercial product offering, such
Contributor ("Commercial Contributor") hereby agrees to defend and indemnify
every other Contributor ("Indemnified Contributor") against any losses, damages
and costs (collectively "Losses") arising from claims, lawsuits and other legal
actions brought by a third party against the Indemnified Contributor to the
extent caused by the acts or omissions of such Commercial Contributor in
connection with its distribution of the Program in a commercial product offering.
The obligations in this section do not apply to any claims or Losses relating to
any actual or alleged intellectual property infringement. In order to qualify, an
Indemnified Contributor must: a) promptly notify the Commercial Contributor in
writing of such claim, and b) allow the Commercial Contributor to control, and
cooperate with the Commercial Contributor in, the defense and any related
settlement negotiations. The Indemnified Contributor may participate in any such
claim at its own expense.
For example, a Contributor might include the Program in a commercial product
offering, Product X. That Contributor is then a Commercial Contributor. If that
Commercial Contributor then makes performance claims, or offers warranties
related to Product X, those performance claims and warranties are such Commercial
Contributor's responsibility alone. Under this section, the Commercial
Contributor would have to defend claims against the other Contributors related to
those performance claims and warranties, and if a court requires any other
Contributor to pay any damages as a result, the Commercial Contributor must pay
those damages.
5. NO WARRANTY
EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR
IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE,
NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each
Recipient is solely responsible for determining the appropriateness of using and
distributing the Program and assumes all risks associated with its exercise of
rights under this Agreement , including but not limited to the risks and costs of
program errors, compliance with applicable laws, damage to or loss of data,
programs or equipment, and unavailability or interruption of operations.
6. DISCLAIMER OF LIABILITY
EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY
CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST
PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS
GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
7. GENERAL
If any provision of this Agreement is invalid or unenforceable under applicable
law, it shall not affect the validity or enforceability of the remainder of the
terms of this Agreement, and without further action by the parties hereto, such
provision shall be reformed to the minimum extent necessary to make such
provision valid and enforceable.
If Recipient institutes patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Program itself
(excluding combinations of the Program with other software or hardware) infringes
such Recipient's patent(s), then such Recipient's rights granted under Section
2(b) shall terminate as of the date such litigation is filed.
All Recipient's rights under this Agreement shall terminate if it fails to comply
with any of the material terms or conditions of this Agreement and does not cure
such failure in a reasonable period of time after becoming aware of such
noncompliance. If all Recipient's rights under this Agreement terminate,
Recipient agrees to cease use and distribution of the Program as soon as
reasonably practicable. However, Recipient's obligations under this Agreement and
any licenses granted by Recipient relating to the Program shall continue and
survive.
Everyone is permitted to copy and distribute copies of this Agreement, but in
order to avoid inconsistency the Agreement is copyrighted and may only be
modified in the following manner. The Agreement Steward reserves the right to
publish new versions (including revisions) of this Agreement from time to time.
No one other than the Agreement Steward has the right to modify this Agreement.
The Eclipse Foundation is the initial Agreement Steward. The Eclipse Foundation
may assign the responsibility to serve as the Agreement Steward to a suitable
separate entity. Each new version of the Agreement will be given a distinguishing
version number. The Program (including Contributions) may always be distributed
subject to the version of the Agreement under which it was received. In addition,
after a new version of the Agreement is published, Contributor may elect to
distribute the Program (including its Contributions) under the new version.
Except as expressly stated in Sections 2(a) and 2(b) above, Recipient receives no
rights or licenses to the intellectual property of any Contributor under this
Agreement, whether expressly, by implication, estoppel or otherwise. All rights
in the Program not expressly granted under this Agreement are reserved.
This Agreement is governed by the laws of the State of New York and the
intellectual property laws of the United States of America. No party to this
Agreement will bring a legal action under this Agreement more than one year after
the cause of action arose. Each party waives its rights to a jury trial in any
resulting litigation.
================================================
FILE: THIRD_PARTY/licenses/EPL-2.0.txt
================================================
Eclipse Public License - v 2.0
==============================
THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC
LICENSE (“AGREEMENT”). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
1. DEFINITIONS
--------------
“Contribution” means:
a) in the case of the initial Contributor, the initial content Distributed
under this Agreement, and
b) in the case of each subsequent Contributor:
i) changes to the Program, and
ii) additions to the Program;
where such changes and/or additions to the Program originate from and are
Distributed by that particular Contributor. A Contribution “originates” from a
Contributor if it was added to the Program by such Contributor itself or anyone
acting on such Contributor's behalf. Contributions do not include changes or
additions to the Program that are not Modified Works.
“Contributor” means any person or entity that Distributes the Program.
“Licensed Patents” mean patent claims licensable by a Contributor which are
necessarily infringed by the use or sale of its Contribution alone or when
combined with the Program.
“Program” means the Contributions Distributed in accordance with this Agreement.
“Recipient” means anyone who receives the Program under this Agreement or any
Secondary License (as applicable), including Contributors.
“Derivative Works” shall mean any work, whether in Source Code or other form,
that is based on (or derived from) the Program and for which the editorial
revisions, annotations, elaborations, or other modifications represent, as a
whole, an original work of authorship.
“Modified Works” shall mean any work in Source Code or other form that results
from an addition to, deletion from, or modification of the contents of the
Program, including, for purposes of clarity any new file in Source Code form that
contains any contents of the Program. Modified Works shall not include works that
contain only declarations, interfaces, types, classes, structures, or files of
the Program solely in each case in order to link to, bind by name, or subclass
the Program or Modified Works thereof.
“Distribute” means the acts of a) distributing or b) making available in any
manner that enables the transfer of a copy.
“Source Code” means the form of a Program preferred for making modifications,
including but not limited to software source code, documentation source, and
configuration files.
“Secondary License” means either the GNU General Public License, Version 2.0, or
any later versions of that license, including any exceptions or additional
permissions as identified by the initial Contributor.
2. GRANT OF RIGHTS
------------------
a) Subject to the terms of this Agreement, each Contributor hereby grants
Recipient a non-exclusive, worldwide, royalty-free copyright license to
reproduce, prepare Derivative Works of, publicly display, publicly perform,
Distribute and sublicense the Contribution of such Contributor, if any, and
such Derivative Works.
b) Subject to the terms of this Agreement, each Contributor hereby grants
Recipient a non-exclusive, worldwide, royalty-free patent license under
Licensed Patents to make, use, sell, offer to sell, import and otherwise
transfer the Contribution of such Contributor, if any, in Source Code or other
form. This patent license shall apply to the combination of the Contribution
and the Program if, at the time the Contribution is added by the Contributor,
such addition of the Contribution causes such combination to be covered by the
Licensed Patents. The patent license shall not apply to any other combinations
which include the Contribution. No hardware per se is licensed hereunder.
c) Recipient understands that although each Contributor grants the licenses to
its Contributions set forth herein, no assurances are provided by any
Contributor that the Program does not infringe the patent or other intellectual
property rights of any other entity. Each Contributor disclaims any liability
to Recipient for claims brought by any other entity based on infringement of
intellectual property rights or otherwise. As a condition to exercising the
rights and licenses granted hereunder, each Recipient hereby assumes sole
responsibility to secure any other intellectual property rights needed, if any.
For example, if a third party patent license is required to allow Recipient to
Distribute the Program, it is Recipient's responsibility to acquire that
license before distributing the Program.
d) Each Contributor represents that to its knowledge it has sufficient
copyright rights in its Contribution, if any, to grant the copyright license
set forth in this Agreement.
e) Notwithstanding the terms of any Secondary License, no Contributor makes
additional grants to any Recipient (other than those set forth in this
Agreement) as a result of such Recipient's receipt of the Program under the
terms of a Secondary License (if permitted under the terms of Section 3).
3. REQUIREMENTS
---------------
3.1 If a Contributor Distributes the Program in any form, then:
a) the Program must also be made available as Source Code, in accordance with
section 3.2, and the Contributor must accompany the Program with a statement
that the Source Code for the Program is available under this Agreement, and
informs Recipients how to obtain it in a reasonable manner on or through a
medium customarily used for software exchange; and
b) the Contributor may Distribute the Program under a license different than
this Agreement, provided that such license:
i) effectively disclaims on behalf of all other Contributors all warranties
and conditions, express and implied, including warranties or conditions of
title and non-infringement, and implied warranties or conditions of
merchantability and fitness for a particular purpose;
ii) effectively excludes on behalf of all other Contributors all liability
for damages, including direct, indirect, special, incidental and
consequential damages, such as lost profits;
iii) does not attempt to limit or alter the recipients' rights in the Source
Code under section 3.2; and
iv) requires any subsequent distribution of the Program by any party to be
under a license that satisfies the requirements of this section 3.
3.2 When the Program is Distributed as Source Code:
a) it must be made available under this Agreement, or if the Program
(i) is combined with other material in a separate file or files made
available under a Secondary License, and
(ii) the initial Contributor attached to the Source Code the notice described
in Exhibit A of this Agreement, then the Program may be made available under
the terms of such Secondary Licenses, and
b) a copy of this Agreement must be included with each copy of the Program.
3.3 Contributors may not remove or alter any copyright, patent, trademark,
attribution notices, disclaimers of warranty, or limitations of liability
(‘notices’) contained within the Program from any copy of the Program which they
Distribute, provided that Contributors may add their own appropriate notices.
4. COMMERCIAL DISTRIBUTION
--------------------------
Commercial distributors of software may accept certain responsibilities with
respect to end users, business partners and the like. While this license is
intended to facilitate the commercial use of the Program, the Contributor who
includes the Program in a commercial product offering should do so in a manner
which does not create potential liability for other Contributors. Therefore, if a
Contributor includes the Program in a commercial product offering, such
Contributor (“Commercial Contributor”) hereby agrees to defend and indemnify
every other Contributor (“Indemnified Contributor”) against any losses, damages
and costs (collectively “Losses”) arising from claims, lawsuits and other legal
actions brought by a third party against the Indemnified Contributor to the
extent caused by the acts or omissions of such Commercial Contributor in
connection with its distribution of the Program in a commercial product offering.
The obligations in this section do not apply to any claims or Losses relating to
any actual or alleged intellectual property infringement. In order to qualify, an
Indemnified Contributor must: a) promptly notify the Commercial Contributor in
writing of such claim, and b) allow the Commercial Contributor to control, and
cooperate with the Commercial Contributor in, the defense and any related
settlement negotiations. The Indemnified Contributor may participate in any such
claim at its own expense.
For example, a Contributor might include the Program in a commercial product
offering, Product X. That Contributor is then a Commercial Contributor. If that
Commercial Contributor then makes performance claims, or offers warranties
related to Product X, those performance claims and warranties are such Commercial
Contributor's responsibility alone. Under this section, the Commercial
Contributor would have to defend claims against the other Contributors related to
those performance claims and warranties, and if a court requires any other
Contributor to pay any damages as a result, the Commercial Contributor must pay
those damages.
5. NO WARRANTY
--------------
EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT PERMITTED BY
APPLICABLE LAW, THE PROGRAM IS PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT
LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT,
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely
responsible for determining the appropriateness of using and distributing the
Program and assumes all risks associated with its exercise of rights under this
Agreement, including but not limited to the risks and costs of program errors,
compliance with applicable laws, damage to or loss of data, programs or
equipment, and unavailability or interruption of operations.
6. DISCLAIMER OF LIABILITY
--------------------------
EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT PERMITTED BY
APPLICABLE LAW, NEITHER RECIPIENT NOR ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE
PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
7. GENERAL
----------
If any provision of this Agreement is invalid or unenforceable under applicable
law, it shall not affect the validity or enforceability of the remainder of the
terms of this Agreement, and without further action by the parties hereto, such
provision shall be reformed to the minimum extent necessary to make such
provision valid and enforceable.
If Recipient institutes patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Program itself
(excluding combinations of the Program with other software or hardware) infringes
such Recipient's patent(s), then such Recipient's rights granted under Section
2(b) shall terminate as of the date such litigation is filed.
All Recipient's rights under this Agreement shall terminate if it fails to comply
with any of the material terms or conditions of this Agreement and does not cure
such failure in a reasonable period of time after becoming aware of such
noncompliance. If all Recipient's rights under this Agreement terminate,
Recipient agrees to cease use and distribution of the Program as soon as
reasonably practicable. However, Recipient's obligations under this Agreement and
any licenses granted by Recipient relating to the Program shall continue and
survive.
Everyone is permitted to copy and distribute copies of this Agreement, but in
order to avoid inconsistency the Agreement is copyrighted and may only be
modified in the following manner. The Agreement Steward reserves the right to
publish new versions (including revisions) of this Agreement from time to time.
No one other than the Agreement Steward has the right to modify this Agreement.
The Eclipse Foundation is the initial Agreement Steward. The Eclipse Foundation
may assign the responsibility to serve as the Agreement Steward to a suitable
separate entity. Each new version of the Agreement will be given a distinguishing
version number. The Program (including Contributions) may always be Distributed
subject to the version of the Agreement under which it was received. In addition,
after a new version of the Agreement is published, Contributor may elect to
Distribute the Program (including its Contributions) under the new version.
Except as expressly stated in Sections 2(a) and 2(b) above, Recipient receives no
rights or licenses to the intellectual property of any Contributor under this
Agreement, whether expressly, by implication, estoppel or otherwise. All rights
in the Program not expressly granted under this Agreement are reserved. Nothing
in this Agreement is intended to be enforceable by any entity that is not a
Contributor or Recipient. No third-party beneficiary rights are created under
this Agreement.
Exhibit A – Form of Secondary Licenses Notice
---------------------------------------------
“This Source Code may also be made available under the following Secondary
Licenses when the conditions for such availability set forth in the Eclipse
Public License, v. 2.0 are satisfied: {name license(s), version(s), and
exceptions or additional permissions here}.”
Simply including a copy of this Agreement, including this Exhibit A is not
sufficient to license the Source Code under Secondary Licenses.
If it is not possible or desirable to put the notice in a particular file,
then You may include the notice in a location (such as a LICENSE file in a
relevant directory) where a recipient would be likely to look for such a
notice.
You may add additional accurate notices of copyright ownership.
================================================
FILE: THIRD_PARTY/licenses/ISC.txt
================================================
ISC License (ISCL)
==================
Copyright (c) 4-digit year, Company or Person's Name
Permission to use, copy, modify, and/or distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright notice
and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
================================================
FILE: THIRD_PARTY/licenses/LICENSE-JJ2000.txt
================================================
This software module was originally developed by Raphaël Grosbois and
Diego Santa Cruz (Swiss Federal Institute of Technology-EPFL); Joel
Askelöf (Ericsson Radio Systems AB); and Bertrand Berthelot, David
Bouchard, Félix Henry, Gerard Mozelle and Patrice Onno (Canon Research
Centre France S.A) in the course of development of the JPEG2000
standard as specified by ISO/IEC 15444 (JPEG 2000 Standard). This
software module is an implementation of a part of the JPEG 2000
Standard. Swiss Federal Institute of Technology-EPFL, Ericsson Radio
Systems AB and Canon Research Centre France S.A (collectively JJ2000
Partners) agree not to assert against ISO/IEC and users of the JPEG
2000 Standard (Users) any of their rights under the copyright, not
including other intellectual property rights, for this software module
with respect to the usage by ISO/IEC and Users of this software module
or modifications thereof for use in hardware or software products
claiming conformance to the JPEG 2000 Standard. Those intending to use
this software module in hardware or software products are advised that
their use may infringe existing patents. The original developers of
this software module, JJ2000 Partners and ISO/IEC assume no liability
for use of this software module or modifications thereof. No license
or right to this software module is granted for non JPEG 2000 Standard
conforming products. JJ2000 Partners have full right to use this
software module for his/her own purpose, assign or donate this
software module to any third party and to inhibit third parties from
using this software module for non JPEG 2000 Standard conforming
products. This copyright notice must be included in all copies or
derivative works of this software module.
Copyright (c) 1999/2000 JJ2000 Partners.
================================================
FILE: THIRD_PARTY/licenses/MIT.txt
================================================
MIT License
Copyright (c) <year> <copyright holders>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: THIRD_PARTY/licenses/MPL-2.0.txt
================================================
Mozilla Public License
Version 2.0
======================
1. Definitions
--------------
1.1. "Contributor"
means each individual or legal entity that creates, contributes to the creation
of, or owns Covered Software.
1.2. "Contributor Version"
means the combination of the Contributions of others (if any) used by a
Contributor and that particular Contributor's Contribution.
1.3. "Contribution"
means Covered Software of a particular Contributor.
1.4. "Covered Software"
means Source Code Form to which the initial Contributor has attached the notice
in Exhibit A, the Executable Form of such Source Code Form, and Modifications
of such Source Code Form, in each case including portions thereof.
1.5. "Incompatible With Secondary Licenses"
means
a.
that the initial Contributor has attached the notice described in Exhibit B
to the Covered Software; or
b.
that the Covered Software was made available under the terms of version 1.1
or earlier of the License, but not also under the terms of a Secondary
License.
1.6. "Executable Form"
means any form of the work other than Source Code Form.
1.7. "Larger Work"
means a work that combines Covered Software with other material, in a separate
file or files, that is not Covered Software.
1.8. "License"
means this document.
1.9. "Licensable"
means having the right to grant, to the maximum extent possible, whether at the
time of the initial grant or subsequently, any and all of the rights conveyed
by this License.
1.10. "Modifications"
means any of the following:
a.
any file in Source Code Form that results from an addition to, deletion
from, or modification of the contents of Covered Software; or
b.
any new file in Source Code Form that contains any Covered Software.
1.11. "Patent Claims" of a Contributor
means any patent claim(s), including without limitation, method, process, and
apparatus claims, in any patent Licensable by such Contributor that would be
infringed, but for the grant of the License, by the making, using, selling,
offering for sale, having made, import, or transfer of either its Contributions
or its Contributor Version.
1.12. "Secondary License"
means either the GNU General Public License, Version 2.0, the GNU Lesser
General Public License, Version 2.1, the GNU Affero General Public License,
Version 3.0, or any later versions of those licenses.
1.13. "Source Code Form"
means the form of the work preferred for making modifications.
1.14. "You" (or "Your")
means an individual or a legal entity exercising rights under this License. For
legal entities, "You" includes any entity that controls, is controlled by, or
is under common control with You. For purposes of this definition, "control"
means (a) the power, direct or indirect, to cause the direction or management
of such entity, whether by contract or otherwise, or (b) ownership of more than
fifty percent (50%) of the outstanding shares or beneficial ownership of such
entity.
2. License Grants and Conditions
--------------------------------
2.1. Grants
Each Contributor hereby grants You a world-wide, royalty-free, non-exclusive
license:
a.
under intellectual property rights (other than patent or trademark)
Licensable by such Contributor to use, reproduce, make available, modify,
display, perform, distribute, and otherwise exploit its Contributions,
either on an unmodified basis, with Modifications, or as part of a Larger
Work; and
b.
under Patent Claims of such Contributor to make, use, sell, offer for sale,
have made, import, and otherwise transfer either its Contributions or its
Contributor Version.
2.2. Effective Date
The licenses granted in Section 2.1 with respect to any Contribution become
effective for each Contribution on the date the Contributor first distributes
such Contribution.
2.3. Limitations on Grant Scope
The licenses granted in this Section 2 are the only rights granted under this
License. No additional rights or licenses will be implied from the distribution
or licensing of Covered Software under this License. Notwithstanding
Section 2.1(b) above, no patent license is granted by a Contributor:
a.
for any code that a Contributor has removed from Covered Software; or
b.
for infringements caused by: (i) Your and any other third party's
modifications of Covered Software, or (ii) the combination of its
Contributions with other software (except as part of its Contributor
Version); or
c.
under Patent Claims infringed by Covered Software in the absence of its
Contributions.
This License does not grant any rights in the trademarks, service marks, or
logos of any Contributor (except as may be necessary to comply with the notice
requirements in Section 3.4).
2.4. Subsequent Licenses
No Contributor makes additional grants as a result of Your choice to distribute
the Covered Software under a subsequent version of this License (see
Section 10.2) or under the terms of a Secondary License (if permitted under the
terms of Section 3.3).
2.5. Representation
Each Contributor represents that the Contributor believes its Contributions are
its original creation(s) or it has sufficient rights to grant the rights to its
Contributions conveyed by this License.
2.6. Fair Use
This License is not intended to limit any rights You have under applicable
copyright doctrines of fair use, fair dealing, or other equivalents.
2.7. Conditions
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in
Section 2.1.
3. Responsibilities
-------------------
3.1. Distribution of Source Form
All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under the
terms of this License. You must inform recipients that the Source Code Form of
the Covered Software is governed by the terms of this License, and how they can
obtain a copy of this License. You may not attempt to alter or restrict the
recipients' rights in the Source Code Form.
3.2. Distribution of Executable Form
If You distribute Covered Software in Executable Form then:
a.
such Covered Software must also be made available in Source Code Form, as
described in Section 3.1, and You must inform recipients of the Executable
Form how they can obtain a copy of such Source Code Form by reasonable
means in a timely manner, at a charge no more than the cost of distribution
to the recipient; and
b.
You may distribute such Executable Form under the terms of this License, or
sublicense it under different terms, provided that the license for the
Executable Form does not attempt to limit or alter the recipients' rights
in the Source Code Form under this License.
3.3. Distribution of a Larger Work
You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for the
Covered Software. If the Larger Work is a combination of Covered Software with
a work governed by one or more Secondary Licenses, and the Covered Software is
not Incompatible With Secondary Licenses, this License permits You to
additionally distribute such Covered Software under the terms of such Secondary
License(s), so that the recipient of the Larger Work may, at their option,
further distribute the Covered Software under the terms of either this License
or such Secondary License(s).
3.4. Notices
You may not remove or alter the substance of any license notices (including
copyright notices, patent notices, disclaimers of warranty, or limitations of
liability) contained within the Source Code Form of the Covered Software,
except that You may alter any license notices to the extent required to remedy
known factual inaccuracies.
3.5. Application of Additional Terms
You may choose to offer, and to charge a fee for, warranty, support, indemnity
or liability obligations to one or more recipients of Covered Software.
However, You may do so only on Your own behalf, and not on behalf of any
Contributor. You must make it absolutely clear that any such warranty, support,
indemnity, or liability obligation is offered by You alone, and You hereby
agree to indemnify every Contributor for any liability incurred by such
Contributor as a result of warranty, support, indemnity or liability terms You
offer. You may include additional disclaimers of warranty and limitations of
liability specific to any jurisdiction.
4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------
If it is impossible for You to comply with any of the terms of this License with
respect to some or all of the Covered Software due to statute, judicial order, or
regulation then You must: (a) comply with the terms of this License to the
maximum extent possible; and (b) describe the limitations and the code they
affect. Such description must be placed in a text file included with all
distributions of the Covered Software under this License. Except to the extent
prohibited by statute or regulation, such description must be sufficiently
detailed for a recipient of ordinary skill to be able to understand it.
5. Termination
--------------
5.1. The rights granted under this License will terminate automatically if You
fail to comply with any of its terms. However, if You become compliant, then
the rights granted under this License from a particular Contributor are
reinstated (a) provisionally, unless and until such Contributor explicitly and
finally terminates Your grants, and (b) on an ongoing basis, if such
Contributor fails to notify You of the non-compliance by some reasonable means
prior to 60 days after You have come back into compliance. Moreover, Your
grants from a particular Contributor are reinstated on an ongoing basis if such
Contributor notifies You of the non-compliance by some reasonable means, this
is the first time You have received notice of non-compliance with this License
from such Contributor, and You become compliant prior to 30 days after Your
receipt of the notice.
5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions, counter-claims, and
cross-claims) alleging that a Contributor Version directly or indirectly
infringes any patent, then the rights granted to You by any and all
Contributors for the Covered Software under Section 2.1 of this License shall
terminate.
5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user
license agreements (excluding distributors and resellers) which have been
validly granted by You or Your distributors under this License prior to
termination shall survive termination.
6. Disclaimer of Warranty
-------------------------
Covered Software is provided under this License on an "as is" basis, without
warranty of any kind, either expressed, implied, or statutory, including, without
limitation, warranties that the Covered Software is free of defects,
merchantable, fit for a particular purpose or non-infringing. The entire risk as
to the quality and performance of the Covered Software is with You. Should any
Covered Software prove defective in any respect, You (not any Contributor) assume
the cost of any necessary servicing, repair, or correction. This disclaimer of
warranty constitutes an essential part of this License. No use of any Covered
Software is authorized under this License except under this disclaimer.
7. Limitation of Liability
--------------------------
Under no circumstances and under no legal theory, whether tort (including
negligence), contract, or otherwise, shall any Contributor, or anyone who
distributes Covered Software as permitted above, be liable to You for any direct,
indirect, special, incidental, or consequential damages of any character
including, without limitation, damages for lost profits, loss of goodwill, work
stoppage, computer failure or malfunction, or any and all other commercial
damages or losses, even if such party shall have been informed of the possibility
of such damages. This limitation of liability shall not apply to liability for
death or personal injury resulting from such party's negligence to the extent
applicable law prohibits such limitation. Some jurisdictions do not allow the
exclusion or limitation of incidental or consequential damages, so this exclusion
and limitation may not apply to You.
8. Litigation
-------------
Any litigation relating to this License may be brought only in the courts of a
jurisdiction where the defendant maintains its principal place of business and
such litigation shall be governed by laws of that jurisdiction, without reference
to its conflict-of-law provisions. Nothing in this Section shall prevent a
party's ability to bring cross-claims or counter-claims.
9. Miscellaneous
----------------
This License represents the complete agreement concerning the subject matter
hereof. If any provision of this License is held to be unenforceable, such
provision shall be reformed only to the extent necessary to make it enforceable.
Any law or regulation which provides that the language of a contract shall be
construed against the drafter shall not be used to construe this License against
a Contributor.
10. Versions of the License
---------------------------
10.1. New Versions
Mozilla Foundation is the license steward. Except as provided in Section 10.3,
no one other than the license steward has the right to modify or publish new
versions of this License. Each version will be given a distinguishing version
number.
10.2. Effect of New Versions
You may distribute the Covered Software under the terms of the version of the
License under which You originally received the Covered Software, or under the
terms of any subsequent version published by the license steward.
10.3. Modified Versions
If you create software not governed by this License, and you want to create a
new license for such software, you may create and use a modified version of
this License if you rename the license and remove any references to the name of
the license steward (except to note that such modified license differs from
this License).
10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses
If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the notice
described in Exhibit B of this License must be attached.
Exhibit A - Source Code Form License Notice
-------------------------------------------
This Source Code Form is subject to the terms of the Mozilla Public License,
v. 2.0. If a copy of the MPL was not distributed with this file, You can
obtain one at http://mozilla.org/MPL/2.0/.
If it is not possible or desirable to put the notice in a particular file, then
You may include the notice in a location (such as a LICENSE file in a relevant
directory) where a recipient would be likely to look for such a notice.
You may add additional accurate notices of copyright ownership.
Exhibit B - "Incompatible With Secondary Licenses" Notice
---------------------------------------------------------
This Source Code Form is "Incompatible With Secondary Licenses", as defined
by the Mozilla Public License, v. 2.0.
================================================
FILE: THIRD_PARTY/licenses/PSF-2.0.txt
================================================
PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
============================================
--------------------------------------------
1. This LICENSE AGREEMENT is between the Python Software Foundation ("PSF"),
and the Individual or Organization ("Licensee") accessing and otherwise using
this software ("Python") in source or binary form and its associated
documentation.
2. Subject to the terms and conditions of this License Agreement, PSF hereby
grants Licensee a nonexclusive, royalty-free, world-wide license to
reproduce, analyze, test, perform and/or display publicly, prepare derivative
works, distribute, and otherwise use Python alone or in any derivative
version, provided, however, that PSF's License Agreement and PSF's notice of
copyright, i.e., "Copyright (c) 2001, 2002, 2003, 2004 Python Software
Foundation; All Rights Reserved" are retained in Python alone or in any
derivative version prepared by Licensee.
3. In the event Licensee prepares a derivative work that is based on or
incorporates Python or any part thereof, and wants to make the derivative
work available to others as provided herein, then Licensee hereby agrees to
include in any such work a brief summary of the changes made to Python.
4. PSF is making Python available to Licensee on an "AS IS" basis. PSF MAKES
NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT
NOT LIMITATION, PSF MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF
MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF
PYTHON WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON FOR ANY
INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF
MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, OR ANY DERIVATIVE
THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
6. This License Agreement will automatically terminate upon a material breach
of its terms and conditions.
7. Nothing in this License Agreement shall be deemed to create any
relationship of agency, partnership, or joint venture between PSF and
Licensee. This License Agreement does not grant permission to use PSF
trademarks or trade name in a trademark sense to endorse or promote products
or services of Licensee, or any third party.
8. By copying, installing or otherwise using Python, Licensee agrees to be
bound by the terms and conditions of this License Agreement.
================================================
FILE: THIRD_PARTY/licenses/Plexus Classworlds License.txt
================================================
Plexus Classworlds License
==========================
Copyright 2002 (C) The Codehaus. All Rights Reserved.
Redistribution and use of this software and associated documentation
("Software"), with or without modification, are permitted provided that the
following conditions are met:
1. Redistributions of source code must retain copyright statements and notices.
Redistributions must also contain a copy of this document.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. The name "classworlds" must not be used to endorse or promote products
derived from this Software without prior written permission of The Codehaus.
For written permission, please contact bob@codehaus.org.
4. Products derived from this Software may not be called "classworlds" nor may
"classworlds" appear in their names without prior written permission of The
Codehaus. "classworlds" is a registered trademark of The Codehaus.
5. Due credit should be given to The Codehaus.
(http://classworlds.codehaus.org/).
THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS "AS IS" AND ANY
EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: build-scripts/fetch_shaded_jar.py
================================================
"""
Finds and copies the latest shaded JAR from the Java build to the Python package source.
This script is intended to be run from the monorepo root, typically as part of a
CI/CD pipeline, before the Python package is built.
"""
import argparse
import logging
import re
import shutil
import sys
from pathlib import Path
from typing import Optional
# Requires 'packaging' library (pip install packaging)
from packaging.version import parse as parse_version
def find_latest_jar_by_semver(target_dir: Path) -> Optional[Path]:
    """Finds the shaded JAR with the highest semantic version in its filename.

    Args:
        target_dir: Directory searched (non-recursively) for files named
            ``opendataloader-pdf-runtime-<version>.jar``.

    Returns:
        The path whose ``<version>`` part parses to the highest version, or
        ``None`` when no candidate carries a parseable version. Files whose
        name contains ``original`` are never considered.
    """
    # Example filename: opendataloader-pdf-runtime-0.1.0.jar
    jar_pattern = "opendataloader-pdf-runtime-*.jar"
    version_regex = re.compile(r"opendataloader-pdf-runtime-(.+?)\.jar")

    # Exclude Maven's 'original' JARs to ensure we get the shaded (fat) JAR.
    # Sorting makes the winner independent of directory listing order when two
    # filenames parse to equal versions (e.g. "1.0" and "1.0.0").
    potential_jars = sorted(p for p in target_dir.glob(jar_pattern) if 'original' not in p.name)
    if not potential_jars:
        return None

    # No "0.0.0" sentinel: with a strict '>' comparison it would make a JAR
    # versioned exactly 0.0.0 (or a pre-release below it) impossible to select.
    latest_version = None
    latest_jar_path = None

    # Iterate through potential JARs to find the one with the highest version number.
    for jar_path in potential_jars:
        match = version_regex.search(jar_path.name)
        if not match:
            continue
        try:
            current_version = parse_version(match.group(1))
            if latest_version is None or current_version > latest_version:
                latest_version = current_version
                latest_jar_path = jar_path
        except Exception:
            # Ignore files with non-parseable version strings.
            continue

    return latest_jar_path
def main():
    """Command-line entry point: copy the newest shaded JAR into the Python package.

    Takes two positional arguments, the Java module's ``target`` directory and
    the destination directory for JARs. The destination is created if needed
    and the selected JAR is always stored there under the name ``runtime.jar``.
    Exits through ``argparse`` error handling when the source directory is
    missing or contains no versioned shaded JAR.
    """
    logging.basicConfig(stream=sys.stdout, format='%(levelname)s: %(message)s', level=logging.INFO)

    cli = argparse.ArgumentParser(description="Copies the latest shaded JAR to the Python source tree.")
    positional_args = (
        ("java_target_dir", "Path to the Java module's 'target' directory."),
        ("python_jars_dir", "Path to the Python package's destination directory for JARs."),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, type=Path, help=arg_help)
    parsed = cli.parse_args()

    build_dir: Path = parsed.java_target_dir.resolve()
    jars_dir: Path = parsed.python_jars_dir.resolve()

    if not build_dir.is_dir():
        cli.error(f"Java target directory not found: {build_dir}")

    # The destination is created before the JAR lookup, so it exists even when
    # the lookup below fails.
    jars_dir.mkdir(parents=True, exist_ok=True)

    newest_jar = find_latest_jar_by_semver(build_dir)
    if not newest_jar:
        cli.error(f"No versioned shaded JAR found in: {build_dir}")

    # A fixed file name lets the Python package load the JAR without knowing its version.
    target_jar = jars_dir.joinpath('runtime.jar')
    shutil.copy2(newest_jar, target_jar)
    logging.info(f"Copied '{newest_jar.name}' to '{target_jar}'")


if __name__ == "__main__":
    main()
================================================
FILE: build-scripts/set_version.py
================================================
# build-scripts/set_version.py
import os
import re
import sys
def set_version(version_file, pom_file, pyproject_toml_file):
    """Copy the version string from ``version_file`` into the Maven POM and pyproject.toml.

    Args:
        version_file: Path to a text file whose stripped content is the version.
        pom_file: Path to the Maven POM; its first ``<version>`` element is rewritten.
            NOTE(review): assumes the project's own version is the first
            ``<version>`` element (i.e. it precedes any parent/dependency
            versions) - confirm against the actual POM.
        pyproject_toml_file: Path to the pyproject.toml; the first line of the
            form ``version = "..."`` is rewritten.

    Raises:
        ValueError: If the version file is empty, or if either target file has
            no version entry to update. Nothing is written in that case.
    """
    with open(version_file, 'r', encoding='utf-8') as f:
        version = f.read().strip()
    if not version:
        raise ValueError(f"Version file is empty: {version_file}")

    # Compute both replacements before writing anything, so a failure cannot
    # leave one file updated and the other stale.
    with open(pom_file, 'r', encoding='utf-8') as f:
        pom_content = f.read()
    # A callable replacement inserts the version literally; a replacement
    # *string* would have backslashes and group references interpreted by re.
    pom_content, pom_hits = re.subn(
        r'<version>[^<]*</version>',
        lambda _match: f'<version>{version}</version>',
        pom_content,
        count=1,
    )
    if pom_hits == 0:
        raise ValueError(f"No <version> element found in {pom_file}")

    with open(pyproject_toml_file, 'r', encoding='utf-8') as f:
        pyproject_content = f.read()
    # Anchor to the start of a line so keys that merely end in "version"
    # (e.g. target-version = "py38") are never rewritten by mistake.
    pyproject_content, pyproject_hits = re.subn(
        r'^([ \t]*version[ \t]*=[ \t]*)"[^"\n]*"',
        lambda match: f'{match.group(1)}"{version}"',
        pyproject_content,
        count=1,
        flags=re.MULTILINE,
    )
    if pyproject_hits == 0:
        raise ValueError(f'No version = "..." entry found in {pyproject_toml_file}')

    # Update Maven POM
    with open(pom_file, 'w', encoding='utf-8') as f:
        f.write(pom_content)
    print(f"Updated Maven POM version to {version}")

    # Update Python pyproject.toml
    with open(pyproject_toml_file, 'w', encoding='utf-8') as f:
        f.write(pyproject_content)
    print(f"Updated Python pyproject.toml version to {version}")
# Script entry point: locate VERSION, the Java POM and the Python pyproject.toml
# relative to the monorepo root, verify that each exists, then propagate the version.
if __name__ == "__main__":
    # Paths are relative to the monorepo root
    root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    version_path = os.path.join(root_dir, 'VERSION')
    java_pom_path = os.path.join(root_dir, 'java', 'pom.xml')
    # The Python package lives in python/opendataloader-pdf/ in this repository;
    # the former 'python/packages/opendataloader_pdf' location does not exist.
    python_pyproject_path = os.path.join(root_dir, 'python', 'opendataloader-pdf', 'pyproject.toml')

    required_files = (
        ("VERSION file", version_path),
        ("Java pom.xml", java_pom_path),
        ("Python pyproject.toml", python_pyproject_path),
    )
    for label, path in required_files:
        if not os.path.exists(path):
            # Diagnostics go to stderr so they are not mixed into captured stdout.
            print(f"Error: {label} not found at {path}", file=sys.stderr)
            sys.exit(1)

    set_version(version_path, java_pom_path, python_pyproject_path)
================================================
FILE: content/docs/_generated/node-convert-options.mdx
================================================
---
title: Node.js Convert Options
description: Options for the Node.js convert function
---
{/* AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY */}
{/* Run `npm run generate-options` to regenerate */}
| Option | Type | Default | Description |
|-------------------------|----------------------|--------------|------------------------------------------------------------------------------------------------------------------------------------|
| `outputDir` | `string` | - | Directory where output files are written. Default: input file directory |
| `password` | `string` | - | Password for encrypted PDF files |
| `format` | `string \| string[]` | - | Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json |
| `quiet` | `boolean` | `false` | Suppress console logging output |
| `contentSafetyOff` | `string \| string[]` | - | Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg |
| `sanitize` | `boolean` | `false` | Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders |
| `keepLineBreaks` | `boolean` | `false` | Preserve original line breaks in extracted text |
| `replaceInvalidChars` | `string` | `" "` | Replacement character for invalid/unrecognized characters. Default: space |
| `useStructTree` | `boolean` | `false` | Use PDF structure tree (tagged PDF) for reading order and semantic structure |
| `tableMethod` | `string` | `"default"` | Table detection method. Values: default (border-based), cluster (border + cluster). Default: default |
| `readingOrder` | `string` | `"xycut"` | Reading order algorithm. Values: off, xycut. Default: xycut |
| `markdownPageSeparator`
gitextract_cwg3t_9k/
├── .editorconfig
├── .gitattributes
├── .github/
│ ├── CODEOWNERS
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ ├── config.yml
│ │ ├── feature_request.md
│ │ └── question.md
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── SECURITY.md
│ └── workflows/
│ ├── release.yml
│ ├── sync-docs.yml
│ └── test-benchmark.yml
├── .gitignore
├── CHANGELOG.md
├── CLAUDE.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── LICENSE_TEMPLATE/
│ └── license.txt
├── NOTICE
├── README.md
├── SUPPORT.md
├── THIRD_PARTY/
│ ├── THIRD_PARTY_LICENSES.md
│ ├── THIRD_PARTY_NOTICES.md
│ └── licenses/
│ ├── BSD-2-Clause.txt
│ ├── BSD-3-Clause.txt
│ ├── Blue-Oak-1.0.0.txt
│ ├── CDDL-1.1.txt
│ ├── EDL-1.0.txt
│ ├── EPL-1.0.txt
│ ├── EPL-2.0.txt
│ ├── ISC.txt
│ ├── LICENSE-JJ2000.txt
│ ├── MIT.txt
│ ├── MPL-2.0.txt
│ ├── PSF-2.0.txt
│ └── Plexus Classworlds License.txt
├── build-scripts/
│ ├── fetch_shaded_jar.py
│ └── set_version.py
├── content/
│ └── docs/
│ ├── _generated/
│ │ ├── node-convert-options.mdx
│ │ └── python-convert-options.mdx
│ ├── accessibility-compliance.mdx
│ ├── accessibility-glossary.mdx
│ ├── ai-safety.mdx
│ ├── benchmark/
│ │ ├── index.mdx
│ │ ├── meta.json
│ │ ├── mhs.mdx
│ │ ├── nid.mdx
│ │ ├── speed.mdx
│ │ └── teds.mdx
│ ├── cli-options-reference.mdx
│ ├── community.mdx
│ ├── contributing.mdx
│ ├── development-workflow.mdx
│ ├── faq.mdx
│ ├── hybrid-mode.mdx
│ ├── index.mdx
│ ├── json-schema.mdx
│ ├── license.mdx
│ ├── meta.json
│ ├── quick-start-java.mdx
│ ├── quick-start-nodejs.mdx
│ ├── quick-start-python.mdx
│ ├── rag-integration.mdx
│ ├── reading-order.mdx
│ ├── tagged-pdf-collaboration.mdx
│ ├── tagged-pdf-rag.mdx
│ ├── tagged-pdf.mdx
│ ├── upcoming-roadmap.mdx
│ └── whats-new-v2.mdx
├── docs/
│ ├── hybrid/
│ │ ├── docling-speed-optimization-plan.md
│ │ ├── experiments/
│ │ │ ├── chunking_strategy/
│ │ │ │ ├── conclusion.json
│ │ │ │ ├── docling_benchmark_report.json
│ │ │ │ └── docling_page_range_benchmark.py
│ │ │ ├── speed/
│ │ │ │ ├── baseline_results.json
│ │ │ │ ├── fastapi_results.json
│ │ │ │ ├── speed-experiment-2026-01-03.md
│ │ │ │ └── subprocess_results.json
│ │ │ └── triage/
│ │ │ └── triage-experiments.md
│ │ ├── hybrid-mode-design.md
│ │ ├── hybrid-mode-tasks.md
│ │ └── research/
│ │ ├── comparison-summary.md
│ │ ├── docling-openapi.json
│ │ ├── docling-sample-response-lorem.json
│ │ ├── docling-sample-response.json
│ │ ├── documents-with-tables.txt
│ │ ├── iobject-structure.md
│ │ ├── opendataloader-sample-response.json
│ │ └── opendataloader-sample-response.md
│ └── superpowers/
│ ├── plans/
│ │ └── 2026-03-16-cid-font-detection.md
│ └── specs/
│ └── 2026-03-16-cid-font-detection-design.md
├── examples/
│ └── python/
│ ├── batch/
│ │ ├── README.md
│ │ ├── batch_processing.py
│ │ └── requirements.txt
│ └── rag/
│ ├── README.md
│ ├── basic_chunking.py
│ ├── langchain_example.py
│ └── requirements.txt
├── java/
│ ├── .run/
│ │ └── OpenDataLoaderCli.run.xml
│ ├── checkstyle.xml
│ ├── opendataloader-pdf-cli/
│ │ ├── pom.xml
│ │ └── src/
│ │ ├── main/
│ │ │ └── java/
│ │ │ └── org/
│ │ │ └── opendataloader/
│ │ │ └── pdf/
│ │ │ └── cli/
│ │ │ ├── CLIMain.java
│ │ │ └── CLIOptions.java
│ │ └── test/
│ │ └── java/
│ │ └── org/
│ │ └── opendataloader/
│ │ └── pdf/
│ │ └── cli/
│ │ ├── CLIMainTest.java
│ │ ├── CLIOptionsContentSafetyTest.java
│ │ └── CLIOptionsTest.java
│ ├── opendataloader-pdf-core/
│ │ ├── pom.xml
│ │ └── src/
│ │ ├── main/
│ │ │ └── java/
│ │ │ └── org/
│ │ │ └── opendataloader/
│ │ │ └── pdf/
│ │ │ ├── api/
│ │ │ │ ├── Config.java
│ │ │ │ ├── FilterConfig.java
│ │ │ │ └── OpenDataLoaderPDF.java
│ │ │ ├── containers/
│ │ │ │ └── StaticLayoutContainers.java
│ │ │ ├── entities/
│ │ │ │ ├── SemanticFormula.java
│ │ │ │ └── SemanticPicture.java
│ │ │ ├── html/
│ │ │ │ ├── HtmlGenerator.java
│ │ │ │ ├── HtmlGeneratorFactory.java
│ │ │ │ └── HtmlSyntax.java
│ │ │ ├── hybrid/
│ │ │ │ ├── DoclingFastServerClient.java
│ │ │ │ ├── DoclingSchemaTransformer.java
│ │ │ │ ├── HancomClient.java
│ │ │ │ ├── HancomSchemaTransformer.java
│ │ │ │ ├── HybridClient.java
│ │ │ │ ├── HybridClientFactory.java
│ │ │ │ ├── HybridConfig.java
│ │ │ │ ├── HybridSchemaTransformer.java
│ │ │ │ ├── TriageLogger.java
│ │ │ │ └── TriageProcessor.java
│ │ │ ├── json/
│ │ │ │ ├── JsonName.java
│ │ │ │ ├── JsonWriter.java
│ │ │ │ ├── ObjectMapperHolder.java
│ │ │ │ └── serializers/
│ │ │ │ ├── CaptionSerializer.java
│ │ │ │ ├── DoubleSerializer.java
│ │ │ │ ├── FormulaSerializer.java
│ │ │ │ ├── HeaderFooterSerializer.java
│ │ │ │ ├── HeadingSerializer.java
│ │ │ │ ├── ImageSerializer.java
│ │ │ │ ├── LineChunkSerializer.java
│ │ │ │ ├── ListItemSerializer.java
│ │ │ │ ├── ListSerializer.java
│ │ │ │ ├── ParagraphSerializer.java
│ │ │ │ ├── PictureSerializer.java
│ │ │ │ ├── SemanticTextNodeSerializer.java
│ │ │ │ ├── SerializerUtil.java
│ │ │ │ ├── TableCellSerializer.java
│ │ │ │ ├── TableRowSerializer.java
│ │ │ │ ├── TableSerializer.java
│ │ │ │ ├── TextChunkSerializer.java
│ │ │ │ └── TextLineSerializer.java
│ │ │ ├── markdown/
│ │ │ │ ├── MarkdownGenerator.java
│ │ │ │ ├── MarkdownGeneratorFactory.java
│ │ │ │ ├── MarkdownHTMLGenerator.java
│ │ │ │ └── MarkdownSyntax.java
│ │ │ ├── pdf/
│ │ │ │ ├── PDFLayer.java
│ │ │ │ └── PDFWriter.java
│ │ │ ├── processors/
│ │ │ │ ├── AbstractTableProcessor.java
│ │ │ │ ├── CaptionProcessor.java
│ │ │ │ ├── ClusterTableProcessor.java
│ │ │ │ ├── ContentFilterProcessor.java
│ │ │ │ ├── DocumentProcessor.java
│ │ │ │ ├── HeaderFooterProcessor.java
│ │ │ │ ├── HeadingProcessor.java
│ │ │ │ ├── HiddenTextProcessor.java
│ │ │ │ ├── HybridDocumentProcessor.java
│ │ │ │ ├── LevelProcessor.java
│ │ │ │ ├── ListProcessor.java
│ │ │ │ ├── ParagraphProcessor.java
│ │ │ │ ├── SpecialTableProcessor.java
│ │ │ │ ├── StrikethroughProcessor.java
│ │ │ │ ├── TableBorderProcessor.java
│ │ │ │ ├── TableStructureNormalizer.java
│ │ │ │ ├── TaggedDocumentProcessor.java
│ │ │ │ ├── TextLineProcessor.java
│ │ │ │ ├── TextProcessor.java
│ │ │ │ └── readingorder/
│ │ │ │ └── XYCutPlusPlusSorter.java
│ │ │ ├── text/
│ │ │ │ └── TextGenerator.java
│ │ │ └── utils/
│ │ │ ├── Base64ImageUtils.java
│ │ │ ├── BulletedParagraphUtils.java
│ │ │ ├── ContentSanitizer.java
│ │ │ ├── ImagesUtils.java
│ │ │ ├── ModeWeightStatistics.java
│ │ │ ├── SanitizationRule.java
│ │ │ ├── TextNodeStatistics.java
│ │ │ ├── TextNodeStatisticsConfig.java
│ │ │ ├── TextNodeUtils.java
│ │ │ └── levels/
│ │ │ ├── LevelInfo.java
│ │ │ ├── LineArtBulletParagraphLevelInfo.java
│ │ │ ├── ListLevelInfo.java
│ │ │ ├── TableLevelInfo.java
│ │ │ └── TextBulletParagraphLevelInfo.java
│ │ └── test/
│ │ ├── java/
│ │ │ └── org/
│ │ │ └── opendataloader/
│ │ │ └── pdf/
│ │ │ ├── EmbedImagesIntegrationTest.java
│ │ │ ├── ImageDirIntegrationTest.java
│ │ │ ├── IntegrationTest.java
│ │ │ ├── Issue336IntegrationTest.java
│ │ │ ├── PageSeparatorIntegrationTest.java
│ │ │ ├── PagesOptionIntegrationTest.java
│ │ │ ├── api/
│ │ │ │ ├── ConfigTest.java
│ │ │ │ └── FilterConfigTest.java
│ │ │ ├── containers/
│ │ │ │ └── StaticLayoutContainersTest.java
│ │ │ ├── hybrid/
│ │ │ │ ├── DoclingFastServerClientTest.java
│ │ │ │ ├── DoclingSchemaTransformerTest.java
│ │ │ │ ├── HancomClientTest.java
│ │ │ │ ├── HancomSchemaTransformerTest.java
│ │ │ │ ├── HealthCheckTest.java
│ │ │ │ ├── HybridClientFactoryTest.java
│ │ │ │ ├── TriageLoggerTest.java
│ │ │ │ ├── TriageProcessorIntegrationTest.java
│ │ │ │ └── TriageProcessorTest.java
│ │ │ ├── json/
│ │ │ │ └── serializers/
│ │ │ │ ├── ImageSerializerTest.java
│ │ │ │ └── LineArtSerializerTest.java
│ │ │ ├── markdown/
│ │ │ │ ├── MarkdownGeneratorTest.java
│ │ │ │ └── MarkdownTableTest.java
│ │ │ ├── processors/
│ │ │ │ ├── CaptionProcessorTest.java
│ │ │ │ ├── CidFontDetectionTest.java
│ │ │ │ ├── ContentFilterProcessorTest.java
│ │ │ │ ├── HeaderFooterProcessorTest.java
│ │ │ │ ├── HeadingProcessorTest.java
│ │ │ │ ├── HybridDocumentProcessorTest.java
│ │ │ │ ├── LevelProcessorTest.java
│ │ │ │ ├── ListProcessorTest.java
│ │ │ │ ├── ParagraphProcessorTest.java
│ │ │ │ ├── SpecialTableProcessorTest.java
│ │ │ │ ├── StrikethroughProcessorTest.java
│ │ │ │ ├── TableBorderProcessorTest.java
│ │ │ │ ├── TextLineProcessorTest.java
│ │ │ │ ├── TextProcessorTest.java
│ │ │ │ └── readingorder/
│ │ │ │ └── XYCutPlusPlusSorterTest.java
│ │ │ ├── regression/
│ │ │ │ └── ToUnicodeRegressionTest.java
│ │ │ └── utils/
│ │ │ ├── Base64ImageUtilsTest.java
│ │ │ ├── ContentSanitizerTest.java
│ │ │ ├── ImageFormatSupportTest.java
│ │ │ ├── ImagesUtilsTest.java
│ │ │ ├── ModeWeightStatisticsTest.java
│ │ │ └── TextNodeStatisticsTest.java
│ │ └── resources/
│ │ └── generate-cid-test-pdf.py
│ └── pom.xml
├── node/
│ └── opendataloader-pdf/
│ ├── .gitignore
│ ├── .npmrc
│ ├── .prettierrc.json
│ ├── eslint.config.js
│ ├── package.json
│ ├── scripts/
│ │ └── setup.cjs
│ ├── src/
│ │ ├── cli-options.generated.ts
│ │ ├── cli.ts
│ │ ├── convert-options.generated.ts
│ │ └── index.ts
│ ├── test/
│ │ ├── convert-options.test.ts
│ │ ├── convert.integration.test.ts
│ │ └── run.integration.test.ts
│ ├── tsconfig.json
│ ├── tsup.config.ts
│ └── vitest.config.ts
├── options.json
├── package.json
├── python/
│ └── opendataloader-pdf/
│ ├── .gitignore
│ ├── hatch_build.py
│ ├── pyproject.toml
│ ├── src/
│ │ └── opendataloader_pdf/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── cli_options_generated.py
│ │ ├── convert_generated.py
│ │ ├── hybrid_server.py
│ │ ├── runner.py
│ │ └── wrapper.py
│ └── tests/
│ ├── conftest.py
│ ├── test_cli_options.py
│ ├── test_convert_integration.py
│ ├── test_hybrid_server.py
│ ├── test_hybrid_server_nonblocking.py
│ ├── test_hybrid_server_partial_success.py
│ └── test_hybrid_server_unicode.py
├── samples/
│ └── json/
│ └── lorem.json
├── schema.json
└── scripts/
├── bench.sh
├── build-all.sh
├── build-java.sh
├── build-node.sh
├── build-python.sh
├── experiments/
│ ├── docling_baseline_bench.py
│ ├── docling_fastapi_bench.py
│ ├── docling_speed_report.py
│ └── docling_subprocess_bench.py
├── generate-options.mjs
├── generate-schema.mjs
├── run-cli.sh
├── test-java.sh
├── test-node.sh
├── test-python.sh
└── utils.mjs
SYMBOL INDEX (1511 symbols across 161 files)
FILE: build-scripts/fetch_shaded_jar.py
function find_latest_jar_by_semver (line 19) | def find_latest_jar_by_semver(target_dir: Path) -> Optional[Path]:
function main (line 50) | def main():
FILE: build-scripts/set_version.py
function set_version (line 7) | def set_version(version_file, pom_file, pyproject_toml_file):
FILE: docs/hybrid/experiments/chunking_strategy/docling_page_range_benchmark.py
class BenchmarkResult (line 31) | class BenchmarkResult:
function get_project_root (line 40) | def get_project_root() -> Path:
function create_converter (line 45) | def create_converter() -> DocumentConverter:
function convert_with_page_range (line 55) | def convert_with_page_range(
function pages_to_ranges (line 64) | def pages_to_ranges(pages: list[int]) -> list[tuple[int, int]]:
function run_benchmark_for_ranges (line 85) | def run_benchmark_for_ranges(
function get_chunks_for_pages (line 121) | def get_chunks_for_pages(
function run_scenario_benchmark (line 134) | def run_scenario_benchmark(
function main (line 210) | def main():
FILE: examples/python/batch/batch_processing.py
function batch_convert (line 26) | def batch_convert(pdf_paths: list[str], output_dir: str) -> list[Path]:
function convert_directory (line 38) | def convert_directory(directory: str, output_dir: str) -> list[Path]:
function summarize_results (line 49) | def summarize_results(json_files: list[Path]) -> None:
function main (line 71) | def main():
FILE: examples/python/rag/basic_chunking.py
function convert_pdf_to_json (line 22) | def convert_pdf_to_json(pdf_path: str, output_dir: str) -> Path:
function load_document (line 35) | def load_document(json_path: Path) -> dict:
function chunk_by_element (line 41) | def chunk_by_element(doc: dict) -> list[dict]:
function chunk_by_section (line 63) | def chunk_by_section(doc: dict) -> list[dict]:
function chunk_with_min_size (line 111) | def chunk_with_min_size(doc: dict, min_chars: int = 200) -> list[dict]:
function format_citation (line 155) | def format_citation(metadata: dict) -> str:
function main (line 170) | def main():
FILE: examples/python/rag/langchain_example.py
function main (line 18) | def main():
FILE: java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIMain.java
class CLIMain (line 29) | public class CLIMain {
method main (line 35) | public static void main(String[] args) {
method run (line 48) | static int run(String[] args) {
method configureLogging (line 97) | private static void configureLogging(boolean quiet) {
method processPath (line 112) | private static boolean processPath(File file, Config config) {
method processDirectory (line 125) | private static boolean processDirectory(File file, Config config) {
method processFile (line 145) | private static boolean processFile(File file, Config config) {
method isPdfFile (line 162) | private static boolean isPdfFile(File file) {
FILE: java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java
class CLIOptions (line 32) | public class CLIOptions {
method defineOptions (line 195) | public static Options defineOptions() {
method createConfigFromCommandLine (line 204) | public static Config createConfigFromCommandLine(CommandLine commandLi...
method applyImageOptions (line 272) | private static void applyImageOptions(Config config, CommandLine comma...
method applyPagesOption (line 306) | private static void applyPagesOption(Config config, CommandLine comman...
method applyTableMethodOption (line 312) | private static void applyTableMethodOption(Config config, CommandLine ...
method applyContentSafetyOption (line 330) | private static void applyContentSafetyOption(Config config, CommandLin...
method applySanitizeOption (line 380) | private static void applySanitizeOption(Config config, CommandLine com...
method applyFormatOption (line 386) | private static void applyFormatOption(Config config, CommandLine comma...
method parseOptionValues (line 436) | private static Set<String> parseOptionValues(String[] optionValues) {
method applyHybridOptions (line 453) | private static void applyHybridOptions(Config config, CommandLine comm...
method exportOptionsAsJson (line 519) | public static void exportOptionsAsJson(PrintStream out) {
method escapeJson (line 558) | private static String escapeJson(String value) {
class OptionDefinition (line 574) | private static class OptionDefinition {
method OptionDefinition (line 582) | OptionDefinition(String longName, String shortName, String type, Obj...
method toOption (line 593) | Option toOption() {
FILE: java/opendataloader-pdf-cli/src/test/java/org/opendataloader/pdf/cli/CLIMainTest.java
class CLIMainTest (line 27) | class CLIMainTest {
method testProcessingFailureReturnsNonZeroExitCode (line 42) | @Test
method testDirectoryWithFailingFileReturnsNonZeroExitCode (line 66) | @Test
method testNoArgumentsReturnsZero (line 86) | @Test
method testInvalidArgumentsReturnsExitCode2 (line 96) | @Test
method testNonExistentFileReturnsNonZeroExitCode (line 105) | @Test
FILE: java/opendataloader-pdf-cli/src/test/java/org/opendataloader/pdf/cli/CLIOptionsContentSafetyTest.java
class CLIOptionsContentSafetyTest (line 30) | class CLIOptionsContentSafetyTest {
method parseArgs (line 32) | private Config parseArgs(String... args) throws Exception {
method sanitizeFlagEnablesSensitiveDataFilter (line 39) | @Test
method defaultDoesNotEnableSensitiveDataFilter (line 46) | @Test
method sanitizeWithContentSafetyOffAllStillEnablesSanitize (line 53) | @Test
method contentSafetyOffAllDoesNotTouchSensitiveData (line 68) | @Test
method deprecatedSensitiveDataValueIsAccepted (line 75) | @Test
method deprecatedSensitiveDataValuePrintsWarning (line 82) | @Test
method sanitizeWithDeprecatedSensitiveDataStillEnablesSanitize (line 96) | @Test
FILE: java/opendataloader-pdf-cli/src/test/java/org/opendataloader/pdf/cli/CLIOptionsTest.java
class CLIOptionsTest (line 38) | class CLIOptionsTest {
method setUp (line 47) | @BeforeEach
method testDefineOptions_containsImageOutputOption (line 55) | @Test
method testDefineOptions_containsImageFormatOption (line 60) | @Test
method testCreateConfig_withImageOutputEmbedded (line 65) | @Test
method testCreateConfig_withImageOutputExternal (line 76) | @Test
method testCreateConfig_defaultImageOutput (line 87) | @Test
method testCreateConfig_withImageOutputOff (line 100) | @Test
method testCreateConfig_withValidImageFormat (line 112) | @ParameterizedTest
method testCreateConfig_withUppercaseImageFormat (line 123) | @Test
method testCreateConfig_withInvalidImageFormat (line 133) | @Test
method testCreateConfig_withEmptyImageFormat (line 143) | @Test
method testCreateConfig_withImageOutputAndImageFormat (line 153) | @Test
method testCreateConfig_imageFormatWithExternalOutput (line 164) | @Test
method testCreateConfig_withWebpImageFormat_shouldFail (line 175) | @Test
method testDefaultImageFormat (line 186) | @Test
method testCreateConfig_withInvalidImageOutput (line 196) | @Test
method testCreateConfig_withUppercaseImageOutput (line 206) | @Test
method testCreateConfig_defaultReadingOrder (line 216) | @Test
method testCreateConfig_withReadingOrderOff (line 227) | @Test
method testDefineOptions_containsPagesOption (line 239) | @Test
method testCreateConfig_withPages (line 244) | @Test
method testCreateConfig_withSinglePage (line 255) | @Test
method testCreateConfig_withPageRange (line 266) | @Test
method testCreateConfig_defaultPages (line 277) | @Test
method testCreateConfig_withInvalidPages (line 288) | @Test
method testCreateConfig_withReversePageRange (line 298) | @Test
method testDefineOptions_containsImageDirOption (line 310) | @Test
method testCreateConfig_withImageDir (line 315) | @Test
method testCreateConfig_defaultImageDir (line 326) | @Test
method testCreateConfig_withImageDirAndOutputDir (line 336) | @Test
method testCreateConfig_withEmptyImageDir (line 349) | @Test
method testCreateConfig_withWhitespaceImageDir (line 359) | @Test
method testDefineOptions_containsHybridModeOption (line 371) | @Test
method testDefineOptions_containsHybridOcrOption (line 376) | @Test
method testCreateConfig_withHybridModeAuto (line 382) | @Test
method testCreateConfig_withHybridModeFull (line 393) | @Test
method testCreateConfig_withInvalidHybridMode (line 404) | @Test
method testCreateConfig_withDeprecatedHybridOcr (line 414) | @Test
method testCreateConfig_defaultHybridMode (line 425) | @Test
method testCreateConfig_withDoclingBackend (line 435) | @Test
method testCreateConfig_defaultHybridFallbackIsFalse (line 446) | @Test
method testCreateConfig_withHybridFallbackExplicit (line 457) | @Test
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java
class Config (line 30) | public class Config {
method getFilterConfig (line 131) | public FilterConfig getFilterConfig() {
method Config (line 138) | public Config() {
method getPassword (line 146) | public String getPassword() {
method setPassword (line 155) | public void setPassword(String password) {
method isGenerateMarkdown (line 167) | public boolean isGenerateMarkdown() {
method setGenerateMarkdown (line 176) | public void setGenerateMarkdown(boolean generateMarkdown) {
method isGenerateHtml (line 185) | public boolean isGenerateHtml() {
method setGenerateHtml (line 194) | public void setGenerateHtml(boolean generateHtml) {
method isGeneratePDF (line 203) | public boolean isGeneratePDF() {
method setGeneratePDF (line 212) | public void setGeneratePDF(boolean generatePDF) {
method isKeepLineBreaks (line 221) | public boolean isKeepLineBreaks() {
method setKeepLineBreaks (line 230) | public void setKeepLineBreaks(boolean keepLineBreaks) {
method isGenerateJSON (line 239) | public boolean isGenerateJSON() {
method setGenerateJSON (line 248) | public void setGenerateJSON(boolean generateJSON) {
method isGenerateText (line 257) | public boolean isGenerateText() {
method setGenerateText (line 266) | public void setGenerateText(boolean generateText) {
method isUseHTMLInMarkdown (line 275) | public boolean isUseHTMLInMarkdown() {
method setUseHTMLInMarkdown (line 285) | public void setUseHTMLInMarkdown(boolean useHTMLInMarkdown) {
method isAddImageToMarkdown (line 294) | public boolean isAddImageToMarkdown() {
method setAddImageToMarkdown (line 304) | public void setAddImageToMarkdown(boolean addImageToMarkdown) {
method getOutputFolder (line 313) | public String getOutputFolder() {
method setOutputFolder (line 323) | public void setOutputFolder(String outputFolder) {
method getReplaceInvalidChars (line 332) | public String getReplaceInvalidChars() {
method setReplaceInvalidChars (line 341) | public void setReplaceInvalidChars(String replaceInvalidChars) {
method isUseStructTree (line 350) | public boolean isUseStructTree() {
method setUseStructTree (line 359) | public void setUseStructTree(boolean useStructTree) {
method isClusterTableMethod (line 368) | public boolean isClusterTableMethod() {
method getTableMethod (line 377) | public String getTableMethod() {
method setTableMethod (line 387) | public void setTableMethod(String tableMethod) {
method getTableMethodOptions (line 402) | public static String getTableMethodOptions(CharSequence delimiter) {
method isValidTableMethod (line 412) | public static boolean isValidTableMethod(String method) {
method getReadingOrder (line 421) | public String getReadingOrder() {
method setReadingOrder (line 431) | public void setReadingOrder(String readingOrder) {
method getReadingOrderOptions (line 446) | public static String getReadingOrderOptions(CharSequence delimiter) {
method isValidReadingOrder (line 456) | public static boolean isValidReadingOrder(String order) {
method getMarkdownPageSeparator (line 465) | public String getMarkdownPageSeparator() {
method setMarkdownPageSeparator (line 474) | public void setMarkdownPageSeparator(String markdownPageSeparator) {
method getTextPageSeparator (line 483) | public String getTextPageSeparator() {
method setTextPageSeparator (line 492) | public void setTextPageSeparator(String textPageSeparator) {
method getHtmlPageSeparator (line 501) | public String getHtmlPageSeparator() {
method setHtmlPageSeparator (line 510) | public void setHtmlPageSeparator(String htmlPageSeparator) {
method isEmbedImages (line 519) | public boolean isEmbedImages() {
method isImageOutputOff (line 528) | public boolean isImageOutputOff() {
method getImageOutput (line 537) | public String getImageOutput() {
method setImageOutput (line 547) | public void setImageOutput(String imageOutput) {
method getImageOutputOptions (line 562) | public static String getImageOutputOptions(CharSequence delimiter) {
method isValidImageOutput (line 572) | public static boolean isValidImageOutput(String mode) {
method getImageFormat (line 581) | public String getImageFormat() {
method setImageFormat (line 591) | public void setImageFormat(String imageFormat) {
method getImageFormatOptions (line 606) | public static String getImageFormatOptions(CharSequence delimiter) {
method isValidImageFormat (line 616) | public static boolean isValidImageFormat(String format) {
method getImageDir (line 625) | public String getImageDir() {
method setImageDir (line 635) | public void setImageDir(String imageDir) {
method getPages (line 652) | public String getPages() {
method setPages (line 662) | public void setPages(String pages) {
method getPageNumbers (line 676) | public List<Integer> getPageNumbers() {
method parsePageRanges (line 690) | private static List<Integer> parsePageRanges(String pages) {
method parseRange (line 710) | private static void parseRange(String range, String fullInput, List<In...
method parseSinglePage (line 737) | private static void parseSinglePage(String page, String fullInput, Lis...
method getHybrid (line 755) | public String getHybrid() {
method setHybrid (line 765) | public void setHybrid(String hybrid) {
method getHybridOptions (line 780) | public static String getHybridOptions(CharSequence delimiter) {
method isValidHybrid (line 790) | public static boolean isValidHybrid(String hybrid) {
method isHybridEnabled (line 799) | public boolean isHybridEnabled() {
method getHybridConfig (line 808) | public HybridConfig getHybridConfig() {
method getHybridModeOptions (line 818) | public static String getHybridModeOptions(CharSequence delimiter) {
method isValidHybridMode (line 828) | public static boolean isValidHybridMode(String mode) {
method isIncludeHeaderFooter (line 837) | public boolean isIncludeHeaderFooter() {
method setIncludeHeaderFooter (line 846) | public void setIncludeHeaderFooter(boolean includeHeaderFooter) {
method isDetectStrikethrough (line 850) | public boolean isDetectStrikethrough() {
method setDetectStrikethrough (line 854) | public void setDetectStrikethrough(boolean detectStrikethrough) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/FilterConfig.java
class FilterConfig (line 28) | public class FilterConfig {
method initializeDefaultRules (line 37) | private void initializeDefaultRules() {
method FilterConfig (line 83) | public FilterConfig() {
method setFilterHiddenText (line 93) | public void setFilterHiddenText(boolean filterHiddenText) {
method isFilterHiddenText (line 102) | public boolean isFilterHiddenText() {
method setFilterOutOfPage (line 111) | public void setFilterOutOfPage(boolean filterOutOfPage) {
method isFilterOutOfPage (line 120) | public boolean isFilterOutOfPage() {
method isFilterTinyText (line 129) | public boolean isFilterTinyText() {
method setFilterTinyText (line 138) | public void setFilterTinyText(boolean filterTinyText) {
method isFilterHiddenOCG (line 147) | public boolean isFilterHiddenOCG() {
method setFilterHiddenOCG (line 156) | public void setFilterHiddenOCG(boolean filterHiddenOCG) {
method isFilterSensitiveData (line 165) | public boolean isFilterSensitiveData() {
method setFilterSensitiveData (line 174) | public void setFilterSensitiveData(boolean filterSensitiveData) {
method getFilterRules (line 183) | public List<SanitizationRule> getFilterRules() {
method addFilterRule (line 193) | public void addFilterRule(String pattern, String replacement) {
method removeFilterRule (line 202) | public void removeFilterRule(String pattern) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/OpenDataLoaderPDF.java
class OpenDataLoaderPDF (line 27) | public final class OpenDataLoaderPDF {
method OpenDataLoaderPDF (line 29) | private OpenDataLoaderPDF() {
method processFile (line 39) | public static void processFile(String inputPdfName, Config config) thr...
method shutdown (line 49) | public static void shutdown() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/containers/StaticLayoutContainers.java
class StaticLayoutContainers (line 31) | public class StaticLayoutContainers {
method clearContainers (line 45) | public static void clearContainers() {
method getCurrentContentId (line 58) | public static long getCurrentContentId() {
method incrementContentId (line 62) | public static long incrementContentId() {
method setCurrentContentId (line 68) | public static void setCurrentContentId(long currentContentId) {
method getImagesDirectory (line 72) | public static String getImagesDirectory() {
method getImagesDirectoryName (line 76) | public static String getImagesDirectoryName() {
method setImagesDirectory (line 81) | public static void setImagesDirectory(String imagesDirectory) {
method getContrastRatioConsumer (line 85) | public static ContrastRatioConsumer getContrastRatioConsumer(String so...
method closeContrastRatioConsumer (line 97) | public static void closeContrastRatioConsumer() {
method getHeadings (line 108) | public static List<SemanticHeading> getHeadings() {
method setHeadings (line 112) | public static void setHeadings(List<SemanticHeading> headings) {
method isUseStructTree (line 116) | public static Boolean isUseStructTree() {
method setIsUseStructTree (line 120) | public static void setIsUseStructTree(Boolean isUseStructTree) {
method incrementImageIndex (line 124) | public static int incrementImageIndex() {
method resetImageIndex (line 130) | public static void resetImageIndex() {
method isEmbedImages (line 134) | public static boolean isEmbedImages() {
method setEmbedImages (line 138) | public static void setEmbedImages(boolean embedImages) {
method getImageFormat (line 142) | public static String getImageFormat() {
method setImageFormat (line 147) | public static void setImageFormat(String format) {
method setReplacementCharRatio (line 151) | public static void setReplacementCharRatio(int pageNumber, double rati...
method getReplacementCharRatio (line 155) | public static double getReplacementCharRatio(int pageNumber) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/entities/SemanticFormula.java
class SemanticFormula (line 29) | public class SemanticFormula extends BaseObject {
method SemanticFormula (line 39) | public SemanticFormula(BoundingBox boundingBox, String latex) {
method getLatex (line 49) | public String getLatex() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/entities/SemanticPicture.java
class SemanticPicture (line 30) | public class SemanticPicture extends BaseObject {
method SemanticPicture (line 41) | public SemanticPicture(BoundingBox boundingBox, int index) {
method SemanticPicture (line 52) | public SemanticPicture(BoundingBox boundingBox, int index, String desc...
method getPictureIndex (line 63) | public int getPictureIndex() {
method getDescription (line 72) | public String getDescription() {
method hasDescription (line 81) | public boolean hasDescription() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/html/HtmlGenerator.java
class HtmlGenerator (line 52) | public class HtmlGenerator implements Closeable {
method HtmlGenerator (line 85) | public HtmlGenerator(File inputPdf, Config config) throws IOException {
method writeToHtml (line 102) | public void writeToHtml(List<List<IObject>> contents) {
method writePageSeparator (line 129) | protected void writePageSeparator(int pageNumber) throws IOException {
method write (line 144) | protected void write(IObject object) throws IOException {
method writeHeaderOrFooter (line 181) | protected void writeHeaderOrFooter(SemanticHeaderOrFooter headerOrFoot...
method writeFormula (line 193) | protected void writeFormula(SemanticFormula formula) throws IOException {
method writeImage (line 207) | protected void writeImage(ImageChunk image) {
method writePicture (line 240) | protected void writePicture(SemanticPicture picture) {
method writeList (line 291) | protected void writeList(PDFList list) throws IOException {
method writeSemanticTextNode (line 317) | protected void writeSemanticTextNode(SemanticTextNode textNode) throws...
method writeTable (line 330) | protected void writeTable(TableBorder table) throws IOException {
method writeParagraph (line 373) | protected void writeParagraph(SemanticParagraph paragraph) throws IOEx...
method writeHeading (line 397) | protected void writeHeading(SemanticHeading heading) throws IOException {
method writeCellTag (line 405) | private void writeCellTag(TableBorderCell cell, boolean isHeader) thro...
method enterTable (line 424) | protected void enterTable() {
method leaveTable (line 431) | protected void leaveTable() {
method isInsideTable (line 442) | protected boolean isInsideTable() {
method getCorrectString (line 452) | protected String getCorrectString(String value) {
method escapeHtmlAttribute (line 466) | protected String escapeHtmlAttribute(String value) {
method close (line 479) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/html/HtmlGeneratorFactory.java
class HtmlGeneratorFactory (line 26) | public class HtmlGeneratorFactory {
method getHtmlGenerator (line 36) | public static HtmlGenerator getHtmlGenerator(File inputPdf, Config con...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/html/HtmlSyntax.java
class HtmlSyntax (line 21) | public class HtmlSyntax {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/DoclingFastServerClient.java
class DoclingFastServerClient (line 53) | public class DoclingFastServerClient implements HybridClient {
method DoclingFastServerClient (line 75) | public DoclingFastServerClient(HybridConfig config) {
method DoclingFastServerClient (line 92) | DoclingFastServerClient(String baseUrl, OkHttpClient httpClient, Objec...
method checkAvailability (line 98) | @Override
method convert (line 129) | @Override
method convertAsync (line 139) | @Override
method getBaseUrl (line 155) | public String getBaseUrl() {
method buildConvertRequest (line 162) | private Request buildConvertRequest(HybridRequest request) {
method parseResponse (line 184) | private HybridResponse parseResponse(Response response) throws IOExcep...
method extractPageContents (line 240) | private Map<Integer, JsonNode> extractPageContents(JsonNode jsonConten...
method extractFailedPages (line 272) | private List<Integer> extractFailedPages(JsonNode root) {
method shutdown (line 294) | public void shutdown() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/DoclingSchemaTransformer.java
class DoclingSchemaTransformer (line 71) | public class DoclingSchemaTransformer implements HybridSchemaTransformer {
method getBackendType (line 94) | @Override
method transform (line 99) | @Override
method transformPage (line 151) | @Override
method determinePageCount (line 176) | private int determinePageCount(JsonNode json, Map<Integer, Double> pag...
method scanContentForPageCount (line 211) | private int scanContentForPageCount(JsonNode json) {
method getPageNumberFromProv (line 234) | private int getPageNumberFromProv(JsonNode node) {
method transformText (line 249) | private void transformText(JsonNode textNode, List<List<IObject>> resu...
method createHeading (line 300) | private SemanticHeading createHeading(String text, BoundingBox bbox, J...
method createParagraph (line 326) | private SemanticParagraph createParagraph(String text, BoundingBox bbo...
method createFormula (line 347) | private SemanticFormula createFormula(String latex, BoundingBox bbox) {
method transformPicture (line 356) | private void transformPicture(JsonNode pictureNode, List<List<IObject>...
method extractPictureDescription (line 394) | private String extractPictureDescription(JsonNode pictureNode) {
method transformTable (line 410) | private void transformTable(JsonNode tableNode, List<List<IObject>> re...
method extractBoundingBox (line 528) | private BoundingBox extractBoundingBox(JsonNode bboxNode, int pageInde...
method getTextValue (line 565) | private String getTextValue(JsonNode node, String fieldName) {
method sortByReadingOrder (line 578) | private void sortByReadingOrder(List<IObject> contents) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HancomClient.java
class HancomClient (line 50) | public class HancomClient implements HybridClient {
method HancomClient (line 80) | public HancomClient(HybridConfig config) {
method HancomClient (line 98) | HancomClient(String baseUrl, OkHttpClient httpClient, ObjectMapper obj...
method checkAvailability (line 104) | @Override
method convert (line 128) | @Override
method convertAsync (line 149) | @Override
method getBaseUrl (line 165) | public String getBaseUrl() {
method uploadFile (line 176) | private String uploadFile(byte[] pdfBytes) throws IOException {
method getVisualInfo (line 222) | private JsonNode getVisualInfo(String fileId) throws IOException {
method deleteFile (line 257) | private void deleteFile(String fileId) {
method normalizeUrl (line 280) | private static String normalizeUrl(String url) {
method shutdown (line 294) | public void shutdown() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HancomSchemaTransformer.java
class HancomSchemaTransformer (line 71) | public class HancomSchemaTransformer implements HybridSchemaTransformer {
method getBackendType (line 90) | @Override
method transform (line 95) | @Override
method transformPage (line 131) | @Override
method determinePageCount (line 161) | private int determinePageCount(JsonNode json, Map<Integer, Double> pag...
method scanElementsForPageCount (line 180) | private int scanElementsForPageCount(JsonNode json) {
method transformElement (line 199) | private void transformElement(JsonNode element, List<List<IObject>> re...
method createParagraph (line 285) | private SemanticParagraph createParagraph(String text, BoundingBox bbo...
method createHeading (line 302) | private SemanticHeading createHeading(String text, BoundingBox bbox) {
method createFormula (line 320) | private SemanticFormula createFormula(String latex, BoundingBox bbox) {
method createPicture (line 329) | private SemanticPicture createPicture(BoundingBox bbox) {
method transformTable (line 354) | private TableBorder transformTable(JsonNode element, BoundingBox table...
method extractBoundingBox (line 482) | private BoundingBox extractBoundingBox(JsonNode bboxNode, int pageInde...
method getTextValue (line 505) | private String getTextValue(JsonNode node, String fieldName) {
method sortByReadingOrder (line 518) | private void sortByReadingOrder(List<IObject> contents) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HybridClient.java
type HybridClient (line 39) | public interface HybridClient {
type OutputFormat (line 44) | enum OutputFormat {
method OutputFormat (line 54) | OutputFormat(String apiValue) {
method getApiValue (line 59) | public String getApiValue() {
class HybridRequest (line 70) | final class HybridRequest {
method HybridRequest (line 82) | public HybridRequest(byte[] pdfBytes, Set<Integer> pageNumbers,
method allPages (line 97) | public static HybridRequest allPages(byte[] pdfBytes) {
method allPages (line 108) | public static HybridRequest allPages(byte[] pdfBytes, Set<OutputForm...
method forPages (line 119) | public static HybridRequest forPages(byte[] pdfBytes, Set<Integer> p...
method forPages (line 131) | public static HybridRequest forPages(byte[] pdfBytes, Set<Integer> p...
method getPdfBytes (line 136) | public byte[] getPdfBytes() {
method getPageNumbers (line 140) | public Set<Integer> getPageNumbers() {
method getOutputFormats (line 149) | public Set<OutputFormat> getOutputFormats() {
method wantsJson (line 158) | public boolean wantsJson() {
method wantsMarkdown (line 167) | public boolean wantsMarkdown() {
method wantsHtml (line 176) | public boolean wantsHtml() {
class HybridResponse (line 184) | final class HybridResponse {
method HybridResponse (line 200) | public HybridResponse(String markdown, String html, JsonNode json,
method HybridResponse (line 219) | public HybridResponse(String markdown, String html, JsonNode json, M...
method HybridResponse (line 230) | public HybridResponse(String markdown, JsonNode json, Map<Integer, J...
method empty (line 239) | public static HybridResponse empty() {
method getMarkdown (line 243) | public String getMarkdown() {
method getHtml (line 247) | public String getHtml() {
method getJson (line 251) | public JsonNode getJson() {
method getPageContents (line 255) | public Map<Integer, JsonNode> getPageContents() {
method getFailedPages (line 268) | public List<Integer> getFailedPages() {
method hasFailedPages (line 277) | public boolean hasFailedPages() {
method equals (line 281) | @Override
method hashCode (line 293) | @Override
method checkAvailability (line 307) | void checkAvailability() throws IOException;
method convert (line 316) | HybridResponse convert(HybridRequest request) throws IOException;
method convertAsync (line 327) | CompletableFuture<HybridResponse> convertAsync(HybridRequest request);
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HybridClientFactory.java
class HybridClientFactory (line 43) | public class HybridClientFactory {
method HybridClientFactory (line 60) | private HybridClientFactory() {
method getOrCreate (line 76) | public static HybridClient getOrCreate(String hybrid, HybridConfig con...
method createClient (line 89) | private static HybridClient createClient(String hybrid, HybridConfig c...
method create (line 113) | @Deprecated
method create (line 126) | @Deprecated
method shutdown (line 137) | public static void shutdown() {
method isSupported (line 154) | public static boolean isSupported(String hybrid) {
method getSupportedBackends (line 168) | public static String getSupportedBackends() {
method getAllKnownBackends (line 177) | public static String getAllKnownBackends() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HybridConfig.java
class HybridConfig (line 24) | public class HybridConfig {
method HybridConfig (line 55) | public HybridConfig() {
method getUrl (line 63) | public String getUrl() {
method setUrl (line 72) | public void setUrl(String url) {
method getTimeoutMs (line 81) | public int getTimeoutMs() {
method setTimeoutMs (line 91) | public void setTimeoutMs(int timeoutMs) {
method isFallbackToJava (line 103) | public boolean isFallbackToJava() {
method setFallbackToJava (line 112) | public void setFallbackToJava(boolean fallbackToJava) {
method getMaxConcurrentRequests (line 121) | public int getMaxConcurrentRequests() {
method setMaxConcurrentRequests (line 131) | public void setMaxConcurrentRequests(int maxConcurrentRequests) {
method getDefaultUrl (line 144) | public static String getDefaultUrl(String hybrid) {
method getEffectiveUrl (line 167) | public String getEffectiveUrl(String hybrid) {
method getMode (line 179) | public String getMode() {
method setMode (line 188) | public void setMode(String mode) {
method isFullMode (line 197) | public boolean isFullMode() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HybridSchemaTransformer.java
type HybridSchemaTransformer (line 35) | public interface HybridSchemaTransformer {
method transform (line 49) | List<List<IObject>> transform(HybridResponse response, Map<Integer, Do...
method transformPage (line 62) | List<IObject> transformPage(int pageNumber, JsonNode pageContent, doub...
method getBackendType (line 69) | String getBackendType();
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/TriageLogger.java
class TriageLogger (line 67) | public class TriageLogger {
method TriageLogger (line 79) | public TriageLogger() {
method logToFile (line 93) | public void logToFile(
method logToWriter (line 120) | public void logToWriter(
method createTriageJson (line 138) | public ObjectNode createTriageJson(
method createSignalsNode (line 197) | private ObjectNode createSignalsNode(TriageSignals signals) {
method toJsonString (line 217) | public String toJsonString(
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/TriageProcessor.java
class TriageProcessor (line 49) | public class TriageProcessor {
type TriageDecision (line 121) | public enum TriageDecision {
class TriageResult (line 131) | public static final class TriageResult {
method TriageResult (line 145) | public TriageResult(int pageNumber, TriageDecision decision, double ...
method java (line 160) | public static TriageResult java(int pageNumber, double confidence, T...
method backend (line 172) | public static TriageResult backend(int pageNumber, double confidence...
method getPageNumber (line 181) | public int getPageNumber() {
method getDecision (line 190) | public TriageDecision getDecision() {
method getConfidence (line 199) | public double getConfidence() {
method getSignals (line 208) | public TriageSignals getSignals() {
method equals (line 212) | @Override
method hashCode (line 223) | @Override
method toString (line 228) | @Override
class TriageSignals (line 242) | public static final class TriageSignals {
method TriageSignals (line 279) | public TriageSignals(int lineChunkCount, int textChunkCount, double ...
method TriageSignals (line 290) | public TriageSignals(int lineChunkCount, int textChunkCount, double ...
method empty (line 323) | public static TriageSignals empty() {
method getLineChunkCount (line 334) | public int getLineChunkCount() {
method getTextChunkCount (line 343) | public int getTextChunkCount() {
method getLineToTextRatio (line 352) | public double getLineToTextRatio() {
method getAlignedLineGroups (line 361) | public int getAlignedLineGroups() {
method hasTableBorder (line 370) | public boolean hasTableBorder() {
method hasSuspiciousPattern (line 379) | public boolean hasSuspiciousPattern() {
method hasVectorTableSignal (line 388) | public boolean hasVectorTableSignal() {
method hasTextTablePattern (line 398) | public boolean hasTextTablePattern() {
method getHorizontalLineCount (line 405) | public int getHorizontalLineCount() {
method getVerticalLineCount (line 409) | public int getVerticalLineCount() {
method getLineArtCount (line 413) | public int getLineArtCount() {
method hasGridLines (line 417) | public boolean hasGridLines() {
method hasTableBorderLines (line 421) | public boolean hasTableBorderLines() {
method hasRowSeparatorPattern (line 425) | public boolean hasRowSeparatorPattern() {
method hasAlignedShortLines (line 429) | public boolean hasAlignedShortLines() {
method getTablePatternCount (line 433) | public int getTablePatternCount() {
method getMaxConsecutiveStreak (line 437) | public int getMaxConsecutiveStreak() {
method getPatternDensity (line 441) | public double getPatternDensity() {
method hasConsecutivePatterns (line 445) | public boolean hasConsecutivePatterns() {
method getLargeImageRatio (line 454) | public double getLargeImageRatio() {
method hasLargeImage (line 464) | public boolean hasLargeImage() {
method getLargeImageAspectRatio (line 474) | public double getLargeImageAspectRatio() {
method equals (line 478) | @Override
method hashCode (line 504) | @Override
method toString (line 515) | @Override
class TriageThresholds (line 545) | public static class TriageThresholds {
method TriageThresholds (line 553) | public TriageThresholds() {
method getLineRatioThreshold (line 561) | public double getLineRatioThreshold() {
method setLineRatioThreshold (line 570) | public void setLineRatioThreshold(double lineRatioThreshold) {
method getAlignedLineGroupsThreshold (line 579) | public int getAlignedLineGroupsThreshold() {
method setAlignedLineGroupsThreshold (line 588) | public void setAlignedLineGroupsThreshold(int alignedLineGroupsThres...
method getGridGapMultiplier (line 597) | public double getGridGapMultiplier() {
method setGridGapMultiplier (line 606) | public void setGridGapMultiplier(double gridGapMultiplier) {
method TriageProcessor (line 611) | private TriageProcessor() {
method classifyPage (line 633) | public static TriageResult classifyPage(
method classifyPage (line 648) | public static TriageResult classifyPage(
method extractSignals (line 717) | static TriageSignals extractSignals(
class SignalAccumulator (line 810) | private static class SignalAccumulator {
method processLineChunk (line 828) | void processLineChunk(LineChunk lineChunk) {
method processLineArtChunk (line 850) | void processLineArtChunk() {
method processImageChunk (line 854) | void processImageChunk(ImageChunk imageChunk) {
method processTextChunk (line 866) | void processTextChunk(TextChunk textChunk) {
method areSuspiciousTextChunks (line 894) | private boolean areSuspiciousTextChunks(TextChunk previous, TextChun...
method hasAlignedShortHorizontalLines (line 917) | boolean hasAlignedShortHorizontalLines() {
method checkTableBorderPresence (line 953) | private static boolean checkTableBorderPresence(int pageNumber) {
method checkSuspiciousPatterns (line 971) | private static boolean checkSuspiciousPatterns(List<TextChunk> textChu...
method areOnSameBaseline (line 1004) | private static boolean areOnSameBaseline(TextChunk chunk1, TextChunk c...
method countAlignedLineGroups (line 1018) | private static int countAlignedLineGroups(List<TextChunk> textChunks, ...
method triageAllPages (line 1086) | public static Map<Integer, TriageResult> triageAllPages(
method triageAllPages (line 1099) | public static Map<Integer, TriageResult> triageAllPages(
method triageAllPages (line 1122) | public static Map<Integer, TriageResult> triageAllPages(
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/JsonName.java
class JsonName (line 18) | public class JsonName {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/JsonWriter.java
class JsonWriter (line 41) | public class JsonWriter {
method getJsonGenerator (line 43) | private static JsonGenerator getJsonGenerator(String fileName) throws ...
method writeToJson (line 50) | public static void writeToJson(File inputPDF, String outputFolder, Lis...
method writeDocumentInfo (line 73) | private static void writeDocumentInfo(JsonGenerator generator, String ...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/ObjectMapperHolder.java
class ObjectMapperHolder (line 35) | public class ObjectMapperHolder {
method getObjectMapper (line 97) | public static ObjectMapper getObjectMapper() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/CaptionSerializer.java
class CaptionSerializer (line 29) | public class CaptionSerializer extends StdSerializer<SemanticCaption> {
method CaptionSerializer (line 36) | public CaptionSerializer(Class<SemanticCaption> t) {
method serialize (line 40) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/DoubleSerializer.java
class DoubleSerializer (line 30) | public class DoubleSerializer extends StdSerializer<Double> {
method DoubleSerializer (line 37) | public DoubleSerializer(Class<Double> t) {
method serialize (line 43) | @Override
method round (line 49) | private static double round(double value, int decimalPlaces) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/FormulaSerializer.java
class FormulaSerializer (line 40) | public class FormulaSerializer extends StdSerializer<SemanticFormula> {
method FormulaSerializer (line 42) | public FormulaSerializer(Class<SemanticFormula> t) {
method serialize (line 46) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/HeaderFooterSerializer.java
class HeaderFooterSerializer (line 32) | public class HeaderFooterSerializer extends StdSerializer<SemanticHeader...
method HeaderFooterSerializer (line 39) | public HeaderFooterSerializer(Class<SemanticHeaderOrFooter> t) {
method serialize (line 43) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/HeadingSerializer.java
class HeadingSerializer (line 30) | public class HeadingSerializer extends StdSerializer<SemanticHeading> {
method HeadingSerializer (line 37) | public HeadingSerializer(Class<SemanticHeading> t) {
method serialize (line 41) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/ImageSerializer.java
class ImageSerializer (line 31) | public class ImageSerializer extends StdSerializer<ImageChunk> {
method ImageSerializer (line 33) | public ImageSerializer(Class<ImageChunk> t) {
method serialize (line 37) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/LineChunkSerializer.java
class LineChunkSerializer (line 26) | public class LineChunkSerializer extends StdSerializer<LineChunk> {
method LineChunkSerializer (line 28) | public LineChunkSerializer(Class<LineChunk> t) {
method serialize (line 32) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/ListItemSerializer.java
class ListItemSerializer (line 29) | public class ListItemSerializer extends StdSerializer<ListItem> {
method ListItemSerializer (line 31) | public ListItemSerializer(Class<ListItem> t) {
method serialize (line 35) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/ListSerializer.java
class ListSerializer (line 27) | public class ListSerializer extends StdSerializer<PDFList> {
method ListSerializer (line 29) | public ListSerializer(Class<PDFList> t) {
method serialize (line 33) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/ParagraphSerializer.java
class ParagraphSerializer (line 26) | public class ParagraphSerializer extends StdSerializer<SemanticParagraph> {
method ParagraphSerializer (line 28) | public ParagraphSerializer(Class<SemanticParagraph> t) {
method serialize (line 32) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/PictureSerializer.java
class PictureSerializer (line 36) | public class PictureSerializer extends StdSerializer<SemanticPicture> {
method PictureSerializer (line 38) | public PictureSerializer(Class<SemanticPicture> t) {
method serialize (line 42) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/SemanticTextNodeSerializer.java
class SemanticTextNodeSerializer (line 26) | public class SemanticTextNodeSerializer extends StdSerializer<SemanticTe...
method SemanticTextNodeSerializer (line 28) | public SemanticTextNodeSerializer(Class<SemanticTextNode> t) {
method serialize (line 32) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/SerializerUtil.java
class SerializerUtil (line 27) | public class SerializerUtil {
method writeEssentialInfo (line 28) | public static void writeEssentialInfo(JsonGenerator jsonGenerator, IOb...
method writeTextInfo (line 46) | public static void writeTextInfo(JsonGenerator jsonGenerator, Semantic...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/TableCellSerializer.java
class TableCellSerializer (line 28) | public class TableCellSerializer extends StdSerializer<TableBorderCell> {
method TableCellSerializer (line 30) | public TableCellSerializer(Class<TableBorderCell> t) {
method serialize (line 34) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/TableRowSerializer.java
class TableRowSerializer (line 27) | public class TableRowSerializer extends StdSerializer<TableBorderRow> {
method TableRowSerializer (line 29) | public TableRowSerializer(Class<TableBorderRow> t) {
method serialize (line 33) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/TableSerializer.java
class TableSerializer (line 29) | public class TableSerializer extends StdSerializer<TableBorder> {
method TableSerializer (line 31) | public TableSerializer(Class<TableBorder> t) {
method serialize (line 35) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/TextChunkSerializer.java
class TextChunkSerializer (line 26) | public class TextChunkSerializer extends StdSerializer<TextChunk> {
method TextChunkSerializer (line 28) | public TextChunkSerializer(Class<TextChunk> t) {
method serialize (line 32) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/TextLineSerializer.java
class TextLineSerializer (line 26) | public class TextLineSerializer extends StdSerializer<TextLine> {
method TextLineSerializer (line 28) | public TextLineSerializer(Class<TextLine> t) {
method serialize (line 32) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownGenerator.java
class MarkdownGenerator (line 46) | public class MarkdownGenerator implements Closeable {
method MarkdownGenerator (line 58) | MarkdownGenerator(File inputPdf, Config config) throws IOException {
method writeToMarkdown (line 69) | public void writeToMarkdown(List<List<IObject>> contents) {
method writePageSeparator (line 88) | protected void writePageSeparator(int pageNumber) throws IOException {
method isSupportedContent (line 97) | protected boolean isSupportedContent(IObject content) {
method writeContentsSeparator (line 109) | protected void writeContentsSeparator() throws IOException {
method write (line 114) | protected void write(IObject object) throws IOException {
method writeImage (line 136) | protected void writeImage(ImageChunk image) {
method writePicture (line 167) | protected void writePicture(SemanticPicture picture) {
method writeFormula (line 208) | protected void writeFormula(SemanticFormula formula) throws IOException {
method writeHeaderOrFooter (line 216) | protected void writeHeaderOrFooter(SemanticHeaderOrFooter headerOrFoot...
method writeList (line 225) | protected void writeList(PDFList list) throws IOException {
method writeSemanticTextNode (line 242) | protected void writeSemanticTextNode(SemanticTextNode textNode) throws...
method writeTable (line 258) | protected void writeTable(TableBorder table) throws IOException {
method writeContents (line 287) | protected void writeContents(List<IObject> contents, boolean isTable) ...
method writeParagraph (line 306) | protected void writeParagraph(SemanticParagraph textNode) throws IOExc...
method writeHeading (line 310) | protected void writeHeading(SemanticHeading heading) throws IOException {
method enterTable (line 322) | protected void enterTable() {
method leaveTable (line 326) | protected void leaveTable() {
method isInsideTable (line 332) | protected boolean isInsideTable() {
method getLineBreak (line 336) | protected String getLineBreak() {
method writeLineBreak (line 344) | protected void writeLineBreak() throws IOException {
method writeSpace (line 348) | protected void writeSpace() throws IOException {
method getCorrectMarkdownString (line 352) | protected String getCorrectMarkdownString(String value) {
method close (line 359) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownGeneratorFactory.java
class MarkdownGeneratorFactory (line 23) | public class MarkdownGeneratorFactory {
method getMarkdownGenerator (line 24) | public static MarkdownGenerator getMarkdownGenerator(File inputPdf,
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownHTMLGenerator.java
class MarkdownHTMLGenerator (line 28) | public class MarkdownHTMLGenerator extends MarkdownGenerator {
method MarkdownHTMLGenerator (line 30) | protected MarkdownHTMLGenerator(File inputPdf, Config config) throws I...
method writeTable (line 34) | @Override
method writeCellTagBegin (line 68) | private void writeCellTagBegin(TableBorderCell cell, boolean isHeader)...
method writeCellTagEnd (line 86) | private void writeCellTagEnd(boolean isHeader) throws IOException {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownSyntax.java
class MarkdownSyntax (line 18) | public class MarkdownSyntax {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/pdf/PDFLayer.java
type PDFLayer (line 18) | public enum PDFLayer {
method PDFLayer (line 29) | PDFLayer(String value) {
method getValue (line 33) | public String getValue() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/pdf/PDFWriter.java
class PDFWriter (line 54) | public class PDFWriter {
method updatePDF (line 62) | public void updatePDF(File inputPDF, String password, String outputFol...
method drawContent (line 90) | private void drawContent(IObject content, PDFLayer layer) throws IOExc...
method drawContent (line 94) | private void drawContent(IObject content, PDFLayer layer, Map<Integer,...
method drawTableCells (line 111) | private void drawTableCells(TableBorder table, Map<Integer, PDAnnotati...
method drawListItems (line 140) | private void drawListItems(PDFList list, Map<Integer, PDAnnotation> an...
method draw (line 150) | public Map<Integer, PDAnnotation> draw(BoundingBox boundingBox, float[...
method getFloat (line 189) | private static float getFloat(double value) {
method getContents (line 200) | public static String getContents(IObject content) {
method getColor (line 245) | public static float[] getColor(IObject content) {
method getColor (line 265) | public static float[] getColor(SemanticType semanticType) {
method createOptContentsForAnnotations (line 290) | private void createOptContentsForAnnotations(PDDocument document) {
method getOptionalContent (line 308) | public PDOptionalContentGroup getOptionalContent(PDFLayer layer) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/AbstractTableProcessor.java
class AbstractTableProcessor (line 33) | public abstract class AbstractTableProcessor {
method processTables (line 44) | public void processTables(List<List<IObject>> contents) {
method processTables (line 55) | public void processTables(List<List<IObject>> contents, List<Integer> ...
method getTables (line 69) | protected abstract List<List<TableBorder>> getTables(List<List<IObject...
method addTablesToTableCollection (line 71) | private static void addTablesToTableCollection(List<List<TableBorder>>...
method getPagesWithPossibleTables (line 98) | public static List<Integer> getPagesWithPossibleTables(List<List<IObje...
method areSuspiciousTextChunks (line 119) | private static boolean areSuspiciousTextChunks(TextChunk previousTextC...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/CaptionProcessor.java
class CaptionProcessor (line 33) | public class CaptionProcessor {
method processCaptions (line 46) | public static void processCaptions(List<IObject> contents) {
method isImageSubtle (line 100) | private static boolean isImageSubtle(ImageChunk imageChunk) {
method isTextNotContainedInImage (line 118) | public static boolean isTextNotContainedInImage(SemanticFigure image, ...
method acceptImageCaption (line 129) | private static void acceptImageCaption(List<IObject> contents, Semanti...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ClusterTableProcessor.java
class ClusterTableProcessor (line 34) | public class ClusterTableProcessor extends AbstractTableProcessor {
method getTables (line 36) | @Override
method processClusterDetectionTables (line 51) | public static List<TableBorder> processClusterDetectionTables(List<IOb...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ContentFilterProcessor.java
class ContentFilterProcessor (line 39) | public class ContentFilterProcessor {
method getFilteredContents (line 53) | public static List<IObject> getFilteredContents(String inputPdfName, L...
method processBackgrounds (line 95) | public static void processBackgrounds(int pageNumber, List<IObject> co...
method filterConsecutiveSpaces (line 114) | private static void filterConsecutiveSpaces(List<IObject> pageContents) {
method isBackground (line 122) | private static boolean isBackground(IObject content, BoundingBox pageB...
method filterOutOfPageContents (line 129) | private static void filterOutOfPageContents(int pageNumber, List<IObje...
method splitTextChunksByWhiteSpacesInPageContents (line 143) | private static List<IObject> splitTextChunksByWhiteSpacesInPageContent...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java
class DocumentProcessor (line 65) | public class DocumentProcessor {
method processFile (line 75) | public static void processFile(String inputPdfName, Config config) thr...
method getValidPageNumbers (line 101) | private static Set<Integer> getValidPageNumbers(Config config) {
method processDocument (line 136) | private static List<List<IObject>> processDocument(String inputPdfName...
method shouldProcessPage (line 192) | private static boolean shouldProcessPage(int pageNumber, Set<Integer> ...
method generateOutputs (line 196) | private static void generateOutputs(String inputPdfName, List<List<IOb...
method preprocessing (line 245) | public static void preprocessing(String pdfName, Config config) throws...
method updateStaticContainers (line 275) | private static void updateStaticContainers(Config config) {
method setIDs (line 294) | public static void setIDs(List<IObject> contents) {
method setIndexesForDocumentContents (line 305) | public static void setIndexesForDocumentContents(List<List<IObject>> c...
method setIndexesForContentsList (line 316) | public static void setIndexesForContentsList(List<IObject> contents) {
method removeNullObjectsFromList (line 328) | public static List<IObject> removeNullObjectsFromList(List<IObject> co...
method calculateDocumentInfo (line 338) | private static void calculateDocumentInfo() {
method getInfo (line 349) | private static GFCosInfo getInfo(COSTrailer trailer) {
method getContentsValueForTextNode (line 360) | public static String getContentsValueForTextNode(SemanticTextNode text...
method getPageBoundingBox (line 373) | public static BoundingBox getPageBoundingBox(int pageNumber) {
method sortPageContents (line 391) | public static List<IObject> sortPageContents(List<IObject> contents) {
method sortContents (line 437) | public static void sortContents(List<List<IObject>> contents, Config c...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HeaderFooterProcessor.java
class HeaderFooterProcessor (line 43) | public class HeaderFooterProcessor {
method processHeadersAndFooters (line 51) | public static void processHeadersAndFooters(List<List<IObject>> conten...
method processHeadersOrFootersContents (line 72) | private static void processHeadersOrFootersContents(List<SemanticHeade...
method updatePageContents (line 80) | private static List<IObject> updatePageContents(List<IObject> pageCont...
method getHeaderOrFooterContentsIndexes (line 106) | private static Set<Integer> getHeaderOrFooterContentsIndexes(SemanticH...
method getHeadersOrFooters (line 117) | private static List<SemanticHeaderOrFooter> getHeadersOrFooters(List<L...
method processHeaderOrFooterContent (line 141) | private static List<IObject> processHeaderOrFooterContent(List<IObject...
method getNumberOfHeaderOrFooterContentsForEachPage (line 150) | private static List<Integer> getNumberOfHeaderOrFooterContentsForEachP...
method getIndexesOfHeaderOrFootersContents (line 183) | private static Set<Integer> getIndexesOfHeaderOrFootersContents(List<I...
method isHeaderOrFooter (line 215) | public static boolean isHeaderOrFooter(IObject content) {
method filterHeaderOrFooterContents (line 225) | private static List<IObject> filterHeaderOrFooterContents(List<IObject...
method arePossibleHeadersOrFooters (line 246) | private static boolean arePossibleHeadersOrFooters(IObject object1, IO...
method getHeadersOrFootersIntervals (line 281) | private static Set<ListInterval> getHeadersOrFootersIntervals(List<Sem...
method getHeadersOfFooterIntervals (line 294) | private static Set<ListInterval> getHeadersOfFooterIntervals(List<List...
method getEqualsItems (line 316) | private static Set<ListInterval> getEqualsItems(List<ListItemTextInfo>...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HeadingProcessor.java
class HeadingProcessor (line 43) | public class HeadingProcessor {
method processHeadings (line 53) | public static void processHeadings(List<IObject> contents, boolean isT...
method disassemblePDFList (line 96) | private static List<IObject> disassemblePDFList(PDFList list) {
method convertListItemToSemanticTextNode (line 107) | private static SemanticTextNode convertListItemToSemanticTextNode(Text...
method getTextNodesFromContents (line 115) | private static List<SemanticTextNode> getTextNodesFromContents(List<IO...
method processContent (line 125) | private static void processContent(List<SemanticTextNode> textNodes, I...
method isNotHeadings (line 150) | private static boolean isNotHeadings(PDFList list) {
method setHeadings (line 170) | private static void setHeadings(List<IObject> contents) {
method detectHeadingsLevels (line 192) | public static void detectHeadingsLevels() {
method findClosestLevel (line 221) | private static int findClosestLevel(SemanticHeading heading, SortedMap...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HiddenTextProcessor.java
class HiddenTextProcessor (line 30) | public class HiddenTextProcessor {
method findHiddenText (line 42) | public static List<IObject> findHiddenText(String pdfName, List<IObjec...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HybridDocumentProcessor.java
class HybridDocumentProcessor (line 66) | public class HybridDocumentProcessor {
method HybridDocumentProcessor (line 70) | private HybridDocumentProcessor() {
method processDocument (line 83) | public static List<List<IObject>> processDocument(
method processDocument (line 100) | public static List<List<IObject>> processDocument(
method createEmptyContents (line 209) | private static List<List<IObject>> createEmptyContents(int totalPages) {
method filterAllPages (line 220) | private static Map<Integer, List<IObject>> filterAllPages(
method filterByDecision (line 249) | private static Set<Integer> filterByDecision(
method processJavaPath (line 262) | private static Map<Integer, List<IObject>> processJavaPath(
method applyJavaPagePostProcessing (line 325) | private static void applyJavaPagePostProcessing(List<List<IObject>> co...
method processBackendPath (line 350) | private static Map<Integer, List<IObject>> processBackendPath(
method getClient (line 424) | private static HybridClient getClient(Config config) {
method createTransformer (line 431) | private static HybridSchemaTransformer createTransformer(Config config) {
method getPageHeights (line 450) | private static Map<Integer, Double> getPageHeights(Set<Integer> pageNu...
method mergeResults (line 466) | private static void mergeResults(
method postProcess (line 494) | private static void postProcess(
method shouldProcessPage (line 514) | private static boolean shouldProcessPage(int pageNumber, Set<Integer> ...
method determineOutputFormats (line 531) | private static Set<OutputFormat> determineOutputFormats(Config config) {
method logTriageSummary (line 538) | private static void logTriageSummary(Map<Integer, TriageResult> triage...
method logTriageToFile (line 564) | private static void logTriageToFile(
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/LevelProcessor.java
class LevelProcessor (line 35) | public class LevelProcessor {
method detectLevels (line 41) | public static void detectLevels(List<List<IObject>> contents) {
method setLevels (line 45) | private static void setLevels(List<List<IObject>> contents, Stack<Leve...
method setLevelForHeading (line 117) | private static void setLevelForHeading(SemanticHeading heading) {
method getLevelInfoIndex (line 126) | private static Integer getLevelInfoIndex(Stack<LevelInfo> levelInfos, ...
method setLevelForTable (line 136) | private static void setLevelForTable(TableBorder tableBorder) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ListProcessor.java
class ListProcessor (line 44) | public class ListProcessor {
method processLists (line 53) | public static void processLists(List<List<IObject>> contents, boolean ...
method processListItemContent (line 96) | private static List<IObject> processListItemContent(List<IObject> cont...
method processTextNodeListItemContent (line 107) | private static void processTextNodeListItemContent(List<IObject> conte...
method getTextLabelListIntervals (line 111) | private static List<TextListInterval> getTextLabelListIntervals(List<L...
method processListItem (line 139) | private static void processListItem(List<TextListInterval> listInterva...
method createListItemTextInfo (line 191) | private static ListItemTextInfo createListItemTextInfo(int i, TextLine...
method calculateList (line 203) | private static PDFList calculateList(TextListInterval interval, int st...
method addContentToListItem (line 241) | private static void addContentToListItem(int nextIndex, ListItemInfo c...
method addContentToLastPageListItem (line 281) | private static void addContentToLastPageListItem(int nextIndex, ListIt...
method isListItemLine (line 311) | private static boolean isListItemLine(ListItem listItem, TextLine curr...
method getMaxXGap (line 342) | private static double getMaxXGap(double fontSize) {
method processListsFromTextNodes (line 346) | public static List<IObject> processListsFromTextNodes(List<IObject> co...
method calculateTextChildrenInfo (line 373) | private static List<ListItemTextInfo> calculateTextChildrenInfo(List<S...
method updateListInterval (line 388) | private static void updateListInterval(ListInterval interval, List<Int...
method isCorrectList (line 394) | private static boolean isCorrectList(TextListInterval interval) {//mov...
method isDoubles (line 398) | private static boolean isDoubles(TextListInterval interval) {
method checkNeighborLists (line 411) | public static void checkNeighborLists(List<List<IObject>> contents) {
method addMiddleContentToList (line 463) | private static void addMiddleContentToList(PDFList previousList, PDFLi...
method addFirstLBodyToList (line 476) | private static void addFirstLBodyToList(PDFList currentList, SemanticT...
method isNeighborLists (line 484) | public static boolean isNeighborLists(PDFList previousList, PDFList cu...
method isMiddleContentPartOfList (line 500) | private static boolean isMiddleContentPartOfList(PDFList previousList,...
method getTextChildrenInfosForNeighborLists (line 519) | private static List<ListItemTextInfo> getTextChildrenInfosForNeighborL...
method createListItemTextInfoFromListItem (line 532) | private static ListItemTextInfo createListItemTextInfoFromListItem(int...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ParagraphProcessor.java
class ParagraphProcessor (line 32) | public class ParagraphProcessor {
method processParagraphs (line 36) | public static List<IObject> processParagraphs(List<IObject> contents) {
method getContentsWithDetectedParagraphs (line 56) | private static List<IObject> getContentsWithDetectedParagraphs(List<IO...
method detectParagraphsWithJustifyAlignments (line 74) | private static List<TextBlock> detectParagraphsWithJustifyAlignments(L...
method detectParagraphsWithCenterAlignments (line 97) | private static List<TextBlock> detectParagraphsWithCenterAlignments(Li...
method areLinesOfParagraphsWithCenterAlignments (line 117) | private static boolean areLinesOfParagraphsWithCenterAlignments(TextBl...
method detectFirstAndLastLinesOfParagraphsWithJustifyAlignments (line 132) | private static List<TextBlock> detectFirstAndLastLinesOfParagraphsWith...
method detectParagraphsWithLeftAlignments (line 159) | private static List<TextBlock> detectParagraphsWithLeftAlignments(List...
method areLinesOfParagraphsWithRightAlignments (line 180) | private static boolean areLinesOfParagraphsWithRightAlignments(TextBlo...
method areLinesOfParagraphsWithLeftAlignments (line 201) | private static boolean areLinesOfParagraphsWithLeftAlignments(TextBloc...
method detectFirstLinesOfParagraphWithLeftAlignments (line 245) | private static List<TextBlock> detectFirstLinesOfParagraphWithLeftAlig...
method isFirstLineOfParagraphWithLeftAlignment (line 266) | private static boolean isFirstLineOfParagraphWithLeftAlignment(TextBlo...
method detectTwoLinesParagraphs (line 292) | private static List<TextBlock> detectTwoLinesParagraphs(List<TextBlock...
method isTwoLinesParagraph (line 314) | private static boolean isTwoLinesParagraph(TextBlock previousBlock, Te...
method isFirstLineOfBulletedParagraphWithLeftAlignment (line 335) | private static boolean isFirstLineOfBulletedParagraphWithLeftAlignment...
method detectParagraphsWithRightAlignments (line 364) | private static List<TextBlock> detectParagraphsWithRightAlignments(Lis...
method detectBulletedParagraphsWithLeftAlignments (line 384) | private static List<TextBlock> detectBulletedParagraphsWithLeftAlignme...
method processOtherLines (line 405) | private static List<TextBlock> processOtherLines(List<TextBlock> textB...
method isOneParagraph (line 424) | private static boolean isOneParagraph(TextBlock previousBlock, TextBlo...
method isFirstLineOfBlock (line 450) | private static boolean isFirstLineOfBlock(TextBlock previousBlock, Tex...
method isLastLineOfBlock (line 473) | private static boolean isLastLineOfBlock(TextBlock previousBlock, Text...
method createParagraphFromTextBlock (line 496) | public static SemanticParagraph createParagraphFromTextBlock(TextBlock...
method getDifferentLinesProbability (line 506) | private static double getDifferentLinesProbability(TextBlock previousB...
method areCloseStyle (line 526) | private static boolean areCloseStyle(TextBlock previousBlock, TextBloc...
method areTextBlocksHaveSameTextSize (line 532) | private static boolean areTextBlocksHaveSameTextSize(TextBlock firstBl...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/SpecialTableProcessor.java
class SpecialTableProcessor (line 28) | public class SpecialTableProcessor {
method detectSpecialTables (line 32) | public static List<IObject> detectSpecialTables(List<IObject> contents) {
method detectSpecialKoreanTables (line 37) | private static void detectSpecialKoreanTables(List<IObject> contents) {
method detectSpecialKoreanTable (line 64) | private static TableBorder detectSpecialKoreanTable(List<TextLine> lin...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/StrikethroughProcessor.java
class StrikethroughProcessor (line 41) | public class StrikethroughProcessor {
method processStrikethroughs (line 61) | public static List<IObject> processStrikethroughs(List<IObject> pageCo...
method isTableBorderLine (line 112) | static boolean isTableBorderLine(LineChunk line) {
method isStrikethroughLine (line 124) | static boolean isStrikethroughLine(LineChunk line, TextChunk textChunk) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TableBorderProcessor.java
class TableBorderProcessor (line 37) | public class TableBorderProcessor {
method processTableBorders (line 54) | public static List<IObject> processTableBorders(List<IObject> contents...
method addContentToTableBorder (line 120) | private static TableBorder addContentToTableBorder(IObject content) {
method processTableBorder (line 161) | public static void processTableBorder(TableBorder tableBorder, int pag...
method normalizeAndProcessTableBorder (line 165) | static TableBorder normalizeAndProcessTableBorder(List<IObject> rawPag...
method processTableBorderContents (line 171) | private static void processTableBorderContents(TableBorder tableBorder...
method processTableCellContent (line 183) | private static List<IObject> processTableCellContent(List<IObject> con...
method checkNeighborTables (line 201) | public static void checkNeighborTables(List<List<IObject>> contents) {
method checkNeighborTables (line 221) | private static void checkNeighborTables(TableBorder previousTable, Tab...
method getTextChunkPartForRange (line 239) | static TextChunk getTextChunkPartForRange(TextChunk textChunk, double ...
method getTextChunkPartForTableCell (line 255) | private static TextChunk getTextChunkPartForTableCell(TextChunk textCh...
method getTextChunkPartBeforeTable (line 259) | public static TextChunk getTextChunkPartBeforeTable(TextChunk textChun...
method getTextChunkPartAfterTable (line 271) | public static TextChunk getTextChunkPartAfterTable(TextChunk textChunk...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TableStructureNormalizer.java
class TableStructureNormalizer (line 32) | class TableStructureNormalizer {
method normalize (line 50) | static TableBorder normalize(List<IObject> rawPageContents, TableBorde...
method collectColumnSnapshots (line 81) | private static List<ColumnSnapshot> collectColumnSnapshots(List<IObjec...
method addTextChunkToColumns (line 108) | private static void addTextChunkToColumns(TextChunk textChunk, TableBo...
method findBestColumn (line 119) | private static int findBestColumn(IObject content, TableBorder tableBo...
method isInsideTableBounds (line 140) | private static boolean isInsideTableBounds(IObject content, TableBorde...
method countDenseColumns (line 145) | private static int countDenseColumns(List<ColumnSnapshot> columnSnapsh...
method collectRowBands (line 155) | private static List<RowBand> collectRowBands(TableBorder tableBorder, ...
method findMatchingRowBand (line 187) | private static RowBand findMatchingRowBand(List<RowBand> rowBands, Tex...
method findBestRowBand (line 199) | private static RowBand findBestRowBand(List<RowBand> rowBands, IObject...
method rebuildTable (line 225) | private static TableBorder rebuildTable(TableBorder originalTable, Lis...
method isReplacementQualityBetter (line 255) | private static boolean isReplacementQualityBetter(TableBorder original...
method countNonEmptyRows (line 279) | private static int countNonEmptyRows(TableBorder tableBorder) {
method countNonEmptyColumns (line 298) | private static int countNonEmptyColumns(TableBorder tableBorder) {
method hasMeaningfulContent (line 317) | private static boolean hasMeaningfulContent(List<IObject> contents) {
method hasMonotonicRowOrder (line 337) | private static boolean hasMonotonicRowOrder(TableBorder tableBorder) {
method collectTableLineStats (line 355) | private static TableLineStats collectTableLineStats(TableBorder tableB...
method countMeaningfulTextLines (line 373) | private static int countMeaningfulTextLines(List<IObject> contents) {
class ColumnSnapshot (line 392) | private static final class ColumnSnapshot {
method addContent (line 398) | private void addContent(IObject content) {
method finalizeSnapshot (line 402) | private void finalizeSnapshot() {
class TableLineStats (line 422) | private static final class TableLineStats {
method TableLineStats (line 427) | private TableLineStats(int oversizedCellCount, int maxMeaningfulText...
class RowBand (line 433) | private static final class RowBand {
method RowBand (line 442) | private RowBand(int columnCount) {
method addLine (line 449) | private void addLine(TextLine textLine) {
method addContent (line 453) | private void addContent(int columnNumber, IObject content) {
method updateBounds (line 458) | private void updateBounds(double contentTopY, double contentBottomY,...
method hasVerticalOverlap (line 466) | private boolean hasVerticalOverlap(double contentTopY, double conten...
method isEmpty (line 470) | private boolean isEmpty() {
method sortContents (line 479) | private void sortContents() {
method getContents (line 485) | private List<IObject> getContents(int columnNumber) {
method createRowBoundingBox (line 489) | private BoundingBox createRowBoundingBox(TableBorder tableBorder) {
method createCellBoundingBox (line 494) | private BoundingBox createCellBoundingBox(TableBorder tableBorder, i...
method getCenterY (line 499) | private double getCenterY() {
method getAverageHeight (line 503) | private double getAverageHeight() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TaggedDocumentProcessor.java
class TaggedDocumentProcessor (line 23) | public class TaggedDocumentProcessor {
method processDocument (line 29) | public static List<List<IObject>> processDocument(String inputPdfName,...
method collectArtifacts (line 64) | private static List<List<IObject>> collectArtifacts(int totalPages) {
method shouldProcessPage (line 91) | private static boolean shouldProcessPage(int pageNumber) {
method processStructElem (line 95) | private static void processStructElem(INode node) {
method addObjectToContent (line 141) | private static void addObjectToContent(IObject object) {
method processParagraph (line 152) | private static void processParagraph(INode paragraph) {
method createParagraph (line 156) | private static SemanticParagraph createParagraph(INode paragraph) {
method processHeading (line 171) | private static void processHeading(INode node) {
method processNumberedHeading (line 177) | private static void processNumberedHeading(INode node) {
method processList (line 185) | private static void processList(INode node) {
method processListItem (line 203) | private static ListItem processListItem(INode node) {
method processTable (line 218) | private static void processTable(INode tableNode) {
method processTableRows (line 269) | private static List<INode> processTableRows(INode table) {
method processTableRowsChildren (line 293) | private static void processTableRowsChildren(INode tableRow) {
method addTableRow (line 302) | private static void addTableRow(int numberOfColumns, List<List<TableBo...
method addTableColumn (line 310) | private static void addTableColumn(List<List<TableBorderCell>> table) {
method processTableCell (line 316) | private static void processTableCell(TableBorderCell cell, INode elem) {
method processChildContents (line 325) | private static void processChildContents(INode elem, List<IObject> con...
method createRowsForTable (line 333) | private static TableBorderRow[] createRowsForTable(List<List<TableBord...
method setBoundingBoxesForTableRowsAndTableCells (line 349) | private static void setBoundingBoxesForTableRowsAndTableCells(TableBor...
method processCaption (line 376) | private static void processCaption(INode node) {
method processTOC (line 381) | private static void processTOC(INode toc) {
method processImage (line 385) | private static void processImage(SemanticFigure image) {
method processTextChunk (line 392) | private static void processTextChunk(SemanticSpan semanticSpan) {
method getContents (line 396) | private static List<IObject> getContents(INode node) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TextLineProcessor.java
class TextLineProcessor (line 33) | public class TextLineProcessor {
method processTextLines (line 39) | public static List<IObject> processTextLines(List<IObject> contents) {
method getTextLineWithSpaces (line 80) | private static TextLine getTextLineWithSpaces(TextLine textLine, doubl...
method linkTextLinesWithConnectedLineArtBullet (line 103) | private static void linkTextLinesWithConnectedLineArtBullet(List<IObje...
method isLineConnectedWithLineArt (line 123) | private static boolean isLineConnectedWithLineArt(TextLine textLine, L...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TextProcessor.java
class TextProcessor (line 31) | public class TextProcessor {
method replaceUndefinedCharacters (line 41) | public static void replaceUndefinedCharacters(List<IObject> contents, ...
method measureReplacementCharRatio (line 55) | public static double measureReplacementCharRatio(List<IObject> content...
method filterTinyText (line 76) | public static void filterTinyText(List<IObject> contents) {
method trimTextChunksWhiteSpaces (line 88) | public static void trimTextChunksWhiteSpaces(List<IObject> contents) {
method mergeCloseTextChunks (line 97) | public static void mergeCloseTextChunks(List<IObject> contents) {
method removeSameTextChunks (line 114) | public static void removeSameTextChunks(List<IObject> contents) {
method areSameTextChunks (line 130) | public static boolean areSameTextChunks(TextChunk firstTextChunk, Text...
method removeTextDecorationImages (line 137) | public static void removeTextDecorationImages(List<IObject> contents) {
method isTextChunkDecorationImage (line 150) | public static boolean isTextChunkDecorationImage(ImageChunk imageChunk...
method areNeighborsTextChunks (line 157) | private static boolean areNeighborsTextChunks(TextChunk firstTextChunk...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/readingorder/XYCutPlusPlusSorter.java
class XYCutPlusPlusSorter (line 45) | public class XYCutPlusPlusSorter {
method XYCutPlusPlusSorter (line 70) | private XYCutPlusPlusSorter() {
method sort (line 82) | public static List<IObject> sort(List<IObject> objects) {
method sort (line 94) | public static List<IObject> sort(List<IObject> objects, double beta, d...
method identifyCrossLayoutElements (line 146) | static List<IObject> identifyCrossLayoutElements(List<IObject> objects...
method hasMinimumOverlaps (line 196) | static boolean hasMinimumOverlaps(IObject element, List<IObject> objec...
method calculateHorizontalOverlapRatio (line 233) | static double calculateHorizontalOverlapRatio(BoundingBox box1, Boundi...
method computeDensityRatio (line 260) | static double computeDensityRatio(List<IObject> objects) {
method calculateBoundingRegion (line 286) | static BoundingBox calculateBoundingRegion(List<IObject> objects) {
method calculateTotalArea (line 303) | static double calculateTotalArea(List<IObject> objects) {
method recursiveSegment (line 331) | static List<IObject> recursiveSegment(List<IObject> objects, boolean p...
class CutInfo (line 378) | private static class CutInfo {
method CutInfo (line 382) | CutInfo(double position, double gap) {
method flatMapRecursive (line 391) | private static List<IObject> flatMapRecursive(List<List<IObject>> grou...
method findBestVerticalCutWithProjection (line 406) | private static CutInfo findBestVerticalCutWithProjection(List<IObject>...
method findVerticalCutByEdges (line 450) | private static CutInfo findVerticalCutByEdges(List<IObject> objects) {
method findBestHorizontalCutWithProjection (line 484) | private static CutInfo findBestHorizontalCutWithProjection(List<IObjec...
method splitByHorizontalCut (line 524) | static List<List<IObject>> splitByHorizontalCut(List<IObject> objects,...
method splitByVerticalCut (line 556) | static List<List<IObject>> splitByVerticalCut(List<IObject> objects, d...
method mergeCrossLayoutElements (line 590) | static List<IObject> mergeCrossLayoutElements(List<IObject> sortedMain...
method sortByYThenX (line 644) | static List<IObject> sortByYThenX(List<IObject> objects) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/text/TextGenerator.java
class TextGenerator (line 45) | public class TextGenerator implements Closeable {
method TextGenerator (line 56) | public TextGenerator(File inputPdf, Config config) throws IOException {
method writeToText (line 64) | public void writeToText(List<List<IObject>> contents) {
method writePageSeparator (line 80) | private void writePageSeparator(int pageIndex) throws IOException {
method writeContents (line 89) | private void writeContents(List<IObject> contents, int indentLevel) th...
method write (line 98) | private void write(IObject object, int indentLevel) throws IOException {
method writeHeaderOrFooter (line 116) | private void writeHeaderOrFooter(SemanticHeaderOrFooter headerOrFooter...
method writeList (line 120) | private void writeList(PDFList list, int indentLevel) throws IOExcepti...
method writeTable (line 135) | private void writeTable(TableBorder table, int indentLevel) throws IOE...
method collectPlainText (line 151) | private String collectPlainText(List<IObject> contents) {
method extractPlainText (line 166) | private String extractPlainText(IObject content) {
method writeMultiline (line 197) | private void writeMultiline(String value, int indentLevel) throws IOEx...
method indent (line 214) | private String indent(int level) {
method sanitize (line 221) | private String sanitize(String value) {
method compactWhitespace (line 225) | private String compactWhitespace(String value) {
method close (line 233) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/Base64ImageUtils.java
class Base64ImageUtils (line 28) | public final class Base64ImageUtils {
method Base64ImageUtils (line 37) | private Base64ImageUtils() {
method toDataUri (line 49) | public static String toDataUri(File imageFile, String format) {
method getMimeType (line 73) | public static String getMimeType(String format) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/BulletedParagraphUtils.java
class BulletedParagraphUtils (line 29) | public class BulletedParagraphUtils {
method getLabel (line 46) | public static String getLabel(SemanticTextNode semanticTextNode) {
method isBulletedParagraph (line 56) | public static boolean isBulletedParagraph(SemanticTextNode textNode) {
method isBulletedLine (line 66) | public static boolean isBulletedLine(TextLine textLine) {
method isLabeledLine (line 79) | public static boolean isLabeledLine(TextLine textLine) {
method isBulletedLineArtParagraph (line 105) | public static boolean isBulletedLineArtParagraph(SemanticTextNode text...
method getLabelRegex (line 115) | public static String getLabelRegex(SemanticTextNode textNode) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ContentSanitizer.java
class ContentSanitizer (line 24) | public class ContentSanitizer {
method ContentSanitizer (line 29) | public ContentSanitizer(List<SanitizationRule> rules) {
method ContentSanitizer (line 34) | public ContentSanitizer(List<SanitizationRule> rules, boolean contentS...
method sanitizeContents (line 39) | public void sanitizeContents(List<List<IObject>> contents) {
method processObject (line 51) | private void processObject(IObject obj) {
method processSemanticHeaderOrFooter (line 65) | private void processSemanticHeaderOrFooter(SemanticHeaderOrFooter head...
method processPDFList (line 71) | private void processPDFList(PDFList pdfList) {
method processTableBorder (line 82) | private void processTableBorder(TableBorder tableBorder) {
method processSemanticTextNode (line 96) | private void processSemanticTextNode(SemanticTextNode node) {
method processTextLine (line 106) | private void processTextLine(TextLine textLine) {
method applyReplacementsToChunks (line 126) | protected List<TextChunk> applyReplacementsToChunks(List<TextChunk> or...
method doReplacementsOverlap (line 197) | private static boolean doReplacementsOverlap(ReplacementInfo a, Replac...
method removeOverlappingReplacements (line 201) | private static void removeOverlappingReplacements(List<ReplacementInfo...
method createReplacementChunk (line 224) | private TextChunk createReplacementChunk(List<TextChunk> originalChunk...
method findEndChunkIndex (line 233) | private int findEndChunkIndex(int currentChunkIndex, List<ChunkInfo> c...
method isNotEmptyChunk (line 249) | private boolean isNotEmptyChunk(TextChunk chunk) {
method findAllReplacements (line 253) | protected List<ReplacementInfo> findAllReplacements(String originalTex...
method updateBBoxForReplacement (line 265) | private void updateBBoxForReplacement(TextChunk replacementChunk,
class ReplacementInfo (line 285) | protected static class ReplacementInfo {
method ReplacementInfo (line 290) | ReplacementInfo(int originalStart, int originalEnd, String replaceme...
class ChunkInfo (line 297) | private static class ChunkInfo {
method ChunkInfo (line 302) | ChunkInfo(int start, int length) {
method getChunkInfos (line 309) | private List<ChunkInfo> getChunkInfos(List<TextChunk> textChunks) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java
class ImagesUtils (line 41) | public class ImagesUtils {
method getContrastRatioConsumer (line 45) | public ContrastRatioConsumer getContrastRatioConsumer() {
method createImagesDirectory (line 49) | public void createImagesDirectory(String path) {
method write (line 56) | public void write(List<List<IObject>> contents, String pdfFilePath, St...
method writeFromContents (line 64) | private void writeFromContents(IObject content, String pdfFilePath, St...
method writeImage (line 94) | protected void writeImage(ImageChunk chunk, String pdfFilePath, String...
method writePicture (line 106) | protected void writePicture(SemanticPicture picture, String pdfFilePat...
method createImageFile (line 117) | private void createImageFile(BoundingBox imageBox, String fileName, St...
method isImageFileExists (line 130) | public static boolean isImageFileExists(String fileName) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ModeWeightStatistics.java
class ModeWeightStatistics (line 9) | public class ModeWeightStatistics {
method ModeWeightStatistics (line 19) | public ModeWeightStatistics(double scoreMin, double scoreMax, double m...
method addScore (line 26) | public void addScore(double score) {
method getBoost (line 30) | public double getBoost(double score) {
method sortByFrequency (line 44) | public void sortByFrequency() {
method getMode (line 49) | public double getMode() {
method initHigherScores (line 59) | private void initHigherScores() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/SanitizationRule.java
class SanitizationRule (line 5) | public class SanitizationRule {
method SanitizationRule (line 9) | public SanitizationRule(Pattern pattern, String replacement) {
method getPattern (line 14) | public Pattern getPattern() {
method getReplacement (line 18) | public String getReplacement() {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/TextNodeStatistics.java
class TextNodeStatistics (line 5) | public class TextNodeStatistics {
method TextNodeStatistics (line 10) | public TextNodeStatistics() {
method TextNodeStatistics (line 14) | public TextNodeStatistics(TextNodeStatisticsConfig config) {
method addTextNode (line 30) | public void addTextNode(SemanticTextNode textNode) {
method fontSizeRarityBoost (line 38) | public double fontSizeRarityBoost(SemanticTextNode textNode) {
method fontWeightRarityBoost (line 43) | public double fontWeightRarityBoost(SemanticTextNode textNode) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/TextNodeStatisticsConfig.java
class TextNodeStatisticsConfig (line 8) | public class TextNodeStatisticsConfig {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/TextNodeUtils.java
class TextNodeUtils (line 23) | public class TextNodeUtils {
method getTextColorOrDefault (line 32) | public static double[] getTextColorOrDefault(SemanticTextNode textNode) {
method getTextColorOrNull (line 47) | public static double[] getTextColorOrNull(SemanticTextNode textNode) {
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/levels/LevelInfo.java
class LevelInfo (line 24) | public class LevelInfo {
method LevelInfo (line 31) | public LevelInfo(double left, double right) {
method areSameLevelsInfos (line 36) | public static boolean areSameLevelsInfos(LevelInfo levelInfo1, LevelIn...
method checkBoundingBoxes (line 72) | public static boolean checkBoundingBoxes(LevelInfo levelInfo1, LevelIn...
method isTable (line 84) | public boolean isTable() {
method isList (line 88) | public boolean isList() {
method isLineArtBulletParagraph (line 92) | public boolean isLineArtBulletParagraph() {
method isTextBulletParagraph (line 96) | public boolean isTextBulletParagraph() {
method getMaxXGap (line 100) | public double getMaxXGap() {
method getMaxXGap (line 104) | public static double getMaxXGap(LevelInfo levelInfo1, LevelInfo levelI...
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/levels/LineArtBulletParagraphLevelInfo.java
class LineArtBulletParagraphLevelInfo (line 21) | public class LineArtBulletParagraphLevelInfo extends LevelInfo {
method LineArtBulletParagraphLevelInfo (line 25) | public LineArtBulletParagraphLevelInfo(SemanticTextNode textNode) {
method isLineArtBulletParagraph (line 31) | @Override
method getBullet (line 36) | public LineArtChunk getBullet() {
method getMaxXGap (line 40) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/levels/ListLevelInfo.java
class ListLevelInfo (line 20) | public class ListLevelInfo extends LevelInfo {
method ListLevelInfo (line 25) | public ListLevelInfo(PDFList pdfList) {
method isList (line 33) | @Override
method getCommonPrefix (line 38) | public String getCommonPrefix() {
method getNumberingStyle (line 42) | public String getNumberingStyle() {
method getMaxXGap (line 46) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/levels/TableLevelInfo.java
class TableLevelInfo (line 20) | public class TableLevelInfo extends LevelInfo {
method TableLevelInfo (line 21) | public TableLevelInfo(TableBorder table) {
method isTable (line 25) | @Override
FILE: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/levels/TextBulletParagraphLevelInfo.java
class TextBulletParagraphLevelInfo (line 21) | public class TextBulletParagraphLevelInfo extends LevelInfo {
method TextBulletParagraphLevelInfo (line 26) | public TextBulletParagraphLevelInfo(SemanticTextNode semanticTextNode) {
method isTextBulletParagraph (line 33) | @Override
method getLabel (line 38) | public String getLabel() {
method getLabelRegex (line 42) | public String getLabelRegex() {
method getMaxXGap (line 46) | @Override
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/EmbedImagesIntegrationTest.java
class EmbedImagesIntegrationTest (line 36) | class EmbedImagesIntegrationTest {
method setUp (line 46) | @BeforeEach
method tearDown (line 55) | @AfterEach
method testEmbedImagesInJsonOutput (line 60) | @Test
method testEmbedImagesInHtmlOutput (line 95) | @Test
method testEmbedImagesInMarkdownOutput (line 128) | @Test
method testNoEmbedImagesUsesFilePaths (line 161) | @Test
method testEmbedImagesWithJpegFormat (line 190) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/ImageDirIntegrationTest.java
class ImageDirIntegrationTest (line 35) | class ImageDirIntegrationTest {
method tearDown (line 43) | @AfterEach
method testCustomImageDir_imagesWrittenToCustomPath (line 48) | @Test
method testDefaultImageDir_imagesWrittenToDefaultPath (line 76) | @Test
method testCustomImageDir_jsonReferencesCorrectPath (line 100) | @Test
method testCustomImageDir_markdownReferencesCorrectPath (line 130) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/IntegrationTest.java
class IntegrationTest (line 35) | public class IntegrationTest {
method integrationTestParams (line 37) | static Stream<Arguments> integrationTestParams() {
method test (line 42) | @ParameterizedTest(name = "{index}: ({0}) => {0}")
method checkJsonNodes (line 63) | private static void checkJsonNodes(JsonNode node1, JsonNode node2) {
method checkArrayFields (line 71) | private static void checkArrayFields(JsonNode node1, JsonNode node2, S...
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/Issue336IntegrationTest.java
class Issue336IntegrationTest (line 37) | class Issue336IntegrationTest {
method setUp (line 47) | @BeforeEach
method testSpreadsheetExportedTableKeepsFinancialRowsSeparatedAcrossStandardOutputs (line 53) | @Test
method testSpreadsheetExportedTableKeepsFinancialRowsSeparatedInMarkdownHtmlOutput (line 84) | @Test
method testSpreadsheetExportedTableKeepsFinancialRowsSeparatedInMarkdownImageMode (line 98) | @Test
method assertJsonContainsExpectedRow (line 112) | private static void assertJsonContainsExpectedRow(Path jsonOutput) thr...
method assertMarkdownTableContainsExpectedRow (line 120) | private static void assertMarkdownTableContainsExpectedRow(Path markdo...
method assertHtmlTableContainsExpectedRow (line 130) | private static void assertHtmlTableContainsExpectedRow(Path htmlOutput...
method assertTextContainsExpectedRow (line 147) | private static void assertTextContainsExpectedRow(Path textOutput) thr...
method expectedFinancialRow (line 155) | private static List<String> expectedFinancialRow() {
method containsExpectedValues (line 165) | private static boolean containsExpectedValues(String value, List<Strin...
method extractTableRows (line 174) | private static List<List<String>> extractTableRows(JsonNode root) {
method collectTables (line 200) | private static void collectTables(JsonNode node, List<JsonNode> tables) {
method collectContent (line 219) | private static String collectContent(JsonNode node) {
method appendContent (line 225) | private static void appendContent(JsonNode node, StringBuilder builder) {
method normalizeText (line 260) | private static String normalizeText(String value) {
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/PageSeparatorIntegrationTest.java
class PageSeparatorIntegrationTest (line 36) | class PageSeparatorIntegrationTest {
method setUp (line 46) | @BeforeEach
method testMarkdownPageSeparatorSimple (line 54) | @Test
method testMarkdownPageSeparatorWithPageNumber (line 71) | @Test
method testMarkdownPageSeparatorEmpty (line 88) | @Test
method testTextPageSeparatorSimple (line 107) | @Test
method testTextPageSeparatorWithPageNumber (line 124) | @Test
method testTextPageSeparatorEmpty (line 141) | @Test
method testHtmlPageSeparatorSimple (line 160) | @Test
method testHtmlPageSeparatorWithPageNumber (line 177) | @Test
method testHtmlPageSeparatorEmpty (line 194) | @Test
method testConfigPageSeparatorDefaults (line 214) | @Test
method testConfigPageSeparatorSetters (line 223) | @Test
method testConfigPageNumberConstant (line 237) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/PagesOptionIntegrationTest.java
class PagesOptionIntegrationTest (line 42) | class PagesOptionIntegrationTest {
method setUp (line 57) | @BeforeEach
method testPagesOptionSinglePage (line 64) | @Test
method testPagesOptionMultiplePages (line 82) | @Test
method testPagesOptionPageRange (line 104) | @Test
method testPagesOptionMixedRangeAndSingle (line 125) | @Test
method testPagesOptionAllPages (line 147) | @Test
method testPagesOptionMarkdown (line 166) | @Test
method testPagesOptionExceedsDocumentPages (line 187) | @Test
method testPagesOptionAllPagesExceedDocument (line 208) | @Test
method testPagesOptionAllPagesExceedDocumentInHybridMode (line 227) | @Test
method testPagesOptionTaggedPdfSinglePage (line 255) | @Test
method testPagesOptionTaggedPdfMultiplePages (line 277) | @Test
method testPagesOptionTaggedPdfAllPages (line 300) | @Test
method parseJson (line 322) | private JsonNode parseJson(Path jsonPath) throws IOException {
method getPageNumbersFromKids (line 331) | private Set<Integer> getPageNumbersFromKids(JsonNode root) {
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/api/ConfigTest.java
class ConfigTest (line 26) | class ConfigTest {
method testDefaultValues (line 28) | @Test
method testSetImageOutputAffectsIsEmbedImages (line 40) | @Test
method testSetImageFormat (line 57) | @Test
method testIsValidImageFormat_withValidFormats (line 68) | @ParameterizedTest
method testIsValidImageFormat_withInvalidFormats (line 74) | @ParameterizedTest
method testIsValidImageFormat_withNull (line 80) | @Test
method testGetImageFormatOptions (line 85) | @Test
method testImageFormatConstants (line 94) | @Test
method testSetImageFormatNormalizesToLowercase (line 100) | @Test
method testSetImageFormatWithNullDefaultsToPng (line 111) | @Test
method testSetImageFormatThrowsExceptionForInvalidFormat (line 119) | @ParameterizedTest
method testSetImageOutput (line 132) | @Test
method testIsValidImageOutput_withValidModes (line 145) | @ParameterizedTest
method testIsValidImageOutput_withInvalidModes (line 151) | @ParameterizedTest
method testGetImageOutputOptions (line 157) | @Test
method testImageOutputConstants (line 166) | @Test
method testSetImageOutputNormalizesToLowercase (line 173) | @Test
method testSetImageOutputWithNullDefaultsToExternal (line 184) | @Test
method testSetImageOutputThrowsExceptionForInvalidMode (line 192) | @ParameterizedTest
method testExistingConfigFields (line 206) | @Test
method testDefaultPages (line 230) | @Test
method testSetPages_singlePage (line 237) | @Test
method testSetPages_commaSeparated (line 245) | @Test
method testSetPages_range (line 252) | @Test
method testSetPages_mixed (line 259) | @Test
method testSetPages_complexMixed (line 266) | @Test
method testSetPages_withSpaces (line 273) | @Test
method testSetPages_invalidFormat (line 280) | @ParameterizedTest
method testSetPages_nullAndEmpty (line 293) | @Test
method testSetPages_reverseRangeThrows (line 308) | @Test
method testSetPages_zeroPageThrows (line 318) | @Test
method testSetPages_negativePageThrows (line 328) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/api/FilterConfigTest.java
class FilterConfigTest (line 8) | class FilterConfigTest {
method defaultsKeepInvisibleContentFiltersEnabledButSensitiveDataDisabled (line 10) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/containers/StaticLayoutContainersTest.java
class StaticLayoutContainersTest (line 23) | class StaticLayoutContainersTest {
method setUp (line 25) | @BeforeEach
method testClearContainers_resetsEmbedImages (line 30) | @Test
method testClearContainers_resetsImageFormat (line 40) | @Test
method testSetAndGetEmbedImages (line 50) | @Test
method testSetAndGetImageFormat (line 61) | @Test
method testGetImageFormat_withNullValue_returnsDefaultPng (line 72) | @Test
method testIsEmbedImages_withNullValue_returnsFalse (line 79) | @Test
method testSetImagesDirectory (line 86) | @Test
method testIncrementImageIndex (line 94) | @Test
method testResetImageIndex (line 103) | @Test
method testCurrentContentId (line 113) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/DoclingFastServerClientTest.java
class DoclingFastServerClientTest (line 38) | class DoclingFastServerClientTest {
method setUp (line 43) | @BeforeEach
method tearDown (line 55) | @AfterEach
method testSuccessResponseHasNoFailedPages (line 61) | @Test
method testPartialSuccessResponseWithFailedPages (line 82) | @Test
method testPartialSuccessMultipleFailedPages (line 103) | @Test
method testFailureResponseThrowsIOException (line 124) | @Test
method testLegacyResponseWithoutFailedPagesField (line 141) | @Test
method testMalformedFailedPagesValues (line 161) | @Test
method testCheckAvailabilitySucceeds (line 184) | @Test
method testCheckAvailabilityFailsWhenServerUnavailable (line 192) | @Test
method testCheckAvailabilityFailsOnUnhealthyServer (line 201) | @Test
method testPartialSuccessAllPagesFailed (line 210) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/DoclingSchemaTransformerTest.java
class DoclingSchemaTransformerTest (line 41) | public class DoclingSchemaTransformerTest {
method setUp (line 46) | @BeforeEach
method testGetBackendType (line 53) | @Test
method testTransformNullJson (line 58) | @Test
method testTransformEmptyJson (line 69) | @Test
method testTransformSimpleParagraph (line 82) | @Test
method testTransformSectionHeader (line 106) | @Test
method testFilterPageHeaderFooter (line 130) | @Test
method testTransformCaption (line 164) | @Test
method testTransformFootnote (line 185) | @Test
method testTransformSimpleTable (line 206) | @Test
method testTransformTableWithSpans (line 250) | @Test
method testTransformMultiplePages (line 299) | @Test
method testCoordinateTransformBottomLeft (line 337) | @Test
method testCoordinateTransformTopLeft (line 373) | @Test
method testReadingOrderSort (line 409) | @Test
method testMixedContent (line 449) | @Test
method testTransformPage (line 494) | @Test
method testTextMissingProv (line 510) | @Test
method testTableMissingData (line 530) | @Test
method testTransformFormula (line 550) | @Test
method testTransformFormulaWithComplexLatex (line 574) | @Test
method testMixedContentWithFormula (line 598) | @Test
method testTransformPictureWithDescription (line 639) | @Test
method testTransformPictureWithoutDescription (line 669) | @Test
method testTransformMultiplePicturesWithDescriptions (line 693) | @Test
method createDoclingDocument (line 739) | private ObjectNode createDoclingDocument() {
method addProvenance (line 746) | private void addProvenance(ObjectNode node, int pageNo, double l, doub...
method addTableCell (line 758) | private void addTableCell(ArrayNode tableCells, int row, int col, int ...
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/HancomClientTest.java
class HancomClientTest (line 39) | public class HancomClientTest {
method setUp (line 47) | @BeforeEach
method tearDown (line 60) | @AfterEach
method testDefaultUrlConfiguration (line 66) | @Test
method testConvertFullWorkflow (line 74) | @Test
method testConvertWithCleanupOnProcessingError (line 118) | @Test
method testConvertWithSpecificPages (line 144) | @Test
method testUploadFailure (line 171) | @Test
method testDeleteFailureIsIgnored (line 188) | @Test
method testConvertAsync (line 213) | @Test
method createVisualInfoResponse (line 234) | private String createVisualInfoResponse() {
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/HancomSchemaTransformerTest.java
class HancomSchemaTransformerTest (line 43) | public class HancomSchemaTransformerTest {
method setUp (line 48) | @BeforeEach
method testGetBackendType (line 55) | @Test
method testTransformNullJson (line 60) | @Test
method testTransformEmptyJson (line 71) | @Test
method testTransformSimpleParagraph (line 87) | @Test
method testTransformHeading (line 108) | @Test
method testFilterPageHeaderFooter (line 129) | @Test
method testTransformFormula (line 154) | @Test
method testTransformFigure (line 175) | @Test
method testTransformSimpleTable (line 193) | @Test
method testTransformTableWithSpans (line 222) | @Test
method testTransformMultiplePages (line 252) | @Test
method testBoundingBoxTransformation (line 287) | @Test
method testReadingOrderSort (line 314) | @Test
method testMixedContent (line 343) | @Test
method testTransformListItem (line 374) | @Test
method testElementMissingBbox (line 393) | @Test
method testTransformPage (line 419) | @Test
method testTransformWithHtmlContent (line 432) | @Test
method createVisualInfoDto (line 469) | private ObjectNode createVisualInfoDto() {
method addElement (line 488) | private void addElement(ArrayNode elements, String type, String label,...
method addTableElement (line 509) | private ObjectNode addTableElement(ArrayNode elements, int pageIndex,
method addTableContentStructure (line 536) | private ArrayNode addTableContentStructure(ObjectNode tableElement) {
method addTableCell (line 544) | private void addTableCell(ArrayNode cells, String text, int row, int col,
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/HealthCheckTest.java
class HealthCheckTest (line 37) | class HealthCheckTest {
method setUp (line 41) | @BeforeEach
method tearDown (line 46) | @AfterEach
method testDoclingHealthCheckSucceeds (line 53) | @Test
method testDoclingHealthCheckFailsWhenServerDown (line 71) | @Test
method testDoclingHealthCheckFailsOnServerError (line 96) | @Test
method testHancomHealthCheckSucceeds (line 116) | @Test
method testHancomHealthCheckFailsWhenServerDown (line 132) | @Test
method testHealthCheckTimesOutQuickly (line 152) | @Test
method stripTrailingSlash (line 174) | private static String stripTrailingSlash(String url) {
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/HybridClientFactoryTest.java
class HybridClientFactoryTest (line 27) | class HybridClientFactoryTest {
method testCreateDoclingFastClient (line 29) | @Test
method testCreateDoclingFastClientCaseInsensitive (line 41) | @Test
method testCreateHancomClient (line 54) | @Test
method testCreateHancomClientCaseInsensitive (line 66) | @Test
method testCreateAzureClientThrowsUnsupported (line 79) | @Test
method testCreateGoogleClientThrowsUnsupported (line 91) | @Test
method testCreateUnknownBackendThrows (line 103) | @ParameterizedTest
method testCreateNullBackendThrows (line 117) | @Test
method testCreateEmptyBackendThrows (line 125) | @Test
method testIsSupportedDoclingFast (line 133) | @Test
method testIsSupportedHancom (line 140) | @Test
method testIsSupportedUnsupportedBackends (line 147) | @Test
method testIsSupportedNullAndEmpty (line 155) | @Test
method testGetSupportedBackends (line 161) | @Test
method testGetAllKnownBackends (line 170) | @Test
method testBackendConstants (line 180) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/TriageLoggerTest.java
class TriageLoggerTest (line 39) | public class TriageLoggerTest {
method setUp (line 44) | @BeforeEach
method testCreateTriageJsonWithEmptyResults (line 50) | @Test
method testCreateTriageJsonWithResults (line 64) | @Test
method testToJsonString (line 119) | @Test
method testLogToWriter (line 133) | @Test
method testLogToFile (line 150) | @Test
method testPageOrdering (line 179) | @Test
method testDifferentHybridBackends (line 202) | @Test
method testSummaryWithAllJavaPages (line 219) | @Test
method testSummaryWithAllBackendPages (line 236) | @Test
method testDefaultFilename (line 253) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/TriageProcessorIntegrationTest.java
class TriageProcessorIntegrationTest (line 46) | public class TriageProcessorIntegrationTest {
method checkBenchmarkDir (line 75) | @BeforeAll
method testTriageAccuracyOnBenchmarkPDFs (line 94) | @Test
method triageDocument (line 179) | private TriageDecision triageDocument(File pdfFile) throws IOException {
method testSingleDocumentTriage (line 211) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/TriageProcessorTest.java
class TriageProcessorTest (line 46) | public class TriageProcessorTest {
method setUp (line 48) | @BeforeEach
method testEmptyContentReturnsJava (line 57) | @Test
method testNullContentReturnsJava (line 69) | @Test
method testSimpleTextReturnsJava (line 77) | @Test
method testHighLineRatioReturnsBackend (line 92) | @Test
method testTableBorderPresenceReturnsBackend (line 110) | @Test
method testSuspiciousPatternDetectedButDisabled (line 136) | @Test
method testAlignedLineGroupsDetectedButDisabled (line 152) | @Test
method testTriageAllPagesWithMap (line 181) | @Test
method testTriageAllPagesWithList (line 205) | @Test
method testCustomThresholds (line 224) | @Test
method testOutOfReadingOrderReturnsBackend (line 245) | @Test
method testTriageSignalsEmpty (line 262) | @Test
method testTriageResultFactoryMethods (line 274) | @Test
method testThresholdsGettersAndSetters (line 289) | @Test
method testExtractSignalsDirectly (line 311) | @Test
method testClassifyPageHighReplacementRatioRoutesToBackend (line 325) | @Test
method testClassifyPageLowReplacementRatioNoEffect (line 339) | @Test
method testClassifyPageExactThresholdRoutesToBackend (line 352) | @Test
method createTextChunk (line 368) | private TextChunk createTextChunk(double leftX, double bottomY, double...
method createLineChunk (line 375) | private LineChunk createLineChunk(double x1, double y1, double x2, dou...
method setupTableBorderRows (line 379) | private void setupTableBorderRows(TableBorder tableBorder) {
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/json/serializers/ImageSerializerTest.java
class ImageSerializerTest (line 39) | class ImageSerializerTest {
method setUp (line 47) | @BeforeEach
method tearDown (line 64) | @AfterEach
method createTestImageFile (line 69) | private void createTestImageFile(int index, String format) throws IOEx...
method createImageChunk (line 81) | private ImageChunk createImageChunk(int index) {
method testSerializeWithEmbedImagesTrueOutputsDataField (line 88) | @Test
method testSerializeWithEmbedImagesFalseOutputsSourceField (line 101) | @Test
method testSerializeWithJpegFormat (line 114) | @Test
method testSerializeWithNonExistentImageNoSourceOrData (line 127) | @Test
method testSerializeContainsTypeField (line 138) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/json/serializers/LineArtSerializerTest.java
class LineArtSerializerTest (line 29) | class LineArtSerializerTest {
method lineArtChunkIsNotSerializedAsImage (line 31) | @Test
method tableCellSerializerSkipsLineArtChunkChildren (line 45) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/markdown/MarkdownGeneratorTest.java
class MarkdownGeneratorTest (line 32) | public class MarkdownGeneratorTest {
method testValidHeadingLevels (line 37) | @ParameterizedTest
method testHeadingLevelsCappedAt6 (line 49) | @ParameterizedTest
method testHeadingLevelsMinimumIs1 (line 61) | @ParameterizedTest
method testMaxHeadingLevelIs6 (line 73) | @Test
method testMinHeadingLevelIs1 (line 83) | @Test
method generateHeadingPrefix (line 97) | private String generateHeadingPrefix(int headingLevel) {
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/markdown/MarkdownTableTest.java
class MarkdownTableTest (line 52) | public class MarkdownTableTest {
method initStaticContainers (line 57) | @BeforeAll
method testKoreanSpecialTableMergedRow (line 73) | @Test
method testColspanCellsAreNotDuplicated (line 125) | @Test
method testSimpleTableWithoutMergedCells (line 174) | @Test
method testRowspanCellsAreNotDuplicated (line 205) | @Test
method addTextContent (line 244) | private void addTextContent(TableBorderCell cell, String text) {
method generateMarkdownTable (line 253) | private String generateMarkdownTable(TableBorder table) throws IOExcep...
method countOccurrences (line 268) | private long countOccurrences(String str, String sub) {
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/CaptionProcessorTest.java
class CaptionProcessorTest (line 33) | public class CaptionProcessorTest {
method testProcessCaptions (line 35) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/CidFontDetectionTest.java
class CidFontDetectionTest (line 46) | public class CidFontDetectionTest {
method checkFixture (line 53) | @BeforeAll
method testCidPdfHighReplacementRatioDetected (line 62) | @Test
method testCidPdfWarningLogEmitted (line 90) | @Test
method testBoundaryBelowThreshold29percent (line 134) | @Test
method testBoundaryAtThreshold30percent (line 150) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/ContentFilterProcessorTest.java
class ContentFilterProcessorTest (line 27) | public class ContentFilterProcessorTest {
method testShortTextWithAbnormallyWideBoundingBox (line 44) | @Test
method testNormalTextWidthNotAbnormal (line 71) | @Test
method testLongTextNotTargetedForCorrection (line 96) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/HeaderFooterProcessorTest.java
class HeaderFooterProcessorTest (line 33) | public class HeaderFooterProcessorTest {
method testProcessHeadersAndFooters (line 35) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/HeadingProcessorTest.java
class HeadingProcessorTest (line 32) | public class HeadingProcessorTest {
method testProcessHeadings (line 34) | @Test
method testDetectHeadingsLevels (line 54) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/HybridDocumentProcessorTest.java
class HybridDocumentProcessorTest (line 40) | public class HybridDocumentProcessorTest {
method testHybridModeEnabled (line 42) | @Test
method testHybridModeDisabled (line 51) | @Test
method testHybridModeDefaultIsOff (line 60) | @Test
method testHybridConfigDefaults (line 68) | @Test
method testHybridConfigEffectiveUrl (line 78) | @Test
method testTriageResultFilterByDecision (line 90) | @Test
method testPageNumberConversion (line 125) | @Test
method testShouldProcessPageWithNullFilter (line 144) | @Test
method testShouldProcessPageWithFilter (line 152) | @Test
method testInvalidHybridBackendThrows (line 167) | @Test
method testHybridConfigTimeout (line 175) | @Test
method testHybridConfigMaxConcurrentRequests (line 189) | @Test
method testHybridConfigFallbackToggle (line 200) | @Test
method shouldProcessPage (line 215) | private static boolean shouldProcessPage(int pageNumber, Set<Integer> ...
method testOutputFormatApiValue (line 221) | @Test
method testHybridRequestDefaultOutputFormats (line 228) | @Test
method testHybridRequestWithJsonOnly (line 244) | @Test
method testHybridRequestWithMarkdownOnly (line 258) | @Test
method testHybridRequestEmptyFormatsFallsBackToAll (line 272) | @Test
method testHybridRequestNullFormatsFallsBackToAll (line 286) | @Test
method testHybridRequestWithHtmlOnly (line 299) | @Test
method testHybridConfigModeDefaults (line 317) | @Test
method testHybridConfigModeFullMode (line 325) | @Test
method testDoclingBackendEnabled (line 334) | @Test
method testDoclingEffectiveUrl (line 343) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/LevelProcessorTest.java
class LevelProcessorTest (line 37) | public class LevelProcessorTest {
method testDetectLevelsForParagraphs (line 39) | @Test
method testDetectLevelsForLists (line 62) | @Test
method testDetectLevelsForTables (line 109) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/ListProcessorTest.java
class ListProcessorTest (line 33) | public class ListProcessorTest {
method testProcessLists (line 35) | @Test
method testProcessListsFromTextNodes (line 51) | @Test
method testCheckNeighborLists (line 70) | @Test
method testProcessListsWithSingleCharacterLabels (line 104) | @Test
method testProcessListsWithEdgeCaseLabels (line 127) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/ParagraphProcessorTest.java
class ParagraphProcessorTest (line 30) | public class ParagraphProcessorTest {
method testProcessParagraphs (line 32) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/SpecialTableProcessorTest.java
class SpecialTableProcessorTest (line 32) | public class SpecialTableProcessorTest {
method testDetectSpecialTables (line 34) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/StrikethroughProcessorTest.java
class StrikethroughProcessorTest (line 30) | public class StrikethroughProcessorTest {
method setUp (line 32) | @BeforeEach
method testStrikethroughDetected (line 39) | @Test
method testUnderlineNotDetectedAsStrikethrough (line 59) | @Test
method testLineAboveTextNotDetected (line 78) | @Test
method testPartialHorizontalOverlapNotDetected (line 97) | @Test
method testNoLinesNoChange (line 116) | @Test
method testVerticalLineIgnored (line 130) | @Test
method testDoubleWrappingPrevented (line 149) | @Test
method testWideLineSpanningMultipleChunksRejected (line 168) | @Test
method testLineMuchWiderThanTextRejected (line 193) | @Test
method testThickLineRejectedAsBackgroundFill (line 213) | @Test
method testThinLineAcceptedAsStrikethrough (line 234) | @Test
method testIsStrikethroughLineAtExactCenter (line 246) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/TableBorderProcessorTest.java
class TableBorderProcessorTest (line 41) | public class TableBorderProcessorTest {
method testProcessTableBorders (line 43) | @Test
method testCheckNeighborTables (line 98) | @Test
method testNormalSmallTableDoesNotTriggerStructuralNormalization (line 153) | @Test
method testUndersegmentedFiveColumnTableIsRebuiltFromRawPageContents (line 179) | @Test
method testNormalizationKeepsOriginalTableWhenRebuildLosesColumns (line 214) | @Test
method testTextBlockTableIsNeverNormalized (line 234) | @Test
method testProcessTableBordersDepthLimitNoStackOverflow (line 261) | @Test
method testProcessTableBordersNormalNestedTableProcessedCorrectly (line 297) | @Test
method createSimpleTable (line 329) | private TableBorder createSimpleTable(int pageNumber, double leftX, do...
method createTable (line 334) | private TableBorder createTable(int pageNumber, double leftX, double b...
method populateOriginalTableContents (line 362) | private void populateOriginalTableContents(TableBorder table) {
method getSingleResultTable (line 373) | private TableBorder getSingleResultTable(List<IObject> contents, int p...
method createTextChunk (line 380) | private TextChunk createTextChunk(int pageNumber, double leftX, double...
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/TextLineProcessorTest.java
class TextLineProcessorTest (line 29) | public class TextLineProcessorTest {
method testProcessTextLines (line 31) | @Test
method testProcessTextLinesSortsChunksByLeftX (line 57) | @Test
method testProcessTextLinesAddsSpacesBetweenDistantChunks (line 90) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/TextProcessorTest.java
class TextProcessorTest (line 28) | public class TextProcessorTest {
method testReplaceUndefinedCharacters (line 30) | @Test
method testReplaceUndefinedCharactersSkipsWhenDefault (line 45) | @Test
method testReplaceUndefinedCharactersMultipleOccurrences (line 58) | @Test
method testReplaceUndefinedCharactersWithRegexSpecialChars (line 69) | @Test
method testReplaceUndefinedCharactersSkipsNonTextChunks (line 81) | @Test
method testRemoveSameTextChunks (line 94) | @Test
method testRemoveTextDecorationImages (line 106) | @Test
method testMergeCloseTextChunksSeparatedByLargeGapNotMerged (line 122) | @Test
method testMergeCloseTextChunksAdjacentMerged (line 156) | @Test
method testMeasureReplacementCharRatioAllReplacement (line 189) | @Test
method testMeasureReplacementCharRatioNoReplacement (line 199) | @Test
method testMeasureReplacementCharRatioMixed (line 209) | @Test
method testMeasureReplacementCharRatioEmptyContents (line 220) | @Test
method testMeasureReplacementCharRatioNonTextChunksIgnored (line 228) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/readingorder/XYCutPlusPlusSorterTest.java
class XYCutPlusPlusSorterTest (line 36) | class XYCutPlusPlusSorterTest {
method setUp (line 38) | @BeforeEach
method sort_nullList_returnsNull (line 46) | @Test
method sort_emptyList_returnsEmpty (line 52) | @Test
method sort_singleObject_returnsSame (line 58) | @Test
method sort_singleColumn_topToBottom (line 69) | @Test
method identifyCrossLayoutElements_wideHeader_detected (line 88) | @Test
method identifyCrossLayoutElements_narrowElements_notDetected (line 108) | @Test
method identifyCrossLayoutElements_wideButNoOverlaps_notDetected (line 121) | @Test
method hasMinimumOverlaps_sufficientOverlaps_returnsTrue (line 134) | @Test
method hasMinimumOverlaps_insufficientOverlaps_returnsFalse (line 147) | @Test
method computeDensityRatio_denseLayout_highRatio (line 161) | @Test
method computeDensityRatio_sparseLayout_lowRatio (line 174) | @Test
method computeDensityRatio_emptyList_defaultRatio (line 187) | @Test
method splitByHorizontalCut_validCut_correctGroups (line 196) | @Test
method splitByVerticalCut_validCut_correctGroups (line 211) | @Test
method sort_twoColumns_leftColumnFirst (line 228) | @Test
method sort_twoColumnsWithHeader_headerFirst (line 249) | @Test
method sort_headerAndFooter_correctPositions (line 273) | @Test
method sort_horizontalSections_largerYGap_horizontalCutFirst (line 294) | @Test
method sort_withCustomParameters_respectsParameters (line 319) | @Test
method calculateBoundingRegion_multipleObjects_correctBounds (line 336) | @Test
method calculateTotalArea_multipleObjects_sumOfAreas (line 351) | @Test
method mergeCrossLayoutElements_emptyCrossLayout_returnsSortedMain (line 364) | @Test
method mergeCrossLayoutElements_crossLayoutAtTop_insertsFirst (line 377) | @Test
method sort_academicPaperTwoColumn_correctReadingOrder (line 415) | @Test
method sort_twoColumnsOverlappingY_leftColumnFirst (line 501) | @Test
method sort_twoColumnsWithNarrowBridge_leftColumnFirst (line 546) | @Test
method sort_1901_03003_moran_paper_correctReadingOrder (line 589) | @Test
method findPosition (line 694) | private int findPosition(List<IObject> objects, String text) {
method createTextLineWithId (line 703) | private IObject createTextLineWithId(double leftX, double topY, double...
method createTextLine (line 711) | private IObject createTextLine(double leftX, double topY, double right...
method getText (line 720) | private String getText(IObject obj) {
method sort_noStackOverflowWithComplexLayout_issue179 (line 757) | @Test
method sort_wideAndNarrowObjects_noInfiniteRecursion (line 786) | @Test
method sort_manySmallGaps_noInfiniteRecursion (line 825) | @Test
method sort_horizontalGapWithCentersOnOneSide_noInfiniteRecursion (line 846) | @Test
method sort_issue179_regressionTest (line 889) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/regression/ToUnicodeRegressionTest.java
class ToUnicodeRegressionTest (line 35) | class ToUnicodeRegressionTest {
method testIssue166ToUnicodeIntervalByteCarry (line 50) | @Test
method testIssue166ToUnicodeIntervalByteCarryAtLowBoundary (line 70) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/utils/Base64ImageUtilsTest.java
class Base64ImageUtilsTest (line 31) | class Base64ImageUtilsTest {
method testToDataUri_withPngFormat (line 36) | @Test
method testToDataUri_withJpegFormat (line 53) | @Test
method testToDataUri_withNonExistentFile (line 68) | @Test
method testGetMimeType_withValidFormats (line 80) | @ParameterizedTest
method testGetMimeType_withNullFormat (line 93) | @Test
method testGetMimeType_withUnknownFormat (line 98) | @Test
method testMaxEmbeddedImageSizeConstant (line 107) | @Test
method testToDataUriWithImageAtSizeLimit (line 113) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/utils/ContentSanitizerTest.java
class ContentSanitizerTest (line 17) | class ContentSanitizerTest {
method setUp (line 20) | @BeforeEach
method createTextChunk (line 26) | TextChunk createTextChunk(String value, double left, double bottom, do...
method assertChunksContainValues (line 33) | private void assertChunksContainValues(List<TextChunk> chunks, String....
method testMultipleReplacementsInSingleChunk (line 43) | @Test
method testReplaceCoveringMultipleFullChunks (line 56) | @Test
method testReplaceCoveringPartsOfChunks (line 76) | @Test
method testReplaceCoveringOneFullChunkInArray (line 95) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/utils/ImageFormatSupportTest.java
class ImageFormatSupportTest (line 38) | class ImageFormatSupportTest {
method createTestImage (line 46) | private BufferedImage createTestImage() {
method testPngFormatIsSupported (line 61) | @Test
method testJpegFormatIsSupported (line 73) | @Test
method testWebpFormatIsNotSupported (line 85) | @Test
method testListAvailableWriterFormats (line 96) | @Test
method testStandardFormatsAreSupported (line 107) | @ParameterizedTest
method testUnsupportedFormatReturnsFalse (line 119) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/utils/ImagesUtilsTest.java
class ImagesUtilsTest (line 31) | class ImagesUtilsTest {
method testCreateImagesDirectory (line 33) | @Test
method testWriteImageInitializesContrastRatioConsumer (line 67) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/utils/ModeWeightStatisticsTest.java
class ModeWeightStatisticsTest (line 8) | class ModeWeightStatisticsTest {
method getModeReturnsMostFrequentScoreWithinRange (line 10) | @Test
method getModeReturnsNaNWhenNoScoresWithinRange (line 25) | @Test
method getBoostGivesFractionalRankForScoresAboveMode (line 37) | @Test
FILE: java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/utils/TextNodeStatisticsTest.java
class TextNodeStatisticsTest (line 9) | class TextNodeStatisticsTest {
method fontSizeRarityBoostUsesRelativeRankOfScoresAboveBodyMode (line 11) | @Test
method fontWeightRarityBoostUsesDominantWeightWindow (line 37) | @Test
class StubSemanticTextNode (line 63) | private static class StubSemanticTextNode extends SemanticTextNode {
method StubSemanticTextNode (line 67) | StubSemanticTextNode(double fontSize, double fontWeight) {
method getFontSize (line 72) | @Override
method getFontWeight (line 77) | @Override
FILE: java/opendataloader-pdf-core/src/test/resources/generate-cid-test-pdf.py
function find_ttf_font (line 31) | def find_ttf_font():
function read_ttf_tables (line 46) | def read_ttf_tables(font_path):
function build_pdf_with_real_font (line 53) | def build_pdf_with_real_font(output_path, font_path):
function main (line 312) | def main():
FILE: node/opendataloader-pdf/src/cli-options.generated.ts
function registerCliOptions (line 9) | function registerCliOptions(program: Command): void {
FILE: node/opendataloader-pdf/src/cli.ts
function createProgram (line 7) | function createProgram(): Command {
function main (line 33) | async function main(): Promise<number> {
FILE: node/opendataloader-pdf/src/convert-options.generated.ts
type ConvertOptions (line 7) | interface ConvertOptions {
type CliOptions (line 63) | interface CliOptions {
function buildConvertOptions (line 94) | function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {
function buildArgs (line 179) | function buildArgs(options: ConvertOptions): string[] {
FILE: node/opendataloader-pdf/src/index.ts
constant JAR_NAME (line 15) | const JAR_NAME = 'opendataloader-pdf-cli.jar';
type JarExecutionOptions (line 17) | interface JarExecutionOptions {
function executeJar (line 21) | function executeJar(args: string[], executionOptions: JarExecutionOption...
function convert (line 83) | function convert(
type RunOptions (line 108) | interface RunOptions {
function run (line 127) | function run(inputPath: string, options: RunOptions = {}): Promise<strin...
FILE: python/opendataloader-pdf/hatch_build.py
class CustomBuildHook (line 10) | class CustomBuildHook(BuildHookInterface):
method initialize (line 11) | def initialize(self, version, build_data):
FILE: python/opendataloader-pdf/src/opendataloader_pdf/cli_options_generated.py
function add_options_to_parser (line 240) | def add_options_to_parser(parser) -> None:
FILE: python/opendataloader-pdf/src/opendataloader_pdf/convert_generated.py
function convert (line 12) | def convert(
FILE: python/opendataloader-pdf/src/opendataloader_pdf/hybrid_server.py
function build_conversion_response (line 81) | def build_conversion_response(
function sanitize_unicode (line 148) | def sanitize_unicode(data: Any) -> Any:
function _get_loop_setting (line 174) | def _get_loop_setting() -> str:
function _check_dependencies (line 185) | def _check_dependencies():
function create_converter (line 211) | def create_converter(
function create_app (line 274) | def create_app(
function main (line 448) | def main():
FILE: python/opendataloader-pdf/src/opendataloader_pdf/runner.py
function run_jar (line 14) | def run_jar(args: List[str], quiet: bool = False) -> str:
FILE: python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py
function run (line 16) | def run(
function main (line 92) | def main(argv=None) -> int:
FILE: python/opendataloader-pdf/tests/conftest.py
function input_pdf (line 8) | def input_pdf():
function output_dir (line 13) | def output_dir():
FILE: python/opendataloader-pdf/tests/test_cli_options.py
class TestCLIOptions (line 7) | class TestCLIOptions:
method test_cli_options_is_list (line 10) | def test_cli_options_is_list(self):
method test_cli_options_not_empty (line 14) | def test_cli_options_not_empty(self):
method test_each_option_has_required_fields (line 18) | def test_each_option_has_required_fields(self):
method test_option_types_are_valid (line 33) | def test_option_types_are_valid(self):
method test_python_name_is_snake_case (line 39) | def test_python_name_is_snake_case(self):
method test_known_options_exist (line 44) | def test_known_options_exist(self):
method test_sanitize_option_exists (line 60) | def test_sanitize_option_exists(self):
class TestAddOptionsToParser (line 69) | class TestAddOptionsToParser:
method test_adds_all_options (line 72) | def test_adds_all_options(self):
method test_boolean_options_default_to_false (line 87) | def test_boolean_options_default_to_false(self):
method test_string_options_default_to_none (line 100) | def test_string_options_default_to_none(self):
method test_short_options_work (line 113) | def test_short_options_work(self):
method test_long_options_work (line 132) | def test_long_options_work(self):
FILE: python/opendataloader-pdf/tests/test_convert_integration.py
function test_convert_generates_output (line 6) | def test_convert_generates_output(input_pdf, output_dir):
FILE: python/opendataloader-pdf/tests/test_hybrid_server.py
function test_gpu_detected_logging (line 8) | def test_gpu_detected_logging(caplog):
function test_no_gpu_logging (line 38) | def test_no_gpu_logging(caplog):
function test_no_pytorch_logging (line 57) | def test_no_pytorch_logging(caplog):
function test_get_loop_setting_returns_asyncio_on_windows (line 75) | def test_get_loop_setting_returns_asyncio_on_windows():
function test_get_loop_setting_returns_auto_on_non_windows (line 83) | def test_get_loop_setting_returns_auto_on_non_windows():
FILE: python/opendataloader-pdf/tests/test_hybrid_server_nonblocking.py
function mock_docling (line 18) | def mock_docling():
function app_with_converter (line 58) | def app_with_converter(mock_docling):
function test_convert_runs_in_thread_pool (line 71) | async def test_convert_runs_in_thread_pool(app_with_converter, mock_docl...
function test_health_responds_during_conversion (line 104) | async def test_health_responds_during_conversion(app_with_converter):
FILE: python/opendataloader-pdf/tests/test_hybrid_server_partial_success.py
class TestBuildConversionResponse (line 13) | class TestBuildConversionResponse:
method test_success_status (line 16) | def test_success_status(self):
method test_partial_success_status (line 29) | def test_partial_success_status(self):
method test_partial_success_multiple_failed_pages (line 42) | def test_partial_success_multiple_failed_pages(self):
method test_partial_success_no_page_range_with_total_pages (line 57) | def test_partial_success_no_page_range_with_total_pages(self):
method test_partial_success_no_page_range_fallback (line 71) | def test_partial_success_no_page_range_fallback(self):
method test_success_no_errors_field (line 83) | def test_success_no_errors_field(self):
method test_document_field_present (line 94) | def test_document_field_present(self):
method test_partial_success_first_page_failed_with_page_range (line 106) | def test_partial_success_first_page_failed_with_page_range(self):
method test_partial_success_last_page_failed_with_page_range (line 117) | def test_partial_success_last_page_failed_with_page_range(self):
method test_partial_success_all_pages_failed (line 128) | def test_partial_success_all_pages_failed(self):
method test_partial_success_all_pages_failed_with_total_pages (line 140) | def test_partial_success_all_pages_failed_with_total_pages(self):
method test_failure_status_no_failed_pages_detection (line 153) | def test_failure_status_no_failed_pages_detection(self):
method test_partial_success_missing_pages_key (line 165) | def test_partial_success_missing_pages_key(self):
FILE: python/opendataloader-pdf/tests/test_hybrid_server_unicode.py
class TestSanitizeUnicode (line 15) | class TestSanitizeUnicode:
method test_lone_surrogate_replaced (line 18) | def test_lone_surrogate_replaced(self):
method test_all_surrogate_range_replaced (line 25) | def test_all_surrogate_range_replaced(self):
method test_null_character_replaced (line 31) | def test_null_character_replaced(self):
method test_nested_dict_sanitized (line 38) | def test_nested_dict_sanitized(self):
method test_list_sanitized (line 45) | def test_list_sanitized(self):
method test_clean_data_unchanged (line 53) | def test_clean_data_unchanged(self):
method test_non_string_values_preserved (line 59) | def test_non_string_values_preserved(self):
method test_sanitized_output_json_serializable (line 65) | def test_sanitized_output_json_serializable(self):
method test_mixed_valid_and_invalid_unicode (line 80) | def test_mixed_valid_and_invalid_unicode(self):
FILE: scripts/experiments/docling_baseline_bench.py
function convert_pdf (line 29) | def convert_pdf(pdf_path: Path) -> dict:
function main (line 51) | def main():
FILE: scripts/experiments/docling_fastapi_bench.py
function run_server (line 38) | def run_server():
function convert_pdf (line 111) | def convert_pdf(pdf_path: Path) -> dict:
function wait_for_server (line 137) | def wait_for_server(max_retries=60, delay=1.0):
function main (line 150) | def main():
FILE: scripts/experiments/docling_speed_report.py
function load_results (line 19) | def load_results(filename: str) -> dict | None:
function main (line 28) | def main():
FILE: scripts/experiments/docling_subprocess_bench.py
function convert_pdf (line 123) | def convert_pdf(process: subprocess.Popen, pdf_path: Path) -> dict:
function main (line 165) | def main():
FILE: scripts/generate-options.mjs
constant ROOT_DIR (line 15) | const ROOT_DIR = join(__dirname, '..');
constant AUTO_GENERATED_HEADER (line 21) | const AUTO_GENERATED_HEADER = `// AUTO-GENERATED FROM options.json - DO ...
constant AUTO_GENERATED_HEADER_PYTHON (line 25) | const AUTO_GENERATED_HEADER_PYTHON = `# AUTO-GENERATED FROM options.json...
constant AUTO_GENERATED_HEADER_MDX (line 29) | const AUTO_GENERATED_HEADER_MDX = `{/* AUTO-GENERATED FROM options.json ...
function toCamelCase (line 37) | function toCamelCase(str) {
function toSnakeCase (line 44) | function toSnakeCase(str) {
constant LIST_OPTIONS (line 51) | const LIST_OPTIONS = new Set(['format', 'content-safety-off']);
function isListOption (line 56) | function isListOption(opt) {
function escapeString (line 67) | function escapeString(str, quote = "'", { escapePercent = false } = {}) {
function generateNodeCliOptions (line 83) | function generateNodeCliOptions() {
function generateNodeConvertOptions (line 112) | function generateNodeConvertOptions() {
function generatePythonCliOptions (line 225) | function generatePythonCliOptions() {
function generatePythonConvert (line 282) | function generatePythonConvert() {
function generatePythonConvertOptionsMdx (line 374) | function generatePythonConvertOptionsMdx() {
function generateNodeConvertOptionsMdx (line 418) | function generateNodeConvertOptionsMdx() {
function generateOptionsReferenceMdx (line 459) | function generateOptionsReferenceMdx() {
FILE: scripts/generate-schema.mjs
constant ROOT_DIR (line 14) | const ROOT_DIR = join(__dirname, '..');
constant AUTO_GENERATED_HEADER_MDX (line 20) | const AUTO_GENERATED_HEADER_MDX = `{/* AUTO-GENERATED FROM schema.json -...
function formatType (line 28) | function formatType(prop) {
function isRequired (line 65) | function isRequired(propName, requiredList) {
function generateJsonSchemaMdx (line 72) | function generateJsonSchemaMdx() {
FILE: scripts/utils.mjs
function escapeMarkdown (line 10) | function escapeMarkdown(str) {
function formatTable (line 28) | function formatTable(headers, rows) {
Condensed preview — 287 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,939K chars).
[
{
"path": ".editorconfig",
"chars": 187,
"preview": "root = true\n\n[*]\ncharset = utf-8\nindent_style = space\nindent_size = 4\nend_of_line = lf\ninsert_final_newline = true\ntrim_"
},
{
"path": ".gitattributes",
"chars": 272,
"preview": "# Unify all text files to LF line endings\n* text eol=lf\n\n# Binary files should not have line ending conversions\n*.exe bi"
},
{
"path": ".github/CODEOWNERS",
"chars": 163,
"preview": "# Owner for all documents\n*.md @bdoubrov @hnc-sujicho\n\n# Default owners for everything else in the repository\n* @MaximPl"
},
{
"path": ".github/ISSUE_TEMPLATE/bug_report.md",
"chars": 491,
"preview": "---\nname: Bug report\nabout: Report an issue\ntitle: \"\"\nlabels: bug\nassignees: \"\"\n---\n\n### Bug\n\n<!-- Describe the buggy be"
},
{
"path": ".github/ISSUE_TEMPLATE/config.yml",
"chars": 28,
"preview": "blank_issues_enabled: false\n"
},
{
"path": ".github/ISSUE_TEMPLATE/feature_request.md",
"chars": 414,
"preview": "---\nname: Feature request\nabout: Suggest an idea\ntitle: \"\"\nlabels: enhancement\nassignees: \"\"\n---\n\n### Requested feature\n"
},
{
"path": ".github/ISSUE_TEMPLATE/question.md",
"chars": 319,
"preview": "---\nname: Question\nabout: Ask a question\ntitle: \"\"\nlabels: question\nassignees: \"\"\n---\n\n### Question\n\n<!-- Describe what "
},
{
"path": ".github/PULL_REQUEST_TEMPLATE.md",
"chars": 794,
"preview": "<!-- Thank you for your contribution! -->\n\n<!-- STEPS TO FOLLOW:\n 1. Add a description of the changes (frequently the s"
},
{
"path": ".github/SECURITY.md",
"chars": 973,
"preview": "# Security Policy\n\n## Reporting a Vulnerability\n\nIf you think you've identified a security issue in the project reposito"
},
{
"path": ".github/workflows/release.yml",
"chars": 5116,
"preview": "name: Release\n\non:\n push:\n tags:\n - 'v*'\n workflow_dispatch:\n\njobs:\n release:\n runs-on: ubuntu-latest\n "
},
{
"path": ".github/workflows/sync-docs.yml",
"chars": 1460,
"preview": "# Sync documentation to homepage repository on release\n#\n# Required Setup:\n# 1. Create a GitHub Personal Access Token (P"
},
{
"path": ".github/workflows/test-benchmark.yml",
"chars": 2254,
"preview": "name: Test & Benchmark\n\non:\n pull_request:\n branches: [main]\n paths:\n - 'java/**'\n - 'python/**'\n "
},
{
"path": ".gitignore",
"chars": 793,
"preview": "*.class\n\n# OS specific files\n.DS_Store\nThumbs.db\n\n# IDE / Editor files\n**/.idea/\n**/.vscode/\n*.iml\n*.ipr\n*.iws\n\n# Emacs\n"
},
{
"path": "CHANGELOG.md",
"chars": 41,
"preview": "# Changelog\n\n## 0.1.0\n\n- Initial release."
},
{
"path": "CLAUDE.md",
"chars": 1016,
"preview": "# CLAUDE.md\n\n## Gotchas\n\nAfter changing CLI options in Java, **must** run `npm run sync` — this regenerates `options.jso"
},
{
"path": "CODE_OF_CONDUCT.md",
"chars": 5505,
"preview": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participa"
},
{
"path": "CONTRIBUTING.md",
"chars": 3491,
"preview": "# Contributing to This Project\n\nThank you for your interest in contributing! \nWe welcome contributions from everyone. T"
},
{
"path": "LICENSE",
"chars": 11358,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "LICENSE_TEMPLATE/license.txt",
"chars": 557,
"preview": "Copyright 2025-2026 Hancom Inc.\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this fi"
},
{
"path": "NOTICE",
"chars": 1378,
"preview": "OpenDataLoader PDF\nCopyright 2025-2026 Hancom, Inc.\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou"
},
{
"path": "README.md",
"chars": 32782,
"preview": "<!-- AI-AGENT-SUMMARY\nname: opendataloader-pdf\ncategory: PDF data extraction, PDF accessibility automation\nlicense: Apac"
},
{
"path": "SUPPORT.md",
"chars": 1157,
"preview": "# Support\n\nThis project uses GitHub Issues to track bugs and feature requests. Please search the existing\nissues before "
},
{
"path": "THIRD_PARTY/THIRD_PARTY_LICENSES.md",
"chars": 25191,
"preview": "# THIRD-PARTY LICENSES\n\nThis project includes third-party libraries and components, licensed under their respective open"
},
{
"path": "THIRD_PARTY/THIRD_PARTY_NOTICES.md",
"chars": 17411,
"preview": "\n\n# THIRD-PARTY NOTICES (Copyright & Attributions)\n\nCopyright © 2025-2026 Hancom, Inc. All rights reserved.\n\nBelow are c"
},
{
"path": "THIRD_PARTY/licenses/BSD-2-Clause.txt",
"chars": 1243,
"preview": "BSD Two Clause License\n======================\n\nRedistribution and use in source and binary forms, with or without modifi"
},
{
"path": "THIRD_PARTY/licenses/BSD-3-Clause.txt",
"chars": 1542,
"preview": "BSD 3-clause \"New\" or \"Revised\" License\n\nCopyright (c) <YEAR>, <OWNER>\nAll rights reserved.\n\nRedistribution and use in s"
},
{
"path": "THIRD_PARTY/licenses/Blue-Oak-1.0.0.txt",
"chars": 1602,
"preview": "Blue Oak Model License\n======================\n\nVersion 1.0.0\n\nPurpose\n-------\n\nThis license gives everyone as much permi"
},
{
"path": "THIRD_PARTY/licenses/CDDL-1.1.txt",
"chars": 18255,
"preview": "COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1\n========================================================="
},
{
"path": "THIRD_PARTY/licenses/EDL-1.0.txt",
"chars": 1620,
"preview": "Eclipse Distribution License - v 1.0\n====================================\n\nCopyright (c) 2007, Eclipse Foundation, Inc. "
},
{
"path": "THIRD_PARTY/licenses/EPL-1.0.txt",
"chars": 11278,
"preview": "Eclipse Public License - v 1.0\n==============================\n\nTHE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF T"
},
{
"path": "THIRD_PARTY/licenses/EPL-2.0.txt",
"chars": 14363,
"preview": "Eclipse Public License - v 2.0\n==============================\n\nTHE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF T"
},
{
"path": "THIRD_PARTY/licenses/ISC.txt",
"chars": 789,
"preview": "ISC License (ISCL)\n==================\n\nCopyright (c) 4-digit year, Company or Person's Name\n\nPermission to use, copy, mo"
},
{
"path": "THIRD_PARTY/licenses/LICENSE-JJ2000.txt",
"chars": 1782,
"preview": "This software module was originally developed by Raphaël Grosbois and\nDiego Santa Cruz (Swiss Federal Institute of Techn"
},
{
"path": "THIRD_PARTY/licenses/MIT.txt",
"chars": 1078,
"preview": "MIT License\n\nCopyright (c) <year> <copyright holders>\n\nPermission is hereby granted, free of charge, to any person obtai"
},
{
"path": "THIRD_PARTY/licenses/MPL-2.0.txt",
"chars": 15807,
"preview": "Mozilla Public License\nVersion 2.0\n======================\n\n\n1. Definitions\n--------------\n\n 1.1. \"Contributor\"\n\n means"
},
{
"path": "THIRD_PARTY/licenses/PSF-2.0.txt",
"chars": 2551,
"preview": "PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2\n============================================\n\n-----------------------------"
},
{
"path": "THIRD_PARTY/licenses/Plexus Classworlds License.txt",
"chars": 1937,
"preview": "Plexus Classworlds License\n==========================\n\nCopyright 2002 (C) The Codehaus. All Rights Reserved.\n\nRedistribu"
},
{
"path": "build-scripts/fetch_shaded_jar.py",
"chars": 3062,
"preview": "\"\"\"\nFinds and copies the latest shaded JAR from the Java build to the Python package source.\n\nThis script is intended to"
},
{
"path": "build-scripts/set_version.py",
"chars": 1747,
"preview": "# build-scripts/set_version.py\n\nimport os\nimport re\nimport sys\n\ndef set_version(version_file, pom_file, pyproject_toml_f"
},
{
"path": "content/docs/_generated/node-convert-options.mdx",
"chars": 5586,
"preview": "---\ntitle: Node.js Convert Options\ndescription: Options for the Node.js convert function\n---\n\n{/* AUTO-GENERATED FROM op"
},
{
"path": "content/docs/_generated/python-convert-options.mdx",
"chars": 5839,
"preview": "---\ntitle: Python Convert Options\ndescription: Options for the Python convert function\n---\n\n{/* AUTO-GENERATED FROM opti"
},
{
"path": "content/docs/accessibility-compliance.mdx",
"chars": 4871,
"preview": "---\ntitle: PDF Accessibility Compliance Guide\ndescription: Navigate EAA, ADA, Section 508, and PDF/UA requirements with "
},
{
"path": "content/docs/accessibility-glossary.mdx",
"chars": 7802,
"preview": "---\ntitle: PDF Accessibility Glossary\ndescription: Key terms and concepts for PDF accessibility, Tagged PDF, and PDF/UA "
},
{
"path": "content/docs/ai-safety.mdx",
"chars": 7141,
"preview": "---\ntitle: AI Safety\ndescription: How OpenDataLoader PDF defends against prompt injection hiding inside documents\n---\n\nL"
},
{
"path": "content/docs/benchmark/index.mdx",
"chars": 2234,
"preview": "---\ntitle: Benchmark Overview\ndescription: Benchmarks for OpenDataLoader PDF\n---\n\n## About the Benchmark Project\n\nPDF do"
},
{
"path": "content/docs/benchmark/meta.json",
"chars": 118,
"preview": "{\n \"title\": \"Benchmark Overview\",\n \"description\": \"The documentation\",\n \"pages\": [\"nid\", \"teds\", \"mhs\", \"speed\"]\n}\n"
},
{
"path": "content/docs/benchmark/mhs.mdx",
"chars": 2144,
"preview": "---\ntitle: Heading Levels (MHS)\ndescription: Measures whether document structure is preserved\n---\n\n## Why Heading Struct"
},
{
"path": "content/docs/benchmark/nid.mdx",
"chars": 1879,
"preview": "---\ntitle: Reading Order (NID)\ndescription: Measures whether text is extracted in the correct sequence\n---\n\n## Why Readi"
},
{
"path": "content/docs/benchmark/speed.mdx",
"chars": 1910,
"preview": "---\ntitle: Extraction Speed\ndescription: Measures processing speed per document\n---\n\n## Why Speed Matters\n\nProcessing ti"
},
{
"path": "content/docs/benchmark/teds.mdx",
"chars": 2106,
"preview": "---\ntitle: Table Structure (TEDS)\ndescription: Measures whether tables are accurately reconstructed\n---\n\n## Why Table Ex"
},
{
"path": "content/docs/cli-options-reference.mdx",
"chars": 6378,
"preview": "---\ntitle: CLI Options Reference\ndescription: Complete reference for all CLI options\n---\n\n{/* AUTO-GENERATED FROM option"
},
{
"path": "content/docs/community.mdx",
"chars": 332,
"preview": "---\ntitle: Support channels\ndescription: Discussion and issue reporting\n---\n\n- [GitHub Discussions](https://github.com/o"
},
{
"path": "content/docs/contributing.mdx",
"chars": 569,
"preview": "---\ntitle: Contributing\ndescription: Contribution guidelines\n---\n\nWe believe great software is built together. Start wit"
},
{
"path": "content/docs/development-workflow.mdx",
"chars": 6573,
"preview": "---\ntitle: Development Workflow\ndescription: Build from source, run tests, and contribute to OpenDataLoader PDF. Prerequ"
},
{
"path": "content/docs/faq.mdx",
"chars": 12782,
"preview": "---\ntitle: Frequently Asked Questions\ndescription: Common questions about OpenDataLoader PDF for RAG, LLM, and document "
},
{
"path": "content/docs/hybrid-mode.mdx",
"chars": 11326,
"preview": "---\ntitle: Hybrid Mode\ndescription: Route complex PDF pages to AI backends for OCR, formula extraction, and chart descri"
},
{
"path": "content/docs/index.mdx",
"chars": 3280,
"preview": "---\ntitle: OpenDataLoader PDF\ndescription: PDF to Markdown & JSON for RAG — Fast, Local, No GPU Required\n---\n\nOpenDataLo"
},
{
"path": "content/docs/json-schema.mdx",
"chars": 6989,
"preview": "---\ntitle: JSON Schema\ndescription: Understand the layout structure emitted by OpenDataLoader PDF\n---\n\n{/* AUTO-GENERATE"
},
{
"path": "content/docs/license.mdx",
"chars": 2909,
"preview": "---\ntitle: License\ndescription: License information for OpenDataLoader PDF\n---\n\nOpenDataLoader PDF is released under the"
},
{
"path": "content/docs/meta.json",
"chars": 809,
"preview": "{\n \"title\": \"docs\",\n \"description\": \"The documentation\",\n \"root\": true,\n \"pages\": [\n \"---Overview---\",\n \"index"
},
{
"path": "content/docs/quick-start-java.mdx",
"chars": 2992,
"preview": "---\ntitle: Quick Start with Java\ndescription: Integrate OpenDataLoader PDF as a JVM dependency or CLI\n---\n\nUse the core "
},
{
"path": "content/docs/quick-start-nodejs.mdx",
"chars": 2030,
"preview": "---\ntitle: Quick Start with Node.js\ndescription: Install @opendataloader/pdf and convert PDF files to Markdown or JSON u"
},
{
"path": "content/docs/quick-start-python.mdx",
"chars": 2788,
"preview": "---\ntitle: Quick Start with Python\ndescription: Install opendataloader-pdf and extract text, tables, and headings from P"
},
{
"path": "content/docs/rag-integration.mdx",
"chars": 12882,
"preview": "---\ntitle: RAG Integration Guide\ndescription: How to use OpenDataLoader PDF in Retrieval-Augmented Generation pipelines\n"
},
{
"path": "content/docs/reading-order.mdx",
"chars": 4687,
"preview": "---\ntitle: Reading Order & XY-Cut++\ndescription: How OpenDataLoader PDF handles multi-column layouts and preserves corre"
},
{
"path": "content/docs/tagged-pdf-collaboration.mdx",
"chars": 5636,
"preview": "---\ntitle: Tagged PDF Collaboration\ndescription: Partnering with PDF Association, Dual Lab, and veraPDF for Tagged PDF a"
},
{
"path": "content/docs/tagged-pdf-rag.mdx",
"chars": 7245,
"preview": "---\ntitle: Tagged PDF for RAG Pipelines\ndescription: Leverage PDF structure tags for higher-quality AI data extraction i"
},
{
"path": "content/docs/tagged-pdf.mdx",
"chars": 3318,
"preview": "---\ntitle: Tagged PDF\ndescription: Using native PDF structure tags for accurate AI data extraction and accessibility com"
},
{
"path": "content/docs/upcoming-roadmap.mdx",
"chars": 1856,
"preview": "---\ntitle: Roadmap\ndescription: Upcoming features and development priorities\n---\n\n## Coming Soon\n\n### Q2 2026\n\n| Feature"
},
{
"path": "content/docs/whats-new-v2.mdx",
"chars": 5337,
"preview": "---\ntitle: \"What's New in v2.0\"\ndescription: \"OpenDataLoader PDF v2.0 release highlights: PDF to Markdown for RAG at 100"
},
{
"path": "docs/hybrid/docling-speed-optimization-plan.md",
"chars": 10921,
"preview": "# Docling Speed Optimization Plan\n\n## Progress Tracker\n\n| Task | Status | Completed | Result |\n|------|--------|--------"
},
{
"path": "docs/hybrid/experiments/chunking_strategy/conclusion.json",
"chars": 1008,
"preview": "{\n \"conclusion\": \"Optimized ranges (consecutive page merging) is always the best strategy\",\n \"recommendation\": \"Merge "
},
{
"path": "docs/hybrid/experiments/chunking_strategy/docling_benchmark_report.json",
"chars": 14738,
"preview": "{\n \"metadata\": {\n \"pdf_file\": \"1901.03003.pdf\",\n \"total_pages\": 15,\n \"warmup_runs\": 1,\n \"measure_runs\": 3,\n"
},
{
"path": "docs/hybrid/experiments/chunking_strategy/docling_page_range_benchmark.py",
"chars": 8270,
"preview": "#!/usr/bin/env python3\n\"\"\"\nDocling Page Range Benchmark\n\n페이지 범위별 변환 성능 비교:\n- 25%, 50%, 75%, 100% 페이지 시나리오\n- 각 시나리오별 최적 청"
},
{
"path": "docs/hybrid/experiments/speed/baseline_results.json",
"chars": 28754,
"preview": "{\n \"approach\": \"baseline\",\n \"description\": \"docling-serve HTTP API\",\n \"timestamp\": \"2026-01-03 14:23:41\",\n \"config\":"
},
{
"path": "docs/hybrid/experiments/speed/fastapi_results.json",
"chars": 31974,
"preview": "{\n \"approach\": \"fastapi\",\n \"description\": \"FastAPI server with docling SDK singleton\",\n \"timestamp\": \"2026-01-03 14:2"
},
{
"path": "docs/hybrid/experiments/speed/speed-experiment-2026-01-03.md",
"chars": 1803,
"preview": "# Docling Speed Experiment Results\n\n**Date**: 2026-01-03 14:31:43\n\n## Summary\n\n| Approach | Description | Avg (s/doc) | "
},
{
"path": "docs/hybrid/experiments/speed/subprocess_results.json",
"chars": 470298,
"preview": "{\n \"approach\": \"subprocess\",\n \"description\": \"Persistent Python subprocess with docling SDK\",\n \"timestamp\": \"2026-01-"
},
{
"path": "docs/hybrid/experiments/triage/triage-experiments.md",
"chars": 12250,
"preview": "---\nname: triage-lab\ndescription: Triage logic experiment records and optimization history\n---\n\n# Triage Lab - Experimen"
},
{
"path": "docs/hybrid/hybrid-mode-design.md",
"chars": 4650,
"preview": "# Hybrid PDF Processing System - Design Document\n\n## Overview\n\nHybrid PDF processing system combining Java heuristics + "
},
{
"path": "docs/hybrid/hybrid-mode-tasks.md",
"chars": 38198,
"preview": "# Hybrid Mode Implementation Tasks\n\nEach task is independently executable. A new Claude Code session can reference this "
},
{
"path": "docs/hybrid/research/comparison-summary.md",
"chars": 2228,
"preview": "# Docling vs OpenDataLoader Output Comparison\n\n## Test Document\n- File: `01030000000045.pdf` (1 page with table)\n\n## Ele"
},
{
"path": "docs/hybrid/research/docling-openapi.json",
"chars": 148717,
"preview": "{\"openapi\":\"3.1.0\",\"info\":{\"title\":\"Docling Serve\",\"version\":\"1.9.0\"},\"paths\":{\"/openapi-3.0.json\":{\"get\":{\"summary\":\"Op"
},
{
"path": "docs/hybrid/research/docling-sample-response-lorem.json",
"chars": 120512,
"preview": "{\"document\":{\"filename\":\"lorem.pdf\",\"md_content\":\"## Lorem Ipsum\\n\\nLorem ipsum dolor sit amet, consectetur adipi"
},
{
"path": "docs/hybrid/research/docling-sample-response.json",
"chars": 231790,
"preview": "{\"document\":{\"filename\":\"01030000000045.pdf\",\"md_content\":\"election integrity. The registration of local election observ"
},
{
"path": "docs/hybrid/research/documents-with-tables.txt",
"chars": 798,
"preview": "01030000000045.pdf\n01030000000046.pdf\n01030000000047.pdf\n01030000000051.pdf\n01030000000052.pdf\n01030000000053.pdf\n010300"
},
{
"path": "docs/hybrid/research/iobject-structure.md",
"chars": 2458,
"preview": "# IObject Class Structure\n\n## Overview\nIObject is imported from `org.verapdf.wcag.algorithms.entities.IObject` (external"
},
{
"path": "docs/hybrid/research/opendataloader-sample-response.json",
"chars": 11858,
"preview": "{\n \"file name\" : \"01030000000045.pdf\",\n \"number of pages\" : 1,\n \"author\" : null,\n \"title\" : null,\n \"creation date\" "
},
{
"path": "docs/hybrid/research/opendataloader-sample-response.md",
"chars": 780,
"preview": "Civil Society Engagement\n\nelection integrity. The registration of local election observers runs until 25 May, and the NE"
},
{
"path": "docs/superpowers/plans/2026-03-16-cid-font-detection.md",
"chars": 22722,
"preview": "# CID Font Extraction Failure Detection — Implementation Plan\n\n> **For agentic workers:** REQUIRED: Use superpowers:suba"
},
{
"path": "docs/superpowers/specs/2026-03-16-cid-font-detection-design.md",
"chars": 7334,
"preview": "# CID Font Extraction Failure Detection\n\nIssue: [#286](https://github.com/opendataloader-project/opendataloader-pdf/issu"
},
{
"path": "examples/python/batch/README.md",
"chars": 1333,
"preview": "# Batch Processing Example\n\nDemonstrates processing multiple PDFs in a single invocation to avoid repeated Java JVM star"
},
{
"path": "examples/python/batch/batch_processing.py",
"chars": 3462,
"preview": "#!/usr/bin/env python3\n\"\"\"\nBatch Processing Example\n\nDemonstrates processing multiple PDFs in a single invocation to avo"
},
{
"path": "examples/python/batch/requirements.txt",
"chars": 50,
"preview": "# Requires Python 3.10+\nopendataloader-pdf>=1.4.0\n"
},
{
"path": "examples/python/rag/README.md",
"chars": 2424,
"preview": "# RAG Examples for OpenDataLoader PDF\n\nWorking examples demonstrating how to use OpenDataLoader PDF in RAG (Retrieval-Au"
},
{
"path": "examples/python/rag/basic_chunking.py",
"chars": 7440,
"preview": "#!/usr/bin/env python3\n\"\"\"\nBasic RAG Chunking Example - No External Dependencies\n\nDemonstrates PDF-to-chunks conversion "
},
{
"path": "examples/python/rag/langchain_example.py",
"chars": 2603,
"preview": "#!/usr/bin/env python3\n\"\"\"\nLangChain Integration Example\n\nDemonstrates using the official langchain-opendataloader-pdf p"
},
{
"path": "examples/python/rag/requirements.txt",
"chars": 94,
"preview": "opendataloader-pdf>=1.4.0\nlangchain-opendataloader-pdf>=0.1.0\nlangchain-text-splitters>=0.2.0\n"
},
{
"path": "java/.run/OpenDataLoaderCli.run.xml",
"chars": 628,
"preview": "<component name=\"ProjectRunConfigurationManager\">\n <configuration default=\"false\" name=\"OpenDataLoaderCli\" type=\"Applic"
},
{
"path": "java/checkstyle.xml",
"chars": 481,
"preview": "<?xml version=\"1.0\"?>\n<!DOCTYPE module PUBLIC \"-//Checkstyle//DTD Checkstyle Configuration 1.3//EN\" \"https://checkstyle."
},
{
"path": "java/opendataloader-pdf-cli/pom.xml",
"chars": 6455,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!--\n\n Copyright 2025 Hancom Inc.\n\n Licensed under the Apache License, Vers"
},
{
"path": "java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIMain.java",
"chars": 5638,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java",
"chars": 30606,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-cli/src/test/java/org/opendataloader/pdf/cli/CLIMainTest.java",
"chars": 3957,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-cli/src/test/java/org/opendataloader/pdf/cli/CLIOptionsContentSafetyTest.java",
"chars": 4543,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-cli/src/test/java/org/opendataloader/pdf/cli/CLIOptionsTest.java",
"chars": 16715,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/pom.xml",
"chars": 8039,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!--\n\n Copyright 2025-2026 Hancom Inc.\n\n Licensed under the Apache License,"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java",
"chars": 28953,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/FilterConfig.java",
"chars": 6275,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/OpenDataLoaderPDF.java",
"chars": 1887,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/containers/StaticLayoutContainers.java",
"chars": 6017,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/entities/SemanticFormula.java",
"chars": 1723,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/entities/SemanticPicture.java",
"chars": 2761,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/html/HtmlGenerator.java",
"chars": 19929,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/html/HtmlGeneratorFactory.java",
"chars": 1272,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/html/HtmlSyntax.java",
"chars": 3480,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/DoclingFastServerClient.java",
"chars": 11784,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/DoclingSchemaTransformer.java",
"chars": 22659,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HancomClient.java",
"chars": 11075,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HancomSchemaTransformer.java",
"chars": 20202,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HybridClient.java",
"chars": 12326,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HybridClientFactory.java",
"chars": 6924,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HybridConfig.java",
"chars": 6316,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HybridSchemaTransformer.java",
"chars": 2786,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/TriageLogger.java",
"chars": 7829,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/TriageProcessor.java",
"chars": 44827,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/JsonName.java",
"chars": 3514,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/JsonWriter.java",
"chars": 4453,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/ObjectMapperHolder.java",
"chars": 4795,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/CaptionSerializer.java",
"chars": 1862,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/DoubleSerializer.java",
"chars": 1941,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/FormulaSerializer.java",
"chars": 1791,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/HeaderFooterSerializer.java",
"chars": 2172,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/HeadingSerializer.java",
"chars": 1839,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/ImageSerializer.java",
"chars": 2722,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/LineChunkSerializer.java",
"chars": 1450,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/ListItemSerializer.java",
"chars": 2273,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/ListSerializer.java",
"chars": 2174,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/ParagraphSerializer.java",
"chars": 1549,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/PictureSerializer.java",
"chars": 3060,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/SemanticTextNodeSerializer.java",
"chars": 1605,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/SerializerUtil.java",
"chars": 2580,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/TableCellSerializer.java",
"chars": 2208,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/TableRowSerializer.java",
"chars": 2078,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/TableSerializer.java",
"chars": 2825,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/TextChunkSerializer.java",
"chars": 1530,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/json/serializers/TextLineSerializer.java",
"chars": 1521,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownGenerator.java",
"chars": 15628,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownGeneratorFactory.java",
"chars": 1122,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownHTMLGenerator.java",
"chars": 3786,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownSyntax.java",
"chars": 2646,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/pdf/PDFLayer.java",
"chars": 1082,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/pdf/PDFWriter.java",
"chars": 15564,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/AbstractTableProcessor.java",
"chars": 5572,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/CaptionProcessor.java",
"chars": 6785,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ClusterTableProcessor.java",
"chars": 4904,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ContentFilterProcessor.java",
"chars": 7150,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java",
"chars": 20698,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HeaderFooterProcessor.java",
"chars": 16299,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HeadingProcessor.java",
"chars": 10902,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HiddenTextProcessor.java",
"chars": 2614,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HybridDocumentProcessor.java",
"chars": 24185,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/LevelProcessor.java",
"chars": 6614,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ListProcessor.java",
"chars": 25909,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ParagraphProcessor.java",
"chars": 23744,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/SpecialTableProcessor.java",
"chars": 4410,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/StrikethroughProcessor.java",
"chars": 6539,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TableBorderProcessor.java",
"chars": 13245,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TableStructureNormalizer.java",
"chars": 21941,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TaggedDocumentProcessor.java",
"chars": 17177,
"preview": "package org.opendataloader.pdf.processors;\n\nimport org.opendataloader.pdf.api.Config;\nimport org.verapdf.gf.model.impl.s"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TextLineProcessor.java",
"chars": 5843,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TextProcessor.java",
"chars": 7860,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/readingorder/XYCutPlusPlusSorter.java",
"chars": 24114,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/text/TextGenerator.java",
"chars": 9616,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/Base64ImageUtils.java",
"chars": 3067,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/BulletedParagraphUtils.java",
"chars": 6642,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ContentSanitizer.java",
"chars": 13479,
"preview": "package org.opendataloader.pdf.utils;\n\nimport org.verapdf.wcag.algorithms.entities.IObject;\nimport org.verapdf.wcag.algo"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java",
"chars": 6394,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ModeWeightStatistics.java",
"chars": 2142,
"preview": "package org.opendataloader.pdf.utils;\n\nimport java.util.ArrayList;\nimport java.util.HashMap;\nimport java.util.List;\nimpo"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/SanitizationRule.java",
"chars": 462,
"preview": "package org.opendataloader.pdf.utils;\n\nimport java.util.regex.Pattern;\n\npublic class SanitizationRule {\n private fina"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/TextNodeStatistics.java",
"chars": 1862,
"preview": "package org.opendataloader.pdf.utils;\n\nimport org.verapdf.wcag.algorithms.entities.SemanticTextNode;\n\npublic class TextN"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/TextNodeStatisticsConfig.java",
"chars": 807,
"preview": "package org.opendataloader.pdf.utils;\n\n/**\n * Configuration holder that exposes the scoring constants used by {@link Tex"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/TextNodeUtils.java",
"chars": 2045,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/levels/LevelInfo.java",
"chars": 4496,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/levels/LineArtBulletParagraphLevelInfo.java",
"chars": 1406,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/levels/ListLevelInfo.java",
"chars": 1605,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/levels/TableLevelInfo.java",
"chars": 921,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/levels/TextBulletParagraphLevelInfo.java",
"chars": 1647,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/EmbedImagesIntegrationTest.java",
"chars": 7833,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/ImageDirIntegrationTest.java",
"chars": 6028,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/IntegrationTest.java",
"chars": 3342,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/Issue336IntegrationTest.java",
"chars": 10051,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/PageSeparatorIntegrationTest.java",
"chars": 9275,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/PagesOptionIntegrationTest.java",
"chars": 14096,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/api/ConfigTest.java",
"chars": 10819,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/api/FilterConfigTest.java",
"chars": 613,
"preview": "package org.opendataloader.pdf.api;\n\nimport org.junit.jupiter.api.Test;\n\nimport static org.junit.jupiter.api.Assertions."
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/containers/StaticLayoutContainersTest.java",
"chars": 3938,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/DoclingFastServerClientTest.java",
"chars": 8710,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/DoclingSchemaTransformerTest.java",
"chars": 29864,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
},
{
"path": "java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/hybrid/HancomClientTest.java",
"chars": 9995,
"preview": "/*\n * Copyright 2025-2026 Hancom Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may n"
}
]
// ... and 87 more files (download for full content)
About this extraction
This page contains the full source code of the opendataloader-project/opendataloader-pdf GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 287 files (2.7 MB), approximately 720.2k tokens, and a symbol index with 1511 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.