Copy disabled (too large)
Download .txt
Showing preview only (10,548K chars total). Download the full file to get everything.
Repository: datalab-to/marker
Branch: master
Commit: d63e3d943b2c
Files: 240
Total size: 10.0 MB
Directory structure:
gitextract_pirlcywv/
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── breaking-bug-report.md
│ │ ├── feature_request.md
│ │ └── output-bug-report.md
│ └── workflows/
│ ├── benchmarks.yml
│ ├── ci.yml
│ ├── cla.yml
│ ├── publish.yml
│ └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CLA.md
├── LICENSE
├── MODEL_LICENSE
├── README.md
├── benchmarks/
│ ├── __init__.py
│ ├── overall/
│ │ ├── __init__.py
│ │ ├── display/
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ └── table.py
│ │ ├── download/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── llamaparse.py
│ │ │ ├── main.py
│ │ │ ├── mathpix.py
│ │ │ └── mistral.py
│ │ ├── elo.py
│ │ ├── methods/
│ │ │ ├── __init__.py
│ │ │ ├── docling.py
│ │ │ ├── gt.py
│ │ │ ├── llamaparse.py
│ │ │ ├── marker.py
│ │ │ ├── mathpix.py
│ │ │ ├── mistral.py
│ │ │ ├── olmocr.py
│ │ │ └── schema.py
│ │ ├── overall.py
│ │ ├── registry.py
│ │ ├── schema.py
│ │ └── scorers/
│ │ ├── __init__.py
│ │ ├── clean.py
│ │ ├── heuristic.py
│ │ ├── llm.py
│ │ └── schema.py
│ ├── table/
│ │ ├── __init__.py
│ │ ├── gemini.py
│ │ ├── inference.py
│ │ ├── scoring.py
│ │ └── table.py
│ ├── throughput/
│ │ ├── __init__.py
│ │ └── main.py
│ └── verify_scores.py
├── chunk_convert.py
├── convert.py
├── convert_single.py
├── data/
│ ├── .gitignore
│ ├── examples/
│ │ ├── json/
│ │ │ ├── multicolcnn.json
│ │ │ ├── switch_trans.json
│ │ │ └── thinkpython.json
│ │ └── markdown/
│ │ ├── multicolcnn/
│ │ │ ├── multicolcnn.md
│ │ │ └── multicolcnn_meta.json
│ │ ├── switch_transformers/
│ │ │ ├── switch_trans.md
│ │ │ └── switch_trans_meta.json
│ │ └── thinkpython/
│ │ ├── thinkpython.md
│ │ └── thinkpython_meta.json
│ └── latex_to_md.sh
├── examples/
│ ├── README.md
│ └── marker_modal_deployment.py
├── extraction_app.py
├── marker/
│ ├── builders/
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── layout.py
│ │ ├── line.py
│ │ ├── ocr.py
│ │ └── structure.py
│ ├── config/
│ │ ├── __init__.py
│ │ ├── crawler.py
│ │ ├── parser.py
│ │ └── printer.py
│ ├── converters/
│ │ ├── __init__.py
│ │ ├── extraction.py
│ │ ├── ocr.py
│ │ ├── pdf.py
│ │ └── table.py
│ ├── extractors/
│ │ ├── __init__.py
│ │ ├── document.py
│ │ └── page.py
│ ├── logger.py
│ ├── models.py
│ ├── output.py
│ ├── processors/
│ │ ├── __init__.py
│ │ ├── blank_page.py
│ │ ├── block_relabel.py
│ │ ├── blockquote.py
│ │ ├── code.py
│ │ ├── debug.py
│ │ ├── document_toc.py
│ │ ├── equation.py
│ │ ├── footnote.py
│ │ ├── ignoretext.py
│ │ ├── line_merge.py
│ │ ├── line_numbers.py
│ │ ├── list.py
│ │ ├── llm/
│ │ │ ├── __init__.py
│ │ │ ├── llm_complex.py
│ │ │ ├── llm_equation.py
│ │ │ ├── llm_form.py
│ │ │ ├── llm_handwriting.py
│ │ │ ├── llm_image_description.py
│ │ │ ├── llm_mathblock.py
│ │ │ ├── llm_meta.py
│ │ │ ├── llm_page_correction.py
│ │ │ ├── llm_sectionheader.py
│ │ │ ├── llm_table.py
│ │ │ └── llm_table_merge.py
│ │ ├── order.py
│ │ ├── page_header.py
│ │ ├── reference.py
│ │ ├── sectionheader.py
│ │ ├── table.py
│ │ ├── text.py
│ │ └── util.py
│ ├── providers/
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── epub.py
│ │ ├── html.py
│ │ ├── image.py
│ │ ├── pdf.py
│ │ ├── powerpoint.py
│ │ ├── registry.py
│ │ ├── spreadsheet.py
│ │ └── utils.py
│ ├── renderers/
│ │ ├── __init__.py
│ │ ├── chunk.py
│ │ ├── extraction.py
│ │ ├── html.py
│ │ ├── json.py
│ │ ├── markdown.py
│ │ └── ocr_json.py
│ ├── schema/
│ │ ├── __init__.py
│ │ ├── blocks/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── basetable.py
│ │ │ ├── caption.py
│ │ │ ├── code.py
│ │ │ ├── complexregion.py
│ │ │ ├── equation.py
│ │ │ ├── figure.py
│ │ │ ├── footnote.py
│ │ │ ├── form.py
│ │ │ ├── handwriting.py
│ │ │ ├── inlinemath.py
│ │ │ ├── listitem.py
│ │ │ ├── pagefooter.py
│ │ │ ├── pageheader.py
│ │ │ ├── picture.py
│ │ │ ├── reference.py
│ │ │ ├── sectionheader.py
│ │ │ ├── table.py
│ │ │ ├── tablecell.py
│ │ │ ├── text.py
│ │ │ └── toc.py
│ │ ├── document.py
│ │ ├── groups/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── figure.py
│ │ │ ├── list.py
│ │ │ ├── page.py
│ │ │ ├── picture.py
│ │ │ └── table.py
│ │ ├── polygon.py
│ │ ├── registry.py
│ │ └── text/
│ │ ├── __init__.py
│ │ ├── char.py
│ │ ├── line.py
│ │ └── span.py
│ ├── scripts/
│ │ ├── __init__.py
│ │ ├── chunk_convert.py
│ │ ├── chunk_convert.sh
│ │ ├── common.py
│ │ ├── convert.py
│ │ ├── convert_single.py
│ │ ├── extraction_app.py
│ │ ├── file_to_s3.py
│ │ ├── run_streamlit_app.py
│ │ ├── server.py
│ │ └── streamlit_app.py
│ ├── services/
│ │ ├── __init__.py
│ │ ├── azure_openai.py
│ │ ├── claude.py
│ │ ├── gemini.py
│ │ ├── ollama.py
│ │ ├── openai.py
│ │ └── vertex.py
│ ├── settings.py
│ ├── util.py
│ └── utils/
│ ├── __init__.py
│ ├── batch.py
│ ├── gpu.py
│ └── image.py
├── marker_app.py
├── marker_server.py
├── pyproject.toml
├── pytest.ini
├── signatures/
│ └── version1/
│ └── cla.json
├── static/
│ └── fonts/
│ └── .gitignore
└── tests/
├── builders/
│ ├── test_blank_page.py
│ ├── test_document_builder.py
│ ├── test_garbled_pdf.py
│ ├── test_layout_replace.py
│ ├── test_ocr_builder.py
│ ├── test_ocr_pipeline.py
│ ├── test_overriding.py
│ ├── test_pdf_links.py
│ ├── test_rotated_bboxes.py
│ ├── test_strip_existing_ocr.py
│ └── test_structure.py
├── config/
│ └── test_config.py
├── conftest.py
├── converters/
│ ├── test_extraction_converter.py
│ ├── test_ocr_converter.py
│ ├── test_pdf_converter.py
│ └── test_table_converter.py
├── processors/
│ ├── test_document_toc_processor.py
│ ├── test_equation_processor.py
│ ├── test_footnote_processor.py
│ ├── test_ignoretext.py
│ ├── test_llm_processors.py
│ ├── test_table_merge.py
│ └── test_table_processor.py
├── providers/
│ ├── test_document_providers.py
│ ├── test_image_provider.py
│ └── test_pdf_provider.py
├── renderers/
│ ├── test_chunk_renderer.py
│ ├── test_extract_images.py
│ ├── test_html_renderer.py
│ ├── test_json_renderer.py
│ └── test_markdown_renderer.py
├── schema/
│ └── groups/
│ └── test_list_grouping.py
├── services/
│ └── test_service_init.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/ISSUE_TEMPLATE/breaking-bug-report.md
================================================
---
name: Breaking bug report
about: Create a report about a breaking bug
title: "[BUG: Breaking]"
labels: 'bug: breaking'
assignees: ''
---
## 🧨 Describe the Bug
A clear and concise description of the breaking issue (e.g., a crash, OOM, or an exception).
## 📄 Input Document
Attach the PDF or input file that triggered the error.
## 📤 Output Trace / Stack Trace
Paste the **complete** stack trace or error output, if available.
<details>
<summary>Click to expand</summary>
```
Paste stack trace here
```
</details>
## ⚙️ Environment
Please fill in all relevant details:
- **Marker version**:
- **Surya version**:
- **Python version**:
- **PyTorch version**:
- **Transformers version**:
- **Operating System** (incl. container info if relevant):
## ✅ Expected Behavior
What did you expect Marker to do?
## 📟 Command or Code Used
Paste the **exact bash command** or **Python code** you used to run Marker:
<details>
<summary>Click to expand</summary>
```bash
# or Python code block
your_command_here --with-flags
```
</details>
## 📎 Additional Context
Any other context that might help us debug this (e.g., CLI options, working directory, runtime settings).
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: "[FEAT]"
labels: enhancement
assignees: ''
---
## ✨ Is your feature request related to a problem?
A clear and concise description of what the problem is.
## 💡 Describe the Solution You'd Like
A concise description of what you want to happen or how you envision it working.
## 📋 Alternatives Considered
Any alternative solutions or workarounds you've tried.
## 🧩 Additional Context
Any additional context, references, or related issues.
================================================
FILE: .github/ISSUE_TEMPLATE/output-bug-report.md
================================================
---
name: Output bug report
about: Create a report about poor output quality
title: "[BUG: Output]"
labels: 'bug: output'
assignees: ''
---
## 📝 Describe the Output Issue
A clear and concise description of the incorrect or unexpected output.
## 📄 Input Document
Attach the PDF or input file used.
## 📤 Current Output
Paste the Markdown or HTML that Marker generated:
````markdown
Paste output here
````
## ✅ Expected Output
Describe or paste what you expected Marker to generate.
## ⚙️ Environment
Please fill in all relevant details:
* **Marker version**:
* **Surya version**:
* **Python version**:
* **PyTorch version**:
* **Transformers version**:
* **Operating System**:
## 📟 Command or Code Used
Paste the **exact bash command** or **Python code** you used to run Marker:
<details>
<summary>Click to expand</summary>
```bash
# or Python code block
your_command_here --with-flags
```
</details>
## 📎 Additional Context
Any other relevant info, configs, or assumptions.
================================================
FILE: .github/workflows/benchmarks.yml
================================================
name: Integration test
on: [push]
env:
PYTHONIOENCODING: "utf-8"
jobs:
benchmark:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [t4_gpu, ubuntu-latest]
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install apt dependencies
run: |
sudo apt-get update
sudo apt-get install -y pandoc
- name: Install python dependencies
run: |
pip install poetry
poetry install --extras "full"
- name: Run benchmark test
run: |
poetry run python benchmarks/overall/overall.py --max_rows 5
poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/result.json --type marker
- name: Run table benchmark
run: |
poetry run python benchmarks/table/table.py --max_rows 5
poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table
================================================
FILE: .github/workflows/ci.yml
================================================
name: CI tests
on: [push]
jobs:
tests:
runs-on: t4_gpu
steps:
- uses: actions/checkout@v3
- name: Install apt requirements
run: |
sudo apt-get update
sudo apt-get install -y libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 libgdk-pixbuf2.0-0 libcairo2 libffi-dev shared-mime-info
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install python dependencies
run: |
pip install poetry
poetry install --extras "full"
- name: Run tests
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: poetry run pytest
================================================
FILE: .github/workflows/cla.yml
================================================
name: "Marker CLA Assistant"
on:
issue_comment:
types: [created]
pull_request_target:
types: [opened,closed,synchronize]
# explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings
permissions:
actions: write
contents: write
pull-requests: write
statuses: write
jobs:
CLAAssistant:
runs-on: ubuntu-latest
steps:
- name: "Marker CLA Assistant"
if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target'
uses: contributor-assistant/github-action@v2.3.0
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# the token below should have repo scope and must be manually added by you to the repository's secrets
# This token is required only if you have configured to store the signatures in a remote repository/organization
PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
with:
path-to-signatures: 'signatures/version1/cla.json'
path-to-document: 'https://github.com/VikParuchuri/marker/blob/master/CLA.md'
# branch should not be protected
branch: 'master'
allowlist: VikParuchuri,Sandy
================================================
FILE: .github/workflows/publish.yml
================================================
name: Python package
on:
push:
tags:
- "v*.*.*"
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install python dependencies
run: |
pip install poetry
poetry install --extras "full"
- name: Build package
run: |
poetry build
- name: Extract version from pyproject.toml
id: version
run: |
VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['tool']['poetry']['version'])")
echo "version=v$VERSION" >> $GITHUB_OUTPUT
- name: Publish package
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: |
poetry config pypi-token.pypi "$PYPI_TOKEN"
poetry publish
================================================
FILE: .github/workflows/scripts.yml
================================================
name: Test CLI scripts
on: [push]
jobs:
tests:
runs-on: t4_gpu
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install python dependencies
run: |
pip install poetry
poetry install --extras "full"
- name: Download benchmark data
run: |
wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
unzip -o benchmark_data.zip
- name: Test single script
run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0
- name: Test convert script
run: poetry run marker benchmark_data/pdfs --max_files 1 --page_range 0
- name: Test convert script multiple workers
run: poetry run marker benchmark_data/pdfs --max_files 2 --page_range 0-5
- name: Test llm option
run: |
poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 --use_llm > output.txt || echo "Command failed but continuing"
if ! grep -q "UserWarning" output.txt; then
echo "Success: No UserWarning found"
exit 0
else
echo "Error: UserWarning found in output"
exit 1
fi
================================================
FILE: .gitignore
================================================
private.py
.DS_Store
local.env
experiments
test_data
training
wandb
*.dat
report.json
benchmark_data
debug_data
temp.md
temp
conversion_results
uploads
/cache
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.vscode/
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.9.10
hooks:
# Run the linter.
- id: ruff
types_or: [ python, pyi ]
args: [ --fix ]
# Run the formatter.
- id: ruff-format
types_or: [ python, pyi ]
================================================
FILE: CLA.md
================================================
Marker Contributor Agreement
This Marker Contributor Agreement ("MCA") applies to any contribution that you make to any product or project managed by us (the "project"), and sets out the intellectual property rights you grant to us in the contributed materials. The term "us" shall mean Endless Labs, Inc. The term "you" shall mean the person or entity identified below.
If you agree to be bound by these terms, sign by writing "I have read the CLA Document and I hereby sign the CLA" in response to the CLA bot GitHub comment. Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.
1. The term 'contribution' or 'contributed materials' means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:
- you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements, including dual-license structures for commercial customers;
- you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
- you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
- you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
- you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:
- make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
- at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.
If you or your affiliates institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the contribution or any project it was submitted to constitutes direct or contributory patent infringement, then any patent licenses granted to you under this agreement for that contribution shall terminate as of the date such litigation is filed.
4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms. Any contribution we make available under any license will also be made available under a suitable FSF (Free Software Foundation) or OSI (Open Source Initiative) approved license.
5. You covenant, represent, warrant and agree that:
- each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this MCA;
- to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
- each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws.
You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. Endless Labs, Inc. may publicly disclose your participation in the project, including the fact that you have signed the MCA.
6. This MCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.
================================================
FILE: LICENSE
================================================
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
Marker
Copyright (C) 2024 Endless Labs, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
Marker Copyright (C) 2024 Endless Labs, Inc.
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.
================================================
FILE: MODEL_LICENSE
================================================
AI PUBS OPEN RAIL-M LICENSE (MODIFIED)
Version 0.1, March 2, 2023 (Modified)
http://licenses.ai/
PLEASE READ THESE TERMS CAREFULLY BEFORE USING THE MODEL OR A DERIVATIVE WORKS OF THE MODEL MADE AVAILABLE IN CONNECTION WITH THESE TERMS. BY DOWNLOADING, REPRODUCING, DISTRIBUTING OR USING THE MODEL OR A DERIVATIVE WORK OF THE MODEL IN ANY MANNER, YOU (“YOU”) AGREE TO BE BOUND BY THESE TERMS (THE “AGREEMENT”) TO THE EXCLUSION OF ALL OTHER TERMS. YOU REPRESENT AND WARRANT THAT YOU HAVE THE AUTHORITY TO ENTER INTO THIS AGREEMENT; IF YOU ARE ENTERING INTO THIS AGREEMENT ON BEHALF OF AN ORGANIZATION OR ENTITY, REFERENCES TO AND “YOU” IN THIS AGREEMENT, REFER TO THAT ORGANIZATION OR ENTITY. IF YOU DO NOT AGREE TO ALL OF THE FOLLOWING, YOU MAY NOT DOWNLOAD, REPRODUCE, DISTRIBUTE OR USE THE MODEL OR A DERIVATIVE WORK OF THE MODEL IN ANY MANNER.
Section I: PREAMBLE
This OpenRAIL-M License, as modified, is generally applicable to any machine-learning Model.
The “Open” nomenclature indicates that the licensed Model is be freely accessible to downstream and other users. The “RAIL” nomenclature indicates that there are use restrictions prohibiting the use of the Model. These restrictions are intended to avoid potential misuse. This License specifies that the use restrictions in the original License must apply to such derivatives.
NOW THEREFORE, You and Licensor agree as follows:
1. Definitions
(a) “Complementary Material” means the applicable source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, and any related information, if any. Complementary Material is not licensed under this License.
(b) "Contribution" means any work, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the rights owner or by an individual or legal entity authorized to submit on behalf of the rights owner. For the purposes of this definition, “submitted” means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the rights owner as "Not a Contribution."
(c) "Contributor" means Licensor and any individual or legal entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.
(d) “Data” means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
(e) “Derivatives of the Model” means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.
(f) “Distribution” means any transmission, reproduction, publication, distribution, or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means, including but not limited to API-based or web access.
(g) “Harm” includes but is not limited to physical, mental, psychological, financial and reputational damage, pain, or loss
(h) "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
(i) “Licensor” means the rights owner or entity authorized by the rights owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.
(j) “Model” means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.
(k) “Output” means the results of operating a Model as embodied in informational content resulting therefrom.
(l) “Third Parties” means individuals or legal entities that are not under common control with Licensor or You.
(m) "You" (or "Your") means an individual or legal entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application, including but not limited to a chatbot, translator, or image generator.
Section II: INTELLECTUAL PROPERTY RIGHTS
Both copyright and patent grants may apply to the Model and Derivatives of the Model. The Model and Derivatives of the Model are subject to additional terms as described in Section III, which shall govern the use of the Model and Derivatives of the Model even in the event Section II is held unenforceable.
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Model and Derivatives of the Model.
3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and/or Derivatives of the Model where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model or Derivatives of the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model or Derivative of the Model and/or a Contribution incorporated within the Model or Derivative of the Model constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Derivative of the Model shall terminate as of the date such litigation is asserted or filed.
Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
4. Distribution and Redistribution. You may host the Model or Derivatives of the Model for remote access by Third Parties, including but not limited to software-as-a-service, reproduce, or Distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the conditions in this Section III:
(a) Use-based restrictions in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (for example, a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model and Derivatives of the Model are subject to paragraph 5;
(b) You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License;
(c) You must cause any modified files to carry prominent notices stating that You changed the files; and
(d) You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model or Derivatives of the Model.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions, consistent with paragraph 4.a., for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Accordingly, You cannot use the Model or the Derivatives of the Model in violation of such restrictions. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, fine-tuning, updating, running, training, evaluating and/or re-parametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph 5.
6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are solely responsible for the Output you generate and its subsequent uses. No use of the Output can contravene any provision as stated in the License.
7. Attribution. In connection with any Output, or use of Distribution of any Model or Derivatives of the Model, You agree to give appropriate credit and attribution to Licensor, provide a link to the original Model or Derivatives of the Model, provide a copy of this License, and identify any changes You have made to the Model or Derivatives of the Model (collectively, the “Attribution”). The Attribution must not suggest endorsement by any Licensor.
8. Share-a-Like. As a condition to the license and authorizations herein, You agree to apply this License (to the exclusion of all others) to any and all copies of the Model, Derivatives of the Model, any changes or improvements to the Model or Derivatives of the Model, and to the Output and any derivatives, changes or improvements to or of the Output.
Section IV: OTHER PROVISIONS
9. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or cause modification to the Output resulting from updates to the Model based.
10. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.
11. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model and Derivatives of the Model, and assume any risks associated with Your exercise of permissions under this License.
12. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
13. Accepting Warranty or Additional Liability. While Distributing the Model or Derivatives of the Model, You may choose to charge a fee in exchange for support, warranty, indemnity, or other obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor or Licensor, and only if You agree to indemnify, defend, and hold each Contributor and the Licensor harmless for any liability incurred by, or claims asserted against, such Contributor or Licensor by reason of your accepting any such warranty or additional liability.
14. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
END OF TERMS AND CONDITIONS
Attachment A
USE RESTRICTIONS
As conditions to the Licenses set forth in this Agreement, You agree not to use, reproduce, modify, create or Distribute the Model, Derivatives of the Model, or Output (collectively, “Use”) in any of the following ways:
1. Legal:
(a) In any way that violates any applicable national, federal, state, local or international law or regulation; or
(b) to directly or indirectly infringe or misappropriate any third party intellectual property rights (including those of Licensor or any Contributor)
2. Commercial:
(a) for any purpose if You (your employer, or the entity you are affiliated with) generated more than two million US Dollars ($2,000,000) in gross revenue in the prior year, except where Your Use is limited to personal use or research purposes;
(b) for any purpose if You (your employer, or the entity you are affiliated with) has raised more than two million US dollars ($2,000,000) in total equity or debt funding from any source, except where Your Use is limited to personal use or research purposes; or
(c) for any purpose if You (your employer, or the entity you are affiliated with) provides or otherwise makes available any product or service that competes with any product or service offered by or made available by Licensor or any of its affiliates.
Commercial and broader use licenses may be available from Licensor at the following URL: https://www.datalab.to/
================================================
FILE: README.md
================================================
# Marker
Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately.
- Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
- Formats tables, forms, equations, inline math, links, references, and code blocks
- Extracts and saves images
- Removes headers/footers/other artifacts
- Extensible with your own formatting and logic
- Does structured extraction, given a JSON schema (beta)
- Optionally boost accuracy with LLMs (and your own prompt)
- Works on GPU, CPU, or MPS
For our managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to?utm_source=gh-marker).
## Performance
<img src="data/images/overall.png" width="800px"/>
Marker benchmarks favorably compared to cloud services like Llamaparse and Mathpix, as well as other open source tools.
The above results are from running single PDF pages serially. Marker is significantly faster when running in batch mode, with a projected throughput of 25 pages/second on an H100.
See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.
## Hybrid Mode
For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any gemini or ollama model. By default, it uses `gemini-2.0-flash`. See [below](#llm-services) for details.
Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm:
<img src="data/images/table.png" width="400px"/>
As you can see, the use_llm mode offers higher accuracy than marker or gemini alone.
## Examples
| PDF | File type | Markdown | JSON |
|-----|-----------|------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------|
| [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/thinkpython/thinkpython.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/thinkpython.json) |
| [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/switch_transformers/switch_trans.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/switch_trans.json) |
| [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/multicolcnn/multicolcnn.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/multicolcnn.json) |
# Commercial usage
Our model weights use a modified AI Pubs Open Rail-M license (free for research, personal use, and startups under $2M funding/revenue) and our code is GPL. For broader commercial licensing or to remove GPL requirements, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-marker).
# Hosted API & On-prem
There's a [hosted API](https://www.datalab.to?utm_source=gh-marker) and [painless on-prem solution](https://www.datalab.to/blog/self-serve-on-prem-licensing) for marker - it's free to sign up, and we'll throw in credits for you to test it out.
The API:
- Supports PDF, image, PPT, PPTX, DOC, DOCX, XLS, XLSX, HTML, EPUB files
- Is 1/4th the price of leading cloud-based competitors
- Fast - ~15s for a 250 page PDF
- Supports LLM mode
- High uptime (99.99%)
# Community
[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.
# Installation
You'll need python 3.10+ and [PyTorch](https://pytorch.org/get-started/locally/).
Install with:
```shell
pip install marker-pdf
```
If you want to use marker on documents other than PDFs, you will need to install additional dependencies with:
```shell
pip install marker-pdf[full]
```
# Usage
First, some configuration:
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
- Some PDFs, even digital ones, have bad text in them. Set `--force_ocr` to force OCR on all lines, or `--strip_existing_ocr` to keep all digital text and strip out any existing OCR text.
- If you care about inline math, set `--force_ocr` to convert inline math to LaTeX.
## Interactive App
I've included a streamlit app that lets you interactively try marker with some basic options. Run it with:
```shell
pip install streamlit streamlit-ace
marker_gui
```
## Convert a single file
```shell
marker_single /path/to/file.pdf
```
You can pass in PDFs or images.
Options:
- `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
- `--output_format [markdown|json|html|chunks]`: Specify the format for the output results.
- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
- `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
- `--use_llm`: Uses an LLM to improve accuracy. You will need to configure the LLM backend - see [below](#llm-services).
- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text. This will also format inline math properly.
- `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output.
- `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
- `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.
- `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
- `--debug`: Enable debug mode for additional logging and diagnostic information.
- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
- `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
- `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables.
- `--llm_service`: Which llm service to use if `--use_llm` is passed. This defaults to `marker.services.gemini.GoogleGeminiService`.
- `--help`: see all of the flags that can be passed into marker. (It supports many more options than are listed above.)
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py). If you don't need OCR, marker can work with any language.
## Convert multiple files
```shell
marker /path/to/input/folder
```
- `marker` supports all the same options from `marker_single` above.
- `--workers` is the number of conversion workers to run simultaneously. This is automatically set by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.
## Convert multiple files on multiple GPUs
```shell
NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out
```
- `NUM_DEVICES` is the number of GPUs to use. Should be `2` or greater.
- `NUM_WORKERS` is the number of parallel processes to run on each GPU.
## Use from python
See the `PdfConverter` class in `marker/converters/pdf.py` for additional arguments that can be passed.
```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
converter = PdfConverter(
artifact_dict=create_model_dict(),
)
rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```
`rendered` will be a pydantic basemodel with different properties depending on the output type requested. With markdown output (default), you'll have the properties `markdown`, `metadata`, and `images`. For json output, you'll have `children`, `block_type`, and `metadata`.
### Custom configuration
You can pass configuration using the `ConfigParser`. To see all available options, do `marker_single --help`.
```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
config = {
"output_format": "json",
"ADDITIONAL_KEY": "VALUE"
}
config_parser = ConfigParser(config)
converter = PdfConverter(
config=config_parser.generate_config_dict(),
artifact_dict=create_model_dict(),
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer(),
llm_service=config_parser.get_llm_service()
)
rendered = converter("FILEPATH")
```
### Extract blocks
Each document consists of one or more pages. Pages contain blocks, which can themselves contain other blocks. It's possible to programmatically manipulate these blocks.
Here's an example of extracting all forms from a document:
```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.schema import BlockTypes
converter = PdfConverter(
artifact_dict=create_model_dict(),
)
document = converter.build_document("FILEPATH")
forms = document.contained_blocks((BlockTypes.Form,))
```
Look at the processors for more examples of extracting and manipulating blocks.
## Other converters
You can also use other converters that define different conversion pipelines:
### Extract tables
The `TableConverter` will only convert and extract tables:
```python
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
converter = TableConverter(
artifact_dict=create_model_dict(),
)
rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```
This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table. Set `output_format=json` to also get cell bounding boxes.
You can also run this via the CLI with
```shell
marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json
```
### OCR Only
If you only want to run OCR, you can also do that through the `OCRConverter`. Set `--keep_chars` to keep individual characters and bounding boxes.
```python
from marker.converters.ocr import OCRConverter
from marker.models import create_model_dict
converter = OCRConverter(
artifact_dict=create_model_dict(),
)
rendered = converter("FILEPATH")
```
This takes all the same configuration as the PdfConverter.
You can also run this via the CLI with
```shell
marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
```
### Structured Extraction (beta)
You can run structured extraction via the `ExtractionConverter`. This requires an LLM service to be set up first (see [here](#llm-services) for details). You'll get a JSON output with the extracted values.
```python
from marker.converters.extraction import ExtractionConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from pydantic import BaseModel
class Links(BaseModel):
links: list[str]
schema = Links.model_json_schema()
config_parser = ConfigParser({
"page_schema": schema
})
converter = ExtractionConverter(
artifact_dict=create_model_dict(),
config=config_parser.generate_config_dict(),
llm_service=config_parser.get_llm_service(),
)
rendered = converter("FILEPATH")
```
Rendered will have an `original_markdown` field. If you pass this back in next time you run the converter, as the `existing_markdown` config key, you can skip re-parsing the document.
# Output Formats
## Markdown
Markdown output will include:
- image links (images will be saved in the same folder)
- formatted tables
- embedded LaTeX equations (fenced with `$$`)
- Code is fenced with triple backticks
- Superscripts for footnotes
## HTML
HTML output is similar to markdown output:
- Images are included via `img` tags
- equations are fenced with `<math>` tags
- code is in `pre` tags
## JSON
JSON output will be organized in a tree-like structure, with the leaf nodes being blocks. Examples of leaf nodes are a single list item, a paragraph of text, or an image.
The output will be a list, with each list item representing a page. Each page is considered a block in the internal marker schema. There are different types of blocks to represent different elements.
Pages have the keys:
- `id` - unique id for the block.
- `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"]
- `html` - the HTML for the page. Note that this will have recursive references to children. The `content-ref` tags must be replaced with the child content if you want the full html. You can see an example of this at `marker/output.py:json_to_html`. That function will take in a single block from the json output, and turn it into HTML.
- `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format. (x1,y1) is the top left, and coordinates go clockwise.
- `children` - the child blocks.
The child blocks have two additional keys:
- `section_hierarchy` - indicates the sections that the block is part of. `1` indicates an h1 tag, `2` an h2, and so on.
- `images` - base64 encoded images. The key will be the block id, and the data will be the encoded image.
Note that child blocks of pages can have their own children as well (a tree structure).
```json
{
"id": "/page/10/Page/366",
"block_type": "Page",
"html": "<content-ref src='/page/10/SectionHeader/0'></content-ref><content-ref src='/page/10/SectionHeader/1'></content-ref><content-ref src='/page/10/Text/2'></content-ref><content-ref src='/page/10/Text/3'></content-ref><content-ref src='/page/10/Figure/4'></content-ref><content-ref src='/page/10/SectionHeader/5'></content-ref><content-ref src='/page/10/SectionHeader/6'></content-ref><content-ref src='/page/10/TextInlineMath/7'></content-ref><content-ref src='/page/10/TextInlineMath/8'></content-ref><content-ref src='/page/10/Table/9'></content-ref><content-ref src='/page/10/SectionHeader/10'></content-ref><content-ref src='/page/10/Text/11'></content-ref>",
"polygon": [[0.0, 0.0], [612.0, 0.0], [612.0, 792.0], [0.0, 792.0]],
"children": [
{
"id": "/page/10/SectionHeader/0",
"block_type": "SectionHeader",
"html": "<h1>Supplementary Material for <i>Subspace Adversarial Training</i> </h1>",
"polygon": [
[217.845703125, 80.630859375], [374.73046875, 80.630859375],
[374.73046875, 107.0],
[217.845703125, 107.0]
],
"children": null,
"section_hierarchy": {
"1": "/page/10/SectionHeader/1"
},
"images": {}
},
...
]
}
```
## Chunks
Chunks format is similar to JSON, but flattens everything into a single list instead of a tree. Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it. This enables flexible and easy chunking for RAG.
## Metadata
All output formats will return a metadata dictionary, with the following fields:
```json
{
"table_of_contents": [
{
"title": "Introduction",
"heading_level": 1,
"page_id": 0,
"polygon": [...]
}
], // computed PDF table of contents
"page_stats": [
{
"page_id": 0,
"text_extraction_method": "pdftext",
"block_counts": [("Span", 200), ...]
},
...
]
}
```
# LLM Services
When running with the `--use_llm` flag, you have a choice of services you can use:
- `Gemini` - this will use the Gemini developer API by default. You'll need to pass `--gemini_api_key` to configuration.
- `Google Vertex` - this will use vertex, which can be more reliable. You'll need to pass `--vertex_project_id`. To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`.
- `Ollama` - this will use local models. You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`.
- `Claude` - this will use the anthropic API. You can configure `--claude_api_key`, and `--claude_model_name`. To use it, set `--llm_service=marker.services.claude.ClaudeService`.
- `OpenAI` - this supports any openai-like endpoint. You can configure `--openai_api_key`, `--openai_model`, and `--openai_base_url`. To use it, set `--llm_service=marker.services.openai.OpenAIService`.
- `Azure OpenAI` - this uses the Azure OpenAI service. You can configure `--azure_endpoint`, `--azure_api_key`, and `--deployment_name`. To use it, set `--llm_service=marker.services.azure_openai.AzureOpenAIService`.
These services may have additional optional configuration as well - you can see it by viewing the classes.
# Internals
Marker is easy to extend. The core units of marker are:
- `Providers`, at `marker/providers`. These provide information from a source file, like a PDF.
- `Builders`, at `marker/builders`. These generate the initial document blocks and fill in text, using info from the providers.
- `Processors`, at `marker/processors`. These process specific blocks, for example the table formatter is a processor.
- `Renderers`, at `marker/renderers`. These use the blocks to render output.
- `Schema`, at `marker/schema`. The classes for all the block types.
- `Converters`, at `marker/converters`. They run the whole end to end pipeline.
To customize processing behavior, override the `processors`. To add new output formats, write a new `renderer`. For additional input formats, write a new `provider`.
Processors and renderers can be directly passed into the base `PdfConverter`, so you can specify your own custom processing easily.
## API server
There is a very simple API server you can run like this:
```shell
pip install -U uvicorn fastapi python-multipart
marker_server --port 8001
```
This will start a fastapi server that you can access at `localhost:8001`. You can go to `localhost:8001/docs` to see the endpoint options.
You can send requests like this:
```python
import requests
import json
post_data = {
'filepath': 'FILEPATH',
# Add other params here
}
requests.post("http://localhost:8001/marker", data=json.dumps(post_data)).json()
```
Note that this is not a very robust API, and is only intended for small-scale use. If you want to use this server, but want a more robust conversion option, you can use the hosted [Datalab API](https://www.datalab.to/plans).
# Troubleshooting
There are some settings that you may find useful if things aren't working the way you expect:
- If you have issues with accuracy, try setting `--use_llm` to use an LLM to improve quality. You must set `GOOGLE_API_KEY` to a Gemini API key for this to work.
- Make sure to set `force_ocr` if you see garbled text - this will re-OCR the document.
- `TORCH_DEVICE` - set this to force marker to use a given torch device for inference.
- If you're getting out of memory errors, decrease worker count. You can also try splitting up long PDFs into multiple files.
## Debugging
Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information.
# Benchmarks
## Overall PDF Conversion
We created a [benchmark set](https://huggingface.co/datasets/datalab-to/marker_benchmark) by extracting single PDF pages from common crawl. We scored based on a heuristic that aligns text with ground truth text segments, and an LLM as a judge scoring method.
| Method | Avg Time | Heuristic Score | LLM Score |
|------------|----------|-----------------|-----------|
| marker | 2.83837 | 95.6709 | 4.23916 |
| llamaparse | 23.348 | 84.2442 | 3.97619 |
| mathpix | 6.36223 | 86.4281 | 4.15626 |
| docling | 3.69949 | 86.7073 | 3.70429 |
Benchmarks were run on an H100 for marker and docling - llamaparse and mathpix used their cloud services. We can also look at it by document type:
<img src="data/images/per_doc.png" width="1000px"/>
| Document Type | Marker heuristic | Marker LLM | Llamaparse Heuristic | Llamaparse LLM | Mathpix Heuristic | Mathpix LLM | Docling Heuristic | Docling LLM |
|----------------------|------------------|------------|----------------------|----------------|-------------------|-------------|-------------------|-------------|
| Scientific paper | 96.6737 | 4.34899 | 87.1651 | 3.96421 | 91.2267 | 4.46861 | 92.135 | 3.72422 |
| Book page | 97.1846 | 4.16168 | 90.9532 | 4.07186 | 93.8886 | 4.35329 | 90.0556 | 3.64671 |
| Other | 95.1632 | 4.25076 | 81.1385 | 4.01835 | 79.6231 | 4.00306 | 83.8223 | 3.76147 |
| Form | 88.0147 | 3.84663 | 66.3081 | 3.68712 | 64.7512 | 3.33129 | 68.3857 | 3.40491 |
| Presentation | 95.1562 | 4.13669 | 81.2261 | 4 | 83.6737 | 3.95683 | 84.8405 | 3.86331 |
| Financial document | 95.3697 | 4.39106 | 82.5812 | 4.16111 | 81.3115 | 4.05556 | 86.3882 | 3.8 |
| Letter | 98.4021 | 4.5 | 93.4477 | 4.28125 | 96.0383 | 4.45312 | 92.0952 | 4.09375 |
| Engineering document | 93.9244 | 4.04412 | 77.4854 | 3.72059 | 80.3319 | 3.88235 | 79.6807 | 3.42647 |
| Legal document | 96.689 | 4.27759 | 86.9769 | 3.87584 | 91.601 | 4.20805 | 87.8383 | 3.65552 |
| Newspaper page | 98.8733 | 4.25806 | 84.7492 | 3.90323 | 96.9963 | 4.45161 | 92.6496 | 3.51613 |
| Magazine page | 98.2145 | 4.38776 | 87.2902 | 3.97959 | 93.5934 | 4.16327 | 93.0892 | 4.02041 |
## Throughput
We benchmarked throughput using a [single long PDF](https://www.greenteapress.com/thinkpython/thinkpython.pdf).
| Method | Time per page | Time per document | VRAM used |
|---------|---------------|-------------------|---------- |
| marker | 0.18 | 43.42 | 3.17GB |
The projected throughput is 122 pages per second on an H100 - we can run 22 individual processes given the VRAM used.
## Table Conversion
Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores:
| Method | Avg score | Total tables |
|------------------|-----------|--------------|
| marker | 0.816 | 99 |
| marker w/use_llm | 0.907 | 99 |
| gemini | 0.829 | 99 |
The `--use_llm` flag can significantly improve table recognition performance, as you can see.
We filter out tables that we cannot align with the ground truth, since fintabnet and our layout model have slightly different detection methods (this results in some tables being split/merged).
## Running your own benchmarks
You can benchmark the performance of marker on your machine. Install marker manually with:
```shell
git clone https://github.com/VikParuchuri/marker.git
poetry install
```
### Overall PDF Conversion
Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this:
```shell
python benchmarks/overall/overall.py --methods marker --scores heuristic,llm
```
Options:
- `--use_llm` use an llm to improve the marker results.
- `--max_rows` how many rows to process for the benchmark.
- `--methods` can be `llamaparse`, `mathpix`, `docling`, `marker`. Comma separated.
- `--scores` which scoring functions to use, can be `llm`, `heuristic`. Comma separated.
### Table Conversion
The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
```shell
python benchmarks/table/table.py --max_rows 100
```
Options:
- `--use_llm` uses an llm with marker to improve accuracy.
- `--use_gemini` also benchmarks gemini 2.0 flash.
# How it works
Marker is a pipeline of deep learning models:
- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya))
- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya))
- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya))
- Optionally use an LLM to improve quality
- Combine blocks and postprocess complete text
It only uses models where necessary, which improves speed and accuracy.
# Limitations
PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
- Very complex layouts, with nested tables and forms, may not work
- Forms may not be rendered well
Note: Passing the `--use_llm` and `--force_ocr` flags will mostly solve these issues.
# Usage and Deployment Examples
You can always run `marker` locally, but if you wanted to expose it as an API, we have a few options:
- Our platform API which is powered by `marker` and `surya` and is easy to test out - it's free to sign up, and we'll include credits, [try it out here](https://datalab.to)
- Our painless on-prem solution for commercial use, which you can [read about here](https://www.datalab.to/blog/self-serve-on-prem-licensing) and gives you privacy guarantees with high throughput inference optimizations.
- [Deployment example with Modal](./examples/README.md) that shows you how to deploy and access `marker` through a web endpoint using [`Modal`](https://modal.com). Modal is an AI compute platform that enables developers to deploy and scale models on GPUs in minutes.
================================================
FILE: benchmarks/__init__.py
================================================
================================================
FILE: benchmarks/overall/__init__.py
================================================
================================================
FILE: benchmarks/overall/display/__init__.py
================================================
================================================
FILE: benchmarks/overall/display/dataset.py
================================================
import json
from typing import List
import datasets
from tqdm import tqdm
from benchmarks.overall.registry import METHOD_REGISTRY
from benchmarks.overall.schema import FullResult
def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str], max_rows: int | None = None) -> datasets.Dataset:
    """Assemble a display dataset with one row per benchmark sample.

    Each row carries the sample's identifying fields plus, for every
    non-ground-truth method, its markdown, a rendered image of that markdown,
    and the requested scores.

    Args:
        bench_dataset: The benchmark samples; each must expose ``uuid``,
            ``classification``, ``language`` and ``img``.
        result: Benchmark output; ``result["markdown"]`` and
            ``result["scores"]`` are indexed by sample position, then method.
        score_types: Score names to copy into each row.
        max_rows: Stop once the sample index reaches this value
            (``None`` means no limit).

    Returns:
        A ``datasets.Dataset`` built from the collected rows.
    """
    # Imported here because this module has no top-level PIL import; the
    # original fallback referenced ``PIL.Image`` and failed with NameError.
    from PIL import Image

    rows = []
    for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"):
        # Samples without any conversion output are skipped entirely.
        if idx not in result["markdown"]:
            continue

        if max_rows is not None and idx >= max_rows:
            break

        row = {
            "uuid": sample["uuid"],
            "classification": sample["classification"],
            "language": sample["language"],
            "img": sample["img"],
        }
        for method in result["markdown"][idx]:
            # Ground truth is the reference, not a method to display.
            if method == "gt":
                continue

            method_cls = METHOD_REGISTRY[method]()
            md = result["markdown"][idx][method]
            try:
                method_img = method_cls.render(md)
            except Exception:
                # Rendering can fail (e.g. when the markdown is None); fall
                # back to a blank placeholder so the row stays well-formed.
                method_img = Image.new("RGB", (200, 200))

            row[f"{method}_md"] = md
            row[f"{method}_img"] = method_img

            for score_type in score_types:
                try:
                    row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"]
                except KeyError:
                    row[f"{method}_{score_type}"] = -1.0  # Missing score
                try:
                    row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"])
                except KeyError:
                    row[f"{method}_{score_type}_detail"] = ""  # Missing detail
        rows.append(row)

    return datasets.Dataset.from_list(rows)
================================================
FILE: benchmarks/overall/display/table.py
================================================
from pathlib import Path
from typing import Dict, List
import tabulate
from benchmarks.overall.schema import FullResult
def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
    """Format ``rows`` as a GitHub-style markdown table, save it, and print it.

    The file ``out_path / filename`` receives a ``# title`` heading followed
    by the table; the title and table are also echoed to stdout.
    """
    rendered = tabulate.tabulate(rows, headers=headers, tablefmt="github")
    destination = out_path / filename
    with open(destination, "w", encoding="utf-8") as handle:
        for piece in (f"# {title}\n", rendered):
            handle.write(piece)

    for line in (title, rendered):
        print(line)
def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
    """Write and print the per-document-type, per-block-type and overall tables.

    Three markdown files are produced in ``out_path``: ``document_types.md``,
    ``block_types.md`` and ``overall.md``. Row labels for the first two tables
    come from the entries recorded for ``default_method``.
    """

    def _mean(values):
        # Empty lists average to 0 instead of dividing by zero.
        return sum(values) / max(1, len(values))

    # --- Scores per document type -------------------------------------
    by_doc_type = result["averages_by_type"]
    document_types = list(by_doc_type[default_method][default_score_type].keys())
    doc_columns = [(m, s) for m in methods for s in score_types]
    doc_headers = ["Document Type"] + [f"{m} {s}" for m, s in doc_columns]
    document_rows = [
        [doc_type] + [_mean(by_doc_type[m][s][doc_type]) for m, s in doc_columns]
        for doc_type in document_types
    ]
    write_table("Document Types", document_rows, doc_headers, out_path, "document_types.md")

    # --- Scores per block type ----------------------------------------
    by_block_type = result["averages_by_block_type"]
    block_types = list(by_block_type[default_method][default_score_type].keys())  # all possible blocks
    block_score_types = list(by_block_type[default_method].keys())
    block_columns = [(m, s) for m in methods for s in block_score_types]
    block_headers = ["Block Type"] + [f"{m} {s}" for m, s in block_columns]
    block_rows = [
        [block_type] + [_mean(by_block_type[m][s][block_type]) for m, s in block_columns]
        for block_type in block_types
    ]
    write_table("Block types", block_rows, block_headers, out_path, "block_types.md")

    # --- Overall timing and scores per method -------------------------
    overall_headers = ["Method", "Avg Time"] + score_types
    all_raw_scores = [result["scores"][key] for key in result["scores"]]
    inference_rows = []
    for method in methods:
        method_row = [method, _mean(result["average_times"][method])]
        for score_type in score_types:
            collected = []
            for raw in all_raw_scores:
                try:
                    # Sometimes a few llm scores are missing
                    collected.append(raw[method][score_type]["score"])
                except KeyError:
                    continue
            method_row.append(_mean(collected))
        inference_rows.append(method_row)
    write_table("Overall Results", inference_rows, overall_headers, out_path, "overall.md")

    print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
================================================
FILE: benchmarks/overall/download/__init__.py
================================================
================================================
FILE: benchmarks/overall/download/base.py
================================================
import json
from json import JSONDecodeError
from pathlib import Path
import datasets
from tqdm import tqdm
class Downloader:
    """Base class for fetching a service's output for each benchmark sample.

    Subclasses set ``service`` and implement ``get_html``. Calling the
    instance generates per-sample JSON cache files and then uploads them
    to the hub as a dataset.
    """

    # Directory holding one ``<idx>.json`` file per processed sample.
    cache_path: Path = Path("cache")
    # Short service name; used in progress output and the uploaded dataset name.
    service: str

    def __init__(self, api_key, app_id, max_rows: int = 2200):
        """Store credentials, create the cache directory and load the benchmark set.

        Args:
            api_key: Credential handed to the service by subclasses.
            app_id: Secondary credential (used by some services only).
            max_rows: Maximum number of dataset rows to process.
        """
        self.cache_path.mkdir(exist_ok=True)
        self.max_rows = max_rows
        self.api_key = api_key
        self.app_id = app_id
        self.ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")

    def get_html(self, pdf_bytes):
        """Convert one PDF; subclasses return a dict with ``md`` and ``time``."""
        raise NotImplementedError

    def upload_ds(self):
        """Collect every cached JSON result and push it to the hub (private)."""
        rows = []
        for file in self.cache_path.glob("*.json"):
            with open(file, "r") as f:
                rows.append(json.load(f))

        out_ds = datasets.Dataset.from_list(rows, features=datasets.Features({
            "md": datasets.Value("string"),
            "uuid": datasets.Value("string"),
            "time": datasets.Value("float"),
        }))
        out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}", private=True)

    def generate_data(self):
        """Run the service on up to ``max_rows`` samples, caching each result.

        Samples whose cache file already exists are skipped, so an
        interrupted run can be resumed. Failures are reported and skipped.
        """
        for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"):
            # Enforce the limit before doing any work. The check used to sit
            # at the end of the loop body, which processed one row too many
            # and was bypassed entirely by the cached-file ``continue``.
            if idx >= self.max_rows:
                break

            cache_file = self.cache_path / f"{idx}.json"
            if cache_file.exists():
                continue

            pdf_bytes = sample["pdf"]  # This is a single page PDF
            try:
                out_data = self.get_html(pdf_bytes)
            except Exception as e:
                # Best effort: JSON decode errors and any other failure are
                # logged the same way and the sample is skipped.
                print(f"Error with sample {idx}: {e}")
                continue

            out_data["uuid"] = sample["uuid"]
            with cache_file.open("w") as f:
                json.dump(out_data, f)

    def __call__(self):
        """Generate the cached results, then upload them."""
        self.generate_data()
        self.upload_ds()
================================================
FILE: benchmarks/overall/download/llamaparse.py
================================================
import io
import time
import requests
from benchmarks.overall.download.base import Downloader
class LlamaParseDownloader(Downloader):
    """Downloader that converts PDFs through the LlamaParse cloud API."""

    service = "llamaparse"

    def get_html(self, pdf_bytes):
        """Upload ``pdf_bytes`` and return ``{"md": markdown, "time": seconds}``."""
        # A timestamp-based name is used for the uploaded file.
        upload_name = str(time.time()) + ".pdf"

        began = time.time()
        stream = io.BytesIO(pdf_bytes)
        markdown = upload_and_parse_file(self.api_key, upload_name, stream)
        finished = time.time()

        # Normalise to text if the service handed back raw bytes.
        markdown = markdown.decode("utf-8") if isinstance(markdown, bytes) else markdown

        return {
            "md": markdown,
            "time": finished - began,
        }
def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: int = 180, delay: int = 1):
    """Upload a PDF to LlamaParse, poll until the job succeeds, return markdown.

    Args:
        api_key: Bearer token for the API.
        fname: File name reported in the upload.
        buff: File-like object holding the PDF bytes.
        max_retries: Number of status polls before giving up.
        delay: Seconds to wait between polls.

    Raises:
        TimeoutError: The job did not report ``SUCCESS`` within ``max_retries`` polls.
        requests.HTTPError: Any request returned an error status.
    """
    auth_headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json"
    }

    # Upload file
    upload = requests.post(
        'https://api.cloud.llamaindex.ai/api/v1/parsing/upload',
        headers=auth_headers,
        files={'file': (fname, buff, 'application/pdf')}
    )
    upload.raise_for_status()
    job_id = upload.json()['id']

    # Poll for completion
    polls_done = 0
    while polls_done < max_retries:
        polls_done += 1
        job_state = requests.get(
            f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}',
            headers=auth_headers
        )
        job_state.raise_for_status()

        if job_state.json()['status'] != 'SUCCESS':
            time.sleep(delay)
            continue

        # Get results
        markdown_result = requests.get(
            f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown',
            headers=auth_headers
        )
        markdown_result.raise_for_status()
        return markdown_result.json()['markdown']

    raise TimeoutError("Job did not complete within the maximum retry attempts")
================================================
FILE: benchmarks/overall/download/main.py
================================================
import click
from benchmarks.overall.download.llamaparse import LlamaParseDownloader
from benchmarks.overall.download.mathpix import MathpixDownloader
from benchmarks.overall.download.mistral import MistralDownloader
@click.command("Download data from inference services")
@click.argument("service", type=click.Choice(["mathpix", "llamaparse", "mistral"]))
@click.option("--max_rows", type=int, default=2200)
@click.option("--api_key", type=str, default=None)
@click.option("--app_id", type=str, default=None)
def main(service: str, max_rows: int, api_key: str, app_id: str):
    """Run the downloader for ``service``: generate results, then upload them."""
    downloader_classes = dict(
        mathpix=MathpixDownloader,
        llamaparse=LlamaParseDownloader,
        mistral=MistralDownloader,
    )
    downloader_cls = downloader_classes[service]
    run_download = downloader_cls(api_key, app_id, max_rows=max_rows)

    # Generate data and upload to hub
    run_download()


if __name__ == "__main__":
    main()
================================================
FILE: benchmarks/overall/download/mathpix.py
================================================
import json
import time
import requests
from benchmarks.overall.download.base import Downloader
class MathpixDownloader(Downloader):
    """Downloader that converts PDFs through the Mathpix PDF API."""

    service = "mathpix"

    def get_html(self, pdf_bytes):
        """Submit ``pdf_bytes`` and return ``{"md": markdown, "time": seconds}``.

        The markdown is an empty string when the conversion did not complete.
        """
        credentials = {
            "app_id": self.app_id,
            "app_key": self.api_key,
        }

        began = time.time()
        pdf_id = mathpix_request(pdf_bytes, credentials)
        outcome = mathpix_status(pdf_id, credentials)
        # Only fetch results for a finished conversion.
        markdown = "" if outcome in ("processing", "error") else mathpix_results(pdf_id, credentials)
        finished = time.time()

        if isinstance(markdown, bytes):
            markdown = markdown.decode("utf-8")

        return {
            "md": markdown,
            "time": finished - began
        }
def mathpix_request(buffer, headers):
    """Submit a PDF to Mathpix requesting md and html output; return its ``pdf_id``."""
    conversion_options = {
        "conversion_formats": {
            "md": True,
            "html": True
        }
    }
    form_fields = {"options_json": json.dumps(conversion_options)}
    upload = {"file": buffer}

    response = requests.post(
        "https://api.mathpix.com/v3/pdf",
        headers=headers,
        data=form_fields,
        files=upload,
    )
    return response.json()["pdf_id"]
def mathpix_status(pdf_id, headers, max_iters: int = 120, delay: float = 1):
    """Poll Mathpix until both the md and html conversions finish.

    Args:
        pdf_id: Identifier returned when the PDF was submitted.
        headers: Authentication headers for the API.
        max_iters: Maximum number of polls before giving up.
        delay: Seconds to sleep before each poll.

    Returns:
        ``"completed"`` when both conversions report completed, otherwise
        ``"error"`` (which also covers running out of polls).
    """
    md_status = "processing"
    html_status = "processing"

    # A bounded ``for`` replaces the original ``while i < max_iters`` whose
    # counter was never incremented, so a stuck job looped forever.
    for _ in range(max_iters):
        time.sleep(delay)
        response = requests.get(
            f"https://api.mathpix.com/v3/converter/{pdf_id}",
            headers=headers
        )
        status_resp = response.json()
        # No conversion status in the response yet; poll again.
        if "conversion_status" not in status_resp:
            continue

        md_status = status_resp["conversion_status"]["md"]["status"]
        html_status = status_resp["conversion_status"]["html"]["status"]
        if md_status == "completed" and html_status == "completed":
            break
        if md_status == "error" or html_status == "error":
            break

    both_done = md_status == "completed" and html_status == "completed"
    return "completed" if both_done else "error"
def mathpix_results(pdf_id, headers, ext="md"):
    """Fetch the converted document in format ``ext`` and return the raw response bytes."""
    result_url = f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}"
    reply = requests.get(result_url, headers=headers)
    return reply.content
================================================
FILE: benchmarks/overall/download/mistral.py
================================================
import io
import time
import requests
from benchmarks.overall.download.base import Downloader
class MistralDownloader(Downloader):
    """Downloader that converts PDFs through the Mistral OCR API."""

    service = "mistral"

    def get_html(self, pdf_bytes):
        """Upload ``pdf_bytes`` and return ``{"md": markdown, "time": seconds}``."""
        # A timestamp-based name is used for the uploaded file.
        upload_name = str(time.time()) + ".pdf"

        began = time.time()
        stream = io.BytesIO(pdf_bytes)
        markdown = upload_and_process_file(self.api_key, upload_name, stream)
        finished = time.time()

        # Normalise to text if the service handed back raw bytes.
        markdown = markdown.decode("utf-8") if isinstance(markdown, bytes) else markdown

        return {
            "md": markdown,
            "time": finished - began,
        }
def upload_and_process_file(api_key: str, fname: str, buff):
    """Upload a PDF to Mistral, OCR it, and return the first page's markdown.

    Three requests are made in sequence: upload the file, fetch a signed URL
    for it, then run OCR against that URL. Each raises
    ``requests.HTTPError`` on an error status.
    """
    base_headers = {
        "Authorization": f"Bearer {api_key}"
    }

    # Step 1: upload the file for OCR use.
    upload_response = requests.post(
        'https://api.mistral.ai/v1/files',
        headers=dict(base_headers),
        files={
            'file': (fname, buff, 'application/pdf'),
            'purpose': (None, 'ocr')
        }
    )
    upload_response.raise_for_status()
    file_id = upload_response.json()['id']

    # Step 2: obtain a signed URL for the uploaded file.
    url_response = requests.get(
        f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24',
        headers={**base_headers, "Accept": "application/json"}
    )
    url_response.raise_for_status()
    signed_url = url_response.json()['url']

    # Step 3: run OCR on the signed URL.
    ocr_payload = {
        "model": "mistral-ocr-latest",
        "document": {
            "type": "document_url",
            "document_url": signed_url
        },
        "include_image_base64": True
    }
    ocr_response = requests.post(
        'https://api.mistral.ai/v1/ocr',
        headers={**base_headers, "Content-Type": "application/json"},
        json=ocr_payload
    )
    ocr_response.raise_for_status()

    first_page = ocr_response.json()["pages"][0]
    return first_page["markdown"]
================================================
FILE: benchmarks/overall/elo.py
================================================
import json
import random
import time
import os
from dataclasses import dataclass
from typing import List, Dict, Tuple, Literal
from PIL import Image
from collections import defaultdict
import tabulate
import click
import datasets
from google import genai
from google.genai.errors import APIError
from pydantic import BaseModel
from tqdm import tqdm
from marker.settings import settings
rating_prompt = """
You're a document analysis expert who is comparing two different markdown samples to an image to see which one represents the content of the image better. The markdown will be called version A and version B.
Here are some notes on the image and markdown:
- Some parts of the page may have been recognized as images and linked from the markdown, like ``.
- Tables will be formatted as Github flavored markdown.
- Block equations will be in LaTeX.
- The image and markdown may be in any language.
- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.
The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.
**Instructions**
Follow this process to evaluate the markdown:
1. Carefully examine the image.
2. Carefully examine the first markdown input provided.
3. Describe how well version a represents the image.
4. Carefully examine the second markdown input provided.
5. Describe how well version B represents the image.
6. Compare version A and version B.
7. Decide which markdown representation is better, based on the criteria below. Output version_a if version a is better, and version_b if version b is better.
Use these criteria when judging the markdown:
- Overall - the overall quality of the markdown as compared to the image.
- Text quality - the quality of the text extraction from the image.
- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.
- Tables - how effectively the tables have been extracted and formatted.
- Forms - how effectively the forms have extracted and formatted.
- Equations - how effectively block equations have been converted to LaTeX.
- Lists - if the lists have been properly extracted and formatted.
- Images - if images are identified and placed correctly.
Notes on scoring:
- Perfect markdown will include all of the important text from the image, and the formatting will be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
- Bad markdown will have major missing text segments from the markdown or completely unreadable formatting. It may also have key values that are different from the values in the image.
Output json, like in the example below.
**Example**
Version A
```markdown
# *Section 1*
This is some *markdown* extracted from a document. Here is a block equation:
$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
```
Version B
```markdown
# Section 1
This is some markdown extracted from a document. Here is a block equation:
$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 124}{t}$$
```
Output
```json
{
"image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
"version_a_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
"version_b_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation. The formatting in version b is slightly different from the image. The value 124 is also different from the image.",
"comparison": "Version A is better than version B. The text and formatting in version A matches the image better than version B. Version B also has an incorrect value.",
"winner": "version_a",
}
```
**Input**
Version A
```markdown
{{version_a}}
```
Version B
```markdown
{{version_b}}
```
**Output**
"""
# Structured response requested from the LLM judge; passed as the
# ``response_schema`` when the comparison prompt is sent. Field names mirror
# the keys of the JSON example in ``rating_prompt``.
class ComparerSchema(BaseModel):
    # Description of the page image.
    image_description: str
    # How well version A represents the image.
    version_a_description: str
    # How well version B represents the image.
    version_b_description: str
    # Comparison of the two versions.
    comparison: str
    # The better version; restricted to these two literal values.
    winner: Literal["version_a", "version_b"]
class Comparer:
    """Ask an LLM which of two markdown versions better represents a page image."""

    def __init__(self):
        pass

    def __call__(
        self,
        img: Image.Image,
        version_a: str,
        version_b: str
    ) -> str | None:
        """Return ``"version_a"`` or ``"version_b"``, or ``None`` if undecidable.

        A missing (``None``) version loses automatically to a present one.
        ``None`` is returned when both versions are missing or when the LLM
        call fails.
        """
        if version_a is None and version_b is None:
            # Nothing to compare; previously this fell through to
            # ``str.replace`` with None and raised TypeError.
            return None
        if version_a is None:
            return "version_b"
        if version_b is None:
            return "version_a"

        hydrated_prompt = rating_prompt.replace("{{version_a}}", version_a).replace("{{version_b}}", version_b)
        try:
            rating = self.llm_rater(img, hydrated_prompt)
        except Exception as e:
            # Best effort: a failed comparison is reported and skipped.
            print(f"Error: {e}")
            return None
        return rating

    def llm_rater(self, img: Image.Image, prompt: str):
        """Send the image and prompt to the LLM and return the ``winner`` value.

        Raises:
            ValueError: The response is missing or has no ``winner`` key.
        """
        response = self.llm_response_wrapper(
            [img, prompt],
            ComparerSchema
        )
        # Explicit check instead of ``assert``: the wrapper returns None on
        # failure, and asserts are stripped under ``python -O``.
        if not response or "winner" not in response:
            raise ValueError(f"Response missing 'winner' key: {response}")
        return response["winner"]

    def llm_response_wrapper(
        self,
        prompt,
        response_schema,
    ):
        """Call Gemini through Vertex AI and return the parsed JSON response.

        The project and location are read from the ``VERTEX_PROJECT_ID`` and
        ``VERTEX_LOCATION`` environment variables. Returns ``None`` when the
        request or JSON parsing fails.
        """
        client = genai.Client(
            http_options={"timeout": 60000},
            vertexai=True,
            project=os.getenv("VERTEX_PROJECT_ID"),
            location=os.getenv("VERTEX_LOCATION"),
        )
        try:
            responses = client.models.generate_content(
                model="gemini-2.0-flash-001",
                contents=prompt,
                config={
                    "temperature": 0,
                    "response_schema": response_schema,
                    "response_mime_type": "application/json",
                },
            )
            output = responses.candidates[0].content.parts[0].text
            return json.loads(output)
        except APIError:
            print("Hit Gemini rate limit")
            return None
        except Exception as e:
            print(f"Error: {e}")
            return None
def display_win_rates_table(win_rates: dict):
    """Print a table of pairwise win/loss counts and win percentages.

    Args:
        win_rates: Nested mapping ``method_a -> method_b -> {"win": int, "loss": int}``.
    """
    headers = ["Method A", "Method B", "Wins", "Losses", "Win %"]
    table = []
    for method_a, method_b_dict in win_rates.items():
        for method_b, results in method_b_dict.items():
            wins = results["win"]
            losses = results["loss"]
            total = wins + losses
            # Guard against a pair with no recorded comparisons.
            win_pct = (wins / total) * 100 if total else 0.0
            table.append([method_a, method_b, wins, losses, win_pct])
    print(tabulate.tabulate(table, headers=headers, tablefmt="pretty"))
# The description is passed as help=; the first positional argument of
# click.command is the command *name*, not its help text.
@click.command(help="Calculate win rates for document conversion methods")
@click.argument("dataset", type=str)
@click.option("--methods", type=str, required=True, help="List of methods to compare: comma separated like marker,mathpix")
@click.option("--row_samples", type=int, default=2, help="Number of samples per row")
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process")
def main(
    dataset: str,
    methods: str,
    row_samples: int,
    max_rows: int
):
    """Compare every pair of methods on each dataset row and print win rates.

    Args:
        dataset: Name or path passed to ``datasets.load_dataset`` ("train" split).
        methods: Comma separated method names; each row must have a
            ``<method>_md`` column.
        row_samples: Accepted for CLI compatibility; not used by this function.
        max_rows: Upper bound on the number of rows processed (all rows if unset).
    """
    ds = datasets.load_dataset(dataset, split="train")
    method_lst = methods.split(",")
    win_rates = {m: defaultdict(lambda: defaultdict(int)) for m in method_lst}
    comparer = Comparer()
    # Clamp so a --max_rows larger than the dataset does not index past the end.
    total_rows = min(max_rows, len(ds)) if max_rows else len(ds)

    for i in tqdm(range(total_rows), desc="Calculating win rates..."):
        row = ds[i]
        # Avoid any bias in ordering
        random.shuffle(method_lst)

        for j, method_a in enumerate(method_lst[:-1]):
            for method_b in method_lst[j + 1:]:
                if method_a == method_b:
                    continue

                method_a_md = row[f"{method_a}_md"]
                method_b_md = row[f"{method_b}_md"]
                if method_a_md is None and method_b_md is None:
                    # Neither method produced output; there is nothing to judge.
                    continue

                winner = comparer(row["img"], method_a_md, method_b_md)
                if not winner:
                    continue

                if winner == "version_a":
                    win_rates[method_a][method_b]["win"] += 1
                    win_rates[method_b][method_a]["loss"] += 1
                else:
                    win_rates[method_b][method_a]["win"] += 1
                    win_rates[method_a][method_b]["loss"] += 1

        # Periodic progress report
        if i % 10 == 0:
            display_win_rates_table(win_rates)

    display_win_rates_table(win_rates)


if __name__ == "__main__":
    main()
================================================
FILE: benchmarks/overall/methods/__init__.py
================================================
import io
import random
import re
from typing import Tuple
import markdown2
from PIL import Image
from playwright.sync_api import sync_playwright
from benchmarks.overall.methods.schema import BenchmarkResult
from marker.renderers.markdown import MarkdownRenderer
class BaseMethod:
    """Base class for benchmarked conversion methods.

    Subclasses declare their configurable artifacts as class attributes and
    implement ``__call__`` to convert one sample.
    """

    def __init__(self, **kwargs):
        # Only attributes already declared on the class are overridden;
        # unknown keyword arguments are silently ignored.
        for kwarg in kwargs:
            if hasattr(self, kwarg):
                setattr(self, kwarg, kwargs[kwarg])

    @staticmethod
    def convert_to_md(html: str):
        """Convert an HTML string to markdown with MarkdownRenderer's converter."""
        md = MarkdownRenderer()
        markdown = md.md_cls.convert(html)
        return markdown

    def __call__(self, sample) -> BenchmarkResult:
        """Convert one benchmark sample; must be implemented by subclasses."""
        raise NotImplementedError()

    def render(self, markdown: str):
        """Render markdown to an image by way of HTML."""
        return self.html_to_image(self.convert_to_html(markdown))

    @staticmethod
    def convert_to_html(md: str):
        """Convert markdown to HTML while leaving ``$...$`` / ``$$...$$`` math intact.

        Math spans are swapped for placeholders before markdown2 runs and
        restored afterwards.
        """
        block_placeholders = []
        inline_placeholders = []

        # Add placeholders for the math.
        # The index is terminated by a non-digit ("X") so that no placeholder
        # is a prefix of another one. With the old "...{n}1" form, the
        # placeholder for index 1 ("...11") matched inside the placeholder for
        # index 11 ("...111") and corrupted it during restoration.
        def block_sub(match):
            content = match.group(1)
            placeholder = f"1BLOCKMATH{len(block_placeholders)}X1"
            block_placeholders.append((placeholder, f"$${content}$$"))
            return placeholder

        def inline_sub(match):
            content = match.group(1)
            placeholder = f"1INLINEMATH{len(inline_placeholders)}X1"
            inline_placeholders.append((placeholder, f"${content}$"))
            return placeholder

        md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
        md = re.sub(r'\$(.*?)\$', inline_sub, md)

        html = markdown2.markdown(md, extras=['tables'])

        # Replace placeholders
        for placeholder, math_str in block_placeholders:
            html = html.replace(placeholder, math_str)
        for placeholder, math_str in inline_placeholders:
            html = html.replace(placeholder, math_str)

        return html

    def html_to_image(self, html: str) -> Image.Image:
        """Render an HTML fragment in Chromium (via Playwright) and return a full-page screenshot.

        The fragment is wrapped in a page that loads KaTeX from a CDN and
        auto-renders ``$``/``$$`` delimited math.
        """
        html_str = f"""
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
<!-- The loading of KaTeX is deferred to speed up page rendering -->
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
<!-- To automatically render math in text elements, include the auto-render extension: -->
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
</head>
<body>
{html}
<script>
document.addEventListener("DOMContentLoaded", function() {{
renderMathInElement(document.body, {{
delimiters: [
{{left: '$$', right: '$$', display: true}},
{{left: '$', right: '$', display: false}}
],
throwOnError : false
}});
}});
</script>
</body>
</html>
""".strip()
        with sync_playwright() as p:
            browser = p.chromium.launch()
            try:
                page = browser.new_page()
                page.set_viewport_size({"width": 1200, "height": 800})
                page.set_content(html_str)
                page.wait_for_load_state("domcontentloaded")
                page.wait_for_timeout(500)  # Wait for KaTeX to render
                screenshot_bytes = page.screenshot(full_page=True)
            finally:
                # Close the browser even if rendering or the screenshot fails.
                browser.close()

        return Image.open(io.BytesIO(screenshot_bytes))
================================================
FILE: benchmarks/overall/methods/docling.py
================================================
import tempfile
import time
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
class DoclingMethod(BaseMethod):
    """Benchmark method that converts the sample PDF with docling."""
    model_dict: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        """Convert ``sample["pdf"]`` to markdown and time the conversion."""
        # Imported lazily so docling is only required when this method runs.
        from docling.document_converter import DocumentConverter
        pdf_bytes = sample["pdf"]  # This is a single page PDF
        converter = DocumentConverter()

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            # The converter opens the file by name, so buffered bytes must be
            # on disk before it reads them.
            f.flush()
            start = time.time()
            result = converter.convert(f.name)
            total = time.time() - start

        return {
            "markdown": result.document.export_to_markdown(),
            "time": total
        }
================================================
FILE: benchmarks/overall/methods/gt.py
================================================
from typing import List
import json
from PIL import Image
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
class GTMethod(BaseMethod):
    """Pseudo-method that returns the ground-truth blocks of a sample as markdown."""

    def __call__(self, sample) -> BenchmarkResult:
        """Return one markdown string per ground-truth block with non-empty html."""
        markdown_blocks = []
        for block in json.loads(sample["gt_blocks"]):
            block_html = block["html"]
            if len(block_html) > 0:
                markdown_blocks.append(self.convert_to_md(block_html))
        return {
            "markdown": markdown_blocks,
            "time": 0
        }

    def render(self, html: List[str]) -> Image.Image:
        """Join the given HTML blocks into one page and render it to an image."""
        joined = "\n\n".join(html)
        page_html = f"""
<html>
<head></head>
<body>
{joined}
</body>
</html>
""".strip()
        return self.html_to_image(page_html)
================================================
FILE: benchmarks/overall/methods/llamaparse.py
================================================
import datasets
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
class LlamaParseMethod(BaseMethod):
    """Benchmark method that looks up precomputed LlamaParse output by sample uuid."""
    llamaparse_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        """Return the stored markdown and timing for ``sample["uuid"]``.

        Raises:
            ValueError: If no row with a matching uuid is found.
        """
        sample_uuid = sample["uuid"]
        wanted = str(sample_uuid)
        # Linear scan; the first row whose uuid matches wins.
        match = next(
            (row for row in self.llamaparse_ds if str(row["uuid"]) == wanted),
            None,
        )
        if not match:
            raise ValueError(f"Could not find data for uuid {sample_uuid}")

        return {
            "markdown": match["md"],
            "time": match["time"]
        }
================================================
FILE: benchmarks/overall/methods/marker.py
================================================
import os
import tempfile
import time
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
class MarkerMethod(BaseMethod):
    """Benchmark method that converts the sample PDF with marker's PdfConverter."""
    model_dict: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        """Convert ``sample["pdf"]`` to markdown and time the conversion."""
        pdf_bytes = sample["pdf"]  # This is a single page PDF
        parser = ConfigParser({
            "page_range": "0",
            "disable_tqdm": True,
            "use_llm": self.use_llm,
            "redo_inline_math": self.use_llm,
            "llm_service": "marker.services.vertex.GoogleVertexService",
            "vertex_project_id": os.getenv("VERTEX_PROJECT_ID"),
        })

        block_converter = PdfConverter(
            artifact_dict=self.model_dict,
            config=parser.generate_config_dict(),
            llm_service=parser.get_llm_service()
        )

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            # The converter is given the file name, so buffered bytes must be
            # on disk before it reads them.
            f.flush()
            start = time.time()
            rendered = block_converter(f.name)
            total = time.time() - start

        return {
            "markdown": rendered.markdown,
            "time": total
        }
================================================
FILE: benchmarks/overall/methods/mathpix.py
================================================
import datasets
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
class MathpixMethod(BaseMethod):
    """Benchmark method that looks up precomputed Mathpix output by sample uuid."""
    mathpix_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        """Return the stored markdown and timing for ``sample["uuid"]``.

        Raises:
            ValueError: If no row with a matching uuid is found.
        """
        sample_uuid = sample["uuid"]
        wanted = str(sample_uuid)
        # Linear scan; the first row whose uuid matches wins.
        match = next(
            (row for row in self.mathpix_ds if str(row["uuid"]) == wanted),
            None,
        )
        if not match:
            raise ValueError(f"Could not find data for uuid {sample_uuid}")

        return {
            "markdown": match["md"],
            "time": match["time"]
        }
================================================
FILE: benchmarks/overall/methods/mistral.py
================================================
import datasets
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
class MistralMethod(BaseMethod):
    """Benchmark method that looks up precomputed Mistral output by sample uuid."""
    mistral_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        """Return the stored markdown and timing for ``sample["uuid"]``.

        Raises:
            ValueError: If no row with a matching uuid is found.
        """
        sample_uuid = sample["uuid"]
        wanted = str(sample_uuid)
        # Linear scan; the first row whose uuid matches wins.
        match = next(
            (row for row in self.mistral_ds if str(row["uuid"]) == wanted),
            None,
        )
        if not match:
            raise ValueError(f"Could not find data for uuid {sample_uuid}")

        return {
            "markdown": match["md"],
            "time": match["time"]
        }
================================================
FILE: benchmarks/overall/methods/olmocr.py
================================================
import base64
import json
import tempfile
import time
from io import BytesIO
import torch
from PIL import Image
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
def convert_single_page(filename: str, model, processor, device):
    """Run the olmOCR model on one PDF and return the extracted text.

    The model output is expected to be JSON with a "natural_text" key. If it
    cannot be parsed that way, everything after the first occurrence of
    "natural_text" in the raw output is used, and an empty string is returned
    when that fails too.
    """
    # Imported lazily so olmocr is only required when this method runs.
    from olmocr.data.renderpdf import render_pdf_to_base64png
    from olmocr.prompts import build_finetuning_prompt
    from olmocr.prompts.anchor import get_anchor_text

    page_png_b64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024)

    # Build the prompt, using document metadata
    anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    # Build the full prompt: one user turn holding the text and the page image
    text_part = {"type": "text", "text": prompt}
    image_part = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{page_png_b64}"}}
    messages = [{"role": "user", "content": [text_part, image_part]}]

    # Apply the chat template and processor
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    page_image = Image.open(BytesIO(base64.b64decode(page_png_b64)))

    encoded = processor(
        text=[chat_text],
        images=[page_image],
        padding=True,
        return_tensors="pt",
    )
    inputs = {}
    for key, value in encoded.items():
        inputs[key] = value.to(device)

    # Generate the output
    output = model.generate(
        **inputs,
        temperature=0.8,
        max_new_tokens=8192,
        num_return_sequences=1,
        do_sample=True,
    )

    # Decode only the newly generated tokens (everything after the prompt)
    prompt_length = inputs["input_ids"].shape[1]
    decoded = processor.tokenizer.batch_decode(
        output[:, prompt_length:], skip_special_tokens=True
    )[0]

    try:
        decoded = json.loads(decoded)
        return decoded["natural_text"]
    except Exception:
        pass

    # Fallback: `decoded` is whatever the attempt above left behind.
    try:
        return decoded.split("natural_text")[1].strip()
    except Exception:
        return ""
class OlmOCRMethod(BaseMethod):
    """Benchmark method that converts the sample PDF with the olmOCR model."""
    # Expected to hold "model" and "processor" entries (both are read below).
    olmocr_model: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        """Convert ``sample["pdf"]`` to markdown and time the conversion."""
        pdf_bytes = sample["pdf"]  # This is a single page PDF

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            # convert_single_page reads the file by name, so buffered bytes
            # must be on disk first.
            f.flush()
            start = time.time()
            result = convert_single_page(f.name, self.olmocr_model["model"], self.olmocr_model["processor"], self.olmocr_model["model"].device)
            total = time.time() - start

        return {
            "markdown": result,
            "time": total
        }
================================================
FILE: benchmarks/overall/methods/schema.py
================================================
from typing import TypedDict, List
class BenchmarkResult(TypedDict):
    """Return value of a benchmark method's ``__call__``."""
    # Converted markdown; the ground-truth method returns one string per block.
    markdown: str | List[str]
    # Time reported for the conversion.
    time: float | None
================================================
FILE: benchmarks/overall/overall.py
================================================
import json
import os
import traceback
from collections import defaultdict
from pathlib import Path
from typing import List
import click
import datasets
import torch
from tqdm import tqdm
from benchmarks.overall.display.dataset import build_dataset
from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY
from benchmarks.overall.schema import FullResult
from marker.logger import configure_logging
from marker.models import create_model_dict
from marker.settings import settings
from benchmarks.overall.display.table import print_scores
configure_logging()
def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], score_types: List[str], artifacts: dict, max_rows=None) -> FullResult:
    """Run every method and scorer over the benchmark dataset and collect the results.

    Args:
        benchmark_dataset: Dataset whose rows provide "classification" and
            "gt_blocks" (a JSON string) plus whatever the methods read.
        methods: Keys of METHOD_REGISTRY to run on each row.
        score_types: Keys of SCORE_REGISTRY to score each method with.
        artifacts: Keyword arguments passed to every method constructor.
        max_rows: Stop after this many rows when set.

    Returns:
        Per-row scores and markdown, plus score lists grouped by document
        type and by ground-truth block type, and per-method timings.
    """
    bench_scores = {}
    averages_by_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    averages_by_block_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    average_times = defaultdict(list)
    markdown_by_method = defaultdict(dict)
    total_rows = len(benchmark_dataset)
    if max_rows:
        total_rows = min(max_rows, total_rows)
    for idx, sample in tqdm(enumerate(benchmark_dataset), desc="Running benchmark", total=total_rows):
        if max_rows is not None and idx >= max_rows:
            break

        doc_type = sample["classification"]
        gt_cls = METHOD_REGISTRY["gt"]
        # The ground-truth method drops blocks with empty html, and per-block
        # scores are produced for the remaining blocks only. Apply the same
        # filter here so the zip below pairs each score with its own block.
        gt_blocks = [block for block in json.loads(sample["gt_blocks"]) if len(block["html"]) > 0]
        gt_md = gt_cls(**artifacts)(sample)["markdown"]
        markdown_by_method[idx]["gt"] = gt_md

        out_data = defaultdict(dict)

        try:
            for method in methods:
                method_cls = METHOD_REGISTRY[method](**artifacts)
                method_info = method_cls(sample)
                method_md = method_info["markdown"]
                if method_md is None:
                    method_md = ""  # Avoid None values

                average_times[method].append(method_info["time"])
                markdown_by_method[idx][method] = method_md

                for score_type in score_types:
                    score_cls = SCORE_REGISTRY[score_type]()
                    try:
                        scores = score_cls(sample, gt_md, method_md)
                    except Exception as e:
                        # Some scorers can fail, like the LLM one
                        print(f"Failed to score {method} with {score_type}: {e}")
                        continue

                    out_data[method][score_type] = scores

                    averages_by_type[method][score_type][doc_type].append(scores["score"])

                    if "by_block" in scores["specific_scores"]:  # Not all scorers support this
                        for score, gt_block in zip(scores["specific_scores"]["by_block"], gt_blocks):
                            averages_by_block_type[method][score_type][gt_block["block_type"]].append(score)
        except Exception as e:
            print(f"Failed to process {idx}: {e}")
            traceback.print_exc()
            # Drop the markdown recorded for a row that failed.
            if idx in markdown_by_method:
                del markdown_by_method[idx]
            continue

        bench_scores[idx] = out_data

    return {
        "scores": bench_scores,
        "markdown": markdown_by_method,
        "averages_by_type": averages_by_type,
        "averages_by_block_type": averages_by_block_type,
        "average_times": average_times,
    }
@click.command(help="Benchmark PDF to MD conversion.")
@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
@click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
@click.option("--methods", type=str, help="Comma separated list of other methods to compare against. Possible values: marker,mathpix,llamaparse,docling,mistral", default="marker")
@click.option("--scores", type=str, help="Comma separated list of scoring functions to use. Possible values: heuristic,llm", default="heuristic")
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
@click.option("--languages", type=str, help="Comma separated list of languages to use for LLM", default=None)
def main(
    dataset: str,
    out_dataset: str,
    methods: str,
    scores: str,
    result_path: str,
    max_rows: int,
    use_llm: bool,
    languages: str
):
    """Run the benchmark, print the score tables and write ``result.json``.

    Raises:
        ValueError: If an unknown method or score type is requested.
    """
    out_path = Path(result_path)
    out_path.mkdir(parents=True, exist_ok=True)

    methods = methods.split(",")
    for method in methods:
        if method not in METHOD_REGISTRY:
            raise ValueError(f"Method {method} not allowed. Allowed methods are {METHOD_REGISTRY.keys()}")

    # Ensure marker is always first.
    # dict.fromkeys de-duplicates while keeping the order given on the command
    # line; a set would reorder methods differently from run to run.
    all_methods = list(dict.fromkeys(methods))
    methods = ["marker"] if "marker" in all_methods else []
    methods += [m for m in all_methods if m != "marker"]

    score_types = scores.split(",")
    for score_type in score_types:
        if score_type not in SCORE_REGISTRY:
            raise ValueError(f"Score type {score_type} not allowed. Allowed types are {SCORE_REGISTRY.keys()}")

    languages = languages.split(",") if languages else None

    benchmark_dataset = datasets.load_dataset(dataset, split="train")
    if languages:
        benchmark_dataset = benchmark_dataset.filter(lambda x: x["language"] in languages)

    artifacts = {
        "model_dict": create_model_dict(),
        "use_llm": use_llm,
        "mathpix_ds": None,
        "llamaparse_ds": None,
    }

    # Load only the comparison artifacts that the selected methods need.
    if "mathpix" in methods:
        artifacts["mathpix_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")

    if "llamaparse" in methods:
        artifacts["llamaparse_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_llamaparse", split="train")

    if "mistral" in methods:
        artifacts["mistral_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mistral", split="train")

    if "olmocr" in methods:
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
        model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview",
                                                                torch_dtype=torch.bfloat16).eval()
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
        model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        artifacts["olmocr_model"] = {"model": model, "processor": processor}

    print(f"Running benchmark with methods: {methods} and scores: {score_types}")
    result = get_method_scores(benchmark_dataset, methods, score_types, artifacts, max_rows=max_rows)

    # Display benchmark scoring tables
    print_scores(result, out_path, methods, score_types, default_method=methods[0], default_score_type=score_types[0])

    # Write to json
    with open(out_path / "result.json", "w") as f:
        json.dump(result, f)

    if out_dataset:
        if use_llm:
            out_dataset += "_llm"
        built_dataset = build_dataset(benchmark_dataset, result, score_types, max_rows=max_rows)
        built_dataset.push_to_hub(out_dataset, private=True)


if __name__ == "__main__":
    main()
================================================
FILE: benchmarks/overall/registry.py
================================================
from benchmarks.overall.methods.docling import DoclingMethod
from benchmarks.overall.methods.gt import GTMethod
from benchmarks.overall.methods.llamaparse import LlamaParseMethod
from benchmarks.overall.methods.marker import MarkerMethod
from benchmarks.overall.methods.mathpix import MathpixMethod
from benchmarks.overall.methods.mistral import MistralMethod
from benchmarks.overall.methods.olmocr import OlmOCRMethod
from benchmarks.overall.scorers.heuristic import HeuristicScorer
from benchmarks.overall.scorers.llm import LLMScorer
# Scorer classes keyed by the name used to select them (e.g. via --scores).
SCORE_REGISTRY = {
    "heuristic": HeuristicScorer,
    "llm": LLMScorer
}
# Conversion method classes keyed by the name used to select them
# (e.g. via --methods). "gt" is the ground-truth method.
METHOD_REGISTRY = {
    "marker": MarkerMethod,
    "gt": GTMethod,
    "mathpix": MathpixMethod,
    "llamaparse": LlamaParseMethod,
    "docling": DoclingMethod,
    "olmocr": OlmOCRMethod,
    "mistral": MistralMethod
}
================================================
FILE: benchmarks/overall/schema.py
================================================
from typing import TypedDict, List, Dict
from benchmarks.overall.scorers.schema import BlockScores
# Three-level mapping of names to lists of scores.
AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]]
class FullResult(TypedDict):
    """Shape of the dictionary returned by the overall benchmark run."""
    # Row index -> method -> score type -> scores for that row.
    scores: Dict[int, Dict[str, Dict[str, BlockScores]]]
    averages_by_type: AVG_TYPE
    averages_by_block_type: AVG_TYPE
    # Method -> one timing entry per processed row.
    average_times: Dict[str, List[float]]
    # Row index -> method -> markdown produced for that row.
    markdown: Dict[int, Dict[str, str]]
================================================
FILE: benchmarks/overall/scorers/__init__.py
================================================
from typing import List
from benchmarks.overall.scorers.schema import BlockScores
class BaseScorer:
    """Interface for scorers that rate a method's markdown for one sample."""

    def __init__(self):
        pass

    def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
        """Score `method_markdown` for `sample`; must be implemented by subclasses."""
        raise NotImplementedError()
================================================
FILE: benchmarks/overall/scorers/clean.py
================================================
import re
import subprocess
import tempfile
from pathlib import Path
import latex2mathml.converter
class MarkdownCleaner:
    """Normalise markdown so that texts from different converters can be compared."""

    def __init__(self):
        pass

    def __call__(self, markdown):
        """Return a normalised, lower-cased, single-spaced version of `markdown`."""
        markdown = self.normalize_markdown(markdown)  # Use pandoc to normalize

        # Replace math expressions with latexml
        pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
        markdown = re.sub(pattern, self.standardize_math, markdown)

        # Replace image urls with a generic tag
        pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)'
        markdown = re.sub(pattern, r'![link]', markdown)

        # Clean up stray html tags
        markdown = markdown.replace("<br>", "\n")
        markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
        markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
        markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown)  # Remove span tags and keep content

        # Clean up markdown formatting
        markdown = re.sub(r"\s+", " ", markdown)
        markdown = re.sub(r"\n+", "\n", markdown)
        markdown = re.sub("\\.+", ".",
                          markdown)  # Replace repeated periods with a single period, like in table of contents
        markdown = re.sub("#+", "#", markdown)  # Replace repeated headers with a single header
        # NOTE(review): this round-trip also re-interprets non-ASCII UTF-8 bytes
        # - confirm that is acceptable for non-English documents.
        markdown = markdown.encode().decode('unicode-escape', errors="ignore")  # Decode unicode characters properly
        return markdown.strip().lower()

    @staticmethod
    def normalize_markdown(md_text: str) -> str:
        """Round-trip markdown through pandoc (markdown -> html -> markdown).

        Raises:
            subprocess.CalledProcessError: If either pandoc invocation fails.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            dirpath = Path(tmp_dir)
            input_file = dirpath / 'input.md'
            input_file.write_text(md_text, encoding='utf-8')

            # Markdown to HTML
            html_file = dirpath / 'temp.html'
            subprocess.run(
                [
                    'pandoc',
                    str(input_file),
                    '-f', 'markdown+tex_math_dollars',
                    '-t', 'html',
                    '-o', str(html_file),
                    '--quiet'
                ],
                check=True
            )

            # HTML to Markdown
            output_file = dirpath / 'output.md'
            subprocess.run(
                [
                    'pandoc',
                    str(html_file),
                    '-f', 'html',
                    '-t', 'markdown+tex_math_dollars',
                    '-o', str(output_file),
                    '--quiet'
                ],
                check=True
            )

            # Read back the normalized Markdown
            normalized_md = output_file.read_text(encoding='utf-8')

        return normalized_md

    def standardize_math(self, match):
        """Regex callback: convert block math to MathML and tidy inline math.

        On any failure the original matched text is returned unchanged.
        """
        try:
            delim = "$$" if match.group(0).startswith('$$') else "$"
            math_content = match.group(1) or match.group(2)
            if delim == "$$":
                math_content = latex2mathml.converter.convert(math_content)
            else:
                math_content = self.clean_latex(math_content)
            return f'{delim}{math_content}{delim}'
        except Exception as e:
            print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
            return match.group(0)

    @staticmethod
    def clean_latex(latex_str):
        """Simplify inline LaTeX: collapse whitespace, unwrap text macros, map operators.

        Operator macros are replaced only when they are complete commands,
        i.e. not followed by another letter. Plain substring replacement used
        to turn ``\\left`` into ``<=ft``, ``\\geq`` into ``>=q`` and ``\\top``
        into ``\\rightarrowp``.
        """
        latex_str = re.sub(r'\s+', ' ', latex_str.strip())
        for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
            latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)

        replacements = {
            '\\times': '*',
            '\\cdot': '*',
            '\\div': '/',
            '\\leq': '<=',
            '\\le': '<=',
            '\\geq': '>=',
            '\\ge': '>=',
            '\\neq': '!=',
            '\\to': '\\rightarrow',
        }

        for old, new in replacements.items():
            command = re.escape(old) + r'(?![A-Za-z])'
            # A function replacement keeps backslashes in `new` literal
            # (a template string would read "\r" as a carriage return).
            latex_str = re.sub(command, lambda _match, new=new: new, latex_str)

        return latex_str
================================================
FILE: benchmarks/overall/scorers/heuristic.py
================================================
from typing import List
from rapidfuzz import fuzz
from benchmarks.overall.scorers.clean import MarkdownCleaner
from benchmarks.overall.scorers.schema import BlockScores
from benchmarks.overall.scorers import BaseScorer
class HeuristicScorer(BaseScorer):
    """Scores markdown by fuzzy-matching each ground-truth block against it."""

    def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
        """Return an overall score plus order and per-block scores.

        The overall score is 80% length-weighted block match and 20% block
        order. Empty method output scores zero everywhere.
        """
        if not method_markdown:
            return {
                "score": 0,
                "specific_scores": {
                    "order": 0,
                    "by_block": [0] * len(gt_markdown)
                }
            }

        # Standardize inputs
        gt_markdown = [self.clean_input(block) for block in gt_markdown]
        method_markdown = self.clean_input(method_markdown)

        block_scores = []
        match_starts = []
        for alignment in self.find_fuzzy_alignments(method_markdown, gt_markdown):
            block_scores.append(alignment["score"])
            match_starts.append(alignment["start"])

        # Order score: compare ground-truth order with the order in which the
        # blocks were located in the method output
        block_count = len(gt_markdown)
        expected_order = list(range(block_count))
        found_order = sorted(range(block_count), key=lambda x: match_starts[x])
        order_score = self.kendall_tau(expected_order, found_order)

        # Weight each block's score by the length of its ground-truth text
        block_lengths = [len(block) for block in gt_markdown]
        weighted_total = sum([score * length for score, length in zip(block_scores, block_lengths)])
        overall_score = weighted_total / max(1, sum(block_lengths))
        overall_score = overall_score * 0.8 + order_score * 0.2

        return {
            "score": overall_score,
            "specific_scores": {
                "order": order_score,
                "by_block": block_scores
            },
        }

    @staticmethod
    def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
        """Return a Kendall-tau style agreement between two orderings on a 0-100 scale."""
        n = len(correct_order)
        if n <= 1:
            return 100

        concordant = 0
        discordant = 0
        for i in range(n - 1):
            for j in range(i + 1, n):
                correct_sign = correct_order[i] - correct_order[j]
                actual_sign = actual_order[i] - actual_order[j]
                # Same sign -> positive product; opposite signs -> negative.
                agreement = correct_sign * actual_sign
                if agreement > 0:
                    concordant += 1
                elif agreement < 0:
                    discordant += 1

        total_pairs = (n * (n - 1)) // 2
        tau = (concordant - discordant) / total_pairs
        # Shift from [-1, 1] to [0, 1], then scale to 0-100
        return ((tau + 1) / 2) * 100

    @staticmethod
    def find_fuzzy_alignments(
        main_string: str,
        substrings: List[str],
        threshold: int = 70
    ) -> List[dict]:
        """Locate each substring in `main_string` with rapidfuzz partial-ratio alignment.

        Substrings with no alignment result get a score of 0 and a 0-0 span.
        """
        alignments = []
        for idx, substr in enumerate(substrings):
            result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold)
            if result:
                score, dest_start, dest_end = result.score, result.dest_start, result.dest_end
            else:
                score, dest_start, dest_end = 0, 0, 0

            alignments.append({
                "string": substr,
                "start": dest_start,
                "end": dest_end,
                "score": score,
                "idx": idx
            })
        return alignments

    @staticmethod
    def clean_input(md: str):
        """Normalise markdown with a fresh MarkdownCleaner."""
        return MarkdownCleaner()(md)
================================================
FILE: benchmarks/overall/scorers/llm.py
================================================
import json
import os
import tempfile
import time
from typing import List
from PIL import Image
from google.genai.errors import APIError
from google import genai
import pypdfium2 as pdfium
from benchmarks.overall.scorers import BaseScorer, BlockScores
from marker.settings import settings
# Prompt template for the LLM scorer; "{{markdown}}" is substituted at call time.
# This is a raw string so that the LaTeX example reaches the model verbatim:
# in a normal string "\f" (in "\frac") is a form-feed character and "\c" is an
# invalid escape sequence.
rating_prompt = r"""
You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided.
You're given an image, along with the extracted markdown:
- Some parts of the page may have been recognized as images and linked from the markdown, like ``.
- Tables will be formatted as Github flavored markdown.
- Block equations will be in LaTeX.
- The image and markdown may be in any language.
- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.
The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.
**Instructions**
Follow this process to evaluate the markdown:
1. Carefully examine the image.
2. Carefully examine the markdown input provided.
3. Compare the image to the markdown representation. Does the markdown representation properly represent the important text and formatting in the image?
4. Assign component scores, as described below.
These are the primary scores:
- Overall - the overall quality of the markdown as compared to the image.
- Text quality - the quality of the text extraction from the image.
- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.
Depending on which elements are present in the markdown, you will assign element-specific scores.
- Tables - how effectively the tables have been extracted and formatted.
- Forms - how effectively the forms have extracted and formatted.
- Equations - how effectively block equations have been converted to LaTeX.
- Section headers - if all of the section headers have been detected, and the right levels set.
- Lists - if the lists have been properly extracted and formatted.
- Images - if images are identified and placed correctly.
Notes on scoring:
- To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
- A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues.
- A 1/5 will have major missing text segments from the markdown or completely unreadable formatting.
- Use 0/5 if a field isn't applicable, like if the image doesn't contain a table.
If text that is important to the meaning of the document is missing, do not score higher than 3/5.
Output json, like in the example below.
**Example**
Input
```markdown
# Section 1
This is some *markdown* extracted from a document. Here is a block equation:
$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
```
Output
```json
{
"image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
"markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
"comparison": "The text and formatting matches the image. There are no formatting or text extraction issues. The equations and section headers are correct.",
"overall": 5,
"text": 5,
"formatting": 5,
"section_headers": 5,
"tables": 0,
"forms": 0,
"equations": 5,
"lists": 0,
"images": 0
}
```
**Input**
```markdown
{{markdown}}
```
**Output**
"""
# Keys requested in the LLM's JSON response, grouped by kind.
comparison_keys = ["comparison"]
description_keys = ["image_description", "markdown_description"]
# Free-text (string) fields.
text_keys = comparison_keys + description_keys
# Integer score fields; "overall" becomes the headline score.
score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations",
              "lists", "images"]
class LLMScorer(BaseScorer):
    """Scores markdown by asking an LLM to rate it against an image of the page."""

    def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores:
        """Render the first page of ``sample["pdf"]`` and rate `markdown` against it.

        `gt_markdown` is accepted for interface compatibility and is not used.
        """
        pdf_bytes = sample["pdf"]
        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(pdf_bytes)
            f.flush()
            f.seek(0)
            doc = pdfium.PdfDocument(f.name)
            try:
                img = doc[0].render(scale=96/72).to_pil()
            finally:
                # Release the document even if rendering fails.
                doc.close()

        return self.llm_rater(img, markdown)

    def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores:
        """Ask the LLM for component scores and return them.

        Empty markdown is not sent to the LLM: every score is set to 1 and
        every text field to an empty string.

        Raises:
            ValueError: If the LLM response lacks any of the requested keys.
        """
        if not markdown:
            null_scores = {k: 1 for k in score_keys}
            text_scores = {k: "" for k in text_keys}
            null_scores.update(text_scores)
            return {
                "score": 1,
                "specific_scores": null_scores
            }

        # Build the response schema: integer score fields, string text fields.
        req_keys = text_keys + score_keys
        properties = {}
        for key in req_keys:
            content_type = "INTEGER" if key in score_keys else "STRING"
            properties[key] = {"type": content_type}

        response_schema = {
            "required": req_keys,
            "properties": properties,
            "type": "OBJECT"
        }
        prompt = rating_prompt.replace("{{markdown}}", markdown)
        response = self.llm_response_wrapper([img, prompt], response_schema)
        # Explicit check instead of assert so it still runs under `python -O`.
        if not all(k in response for k in req_keys):
            raise ValueError(f"Missing keys in response: {response}")
        return {
            "score": response["overall"],
            "specific_scores": response,
        }

    def llm_response_wrapper(self, prompt, response_schema, depth=0):
        """Send `prompt` to Gemini on Vertex AI and return the parsed JSON reply.

        On an APIError the call is retried after a 120 second pause, up to
        three retries; the error from the final attempt is re-raised
        immediately instead of after one more pause.
        """
        client = genai.Client(
            http_options={"timeout": 60000},
            vertexai=True,
            project=os.getenv("VERTEX_PROJECT_ID"),
            location=os.getenv("VERTEX_LOCATION"),
        )
        try:
            responses = client.models.generate_content(
                model="gemini-2.0-flash-001",
                contents=prompt,
                config={
                    "temperature": 0,
                    "response_schema": response_schema,
                    "response_mime_type": "application/json",
                },
            )
            output = responses.candidates[0].content.parts[0].text
            return json.loads(output)
        except APIError as e:
            if depth > 2:
                # Out of retries; do not sleep before giving up.
                raise e
            print("Hit Gemini rate limit, waiting 120 seconds")
            time.sleep(120)
            return self.llm_response_wrapper(prompt, response_schema, depth + 1)
================================================
FILE: benchmarks/overall/scorers/schema.py
================================================
from typing import TypedDict, List, Optional, Dict
class BlockScores(TypedDict):
    # Result shape returned by the scorers.
    # Headline score for the sample.
    score: float
    # Per-category values keyed by name; each is a single number or a list of numbers.
    specific_scores: Dict[str, float | List[float]]
================================================
FILE: benchmarks/table/__init__.py
================================================
================================================
FILE: benchmarks/table/gemini.py
================================================
import json
from PIL import Image
from google import genai
from google.genai import types
from io import BytesIO
from pydantic import BaseModel
from marker.settings import settings
# Instruction text sent to Gemini together with the table image (see gemini_table_rec).
prompt = """
You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation.
Guidelines:
- Keep the HTML simple and concise.
- Only include the <table> tag and contents.
- Only use <table>, <tr>, and <td> tags. Only use the colspan and rowspan attributes if necessary. Do not use <tbody>, <thead>, or <th> tags.
- Make sure the table is as faithful to the image as possible with the given tags.
**Instructions**
1. Analyze the image, and determine the table structure.
2. Convert the table image to HTML, following the guidelines above.
3. Output only the HTML for the table, starting with the <table> tag and ending with the </table> tag.
""".strip()
# Response schema passed to Gemini: a JSON object with one string field.
class TableSchema(BaseModel):
    # HTML for the recognized table.
    table_html: str
def gemini_table_rec(image: Image.Image):
    """Ask Gemini to transcribe a table image and return the resulting HTML string."""
    client = genai.Client(
        api_key=settings.GOOGLE_API_KEY,
        http_options={"timeout": 60000}
    )

    # Encode the image as PNG in memory so it can be attached to the request.
    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")
    image_part = types.Part.from_bytes(data=png_buffer.getvalue(), mime_type="image/png")

    generation_config = {
        "temperature": 0,
        "response_schema": TableSchema,
        "response_mime_type": "application/json",
    }
    responses = client.models.generate_content(
        model="gemini-2.0-flash",
        # According to gemini docs, it performs better if the image is the first element
        contents=[image_part, prompt],
        config=generation_config,
    )

    first_part = responses.candidates[0].content.parts[0]
    return json.loads(first_part.text)["table_html"]
================================================
FILE: benchmarks/table/inference.py
================================================
from typing import List
import numpy as np
from bs4 import BeautifulSoup
import pypdfium2 as pdfium
from tqdm import tqdm
import base64
import tempfile
from benchmarks.table.gemini import gemini_table_rec
from marker.config.parser import ConfigParser
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.processors.llm.llm_table import LLMTableProcessor
from marker.processors.table import TableProcessor
from marker.renderers.json import JSONBlockOutput
from marker.schema.polygon import PolygonBox
from marker.util import matrix_intersection_area
def extract_tables(children: List[JSONBlockOutput]):
    """Return every block of type 'Table' found in ``children``, searching depth-first.

    A Table block is collected as-is and its own children are not searched.
    """
    found = []
    for block in children:
        if block.block_type == 'Table':
            found.append(block)
            continue
        if block.children:
            found += extract_tables(block.children)
    return found
def fix_table_html(table_html: str) -> str:
    """Normalize table HTML so it can be compared with the ground truth.

    Unwraps the first <tbody>, turns <th> cells into <td>, replaces <br>
    tags with empty strings, and replaces newlines with spaces.
    """
    soup = BeautifulSoup(table_html, 'html.parser')

    first_tbody = soup.find('tbody')
    if first_tbody:
        first_tbody.unwrap()

    for header_cell in soup.find_all('th'):
        header_cell.name = 'td'

    for line_break in soup.find_all('br'):
        line_break.replace_with(soup.new_string(''))

    # Fintabnet uses spaces instead of newlines
    return str(soup).replace("\n", " ")
def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool):
    """Run marker (and optionally Gemini) on benchmark PDFs and pair predictions with ground truth.

    Args:
        dataset: Indexable collection of rows. Each row provides a base64-encoded
            'pdf' and a list of ground-truth 'tables', each with 'normalized_bbox'
            and 'html' entries.
        use_llm: Forwarded to the marker config as "use_llm".
        table_rec_batch_size: Forwarded to the marker config as "table_rec_batch_size".
        max_rows: Upper bound on the number of rows processed; None processes all rows.
        use_gemini: When true, also request an HTML prediction from Gemini for the
            image crop of every aligned table.

    Returns:
        A ``(results, total_unaligned)`` tuple. ``results`` is a list of dicts with
        the keys "marker_table", "gt_table" and "gemini_table"; ``total_unaligned``
        counts ground-truth tables that could not be paired with a marker table.
    """
    models = create_model_dict()
    config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
    total_unaligned = 0
    results = []

    iterations = len(dataset)
    if max_rows is not None:
        iterations = min(max_rows, len(dataset))

    for i in tqdm(range(iterations), desc='Converting Tables'):
        try:
            row = dataset[i]
            pdf_binary = base64.b64decode(row['pdf'])
            gt_tables = row['tables']  # Already sorted by reading order, which is what marker returns

            # Only use the basic table processors
            converter = TableConverter(
                config=config_parser.generate_config_dict(),
                artifact_dict=models,
                processor_list=[
                    "marker.processors.table.TableProcessor",
                    "marker.processors.llm.llm_table.LLMTableProcessor",
                ],
                renderer=config_parser.get_renderer()
            )

            with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
                temp_pdf_file.write(pdf_binary)
                temp_pdf_file.seek(0)
                marker_json = converter(temp_pdf_file.name).children

                doc = pdfium.PdfDocument(temp_pdf_file.name)
                try:
                    page_image = doc[0].render(scale=96/72).to_pil()
                finally:
                    # Close the document even when rendering raises.
                    doc.close()

            if len(marker_json) == 0 or len(gt_tables) == 0:
                print('No tables detected, skipping...')
                total_unaligned += len(gt_tables)
                continue

            marker_tables = extract_tables(marker_json)
            marker_table_boxes = [table.bbox for table in marker_tables]
            page_bbox = marker_json[0].bbox

            if len(marker_tables) != len(gt_tables):
                print('Number of tables do not match, skipping...')
                total_unaligned += len(gt_tables)
                continue

            # Crop each marker table out of the rendered page (bbox rescaled to image pixels).
            table_images = [
                page_image.crop(
                    PolygonBox.from_bbox(bbox)
                    .rescale(
                        (page_bbox[2], page_bbox[3]), (page_image.width, page_image.height)
                    ).bbox
                )
                for bbox
                in marker_table_boxes
            ]

            # Normalize the bboxes (in place) so they are comparable with the gt boxes
            for bbox in marker_table_boxes:
                bbox[0] = bbox[0] / page_bbox[2]
                bbox[1] = bbox[1] / page_bbox[3]
                bbox[2] = bbox[2] / page_bbox[2]
                bbox[3] = bbox[3] / page_bbox[3]

            gt_boxes = [table['normalized_bbox'] for table in gt_tables]
            gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
            marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
            table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)

            aligned_tables = []
            used_tables = set()
            unaligned_tables = set()
            for table_idx, alignment in enumerate(table_alignments):
                try:
                    max_area = np.max(alignment)
                    aligned_idx = np.argmax(alignment)
                except ValueError:
                    # No alignment found
                    unaligned_tables.add(table_idx)
                    continue

                if max_area <= .01:
                    # No alignment found
                    unaligned_tables.add(table_idx)
                    continue

                if aligned_idx in used_tables:
                    # Marker table already aligned with another gt table
                    unaligned_tables.add(table_idx)
                    continue

                # Gt table doesn't align well with any marker table
                gt_table_pct = gt_areas[table_idx] / max_area
                if not .85 < gt_table_pct < 1.15:
                    unaligned_tables.add(table_idx)
                    continue

                # Marker table doesn't align with gt table
                marker_table_pct = marker_areas[aligned_idx] / max_area
                if not .85 < marker_table_pct < 1.15:
                    unaligned_tables.add(table_idx)
                    continue

                # Gemini is best-effort: a failure leaves an empty prediction.
                gemini_html = ""
                if use_gemini:
                    try:
                        gemini_html = gemini_table_rec(table_images[aligned_idx])
                    except Exception as e:
                        print(f'Gemini failed: {e}')

                aligned_tables.append(
                    (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
                )
                used_tables.add(aligned_idx)

            total_unaligned += len(unaligned_tables)

            for marker_table, gt_table, gemini_table in aligned_tables:
                gt_table_html = gt_table['html']

                # marker wraps the table in <tbody> which fintabnet data doesn't
                # Fintabnet doesn't use th tags, need to be replaced for fair comparison
                marker_table_html = fix_table_html(marker_table.html)
                gemini_table_html = fix_table_html(gemini_table)

                results.append({
                    "marker_table": marker_table_html,
                    "gt_table": gt_table_html,
                    "gemini_table": gemini_table_html
                })
        except pdfium.PdfiumError:
            print('Broken PDF, Skipping...')
            continue
    return results, total_unaligned
================================================
FILE: benchmarks/table/scoring.py
================================================
""""
TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
"""
import distance
from apted import APTED, Config
from apted.helpers import Tree
from lxml import html
from collections import deque
def wrap_table_html(table_html:str)->str:
    """Embed a table fragment in a minimal ``<html><body>`` document."""
    return '<html><body>{}</body></html>'.format(table_html)
class TableTree(Tree):
    """apted tree node that also records an HTML tag, its spans and its cell content."""

    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
        self.tag = tag
        self.colspan = colspan
        self.rowspan = rowspan
        self.content = content

        # Sets self.name and self.children
        super().__init__(tag, *children)

    def bracket(self):
        """Show tree using brackets notation"""
        if self.tag == 'td':
            label = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
                (self.tag, self.colspan, self.rowspan, self.content)
        else:
            label = '"tag": %s' % self.tag

        nested = "".join(child.bracket() for child in self.children)
        return "{" + label + nested + "}"
class CustomConfig(Config):
    """apted configuration whose rename cost compares tag, spans and cell content."""

    @staticmethod
    def maximum(*sequences):
        """Return the length of the longest sequence."""
        return max(len(seq) for seq in sequences)

    def normalized_distance(self, *sequences):
        """Levenshtein distance divided by the length of the longest sequence."""
        raw_distance = float(distance.levenshtein(*sequences))
        return raw_distance / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Cost of turning ``node1`` into ``node2``: 1 on a tag/span mismatch, else content distance."""
        same_shape = (
            node1.tag == node2.tag
            and node1.colspan == node2.colspan
            and node1.rowspan == node2.rowspan
        )
        if not same_shape:
            return 1.
        # Only cells with content on at least one side are compared by content.
        if node1.tag == 'td' and (node1.content or node2.content):
            return self.normalized_distance(node1.content, node2.content)
        return 0.
def tokenize(node):
    """
    Tokenizes table cells

    Appends to the module-level ``__tokens__`` list: an opening tag token, one
    token per character of the node text, the tokens of all children, a closing
    tag token (except for 'unk'), and the tail characters (except for 'td').
    """
    global __tokens__
    tag = node.tag

    __tokens__.append('<{}>'.format(tag))
    if node.text is not None:
        __tokens__.extend(node.text)

    for child in node.getchildren():
        tokenize(child)

    if tag != 'unk':
        __tokens__.append('</{}>'.format(tag))
    if tag != 'td' and node.tail is not None:
        __tokens__.extend(node.tail)
def tree_convert_html(node, convert_cell=False, parent=None):
    """
    Converts HTML tree to the format required by apted

    With ``convert_cell`` set, the content of each 'td' is tokenized and stored on
    the new node; otherwise cells carry an empty content list. The converted
    root is returned only for the top-level call (``parent`` is None).
    """
    global __tokens__
    is_cell = node.tag == 'td'

    if is_cell:
        if convert_cell:
            __tokens__ = []
            tokenize(node)
            # Drop the enclosing <td> ... </td> tokens.
            cell = __tokens__[1:-1]
        else:
            cell = []
        colspan = int(node.attrib.get('colspan', '1'))
        rowspan = int(node.attrib.get('rowspan', '1'))
        new_node = TableTree(node.tag, colspan, rowspan, cell)
    else:
        new_node = TableTree(node.tag, None, None, None)

    if parent is not None:
        parent.children.append(new_node)

    # Cells are leaves of the converted tree; only other tags are descended into.
    if not is_cell:
        for child in node.getchildren():
            tree_convert_html(child, convert_cell, new_node)

    if parent is None:
        return new_node
def similarity_eval_html(pred, true, structure_only=False):
    """
    Computes TEDS score between the prediction and the ground truth of a given samples

    Args:
        pred: Predicted HTML document string containing ``body/table``.
        true: Ground-truth HTML document string containing ``body/table``.
        structure_only: When true, cell content is ignored and only structure is compared.

    Returns:
        ``1 - edit_distance / max_node_count`` as a float, or 0.0 when either
        document has no ``body/table`` element.
    """
    pred_doc, true_doc = html.fromstring(pred), html.fromstring(true)
    pred_tables = pred_doc.xpath('body/table')
    true_tables = true_doc.xpath('body/table')
    if not (pred_tables and true_tables):
        return 0.0

    pred_table, true_table = pred_tables[0], true_tables[0]
    n_nodes = max(len(pred_table.xpath(".//*")), len(true_table.xpath(".//*")))
    if n_nodes == 0:
        # Both tables are empty, hence identical; avoid dividing by zero below.
        return 1.0

    tree_pred = tree_convert_html(pred_table, convert_cell=not structure_only)
    tree_true = tree_convert_html(true_table, convert_cell=not structure_only)
    # Named edit_distance so the imported `distance` module is not shadowed.
    edit_distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
    return 1.0 - (float(edit_distance) / n_nodes)
================================================
FILE: benchmarks/table/table.py
================================================
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS
from pathlib import Path
from itertools import repeat
from typing import List
import time
import datasets
from tqdm import tqdm
import click
from tabulate import tabulate
import json
from concurrent.futures import ProcessPoolExecutor
from marker.settings import settings
from benchmarks.table.inference import inference_tables
from scoring import wrap_table_html, similarity_eval_html
def update_teds_score(result, prefix: str = "marker"):
    """Score ``result['<prefix>_table']`` against ``result['gt_table']``.

    The score is stored under ``'<prefix>_score'`` and the same dict is returned.
    """
    predicted_html = result[f'{prefix}_table']
    truth_html = result['gt_table']
    teds = similarity_eval_html(
        wrap_table_html(predicted_html),
        wrap_table_html(truth_html),
    )
    result[f'{prefix}_score'] = teds
    return result
def _mean_score(rows, key):
    """Return the mean of ``row[key]`` over ``rows``, or 0.0 when ``rows`` is empty."""
    if not rows:
        return 0.0
    return sum(row[key] for row in rows) / len(rows)


@click.command(help="Benchmark Table to HTML Conversion")
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.")
@click.option("--dataset", type=str, default="datalab-to/fintabnet_bench_marker", help="Dataset to use")
@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
@click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
@click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
@click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
@click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.")
def main(
        result_path: str,
        dataset: str,
        max_rows: int,
        max_workers: int,
        use_llm: bool,
        table_rec_batch_size: int | None,
        use_gemini: bool = False
):
    """Run table inference on the dataset, score it with TEDS, print and save the results.

    Results are written to ``<result_path>/table.json`` with the keys "marker"
    and "gemini" (the latter is None unless ``use_gemini`` is set).
    """
    start = time.time()

    dataset = datasets.load_dataset(dataset, split='train')
    dataset = dataset.shuffle(seed=0)

    results, total_unaligned = inference_tables(dataset, use_llm, table_rec_batch_size, max_rows, use_gemini)

    print(f"Total time: {time.time() - start}.")
    print(f"Could not align {total_unaligned} tables from fintabnet.")

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        marker_results = list(
            tqdm(
                executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results)
            )
        )

    # An empty result set (nothing aligned) reports 0.0 instead of dividing by zero.
    avg_score = _mean_score(marker_results, "marker_score")
    headers = ["Avg score", "Total tables"]
    data = [f"{avg_score:.3f}", len(marker_results)]
    gemini_results = None
    if use_gemini:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            gemini_results = list(
                tqdm(
                    executor.map(update_teds_score, results, repeat("gemini")), desc='Computing Gemini scores',
                    total=len(results)
                )
            )
        avg_gemini_score = _mean_score(gemini_results, "gemini_score")
        headers.append("Avg Gemini score")
        data.append(f"{avg_gemini_score:.3f}")

    table = tabulate([data], headers=headers, tablefmt="github")
    print(table)
    print("Avg score computed by comparing marker predicted HTML with original HTML")

    results = {
        "marker": marker_results,
        "gemini": gemini_results
    }

    out_path = Path(result_path)
    out_path.mkdir(parents=True, exist_ok=True)
    with open(out_path / "table.json", "w") as f:
        json.dump(results, f, indent=2)

    print(f"Results saved to {out_path}.")


if __name__ == '__main__':
    main()
================================================
FILE: benchmarks/throughput/__init__.py
================================================
================================================
FILE: benchmarks/throughput/main.py
================================================
import os
import tempfile
import time
from multiprocessing import get_context
from concurrent.futures import ProcessPoolExecutor
import torch
import click
import pypdfium2 as pdfium
from tqdm import tqdm
import datasets
def get_next_pdf(ds: datasets.Dataset, i: int):
    """Return the next usable PDF at or after index ``i``, wrapping around the dataset.

    Args:
        ds: Indexable, sized collection of rows with "pdf" and "filename" entries.
        i: Index at which to start looking; values past the end wrap around.

    Returns:
        ``(pdf, filename, next_index)``, where ``next_index`` is a valid index
        to pass to the following call.

    Raises:
        ValueError: If the dataset is empty or no row holds a non-empty PDF whose
            filename ends in ".pdf".
    """
    total = len(ds)
    if total == 0:
        raise ValueError("Dataset is empty; no PDF available")

    i %= total
    # One full pass is enough: if nothing matched, looping further would never end.
    for _ in range(total):
        row = ds[i]  # fetch the row once rather than once per field
        pdf, filename = row["pdf"], row["filename"]
        i = (i + 1) % total
        if pdf and filename.endswith(".pdf"):
            return pdf, filename, i

    raise ValueError("Dataset contains no usable PDF entries")
def single_batch(
    batch_size: int,
    num_threads: int,
    force_ocr: bool,
    quantize: bool,
    compile: bool,
    worker_id: int,
    chunksize: int = 100,
):
    """Convert ``batch_size`` PDFs in this worker and report timing and size statistics.

    Returns:
        ``(total_conversion_seconds, loop_start_time, end_time, peak_reserved_vram_gb,
        pages, chars)``.
    """
    # Environment is set before marker is imported below — presumably so that
    # marker's settings pick these values up at import time (TODO confirm).
    if quantize:
        os.environ["RECOGNITION_MODEL_QUANTIZE"] = "true"
    if compile:
        os.environ["COMPILE_ALL"] = "true"

    thread_env_vars = (
        "DETECTOR_POSTPROCESSING_CPU_WORKERS",
        "OPENBLAS_NUM_THREADS",
        "PDFTEXT_CPU_WORKERS",
        "OMP_NUM_THREADS",
    )
    os.environ.update({name: f"{num_threads}" for name in thread_env_vars})

    torch.set_num_threads(num_threads)

    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import text_from_rendered

    ds = datasets.load_dataset("datalab-to/pdfs", split="train")
    model_dict = create_model_dict()
    torch.cuda.reset_peak_memory_stats()

    conversion_times = []
    next_index = 0
    pages = 0
    chars = 0

    min_time = time.time()
    for _ in range(batch_size):
        pdf, fname, next_index = get_next_pdf(ds, next_index)
        print(f"Inferencing {fname} on worker {worker_id}...")

        pdf_doc = pdfium.PdfDocument(pdf)
        page_count = len(pdf_doc)
        pdf_doc.close()
        pages += page_count

        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(pdf)
            f.flush()

            # Convert the document in windows of at most `chunksize` pages.
            for chunk_start in range(0, page_count, chunksize):
                chunk_end = min(chunk_start + chunksize, page_count)
                converter_config = {
                    "disable_tqdm": worker_id > 0,
                    "page_range": list(range(chunk_start, chunk_end)),
                    "force_ocr": force_ocr,
                }
                block_converter = PdfConverter(
                    artifact_dict=model_dict,
                    config=converter_config,
                )

                # Only conversion and text extraction are timed, not converter setup.
                start = time.time()
                rendered = block_converter(f.name)
                markdown, _, _ = text_from_rendered(rendered)
                chars += len(markdown)
                conversion_times.append(time.time() - start)

    max_gpu_vram = torch.cuda.max_memory_reserved() / 1024**3
    max_time = time.time()
    return sum(conversion_times), min_time, max_time, max_gpu_vram, pages, chars
@click.command(help="Benchmark PDF to MD conversion throughput.")
@click.option("--workers", default=1, help="Number of workers to use.")
@click.option("--batch_size", default=1, help="Batch size for inference.")
@click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
@click.option("--quantize", is_flag=True, help="Use quantized model.")
@click.option("--compile", is_flag=True, help="Use compiled model.")
def main(
    workers: int,
    batch_size: int,
    force_ocr: bool,
    quantize: bool,
    compile: bool,
):
    """Spawn ``workers`` processes running ``single_batch`` and print aggregate statistics."""
    # os.cpu_count() may return None when the count cannot be determined.
    total_cpus = os.cpu_count() or 1
    start = time.time()
    current_gpu_vram = torch.cuda.memory_reserved() / 1024**3

    # Each worker gets between 2 and 8 threads.
    cpus_per_worker = min(8, max(2, total_cpus // workers))

    with ProcessPoolExecutor(
        max_workers=workers, mp_context=get_context("spawn")
    ) as executor:
        futures = [
            executor.submit(
                single_batch,
                batch_size,
                cpus_per_worker,
                force_ocr,
                quantize,
                compile,
                i,
            )
            for i in range(workers)
        ]

        all_times = []
        min_time = None
        max_time = time.time()
        vrams = []
        page_count = 0
        char_count = 0
        for future in tqdm(futures, desc="Running marker workers..."):
            times, min_time_worker, max_time_worker, max_vram, pages, chars = (
                future.result()
            )
            # Report VRAM relative to what was reserved before the workers started.
            vrams.append(max_vram - current_gpu_vram)
            all_times.append(times)
            page_count += pages
            char_count += chars
            min_time = (
                min(min_time_worker, min_time)
                if min_time is not None
                else min_time_worker
            )
            max_time = max(max_time, max_time_worker)

    end = time.time() - start
    # Span from the earliest worker loop start to the latest worker finish.
    all_worker_time = max_time - min_time

    print(f"Average time per worker: {sum(all_times) / len(all_times)}")
    print(f"Max time per worker: {max(all_times)}")
    print(f"End to end time (counting model loading), all processes: {end}")
    print(f"End to end time (no model loading), all processes: {all_worker_time}")
    print(f"Total pages: {page_count}")
    print(f"Total characters: {char_count}")
    print(f"Time per page: {all_worker_time / page_count:.2f}")
    print(f"Characters per second: {char_count / all_worker_time:.2f}")
    print(f"Max GPU VRAM: {max(vrams):.2f} GB")
    print(f"Average GPU VRAM: {sum(vrams) / len(vrams):.2f} GB")


if __name__ == "__main__":
    main()
================================================
FILE: benchmarks/verify_scores.py
================================================
import json
import argparse
def verify_scores(file_path):
    """Raise ValueError when the mean marker heuristic score in the file is below 90.

    The JSON file must hold a "scores" mapping whose values each contain
    ``["marker"]["heuristic"]["score"]``.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)

    marker_scores = [
        entry["marker"]["heuristic"]["score"]
        for entry in data["scores"].values()
    ]
    marker_score = sum(marker_scores) / len(marker_scores)
    if marker_score < 90:
        raise ValueError("Marker score below 90")
def verify_table_scores(file_path):
    """Raise ValueError when the mean ``marker_score`` in the file is below 0.7.

    The JSON file must hold a "marker" list whose items each carry a
    "marker_score" entry.

    Raises:
        ValueError: If there are no marker results or their average is below 0.7.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)

    marker_results = data["marker"]
    if not marker_results:
        raise ValueError("No marker table scores found")

    # Average over the marker results themselves, not over the top-level keys of the file.
    avg = sum(r["marker_score"] for r in marker_results) / len(marker_results)
    if avg < 0.7:
        raise ValueError("Average score is below the required threshold of 0.7")
if __name__ == "__main__":
    # Map each supported --type value to the function that verifies that kind of file.
    verifiers = {
        "marker": verify_scores,
        "table": verify_table_scores,
    }

    parser = argparse.ArgumentParser(description="Verify benchmark scores")
    parser.add_argument("file_path", type=str, help="Path to the json file")
    # Restricting choices makes an unknown type fail loudly instead of verifying nothing.
    parser.add_argument(
        "--type",
        type=str,
        help="Type of file to verify",
        default="marker",
        choices=list(verifiers),
    )
    args = parser.parse_args()

    verifiers[args.type](args.file_path)
================================================
FILE: chunk_convert.py
================================================
from marker.scripts.chunk_convert import chunk_convert_cli
# Script entry point: delegate to the packaged CLI when run directly.
if __name__ == "__main__":
    chunk_convert_cli()
================================================
FILE: convert.py
================================================
from marker.scripts.convert import convert_cli
# Script entry point: delegate to the packaged CLI when run directly.
if __name__ == "__main__":
    convert_cli()
================================================
FILE: convert_single.py
================================================
from marker.scripts.convert_single import convert_single_cli
# Script entry point: delegate to the packaged CLI when run directly.
if __name__ == "__main__":
    convert_single_cli()
================================================
FILE: data/.gitignore
================================================
latex
pdfs
references
================================================
FILE: data/examples/json/multicolcnn.json
================================================
{
"children": [
{
"id": "/page/0/Page/277",
"block_type": "Page",
"html": "<content-ref src='/page/0/PageHeader/14'></content-ref><content-ref src='/page/0/SectionHeader/0'></content-ref><content-ref src='/page/0/Text/1'></content-ref><content-ref src='/page/0/SectionHeader/2'></content-ref><content-ref src='/page/0/Text/3'></content-ref><content-ref src='/page/0/SectionHeader/4'></content-ref><content-ref src='/page/0/Text/5'></content-ref><content-ref src='/page/0/Text/6'></content-ref><content-ref src='/page/0/Text/7'></content-ref><content-ref src='/page/0/Text/8'></content-ref><content-ref src='/page/0/Text/9'></content-ref><content-ref src='/page/0/Text/10'></content-ref><content-ref src='/page/0/Text/11'></content-ref><content-ref src='/page/0/SectionHeader/12'></content-ref><content-ref src='/page/0/Text/13'></content-ref><content-ref src='/page/0/PageFooter/15'></content-ref>",
"polygon": [
[
0.0,
0.0
],
[
612.0,
0.0
],
[
612.0,
792.0
],
[
0.0,
792.0
]
],
"bbox": [
0.0,
0.0,
612.0,
792.0
],
"children": [
{
"id": "/page/0/PageHeader/14",
"block_type": "PageHeader",
"html": "",
"polygon": [
[
18.119998931884766,
211.199951171875
],
[
36.2599983215332,
211.199951171875
],
[
36.2599983215332,
559.2799987792969
],
[
18.119998931884766,
559.2799987792969
]
],
"bbox": [
18.119998931884766,
211.199951171875,
36.2599983215332,
559.2799987792969
],
"children": null,
"section_hierarchy": {},
"images": {}
},
{
"id": "/page/0/SectionHeader/0",
"block_type": "SectionHeader",
"html": "<h1>An Aggregated Multicolumn Dilated Convolution Network for Perspective-Free Counting</h1>",
"polygon": [
[
117.5888671875,
105.9219970703125
],
[
477.371826171875,
105.9219970703125
],
[
477.371826171875,
138.201171875
],
[
117.5888671875,
138.201171875
]
],
"bbox": [
117.5888671875,
105.9219970703125,
477.371826171875,
138.201171875
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/0"
},
"images": {}
},
{
"id": "/page/0/Text/1",
"block_type": "Text",
"html": "<p block-type=\"Text\">Diptodip Deb Georgia Institute of Technology diptodipdeb@gatech.edu</p>",
"polygon": [
[
104.81396484375,
163.4853515625
],
[
259.00787353515625,
163.4853515625
],
[
259.00787353515625,
202.3262939453125
],
[
104.81396484375,
202.3262939453125
]
],
"bbox": [
104.81396484375,
163.4853515625,
259.00787353515625,
202.3262939453125
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/0"
},
"images": {}
},
{
"id": "/page/0/SectionHeader/2",
"block_type": "SectionHeader",
"html": "<h1>Abstract</h1>",
"polygon": [
[
144.1845703125,
232.4891357421875
],
[
190.48028564453125,
232.4891357421875
],
[
190.48028564453125,
244.4443359375
],
[
144.1845703125,
244.4443359375
]
],
"bbox": [
144.1845703125,
232.4891357421875,
190.48028564453125,
244.4443359375
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/2"
},
"images": {}
},
{
"id": "/page/0/Text/3",
"block_type": "Text",
"html": "<p block-type=\"Text\"><i>We propose the use of dilated filters to construct an ag</i><i>gregation module in a multicolumn convolutional neural</i> <i>network for perspective-free counting. Counting is a com</i><i>mon problem in computer vision (e.g. traffic on the street or</i> <i>pedestrians in a crowd). Modern approaches to the count</i><i>ing problem involve the production of a density map via re</i><i>gression whose integral is equal to the number of objects</i> <i>in the image. However, objects in the image can occur at</i> <i>different scales (e.g. due to perspective effects) which can</i> <i>make it difficult for a learning agent to learn the proper</i> <i>density map. While the use of multiple columns to extract</i> <i>multiscale information from images has been shown be</i><i>fore, our approach aggregates the multiscale information</i> <i>gathered by the multicolumn convolutional neural network</i> <i>to improve performance. Our experiments show that our</i> <i>proposed network outperforms the state-of-the-art on many</i> <i>benchmark datasets, and also that using our aggregation</i> <i>module in combination with a higher number of columns is</i> <i>beneficial for multiscale counting.</i></p>",
"polygon": [
[
49.904296875,
258.9959716796875
],
[
287.47265625,
258.9959716796875
],
[
287.47265625,
485.33203125
],
[
49.904296875,
485.33203125
]
],
"bbox": [
49.904296875,
258.9959716796875,
287.47265625,
485.33203125
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/2"
},
"images": {}
},
{
"id": "/page/0/SectionHeader/4",
"block_type": "SectionHeader",
"html": "<h1>1. Introduction</h1>",
"polygon": [
[
50.016357421875,
512.06591796875
],
[
128.49609375,
512.06591796875
],
[
128.49609375,
524.0211181640625
],
[
50.016357421875,
524.0211181640625
]
],
"bbox": [
50.016357421875,
512.06591796875,
128.49609375,
524.0211181640625
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/4"
},
"images": {}
},
{
"id": "/page/0/Text/5",
"block_type": "Text",
"html": "<p block-type=\"Text\">Learning to count the number of objects in an image is a deceptively difficult problem with many interesting applications, such as surveillance <a href=\"#page-8-0\">[20]</a>, traffic monitoring <a href=\"#page-8-1\">[14]</a> and medical image analysis <a href=\"#page-8-2\">[22]</a>. In many of these application areas, the objects to be counted vary widely in appearance, size and shape, and labeled training data is typically sparse. These factors pose a significant computer vision and machine learning challenge.</p>",
"polygon": [
[
49.0078125,
533.7682189941406
],
[
286.576171875,
533.7682189941406
],
[
286.576171875,
627.4168395996094
],
[
49.0078125,
627.4168395996094
]
],
"bbox": [
49.0078125,
533.7682189941406,
286.576171875,
627.4168395996094
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/4"
},
"images": {}
},
{
"id": "/page/0/Text/6",
"block_type": "Text",
"html": "<p block-type=\"Text\">Lempitsky et al. <a href=\"#page-8-3\">[15]</a> showed that it is possible to learn to count without learning to explicitly detect and localize individual objects. Instead, they propose learning to predict a density map whose integral over the image equals the number of objects in the image. This approach has been adopted by many later works (Cf. <a href=\"#page-8-4\">[18,</a> <a href=\"#page-9-0\">28]</a>).</p>",
"polygon": [
[
49.7548828125,
630.5612335205078
],
[
287.0244140625,
630.5612335205078
],
[
287.0244140625,
700.734375
],
[
49.7548828125,
700.734375
]
],
"bbox": [
49.7548828125,
630.5612335205078,
287.0244140625,
700.734375
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/4"
},
"images": {}
},
{
"id": "/page/0/Text/7",
"block_type": "Text",
"html": "<p block-type=\"Text\">However, in many counting problems, such as those</p>",
"polygon": [
[
59.84033203125,
703.4442443847656
],
[
287.173828125,
703.4442443847656
],
[
287.173828125,
713.49609375
],
[
59.84033203125,
713.49609375
]
],
"bbox": [
59.84033203125,
703.4442443847656,
287.173828125,
713.49609375
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/4"
},
"images": {}
},
{
"id": "/page/0/Text/8",
"block_type": "Text",
"html": "<p block-type=\"Text\">Jonathan Ventura University of Colorado Colorado Springs jventura@uccs.edu</p>",
"polygon": [
[
291.4570007324219,
163.6572265625
],
[
488.89715576171875,
163.6572265625
],
[
488.89715576171875,
202.3262939453125
],
[
291.4570007324219,
202.3262939453125
]
],
"bbox": [
291.4570007324219,
163.6572265625,
488.89715576171875,
202.3262939453125
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/4"
},
"images": {}
},
{
"id": "/page/0/Text/9",
"block_type": "Text",
"html": "<blockquote><p block-type=\"Text\">counting cells in a microscope image, pedestrians in a crowd, or vehicles in a traffic jam, regressors trained on a single image scale are not reliable <a href=\"#page-8-4\">[18]</a>. This is due to a variety of challenges including overlap of objects and perspective effects which cause significant variance in object shape, size and appearance.</p></blockquote>",
"polygon": [
[
308.390625,
234.1092529296875
],
[
545.1151123046875,
234.1092529296875
],
[
545.1151123046875,
303.8478698730469
],
[
308.390625,
303.8478698730469
]
],
"bbox": [
308.390625,
234.1092529296875,
545.1151123046875,
303.8478698730469
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/4"
},
"images": {}
},
{
"id": "/page/0/Text/10",
"block_type": "Text",
"html": "<blockquote><p block-type=\"Text\">The most successful recent approaches address this issue by explicitly incorporating multi-scale information in the network <a href=\"#page-8-4\">[18,</a><a href=\"#page-9-0\">28]</a>. These approaches either combine multiple networks which take input patches of different sizes <a href=\"#page-8-4\">[18]</a> or combine multiple filtering paths (\"columns\") which have different size filters <a href=\"#page-9-0\">[28]</a>.</p></blockquote>",
"polygon": [
[
308.390625,
306.861328125
],
[
545.1151733398438,
306.861328125
],
[
545.1151733398438,
376.9228210449219
],
[
308.390625,
376.9228210449219
]
],
"bbox": [
308.390625,
306.861328125,
545.1151733398438,
376.9228210449219
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/4"
},
"images": {}
},
{
"id": "/page/0/Text/11",
"block_type": "Text",
"html": "<blockquote><p block-type=\"Text\">Following on the intuition that multiscale integration is key to achieving good counting performance, we propose to incorporate dilated filters <a href=\"#page-8-5\">[25]</a> into a multicolumn convolutional neural network design <a href=\"#page-9-0\">[28]</a>. Dilated filters exponentially increase the network's receptive field without an exponential increase in parameters, allowing for efficient use of multiscale information. Convolutional neural networks with dilated filters have proven to provide competitive performance in image segmentation where multiscale analysis is also critical <a href=\"#page-8-5\">[25,</a> <a href=\"#page-8-6\">26]</a>. By incorporating dilated filters into the multicolumn network design, we greatly increase the ability of the network to selectively aggregate multiscale information, without the need for explicit perspective maps during training and testing. We propose the \"aggregated multicolumn dilated convolution network\" or AMDCN which uses dilations to aggregate multiscale information. Our extensive experimental evaluation shows that this proposed network outperforms previous methods on many benchmark datasets.</p></blockquote>",
"polygon": [
[
308.390625,
380.14453125
],
[
545.1151123046875,
380.14453125
],
[
545.1151123046875,
605.4156646728516
],
[
308.390625,
605.4156646728516
]
],
"bbox": [
308.390625,
380.14453125,
545.1151123046875,
605.4156646728516
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/4"
},
"images": {}
},
{
"id": "/page/0/SectionHeader/12",
"block_type": "SectionHeader",
"html": "<h1>2. Related Work</h1>",
"polygon": [
[
307.1953125,
621.7747497558594
],
[
392.0625,
621.7747497558594
],
[
392.0625,
633.7299499511719
],
[
307.1953125,
633.7299499511719
]
],
"bbox": [
307.1953125,
621.7747497558594,
392.0625,
633.7299499511719
],
"children": null,
"section_hierarchy": {
"1": "/page/0/SectionHeader/12"
},
"images": {}
},
{
"id": "/page/0/Text/13",
"block_type": "Text",
"html": "<p block-type=\"Text\">Counting using a supervised regressor to formulate a density map was first shown by <a href=\"#page-8-3\">[15]</a>. In this paper, Lempitsky et al. show that the minimal annotation of a single dot blurred by a Gaussian kernel produces a sufficient density map to train a network to count. All of the counting methods that we examine as well as the method we use in</p>",
"polygon": [
[
308.86199951171875,
643.5
],
[
545.361328125,
643.5
],
[
545
gitextract_pirlcywv/
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── breaking-bug-report.md
│ │ ├── feature_request.md
│ │ └── output-bug-report.md
│ └── workflows/
│ ├── benchmarks.yml
│ ├── ci.yml
│ ├── cla.yml
│ ├── publish.yml
│ └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CLA.md
├── LICENSE
├── MODEL_LICENSE
├── README.md
├── benchmarks/
│ ├── __init__.py
│ ├── overall/
│ │ ├── __init__.py
│ │ ├── display/
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ └── table.py
│ │ ├── download/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── llamaparse.py
│ │ │ ├── main.py
│ │ │ ├── mathpix.py
│ │ │ └── mistral.py
│ │ ├── elo.py
│ │ ├── methods/
│ │ │ ├── __init__.py
│ │ │ ├── docling.py
│ │ │ ├── gt.py
│ │ │ ├── llamaparse.py
│ │ │ ├── marker.py
│ │ │ ├── mathpix.py
│ │ │ ├── mistral.py
│ │ │ ├── olmocr.py
│ │ │ └── schema.py
│ │ ├── overall.py
│ │ ├── registry.py
│ │ ├── schema.py
│ │ └── scorers/
│ │ ├── __init__.py
│ │ ├── clean.py
│ │ ├── heuristic.py
│ │ ├── llm.py
│ │ └── schema.py
│ ├── table/
│ │ ├── __init__.py
│ │ ├── gemini.py
│ │ ├── inference.py
│ │ ├── scoring.py
│ │ └── table.py
│ ├── throughput/
│ │ ├── __init__.py
│ │ └── main.py
│ └── verify_scores.py
├── chunk_convert.py
├── convert.py
├── convert_single.py
├── data/
│ ├── .gitignore
│ ├── examples/
│ │ ├── json/
│ │ │ ├── multicolcnn.json
│ │ │ ├── switch_trans.json
│ │ │ └── thinkpython.json
│ │ └── markdown/
│ │ ├── multicolcnn/
│ │ │ ├── multicolcnn.md
│ │ │ └── multicolcnn_meta.json
│ │ ├── switch_transformers/
│ │ │ ├── switch_trans.md
│ │ │ └── switch_trans_meta.json
│ │ └── thinkpython/
│ │ ├── thinkpython.md
│ │ └── thinkpython_meta.json
│ └── latex_to_md.sh
├── examples/
│ ├── README.md
│ └── marker_modal_deployment.py
├── extraction_app.py
├── marker/
│ ├── builders/
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── layout.py
│ │ ├── line.py
│ │ ├── ocr.py
│ │ └── structure.py
│ ├── config/
│ │ ├── __init__.py
│ │ ├── crawler.py
│ │ ├── parser.py
│ │ └── printer.py
│ ├── converters/
│ │ ├── __init__.py
│ │ ├── extraction.py
│ │ ├── ocr.py
│ │ ├── pdf.py
│ │ └── table.py
│ ├── extractors/
│ │ ├── __init__.py
│ │ ├── document.py
│ │ └── page.py
│ ├── logger.py
│ ├── models.py
│ ├── output.py
│ ├── processors/
│ │ ├── __init__.py
│ │ ├── blank_page.py
│ │ ├── block_relabel.py
│ │ ├── blockquote.py
│ │ ├── code.py
│ │ ├── debug.py
│ │ ├── document_toc.py
│ │ ├── equation.py
│ │ ├── footnote.py
│ │ ├── ignoretext.py
│ │ ├── line_merge.py
│ │ ├── line_numbers.py
│ │ ├── list.py
│ │ ├── llm/
│ │ │ ├── __init__.py
│ │ │ ├── llm_complex.py
│ │ │ ├── llm_equation.py
│ │ │ ├── llm_form.py
│ │ │ ├── llm_handwriting.py
│ │ │ ├── llm_image_description.py
│ │ │ ├── llm_mathblock.py
│ │ │ ├── llm_meta.py
│ │ │ ├── llm_page_correction.py
│ │ │ ├── llm_sectionheader.py
│ │ │ ├── llm_table.py
│ │ │ └── llm_table_merge.py
│ │ ├── order.py
│ │ ├── page_header.py
│ │ ├── reference.py
│ │ ├── sectionheader.py
│ │ ├── table.py
│ │ ├── text.py
│ │ └── util.py
│ ├── providers/
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── epub.py
│ │ ├── html.py
│ │ ├── image.py
│ │ ├── pdf.py
│ │ ├── powerpoint.py
│ │ ├── registry.py
│ │ ├── spreadsheet.py
│ │ └── utils.py
│ ├── renderers/
│ │ ├── __init__.py
│ │ ├── chunk.py
│ │ ├── extraction.py
│ │ ├── html.py
│ │ ├── json.py
│ │ ├── markdown.py
│ │ └── ocr_json.py
│ ├── schema/
│ │ ├── __init__.py
│ │ ├── blocks/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── basetable.py
│ │ │ ├── caption.py
│ │ │ ├── code.py
│ │ │ ├── complexregion.py
│ │ │ ├── equation.py
│ │ │ ├── figure.py
│ │ │ ├── footnote.py
│ │ │ ├── form.py
│ │ │ ├── handwriting.py
│ │ │ ├── inlinemath.py
│ │ │ ├── listitem.py
│ │ │ ├── pagefooter.py
│ │ │ ├── pageheader.py
│ │ │ ├── picture.py
│ │ │ ├── reference.py
│ │ │ ├── sectionheader.py
│ │ │ ├── table.py
│ │ │ ├── tablecell.py
│ │ │ ├── text.py
│ │ │ └── toc.py
│ │ ├── document.py
│ │ ├── groups/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── figure.py
│ │ │ ├── list.py
│ │ │ ├── page.py
│ │ │ ├── picture.py
│ │ │ └── table.py
│ │ ├── polygon.py
│ │ ├── registry.py
│ │ └── text/
│ │ ├── __init__.py
│ │ ├── char.py
│ │ ├── line.py
│ │ └── span.py
│ ├── scripts/
│ │ ├── __init__.py
│ │ ├── chunk_convert.py
│ │ ├── chunk_convert.sh
│ │ ├── common.py
│ │ ├── convert.py
│ │ ├── convert_single.py
│ │ ├── extraction_app.py
│ │ ├── file_to_s3.py
│ │ ├── run_streamlit_app.py
│ │ ├── server.py
│ │ └── streamlit_app.py
│ ├── services/
│ │ ├── __init__.py
│ │ ├── azure_openai.py
│ │ ├── claude.py
│ │ ├── gemini.py
│ │ ├── ollama.py
│ │ ├── openai.py
│ │ └── vertex.py
│ ├── settings.py
│ ├── util.py
│ └── utils/
│ ├── __init__.py
│ ├── batch.py
│ ├── gpu.py
│ └── image.py
├── marker_app.py
├── marker_server.py
├── pyproject.toml
├── pytest.ini
├── signatures/
│ └── version1/
│ └── cla.json
├── static/
│ └── fonts/
│ └── .gitignore
└── tests/
├── builders/
│ ├── test_blank_page.py
│ ├── test_document_builder.py
│ ├── test_garbled_pdf.py
│ ├── test_layout_replace.py
│ ├── test_ocr_builder.py
│ ├── test_ocr_pipeline.py
│ ├── test_overriding.py
│ ├── test_pdf_links.py
│ ├── test_rotated_bboxes.py
│ ├── test_strip_existing_ocr.py
│ └── test_structure.py
├── config/
│ └── test_config.py
├── conftest.py
├── converters/
│ ├── test_extraction_converter.py
│ ├── test_ocr_converter.py
│ ├── test_pdf_converter.py
│ └── test_table_converter.py
├── processors/
│ ├── test_document_toc_processor.py
│ ├── test_equation_processor.py
│ ├── test_footnote_processor.py
│ ├── test_ignoretext.py
│ ├── test_llm_processors.py
│ ├── test_table_merge.py
│ └── test_table_processor.py
├── providers/
│ ├── test_document_providers.py
│ ├── test_image_provider.py
│ └── test_pdf_provider.py
├── renderers/
│ ├── test_chunk_renderer.py
│ ├── test_extract_images.py
│ ├── test_html_renderer.py
│ ├── test_json_renderer.py
│ └── test_markdown_renderer.py
├── schema/
│ └── groups/
│ └── test_list_grouping.py
├── services/
│ └── test_service_init.py
└── utils.py
SYMBOL INDEX (866 symbols across 190 files)
FILE: benchmarks/overall/display/dataset.py
function build_dataset (line 11) | def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, s...
FILE: benchmarks/overall/display/table.py
function write_table (line 8) | def write_table(title: str, rows: list, headers: list, out_path: Path, f...
function print_scores (line 17) | def print_scores(result: FullResult, out_path: Path, methods: List[str],...
FILE: benchmarks/overall/download/base.py
class Downloader (line 9) | class Downloader:
method __init__ (line 13) | def __init__(self, api_key, app_id, max_rows: int = 2200):
method get_html (line 20) | def get_html(self, pdf_bytes):
method upload_ds (line 23) | def upload_ds(self):
method generate_data (line 37) | def generate_data(self):
method __call__ (line 61) | def __call__(self):
FILE: benchmarks/overall/download/llamaparse.py
class LlamaParseDownloader (line 9) | class LlamaParseDownloader(Downloader):
method get_html (line 12) | def get_html(self, pdf_bytes):
function upload_and_parse_file (line 27) | def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: i...
FILE: benchmarks/overall/download/main.py
function main (line 13) | def main(service: str, max_rows: int, api_key: str, app_id: str):
FILE: benchmarks/overall/download/mathpix.py
class MathpixDownloader (line 9) | class MathpixDownloader(Downloader):
method get_html (line 12) | def get_html(self, pdf_bytes):
function mathpix_request (line 33) | def mathpix_request(buffer, headers):
function mathpix_status (line 54) | def mathpix_status(pdf_id, headers):
function mathpix_results (line 76) | def mathpix_results(pdf_id, headers, ext="md"):
FILE: benchmarks/overall/download/mistral.py
class MistralDownloader (line 8) | class MistralDownloader(Downloader):
method get_html (line 11) | def get_html(self, pdf_bytes):
function upload_and_process_file (line 26) | def upload_and_process_file(api_key: str, fname: str, buff):
FILE: benchmarks/overall/elo.py
class ComparerSchema (line 93) | class ComparerSchema(BaseModel):
class Comparer (line 101) | class Comparer:
method __init__ (line 102) | def __init__(self):
method __call__ (line 105) | def __call__(
method llm_rater (line 125) | def llm_rater(self, img: Image.Image, prompt: str):
method llm_response_wrapper (line 133) | def llm_response_wrapper(
function display_win_rates_table (line 164) | def display_win_rates_table(win_rates: dict):
function main (line 180) | def main(
FILE: benchmarks/overall/methods/__init__.py
class BaseMethod (line 14) | class BaseMethod:
method __init__ (line 15) | def __init__(self, **kwargs):
method convert_to_md (line 21) | def convert_to_md(html: str):
method __call__ (line 26) | def __call__(self, sample) -> BenchmarkResult:
method render (line 29) | def render(self, markdown: str):
method convert_to_html (line 33) | def convert_to_html(md: str):
method html_to_image (line 63) | def html_to_image(self, html: str) -> Image.Image:
FILE: benchmarks/overall/methods/docling.py
class DoclingMethod (line 7) | class DoclingMethod(BaseMethod):
method __call__ (line 11) | def __call__(self, sample) -> BenchmarkResult:
FILE: benchmarks/overall/methods/gt.py
class GTMethod (line 9) | class GTMethod(BaseMethod):
method __call__ (line 10) | def __call__(self, sample) -> BenchmarkResult:
method render (line 19) | def render(self, html: List[str]) -> Image.Image:
FILE: benchmarks/overall/methods/llamaparse.py
class LlamaParseMethod (line 6) | class LlamaParseMethod(BaseMethod):
method __call__ (line 9) | def __call__(self, sample) -> BenchmarkResult:
FILE: benchmarks/overall/methods/marker.py
class MarkerMethod (line 10) | class MarkerMethod(BaseMethod):
method __call__ (line 14) | def __call__(self, sample) -> BenchmarkResult:
FILE: benchmarks/overall/methods/mathpix.py
class MathpixMethod (line 6) | class MathpixMethod(BaseMethod):
method __call__ (line 9) | def __call__(self, sample) -> BenchmarkResult:
FILE: benchmarks/overall/methods/mistral.py
class MistralMethod (line 6) | class MistralMethod(BaseMethod):
method __call__ (line 9) | def __call__(self, sample) -> BenchmarkResult:
FILE: benchmarks/overall/methods/olmocr.py
function convert_single_page (line 13) | def convert_single_page(filename: str, model, processor, device):
class OlmOCRMethod (line 75) | class OlmOCRMethod(BaseMethod):
method __call__ (line 79) | def __call__(self, sample) -> BenchmarkResult:
FILE: benchmarks/overall/methods/schema.py
class BenchmarkResult (line 4) | class BenchmarkResult(TypedDict):
FILE: benchmarks/overall/overall.py
function get_method_scores (line 24) | def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List...
function main (line 98) | def main(
FILE: benchmarks/overall/schema.py
class FullResult (line 7) | class FullResult(TypedDict):
FILE: benchmarks/overall/scorers/__init__.py
class BaseScorer (line 6) | class BaseScorer:
method __init__ (line 7) | def __init__(self):
method __call__ (line 10) | def __call__(self, sample, gt_markdown: List[str], method_markdown: st...
FILE: benchmarks/overall/scorers/clean.py
class MarkdownCleaner (line 8) | class MarkdownCleaner:
method __init__ (line 9) | def __init__(self):
method __call__ (line 12) | def __call__(self, markdown):
method normalize_markdown (line 39) | def normalize_markdown(md_text: str) -> str:
method standardize_math (line 78) | def standardize_math(self, match):
method clean_latex (line 92) | def clean_latex(latex_str):
FILE: benchmarks/overall/scorers/heuristic.py
class HeuristicScorer (line 10) | class HeuristicScorer(BaseScorer):
method __call__ (line 11) | def __call__(self, sample, gt_markdown: List[str], method_markdown: st...
method kendall_tau (line 50) | def kendall_tau(correct_order: List[int], actual_order: List[int]) -> ...
method find_fuzzy_alignments (line 74) | def find_fuzzy_alignments(
method clean_input (line 103) | def clean_input(md: str):
FILE: benchmarks/overall/scorers/llm.py
class LLMScorer (line 94) | class LLMScorer(BaseScorer):
method __call__ (line 95) | def __call__(self, sample, gt_markdown: List[str], markdown: str) -> B...
method llm_rater (line 108) | def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores:
method llm_response_wrapper (line 136) | def llm_response_wrapper(self, prompt, response_schema, depth=0):
FILE: benchmarks/overall/scorers/schema.py
class BlockScores (line 4) | class BlockScores(TypedDict):
FILE: benchmarks/table/gemini.py
class TableSchema (line 25) | class TableSchema(BaseModel):
function gemini_table_rec (line 28) | def gemini_table_rec(image: Image.Image):
FILE: benchmarks/table/inference.py
function extract_tables (line 21) | def extract_tables(children: List[JSONBlockOutput]):
function fix_table_html (line 30) | def fix_table_html(table_html: str) -> str:
function inference_tables (line 45) | def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int |...
FILE: benchmarks/table/scoring.py
function wrap_table_html (line 11) | def wrap_table_html(table_html:str)->str:
class TableTree (line 14) | class TableTree(Tree):
method __init__ (line 15) | def __init__(self, tag, colspan=None, rowspan=None, content=None, *chi...
method bracket (line 24) | def bracket(self):
class CustomConfig (line 35) | class CustomConfig(Config):
method maximum (line 37) | def maximum(*sequences):
method normalized_distance (line 40) | def normalized_distance(self, *sequences):
method rename (line 43) | def rename(self, node1, node2):
function tokenize (line 51) | def tokenize(node):
function tree_convert_html (line 66) | def tree_convert_html(node, convert_cell=False, parent=None):
function similarity_eval_html (line 92) | def similarity_eval_html(pred, true, structure_only=False):
FILE: benchmarks/table/table.py
function update_teds_score (line 21) | def update_teds_score(result, prefix: str = "marker"):
function main (line 37) | def main(
FILE: benchmarks/throughput/main.py
function get_next_pdf (line 15) | def get_next_pdf(ds: datasets.Dataset, i: int):
function single_batch (line 26) | def single_batch(
function main (line 108) | def main(
FILE: benchmarks/verify_scores.py
function verify_scores (line 5) | def verify_scores(file_path):
function verify_table_scores (line 16) | def verify_table_scores(file_path):
FILE: examples/marker_modal_deployment.py
function setup_models_with_cache_check (line 33) | def setup_models_with_cache_check(logger, commit_volume=False):
function download_models (line 77) | def download_models():
class MarkerModalDemoService (line 104) | class MarkerModalDemoService:
method load_models (line 106) | def load_models(self):
method marker_api (line 123) | def marker_api(self):
function invoke_conversion (line 308) | async def invoke_conversion(
FILE: marker/builders/__init__.py
class BaseBuilder (line 8) | class BaseBuilder:
method __init__ (line 9) | def __init__(self, config: Optional[BaseModel | dict] = None):
method __call__ (line 12) | def __call__(self, data, *args, **kwargs):
FILE: marker/builders/document.py
class DocumentBuilder (line 14) | class DocumentBuilder(BaseBuilder):
method __call__ (line 31) | def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilde...
method build_document (line 39) | def build_document(self, provider: PdfProvider):
FILE: marker/builders/layout.py
class LayoutBuilder (line 16) | class LayoutBuilder(BaseBuilder):
method __init__ (line 46) | def __init__(self, layout_model: LayoutPredictor, config=None):
method __call__ (line 51) | def __call__(self, document: Document, provider: PdfProvider):
method get_batch_size (line 60) | def get_batch_size(self):
method forced_layout (line 67) | def forced_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
method surya_layout (line 86) | def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
method expand_layout_blocks (line 94) | def expand_layout_blocks(self, document: Document):
method add_blocks_to_pages (line 131) | def add_blocks_to_pages(
FILE: marker/builders/line.py
class LineBuilder (line 25) | class LineBuilder(BaseBuilder):
method __init__ (line 85) | def __init__(
method __call__ (line 96) | def __call__(self, document: Document, provider: PdfProvider):
method get_detection_batch_size (line 102) | def get_detection_batch_size(self):
method get_ocr_error_batch_size (line 109) | def get_ocr_error_batch_size(self):
method get_detection_results (line 116) | def get_detection_results(
method get_all_lines (line 138) | def get_all_lines(self, document: Document, provider: PdfProvider):
method ocr_error_detection (line 234) | def ocr_error_detection(
method check_line_overlaps (line 251) | def check_line_overlaps(
method check_layout_coverage (line 281) | def check_layout_coverage(
method filter_blank_lines (line 329) | def filter_blank_lines(self, page: PageGroup, lines: List[ProviderOutp...
method merge_blocks (line 346) | def merge_blocks(
FILE: marker/builders/ocr.py
class OcrBuilder (line 25) | class OcrBuilder(BaseBuilder):
method __init__ (line 77) | def __init__(self, recognition_model: RecognitionPredictor, config=None):
method __call__ (line 82) | def __call__(self, document: Document, provider: PdfProvider):
method get_recognition_batch_size (line 96) | def get_recognition_batch_size(self):
method select_ocr_blocks_by_mode (line 105) | def select_ocr_blocks_by_mode(
method get_ocr_images_polygons_ids (line 120) | def get_ocr_images_polygons_ids(
method ocr_extraction (line 165) | def ocr_extraction(
method link_and_break_span (line 233) | def link_and_break_span(self, span: Span, text: str, match_text, url: ...
method replace_line_spans (line 252) | def replace_line_spans(
method assign_chars (line 294) | def assign_chars(self, span: Span, current_chars: List[Char]):
method store_char (line 300) | def store_char(self, char: Char, current_chars: List[Char], page: Page...
method spans_from_html_chars (line 305) | def spans_from_html_chars(
FILE: marker/builders/structure.py
class StructureBuilder (line 12) | class StructureBuilder(BaseBuilder):
method __init__ (line 25) | def __init__(self, config=None):
method __call__ (line 28) | def __call__(self, document: Document):
method group_caption_blocks (line 34) | def group_caption_blocks(self, page: PageGroup):
method group_lists (line 79) | def group_lists(self, page: PageGroup):
method unmark_lists (line 116) | def unmark_lists(self, page: PageGroup):
FILE: marker/config/crawler.py
class ConfigCrawler (line 16) | class ConfigCrawler:
method __init__ (line 17) | def __init__(
method _crawl_config (line 34) | def _crawl_config(self):
method _gather_super_annotations (line 64) | def _gather_super_annotations(cls: Type) -> Dict[str, Type]:
method attr_counts (line 81) | def attr_counts(self) -> Dict[str, int]:
method attr_set (line 90) | def attr_set(self) -> Set[str]:
method _find_subclasses (line 99) | def _find_subclasses(self, base_class):
method _format_type (line 116) | def _format_type(self, t: Type) -> str:
FILE: marker/config/parser.py
class ConfigParser (line 19) | class ConfigParser:
method __init__ (line 20) | def __init__(self, cli_options: dict):
method common_options (line 24) | def common_options(fn):
method generate_config_dict (line 86) | def generate_config_dict(self) -> Dict[str, any]:
method get_llm_service (line 117) | def get_llm_service(self):
method get_renderer (line 127) | def get_renderer(self):
method get_processors (line 141) | def get_processors(self):
method get_converter_cls (line 154) | def get_converter_cls(self):
method get_output_folder (line 167) | def get_output_folder(self, filepath: str):
method get_base_filename (line 174) | def get_base_filename(self, filepath: str):
FILE: marker/config/printer.py
class CustomClickPrinter (line 8) | class CustomClickPrinter(click.Command):
method parse_args (line 9) | def parse_args(self, ctx, args):
FILE: marker/converters/__init__.py
class BaseConverter (line 12) | class BaseConverter:
method __init__ (line 13) | def __init__(self, config: Optional[BaseModel | dict] = None):
method __call__ (line 21) | def __call__(self, *args, **kwargs):
method resolve_dependencies (line 24) | def resolve_dependencies(self, cls):
method initialize_processors (line 43) | def initialize_processors(self, processor_cls_lst: List[Type[BaseProce...
FILE: marker/converters/extraction.py
class ExtractionConverter (line 21) | class ExtractionConverter(PdfConverter):
method build_document (line 27) | def build_document(self, filepath: str):
method __call__ (line 44) | def __call__(self, filepath: str) -> ExtractionOutput:
FILE: marker/converters/ocr.py
class OCRConverter (line 13) | class OCRConverter(PdfConverter):
method __init__ (line 16) | def __init__(self, *args, **kwargs):
method build_document (line 25) | def build_document(self, filepath: str):
method __call__ (line 40) | def __call__(self, filepath: str):
FILE: marker/converters/pdf.py
class PdfConverter (line 58) | class PdfConverter(BaseConverter):
method __init__ (line 106) | def __init__(
method filepath_to_str (line 154) | def filepath_to_str(self, file_input: Union[str, io.BytesIO]):
method build_document (line 176) | def build_document(self, filepath: str) -> Document:
method __call__ (line 193) | def __call__(self, filepath: str | io.BytesIO):
FILE: marker/converters/table.py
class TableConverter (line 17) | class TableConverter(PdfConverter):
method build_document (line 31) | def build_document(self, filepath: str):
method __call__ (line 52) | def __call__(self, filepath: str):
FILE: marker/extractors/__init__.py
class BaseExtractor (line 12) | class BaseExtractor:
method __init__ (line 26) | def __init__(self, llm_service: BaseService, config=None):
method extract_image (line 30) | def extract_image(
method __call__ (line 43) | def __call__(self, document: Document, *args, **kwargs):
FILE: marker/extractors/document.py
class DocumentExtractionSchema (line 13) | class DocumentExtractionSchema(BaseModel):
class DocumentExtractor (line 18) | class DocumentExtractor(BaseExtractor):
method assemble_document_notes (line 106) | def assemble_document_notes(self, page_notes: List[PageExtractionSchem...
method __call__ (line 114) | def __call__(
FILE: marker/extractors/page.py
class PageExtractionSchema (line 15) | class PageExtractionSchema(BaseModel):
class PageExtractor (line 20) | class PageExtractor(BaseExtractor):
method chunk_page_markdown (line 101) | def chunk_page_markdown(self, page_markdown: List[str]) -> List[str]:
method inference_single_chunk (line 113) | def inference_single_chunk(
method __call__ (line 138) | def __call__(
FILE: marker/logger.py
function configure_logging (line 7) | def configure_logging():
function get_logger (line 31) | def get_logger():
FILE: marker/models.py
function create_model_dict (line 16) | def create_model_dict(
FILE: marker/output.py
function unwrap_outer_tag (line 17) | def unwrap_outer_tag(html: str):
function json_to_html (line 27) | def json_to_html(block: JSONBlockOutput | BlockOutput):
function output_exists (line 47) | def output_exists(output_dir: str, fname_base: str):
function text_from_rendered (line 55) | def text_from_rendered(rendered: BaseModel):
function convert_if_not_rgb (line 74) | def convert_if_not_rgb(image: Image.Image) -> Image.Image:
function save_output (line 80) | def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
FILE: marker/processors/__init__.py
class BaseProcessor (line 10) | class BaseProcessor:
method __init__ (line 13) | def __init__(self, config: Optional[BaseModel | dict] = None):
method __call__ (line 16) | def __call__(self, document: Document, *args, **kwargs):
FILE: marker/processors/blank_page.py
class BlankPageProcessor (line 17) | class BlankPageProcessor(BaseProcessor):
method is_blank (line 29) | def is_blank(self, image: Image.Image):
method __call__ (line 55) | def __call__(self, document: Document):
FILE: marker/processors/block_relabel.py
class BlockRelabelProcessor (line 13) | class BlockRelabelProcessor(BaseProcessor):
method __init__ (line 28) | def __init__(self, config=None):
method __call__ (line 58) | def __call__(self, document: Document):
FILE: marker/processors/blockquote.py
class BlockquoteProcessor (line 8) | class BlockquoteProcessor(BaseProcessor):
method __init__ (line 32) | def __init__(self, config):
method __call__ (line 35) | def __call__(self, document: Document):
FILE: marker/processors/code.py
class CodeProcessor (line 7) | class CodeProcessor(BaseProcessor):
method __call__ (line 13) | def __call__(self, document: Document):
method format_block (line 19) | def format_block(self, document: Document, block: Code):
FILE: marker/processors/debug.py
class DebugProcessor (line 16) | class DebugProcessor(BaseProcessor):
method __call__ (line 41) | def __call__(self, document: Document):
method draw_pdf_debug_images (line 62) | def draw_pdf_debug_images(self, document: Document):
method draw_layout_debug_images (line 96) | def draw_layout_debug_images(self, document: Document, pdf_mode=False):
method render_layout_boxes (line 130) | def render_layout_boxes(self, page, png_image):
method dump_block_debug_data (line 162) | def dump_block_debug_data(self, document: Document):
method get_text_size (line 180) | def get_text_size(self, text, font):
method render_on_image (line 186) | def render_on_image(
FILE: marker/processors/document_toc.py
class DocumentTOCProcessor (line 6) | class DocumentTOCProcessor(BaseProcessor):
method __call__ (line 12) | def __call__(self, document: Document):
FILE: marker/processors/equation.py
class EquationProcessor (line 17) | class EquationProcessor(BaseProcessor):
method __init__ (line 41) | def __init__(self, recognition_model: RecognitionPredictor, config=None):
method get_batch_size (line 46) | def get_batch_size(self):
method __call__ (line 56) | def __call__(self, document: Document):
method fix_latex (line 97) | def fix_latex(self, math_html: str):
method get_latex_batched (line 123) | def get_latex_batched(
FILE: marker/processors/footnote.py
class FootnoteProcessor (line 9) | class FootnoteProcessor(BaseProcessor):
method __call__ (line 15) | def __call__(self, document: Document):
method push_footnotes_to_bottom (line 20) | def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
method assign_superscripts (line 31) | def assign_superscripts(self, page: PageGroup, document: Document):
FILE: marker/processors/ignoretext.py
class IgnoreTextProcessor (line 14) | class IgnoreTextProcessor(BaseProcessor):
method __call__ (line 44) | def __call__(self, document: Document):
method clean_text (line 66) | def clean_text(text):
method filter_common_elements (line 72) | def filter_common_elements(self, document, blocks: List[Block]):
FILE: marker/processors/line_merge.py
class LineMergeProcessor (line 11) | class LineMergeProcessor(BaseProcessor):
method __init__ (line 41) | def __init__(self, config):
method merge_lines (line 44) | def merge_lines(self, lines: List[Line], block: Block):
method __call__ (line 116) | def __call__(self, document: Document):
FILE: marker/processors/line_numbers.py
class LineNumbersProcessor (line 8) | class LineNumbersProcessor(BaseProcessor):
method __init__ (line 32) | def __init__(self, config):
method __call__ (line 35) | def __call__(self, document: Document):
method ignore_line_number_spans (line 40) | def ignore_line_number_spans(self, document: Document):
method ignore_line_number_blocks (line 61) | def ignore_line_number_blocks(self, document: Document):
method ignore_line_starts_ends (line 76) | def ignore_line_starts_ends(self, document: Document):
FILE: marker/processors/list.py
class ListProcessor (line 9) | class ListProcessor(BaseProcessor):
method __init__ (line 23) | def __init__(self, config):
method __call__ (line 26) | def __call__(self, document: Document):
method list_group_continuation (line 30) | def list_group_continuation(self, document: Document):
method list_group_indentation (line 57) | def list_group_indentation(self, document: Document):
FILE: marker/processors/llm/__init__.py
class PromptData (line 23) | class PromptData(TypedDict):
class BlockData (line 32) | class BlockData(TypedDict):
class BaseLLMProcessor (line 37) | class BaseLLMProcessor(BaseProcessor):
method __init__ (line 60) | def __init__(self, llm_service: BaseService, config=None):
method extract_image (line 69) | def extract_image(
method normalize_block_json (line 82) | def normalize_block_json(self, block: Block, document: Document, page:...
method load_blocks (line 107) | def load_blocks(self, response: dict):
method handle_rewrites (line 110) | def handle_rewrites(self, blocks: list, document: Document):
class BaseLLMComplexBlockProcessor (line 132) | class BaseLLMComplexBlockProcessor(BaseLLMProcessor):
method __call__ (line 137) | def __call__(self, document: Document):
method process_rewriting (line 146) | def process_rewriting(self, document: Document, page: PageGroup, block...
method rewrite_blocks (line 149) | def rewrite_blocks(self, document: Document):
class BaseLLMSimpleBlockProcessor (line 177) | class BaseLLMSimpleBlockProcessor(BaseLLMProcessor):
method __init__ (line 183) | def __init__(self, config=None):
method __call__ (line 186) | def __call__(self, result: dict, prompt_data: PromptData, document: Do...
method inference_blocks (line 193) | def inference_blocks(self, document: Document) -> List[BlockData]:
method block_prompts (line 200) | def block_prompts(self, document: Document) -> List[PromptData]:
method rewrite_block (line 203) | def rewrite_block(
FILE: marker/processors/llm/llm_complex.py
class LLMComplexRegionProcessor (line 12) | class LLMComplexRegionProcessor(BaseLLMSimpleBlockProcessor):
method block_prompts (line 54) | def block_prompts(self, document: Document) -> List[PromptData]:
method rewrite_block (line 69) | def rewrite_block(self, response: dict, prompt_data: PromptData, docum...
class ComplexSchema (line 92) | class ComplexSchema(BaseModel):
FILE: marker/processors/llm/llm_equation.py
class LLMEquationProcessor (line 10) | class LLMEquationProcessor(BaseLLMSimpleBlockProcessor):
method inference_blocks (line 71) | def inference_blocks(self, document: Document) -> List[BlockData]:
method block_prompts (line 87) | def block_prompts(self, document: Document) -> List[PromptData]:
method rewrite_block (line 106) | def rewrite_block(self, response: dict, prompt_data: PromptData, docum...
class EquationSchema (line 130) | class EquationSchema(BaseModel):
FILE: marker/processors/llm/llm_form.py
class LLMFormProcessor (line 12) | class LLMFormProcessor(BaseLLMSimpleBlockProcessor):
method inference_blocks (line 65) | def inference_blocks(self, document: Document) -> List[BlockData]:
method block_prompts (line 77) | def block_prompts(self, document: Document) -> List[PromptData]:
method rewrite_block (line 94) | def rewrite_block(self, response: dict, prompt_data: PromptData, docum...
class FormSchema (line 116) | class FormSchema(BaseModel):
FILE: marker/processors/llm/llm_handwriting.py
class LLMHandwritingProcessor (line 11) | class LLMHandwritingProcessor(BaseLLMSimpleBlockProcessor):
method inference_blocks (line 36) | def inference_blocks(self, document: Document) -> List[BlockData]:
method block_prompts (line 52) | def block_prompts(self, document: Document) -> List[PromptData]:
method rewrite_block (line 68) | def rewrite_block(self, response: dict, prompt_data: PromptData, docum...
class HandwritingSchema (line 84) | class HandwritingSchema(BaseModel):
FILE: marker/processors/llm/llm_image_description.py
class LLMImageDescriptionProcessor (line 11) | class LLMImageDescriptionProcessor(BaseLLMSimpleBlockProcessor):
method inference_blocks (line 42) | def inference_blocks(self, document: Document) -> List[BlockData]:
method block_prompts (line 48) | def block_prompts(self, document: Document) -> List[PromptData]:
method rewrite_block (line 69) | def rewrite_block(
class ImageSchema (line 86) | class ImageSchema(BaseModel):
FILE: marker/processors/llm/llm_mathblock.py
class LLMMathBlockProcessor (line 16) | class LLMMathBlockProcessor(BaseLLMComplexBlockProcessor):
method rewrite_blocks (line 76) | def rewrite_blocks(self, document: Document):
method get_block_text (line 161) | def get_block_text(self, block: Block, document: Document) -> str:
method get_block_lines (line 166) | def get_block_lines(self, block: Block, document: Document) -> Tuple[l...
method process_rewriting (line 171) | def process_rewriting(self, document: Document, page: PageGroup, block...
class LLMTextSchema (line 198) | class LLMTextSchema(BaseModel):
FILE: marker/processors/llm/llm_meta.py
class LLMSimpleBlockMetaProcessor (line 14) | class LLMSimpleBlockMetaProcessor(BaseLLMProcessor):
method __init__ (line 19) | def __init__(
method __call__ (line 28) | def __call__(self, document: Document):
method get_response (line 67) | def get_response(self, prompt_data: Dict[str, Any]):
FILE: marker/processors/llm/llm_page_correction.py
class LLMPageCorrectionProcessor (line 32) | class LLMPageCorrectionProcessor(BaseLLMComplexBlockProcessor):
method get_selected_blocks (line 138) | def get_selected_blocks(
method process_rewriting (line 150) | def process_rewriting(self, document: Document, page1: PageGroup):
method load_blocks (line 183) | def load_blocks(self, response):
method handle_reorder (line 187) | def handle_reorder(self, blocks: list, page1: PageGroup):
method handle_rewrites (line 247) | def handle_rewrites(self, blocks: list, document: Document):
method rewrite_blocks (line 268) | def rewrite_blocks(self, document: Document):
class BlockSchema (line 296) | class BlockSchema(BaseModel):
class PageSchema (line 302) | class PageSchema(BaseModel):
FILE: marker/processors/llm/llm_sectionheader.py
class LLMSectionHeaderProcessor (line 17) | class LLMSectionHeaderProcessor(BaseLLMComplexBlockProcessor):
method get_selected_blocks (line 99) | def get_selected_blocks(
method process_rewriting (line 111) | def process_rewriting(
method load_blocks (line 141) | def load_blocks(self, response):
method rewrite_blocks (line 145) | def rewrite_blocks(self, document: Document):
class BlockSchema (line 167) | class BlockSchema(BaseModel):
class SectionHeaderSchema (line 172) | class SectionHeaderSchema(BaseModel):
FILE: marker/processors/llm/llm_table.py
class LLMTableProcessor (line 18) | class LLMTableProcessor(BaseLLMComplexBlockProcessor):
method handle_image_rotation (line 94) | def handle_image_rotation(self, children: List[TableCell], image: Imag...
method process_rewriting (line 121) | def process_rewriting(self, document: Document, page: PageGroup, block...
method rewrite_single_chunk (line 188) | def rewrite_single_chunk(
method get_cell_text (line 243) | def get_cell_text(element, keep_tags=("br", "i", "b", "span", "math"))...
method parse_html_table (line 249) | def parse_html_table(
class TableSchema (line 323) | class TableSchema(BaseModel):
FILE: marker/processors/llm/llm_table_merge.py
class LLMTableMergeProcessor (line 17) | class LLMTableMergeProcessor(BaseLLMComplexBlockProcessor):
method get_row_count (line 127) | def get_row_count(cells: List[TableCell]):
method get_column_count (line 142) | def get_column_count(cells: List[TableCell]):
method rewrite_blocks (line 156) | def rewrite_blocks(self, document: Document):
method process_rewriting (line 244) | def process_rewriting(self, document: Document, blocks: List[Block]):
method validate_merge (line 295) | def validate_merge(self, cells1: List[TableCell], cells2: List[TableCe...
method join_cells (line 308) | def join_cells(self, cells1: List[TableCell], cells2: List[TableCell],...
method join_images (line 324) | def join_images(image1: Image.Image, image2: Image.Image, direction: L...
class MergeSchema (line 344) | class MergeSchema(BaseModel):
FILE: marker/processors/order.py
class OrderProcessor (line 9) | class OrderProcessor(BaseProcessor):
method __call__ (line 15) | def __call__(self, document: Document):
FILE: marker/processors/page_header.py
class PageHeaderProcessor (line 7) | class PageHeaderProcessor(BaseProcessor):
method __call__ (line 13) | def __call__(self, document: Document):
method move_page_header_to_top (line 17) | def move_page_header_to_top(self, page: PageGroup, document: Document):
FILE: marker/processors/reference.py
class ReferenceProcessor (line 13) | class ReferenceProcessor(BaseProcessor):
method __init__ (line 18) | def __init__(self, config):
method __call__ (line 21) | def __call__(self, document: Document):
FILE: marker/processors/sectionheader.py
class SectionHeaderProcessor (line 16) | class SectionHeaderProcessor(BaseProcessor):
method __call__ (line 38) | def __call__(self, document: Document):
method bucket_headings (line 69) | def bucket_headings(self, line_heights: List[float], num_levels=4):
FILE: marker/processors/table.py
class TableProcessor (line 28) | class TableProcessor(BaseProcessor):
method __init__ (line 72) | def __init__(
method __call__ (line 85) | def __call__(self, document: Document):
method finalize_cell_text (line 187) | def finalize_cell_text(self, cell: SuryaTableCell):
method normalize_spaces (line 222) | def normalize_spaces(text):
method combine_dollar_column (line 234) | def combine_dollar_column(self, tables: List[TableResult]):
method split_combined_rows (line 312) | def split_combined_rows(self, tables: List[TableResult]):
method assign_text_to_cells (line 427) | def assign_text_to_cells(self, tables: List[TableResult], table_data: ...
method assign_pdftext_lines (line 457) | def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
method align_table_cells (line 498) | def align_table_cells(
method needs_ocr (line 569) | def needs_ocr(self, tables: List[TableResult], table_blocks: List[dict]):
method get_ocr_results (line 604) | def get_ocr_results(
method assign_ocr_lines (line 671) | def assign_ocr_lines(self, tables: List[TableResult], table_blocks: li...
method get_table_rec_batch_size (line 696) | def get_table_rec_batch_size(self):
method get_recognition_batch_size (line 705) | def get_recognition_batch_size(self):
method get_detection_batch_size (line 714) | def get_detection_batch_size(self):
FILE: marker/processors/text.py
class TextProcessor (line 12) | class TextProcessor(BaseProcessor):
method __init__ (line 24) | def __init__(self, config):
method __call__ (line 27) | def __call__(self, document: Document):
FILE: marker/processors/util.py
function escape_latex_commands (line 11) | def escape_latex_commands(text: str):
function add_math_spans_to_line (line 19) | def add_math_spans_to_line(corrected_text: str, text_line: Line, page: P...
function text_to_spans (line 47) | def text_to_spans(text):
FILE: marker/providers/__init__.py
class ProviderOutput (line 20) | class ProviderOutput(BaseModel):
method raw_text (line 26) | def raw_text(self):
method __hash__ (line 29) | def __hash__(self):
method merge (line 32) | def merge(self, other: "ProviderOutput"):
class BaseProvider (line 51) | class BaseProvider:
method __init__ (line 52) | def __init__(self, filepath: str, config: Optional[BaseModel | dict] =...
method __len__ (line 56) | def __len__(self):
method get_images (line 59) | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
method get_page_bbox (line 62) | def get_page_bbox(self, idx: int) -> PolygonBox | None:
method get_page_lines (line 65) | def get_page_lines(self, idx: int) -> List[Line]:
method get_page_refs (line 68) | def get_page_refs(self, idx: int) -> List[Reference]:
method __enter__ (line 71) | def __enter__(self):
method get_font_css (line 75) | def get_font_css():
FILE: marker/providers/document.py
class DocumentProvider (line 52) | class DocumentProvider(PdfProvider):
method __init__ (line 53) | def __init__(self, filepath: str, config=None):
method __del__ (line 67) | def __del__(self):
method convert_docx_to_pdf (line 71) | def convert_docx_to_pdf(self, filepath: str):
method _preprocess_base64_images (line 86) | def _preprocess_base64_images(html_content):
FILE: marker/providers/epub.py
class EpubProvider (line 47) | class EpubProvider(PdfProvider):
method __init__ (line 48) | def __init__(self, filepath: str, config=None):
method __del__ (line 62) | def __del__(self):
method convert_epub_to_pdf (line 66) | def convert_epub_to_pdf(self, filepath):
FILE: marker/providers/html.py
class HTMLProvider (line 7) | class HTMLProvider(PdfProvider):
method __init__ (line 8) | def __init__(self, filepath: str, config=None):
method __del__ (line 22) | def __del__(self):
method convert_html_to_pdf (line 26) | def convert_html_to_pdf(self, filepath: str):
FILE: marker/providers/image.py
class ImageProvider (line 10) | class ImageProvider(BaseProvider):
method __init__ (line 19) | def __init__(self, filepath: str, config=None):
method __len__ (line 37) | def __len__(self):
method get_images (line 40) | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
method get_page_bbox (line 43) | def get_page_bbox(self, idx: int) -> PolygonBox | None:
method get_page_lines (line 48) | def get_page_lines(self, idx: int) -> List[Line]:
method get_page_refs (line 51) | def get_page_refs(self, idx: int) -> List[Reference]:
FILE: marker/providers/pdf.py
class PdfProvider (line 29) | class PdfProvider(BaseProvider):
method __init__ (line 84) | def __init__(self, filepath: str, config=None):
method get_doc (line 110) | def get_doc(self):
method __len__ (line 124) | def __len__(self) -> int:
method font_flags_to_format (line 127) | def font_flags_to_format(self, flags: Optional[int]) -> Set[str]:
method font_names_to_format (line 178) | def font_names_to_format(self, font_name: str | None) -> Set[str]:
method normalize_spaces (line 190) | def normalize_spaces(text):
method pdftext_extraction (line 202) | def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
method check_line_spans (line 306) | def check_line_spans(self, page_lines: List[ProviderOutput]) -> bool:
method check_page (line 321) | def check_page(self, page_id: int, doc: PdfDocument) -> bool:
method detect_bad_ocr (line 379) | def detect_bad_ocr(self, text):
method _render_image (line 404) | def _render_image(
method get_images (line 415) | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
method get_page_bbox (line 422) | def get_page_bbox(self, idx: int) -> PolygonBox | None:
method get_page_lines (line 427) | def get_page_lines(self, idx: int) -> List[ProviderOutput]:
method get_page_refs (line 430) | def get_page_refs(self, idx: int) -> List[Reference]:
method _get_fontname (line 434) | def _get_fontname(font) -> str:
FILE: marker/providers/powerpoint.py
class PowerPointProvider (line 42) | class PowerPointProvider(PdfProvider):
method __init__ (line 45) | def __init__(self, filepath: str, config=None):
method __del__ (line 60) | def __del__(self):
method convert_pptx_to_pdf (line 64) | def convert_pptx_to_pdf(self, filepath):
method _handle_group (line 112) | def _handle_group(self, group_shape) -> str:
method _handle_text (line 140) | def _handle_text(self, shape) -> str:
method _handle_image (line 213) | def _handle_image(self, shape) -> str:
method _handle_table (line 227) | def _handle_table(self, shape) -> str:
method _escape_html (line 244) | def _escape_html(self, text: str) -> str:
FILE: marker/providers/registry.py
function load_matchers (line 28) | def load_matchers(doctype: str):
function load_extensions (line 32) | def load_extensions(doctype: str):
function provider_from_ext (line 36) | def provider_from_ext(filepath: str):
function provider_from_filepath (line 59) | def provider_from_filepath(filepath: str):
FILE: marker/providers/spreadsheet.py
class SpreadSheetProvider (line 31) | class SpreadSheetProvider(PdfProvider):
method __init__ (line 32) | def __init__(self, filepath: str, config=None):
method __del__ (line 46) | def __del__(self):
method convert_xlsx_to_pdf (line 50) | def convert_xlsx_to_pdf(self, filepath: str):
method _get_merged_cell_ranges (line 70) | def _get_merged_cell_ranges(sheet):
method _excel_to_html_table (line 81) | def _excel_to_html_table(self, sheet):
FILE: marker/providers/utils.py
function alphanum_ratio (line 1) | def alphanum_ratio(text):
FILE: marker/renderers/__init__.py
class BaseRenderer (line 17) | class BaseRenderer:
method __init__ (line 36) | def __init__(self, config: Optional[BaseModel | dict] = None):
method __call__ (line 45) | def __call__(self, document):
method extract_image (line 49) | def extract_image(self, document: Document, image_id, to_base64=False):
method merge_consecutive_math (line 68) | def merge_consecutive_math(html, tag="math"):
method merge_consecutive_tags (line 79) | def merge_consecutive_tags(html, tag):
method generate_page_stats (line 100) | def generate_page_stats(self, document: Document, document_output):
method generate_document_metadata (line 117) | def generate_document_metadata(self, document: Document, document_outp...
method extract_block_html (line 127) | def extract_block_html(self, document: Document, block_output: BlockOu...
FILE: marker/renderers/chunk.py
class FlatBlockOutput (line 11) | class FlatBlockOutput(BaseModel):
class ChunkOutput (line 22) | class ChunkOutput(BaseModel):
function collect_images (line 27) | def collect_images(block: JSONBlockOutput) -> dict[str, str]:
function assemble_html_with_images (line 36) | def assemble_html_with_images(block: JSONBlockOutput, image_blocks: set[...
function json_to_chunks (line 55) | def json_to_chunks(
class ChunkRenderer (line 74) | class ChunkRenderer(JSONRenderer):
method __call__ (line 76) | def __call__(self, document: Document) -> ChunkOutput:
FILE: marker/renderers/extraction.py
class ExtractionOutput (line 7) | class ExtractionOutput(BaseModel):
class ExtractionRenderer (line 13) | class ExtractionRenderer(BaseRenderer):
method __call__ (line 14) | def __call__(
FILE: marker/renderers/html.py
class HTMLOutput (line 23) | class HTMLOutput(BaseModel):
class HTMLRenderer (line 29) | class HTMLRenderer(BaseRenderer):
method extract_image (line 43) | def extract_image(self, document, image_id):
method insert_block_id (line 50) | def insert_block_id(self, soup, block_id: BlockId):
method extract_html (line 81) | def extract_html(self, document, document_output, level=0):
method __call__ (line 143) | def __call__(self, document) -> HTMLOutput:
FILE: marker/renderers/json.py
class JSONBlockOutput (line 12) | class JSONBlockOutput(BaseModel):
class JSONOutput (line 23) | class JSONOutput(BaseModel):
function reformat_section_hierarchy (line 29) | def reformat_section_hierarchy(section_hierarchy):
class JSONRenderer (line 36) | class JSONRenderer(BaseRenderer):
method extract_json (line 50) | def extract_json(self, document: Document, block_output: BlockOutput):
method __call__ (line 83) | def __call__(self, document: Document) -> JSONOutput:
FILE: marker/renderers/markdown.py
function escape_dollars (line 19) | def escape_dollars(text):
function cleanup_text (line 23) | def cleanup_text(full_text):
function get_formatted_table_text (line 29) | def get_formatted_table_text(element):
class Markdownify (line 58) | class Markdownify(MarkdownConverter):
method __init__ (line 59) | def __init__(
method convert_div (line 75) | def convert_div(self, el, text, parent_tags):
method convert_p (line 86) | def convert_p(self, el, text, parent_tags):
method convert_math (line 101) | def convert_math(self, el, text, parent_tags):
method convert_table (line 120) | def convert_table(self, el, text, parent_tags):
method convert_a (line 220) | def convert_a(self, el, text, parent_tags):
method convert_span (line 226) | def convert_span(self, el, text, parent_tags):
method escape (line 232) | def escape(self, text, parent_tags=None):
method process_text (line 238) | def process_text(self, el, parent_tags=None):
class MarkdownOutput (line 260) | class MarkdownOutput(BaseModel):
class MarkdownRenderer (line 266) | class MarkdownRenderer(HTMLRenderer):
method md_cls (line 281) | def md_cls(self):
method __call__ (line 298) | def __call__(self, document: Document) -> MarkdownOutput:
FILE: marker/renderers/ocr_json.py
class OCRJSONCharOutput (line 10) | class OCRJSONCharOutput(BaseModel):
class OCRJSONLineOutput (line 18) | class OCRJSONLineOutput(BaseModel):
class OCRJSONPageOutput (line 27) | class OCRJSONPageOutput(BaseModel):
class OCRJSONOutput (line 35) | class OCRJSONOutput(BaseModel):
class OCRJSONRenderer (line 41) | class OCRJSONRenderer(BaseRenderer):
method extract_json (line 55) | def extract_json(self, document: Document) -> List[OCRJSONPageOutput]:
method __call__ (line 133) | def __call__(self, document: Document) -> OCRJSONOutput:
FILE: marker/schema/__init__.py
class BlockTypes (line 4) | class BlockTypes(str, Enum):
method __str__ (line 34) | def __str__(self):
FILE: marker/schema/blocks/base.py
class BlockMetadata (line 16) | class BlockMetadata(BaseModel):
method merge (line 24) | def merge(self, model2):
class BlockOutput (line 33) | class BlockOutput(BaseModel):
class BlockId (line 41) | class BlockId(BaseModel):
method __str__ (line 46) | def __str__(self):
method __hash__ (line 51) | def __hash__(self):
method __repr__ (line 54) | def __repr__(self):
method __eq__ (line 57) | def __eq__(self, other):
method validate_block_type (line 72) | def validate_block_type(cls, v):
method to_path (line 79) | def to_path(self):
class Block (line 83) | class Block(BaseModel):
method id (line 108) | def id(self) -> BlockId:
method from_block (line 114) | def from_block(cls, block: Block) -> Block:
method set_internal_metadata (line 118) | def set_internal_metadata(self, key, data):
method get_internal_metadata (line 123) | def get_internal_metadata(self, key):
method get_image (line 128) | def get_image(
method structure_blocks (line 150) | def structure_blocks(self, document_page: Document | PageGroup) -> Lis...
method get_prev_block (line 155) | def get_prev_block(
method get_next_block (line 172) | def get_next_block(
method add_structure (line 191) | def add_structure(self, block: Block):
method update_structure_item (line 197) | def update_structure_item(self, old_id: BlockId, new_id: BlockId):
method remove_structure_items (line 204) | def remove_structure_items(self, block_ids: List[BlockId]):
method raw_text (line 208) | def raw_text(self, document: Document) -> str:
method assemble_html (line 227) | def assemble_html(
method assign_section_hierarchy (line 247) | def assign_section_hierarchy(self, section_hierarchy):
method contained_blocks (line 257) | def contained_blocks(
method replace_block (line 275) | def replace_block(self, block: Block, new_block: Block):
method render (line 282) | def render(
method line_height (line 318) | def line_height(self, document: Document) -> float:
method update_metadata (line 324) | def update_metadata(self, **kwargs):
method handle_html_output (line 337) | def handle_html_output(
FILE: marker/schema/blocks/basetable.py
class BaseTable (line 8) | class BaseTable(Block):
method format_cells (line 13) | def format_cells(
method assemble_html (line 38) | def assemble_html(
FILE: marker/schema/blocks/caption.py
class Caption (line 5) | class Caption(Block):
method assemble_html (line 11) | def assemble_html(self, document, child_blocks, parent_structure, bloc...
FILE: marker/schema/blocks/code.py
class Code (line 7) | class Code(Block):
method assemble_html (line 13) | def assemble_html(self, document, child_blocks, parent_structure, bloc...
FILE: marker/schema/blocks/complexregion.py
class ComplexRegion (line 5) | class ComplexRegion(Block):
method assemble_html (line 10) | def assemble_html(self, document, child_blocks, parent_structure, bloc...
FILE: marker/schema/blocks/equation.py
class Equation (line 5) | class Equation(Block):
method assemble_html (line 10) | def assemble_html(
FILE: marker/schema/blocks/figure.py
class Figure (line 5) | class Figure(Block):
method assemble_html (line 11) | def assemble_html(
FILE: marker/schema/blocks/footnote.py
class Footnote (line 5) | class Footnote(Block):
method assemble_html (line 13) | def assemble_html(
FILE: marker/schema/blocks/form.py
class Form (line 7) | class Form(BaseTable):
FILE: marker/schema/blocks/handwriting.py
class Handwriting (line 5) | class Handwriting(Block):
method assemble_html (line 11) | def assemble_html(
FILE: marker/schema/blocks/inlinemath.py
class InlineMath (line 5) | class InlineMath(Block):
method assemble_html (line 13) | def assemble_html(
FILE: marker/schema/blocks/listitem.py
function replace_bullets (line 7) | def replace_bullets(child_blocks):
class ListItem (line 19) | class ListItem(Block):
method assemble_html (line 25) | def assemble_html(
FILE: marker/schema/blocks/pagefooter.py
class PageFooter (line 5) | class PageFooter(Block):
method assemble_html (line 14) | def assemble_html(self, document, child_blocks, parent_structure, bloc...
FILE: marker/schema/blocks/pageheader.py
class PageHeader (line 5) | class PageHeader(Block):
method assemble_html (line 14) | def assemble_html(self, document, child_blocks, parent_structure, bloc...
FILE: marker/schema/blocks/picture.py
class Picture (line 5) | class Picture(Block):
method assemble_html (line 11) | def assemble_html(
FILE: marker/schema/blocks/reference.py
class Reference (line 5) | class Reference(Block):
method assemble_html (line 10) | def assemble_html(
FILE: marker/schema/blocks/sectionheader.py
class SectionHeader (line 7) | class SectionHeader(Block):
method assemble_html (line 13) | def assemble_html(
FILE: marker/schema/blocks/table.py
class Table (line 5) | class Table(BaseTable):
FILE: marker/schema/blocks/tablecell.py
class TableCell (line 7) | class TableCell(Block):
method text (line 18) | def text(self):
method assemble_html (line 21) | def assemble_html(
FILE: marker/schema/blocks/text.py
class Text (line 5) | class Text(Block):
method assemble_html (line 13) | def assemble_html(
FILE: marker/schema/blocks/toc.py
class TableOfContents (line 5) | class TableOfContents(BaseTable):
FILE: marker/schema/document.py
class DocumentOutput (line 12) | class DocumentOutput(BaseModel):
class TocItem (line 18) | class TocItem(BaseModel):
class Document (line 25) | class Document(BaseModel):
method get_block (line 32) | def get_block(self, block_id: BlockId):
method get_page (line 39) | def get_page(self, page_id):
method get_next_block (line 45) | def get_next_block(
method get_next_page (line 65) | def get_next_page(self, page: PageGroup):
method get_prev_block (line 71) | def get_prev_block(self, block: Block):
method get_prev_page (line 81) | def get_prev_page(self, page: PageGroup):
method assemble_html (line 87) | def assemble_html(
method render (line 95) | def render(self, block_config: Optional[dict] = None):
method contained_blocks (line 108) | def contained_blocks(self, block_types: Sequence[BlockTypes] = None) -...
FILE: marker/schema/groups/base.py
class Group (line 4) | class Group(Block):
FILE: marker/schema/groups/figure.py
class FigureGroup (line 5) | class FigureGroup(Group):
method assemble_html (line 10) | def assemble_html(
FILE: marker/schema/groups/list.py
class ListGroup (line 5) | class ListGroup(Group):
method assemble_html (line 11) | def assemble_html(
FILE: marker/schema/groups/page.py
class PageGroup (line 21) | class PageGroup(Group):
method incr_block_id (line 39) | def incr_block_id(self):
method add_child (line 45) | def add_child(self, block: Block):
method get_image (line 51) | def get_image(
method current_children (line 82) | def current_children(self) -> List[Block]:
method get_next_block (line 85) | def get_next_block(
method get_prev_block (line 104) | def get_prev_block(self, block: Block):
method add_block (line 110) | def add_block(self, block_cls: type[Block], polygon: PolygonBox) -> Bl...
method add_full_block (line 120) | def add_full_block(self, block: Block) -> Block:
method get_block (line 126) | def get_block(self, block_id: BlockId) -> Block | None:
method assemble_html (line 131) | def assemble_html(
method compute_line_block_intersections (line 139) | def compute_line_block_intersections(
method compute_max_structure_block_intersection_pct (line 163) | def compute_max_structure_block_intersection_pct(self):
method replace_block (line 178) | def replace_block(self, block: Block, new_block: Block):
method identify_missing_blocks (line 192) | def identify_missing_blocks(
method create_missing_blocks (line 232) | def create_missing_blocks(
method add_initial_blocks (line 262) | def add_initial_blocks(
method merge_blocks (line 308) | def merge_blocks(
method aggregate_block_metadata (line 360) | def aggregate_block_metadata(self) -> BlockMetadata:
FILE: marker/schema/groups/picture.py
class PictureGroup (line 5) | class PictureGroup(Group):
method assemble_html (line 10) | def assemble_html(
FILE: marker/schema/groups/table.py
class TableGroup (line 8) | class TableGroup(Group):
method assemble_html (line 13) | def assemble_html(
FILE: marker/schema/polygon.py
class PolygonBox (line 9) | class PolygonBox(BaseModel):
method check_elements (line 14) | def check_elements(cls, v: List[List[float]]) -> List[List[float]]:
method height (line 34) | def height(self):
method width (line 38) | def width(self):
method area (line 42) | def area(self):
method center (line 46) | def center(self):
method size (line 50) | def size(self):
method x_start (line 54) | def x_start(self):
method y_start (line 58) | def y_start(self):
method x_end (line 62) | def x_end(self):
method y_end (line 66) | def y_end(self):
method bbox (line 71) | def bbox(self) -> List[float]:
method expand (line 78) | def expand(self, x_margin: float, y_margin: float) -> PolygonBox:
method expand_y2 (line 93) | def expand_y2(self, y_margin: float) -> PolygonBox:
method expand_y1 (line 105) | def expand_y1(self, y_margin: float) -> PolygonBox:
method minimum_gap (line 117) | def minimum_gap(self, other: PolygonBox):
method center_distance (line 147) | def center_distance(self, other: PolygonBox, x_weight: float = 1, y_we...
method tl_distance (line 153) | def tl_distance(self, other: PolygonBox):
method rescale (line 156) | def rescale(self, old_size, new_size):
method fit_to_bounds (line 170) | def fit_to_bounds(self, bounds):
method overlap_x (line 177) | def overlap_x(self, other: PolygonBox):
method overlap_y (line 180) | def overlap_y(self, other: PolygonBox):
method intersection_area (line 183) | def intersection_area(self, other: PolygonBox):
method intersection_pct (line 186) | def intersection_pct(self, other: PolygonBox):
method merge (line 193) | def merge(self, others: List[PolygonBox]) -> PolygonBox:
method from_bbox (line 214) | def from_bbox(cls, bbox: List[float], ensure_nonzero_area=False):
FILE: marker/schema/registry.py
function register_block_class (line 41) | def register_block_class(block_type: BlockTypes, block_cls: Type[Block]):
function get_block_class (line 45) | def get_block_class(block_type: BlockTypes) -> Type[Block]:
FILE: marker/schema/text/char.py
class Char (line 5) | class Char(Block):
FILE: marker/schema/text/line.py
function remove_tags (line 13) | def remove_tags(text):
function replace_last (line 17) | def replace_last(string, old, new):
function strip_trailing_hyphens (line 25) | def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
class Line (line 39) | class Line(Block):
method ocr_input_text (line 46) | def ocr_input_text(self, document):
method formatted_text (line 60) | def formatted_text(self, document, skip_urls=False):
method assemble_html (line 84) | def assemble_html(self, document, child_blocks, parent_structure, bloc...
method render (line 102) | def render(
method merge (line 125) | def merge(self, other: "Line"):
FILE: marker/schema/text/span.py
function cleanup_text (line 10) | def cleanup_text(full_text):
class Span (line 16) | class Span(Block):
method bold (line 47) | def bold(self):
method italic (line 51) | def italic(self):
method math (line 55) | def math(self):
method highlight (line 59) | def highlight(self):
method superscript (line 63) | def superscript(self):
method subscript (line 67) | def subscript(self):
method small (line 71) | def small(self):
method code (line 75) | def code(self):
method underline (line 79) | def underline(self):
method assemble_html (line 82) | def assemble_html(self, document, child_blocks, parent_structure, bloc...
FILE: marker/scripts/chunk_convert.py
function chunk_convert_cli (line 7) | def chunk_convert_cli():
FILE: marker/scripts/common.py
function parse_args (line 22) | def parse_args():
function load_models (line 46) | def load_models():
function open_pdf (line 50) | def open_pdf(pdf_file):
function img_to_html (line 55) | def img_to_html(img, img_alt):
function get_page_image (line 65) | def get_page_image(pdf_file, page_num, dpi=96):
function page_count (line 82) | def page_count(pdf_file: UploadedFile):
function pillow_image_to_base64_string (line 90) | def pillow_image_to_base64_string(img: Image) -> str:
function extract_root_pydantic_class (line 96) | def extract_root_pydantic_class(schema_code: str) -> Optional[str]:
function get_root_class (line 161) | def get_root_class(schema_code: str) -> Optional[BaseModel]:
FILE: marker/scripts/convert.py
function worker_init (line 42) | def worker_init():
function worker_exit (line 52) | def worker_exit():
function process_single_pdf (line 60) | def process_single_pdf(args):
function convert_cli (line 139) | def convert_cli(in_folder: str, **kwargs):
FILE: marker/scripts/convert_single.py
function convert_single_cli (line 25) | def convert_single_cli(fpath: str, **kwargs):
FILE: marker/scripts/extraction_app.py
function extract_data (line 29) | def extract_data(
FILE: marker/scripts/file_to_s3.py
function main (line 19) | def main(filepath: str, s3_path: str, bucket_name: str, access_key_id: s...
FILE: marker/scripts/run_streamlit_app.py
function streamlit_app_cli (line 6) | def streamlit_app_cli(app_name: str = "streamlit_app.py"):
function extraction_app_cli (line 24) | def extraction_app_cli():
FILE: marker/scripts/server.py
function lifespan (line 30) | async def lifespan(app: FastAPI):
function root (line 43) | async def root():
class CommonParams (line 55) | class CommonParams(BaseModel):
function _convert_pdf (line 86) | async def _convert_pdf(params: CommonParams):
function convert_pdf (line 131) | async def convert_pdf(params: CommonParams):
function convert_pdf_upload (line 136) | async def convert_pdf_upload(
function server_cli (line 165) | def server_cli(port: int, host: str):
FILE: marker/scripts/streamlit_app.py
function convert_pdf (line 29) | def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[s...
function markdown_insert_images (line 43) | def markdown_insert_images(markdown, images):
FILE: marker/services/__init__.py
class BaseService (line 12) | class BaseService:
method img_to_base64 (line 22) | def img_to_base64(self, img: PIL.Image.Image, format: str = "WEBP"):
method process_images (line 27) | def process_images(self, images: List[PIL.Image.Image]) -> list:
method format_image_for_llm (line 30) | def format_image_for_llm(self, image):
method __init__ (line 40) | def __init__(self, config: Optional[BaseModel | dict] = None):
method __call__ (line 46) | def __call__(
FILE: marker/services/azure_openai.py
class AzureOpenAIService (line 17) | class AzureOpenAIService(BaseService):
method process_images (line 29) | def process_images(self, images: List[PIL.Image.Image]) -> list:
method __call__ (line 43) | def __call__(
method get_client (line 111) | def get_client(self) -> AzureOpenAI:
FILE: marker/services/claude.py
class ClaudeService (line 18) | class ClaudeService(BaseService):
method process_images (line 27) | def process_images(self, images: List[Image.Image]) -> List[dict]:
method validate_response (line 40) | def validate_response(self, response_text: str, schema: type[T]) -> T:
method get_client (line 61) | def get_client(self):
method __call__ (line 66) | def __call__(
FILE: marker/services/gemini.py
class BaseGeminiService (line 20) | class BaseGeminiService(BaseService):
method img_to_bytes (line 28) | def img_to_bytes(self, img: PIL.Image.Image):
method get_google_client (line 33) | def get_google_client(self, timeout: int):
method process_images (line 36) | def process_images(self, images):
method __call__ (line 43) | def __call__(
class GoogleGeminiService (line 134) | class GoogleGeminiService(BaseGeminiService):
method get_google_client (line 137) | def get_google_client(self, timeout: int):
FILE: marker/services/ollama.py
class OllamaService (line 15) | class OllamaService(BaseService):
method process_images (line 23) | def process_images(self, images):
method __call__ (line 27) | def __call__(
FILE: marker/services/openai.py
class OpenAIService (line 18) | class OpenAIService(BaseService):
method process_images (line 33) | def process_images(self, images: List[Image.Image]) -> List[dict]:
method __call__ (line 61) | def __call__(
method get_client (line 129) | def get_client(self) -> openai.OpenAI:
FILE: marker/services/vertex.py
class GoogleVertexService (line 7) | class GoogleVertexService(BaseGeminiService):
method get_google_client (line 25) | def get_google_client(self, timeout: int):
FILE: marker/settings.py
class Settings (line 10) | class Settings(BaseSettings):
method TORCH_DEVICE_MODEL (line 35) | def TORCH_DEVICE_MODEL(self) -> str:
method MODEL_DTYPE (line 49) | def MODEL_DTYPE(self) -> torch.dtype:
class Config (line 55) | class Config:
FILE: marker/util.py
function strings_to_classes (line 28) | def strings_to_classes(items: List[str]) -> List[type]:
function classes_to_strings (line 37) | def classes_to_strings(items: List[type]) -> List[str]:
function verify_config_keys (line 45) | def verify_config_keys(obj):
function assign_config (line 58) | def assign_config(cls, config: BaseModel | dict | None):
function parse_range_str (line 82) | def parse_range_str(range_str: str) -> List[int]:
function matrix_intersection_area (line 95) | def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[Lis...
function matrix_distance (line 116) | def matrix_distance(boxes1: List[List[float]], boxes2: List[List[float]]...
function sort_text_lines (line 135) | def sort_text_lines(lines: List[PolygonBox], tolerance=1.25):
function download_font (line 153) | def download_font():
function get_opening_tag_type (line 162) | def get_opening_tag_type(tag):
function get_closing_tag_type (line 181) | def get_closing_tag_type(tag):
function normalize_latex_escapes (line 211) | def normalize_latex_escapes(s: str) -> str:
function unwrap_math (line 216) | def unwrap_math(text: str, math_symbols: List[str] = MATH_SYMBOLS) -> str:
FILE: marker/utils/batch.py
function get_batch_sizes_worker_counts (line 4) | def get_batch_sizes_worker_counts(gpu_manager: GPUManager, peak_worker_v...
FILE: marker/utils/gpu.py
class GPUManager (line 11) | class GPUManager:
method __init__ (line 14) | def __init__(self, device_idx: int):
method __enter__ (line 19) | def __enter__(self):
method __exit__ (line 24) | def __exit__(self, exc_type, exc_val, exc_tb):
method using_cuda (line 29) | def using_cuda():
method check_cuda_available (line 32) | def check_cuda_available(self) -> bool:
method get_gpu_vram (line 41) | def get_gpu_vram(self):
method start_mps_server (line 66) | def start_mps_server(self) -> bool:
method stop_mps_server (line 98) | def stop_mps_server(self) -> None:
method cleanup (line 127) | def cleanup(self) -> None:
FILE: marker/utils/image.py
function is_blank_image (line 6) | def is_blank_image(image: Image.Image, polygon: Optional[List[List[int]]...
FILE: tests/builders/test_blank_page.py
function test_blank_page (line 8) | def test_blank_page(config, doc_provider, layout_model, ocr_error_model,...
FILE: tests/builders/test_document_builder.py
function test_document_builder (line 9) | def test_document_builder(pdf_document):
function test_document_builder_inline_eq (line 28) | def test_document_builder_inline_eq(pdf_document):
FILE: tests/builders/test_garbled_pdf.py
function test_garbled_pdf (line 10) | def test_garbled_pdf(pdf_document, recognition_model, table_rec_model, d...
function test_garbled_builder (line 33) | def test_garbled_builder(config, doc_provider, detection_model, ocr_erro...
function test_nongarbled_builder (line 47) | def test_nongarbled_builder(config, doc_provider, detection_model, ocr_e...
FILE: tests/builders/test_layout_replace.py
function test_layout_replace (line 13) | def test_layout_replace(
FILE: tests/builders/test_ocr_builder.py
function test_blank_char_builder (line 6) | def test_blank_char_builder(recognition_model):
FILE: tests/builders/test_ocr_pipeline.py
function _ocr_pipeline_test (line 7) | def _ocr_pipeline_test(pdf_document):
function test_ocr_pipeline (line 39) | def test_ocr_pipeline(pdf_document):
function test_ocr_with_inline_pipeline (line 44) | def test_ocr_with_inline_pipeline(pdf_document):
FILE: tests/builders/test_overriding.py
class NewSectionHeader (line 14) | class NewSectionHeader(SectionHeader):
class NewLine (line 18) | class NewLine(Line):
function test_overriding (line 26) | def test_overriding(pdf_document: Document):
function get_lines (line 31) | def get_lines(pdf: str, config=None):
function test_overriding_mp (line 39) | def test_overriding_mp():
FILE: tests/builders/test_pdf_links.py
function test_pdf_links (line 15) | def test_pdf_links(pdf_document: Document, config, renderer, model_dict,...
FILE: tests/builders/test_rotated_bboxes.py
function test_rotated_bboxes (line 8) | def test_rotated_bboxes(pdf_document):
FILE: tests/builders/test_strip_existing_ocr.py
function test_strip_ocr (line 6) | def test_strip_ocr(doc_provider):
function test_keep_ocr (line 13) | def test_keep_ocr(doc_provider):
FILE: tests/builders/test_structure.py
function test_structure_builder (line 7) | def test_structure_builder(pdf_document):
FILE: tests/config/test_config.py
function capture_kwargs (line 10) | def capture_kwargs(argv):
function test_config_parser (line 29) | def test_config_parser():
function test_config_none (line 50) | def test_config_none():
function test_config_llm (line 59) | def test_config_llm():
function test_config_force_ocr (line 68) | def test_config_force_ocr():
FILE: tests/conftest.py
function model_dict (line 28) | def model_dict():
function layout_model (line 35) | def layout_model(model_dict):
function detection_model (line 40) | def detection_model(model_dict):
function recognition_model (line 45) | def recognition_model(model_dict):
function table_rec_model (line 50) | def table_rec_model(model_dict):
function ocr_error_model (line 55) | def ocr_error_model(model_dict):
function config (line 60) | def config(request):
function pdf_dataset (line 72) | def pdf_dataset():
function temp_doc (line 77) | def temp_doc(request, pdf_dataset):
function doc_provider (line 91) | def doc_provider(request, config, temp_doc):
function pdf_document (line 97) | def pdf_document(
function pdf_converter (line 117) | def pdf_converter(request, config, model_dict, renderer, llm_service):
function renderer (line 130) | def renderer(request, config):
function llm_service (line 148) | def llm_service(request, config):
function temp_image (line 157) | def temp_image():
FILE: tests/converters/test_extraction_converter.py
class MockLLMService (line 10) | class MockLLMService(BaseService):
method __call__ (line 11) | def __call__(self, prompt, image=None, page=None, response_schema=None...
function mock_llm_service (line 26) | def mock_llm_service():
function extraction_converter (line 31) | def extraction_converter(config, model_dict, mock_llm_service):
function test_extraction_converter (line 52) | def test_extraction_converter(config, model_dict, mock_llm_service, temp...
function test_extraction_converter_multiple_pages (line 66) | def test_extraction_converter_multiple_pages(extraction_converter, temp_...
FILE: tests/converters/test_ocr_converter.py
function _ocr_converter (line 7) | def _ocr_converter(config, model_dict, temp_pdf, line_count: int, eq_cou...
function check_bboxes (line 20) | def check_bboxes(page: OCRJSONPageOutput, lines):
function test_ocr_converter (line 37) | def test_ocr_converter(config, model_dict, temp_doc):
function test_ocr_converter_force (line 43) | def test_ocr_converter_force(config, model_dict, temp_doc):
function test_ocr_converter_keep (line 51) | def test_ocr_converter_keep(config, model_dict, temp_doc):
FILE: tests/converters/test_pdf_converter.py
function test_pdf_converter (line 10) | def test_pdf_converter(pdf_converter: PdfConverter, temp_doc):
function test_epub_converter (line 33) | def test_epub_converter(pdf_converter: PdfConverter, temp_doc):
function test_xlsx_converter (line 43) | def test_xlsx_converter(pdf_converter: PdfConverter, temp_doc):
function test_html_converter (line 53) | def test_html_converter(pdf_converter: PdfConverter, temp_doc):
function test_docx_converter (line 63) | def test_docx_converter(pdf_converter: PdfConverter, temp_doc):
function test_pptx_converter (line 73) | def test_pptx_converter(pdf_converter: PdfConverter, temp_doc):
function test_pdf_converter_bytes (line 83) | def test_pdf_converter_bytes(pdf_converter: PdfConverter, temp_doc):
FILE: tests/converters/test_table_converter.py
function _table_converter (line 6) | def _table_converter(config, model_dict, renderer, temp_pdf):
function test_table_converter (line 23) | def test_table_converter(config, model_dict, renderer, temp_doc):
function test_table_converter_ocr (line 28) | def test_table_converter_ocr(config, model_dict, renderer, temp_doc):
FILE: tests/processors/test_document_toc_processor.py
function test_document_toc_processor (line 7) | def test_document_toc_processor(pdf_document, detection_model, recogniti...
FILE: tests/processors/test_equation_processor.py
function test_equation_processor (line 8) | def test_equation_processor(pdf_document, recognition_model):
FILE: tests/processors/test_footnote_processor.py
function test_footnote_processor (line 9) | def test_footnote_processor(pdf_document):
FILE: tests/processors/test_ignoretext.py
function test_ignoretext_processor (line 10) | def test_ignoretext_processor(pdf_document):
FILE: tests/processors/test_llm_processors.py
function test_llm_form_processor_no_config (line 19) | def test_llm_form_processor_no_config(pdf_document, llm_service):
function test_llm_form_processor_no_cells (line 30) | def test_llm_form_processor_no_cells(pdf_document, llm_service):
function test_llm_form_processor (line 42) | def test_llm_form_processor(pdf_document, table_rec_model, recognition_m...
function test_llm_table_processor (line 64) | def test_llm_table_processor(pdf_document, table_rec_model, recognition_...
function test_llm_caption_processor_disabled (line 107) | def test_llm_caption_processor_disabled(pdf_document):
function test_llm_caption_processor (line 119) | def test_llm_caption_processor(pdf_document):
function test_llm_complex_region_processor (line 141) | def test_llm_complex_region_processor(pdf_document):
function test_multi_llm_processors (line 167) | def test_multi_llm_processors(pdf_document):
FILE: tests/processors/test_table_merge.py
function test_llm_table_processor_nomerge (line 11) | def test_llm_table_processor_nomerge(pdf_document, table_rec_model, reco...
FILE: tests/processors/test_table_processor.py
function test_table_processor (line 12) | def test_table_processor(
function test_avoid_double_ocr (line 34) | def test_avoid_double_ocr(
function test_overlap_blocks (line 53) | def test_overlap_blocks(
function test_ocr_table (line 71) | def test_ocr_table(pdf_document, recognition_model, table_rec_model, det...
function test_split_rows (line 81) | def test_split_rows(pdf_document, recognition_model, table_rec_model, de...
FILE: tests/providers/test_document_providers.py
function test_pptx_provider (line 6) | def test_pptx_provider(doc_provider):
function test_epub_provider (line 20) | def test_epub_provider(doc_provider):
function test_html_provider (line 31) | def test_html_provider(doc_provider):
function test_docx_provider (line 41) | def test_docx_provider(doc_provider):
function test_xlsx_provider (line 52) | def test_xlsx_provider(doc_provider):
FILE: tests/providers/test_image_provider.py
function test_image_provider (line 5) | def test_image_provider(config, temp_image):
function test_image_provider_conversion (line 13) | def test_image_provider_conversion(pdf_converter, temp_image):
FILE: tests/providers/test_pdf_provider.py
function test_pdf_provider (line 5) | def test_pdf_provider(doc_provider):
FILE: tests/renderers/test_chunk_renderer.py
function test_chunk_renderer (line 7) | def test_chunk_renderer(pdf_document):
FILE: tests/renderers/test_extract_images.py
function test_disable_extract_images (line 8) | def test_disable_extract_images(pdf_document):
function test_extract_images (line 18) | def test_extract_images(pdf_document):
FILE: tests/renderers/test_html_renderer.py
function test_html_renderer_block_ids (line 14) | def test_html_renderer_block_ids(pdf_document, config):
FILE: tests/renderers/test_json_renderer.py
function test_markdown_renderer_pagination (line 7) | def test_markdown_renderer_pagination(pdf_document):
FILE: tests/renderers/test_markdown_renderer.py
function test_markdown_renderer (line 9) | def test_markdown_renderer(pdf_document):
function test_markdown_renderer_auto_ocr (line 18) | def test_markdown_renderer_auto_ocr(pdf_document):
function test_markdown_renderer_pagination (line 27) | def test_markdown_renderer_pagination(pdf_document):
function test_markdown_renderer_pagination_blank_last_page (line 36) | def test_markdown_renderer_pagination_blank_last_page(pdf_document):
function test_markdown_renderer_metadata (line 52) | def test_markdown_renderer_metadata(pdf_document):
function test_markdown_renderer_images (line 59) | def test_markdown_renderer_images(pdf_document):
function test_markdown_renderer_tables (line 68) | def test_markdown_renderer_tables(pdf_document):
FILE: tests/schema/groups/test_list_grouping.py
function test_list_grouping (line 9) | def test_list_grouping(pdf_document):
FILE: tests/services/test_service_init.py
function test_empty_llm (line 13) | def test_empty_llm(pdf_converter: PdfConverter, temp_doc):
function test_llm_no_keys (line 18) | def test_llm_no_keys(model_dict, config):
function test_llm_gemini (line 25) | def test_llm_gemini(pdf_converter: PdfConverter, temp_doc):
function test_llm_vertex (line 39) | def test_llm_vertex(pdf_converter: PdfConverter, temp_doc):
function test_llm_ollama (line 52) | def test_llm_ollama(pdf_converter: PdfConverter, temp_doc):
function test_llm_openai (line 66) | def test_llm_openai(pdf_converter: PdfConverter, temp_doc):
function test_llm_azure_openai (line 83) | def test_llm_azure_openai(pdf_converter: PdfConverter, temp_doc):
FILE: tests/utils.py
function setup_pdf_provider (line 7) | def setup_pdf_provider(
Copy disabled (too large)
Download .json
Condensed preview — 240 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (11,058K chars).
[
{
"path": ".github/ISSUE_TEMPLATE/breaking-bug-report.md",
"chars": 1182,
"preview": "---\nname: Breaking bug report\nabout: Create a report about a breaking bug\ntitle: \"[BUG: Breaking]\"\nlabels: 'bug: breakin"
},
{
"path": ".github/ISSUE_TEMPLATE/feature_request.md",
"chars": 519,
"preview": "---\nname: Feature request\nabout: Suggest an idea for this project\ntitle: \"[FEAT]\"\nlabels: enhancement\nassignees: ''\n\n---"
},
{
"path": ".github/ISSUE_TEMPLATE/output-bug-report.md",
"chars": 994,
"preview": "---\nname: Output bug report\nabout: Create a report about poor output quality\ntitle: \"[BUG: Output]\"\nlabels: 'bug: output"
},
{
"path": ".github/workflows/benchmarks.yml",
"chars": 1064,
"preview": "name: Integration test\n\non: [push]\n\nenv:\n PYTHONIOENCODING: \"utf-8\"\n\njobs:\n benchmark:\n runs-on: ${{ matrix.os }}\n "
},
{
"path": ".github/workflows/ci.yml",
"chars": 685,
"preview": "name: CI tests\n\non: [push]\n\njobs:\n tests:\n runs-on: t4_gpu\n steps:\n - uses: actions/checkout@v3\n - name"
},
{
"path": ".github/workflows/cla.yml",
"chars": 1335,
"preview": "name: \"Marker CLA Assistant\"\non:\n issue_comment:\n types: [created]\n pull_request_target:\n types: [opened,closed,"
},
{
"path": ".github/workflows/publish.yml",
"chars": 907,
"preview": "name: Python package\non:\n push:\n tags:\n - \"v*.*.*\"\njobs:\n build:\n runs-on: ubuntu-latest\n steps:\n -"
},
{
"path": ".github/workflows/scripts.yml",
"chars": 1357,
"preview": "name: Test CLI scripts\n\non: [push]\n\njobs:\n tests:\n runs-on: t4_gpu\n steps:\n - uses: actions/checkout@v3\n "
},
{
"path": ".gitignore",
"chars": 3246,
"preview": "private.py\n.DS_Store\nlocal.env\nexperiments\ntest_data\ntraining\nwandb\n*.dat\nreport.json\nbenchmark_data\ndebug_data\ntemp.md\n"
},
{
"path": ".pre-commit-config.yaml",
"chars": 271,
"preview": "repos:\n- repo: https://github.com/astral-sh/ruff-pre-commit\n # Ruff version.\n rev: v0.9.10\n hooks:\n # Run the lint"
},
{
"path": "CLA.md",
"chars": 4585,
"preview": "Marker Contributor Agreement\n\nThis Marker Contributor Agreement (\"MCA\") applies to any contribution that you make to any"
},
{
"path": "LICENSE",
"chars": 35079,
"preview": " GNU GENERAL PUBLIC LICENSE\n Version 3, 29 June 2007\n\n Copyright (C) 2007 Free "
},
{
"path": "MODEL_LICENSE",
"chars": 14654,
"preview": " AI PUBS OPEN RAIL-M LICENSE (MODIFIED)\n\nVersion 0.1, March 2, 2023 (Modified)\nhttp://licenses.ai/\n\nPL"
},
{
"path": "README.md",
"chars": 28383,
"preview": "# Marker\n\nMarker converts documents to markdown, JSON, chunks, and HTML quickly and accurately.\n\n- Converts PDF, image, "
},
{
"path": "benchmarks/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "benchmarks/overall/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "benchmarks/overall/display/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "benchmarks/overall/display/dataset.py",
"chars": 1917,
"preview": "import json\nfrom typing import List\n\nimport datasets\nfrom tqdm import tqdm\n\nfrom benchmarks.overall.registry import METH"
},
{
"path": "benchmarks/overall/display/table.py",
"chars": 3191,
"preview": "from pathlib import Path\nfrom typing import Dict, List\n\nimport tabulate\n\nfrom benchmarks.overall.schema import FullResul"
},
{
"path": "benchmarks/overall/download/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "benchmarks/overall/download/base.py",
"chars": 1967,
"preview": "import json\nfrom json import JSONDecodeError\nfrom pathlib import Path\n\nimport datasets\nfrom tqdm import tqdm\n\n\nclass Dow"
},
{
"path": "benchmarks/overall/download/llamaparse.py",
"chars": 1800,
"preview": "import io\nimport time\n\nimport requests\n\nfrom benchmarks.overall.download.base import Downloader\n\n\nclass LlamaParseDownlo"
},
{
"path": "benchmarks/overall/download/main.py",
"chars": 888,
"preview": "import click\n\nfrom benchmarks.overall.download.llamaparse import LlamaParseDownloader\nfrom benchmarks.overall.download.m"
},
{
"path": "benchmarks/overall/download/mathpix.py",
"chars": 2242,
"preview": "import json\nimport time\n\nimport requests\n\nfrom benchmarks.overall.download.base import Downloader\n\n\nclass MathpixDownloa"
},
{
"path": "benchmarks/overall/download/mistral.py",
"chars": 1877,
"preview": "import io\nimport time\nimport requests\n\nfrom benchmarks.overall.download.base import Downloader\n\n\nclass MistralDownloader"
},
{
"path": "benchmarks/overall/elo.py",
"chars": 8526,
"preview": "import json\nimport random\nimport time\nimport os\nfrom dataclasses import dataclass\nfrom typing import List, Dict, Tuple, "
},
{
"path": "benchmarks/overall/methods/__init__.py",
"chars": 4106,
"preview": "import io\nimport random\nimport re\nfrom typing import Tuple\n\nimport markdown2\nfrom PIL import Image\nfrom playwright.sync_"
},
{
"path": "benchmarks/overall/methods/docling.py",
"chars": 746,
"preview": "import tempfile\nimport time\n\nfrom benchmarks.overall.methods import BaseMethod, BenchmarkResult\n\n\nclass DoclingMethod(Ba"
},
{
"path": "benchmarks/overall/methods/gt.py",
"chars": 714,
"preview": "from typing import List\nimport json\n\nfrom PIL import Image\n\nfrom benchmarks.overall.methods import BaseMethod, Benchmark"
},
{
"path": "benchmarks/overall/methods/llamaparse.py",
"chars": 587,
"preview": "import datasets\n\nfrom benchmarks.overall.methods import BaseMethod, BenchmarkResult\n\n\nclass LlamaParseMethod(BaseMethod)"
},
{
"path": "benchmarks/overall/methods/marker.py",
"chars": 1274,
"preview": "import os\nimport tempfile\nimport time\n\nfrom benchmarks.overall.methods import BaseMethod, BenchmarkResult\nfrom marker.co"
},
{
"path": "benchmarks/overall/methods/mathpix.py",
"chars": 578,
"preview": "import datasets\n\nfrom benchmarks.overall.methods import BaseMethod, BenchmarkResult\n\n\nclass MathpixMethod(BaseMethod):\n "
},
{
"path": "benchmarks/overall/methods/mistral.py",
"chars": 578,
"preview": "import datasets\n\nfrom benchmarks.overall.methods import BaseMethod, BenchmarkResult\n\n\nclass MistralMethod(BaseMethod):\n "
},
{
"path": "benchmarks/overall/methods/olmocr.py",
"chars": 2691,
"preview": "import base64\nimport json\nimport tempfile\nimport time\nfrom io import BytesIO\n\nimport torch\nfrom PIL import Image\n\nfrom b"
},
{
"path": "benchmarks/overall/methods/schema.py",
"chars": 123,
"preview": "from typing import TypedDict, List\n\n\nclass BenchmarkResult(TypedDict):\n markdown: str | List[str]\n time: float | N"
},
{
"path": "benchmarks/overall/overall.py",
"chars": 7422,
"preview": "import json\nimport os\nimport traceback\nfrom collections import defaultdict\nfrom pathlib import Path\nfrom typing import L"
},
{
"path": "benchmarks/overall/registry.py",
"chars": 837,
"preview": "from benchmarks.overall.methods.docling import DoclingMethod\nfrom benchmarks.overall.methods.gt import GTMethod\nfrom ben"
},
{
"path": "benchmarks/overall/schema.py",
"chars": 394,
"preview": "from typing import TypedDict, List, Dict\n\nfrom benchmarks.overall.scorers.schema import BlockScores\n\nAVG_TYPE = Dict[str"
},
{
"path": "benchmarks/overall/scorers/__init__.py",
"chars": 269,
"preview": "from typing import List\n\nfrom benchmarks.overall.scorers.schema import BlockScores\n\n\nclass BaseScorer:\n def __init__("
},
{
"path": "benchmarks/overall/scorers/clean.py",
"chars": 3914,
"preview": "import re\nimport subprocess\nimport tempfile\nfrom pathlib import Path\n\nimport latex2mathml.converter\n\nclass MarkdownClean"
},
{
"path": "benchmarks/overall/scorers/heuristic.py",
"chars": 3522,
"preview": "from typing import List\n\nfrom rapidfuzz import fuzz\n\nfrom benchmarks.overall.scorers.clean import MarkdownCleaner\nfrom b"
},
{
"path": "benchmarks/overall/scorers/llm.py",
"chars": 6797,
"preview": "import json\nimport os\nimport tempfile\nimport time\nfrom typing import List\n\nfrom PIL import Image\nfrom google.genai.error"
},
{
"path": "benchmarks/overall/scorers/schema.py",
"chars": 152,
"preview": "from typing import TypedDict, List, Optional, Dict\n\n\nclass BlockScores(TypedDict):\n score: float\n specific_scores:"
},
{
"path": "benchmarks/table/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "benchmarks/table/gemini.py",
"chars": 1754,
"preview": "import json\nfrom PIL import Image\nfrom google import genai\nfrom google.genai import types\nfrom io import BytesIO\nfrom py"
},
{
"path": "benchmarks/table/inference.py",
"chars": 7175,
"preview": "from typing import List\n\nimport numpy as np\nfrom bs4 import BeautifulSoup\nimport pypdfium2 as pdfium\nfrom tqdm import tq"
},
{
"path": "benchmarks/table/scoring.py",
"chars": 3733,
"preview": "\"\"\"\"\nTEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD\n\"\"\"\n\nimport distance\nfrom apted import APTED, Config\nfrom"
},
{
"path": "benchmarks/table/table.py",
"chars": 3699,
"preview": "import os\nos.environ[\"PYTORCH_ENABLE_MPS_FALLBACK\"] = \"1\" # Transformers uses .isin for an op, which is not supported o"
},
{
"path": "benchmarks/throughput/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "benchmarks/throughput/main.py",
"chars": 5378,
"preview": "import os\nimport tempfile\nimport time\nfrom multiprocessing import get_context\nfrom concurrent.futures import ProcessPool"
},
{
"path": "benchmarks/verify_scores.py",
"chars": 1131,
"preview": "import json\nimport argparse\n\n\ndef verify_scores(file_path):\n with open(file_path, 'r') as file:\n data = json.l"
},
{
"path": "chunk_convert.py",
"chars": 110,
"preview": "from marker.scripts.chunk_convert import chunk_convert_cli\n\nif __name__ == \"__main__\":\n chunk_convert_cli()"
},
{
"path": "convert.py",
"chars": 93,
"preview": "from marker.scripts.convert import convert_cli\n\nif __name__ == \"__main__\":\n convert_cli()\n"
},
{
"path": "convert_single.py",
"chars": 114,
"preview": "from marker.scripts.convert_single import convert_single_cli\n\nif __name__ == \"__main__\":\n convert_single_cli()\n"
},
{
"path": "data/.gitignore",
"chars": 21,
"preview": "latex\npdfs\nreferences"
},
{
"path": "data/examples/json/multicolcnn.json",
"chars": 631108,
"preview": "{\n \"children\": [\n {\n \"id\": \"/page/0/Page/277\",\n \"block_type\": \"Page\",\n \"html\": \"<content-ref src='/pa"
},
{
"path": "data/examples/json/switch_trans.json",
"chars": 1983289,
"preview": "{\n \"children\": [\n {\n \"id\": \"/page/0/Page/164\",\n \"block_type\": \"Page\",\n \"html\": \"<content-ref src='/pa"
},
{
"path": "data/examples/json/thinkpython.json",
"chars": 6158386,
"preview": "{\n \"children\": [\n {\n \"id\": \"/page/0/Page/10\",\n \"block_type\": \"Page\",\n \"html\": \"<content-ref src='/pag"
},
{
"path": "data/examples/markdown/multicolcnn/multicolcnn.md",
"chars": 42698,
"preview": "# An Aggregated Multicolumn Dilated Convolution Network for Perspective-Free Counting\n\nDiptodip Deb Georgia Institute of"
},
{
"path": "data/examples/markdown/multicolcnn/multicolcnn_meta.json",
"chars": 16467,
"preview": "{\n \"table_of_contents\": [\n {\n \"title\": \"An Aggregated Multicolumn Dilated Convolution Network\\nfor Perspective-"
},
{
"path": "data/examples/markdown/switch_transformers/switch_trans.md",
"chars": 113021,
"preview": "# Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity\n\n### William Fedus∗\n\nliam"
},
{
"path": "data/examples/markdown/switch_transformers/switch_trans_meta.json",
"chars": 47677,
"preview": "{\n \"table_of_contents\": [\n {\n \"title\": \"Switch Transformers: Scaling to Trillion Parameter Models\\nwith Simple "
},
{
"path": "data/examples/markdown/thinkpython/thinkpython.md",
"chars": 486039,
"preview": "# Think Python\n\n## How to Think Like a Computer Scientist\n\nVersion 2.0.17\n\n# Think Python\n\n## How to Think Like a Comput"
},
{
"path": "data/examples/markdown/thinkpython/thinkpython_meta.json",
"chars": 279677,
"preview": "{\n \"table_of_contents\": [\n {\n \"title\": \"Think Python\",\n \"heading_level\": null,\n \"page_id\": 0,\n \""
},
{
"path": "data/latex_to_md.sh",
"chars": 986,
"preview": "#!/bin/bash\n\n# List all .tex files in the latex folder\nFILES=$(find latex -name \"*.tex\")\n\nfor f in $FILES\ndo\n echo \"Pro"
},
{
"path": "examples/README.md",
"chars": 3811,
"preview": "## Usage Examples\n\nThis directory contains examples of running `marker` in different contexts.\n\n### Usage with Modal\n\nWe"
},
{
"path": "examples/marker_modal_deployment.py",
"chars": 14208,
"preview": "\"\"\"\nModal deployment for Datalab Marker PDF conversion service.\n\"\"\"\n\nimport modal\nimport os\nfrom typing import Optional\n"
},
{
"path": "extraction_app.py",
"chars": 117,
"preview": "from marker.scripts.run_streamlit_app import extraction_app_cli\n\nif __name__ == \"__main__\":\n extraction_app_cli()\n"
},
{
"path": "marker/builders/__init__.py",
"chars": 305,
"preview": "from typing import Optional\n\nfrom pydantic import BaseModel\n\nfrom marker.util import assign_config\n\n\nclass BaseBuilder:\n"
},
{
"path": "marker/builders/document.py",
"chars": 2087,
"preview": "from typing import Annotated\n\nfrom marker.builders import BaseBuilder\nfrom marker.builders.layout import LayoutBuilder\nf"
},
{
"path": "marker/builders/layout.py",
"chars": 6365,
"preview": "from typing import Annotated, List\n\nfrom surya.layout import LayoutPredictor\nfrom surya.layout.schema import LayoutResul"
},
{
"path": "marker/builders/line.py",
"chars": 14162,
"preview": "from copy import deepcopy\nfrom typing import Annotated, List, Tuple\n\nimport numpy as np\nfrom PIL import Image\nimport cv2"
},
{
"path": "marker/builders/ocr.py",
"chars": 17339,
"preview": "import copy\nfrom typing import Annotated, List\n\nfrom ftfy import fix_text\nfrom PIL import Image\nfrom surya.common.surya."
},
{
"path": "marker/builders/structure.py",
"chars": 5079,
"preview": "from typing import Annotated\n\nfrom marker.builders import BaseBuilder\nfrom marker.schema import BlockTypes\nfrom marker.s"
},
{
"path": "marker/config/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "marker/config/crawler.py",
"chars": 4795,
"preview": "import importlib\nimport inspect\nimport pkgutil\nfrom functools import cached_property\nfrom typing import Annotated, Dict,"
},
{
"path": "marker/config/parser.py",
"chars": 6084,
"preview": "import json\nimport os\nfrom typing import Dict\n\nimport click\n\nfrom marker.converters.pdf import PdfConverter\nfrom marker."
},
{
"path": "marker/config/printer.py",
"chars": 4090,
"preview": "from typing import Optional\n\nimport click\n\nfrom marker.config.crawler import crawler\n\n\nclass CustomClickPrinter(click.Co"
},
{
"path": "marker/converters/__init__.py",
"chars": 2469,
"preview": "import inspect\nfrom typing import Optional, List, Type\n\nfrom pydantic import BaseModel\n\nfrom marker.processors import Ba"
},
{
"path": "marker/converters/extraction.py",
"chars": 2912,
"preview": "import re\nfrom typing import Annotated\n\nfrom marker.builders.document import DocumentBuilder\nfrom marker.builders.line i"
},
{
"path": "marker/converters/ocr.py",
"chars": 1598,
"preview": "from typing import Tuple\n\nfrom marker.builders.document import DocumentBuilder\nfrom marker.builders.line import LineBuil"
},
{
"path": "marker/converters/pdf.py",
"chars": 7880,
"preview": "import os\n\nfrom marker.schema.document import Document\n\nos.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\" # disables a tok"
},
{
"path": "marker/converters/table.py",
"chars": 2114,
"preview": "from typing import Tuple, List\n\nfrom marker.builders.document import DocumentBuilder\nfrom marker.builders.line import Li"
},
{
"path": "marker/extractors/__init__.py",
"chars": 1244,
"preview": "from typing import Annotated, Sequence\n\nfrom marker.schema import BlockTypes\nfrom marker.schema.document import Document"
},
{
"path": "marker/extractors/document.py",
"chars": 4934,
"preview": "import json\n\nfrom pydantic import BaseModel\nfrom typing import Annotated, Optional, List\n\nfrom marker.extractors import "
},
{
"path": "marker/extractors/page.py",
"chars": 5756,
"preview": "import json\nfrom concurrent.futures import ThreadPoolExecutor\n\nfrom pydantic import BaseModel\nfrom typing import Annotat"
},
{
"path": "marker/logger.py",
"chars": 888,
"preview": "import logging\nimport warnings\n\nfrom marker.settings import settings\n\n\ndef configure_logging():\n # Setup marker logge"
},
{
"path": "marker/models.py",
"chars": 1220,
"preview": "import os\n\nos.environ[\"PYTORCH_ENABLE_MPS_FALLBACK\"] = (\n \"1\" # Transformers uses .isin for an op, which is not supp"
},
{
"path": "marker/output.py",
"chars": 3553,
"preview": "import json\nimport os\n\nfrom bs4 import BeautifulSoup, Tag\nfrom pydantic import BaseModel\nfrom PIL import Image\n\nfrom mar"
},
{
"path": "marker/processors/__init__.py",
"chars": 513,
"preview": "from typing import Optional, Tuple\n\nfrom pydantic import BaseModel\n\nfrom marker.schema import BlockTypes\nfrom marker.sch"
},
{
"path": "marker/processors/blank_page.py",
"chars": 2511,
"preview": "from typing import Annotated\n\nfrom PIL import Image\nimport numpy as np\nimport cv2\n\nfrom marker.processors import BasePro"
},
{
"path": "marker/processors/block_relabel.py",
"chars": 3571,
"preview": "from copy import deepcopy\nfrom typing import Annotated\n\nfrom marker.processors import BaseProcessor\nfrom marker.schema i"
},
{
"path": "marker/processors/blockquote.py",
"chars": 2836,
"preview": "from typing import Annotated, Tuple\n\nfrom marker.processors import BaseProcessor\nfrom marker.schema import BlockTypes\nfr"
},
{
"path": "marker/processors/code.py",
"chars": 1561,
"preview": "from marker.processors import BaseProcessor\nfrom marker.schema import BlockTypes\nfrom marker.schema.blocks import Code\nf"
},
{
"path": "marker/processors/debug.py",
"chars": 7623,
"preview": "import json\nimport os\nfrom typing import Annotated\n\nfrom PIL import Image, ImageDraw, ImageFont\nfrom marker.logger impor"
},
{
"path": "marker/processors/document_toc.py",
"chars": 784,
"preview": "from marker.processors import BaseProcessor\nfrom marker.schema import BlockTypes\nfrom marker.schema.document import Docu"
},
{
"path": "marker/processors/equation.py",
"chars": 5072,
"preview": "from typing import Annotated, List, Tuple\nfrom PIL import Image\nimport re\nfrom bs4 import BeautifulSoup\n\nfrom ftfy impor"
},
{
"path": "marker/processors/footnote.py",
"chars": 1398,
"preview": "import re\n\nfrom marker.processors import BaseProcessor\nfrom marker.schema import BlockTypes\nfrom marker.schema.document "
},
{
"path": "marker/processors/ignoretext.py",
"chars": 3646,
"preview": "import re\nfrom collections import Counter\nfrom itertools import groupby\nfrom typing import Annotated, List\n\nfrom rapidfu"
},
{
"path": "marker/processors/line_merge.py",
"chars": 4827,
"preview": "from typing import Annotated, List\n\nfrom marker.processors import BaseProcessor\nfrom marker.schema import BlockTypes\nfro"
},
{
"path": "marker/processors/line_numbers.py",
"chars": 5142,
"preview": "from typing import Annotated\n\nfrom marker.processors import BaseProcessor\nfrom marker.schema import BlockTypes\nfrom mark"
},
{
"path": "marker/processors/list.py",
"chars": 4725,
"preview": "from typing import Annotated, List, Tuple\n\nfrom marker.processors import BaseProcessor\nfrom marker.schema import BlockTy"
},
{
"path": "marker/processors/llm/__init__.py",
"chars": 6498,
"preview": "import json\nimport traceback\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import Annotate"
},
{
"path": "marker/processors/llm/llm_complex.py",
"chars": 3653,
"preview": "from typing import List\n\nimport markdown2\nfrom pydantic import BaseModel\n\nfrom marker.processors.llm import PromptData, "
},
{
"path": "marker/processors/llm/llm_equation.py",
"chars": 5078,
"preview": "from pydantic import BaseModel\n\nfrom marker.processors.llm import BaseLLMSimpleBlockProcessor, PromptData, BlockData\nfro"
},
{
"path": "marker/processors/llm/llm_form.py",
"chars": 4157,
"preview": "from typing import List\n\nfrom pydantic import BaseModel\n\nfrom marker.output import json_to_html\nfrom marker.processors.l"
},
{
"path": "marker/processors/llm/llm_handwriting.py",
"chars": 3825,
"preview": "import markdown2\nfrom pydantic import BaseModel\nfrom marker.processors.llm import PromptData, BaseLLMSimpleBlockProcesso"
},
{
"path": "marker/processors/llm/llm_image_description.py",
"chars": 3166,
"preview": "from pydantic import BaseModel\n\nfrom marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData\n\nfr"
},
{
"path": "marker/processors/llm/llm_mathblock.py",
"chars": 8688,
"preview": "from concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import List, Tuple, Annotated\n\nfrom pydantic"
},
{
"path": "marker/processors/llm/llm_meta.py",
"chars": 2448,
"preview": "from concurrent.futures import ThreadPoolExecutor\nfrom typing import List, Dict, Any\n\nfrom marker.logger import get_logg"
},
{
"path": "marker/processors/llm/llm_page_correction.py",
"chars": 11683,
"preview": "import json\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import List, Annotated\n\nfrom mar"
},
{
"path": "marker/processors/llm/llm_sectionheader.py",
"chars": 5456,
"preview": "import json\nfrom typing import List, Tuple\n\nfrom tqdm import tqdm\n\nfrom marker.logger import get_logger\nfrom marker.proc"
},
{
"path": "marker/processors/llm/llm_table.py",
"chars": 13051,
"preview": "from typing import Annotated, List, Tuple\n\nfrom bs4 import BeautifulSoup\nfrom PIL import Image\nfrom marker.logger import"
},
{
"path": "marker/processors/llm/llm_table_merge.py",
"chars": 14994,
"preview": "from concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import Annotated, List, Tuple, Literal\n\nfrom"
},
{
"path": "marker/processors/order.py",
"chars": 2413,
"preview": "from statistics import mean\nfrom collections import defaultdict\n\nfrom marker.processors import BaseProcessor\nfrom marker"
},
{
"path": "marker/processors/page_header.py",
"chars": 838,
"preview": "from marker.processors import BaseProcessor\nfrom marker.schema import BlockTypes\nfrom marker.schema.document import Docu"
},
{
"path": "marker/processors/reference.py",
"chars": 2002,
"preview": "import numpy as np\n\nfrom marker.processors import BaseProcessor\nfrom marker.schema import BlockTypes\nfrom marker.schema."
},
{
"path": "marker/processors/sectionheader.py",
"chars": 4050,
"preview": "import warnings\nfrom typing import Annotated, Dict, List\n\nimport numpy as np\nfrom sklearn.cluster import KMeans\nfrom skl"
},
{
"path": "marker/processors/table.py",
"chars": 30520,
"preview": "import re\nfrom collections import defaultdict\nfrom copy import deepcopy\nfrom typing import Annotated, List\nfrom collecti"
},
{
"path": "marker/processors/text.py",
"chars": 4104,
"preview": "import math\nfrom typing import Annotated, List\n\nimport regex\n\nfrom marker.processors import BaseProcessor\nfrom marker.sc"
},
{
"path": "marker/processors/util.py",
"chars": 2519,
"preview": "import re\n\nfrom bs4 import BeautifulSoup\n\nfrom marker.schema import BlockTypes\nfrom marker.schema.groups import PageGrou"
},
{
"path": "marker/providers/__init__.py",
"chars": 2636,
"preview": "from copy import deepcopy\nfrom typing import List, Optional, Dict\n\nfrom PIL import Image\nfrom pydantic import BaseModel\n"
},
{
"path": "marker/providers/document.py",
"chars": 2688,
"preview": "import base64\nimport os\nimport re\nimport tempfile\nfrom io import BytesIO\n\nfrom PIL import Image\nfrom marker.logger impor"
},
{
"path": "marker/providers/epub.py",
"chars": 2933,
"preview": "import base64\nimport os\nimport tempfile\n\nfrom bs4 import BeautifulSoup\n\nfrom marker.providers.pdf import PdfProvider\n\ncs"
},
{
"path": "marker/providers/html.py",
"chars": 982,
"preview": "import os\nimport tempfile\n\nfrom marker.providers.pdf import PdfProvider\n\n\nclass HTMLProvider(PdfProvider):\n def __ini"
},
{
"path": "marker/providers/image.py",
"chars": 1704,
"preview": "from typing import List, Annotated\nfrom PIL import Image\n\nfrom marker.providers import ProviderPageLines, BaseProvider\nf"
},
{
"path": "marker/providers/pdf.py",
"chars": 16251,
"preview": "import contextlib\nimport ctypes\nimport logging\nimport re\nfrom typing import Annotated, Dict, List, Optional, Set\n\nimport"
},
{
"path": "marker/providers/powerpoint.py",
"chars": 8474,
"preview": "import base64\nimport os\nimport tempfile\nimport traceback\n\nfrom marker.logger import get_logger\nfrom marker.providers.pdf"
},
{
"path": "marker/providers/registry.py",
"chars": 2492,
"preview": "import filetype\nimport filetype.match as file_match\nfrom bs4 import BeautifulSoup\nfrom filetype.types import archive, do"
},
{
"path": "marker/providers/spreadsheet.py",
"chars": 3591,
"preview": "import os\nimport tempfile\n\nfrom marker.providers.pdf import PdfProvider\n\ncss = '''\n@page {\n size: A4 landscape;\n m"
},
{
"path": "marker/providers/utils.py",
"chars": 258,
"preview": "def alphanum_ratio(text):\n text = text.replace(\" \", \"\")\n text = text.replace(\"\\n\", \"\")\n alphanumeric_count = su"
},
{
"path": "marker/renderers/__init__.py",
"chars": 5390,
"preview": "import base64\nimport io\nimport re\nfrom collections import Counter\nfrom typing import Annotated, Optional, Tuple, Literal"
},
{
"path": "marker/renderers/chunk.py",
"chars": 3221,
"preview": "import html\nfrom typing import List, Dict\n\nfrom bs4 import BeautifulSoup\nfrom pydantic import BaseModel\n\nfrom marker.ren"
},
{
"path": "marker/renderers/extraction.py",
"chars": 641,
"preview": "from pydantic import BaseModel\n\nfrom marker.extractors.document import DocumentExtractionSchema\nfrom marker.renderers im"
},
{
"path": "marker/renderers/html.py",
"chars": 5485,
"preview": "import textwrap\n\nfrom PIL import Image\nfrom typing import Annotated, Tuple\n\nfrom bs4 import BeautifulSoup, MarkupResembl"
},
{
"path": "marker/renderers/json.py",
"chars": 3080,
"preview": "from typing import Annotated, Dict, List, Tuple\n\nfrom pydantic import BaseModel\n\nfrom marker.renderers import BaseRender"
},
{
"path": "marker/renderers/markdown.py",
"chars": 10993,
"preview": "import re\nfrom collections import defaultdict\nfrom typing import Annotated, Tuple\n\nimport regex\nimport six\nfrom bs4 impo"
},
{
"path": "marker/renderers/ocr_json.py",
"chars": 4279,
"preview": "from typing import Annotated, List, Tuple\n\nfrom pydantic import BaseModel\n\nfrom marker.renderers import BaseRenderer\nfro"
},
{
"path": "marker/schema/__init__.py",
"chars": 731,
"preview": "from enum import auto, Enum\n\n\nclass BlockTypes(str, Enum):\n Line = auto()\n Span = auto()\n Char = auto()\n Fig"
},
{
"path": "marker/schema/blocks/__init__.py",
"chars": 1077,
"preview": "from __future__ import annotations\n\nfrom marker.schema.blocks.base import Block, BlockId, BlockOutput\nfrom marker.schema"
},
{
"path": "marker/schema/blocks/base.py",
"chars": 11732,
"preview": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Dict, List, Literal, Optional, Sequence, Tuple\n\nfr"
},
{
"path": "marker/schema/blocks/basetable.py",
"chars": 2180,
"preview": "from typing import List\n\nfrom marker.schema import BlockTypes\nfrom marker.schema.blocks import Block, BlockOutput\nfrom m"
},
{
"path": "marker/schema/blocks/caption.py",
"chars": 722,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Caption(Block):\n block_type: Bloc"
},
{
"path": "marker/schema/blocks/code.py",
"chars": 485,
"preview": "import html\n\nfrom marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Code(Block):\n block_"
},
{
"path": "marker/schema/blocks/complexregion.py",
"chars": 1035,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass ComplexRegion(Block):\n block_type"
},
{
"path": "marker/schema/blocks/equation.py",
"chars": 998,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Equation(Block):\n block_type: Blo"
},
{
"path": "marker/schema/blocks/figure.py",
"chars": 989,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Figure(Block):\n block_type: Block"
},
{
"path": "marker/schema/blocks/footnote.py",
"chars": 701,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Footnote(Block):\n block_type: Blo"
},
{
"path": "marker/schema/blocks/form.py",
"chars": 325,
"preview": "from typing import List\n\nfrom marker.schema import BlockTypes\nfrom marker.schema.blocks.basetable import BaseTable\n\n\ncla"
},
{
"path": "marker/schema/blocks/handwriting.py",
"chars": 589,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Handwriting(Block):\n block_type: "
},
{
"path": "marker/schema/blocks/inlinemath.py",
"chars": 1432,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass InlineMath(Block):\n block_type: B"
},
{
"path": "marker/schema/blocks/listitem.py",
"chars": 1692,
"preview": "import re\n\nfrom marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\ndef replace_bullets(child_block"
},
{
"path": "marker/schema/blocks/pagefooter.py",
"chars": 764,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass PageFooter(Block):\n block_type: s"
},
{
"path": "marker/schema/blocks/pageheader.py",
"chars": 767,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass PageHeader(Block):\n block_type: B"
},
{
"path": "marker/schema/blocks/picture.py",
"chars": 1045,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Picture(Block):\n block_type: Bloc"
},
{
"path": "marker/schema/blocks/reference.py",
"chars": 529,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Reference(Block):\n block_type: Bl"
},
{
"path": "marker/schema/blocks/sectionheader.py",
"chars": 936,
"preview": "from typing import Optional\n\nfrom marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Section"
},
{
"path": "marker/schema/blocks/table.py",
"chars": 265,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks.basetable import BaseTable\n\n\nclass Table(BaseTable):\n "
},
{
"path": "marker/schema/blocks/tablecell.py",
"chars": 1079,
"preview": "from typing import List\n\nfrom marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass TableCell(B"
},
{
"path": "marker/schema/blocks/text.py",
"chars": 1322,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Text(Block):\n block_type: BlockTy"
},
{
"path": "marker/schema/blocks/toc.py",
"chars": 227,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks.basetable import BaseTable\n\n\nclass TableOfContents(BaseTa"
},
{
"path": "marker/schema/document.py",
"chars": 3488,
"preview": "from __future__ import annotations\n\nfrom typing import List, Sequence, Optional\n\nfrom pydantic import BaseModel\n\nfrom ma"
},
{
"path": "marker/schema/groups/__init__.py",
"chars": 296,
"preview": "from marker.schema.blocks.base import Block\nfrom marker.schema.groups.figure import FigureGroup\nfrom marker.schema.group"
},
{
"path": "marker/schema/groups/base.py",
"chars": 69,
"preview": "from marker.schema.blocks import Block\n\n\nclass Group(Block):\n pass"
},
{
"path": "marker/schema/groups/figure.py",
"chars": 580,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.groups.base import Group\n\n\nclass FigureGroup(Group):\n block_t"
},
{
"path": "marker/schema/groups/list.py",
"chars": 876,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.groups.base import Group\n\n\nclass ListGroup(Group):\n block_typ"
},
{
"path": "marker/schema/groups/page.py",
"chars": 13649,
"preview": "from collections import defaultdict\nfrom typing import Any, Dict, List, Optional, Sequence, Tuple, Union\nimport numpy as"
},
{
"path": "marker/schema/groups/picture.py",
"chars": 568,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.groups.base import Group\n\n\nclass PictureGroup(Group):\n block_"
},
{
"path": "marker/schema/groups/table.py",
"chars": 772,
"preview": "from typing import List\n\nfrom marker.schema import BlockTypes\nfrom marker.schema.blocks import BlockOutput\nfrom marker.s"
},
{
"path": "marker/schema/polygon.py",
"chars": 8103,
"preview": "from __future__ import annotations\nimport copy\nfrom typing import List\n\nimport numpy as np\nfrom pydantic import BaseMode"
},
{
"path": "marker/schema/registry.py",
"chars": 2769,
"preview": "from typing import Dict, Type\nfrom importlib import import_module\n\nfrom marker.schema import BlockTypes\nfrom marker.sche"
},
{
"path": "marker/schema/text/__init__.py",
"chars": 82,
"preview": "from marker.schema.text.line import Line\nfrom marker.schema.text.span import Span\n"
},
{
"path": "marker/schema/text/char.py",
"chars": 235,
"preview": "from marker.schema import BlockTypes\nfrom marker.schema.blocks import Block\n\n\nclass Char(Block):\n block_type: BlockTy"
},
{
"path": "marker/schema/text/line.py",
"chars": 4641,
"preview": "import html\nimport re\nfrom typing import Literal, List\n\nimport regex\n\nfrom marker.schema import BlockTypes\nfrom marker.s"
},
{
"path": "marker/schema/text/span.py",
"chars": 3815,
"preview": "import html\nimport re\nfrom typing import List, Literal, Optional\n\nfrom marker.schema import BlockTypes\nfrom marker.schem"
},
{
"path": "marker/scripts/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "marker/scripts/chunk_convert.py",
"chars": 658,
"preview": "import argparse\nimport os\nimport subprocess\nimport pkg_resources\n\n\ndef chunk_convert_cli():\n parser = argparse.Argume"
},
{
"path": "marker/scripts/chunk_convert.sh",
"chars": 1075,
"preview": "#!/bin/bash\n\ntrap 'pkill -P $$' SIGINT\n\n# Check if NUM_DEVICES is set\nif [[ -z \"$NUM_DEVICES\" ]]; then\n echo \"Please "
},
{
"path": "marker/scripts/common.py",
"chars": 5821,
"preview": "import ast\nimport base64\nimport io\nimport re\nimport sys\nfrom typing import Optional\n\nfrom PIL import Image\nimport click\n"
},
{
"path": "marker/scripts/convert.py",
"chars": 6495,
"preview": "import atexit\nimport os\nimport time\n\nimport psutil\nimport torch\n\nfrom marker.utils.batch import get_batch_sizes_worker_c"
},
{
"path": "marker/scripts/convert_single.py",
"chars": 1417,
"preview": "import os\n\nos.environ[\"GRPC_VERBOSITY\"] = \"ERROR\"\nos.environ[\"GLOG_minloglevel\"] = \"2\"\nos.environ[\"PYTORCH_ENABLE_MPS_FA"
},
{
"path": "marker/scripts/extraction_app.py",
"chars": 7413,
"preview": "import json\nimport os\n\nfrom streamlit_ace import st_ace\nfrom pydantic import BaseModel\n\nfrom marker.converters.extractio"
},
{
"path": "marker/scripts/file_to_s3.py",
"chars": 1252,
"preview": "import json\nimport shutil\nimport datetime\nfrom pathlib import Path\nimport boto3\n\nfrom huggingface_hub import snapshot_do"
},
{
"path": "marker/scripts/run_streamlit_app.py",
"chars": 578,
"preview": "import subprocess\nimport os\nimport sys\n\n\ndef streamlit_app_cli(app_name: str = \"streamlit_app.py\"):\n argv = sys.argv["
},
{
"path": "marker/scripts/server.py",
"chars": 4957,
"preview": "import traceback\n\nimport click\nimport os\n\nfrom pydantic import BaseModel, Field\nfrom starlette.responses import HTMLResp"
},
{
"path": "marker/scripts/streamlit_app.py",
"chars": 5189,
"preview": "import os\n\nfrom marker.scripts.common import (\n load_models,\n parse_args,\n img_to_html,\n get_page_image,\n "
},
{
"path": "marker/services/__init__.py",
"chars": 1686,
"preview": "from typing import Optional, List, Annotated\nfrom io import BytesIO\n\nimport PIL\nfrom pydantic import BaseModel\n\nfrom mar"
},
{
"path": "marker/services/azure_openai.py",
"chars": 3842,
"preview": "import json\nimport time\nfrom typing import Annotated, List\n\nimport PIL\nfrom marker.logger import get_logger\nfrom openai "
},
{
"path": "marker/services/claude.py",
"chars": 4548,
"preview": "import json\nimport time\nfrom typing import List, Annotated, T\n\nimport PIL\nfrom PIL import Image\nimport anthropic\nfrom an"
},
{
"path": "marker/services/gemini.py",
"chars": 5083,
"preview": "import json\nimport time\nimport traceback\nfrom io import BytesIO\nfrom typing import List, Annotated\n\nimport PIL\nfrom goog"
},
{
"path": "marker/services/ollama.py",
"chars": 2101,
"preview": "import json\nfrom typing import Annotated, List\n\nimport PIL\nimport requests\nfrom marker.logger import get_logger\nfrom pyd"
},
{
"path": "marker/services/openai.py",
"chars": 4386,
"preview": "import json\nimport time\nfrom typing import Annotated, List\n\nimport openai\nimport PIL\nfrom marker.logger import get_logge"
},
{
"path": "marker/services/vertex.py",
"chars": 1074,
"preview": "from typing import Annotated\n\nfrom google import genai\n\nfrom marker.services.gemini import BaseGeminiService\n\nclass Goog"
},
{
"path": "marker/settings.py",
"chars": 1597,
"preview": "from typing import Optional\n\nfrom dotenv import find_dotenv\nfrom pydantic import computed_field\nfrom pydantic_settings i"
},
{
"path": "marker/util.py",
"chars": 7520,
"preview": "import inspect\nimport os\nfrom importlib import import_module\nfrom typing import List, Annotated\nimport re\n\nimport numpy "
},
{
"path": "marker/utils/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "marker/utils/batch.py",
"chars": 551,
"preview": "from marker.utils.gpu import GPUManager\n\n\ndef get_batch_sizes_worker_counts(gpu_manager: GPUManager, peak_worker_vram: i"
},
{
"path": "marker/utils/gpu.py",
"chars": 4009,
"preview": "import os\nimport subprocess\nimport torch\n\nfrom marker.logger import get_logger\nfrom marker.settings import settings\n\nlog"
},
{
"path": "marker/utils/image.py",
"chars": 1275,
"preview": "from PIL import Image\nimport numpy as np\nimport cv2\nfrom typing import List, Optional\n\ndef is_blank_image(image: Image.I"
},
{
"path": "marker_app.py",
"chars": 114,
"preview": "from marker.scripts.run_streamlit_app import streamlit_app_cli\n\nif __name__ == \"__main__\":\n streamlit_app_cli()"
}
]
// ... and 40 more files (download for full content)
About this extraction
This page contains the full source code of the datalab-to/marker GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 240 files (10.0 MB), approximately 2.6M tokens, and a symbol index with 866 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.