Showing preview only (518K chars total). Download the full file or copy to clipboard to get everything.
Repository: QuivrHQ/MegaParse
Branch: main
Commit: ba9a24aec950
Files: 123
Total size: 28.4 MB
Directory structure:
gitextract_ylqgqesz/
├── .aws/
│ └── task_definition.json
├── .flake8
├── .gitattributes
├── .github/
│ └── workflows/
│ ├── CI.yml
│ ├── build-and-deploy.yml
│ ├── build-gpu.yml
│ ├── release-please.yml
│ └── test-build-docker.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── .release-please-manifest.json
├── .vscode/
│ ├── extensions.json
│ ├── launch.json
│ └── settings.json
├── CHANGELOG.md
├── Dockerfile
├── Dockerfile.gpu
├── LICENSE
├── Makefile
├── Pipfile
├── README.md
├── benchmark/
│ ├── process_single_doc.py
│ └── test_quality_sim.py
├── docker-compose.dev.yml
├── docker-compose.yml
├── docs/
│ └── archive.txt
├── evaluations/
│ └── script.py
├── libs/
│ ├── megaparse/
│ │ ├── .python-version
│ │ ├── CHANGELOG.md
│ │ ├── README.md
│ │ ├── bench.md
│ │ ├── examples/
│ │ │ ├── parse_file_fast.py
│ │ │ ├── parse_file_mp.py
│ │ │ └── parse_file_unstructured.py
│ │ ├── program.prof
│ │ ├── pyproject.toml
│ │ ├── src/
│ │ │ └── megaparse/
│ │ │ ├── __init__.py
│ │ │ ├── api/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ ├── exceptions/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── megaparse_exceptions.py
│ │ │ │ └── models/
│ │ │ │ ├── __init__.py
│ │ │ │ └── base.py
│ │ │ ├── configs/
│ │ │ │ └── auto.py
│ │ │ ├── examples/
│ │ │ │ ├── parse_file.py
│ │ │ │ └── parsing_process.py
│ │ │ ├── exceptions/
│ │ │ │ └── base.py
│ │ │ ├── formatter/
│ │ │ │ ├── base.py
│ │ │ │ ├── structured_formatter/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── custom_structured_formatter.py
│ │ │ │ └── table_formatter/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── llm_table_formatter.py
│ │ │ │ └── vision_table_formatter.py
│ │ │ ├── layout_detection/
│ │ │ │ ├── layout_detector.py
│ │ │ │ ├── models/
│ │ │ │ │ └── yolov10s-doclaynet.onnx
│ │ │ │ └── output.py
│ │ │ ├── megaparse.py
│ │ │ ├── models/
│ │ │ │ └── page.py
│ │ │ ├── parser/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── builder.py
│ │ │ │ ├── doctr_parser.py
│ │ │ │ ├── entity.py
│ │ │ │ ├── llama.py
│ │ │ │ ├── megaparse_vision.py
│ │ │ │ └── unstructured_parser.py
│ │ │ ├── predictor/
│ │ │ │ └── layout_predictor.py
│ │ │ └── utils/
│ │ │ ├── extract_metadata.py
│ │ │ ├── onnx.py
│ │ │ └── strategy.py
│ │ └── tests/
│ │ ├── __init__.py
│ │ ├── certs/
│ │ │ ├── client-cert.pem
│ │ │ └── client-key.pem
│ │ ├── conftest.py
│ │ ├── data/
│ │ │ └── grt_example/
│ │ │ └── MegaFake_report.md
│ │ ├── pdf/
│ │ │ ├── test_detect_ocr.py
│ │ │ ├── test_pdf_processing.py
│ │ │ └── test_pdfium_parser.py
│ │ ├── supported_docs/
│ │ │ ├── Sway.epub
│ │ │ ├── file-sample_500kB.odt
│ │ │ ├── file_example_XLSX_50.xlsx
│ │ │ ├── file_example_XLS_50.xls
│ │ │ ├── sample.csv
│ │ │ ├── sample.docx
│ │ │ ├── sample.markdown
│ │ │ ├── sample.md
│ │ │ ├── sample.otf
│ │ │ ├── sample.pptx
│ │ │ ├── sample.txt
│ │ │ ├── sample.xml
│ │ │ └── sample_complexe.html
│ │ ├── test_endpoints.py
│ │ ├── test_import.py
│ │ └── test_parsers.py
│ └── megaparse_sdk/
│ ├── CHANGELOG.md
│ ├── README.md
│ ├── __init__.py
│ ├── examples/
│ │ └── usage_example.py
│ ├── megaparse_sdk/
│ │ ├── __init__.py
│ │ ├── client.py
│ │ ├── config.py
│ │ ├── endpoints/
│ │ │ ├── __init__.py
│ │ │ ├── file_upload.py
│ │ │ └── url_upload.py
│ │ ├── schema/
│ │ │ ├── __init__.py
│ │ │ ├── document.py
│ │ │ ├── extensions.py
│ │ │ ├── languages.py
│ │ │ ├── mp_exceptions.py
│ │ │ ├── mp_inputs.py
│ │ │ ├── mp_outputs.py
│ │ │ ├── parser_config.py
│ │ │ └── supported_models.py
│ │ └── utils/
│ │ └── load_ssl.py
│ ├── pyproject.toml
│ └── tests/
│ ├── README.md
│ ├── certs/
│ │ ├── client-cert.pem
│ │ ├── client-key.pem
│ │ └── rootCA.pem
│ └── test_nats_client.py
├── pyproject.toml
└── release-please-config.json
================================================
FILE CONTENTS
================================================
================================================
FILE: .aws/task_definition.json
================================================
{
"taskDefinitionArn": "arn:aws:ecs:eu-west-1:253053805092:task-definition/megaparse-task:2",
"containerDefinitions": [
{
"name": "megaparse",
"image": "quay.io/unstructured-io/unstructured-api:latest",
"cpu": 0,
"portMappings": [
{
"containerPort": 8000,
"hostPort": 8000,
"protocol": "tcp"
}
],
"essential": true,
"environment": [
{
"name": "UNSTRUCTURED_HI_RES_MODEL_NAME",
"value": "detectron2_onnx"
},
{
"name": "UNSTRUCTURED_PARALLEL_MODE_ENABLED",
"value": "false"
}
],
"mountPoints": [],
"volumesFrom": [],
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": "/ecs/megaparse",
"awslogs-region": "eu-west-1",
"awslogs-stream-prefix": "ecs"
}
},
"systemControls": []
}
],
"family": "megaparse-task",
"executionRoleArn": "arn:aws:iam::253053805092:role/megaparse-ecsTaskExecutionRole",
"networkMode": "awsvpc",
"revision": 2,
"volumes": [],
"status": "ACTIVE",
"requiresAttributes": [
{
"name": "com.amazonaws.ecs.capability.logging-driver.awslogs"
},
{
"name": "ecs.capability.execution-role-awslogs"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.19"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.18"
},
{
"name": "ecs.capability.task-eni"
}
],
"placementConstraints": [],
"compatibilities": [
"EC2",
"FARGATE"
],
"requiresCompatibilities": [
"FARGATE"
],
"cpu": "2048",
"memory": "8192",
"tags": []
}
================================================
FILE: .flake8
================================================
[flake8]
; Minimal configuration for Flake8 to work with Black.
; NOTE(review): E501 (line too long) is in the ignore list below, so this
; limit is presumably not enforced by flake8 itself - confirm this is intended.
max-line-length = 100
; Suppressed checks (per the pycodestyle / pyflakes / flake8-black code lists):
;   E101, E111, E112       indentation checks
;   E221, E222             multiple spaces around an operator
;   E501                   line too long
;   E711, E712             comparisons to None / True / False
;   W503, W504             line break before / after a binary operator
;   F401                   module imported but unused
;   BLK100                 "Black would make changes" (flake8-black plugin)
ignore = E101,E111,E112,E221,E222,E501,E711,E712,W503,W504,F401,BLK100
================================================
FILE: .gitattributes
================================================
*.ipynb linguist-vendored
*.html linguist-vendored
================================================
FILE: .github/workflows/CI.yml
================================================
# Runs the megaparse-sdk test suite against a local NATS server on every
# pull request (and on manual dispatch) for each Python version in the matrix.
name: Run tests

on:
  pull_request:
  workflow_dispatch:

env:
  # Token passed to nats-server via --auth in the steps below.
  NATS_TOKEN: test

jobs:
  test:
    name: Run tests on Python ${{ matrix.python-version }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11", "3.12"]
    steps:
      - name: 👀 Checkout code
        # v2 runs on a retired Node runtime; v4 is the maintained major version.
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Setup apt cache
        # actions/cache v1/v2 have been retired by GitHub; v4 is required.
        # NOTE(review): hashFiles() only matches files inside GITHUB_WORKSPACE,
        # so hashing /etc/apt/sources.list presumably yields an empty string and
        # the key is effectively constant - confirm whether that is acceptable.
        uses: actions/cache@v4
        with:
          path: /var/cache/apt/archives
          key: ${{ runner.os }}-apt-${{ hashFiles('/etc/apt/sources.list') }}

      - name: 😭 Install system dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y \
            netcat-traditional \
            unzip \
            libgeos-dev \
            libcurl4-openssl-dev \
            libssl-dev \
            binutils \
            curl \
            git \
            autoconf \
            automake \
            build-essential \
            libtool \
            gcc \
            libmagic-dev \
            poppler-utils \
            tesseract-ocr \
            libreoffice \
            libpq-dev \
            pandoc

      - name: 🔽 Install the latest version of rye
        uses: eifinger/setup-rye@v4
        with:
          enable-cache: true

      - name: 📌 Pin Python version
        run: rye pin ${{ matrix.python-version }}

      - name: 🔽 Download and Install NATS Server
        run: |
          curl -L https://github.com/nats-io/nats-server/releases/download/v2.10.22/nats-server-v2.10.22-linux-amd64.zip -o nats-server.zip
          unzip nats-server.zip -d nats-server && sudo cp nats-server/nats-server-v2.10.22-linux-amd64/nats-server /usr/bin

      - name: 🛠️ Set up NATS arguments
        # Starts the server in the background; output goes to nats.log so the
        # verification step can print it on failure.
        run: |
          nohup nats-server \
            --addr 0.0.0.0 \
            --port 4222 \
            --auth "$NATS_TOKEN" > nats.log 2>&1 &

      - name: 🔍 Verify NATS Server is Running
        run: |
          # Poll for up to ~10s instead of a single fixed sleep, so a slow
          # runner does not produce a false "failed to start" result.
          for attempt in $(seq 1 10); do
            if nc -zv localhost 4222; then
              echo "✅ NATS Server is running on port 4222."
              exit 0
            fi
            sleep 1
          done
          echo "❌ Failed to start NATS Server."
          cat nats.log
          exit 1

      - name: 🔨 Sync dependencies
        run: |
          UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock

      - name: 🚀 Run tests
        run: |
          rye test -p megaparse-sdk
================================================
FILE: .github/workflows/build-and-deploy.yml
================================================
# Builds the image from ./Dockerfile and pushes it to Amazon ECR, tagged with
# both the commit SHA and `latest`.
name: Build Docker image and push ECR
# Triggered by pushes of tags matching "v*" and by pushes to main.
on:
push:
tags:
- "v*"
branches: [main]
# NOTE(review): of the variables below, only ECR_REPOSITORY is referenced by
# the steps in this workflow; AWS_REGION, ECS_CLUSTER, ECS_TASK_DEFINITION and
# CONTAINER_NAME are not used here - confirm whether they can be removed.
env:
AWS_REGION: eu-west-1
ECR_REPOSITORY: quivrhq/megaparse
ECS_CLUSTER: megaparse
ECS_TASK_DEFINITION: .aws/task_definition.json
CONTAINER_NAME: megaparse
permissions:
contents: read
jobs:
deploy:
name: build docker
runs-on: ubuntu-latest
environment: production
# Exposes the fully qualified SHA-tagged image reference written to
# $GITHUB_OUTPUT by the build-image step.
outputs:
imageoutput: ${{ steps.build-image.outputs.imageoutput }}
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
# Hard-coded rather than taken from AWS_REGION; presumably because the login
# below targets ECR Public (registry-type: public) - confirm.
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v1
with:
registry-type: public
- name: Build, tag, and push image to Amazon ECR
id: build-image
env:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
IMAGE_TAG: ${{ github.sha }}
run: |
# Build a docker container and push it to ECR
docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
# Tag the image as 'latest' and push
docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest
docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest
echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT
================================================
FILE: .github/workflows/build-gpu.yml
================================================
# Builds the image from Dockerfile.gpu and pushes it to Amazon ECR, tagged
# with both the commit SHA and `latest`.
name: Build docker GPU and push ECR
# Triggered by pushes of tags matching "v*" and by pushes to main.
on:
push:
tags:
- "v*"
branches: [main]
# NOTE(review): of the variables below, only ECR_REPOSITORY is referenced by
# the steps in this workflow; AWS_REGION, ECS_CLUSTER, ECS_TASK_DEFINITION and
# CONTAINER_NAME are not used here - confirm whether they can be removed.
env:
AWS_REGION: eu-west-1
ECR_REPOSITORY: quivrhq/megaparse-gpu
ECS_CLUSTER: megaparse
ECS_TASK_DEFINITION: .aws/task_definition.json
CONTAINER_NAME: megaparse
permissions:
contents: read
jobs:
deploy:
name: Build docker-gpu
# Runs on the runner group named "big-boy-gpu" instead of a GitHub-hosted image.
runs-on:
group: big-boy-gpu
environment: production
# Exposes the fully qualified SHA-tagged image reference written to
# $GITHUB_OUTPUT by the build-image step.
outputs:
imageoutput: ${{ steps.build-image.outputs.imageoutput }}
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
# Hard-coded rather than taken from AWS_REGION; presumably because the login
# below targets ECR Public (registry-type: public) - confirm.
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v1
with:
registry-type: public
- name: Build, tag, and push image to Amazon ECR
id: build-image
env:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
IMAGE_TAG: ${{ github.sha }}
run: |
# Build a docker container and push it to ECR
docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f Dockerfile.gpu .
docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
# Tag the image as 'latest' and push
docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest
docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest
echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT
================================================
FILE: .github/workflows/release-please.yml
================================================
# Runs release-please on every push to main, then builds and publishes the
# megaparse and/or megaparse_sdk package when the corresponding release flag
# reported by the release step is 'true'.
on:
push:
branches:
- main
permissions:
contents: write
pull-requests: write
name: release-please
jobs:
release-please:
runs-on: ubuntu-latest
# Per-package release flags, read from the release step's outputs using the
# "<package path>--release_created" keys.
outputs:
release_created: ${{ steps.release.outputs['libs/megaparse--release_created'] }}
release_created_sdk: ${{ steps.release.outputs['libs/megaparse_sdk--release_created'] }}
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
fetch-depth: 0 # Fetch all history for tags and releases
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: "3.11"
- name: Run release-please
id: release
uses: google-github-actions/release-please-action@v4
with:
token: ${{ secrets.RELEASE_PLEASE_TOKEN }}
# Publishes libs/megaparse; gated on the release_created output above.
# NOTE(review): this job uses eifinger/setup-rye@v2 while CI.yml uses @v4 -
# confirm whether the versions should be aligned.
deploy-megaparse:
if: needs.release-please.outputs.release_created == 'true'
needs: release-please
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install Rye
uses: eifinger/setup-rye@v2
with:
enable-cache: true
# Sync runs from the repository root here, whereas deploy-sdk below syncs
# from inside libs/megaparse_sdk.
- name: Rye Sync
run: rye sync --no-lock
- name: Rye Build
run: cd libs/megaparse && rye build
- name: Rye Publish
run: cd libs/megaparse && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes
# Publishes libs/megaparse_sdk; gated on the release_created_sdk output above.
deploy-sdk:
if: needs.release-please.outputs.release_created_sdk == 'true'
needs: release-please
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install Rye
uses: eifinger/setup-rye@v2
with:
enable-cache: true
- name: Rye Sync
run: cd libs/megaparse_sdk && rye sync --no-lock
- name: Rye Build
run: cd libs/megaparse_sdk && rye build
- name: Rye Publish
run: cd libs/megaparse_sdk && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes
================================================
FILE: .github/workflows/test-build-docker.yml
================================================
# Build-only smoke test: on pull requests targeting main, builds both
# Dockerfile and Dockerfile.gpu without pushing the resulting images.
on:
pull_request:
branches:
- main
name: Test build docker
jobs:
build-docker:
runs-on: ubuntu-latest
# One job per Dockerfile listed in the matrix.
strategy:
matrix:
dockerfile: [Dockerfile, Dockerfile.gpu]
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
with:
platforms: all
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build Docker image with caching
uses: docker/build-push-action@v4
with:
context: .
file: ${{ matrix.dockerfile }}
# The image is built but never pushed; the tag only labels the local result
# and reuses the Dockerfile name as the tag suffix.
push: false
tags: quivrhq/megaparse:${{ matrix.dockerfile }}
# Layer cache is read from and written to the GitHub Actions cache backend.
cache-from: type=gha
cache-to: type=gha,mode=max
================================================
FILE: .gitignore
================================================
# Working directories at the repository root (leading slash anchors to root).
/output
/input
# Local environment file.
.env
# Python bytecode, build and packaging artifacts.
__pycache__/
dist/**
megaparse.egg-info/
*.pyc
build/*
# Virtual environments.
ENV
venv
*/evaluations/*
*/cdp/*
*.pkl
# NOTE(review): megaparse/tests/output_tests/ does not appear in the current
# directory layout (tests live under libs/megaparse/tests/) - this exception
# may be stale; confirm.
!megaparse/tests/output_tests/MegaFake_report.md
*.DS_Store
.tool-versions
# NOTE(review): megaparse/sdk/examples/ likewise does not appear in the
# current layout - confirm whether this pattern is still needed.
megaparse/sdk/examples/only_pdfs/*
# Profiling output directories, at any depth.
**/profile/
**/prof/
.ropeproject/
# Benchmark output directories.
benchmark/hi_res/*
benchmark/auto/*
================================================
FILE: .pre-commit-config.yaml
================================================
# pre-commit hook configuration: generic file hygiene checks, pre-commit's own
# manifest validation, Ruff (lint + format) and mypy.
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
# Rejects newly added files larger than the --maxkb value (in kB).
- id: check-added-large-files
args: ["--maxkb=5000"]
- id: check-toml
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- id: check-merge-conflict
- id: detect-private-key
- id: check-case-conflict
- repo: https://github.com/pre-commit/pre-commit
rev: v3.6.2
hooks:
- id: validate_manifest
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.5.1
hooks:
# Run the linter.
- id: ruff
args: [--fix]
additional_dependencies: []
# Run the formatter.
- id: ruff-format
additional_dependencies: []
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.10.1
hooks:
# types-aiofiles is installed into the hook's environment alongside mypy.
- id: mypy
name: mypy
additional_dependencies: ["types-aiofiles"]
================================================
FILE: .python-version
================================================
3.11.9
================================================
FILE: .release-please-manifest.json
================================================
{
"libs/megaparse": "0.0.55",
"libs/megaparse_sdk": "0.1.12"
}
================================================
FILE: .vscode/extensions.json
================================================
{
"recommendations": [
"dbaeumer.vscode-eslint",
"charliermarsh.ruff",
"knisterpeter.vscode-github",
"github.vscode-pull-request-github",
"ms-python.python",
"ms-python.vscode-pylance",
"ms-python.debugpy"
]
}
================================================
FILE: .vscode/launch.json
================================================
{
"version": "0.2.0",
"configurations": [
{
"name": "Python: Remote Attach",
"type": "python",
"request": "attach",
"connect": {
"host": "localhost",
"port": 5678
},
"pathMappings": [
{
"localRoot": "${workspaceFolder}/backend",
"remoteRoot": "."
}
],
"justMyCode": true
},
{
"name": "Python: Debug Test Script",
"type": "python",
"request": "launch",
"program": "${workspaceFolder}/backend/test_process_file_and_notify.py",
"console": "integratedTerminal",
"justMyCode": false
},
{
"name": "Python: Debug",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": false,
"env": {
"PYTHONPATH": "${workspaceFolder}/backend:${env:PYTHONPATH}"
},
"envFile": "${workspaceFolder}/.env"
}
]
}
================================================
FILE: .vscode/settings.json
================================================
{
"editor.formatOnSave": true,
"editor.formatOnSaveMode": "file",
"files.exclude": {
"**/__pycache__": true,
"**/.benchmarks/": true,
"**/.cache/": true,
"**/.pytest_cache/": true,
"**/.next/": true,
"**/build/": true,
"**/.docusaurus/": true,
"**/node_modules/": true
},
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.fixAll": "explicit"
}
},
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.autoTestDiscoverOnSaveEnabled": true,
"python.analysis.autoImportCompletions": true,
"python.analysis.typeCheckingMode": "basic",
"python.analysis.diagnosticSeverityOverrides": {
"reportMissingImports": "error",
"reportUnusedImport": "warning",
"reportGeneralTypeIssues": "warning"
},
"makefile.configureOnOpen": false
}
================================================
FILE: CHANGELOG.md
================================================
# Changelog
## [0.0.46](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.45...megaparse-v0.0.46) (2024-11-21)
### Features
* refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334))
## [0.0.45](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.44...megaparse-v0.0.45) (2024-11-19)
### Bug Fixes
* small fixes from backlogs ([#128](https://github.com/QuivrHQ/MegaParse/issues/128)) ([954554c](https://github.com/QuivrHQ/MegaParse/commit/954554c5abaa7b0513e9ff3f6bbaff393d36cf03))
## [0.0.44](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.43...megaparse-v0.0.44) (2024-11-18)
### Bug Fixes
* fixing the wrong passing of arguments to the parse_file endpoint ([#123](https://github.com/QuivrHQ/MegaParse/issues/123)) ([9105672](https://github.com/QuivrHQ/MegaParse/commit/9105672abc0942f26785e494053112d486e8d2d9))
## [0.0.43](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.42...megaparse-v0.0.43) (2024-11-14)
### Features
* increase the robustness of megaparse ([#121](https://github.com/QuivrHQ/MegaParse/issues/121)) ([d21d8bb](https://github.com/QuivrHQ/MegaParse/commit/d21d8bb77bd8e687b1a951db6b81653e4e47a8bb))
### Bug Fixes
* uvicorn version ([#127](https://github.com/QuivrHQ/MegaParse/issues/127)) ([ceaba3d](https://github.com/QuivrHQ/MegaParse/commit/ceaba3df2951be27e6a4835e5784917a62867896))
* version requirements ([#126](https://github.com/QuivrHQ/MegaParse/issues/126)) ([a10d502](https://github.com/QuivrHQ/MegaParse/commit/a10d502f1b3576690cebe33b656d2480a24defe3))
## [0.0.42](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.41...megaparse-v0.0.42) (2024-11-08)
### Features
* **sdk:** new version ([e377cd6](https://github.com/QuivrHQ/MegaParse/commit/e377cd6df98b3ea9265788a4d907b43bde796196))
## [0.0.41](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.40...megaparse-v0.0.41) (2024-11-08)
### Bug Fixes
* add megaparse url env variable ([#118](https://github.com/QuivrHQ/MegaParse/issues/118)) ([132c2eb](https://github.com/QuivrHQ/MegaParse/commit/132c2ebd13177fd116c4e710a4b1c864a9fa04bb))
## [0.0.40](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.39...megaparse-v0.0.40) (2024-11-08)
### Bug Fixes
* sdk version ([#116](https://github.com/QuivrHQ/MegaParse/issues/116)) ([8bfeb4a](https://github.com/QuivrHQ/MegaParse/commit/8bfeb4a52326a5f645d3ed20e113153dc19bf012))
## [0.0.39](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.38...megaparse-v0.0.39) (2024-11-08)
### Bug Fixes
* add_logs ([#114](https://github.com/QuivrHQ/MegaParse/issues/114)) ([63c9236](https://github.com/QuivrHQ/MegaParse/commit/63c9236590016ee4c210174e746e96ff2b654480))
## [0.0.38](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.37...megaparse-v0.0.38) (2024-11-07)
### Bug Fixes
* env roots, imports root ([#112](https://github.com/QuivrHQ/MegaParse/issues/112)) ([a04230d](https://github.com/QuivrHQ/MegaParse/commit/a04230dc2de9e0bb0bde39ab66b2208f80743922))
## [0.0.37](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.36...megaparse-v0.0.37) (2024-11-07)
### Features
* bump megaparse-sdk version to 0.1.1 ([ed3fdfb](https://github.com/QuivrHQ/MegaParse/commit/ed3fdfb10498c95d4f9a510df3a2913e0dfc3c23))
## [0.0.36](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.35...megaparse-v0.0.36) (2024-11-07)
### Features
* **readme:** update ([9d571b7](https://github.com/QuivrHQ/MegaParse/commit/9d571b7c71db610e7a0b08045ad98994ecf71baa))
## [0.0.35](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.34...megaparse-v0.0.35) (2024-11-07)
### Bug Fixes
* unnecessary dep and readme ([#107](https://github.com/QuivrHQ/MegaParse/issues/107)) ([b80aaa3](https://github.com/QuivrHQ/MegaParse/commit/b80aaa3a894b2bd2c7d7f518919c41af5c99219f))
## [0.0.34](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.33...megaparse-v0.0.34) (2024-11-07)
### Features
* megaparse-sdk-cherry ([#105](https://github.com/QuivrHQ/MegaParse/issues/105)) ([ad44aa3](https://github.com/QuivrHQ/MegaParse/commit/ad44aa34999596e156c78f91adab97bce7ceeb0e))
## [0.0.33](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.32...megaparse-v0.0.33) (2024-11-01)
### Bug Fixes
* readme ([#99](https://github.com/QuivrHQ/MegaParse/issues/99)) ([b3b80a3](https://github.com/QuivrHQ/MegaParse/commit/b3b80a3a599bbd4bec8ed79bb9ef44c8c7c92789))
## [0.0.32](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.31...megaparse-v0.0.32) (2024-11-01)
### Features
* **api:** megaparse under api ([#93](https://github.com/QuivrHQ/MegaParse/issues/93)) ([2edf44b](https://github.com/QuivrHQ/MegaParse/commit/2edf44bd8c09ac7127db74206e463ebe29c68998))
### Bug Fixes
* api call error & tests ([#98](https://github.com/QuivrHQ/MegaParse/issues/98)) ([6bf1ce8](https://github.com/QuivrHQ/MegaParse/commit/6bf1ce8c6ed0e4f1e81577973a0fc71f61b10776))
## [0.0.31](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.30...megaparse-v0.0.31) (2024-08-20)
### Features
* **pytorch:** cpu only removed ([#88](https://github.com/QuivrHQ/MegaParse/issues/88)) ([6b2fcfa](https://github.com/QuivrHQ/MegaParse/commit/6b2fcfa4413b8a72d398aab57f277dd28ab69c2f))
## [0.0.30](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.29...megaparse-v0.0.30) (2024-08-20)
### Features
* **pytorch:** cpu only optional ([#86](https://github.com/QuivrHQ/MegaParse/issues/86)) ([e5d8806](https://github.com/QuivrHQ/MegaParse/commit/e5d8806ee6182de250352ce65ac6cd57c1093494))
## [0.0.29](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.28...megaparse-v0.0.29) (2024-08-18)
### Bug Fixes
* **building:** version not working ([#83](https://github.com/QuivrHQ/MegaParse/issues/83)) ([c5e73f6](https://github.com/QuivrHQ/MegaParse/commit/c5e73f6c821424ef277ddd15ddb5b2df48ff7ab2))
## [0.0.28](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.27...megaparse-v0.0.28) (2024-08-16)
### Features
* **rye:** added package manager ([#81](https://github.com/QuivrHQ/MegaParse/issues/81)) ([a3a50a3](https://github.com/QuivrHQ/MegaParse/commit/a3a50a3f27d3d9b4d6de4f3415472f8e52710656))
## [0.0.27](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.26...megaparse-v0.0.27) (2024-08-16)
### Features
* **unstructured:** increased version ([#78](https://github.com/QuivrHQ/MegaParse/issues/78)) ([eb49cf5](https://github.com/QuivrHQ/MegaParse/commit/eb49cf5e79cd7a38c8212b315a4b64860c35a7b7))
## [0.0.26](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.25...megaparse-v0.0.26) (2024-08-16)
### Bug Fixes
* **pycrypto:** being used by an old version of pdfplumber ([#76](https://github.com/QuivrHQ/MegaParse/issues/76)) ([d28f88c](https://github.com/QuivrHQ/MegaParse/commit/d28f88ceb2a722b15c84738f395b3ff4c818a365))
## [0.0.25](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.24...megaparse-v0.0.25) (2024-08-16)
### Features
* **rye:** implemented ([#74](https://github.com/QuivrHQ/MegaParse/issues/74)) ([1e9ad8e](https://github.com/QuivrHQ/MegaParse/commit/1e9ad8e0000f28c709d915219fe62c0dbe7fa812))
## [0.0.24](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.23...megaparse-v0.0.24) (2024-07-30)
### Features
* async load ([#71](https://github.com/QuivrHQ/MegaParse/issues/71)) ([fbc3e1b](https://github.com/QuivrHQ/MegaParse/commit/fbc3e1b5f504eee9757e15592169ddad9b069f03))
## [0.0.23](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.22...megaparse-v0.0.23) (2024-07-30)
### Features
* megaparse 0.0.22 ([071fd4d](https://github.com/QuivrHQ/MegaParse/commit/071fd4da2e8f0abb58fc66c3cdd87c4ee5cda4d6))
## 0.0.20 (2024-07-10)
## What's Changed
* add: resolve multiple page problem on llama parse by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/61
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.19...v0.0.20
## 0.0.19 (2024-06-28)
## What's Changed
* add: choose unstructured strategy by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/57
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.18...v0.0.19
## 0.0.18 (2024-06-28)
## What's Changed
* fix: add __init__.py by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/54
* fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/56
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.17...v0.0.18
## 0.0.17 (2024-06-27)
## What's Changed
* markdown by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/48
* fix:Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/49
* fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/50
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.16...v0.0.17
## 0.0.16 (2024-06-27)
## What's Changed
* Fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/47
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.15...v0.0.16
## 0.0.15 (2024-06-26)
## What's Changed
* add: llm megaparser by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/42
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.14...v0.0.15
## 0.0.14 (2024-06-24)
## What's Changed
* fix: remove nest asycio by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/40
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.13...v0.0.14
## 0.0.13 (2024-06-24)
## What's Changed
* fix: use aload_data by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/38
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.12...v0.0.13
## 0.0.12 (2024-06-18)
## What's Changed
* fix:delete markdownify dependency by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/33
* fix: fake fix README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/34
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.11...v0.0.12
## 0.0.11 (2024-06-17)
## What's Changed
* Fix OpenAI key error. Add docstrings. Polish code by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/24
* Fix DOCX reader. Add input tests by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/25
* add: xlsx convertor by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/29
* add: convert_tab by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/31
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.10...v0.0.11
## 0.0.10 (2024-06-04)
## What's Changed
* Change from LiteralString to Literal (typing) by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/21
* chore: Add Dockerfile and Makefile for project setup by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/23
## New Contributors
* @dSupertramp made their first contribution in https://github.com/QuivrHQ/MegaParse/pull/21
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.9...v0.0.10
## 0.0.9 (2024-06-04)
## What's Changed
* chore: Update README.md to include optional use of LlamaParse for improved results by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/19
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.8...v0.0.9
## 0.0.8 (2024-06-04)
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.7...v0.0.8
## 0.0.7 (2024-06-03)
## What's Changed
* feat: Update benchmark results in README.md by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/15
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.6...v0.0.7
## 0.0.6 (2024-06-03)
## What's Changed
* add: gpt cleaner for header and footer by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/13
## New Contributors
* @chloedia made their first contribution in https://github.com/QuivrHQ/MegaParse/pull/13
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.5...v0.0.6
## 0.0.5 (2024-06-02)
## What's Changed
* feat: Add instructions for installing poppler and tesseract by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/10
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.4...v0.0.5
## 0.0.4 (2024-06-02)
## What's Changed
* add: baseline evaluation by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/7
* Add support for Unstructured Parser, improve Table and Image Parsing, and add TOC and Hyperlinks for Docx by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/9
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.3...v0.0.4
## 0.0.3 (2024-05-30)
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.2...v0.0.3
## 0.0.2 (2024-05-30)
## What's Changed
* feat: Megaparse example and working by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/2
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.1...v0.0.2
## 0.0.2 (2024-05-30)
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.1...v0.0.2
================================================
FILE: Dockerfile
================================================
# CPU image for the MegaParse API: installs system and Python dependencies,
# then serves megaparse.api.app:app with uvicorn on port 8000.
FROM python:3.11.10-slim-bullseye
WORKDIR /app
# Install runtime dependencies
# `apt-get upgrade` needs -y: docker build is non-interactive, so without it
# apt aborts at the confirmation prompt whenever upgrades are pending.
RUN apt-get update && apt-get upgrade -y && apt-get install -y \
    libgeos-dev \
    libcurl4-openssl-dev \
    libssl-dev \
    binutils \
    curl \
    git \
    autoconf \
    automake \
    build-essential \
    libtool \
    python-dev \
    wget \
    gcc \
    # Additional dependencies for document handling
    libmagic-dev \
    poppler-utils \
    tesseract-ocr \
    libreoffice \
    libpq-dev \
    pandoc && \
    rm -rf /var/lib/apt/lists/* && apt-get clean
# Copy only the dependency manifests first so the dependency-install layers
# below stay cached when just the application source changes.
COPY requirements.lock pyproject.toml README.md ./
COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/
COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/
RUN pip install uv
RUN uv pip install --no-cache --system -r requirements.lock
RUN playwright install --with-deps
# NOTE(review): `python3 -` reads the program from stdin, so
# `-m nltk.downloader all` is passed as script arguments and this step appears
# to download nothing. Left unchanged on purpose: switching to
# `python3 -m nltk.downloader all` would pull the full NLTK data set into the
# image - confirm the intended behaviour before changing it.
RUN python3 - -m nltk.downloader all
COPY . .
# Install the two workspace packages from the copied source tree.
RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk
EXPOSE 8000
CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000"]
================================================
FILE: Dockerfile.gpu
================================================
# GPU image for MegaParse: CUDA/cuDNN base with Python 3.11 from the
# deadsnakes PPA, the locked Python dependencies and the two workspace packages.
# NOTE(review): no CMD or EXPOSE is declared in this file, unlike the CPU
# Dockerfile; presumably the start command is supplied at deploy time - confirm.
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu20.04
WORKDIR /app
ENV UV_COMPILE_BYTECODE=1
ENV UV_NO_CACHE=1
# Keeps apt from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive
# Install runtime dependencies
RUN apt-get update && apt-get install -y software-properties-common && \
add-apt-repository ppa:deadsnakes/ppa && \
apt-get update && apt-get install -y \
python3.11 \
python3.11-dev \
libgeos-dev \
libcurl4-openssl-dev \
libssl-dev \
binutils \
curl \
git \
autoconf \
automake \
libtool \
python3-pip \
build-essential \
wget \
gcc \
# Additional dependencies for document handling
libmagic-dev \
poppler-utils \
tesseract-ocr \
libreoffice \
libpq-dev \
pandoc && \
rm -rf /var/lib/apt/lists/* && apt-get clean
# Make `python3` resolve to the Python 3.11 interpreter installed above.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
update-alternatives --set python3 /usr/bin/python3.11
# Copy only the dependency manifests before installing, ahead of the full
# source copy further down.
COPY requirements.lock pyproject.toml README.md ./
COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/
COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/
# Install uv via its install script and put its install directory on PATH.
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
RUN uv pip install --no-cache --system -r requirements.lock
RUN playwright install --with-deps
# NOTE(review): `python3 -` reads the program from stdin, so
# `-m nltk.downloader all` is passed as script arguments and this step appears
# to download nothing - confirm the intended behaviour.
RUN python3 - -m nltk.downloader all
# FIXME: causes runtime link issues with onnxruntime_pybind_state.cc:507 unstructured
# RUN python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
# RUN python3 -c "import nltk; nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')"
COPY . .
# Install the two workspace packages from the copied source tree.
RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: Makefile
================================================
# `.DEFAULT_GOAL` is the special variable make consults for the default target;
# `.DEFAULT_TARGET` was an ordinary, unused variable.
.DEFAULT_GOAL := help

## help: Display list of commands
.PHONY: help
help:
	@echo "Available commands:"
	@# Collect the "## name: description" lines and align them on the colon.
	@# `-s ':'` sets the separator; a bare ':' would be taken as an input file name.
	@sed -n 's|^##||p' $(MAKEFILE_LIST) | column -t -s ':' | sed -e 's|^| |'

## dev: Start development environment
.PHONY: dev
dev:
	DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up --build

## dev-build: Build development environment without cache
.PHONY: dev-build
dev-build:
	DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml build --no-cache
	DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up

## prod: Build and start production environment
.PHONY: prod
prod:
	docker compose -f docker-compose.yml up --build
================================================
FILE: Pipfile
================================================
# Pipenv manifest. It declares no packages; it only names the package index
# and the required Python version.
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
# Runtime dependencies (none declared here).
[packages]
# Development-only dependencies (none declared here).
[dev-packages]
# Python version required for the environment.
[requires]
python_version = "3.11"
================================================
FILE: README.md
================================================
# MegaParse - Your Parser for Every Type of Document
<div align="center">
<img src="https://raw.githubusercontent.com/QuivrHQ/MegaParse/main/logo.png" alt="Quivr-logo" width="30%" style="border-radius: 50%; padding-bottom: 20px"/>
</div>
MegaParse is a powerful and versatile parser that can handle various types of documents with ease. Whether you're dealing with text, PDFs, PowerPoint presentations, or Word documents, MegaParse has got you covered. Its focus is on losing no information during parsing.
## Key Features 🎯
- **Versatile Parser**: MegaParse is a powerful and versatile parser that can handle various types of documents with ease.
- **No Information Loss**: Focus on having no information loss during parsing.
- **Fast and Efficient**: Designed with speed and efficiency at its core.
- **Wide File Compatibility**: Supports Text, PDF, Powerpoint presentations, Excel, CSV, Word documents.
- **Open Source**: Freedom is beautiful, and so is MegaParse. Open source and free to use.
## Support
- Files: ✅ PDF ✅ Powerpoint ✅ Word
- Content: ✅ Tables ✅ TOC ✅ Headers ✅ Footers ✅ Images
### Example
https://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a7509bc8d4f3
## Installation
Requires Python >= 3.11.
```bash
pip install megaparse
```
## Usage
1. Add your OpenAI or Anthropic API key to the .env file
2. Install poppler on your computer (images and PDFs)
3. Install tesseract on your computer (images and PDFs)
4. If you are on a Mac, you also need to install libmagic: ```brew install libmagic```
Use MegaParse as is:
```python
from megaparse import MegaParse
from langchain_openai import ChatOpenAI
megaparse = MegaParse()
response = megaparse.load("./test.pdf")
print(response)
```
### Use MegaParse Vision
```python
import os

from langchain_openai import ChatOpenAI
from megaparse.parser.megaparse_vision import MegaParseVision
model = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) # type: ignore
parser = MegaParseVision(model=model)
response = parser.convert("./test.pdf")
print(response)
```
**Note**: The models supported by MegaParse Vision are the multimodal ones, such as Claude 3.5, Claude 4, GPT-4o and GPT-4.
## Use as an API
A Makefile is provided; simply run:
```make dev```
at the root of the project and you are good to go.
See localhost:8000/docs for more information on the available endpoints!
## BenchMark
<!---BENCHMARK-->
| Parser | similarity_ratio |
| ----------------------------- | ---------------- |
| megaparse_vision | 0.87 |
| unstructured_with_check_table | 0.77 |
| unstructured | 0.59 |
| llama_parser | 0.33 |
<!---END_BENCHMARK-->
_Higher is better_
Note: Want to evaluate and compare your MegaParse module with ours? Add your config in ```evaluations/script.py``` and then run ```python evaluations/script.py```. If it scores better, open a PR. Let's go higher together!
## In Construction 🚧
- Improve table checker
- Create Checkers to add **modular postprocessing** ⚙️
- Add structured output, **let's get computers talking** 🤖
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=QuivrHQ/MegaParse&type=Date)](https://star-history.com/#QuivrHQ/MegaParse&Date)
================================================
FILE: benchmark/process_single_doc.py
================================================
import asyncio
import time
from pathlib import Path
import numpy as np
from megaparse import MegaParse
N_TRY = 1
async def process_file(megaparse: MegaParse, file_path: str | Path):
    """Time a single ``megaparse.aload`` call on ``file_path``.

    Returns:
        The elapsed wall-clock time in seconds (``time.perf_counter`` delta),
        or ``None`` if the load raised; the exception is printed, not re-raised.
    """
    started_at = time.perf_counter()
    try:
        # The parsed output is discarded; only the duration matters here.
        await megaparse.aload(
            file_path=file_path,
        )
    except Exception as exc:
        print(f"Exception occured: {exc}")
        return None
    return time.perf_counter() - started_at
async def test_process_file(file: str | Path):
    """Parse ``file`` ``N_TRY`` times with one ``MegaParse`` instance and print timing stats.

    The runs are awaited together through ``asyncio.gather``. Failed runs
    (``process_file`` returned ``None``) are counted and excluded from the
    statistics. If no run succeeds, only the error count is printed.
    """
    # parser = UnstructuredParser(strategy=StrategyEnum.HI_RES)
    megaparse = MegaParse()
    results = await asyncio.gather(
        *(process_file(megaparse, file) for _ in range(N_TRY))
    )

    durations = [t for t in results if t is not None]
    n_errors = len(results) - len(durations)
    print(f"All errors : {n_errors}")

    if not durations:
        # np.max/np.min raise ValueError on an empty sequence and mean/median
        # yield NaN with a warning, so stop before computing statistics.
        print("No successful run: timing statistics are unavailable.")
        return

    np_durations = np.array(durations)
    print(f"Average time taken: {np_durations.mean()}")
    print(f"Median time taken: {np.median(np_durations)}")
    print(f"Standard deviation of time taken: {np.std(np_durations)}")
    print(f"Max time taken: {np.max(np_durations)}")
    print(f"Min time taken: {np.min(np_durations)}")
if __name__ == "__main__":
    import sys

    # Optional CLI override: `python process_single_doc.py <path-to-document>`.
    # Without an argument the original hard-coded sample document is used.
    default_path = "/Users/amine/data/quivr/parsing/scanned/machine.pdf"
    document_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    asyncio.run(test_process_file(document_path))
================================================
FILE: benchmark/test_quality_sim.py
================================================
import os
import difflib
from pathlib import Path
auto_dir = Path("benchmark/auto")
hi_res_dir = Path("benchmark/hi_res")
def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated word sets of two strings.

    Two empty strings compare as ``1``. If neither string contains any word
    (for example both are only whitespace) the result is ``0``.
    """
    if len(str1) + len(str2) == 0:
        return 1

    tokens_a, tokens_b = set(str1.split()), set(str2.split())
    all_tokens = tokens_a | tokens_b
    if len(all_tokens) == 0:
        return 0

    shared_tokens = tokens_a & tokens_b
    return len(shared_tokens) / len(all_tokens)
def compare_files(file_name):
    """Return the similarity between the ``auto`` and ``hi_res`` Markdown outputs.

    Reads ``<file_name>.md`` from both ``auto_dir`` and ``hi_res_dir`` and
    returns ``difflib.SequenceMatcher(...).ratio()`` over their contents.
    Two empty files compare as ``1``.

    Raises:
        FileNotFoundError: if either Markdown file is missing.
    """
    file_path_auto = auto_dir / f"{file_name}.md"
    file_path_hi_res = hi_res_dir / f"{file_name}.md"

    # Decode explicitly as UTF-8 so results do not depend on the platform's
    # default text encoding.
    with open(file_path_auto, "r", encoding="utf-8") as f:
        auto_content = f.read()
    with open(file_path_hi_res, "r", encoding="utf-8") as f:
        hi_res_content = f.read()

    if len(auto_content) == 0 and len(hi_res_content) == 0:
        return 1

    similarity = difflib.SequenceMatcher(None, auto_content, hi_res_content).ratio()
    # similarity = jaccard_similarity(auto_content, hi_res_content)
    return similarity
def main():
    """Compare every Markdown file in ``hi_res_dir`` with its ``auto_dir`` counterpart.

    Prints the average similarity, the share of documents scoring above 0.9,
    and the documents scoring below 0.9.
    """
    # Only Markdown outputs are comparable. Other directory entries (for
    # example ".DS_Store") would be looked up as "<stem>.md" and fail to open.
    files = sorted(f for f in os.listdir(hi_res_dir) if Path(f).suffix == ".md")
    print(f"Comparing {len(files)} files...")

    if not files:
        # Avoid dividing by zero when there is nothing to compare.
        print("No Markdown files found; nothing to compare.")
        return

    similarity_dict = {}
    for file in files:
        file_name = Path(file).stem
        similarity_dict[file_name] = compare_files(file_name)

    n_docs = len(similarity_dict)
    avg_similarity = sum(similarity_dict.values()) / n_docs
    print(f"\nAverage similarity: {avg_similarity}\n")

    pass_rate = sum(score > 0.9 for score in similarity_dict.values()) / n_docs
    print(f"Pass rate: {pass_rate}\n")

    print("Under 0.9 similarity documents:")
    print("-------------------------------")
    for file_name, similarity in similarity_dict.items():
        if similarity < 0.9:
            print(f"{file_name}: {similarity}")


if __name__ == "__main__":
    main()
================================================
FILE: docker-compose.dev.yml
================================================
# Development stack: builds the image from the local Dockerfile, bind-mounts the
# checkout over /app and runs uvicorn with --reload.
version: "3.8"

services:
  megaparse:
    container_name: megaparse
    image: megaparse:latest
    build:
      context: .
      dockerfile: Dockerfile
      cache_from:
        - megaparse:latest
      args:
        - DEV_MODE=true
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      - ./:/app/
    command:
      - /bin/bash
      - -c
      - python -m uvicorn megaparse.api.app:app --host 0.0.0.0 --log-level info --reload --port 8000
    restart: always
    ports:
      - "8000:8000"
================================================
FILE: docker-compose.yml
================================================
# Production stack: runs the prebuilt megaparse:latest image.
version: "3.8"

services:
  megaparse:
    image: megaparse:latest
    pull_policy: if_not_present
    container_name: megaparse
    extra_hosts:
      - "host.docker.internal:host-gateway"
    healthcheck:
      # Probe the port uvicorn binds below (8000, previously 5050). `-f` makes
      # curl exit non-zero on HTTP error statuses so failures are detected.
      # NOTE(review): assumes the app serves /healthz; confirm against app.py.
      test: [ "CMD", "curl", "-f", "http://localhost:8000/healthz" ]
    # NOTE(review): --reload is a development option; confirm it is wanted here.
    command: >
      /bin/bash -c "python -m uvicorn megaparse.api.app:app --host 0.0.0.0 --log-level info --reload --port 8000 --loop uvloop"
    restart: always
    ports:
      - 8000:8000
================================================
FILE: docs/archive.txt
================================================
### (Optional) Use LlamaParse for Improved Results
1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key.
2. Change the parser to LlamaParser
```python
from megaparse import MegaParse
from langchain_openai import ChatOpenAI
from megaparse.parser.llama_parser import LlamaParser
parser = LlamaParser(api_key = os.getenv("LLAMA_CLOUD_API_KEY"))
megaparse = MegaParse(parser)
response = megaparse.load("./test.pdf")
print(response)
megaparse.save("./test.md") #saves the last processed doc in md format
```
================================================
FILE: evaluations/script.py
================================================
import difflib
import os
from langchain_openai import ChatOpenAI
from megaparse.megaparse import MegaParse
from megaparse.parser.llama import LlamaParser
from megaparse.parser.megaparse_vision import MegaParseVision
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.schema.parser_config import StrategyEnum
if __name__ == "__main__":
    # Score each configured parser against a reference Markdown rendering of
    # the sample PDF, then write the ranking into README.md between the
    # BENCHMARK markers.
    print("---Launching evaluations script---")
    model = ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY")))  # type: ignore
    parser_dict = {
        "unstructured": UnstructuredParser(strategy=StrategyEnum.AUTO, model=None),
        "unstructured_with_check_table": UnstructuredParser(
            strategy=StrategyEnum.AUTO,
            model=model,
        ),
        "llama_parser": LlamaParser(api_key=str(os.getenv("LLAMA_CLOUD_API_KEY"))),
        "megaparse_vision": MegaParseVision(model=model),
    }

    base_pdf_path = "tests/data/MegaFake_report.pdf"
    base_md_path = "tests/data/grt_example/MegaFake_report.md"
    with open(base_md_path, "r", encoding="utf-8") as f:
        base_md = f.read()

    score_dict = {}
    for method, parser in parser_dict.items():
        print(f"Method: {method}")
        # Run the parser under test. The loop used to build a default
        # MegaParse() and ignore `parser`, so every method got the same score.
        result = parser.convert(base_pdf_path)
        # str() keeps SequenceMatcher working if convert() returns a document
        # object rather than plain text. TODO confirm the return type.
        score_dict[method] = difflib.SequenceMatcher(
            None, base_md, str(result)
        ).ratio()
        print(f"Score for method {method}: {score_dict[method]}")

    # Sort the results, best score first
    sorted_score = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)

    # Generate a table with the results
    benchmark_results = "| Parser | similarity_ratio |\n|---|---|\n"
    for parser_name, score in sorted_score:
        benchmark_results += f"| {parser_name} | {score:.2f} |\n"
    print(benchmark_results)

    # Update README.md file (it contains non-ASCII characters, so read and
    # write it explicitly as UTF-8).
    with open("README.md", "r", encoding="utf-8") as readme_file:
        readme_content = readme_file.read()

    start_marker = "<!---BENCHMARK-->"
    end_marker = "<!---END_BENCHMARK-->"
    marker_pos = readme_content.find(start_marker)
    end_index = readme_content.find(end_marker)
    if marker_pos == -1 or end_index == -1 or end_index < marker_pos:
        # str.find returns -1 for a missing marker; slicing with it would
        # silently mangle the README, so stop instead.
        raise SystemExit("README.md benchmark markers not found; README left unchanged.")
    start_index = marker_pos + len(start_marker)

    updated_readme_content = (
        readme_content[:start_index]
        + "\n"
        + benchmark_results
        + readme_content[end_index:]
    )
    with open("README.md", "w", encoding="utf-8") as readme_file:
        readme_file.write(updated_readme_content)
================================================
FILE: libs/megaparse/.python-version
================================================
3.11.9
================================================
FILE: libs/megaparse/CHANGELOG.md
================================================
# Changelog
## [0.0.55](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.54...megaparse-v0.0.55) (2025-02-14)
### Features
* remove tensorrt ([#230](https://github.com/QuivrHQ/MegaParse/issues/230)) ([8b8abbc](https://github.com/QuivrHQ/MegaParse/commit/8b8abbc6a2a1b33d4e921d55d2519b773ec062c8))
## [0.0.54](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.53...megaparse-v0.0.54) (2025-02-11)
### Features
* add_layout_detection ([#220](https://github.com/QuivrHQ/MegaParse/issues/220)) ([2d2d0b4](https://github.com/QuivrHQ/MegaParse/commit/2d2d0b42bba4c883db423568e932eda42edd60d7))
## [0.0.53](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.52...megaparse-v0.0.53) (2025-01-16)
### Features
* modular parser and formatter v0 ([#175](https://github.com/QuivrHQ/MegaParse/issues/175)) ([1f4dcf8](https://github.com/QuivrHQ/MegaParse/commit/1f4dcf88a5901c5a2682cb79284a0dbb08034cb2))
* Text detection in auto strategy ([#209](https://github.com/QuivrHQ/MegaParse/issues/209)) ([03c7ada](https://github.com/QuivrHQ/MegaParse/commit/03c7ada1dc245e13ef41ffd6fa3a8ed869269d37))
* type strategy output ([#216](https://github.com/QuivrHQ/MegaParse/issues/216)) ([deb8765](https://github.com/QuivrHQ/MegaParse/commit/deb8765a4df8917a4857f51a02025243192d5cf8))
### Bug Fixes
* Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38))
* add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228))
* logging error ([#218](https://github.com/QuivrHQ/MegaParse/issues/218)) ([a2170d7](https://github.com/QuivrHQ/MegaParse/commit/a2170d7c711a5d7a0531f03aa9576937ddd6576e))
* megaparse.load & add tests ([#202](https://github.com/QuivrHQ/MegaParse/issues/202)) ([13c2677](https://github.com/QuivrHQ/MegaParse/commit/13c2677bdadb4ba985a1abf9bafeb70548ab59f9))
* Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) ([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3))
* sync convert to parsers ([#186](https://github.com/QuivrHQ/MegaParse/issues/186)) ([fbb7d36](https://github.com/QuivrHQ/MegaParse/commit/fbb7d365fbaf710a687fdc6becacd6d301c09707))
## [0.0.52](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.51...megaparse-v0.0.52) (2024-12-16)
### Bug Fixes
* hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473))
## [0.0.51](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.50...megaparse-v0.0.51) (2024-12-16)
### Features
* updating langchain version ([#187](https://github.com/QuivrHQ/MegaParse/issues/187)) ([0f1f597](https://github.com/QuivrHQ/MegaParse/commit/0f1f5977df147e6b8c65d55445ccd86ef6f1a862))
## [0.0.50](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.49...megaparse-v0.0.50) (2024-12-13)
### Features
* small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) ([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609))
## [0.0.49](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.48...megaparse-v0.0.49) (2024-12-12)
### Features
* custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29))
* faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036))
## [0.0.48](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.47...megaparse-v0.0.48) (2024-12-03)
### Features
* Update imports and parsers in README.md ([#156](https://github.com/QuivrHQ/MegaParse/issues/156)) ([33e0303](https://github.com/QuivrHQ/MegaParse/commit/33e0303821691c4b1fc821e6b33b874bd332d430))
## [0.0.47](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.46...megaparse-v0.0.47) (2024-11-21)
### Features
* refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334))
* release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925))
## [0.0.22](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.21...megaparse-v0.0.22) (2024-07-24)
### Features
* Add instructions for installing poppler and tesseract ([#10](https://github.com/QuivrHQ/MegaParse/issues/10)) ([3399552](https://github.com/QuivrHQ/MegaParse/commit/3399552bc8be705f6d34306743388a96d099eebc))
* Add MegaParse class to __init__.py ([84c0d64](https://github.com/QuivrHQ/MegaParse/commit/84c0d648ef1ddf048ec911210d89be155443dc72))
* Add support for Unstructured Parser, improve Table and Image Parsing, and add TOC and Hyperlinks for Docx ([#9](https://github.com/QuivrHQ/MegaParse/issues/9)) ([4934776](https://github.com/QuivrHQ/MegaParse/commit/493477672cef9fe22b0ab56ced1d5572104e1914))
* base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0))
* base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648))
* Update benchmark results in README.md ([#15](https://github.com/QuivrHQ/MegaParse/issues/15)) ([1dfcb4c](https://github.com/QuivrHQ/MegaParse/commit/1dfcb4ce19467f7fb8137e10e5f5fbf35e563df0))
### Bug Fixes
* add __init__.py ([a5b8de9](https://github.com/QuivrHQ/MegaParse/commit/a5b8de9e1e01ef681ac2ef59a6e111ae7bd6cf70))
* change name ([6b36437](https://github.com/QuivrHQ/MegaParse/commit/6b36437787f048d36d69c3b06c2d59f7dc7a741f))
* PR Comments ([a0ab0ba](https://github.com/QuivrHQ/MegaParse/commit/a0ab0baa5dd9aae644baef55348f1af28a6776a7))
* remove nest asycio ([22195a2](https://github.com/QuivrHQ/MegaParse/commit/22195a27e9dc3583bf1fbde2a95e9fbecc8d96a4))
* use aload_data ([e5c73fe](https://github.com/QuivrHQ/MegaParse/commit/e5c73fefcbf09bb12810adc6d4412f7742c42089))
## [0.0.21](https://github.com/QuivrHQ/MegaParse/compare/v0.0.20...v0.0.21) (2024-07-24)
### Features
* base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0))
* base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648))
================================================
FILE: libs/megaparse/README.md
================================================
# MegaParse CORE
- Core package of megaparse
> **Note:** The test files in `tests/pdf/ocr` and `tests/pdf/native` come from SAFEDOCS (CC-MAIN-2021-31-PDF-UNTRUNCATED). You can find more information [here](https://digitalcorpora.org/corpora/file-corpora/cc-main-2021-31-pdf-untruncated/).
================================================
FILE: libs/megaparse/bench.md
================================================
------------
UNSTRUCTURED(HI-RES):
------------
folder: cdp
cdp_etiquette.pdf parsing took: 2.10s
folder: scanned-tables
POZIBILAN 2022.pdf parsing took: 78.72s
Banco Popilar Number 2.pdf parsing took: 94.44s
folder: native
00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 3.25s
0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 39.75s
0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 25.02s
folder: scanned
machine.pdf parsing took: 54.29s
medical.pdf parsing took: 76.11s
les_americains.pdf parsing took: 643.84s
agency.pdf parsing took: 114.19s
clark.pdf parsing took: 27.89s
tables_ocr.pdf parsing took: 81.21s
folder: rich
language_learning.pdf parsing took: 2.60s
dites nous tout....pdf parsing took: 1.62s
------------
UNSTRUCTURED(FAST):
------------
folder: cdp
cdp_etiquette.pdf parsing took: 0.05s
folder: scanned-tables
POZIBILAN 2022.pdf: can't parse
Banco Popilar Number 2.pdf: can't parse
folder: native
00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.07s
0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 0.86s
0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 0.24s
folder: scanned
machine.pdf parsing took: 0.02s
medical.pdf parsing took: 0.04s
les_americains.pdf parsing took: 5.90s
agency.pdf: can't parse
clark.pdf: can't parse
tables_ocr.pdf: can't parse
folder: rich
language_learning.pdf: can't parse
dites nous tout....pdf parsing took: 0.02s
------------
Megaparse (
strategy = AUTO
Config = {
provider=COREML,
det_arch: str = "fast_base"
det_batch_size: int = 2
assume_straight_pages: bool = True
preserve_aspect_ratio: bool = True
symmetric_pad: bool = True
load_in_8_bit: bool = False
reco_arch: str = "crnn_vgg16_bn"
rec_batch_size: int = 512
}
)
------------
folder: cdp
cdp_etiquette.pdf parsing took: 1.71s
folder: scanned-tables
POZIBILAN 2022.pdf parsing took: 17.76s
Banco Popilar Number 2.pdf parsing took: 19.25s
folder: native
00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.96s
0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 12.57s
0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 1.53s
folder: scanned
machine.pdf parsing took: 9.90s
medical.pdf parsing took: 13.09s
les_americains.pdf parsing took: 139.53s
agency.pdf parsing took: 10.73s
clark.pdf parsing took: 10.69s
tables_ocr.pdf parsing took: 15.58s
folder: rich
language_learning.pdf parsing took: 1.74s
dites nous tout....pdf parsing took: 0.64s
----
| Type | PDF Name | Unstructured(HI-RES) | Unstructured(FAST) | Megaparse( w/ doctr COREML) |
|------------------|-----------------------------------|---------------------|----------------------|--------------------|
| **cdp** | cdp_etiquette.pdf | 2.10s | 0.05s (bad parsing) | 1.71s |
| **scanned-tables** | POZIBILAN 2022.pdf | 78.72s | can't parse | 17.76s |
| **scanned-tables** | Banco Popilar Number 2.pdf | 94.44s | can't parse | 19.25s |
| **native** | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf | 3.25s | 0.07s | 0.96s |
| **native** | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf | 39.75s | 0.86s | 12.57s |
| **native** | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf | 25.02s | 0.24s | 1.53s |
| **scanned** | machine.pdf | 54.29s | 0.02s | 9.90s |
| **scanned** | medical.pdf | 76.11s | 0.04s | 13.09s |
| **scanned** | les_americains.pdf | 643.84s | 5.90s | 139.53s |
| **scanned** | agency.pdf | 114.19s | can't parse | 10.73s |
| **scanned** | clark.pdf | 27.89s | can't parse | 10.69s |
| **scanned** | tables_ocr.pdf | 81.21s | can't parse | 15.58s |
| **rich** | language_learning.pdf | 2.60s | can't parse | 1.74s |
| **rich** | dites nous tout....pdf | 1.62s | 0.02s | 0.64s |
================================================
FILE: libs/megaparse/examples/parse_file_fast.py
================================================
import os
from dataclasses import dataclass
from time import perf_counter
from unstructured.partition.auto import partition
@dataclass
class File:
    """A file found while walking the benchmark directory."""

    # Path of the file: the walked folder joined with the file name.
    file_path: str
    # Name of the file inside its folder, extension included.
    file_name: str
    # Extension as returned by os.path.splitext (leading dot included, "" if none).
    file_extension: str
def list_files_in_directory(directory_path: str) -> dict[str, list[File]]:
    """Walk ``directory_path`` recursively and group the files by folder name.

    Args:
        directory_path: Root directory to walk.

    Returns:
        A dict mapping each folder's base name to the ``File`` entries found
        directly inside that folder. A folder whose base name is empty (the
        walk root when ``directory_path`` ends with a path separator) is
        skipped. Folders from different subtrees that share a base name are
        merged under one key.
    """
    directory_dict: dict[str, list[File]] = {}
    for root, _, files in os.walk(directory_path):
        folder_name = os.path.basename(root)
        if not folder_name:
            continue
        entries = [
            File(
                file_path=os.path.join(root, file_name),
                file_name=file_name,
                file_extension=os.path.splitext(file_name)[1],
            )
            for file_name in files
        ]
        # Merge rather than assign: two folders with the same base name
        # (e.g. a/x and b/x) must not silently drop each other's files.
        directory_dict.setdefault(folder_name, []).extend(entries)
    return directory_dict
def main(folder_path: str = "/Users/amine/data/quivr/parsing/"):
    """Time unstructured's "fast" strategy on every PDF under ``folder_path``.

    Prints one line per folder, then one line per PDF with either the
    elapsed parsing time or a "can't parse" notice when no element was
    returned.

    Args:
        folder_path: Directory walked for PDFs. Defaults to the path the
            benchmark was originally run against.
    """
    list_files = list_files_in_directory(folder_path)
    for folder_name, files in list_files.items():
        print(f"folder: {folder_name}")
        for file in files:
            if file.file_extension != ".pdf":
                continue
            s = perf_counter()
            elements = partition(
                filename=file.file_path,
                strategy="fast",
            )
            # Stop the clock right after parsing so the measurement only
            # covers the partition call.
            e = perf_counter()
            if len(elements) == 0:
                print(f"\t{file.file_name}: can't parse ")
                continue
            print(f"\t {file.file_name} parsing took: {e-s:.2f}s")


if __name__ == "__main__":
    main()
================================================
FILE: libs/megaparse/examples/parse_file_mp.py
================================================
import os
from dataclasses import dataclass
from time import perf_counter
from megaparse import MegaParse
from megaparse.configs.auto import DeviceEnum, MegaParseConfig
@dataclass
class File:
    """A file found while walking the benchmark directory."""

    # Path of the file: the walked folder joined with the file name.
    file_path: str
    # Name of the file inside its folder, extension included.
    file_name: str
    # Extension as returned by os.path.splitext (leading dot included, "" if none).
    file_extension: str
def list_files_in_directory(directory_path: str) -> dict[str, list[File]]:
    """Walk ``directory_path`` recursively and group the files by folder name.

    Args:
        directory_path: Root directory to walk.

    Returns:
        A dict mapping each folder's base name to the ``File`` entries found
        directly inside that folder. A folder whose base name is empty (the
        walk root when ``directory_path`` ends with a path separator) is
        skipped. Folders from different subtrees that share a base name are
        merged under one key.
    """
    directory_dict: dict[str, list[File]] = {}
    for root, _, files in os.walk(directory_path):
        folder_name = os.path.basename(root)
        if not folder_name:
            continue
        entries = [
            File(
                file_path=os.path.join(root, file_name),
                file_name=file_name,
                file_extension=os.path.splitext(file_name)[1],
            )
            for file_name in files
        ]
        # Merge rather than assign: two folders with the same base name
        # (e.g. a/x and b/x) must not silently drop each other's files.
        directory_dict.setdefault(folder_name, []).extend(entries)
    return directory_dict
def main(folder_path: str = "/Users/amine/data/quivr/parsing/"):
    """Time MegaParse (CoreML device) on every PDF under ``folder_path``.

    Prints one line per folder, then one line per PDF with either the
    elapsed parsing time or a "can't parse" notice when the result is empty.

    Args:
        folder_path: Directory walked for PDFs. Defaults to the path the
            benchmark was originally run against.
    """
    list_files = list_files_in_directory(folder_path)
    config = MegaParseConfig(device=DeviceEnum.COREML)
    # One MegaParse instance is reused for every file.
    mp = MegaParse(config=config)
    for folder_name, files in list_files.items():
        print(f"folder: {folder_name}")
        for file in files:
            if file.file_extension != ".pdf":
                continue
            s = perf_counter()
            result = mp.load(file.file_path)
            # Stop the clock right after parsing so the measurement only
            # covers the load call.
            e = perf_counter()
            if len(result) == 0:
                print(f"\t{file.file_name}: can't parse ")
                continue
            print(f"\t {file.file_name} parsing took: {e-s:.2f}s")


if __name__ == "__main__":
    main()
================================================
FILE: libs/megaparse/examples/parse_file_unstructured.py
================================================
import os
from dataclasses import dataclass
from time import perf_counter
from unstructured.partition.auto import partition
@dataclass
class File:
    """A file found while walking the benchmark directory."""

    # Path of the file: the walked folder joined with the file name.
    file_path: str
    # Name of the file inside its folder, extension included.
    file_name: str
    # Extension as returned by os.path.splitext (leading dot included, "" if none).
    file_extension: str
def list_files_in_directory(directory_path: str) -> dict[str, list[File]]:
    """Walk ``directory_path`` recursively and group the files by folder name.

    Args:
        directory_path: Root directory to walk.

    Returns:
        A dict mapping each folder's base name to the ``File`` entries found
        directly inside that folder. A folder whose base name is empty (the
        walk root when ``directory_path`` ends with a path separator) is
        skipped. Folders from different subtrees that share a base name are
        merged under one key.
    """
    directory_dict: dict[str, list[File]] = {}
    for root, _, files in os.walk(directory_path):
        folder_name = os.path.basename(root)
        if not folder_name:
            continue
        entries = [
            File(
                file_path=os.path.join(root, file_name),
                file_name=file_name,
                file_extension=os.path.splitext(file_name)[1],
            )
            for file_name in files
        ]
        # Merge rather than assign: two folders with the same base name
        # (e.g. a/x and b/x) must not silently drop each other's files.
        directory_dict.setdefault(folder_name, []).extend(entries)
    return directory_dict
def main(folder_path: str = "/Users/amine/data/quivr/parsing/"):
    """Time unstructured's "hi_res" strategy on every PDF under ``folder_path``.

    Prints one line per folder, then one line per PDF with the elapsed
    parsing time. The parsed elements themselves are discarded.

    Args:
        folder_path: Directory walked for PDFs. Defaults to the path the
            benchmark was originally run against.
    """
    list_files = list_files_in_directory(folder_path)
    for folder_name, files in list_files.items():
        print(f"folder: {folder_name}")
        for file in files:
            if file.file_extension != ".pdf":
                continue
            s = perf_counter()
            partition(
                filename=file.file_path,
                strategy="hi_res",
            )
            e = perf_counter()
            print(f"\t {file.file_name} parsing took: {e-s:.2f}s")


if __name__ == "__main__":
    main()
================================================
FILE: libs/megaparse/pyproject.toml
================================================
[project]
name = "megaparse"
version = "0.0.55"
authors = [
{ name = "Stan Girard", email = "stan@quivr.app" },
{ name = "Chloé Daems", email = "chloe@quivr.app" },
{ name = "Amine Dirhoussi", email = "amine@quivr.app" },
{ name = "Jacopo Chevallard", email = "jacopo@quivr.app" },
]
readme = "README.md"
requires-python = ">= 3.11"
dependencies = [
"megaparse-sdk",
"pycryptodome>=3.21.0",
"pdfplumber>=0.11.0",
"backoff>=2.2.1",
"pypdf>=5.0.1",
"psutil>=6.1.0",
"numpy<=2.0.0",
"playwright>=1.47.0",
"langchain-anthropic>=0.1.23",
"python-magic>=0.4.27",
"unstructured[all-docs]==0.15.0",
"langchain>=0.3,<0.4",
"langchain-community>=0.3,<0.4",
"langchain-openai>=0.1.21",
"langchain-core>=0.3,<0.4",
"llama-parse>=0.4.0",
"pydantic-settings>=2.6.1",
"onnxruntime==1.20.0; platform_machine == 'x86_64'",
"onnxruntime-gpu==1.20.0; platform_machine == 'x86_64'",
"onnxtr[gpu-headless]>=0.6.0; platform_machine == 'x86_64'",
"onnxtr[cpu]>=0.6.0; platform_machine != 'x86_64'",
"pypdfium2>=4.30.0",
]
[project.optional-dependencies]
api = [
"python-dotenv>=1.0.0",
"uvloop>=0.18.0",
"pydantic-settings>=2.6.1",
"uvicorn>=0.32.0",
"fastapi>=0.115.2",
"ratelimit>=2.2.1",
]
[build-system]
requires = ["hatchling==1.26.3"]
build-backend = "hatchling.build"
[tool.rye]
managed = true
dev-dependencies = []
universal = true
[tool.hatch.metadata]
allow-direct-references = true
[tool.hatch.build.targets.wheel]
packages = ["src/megaparse", "src/api"]
================================================
FILE: libs/megaparse/src/megaparse/__init__.py
================================================
from .megaparse import MegaParse
__all__ = ["MegaParse"]
================================================
FILE: libs/megaparse/src/megaparse/api/__init__.py
================================================
================================================
FILE: libs/megaparse/src/megaparse/api/app.py
================================================
import io
import os
import tempfile
from typing import Any, Optional
import httpx
import psutil
import uvicorn
from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
from langchain_anthropic import ChatAnthropic
from langchain_community.document_loaders import PlaywrightURLLoader
from langchain_openai import ChatOpenAI
from llama_parse.utils import Language
from megaparse_sdk.schema.document import Document
from megaparse_sdk.schema.parser_config import (
ParserType,
StrategyEnum,
)
from megaparse_sdk.schema.supported_models import SupportedModel
from megaparse import MegaParse
from megaparse.api.exceptions.megaparse_exceptions import (
HTTPDownloadError,
HTTPFileNotFound,
HTTPModelNotSupported,
HTTPParsingException,
ParsingException,
)
from megaparse.parser.builder import ParserBuilder
# FastAPI application on which the endpoints below are registered.
app = FastAPI()
# Single module-level loader handed out by get_playwright_loader(); upload_url()
# assigns its `urls` attribute per request, so the instance is shared state.
playwright_loader = PlaywrightURLLoader(urls=[], remove_selectors=["header", "footer"])
# Dependency provider used by parse_file (via Depends): returns a new ParserBuilder.
def parser_builder_dep():
    return ParserBuilder()
# Dependency provider used by upload_url (via Depends): returns the module-level
# playwright_loader instance (the same object on every call).
def get_playwright_loader():
    return playwright_loader
# Health-check endpoint: always answers with a static "ok" payload.
@app.get("/healthz")
def healthz():
    return {"status": "ok"}
def _check_free_memory() -> bool:
    """Tell whether enough memory is available to accept a request.

    The minimum, in megabytes, is read from the ``MEMORY_FREE_MINIMUM_MB``
    environment variable and defaults to 2048.

    Returns:
        ``True`` when the available memory is strictly above the minimum,
        ``False`` otherwise.
    """
    available_bytes = psutil.virtual_memory().available
    minimum_mb = int(os.environ.get("MEMORY_FREE_MINIMUM_MB", 2048))
    return available_bytes > minimum_mb * 1024 * 1024
@app.post(
    "/v1/file",
)
async def parse_file(
    file: UploadFile = File(...),
    method: ParserType = Form(ParserType.UNSTRUCTURED),
    strategy: StrategyEnum = Form(StrategyEnum.AUTO),
    check_table: bool = Form(False),
    language: Language = Form(Language.ENGLISH),
    parsing_instruction: Optional[str] = Form(None),
    model_name: Optional[SupportedModel] = Form(SupportedModel.GPT_4O),
    parser_builder=Depends(parser_builder_dep),
) -> dict[str, str | Document]:
    """Parse an uploaded file with MegaParse and return the parsed document."""
    # Implementation notes:
    # - `method`, `strategy`, `language`, `parsing_instruction` and
    #   `parser_builder` are accepted but not used yet (see the FIXME below).
    # - Responses: 503 when memory is low, 501 for an unsupported model,
    #   404 when the upload has no filename, 400 on ValueError,
    #   500 on parsing or unexpected errors.
    if not _check_free_memory():
        raise HTTPException(
            status_code=503, detail="Service unavailable due to low memory"
        )
    # NOTE(review): `model` is built here but nothing below consumes it while
    # the parser configuration stays commented out.
    model = None
    if model_name and check_table:
        if model_name.startswith("gpt"):
            model = ChatOpenAI(model=model_name, api_key=os.getenv("OPENAI_API_KEY"))  # type: ignore
        elif model_name.startswith("claude"):
            model = ChatAnthropic(
                model_name=model_name,
                api_key=os.getenv("ANTHROPIC_API_KEY"),  # type: ignore
                timeout=60,
                stop=None,
            )
        else:
            raise HTTPModelNotSupported()
    # parser_config = ParseFileConfig( #FIXME
    #     method=method,
    #     strategy=strategy,
    #     llm_model_name=SupportedModel(model_name) if model_name and check_table else None,
    #     language=language,
    #     parsing_instruction=parsing_instruction,
    # )

    # Validate the upload before entering the try block: HTTPFileNotFound is an
    # HTTPException, so raising it inside the try would be caught by the
    # generic `except Exception` handler and reported as a 500 instead of 404.
    if not file.filename:
        raise HTTPFileNotFound("No filename provided")

    try:
        # parser = parser_builder.build(parser_config)
        megaparse = MegaParse()
        _, extension = os.path.splitext(file.filename)
        file_bytes = await file.read()
        file_stream = io.BytesIO(file_bytes)
        result = await megaparse.aload(file=file_stream, file_extension=extension)
        return {"message": "File parsed successfully", "result": result}
    except HTTPException:
        # Already an HTTP response: let it through with its own status code.
        raise
    except ParsingException as e:
        print(e)
        raise HTTPParsingException(file.filename) from e
    except ValueError as e:
        print(e)
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        print(e)
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.post(
    "/v1/url",
)
async def upload_url(
    url: str, playwright_loader=Depends(get_playwright_loader)
) -> dict[str, Any]:
    """Parse the PDF or web page located at ``url``."""
    # Implementation notes:
    # - URLs ending in ".pdf" are downloaded to a temporary file and parsed
    #   with MegaParse; anything else is rendered through the Playwright loader.
    # - Responses: 400 (HTTPDownloadError) when the download fails or the page
    #   yields no content, 500 (HTTPParsingException) when parsing fails.
    if url.endswith(".pdf"):
        ## Download the file
        async with httpx.AsyncClient() as client:
            response = await client.get(url)
        if response.status_code != 200:
            raise HTTPDownloadError(url)
        # NamedTemporaryFile appends `suffix` verbatim, so the leading dot is
        # required for the temporary path to carry a ".pdf" extension.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(response.content)
        # The file is closed (and therefore flushed) before it is read back.
        try:
            megaparse = MegaParse()
            result = await megaparse.aload(temp_file.name)
            return {"message": "File parsed successfully", "result": result}
        except ParsingException as e:
            raise HTTPParsingException(url) from e
        finally:
            # delete=False leaves the clean-up to us.
            if os.path.exists(temp_file.name):
                os.unlink(temp_file.name)

    # The loader comes from a module-level instance shared between requests;
    # only touch it on the website path, right before using it.
    playwright_loader.urls = [url]
    data = await playwright_loader.aload()
    # Now turn the data into a string
    extracted_content = "".join(page.page_content for page in data)
    if not extracted_content:
        raise HTTPDownloadError(
            url,
            message="Failed to extract content from the website. Valid URL example : https://www.quivr.com",
        )
    return {
        "message": "Website content parsed successfully",
        "result": extracted_content,
    }
# Script entry point: serve the app with uvicorn on 0.0.0.0:8000.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
================================================
FILE: libs/megaparse/src/megaparse/api/exceptions/__init__.py
================================================
================================================
FILE: libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py
================================================
from fastapi import HTTPException
class HTTPModelNotSupported(HTTPException):
    """HTTP 501 error raised when the requested model is not supported."""

    def __init__(
        self,
        detail: str = "The requested model is not supported yet.",
        headers: dict | None = None,
    ):
        # Status code is fixed to 501; detail and headers are forwarded as given.
        super().__init__(status_code=501, detail=detail, headers=headers)
class HTTPFileNotFound(HTTPException):
    """HTTP 404 error raised when an upload carries no usable filename."""

    def __init__(
        self,
        message="The UploadFile.filename does not exist and is needed for this operation",
    ):
        # `message` becomes the response detail.
        super().__init__(status_code=404, detail=message)
class HTTPDownloadError(HTTPException):
    """HTTP 400 error raised when a remote resource could not be retrieved."""

    def __init__(self, file_name, message="Failed to download the file"):
        # The detail is prefixed with the offending file name / URL.
        message = f"{file_name} : {message}"
        super().__init__(status_code=400, detail=message)
class HTTPParsingException(HTTPException):
    """HTTP 500 error raised when a file could not be parsed."""

    def __init__(self, file_name, message="Failed to parse the file"):
        # The detail is prefixed with the offending file name / URL.
        message = f"{file_name} : {message}"
        super().__init__(status_code=500, detail=message)
class ParsingException(Exception):
    """Signals a failure of the parsing process.

    The text is available both through ``str(exc)`` and the ``message``
    attribute.
    """

    # NOTE(review): megaparse/exceptions/base.py defines a separate class with
    # the same name and body; the two are distinct types for `except` clauses.
    def __init__(self, message="An error occurred during parsing"):
        super().__init__(message)
        self.message = message
================================================
FILE: libs/megaparse/src/megaparse/api/models/__init__.py
================================================
================================================
FILE: libs/megaparse/src/megaparse/api/models/base.py
================================================
from enum import Enum
# String-valued enum: because it also derives from `str`, each member compares
# equal to its label (e.g. MarkDownType.TABLE == "Table").
class MarkDownType(str, Enum):
    """Markdown type enumeration."""

    # Headings and page furniture
    TITLE = "Title"
    SUBTITLE = "Subtitle"
    HEADER = "Header"
    FOOTER = "Footer"
    # Body content
    NARRATIVE_TEXT = "NarrativeText"
    LIST_ITEM = "ListItem"
    TABLE = "Table"
    PAGE_BREAK = "PageBreak"
    IMAGE = "Image"
    FORMULA = "Formula"
    FIGURE_CAPTION = "FigureCaption"
    ADDRESS = "Address"
    EMAIL_ADDRESS = "EmailAddress"
    CODE_SNIPPET = "CodeSnippet"
    PAGE_NUMBER = "PageNumber"
    # Fallback labels
    DEFAULT = "Default"
    UNDEFINED = "Undefined"
================================================
FILE: libs/megaparse/src/megaparse/configs/auto.py
================================================
from enum import Enum
from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict
# Text-detection settings. megaparse/examples/parsing_process.py forwards every
# field to onnxtr's detection_predictor (det_arch is passed as `arch`).
class TextDetConfig(BaseModel):
    det_arch: str = "fast_base"
    batch_size: int = 2
    assume_straight_pages: bool = True
    preserve_aspect_ratio: bool = True
    symmetric_pad: bool = True
    load_in_8_bit: bool = False
# Thresholds for the automatic strategy selection.
class AutoStrategyConfig(BaseModel):
    # Same default (0.6) as get_strategy_page's `page_threshold` parameter in
    # examples/parsing_process.py, where a lower IoU selects HI_RES for a page.
    page_threshold: float = 0.6
    # NOTE(review): presumably the document-level cut-off for switching the
    # whole document to HI_RES; the consumer is not visible here - confirm.
    document_threshold: float = 0.2
# Text-recognition settings. megaparse/examples/parsing_process.py forwards
# them to onnxtr's recognition_predictor (reco_arch is passed as `arch`).
class TextRecoConfig(BaseModel):
    reco_arch: str = "crnn_vgg16_bn"
    batch_size: int = 512
# Compute device selector; string-valued, so members compare equal to their
# lowercase label (e.g. DeviceEnum.CPU == "cpu").
class DeviceEnum(str, Enum):
    CPU = "cpu"
    CUDA = "cuda"
    COREML = "coreml"
# OCR pipeline settings: three page-level switches (all off by default) plus
# the nested detection and recognition configurations.
class DoctrConfig(BaseModel):
    straighten_pages: bool = False
    detect_orientation: bool = False
    detect_language: bool = False
    text_det_config: TextDetConfig = TextDetConfig()
    text_reco_config: TextRecoConfig = TextRecoConfig()
class MegaParseConfig(BaseSettings):
    """
    Configuration for Megaparse.
    """

    # Settings source options: environment variables use the "MEGAPARSE_"
    # prefix, ".env.local" and ".env" are read as env files, "__" separates
    # nested field names, unknown keys are ignored, and enum fields are stored
    # by value.
    model_config = SettingsConfigDict(
        env_prefix="MEGAPARSE_",
        env_file=(".env.local", ".env"),
        env_nested_delimiter="__",
        extra="ignore",
        use_enum_values=True,
    )
    # OCR (doctr/onnxtr) settings.
    doctr_config: DoctrConfig = DoctrConfig()
    # Thresholds for the automatic strategy selection.
    auto_config: AutoStrategyConfig = AutoStrategyConfig()
    # Compute device; CPU unless overridden.
    device: DeviceEnum = DeviceEnum.CPU
================================================
FILE: libs/megaparse/src/megaparse/examples/parse_file.py
================================================
from pathlib import Path
from megaparse.megaparse import MegaParse
from pydantic import BaseModel, Field
# Example output schema for a structured formatter (see the commented-out
# CustomStructuredFormatter usage in main below).
class MyCustomFormat(BaseModel):
    title: str = Field(description="The title of the document.")
    problem: str = Field(description="The problem statement.")
    solution: str = Field(description="The solution statement.")
def main():
    """Parse the sample OCR PDF with a default MegaParse and print the result."""
    # model = ChatOpenAI(name="gpt-4o")
    # formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)
    parser = MegaParse()
    sample_pdf = Path("./tests/pdf/ocr/0168127.pdf")
    print(parser.load(file_path=sample_pdf))


if __name__ == "__main__":
    main()
================================================
FILE: libs/megaparse/src/megaparse/examples/parsing_process.py
================================================
from pathlib import Path
from typing import IO, Any, List, Tuple
import numpy as np
import onnxruntime as rt
import pypdfium2 as pdfium
from megaparse.configs.auto import (
AutoStrategyConfig,
DeviceEnum,
TextDetConfig,
TextRecoConfig,
)
from megaparse.models.page import Page, PageDimension
from megaparse.parser.doctr_parser import DoctrParser
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.schema.document import BBOX, BlockLayout, BlockType, TextDetection
from megaparse_sdk.schema.extensions import FileExtension
from megaparse_sdk.schema.parser_config import StrategyEnum
from onnxtr.models import detection_predictor, recognition_predictor
from onnxtr.models.builder import DocumentBuilder
from onnxtr.models.engine import EngineConfig
from onnxtr.utils.geometry import (
detach_scores,
extract_crops,
extract_rcrops,
)
from pypdfium2._helpers.page import PdfPage
def get_strategy_page(
    pdfium_page: PdfPage, onnxtr_page: TextDetection, page_threshold: float = 0.6
) -> StrategyEnum:
    """Pick the parsing strategy for one page from its text coverage.

    Two binary masks of the page are built: one from the bounding boxes of the
    text objects embedded in the PDF, one from the text boxes found by the
    detector. Their intersection-over-union (IoU) tells how well the embedded
    text layer accounts for the text visible on the page.

    Args:
        pdfium_page: The pypdfium2 page, source of the embedded text objects
            and of the page size.
        onnxtr_page: Detection result for the same page; each ``bbox`` holds
            two (x, y) corners in relative coordinates.
        page_threshold: IoU below which the page needs OCR.

    Returns:
        ``StrategyEnum.HI_RES`` when the IoU is below ``page_threshold``,
        ``StrategyEnum.FAST`` otherwise, including for a page where neither
        source found any text.
    """
    # assert (
    #     p_width == onnxtr_page.dimensions[1]
    #     and p_height == onnxtr_page.dimensions[0]
    # ), "Page dimensions do not match"

    # Keep the text objects of the page (PDFium object type 1 is
    # FPDF_PAGEOBJ_TEXT), as (left, bottom, right, top) positions.
    text_coords = [
        obj.get_pos() for obj in pdfium_page.get_objects() if obj.type == 1
    ]
    p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height())

    pdfium_canva = np.zeros((p_height, p_width))
    for coords in text_coords:
        # (left, bottom, right, top)
        # 0---l--------------R-> y
        # |
        # B (x0,y0)
        # |
        # T (x1,y1)
        # ^
        # x
        # The vertical axis is flipped (p_height - value) to index canvas rows.
        row0 = max(0, min(p_height, int(p_height - coords[3])))
        col0 = max(0, min(p_width, int(coords[0])))
        row1 = max(0, min(p_height, int(p_height - coords[1])))
        col1 = max(0, min(p_width, int(coords[2])))
        pdfium_canva[row0:row1, col0:col1] = 1

    onnxtr_canva = np.zeros((p_height, p_width))
    for block in onnxtr_page.bboxes:
        x0, y0 = block.bbox[0]
        x1, y1 = block.bbox[1]
        # Scale the relative corners to the page size and clamp to the canvas.
        x0 = max(0, min(int(x0 * p_width), p_width))
        y0 = max(0, min(int(y0 * p_height), p_height))
        x1 = max(0, min(int(x1 * p_width), p_width))
        y1 = max(0, min(int(y1 * p_height), p_height))
        onnxtr_canva[y0:y1, x0:x1] = 1

    intersection = np.logical_and(pdfium_canva, onnxtr_canva)
    union = np.logical_or(pdfium_canva, onnxtr_canva)
    union_area = np.sum(union)
    if union_area == 0:
        # No text from either source: nothing to OCR, and dividing would
        # produce NaN together with a RuntimeWarning.
        return StrategyEnum.FAST
    iou = np.sum(intersection) / union_area
    if iou < page_threshold:
        return StrategyEnum.HI_RES
    return StrategyEnum.FAST
def validate_input(
    file_path: Path | str | None = None,
    file: IO[bytes] | None = None,
    file_extension: str | FileExtension | None = None,
) -> FileExtension:
    """Check the (file_path | file) input pair and resolve the file extension.

    Exactly one of ``file_path`` and ``file`` must be given. With a path the
    extension is taken from its suffix; with a stream ``file_extension`` is
    mandatory and the stream is rewound to its start.

    Returns:
        The extension; a string extension is converted to ``FileExtension``.

    Raises:
        ValueError: When the input combination is invalid or the extension
            is not a valid ``FileExtension``.
    """
    if not (file_path or file):
        raise ValueError("Either file_path or file should be provided")
    if file_path and file:
        raise ValueError("Only one of file_path or file should be provided")

    from_path = bool(file_path) and file is None
    from_stream = (not from_path) and bool(file) and file_path is None
    if not (from_path or from_stream):
        raise ValueError("Either provider a file_path or file")

    if from_path:
        if isinstance(file_path, str):
            file_path = Path(file_path)
        file_extension = file_path.suffix
    else:
        if not file_extension:
            raise ValueError(
                "file_extension should be provided when given file argument"
            )
        file.seek(0)

    if not isinstance(file_extension, str):
        return file_extension
    try:
        file_extension = FileExtension(file_extension)
    except ValueError:
        raise ValueError(f"Unsupported file extension: {file_extension}")
    return file_extension
def _generate_crops(
    pages: list[np.ndarray],
    loc_preds: list[np.ndarray],
    channels_last: bool,
    assume_straight_pages: bool = False,
    assume_horizontal: bool = False,
) -> list[list[np.ndarray]]:
    """Cut the detected boxes out of every page.

    Pages and box arrays are paired positionally; only the first four columns
    of each box array are used. ``extract_crops`` is used when
    ``assume_straight_pages`` is set, ``extract_rcrops`` (which also receives
    ``assume_horizontal``) otherwise.

    Returns:
        One list of crops per page.
    """
    paired = zip(pages, loc_preds, strict=False)
    if assume_straight_pages:
        return [
            extract_crops(image, boxes[:, :4], channels_last=channels_last)
            for image, boxes in paired
        ]
    return [
        extract_rcrops(
            image,
            boxes[:, :4],
            channels_last=channels_last,
            assume_horizontal=assume_horizontal,
        )
        for image, boxes in paired
    ]
def _prepare_crops(
    pages: list[np.ndarray],
    loc_preds: list[np.ndarray],
    channels_last: bool,
    assume_straight_pages: bool = False,
    assume_horizontal: bool = False,
) -> tuple[list[list[np.ndarray]], list[np.ndarray]]:
    """Generate the crops of every page and drop the zero-sized ones.

    Returns:
        A pair ``(crops, loc_preds)`` where, for each page, the crops having a
        zero dimension have been removed together with their box rows.
    """
    all_crops = _generate_crops(
        pages, loc_preds, channels_last, assume_straight_pages, assume_horizontal
    )
    kept_crops: list[list[np.ndarray]] = []
    kept_boxes: list[np.ndarray] = []
    for page_crops, boxes in zip(all_crops, loc_preds, strict=False):
        # Avoid sending zero-sized crops
        keep_mask = [all(dim > 0 for dim in crop.shape) for crop in page_crops]
        kept_crops.append(
            [
                crop
                for crop, keep in zip(page_crops, keep_mask, strict=False)
                if keep
            ]
        )
        kept_boxes.append(boxes[keep_mask])
    return kept_crops, kept_boxes
def _process_predictions(
loc_preds: list[np.ndarray],
word_preds: list[tuple[str, float]],
crop_orientations: list[dict[str, Any]],
) -> tuple[list[np.ndarray], list[list[tuple[str, float]]], list[list[dict[str, Any]]]]:
text_preds = []
crop_orientation_preds = []
if len(loc_preds) > 0:
# Text & crop orientation predictions at page level
_idx = 0
for page_boxes in loc_preds:
text_preds.append(word_preds[_idx : _idx + page_boxes.shape[0]])
crop_orientation_preds.append(
crop_orientations[_idx : _idx + page_boxes.shape[0]]
)
_idx += page_boxes.shape[0]
return loc_preds, text_preds, crop_orientation_preds
def main():
    """Walk through the PDF parsing pipeline step by step on a sample file.

    Stages: rasterize the pages, run text detection, pick a strategy per page,
    choose a parser, then either call the unstructured parser or run text
    recognition and assemble the document. The result is printed.
    """
    # NOTE(review): the block nesting below was reconstructed from the logic;
    # confirm it against the original file.
    # NOTE(review): `get_providers` and `need_hi_res` are called below but are
    # neither defined nor imported in this file.
    file_path = Path("./tests/pdf/sample_pdf.pdf")
    strategy = StrategyEnum.AUTO
    device = DeviceEnum.COREML
    ocr_parser = DoctrParser()
    default_parser = UnstructuredParser(strategy=StrategyEnum.FAST)
    file_extension = validate_input(file_path=file_path)
    with open(file_path, "rb") as file:
        pdfium_document = pdfium.PdfDocument(file)
        # NOTE(review): `scale=2` is passed to to_pil() rather than render();
        # confirm against the pypdfium2 API.
        rasterized_pages: list[np.ndarray] = [
            np.array(page.render().to_pil(scale=2)) for page in pdfium_document
        ]
        ##-----------------------------------
        ## GET PAGES
        ##-----------------------------------
        mp_pages = []
        if strategy == StrategyEnum.FAST:
            # NOTE(review): on this path the file is converted again in the
            # PARSE FILE section below, overwriting this result.
            parsed_document = default_parser.convert(
                file=file,
                file_extension=file_extension,
            )
        else:
            text_det_config = TextDetConfig()
            general_options = rt.SessionOptions()
            providers = get_providers(device=device)
            engine_config = EngineConfig(
                session_options=general_options,
                providers=providers,
            )
            det_predictor = detection_predictor(
                arch=text_det_config.det_arch,
                assume_straight_pages=text_det_config.assume_straight_pages,
                preserve_aspect_ratio=text_det_config.preserve_aspect_ratio,
                symmetric_pad=text_det_config.symmetric_pad,
                batch_size=text_det_config.batch_size,
                load_in_8_bit=text_det_config.load_in_8_bit,
                engine_cfg=engine_config,
            )
            if any(page.ndim != 3 for page in rasterized_pages):
                raise ValueError(
                    "incorrect input shape: all pages are expected to be multi-channel 2D images."
                )
            # Orientation handling is not implemented: both stay None.
            orientations = None
            general_pages_orientations = None
            # Localize text elements
            loc_preds, out_maps = det_predictor(rasterized_pages, return_maps=True)
            # FIXME: For simplicity we do not care about page orientation rn
            # FIXME: similarly we don't care about straightening pages
            # Detach objectness scores from loc_preds
            loc_preds, objectness_scores = detach_scores(loc_preds)  # type: ignore[arg-type]
            # FIXME: Do not care about hooks here
            # # Apply hooks to loc_preds if any
            # for hook in hooks:
            #     loc_preds = hook(loc_preds)
            # Wrap the raw detections of each page into a TextDetection.
            all_pages_layouts = []
            for page_index, (page, loc_pred, objectness_score) in enumerate(
                zip(rasterized_pages, loc_preds, objectness_scores, strict=True)
            ):
                block_layouts = []
                for bbox, score in zip(loc_pred, objectness_score, strict=True):
                    block_layouts.append(
                        BlockLayout(
                            bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()),
                            objectness_score=score,
                            block_type=BlockType.TEXT,
                        )
                    )
                all_pages_layouts.append(
                    TextDetection(
                        bboxes=block_layouts,
                        page_index=page_index,
                        dimensions=page.shape[:2],
                        orientation=general_pages_orientations[page_index]
                        if general_pages_orientations is not None
                        else 0,
                    )
                )
            for pdfium_page, onnxtr_page, rasterized_page in zip(
                pdfium_document, all_pages_layouts, rasterized_pages, strict=True
            ):
                # NOTE(review): this rebinds the outer `strategy`; after the
                # loop it holds the last page's strategy instead of AUTO, so
                # the need_hi_res() branch below looks unreachable.
                strategy = get_strategy_page(pdfium_page, onnxtr_page)
                mp_pages.append(
                    Page(
                        strategy=strategy,
                        text_detections=onnxtr_page,
                        rasterized=rasterized_page,
                        page_size=PageDimension(
                            width=pdfium_page.get_width(),
                            height=pdfium_page.get_height(),
                        ),
                        page_index=onnxtr_page.page_index,
                        pdfium_elements=pdfium_page,
                    )
                )
        ##-----------------------------------
        ## GET PARSER BASED ON CHOSEN STRATEGY
        ##-----------------------------------
        if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:
            parser = default_parser
        elif strategy == StrategyEnum.HI_RES:
            parser = ocr_parser
        else:
            if need_hi_res(mp_pages, AutoStrategyConfig()):
                parser = ocr_parser
            else:
                parser = default_parser
        ##-----------------------------------
        ## PARSE FILE
        ##-----------------------------------
        if isinstance(parser, UnstructuredParser):
            parsed_document = parser.convert(
                file=file,
                pages=mp_pages,
                file_extension=file_extension,
            )
        else:
            # OCR path: recognition is run step by step here instead of
            # calling ocr_parser.
            origin_page_shapes: List[Tuple[int, int]] = [
                (page.shape[0], page.shape[1]) for page in rasterized_pages
            ]
            reco_config = TextRecoConfig()
            # The recognizer reuses the detection config's load_in_8_bit flag
            # and the engine configuration built for detection.
            reco_predictor = recognition_predictor(
                arch=reco_config.reco_arch,
                batch_size=reco_config.batch_size,
                load_in_8_bit=text_det_config.load_in_8_bit,
                engine_cfg=engine_config,
            )
            # Crop images
            crops, loc_preds = _prepare_crops(
                rasterized_pages,
                loc_preds,  # type: ignore[arg-type]
                channels_last=True,
                assume_straight_pages=True,  # FIXME: To change
                assume_horizontal=True,  # FIXME: To change
            )
            # Rectify crop orientation and get crop orientation predictions
            crop_orientations: Any = []
            # Identify character sequences
            word_preds = reco_predictor(
                [crop for page_crops in crops for crop in page_crops]
            )
            # No orientation model was run: use a neutral entry per word.
            if not crop_orientations:
                crop_orientations = [
                    {"value": 0, "confidence": None} for _ in word_preds
                ]
            boxes, text_preds, crop_orientations = _process_predictions(
                loc_preds, word_preds, crop_orientations
            )
            doc_builder = DocumentBuilder()
            parsed_document = doc_builder(
                rasterized_pages,
                boxes,
                objectness_scores,
                text_preds,
                origin_page_shapes,
                crop_orientations,
                orientations,
                None,
            )
    print(parsed_document)


if __name__ == "__main__":
    main()
================================================
FILE: libs/megaparse/src/megaparse/exceptions/base.py
================================================
class ParsingException(Exception):
    """Signals a failure of the parsing process.

    The text is available both through ``str(exc)`` and the ``message``
    attribute.
    """

    def __init__(self, message="An error occurred during parsing"):
        super().__init__(message)
        self.message = message
================================================
FILE: libs/megaparse/src/megaparse/formatter/base.py
================================================
from abc import ABC
from pathlib import Path
from typing import Union
from langchain_core.language_models.chat_models import BaseChatModel
from megaparse_sdk.schema.document import Document
class BaseFormatter(ABC):
    """
    Base class for formatters that post-process a parsed ``Document``.

    Attributes
    ----------
    model : BaseChatModel | None
        Optional chat model made available to subclasses.

    Methods
    -------
    format(document, file_path=None) -> Document | str
        Synchronous formatting; raises NotImplementedError here.
    aformat(document, file_path=None) -> Document | str
        Asynchronous formatting; raises NotImplementedError here.
    """

    def __init__(self, model: BaseChatModel | None = None):
        self.model = model

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Union[Document, str]:
        # Subclasses are expected to override this method.
        raise NotImplementedError("Subclasses should implement this method")

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Union[Document, str]:
        # Subclasses are expected to override this method.
        raise NotImplementedError("Subclasses should implement this method")
================================================
FILE: libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py
================================================
from pathlib import Path
from langchain_core.language_models.chat_models import BaseChatModel
from megaparse.formatter.base import BaseFormatter
from megaparse_sdk.schema.document import Document
from pydantic import BaseModel
class StructuredFormatter(BaseFormatter):
    """Base class for formatters producing output shaped by a pydantic model.

    Both ``format`` and ``aformat`` raise NotImplementedError here; concrete
    subclasses provide them.
    """

    def __init__(self, model: BaseChatModel, output_model: type[BaseModel]):
        super().__init__(model)
        # Pydantic model class describing the expected structured output.
        self.output_model = output_model

    async def aformat(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:  # FIXME: Return a structured output of type BaseModel ?
        raise NotImplementedError()

    def format(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:  # FIXME: Return a structured output of type BaseModel ?
        raise NotImplementedError()
================================================
FILE: libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py
================================================
from pathlib import Path
from megaparse.formatter.structured_formatter import StructuredFormatter
from megaparse_sdk.schema.document import Document
from pydantic import BaseModel
class CustomStructuredFormatter(StructuredFormatter):
    """Structure a document's text into ``output_model`` with a chat model."""

    def _build_request(self, document: Document):
        """Validate the formatter state and prepare the model call.

        Returns:
            A pair ``(structured_model, prompt)``: the chat model bound to
            ``output_model`` and the prompt embedding the document text.

        Raises:
            ValueError: If no model is set, the document text is empty, or no
                output model is set.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the CustomStructuredFormatter.")
        print("Formatting text using CustomStructuredFormatter...")
        text = str(document)
        # The previous check (`len(text) < 0`) could never be true, so empty
        # documents were sent to the model anyway.
        if not text.strip():
            raise ValueError(
                "A non empty text is needed to format text using CustomStructuredFormatter."
            )
        if not self.output_model:
            raise ValueError(
                "An output model is needed to structure text using CustomStructuredFormatter."
            )
        structured_model = self.model.with_structured_output(self.output_model)  # type: ignore
        return structured_model, f"Parse the text in a structured format: {text}"

    def format(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:
        """
        Structure the file using an AI language model.

        Args:
            document: The parsed document; its string form is sent to the model.
            file_path: Unused by this formatter.

        Returns:
            The structured output serialized as a JSON string.

        Raises:
            ValueError: If the model, the document text or the output model is missing.
        """
        structured_model, prompt = self._build_request(document)
        formatted_text = structured_model.invoke(prompt)
        assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel."
        return formatted_text.model_dump_json()

    async def aformat(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:
        """
        Asynchronously structure the file using an AI language model.

        Args:
            document: The parsed document; its string form is sent to the model.
            file_path: Unused by this formatter.

        Returns:
            The structured output serialized as a JSON string.

        Raises:
            ValueError: If the model, the document text or the output model is missing.
        """
        structured_model, prompt = self._build_request(document)
        formatted_text = await structured_model.ainvoke(prompt)
        assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel."
        return formatted_text.model_dump_json()
================================================
FILE: libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py
================================================
from pathlib import Path
from megaparse.formatter.base import BaseFormatter
from megaparse_sdk.schema.document import Document
class TableFormatter(BaseFormatter):
    """Base class for table formatters: narrows the return type to ``Document``.

    Both methods raise NotImplementedError here; subclasses provide them.
    """

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        raise NotImplementedError("Subclasses should implement this method")

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        raise NotImplementedError("Subclasses should implement this method")
================================================
FILE: libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py
================================================
import re
import warnings
from pathlib import Path
from typing import Optional
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate
from megaparse.formatter.table_formatter import TableFormatter
from megaparse_sdk.schema.document import Document, TableBlock
class SimpleMDTableFormatter(TableFormatter):
    """
    Rewrites the table blocks of a document as Markdown tables through an LLM.
    """

    TABLE_MARKER_START = "[TABLE]"
    TABLE_MARKER_END = "[/TABLE]"
    CODE_BLOCK_PATTERN = r"^```.*$\n?"

    def __init__(self, model: Optional[BaseChatModel] = None):
        super().__init__(model)

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """Async entry point: warns, then delegates to the synchronous ``format``."""
        warnings.warn(
            "The SimpleMDTableFormatter is a sync formatter, please use the sync format method",
            UserWarning,
            stacklevel=2,
        )
        return self.format(document=document, file_path=file_path)

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """
        Reformat every ``TableBlock`` of the document, in place.

        Each table is sent to the model together with the text of the
        previously formatted table (empty for the first one).

        Args:
            document: The document whose content blocks are processed.
            file_path: Unused by this formatter.

        Returns:
            The same document, with its table blocks rewritten.

        Raises:
            ValueError: If no model is set.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the SimpleMDTableFormatter.")
        print("Formatting tables using SimpleMDTableFormatter...")
        last_table_text = ""
        rewritten_blocks = []
        for block in document.content:
            if isinstance(block, TableBlock):
                block = self.format_table(block, last_table_text)
                last_table_text = block.text
            rewritten_blocks.append(block)
        document.content = rewritten_blocks
        return document

    def format_table(
        self, table_element: TableBlock, previous_table: str
    ) -> TableBlock:
        """
        Turn one table block into Markdown using the chat model.

        Lines starting with a code fence are removed from the model answer and
        the result is wrapped between the table markers.

        Args:
            table_element: The table block to rewrite; its ``text`` is replaced.
            previous_table: Text of the previously formatted table.

        Returns:
            The same table block, with its text updated.
        """
        assert self.model is not None, "Model is not set."
        instructions = (
            "You are an expert in markdown tables. Transform the following parsed table into a "
            "markdown table. Provide just the table in pure markdown, nothing else.\n"
            "<TEXT>\n{text}\n</TEXT>\n"
            "<PREVIOUS_TABLE>\n{previous_table}\n</PREVIOUS_TABLE>"
        )
        prompt = ChatPromptTemplate.from_messages([("human", instructions)])
        answer = (prompt | self.model).invoke(
            {
                "text": table_element.text,
                "previous_table": previous_table,
            }
        )
        without_fences = re.sub(
            self.CODE_BLOCK_PATTERN, "", str(answer.content), flags=re.MULTILINE
        )
        table_element.text = (
            f"{self.TABLE_MARKER_START}\n"
            f"{without_fences}\n"
            f"{self.TABLE_MARKER_END}\n\n"
        )
        return table_element
================================================
FILE: libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py
================================================
import base64
from io import BytesIO
from pathlib import Path
from typing import List, Optional
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage
from megaparse.formatter.table_formatter import TableFormatter
from megaparse_sdk.schema.document import Document, TableBlock
from pdf2image import convert_from_path
from PIL import Image
# Instruction text sent to the vision model as the text part of the message,
# next to the cropped table image (see `vision_extract` / `avision_extract`).
TABLE_OCR_PROMPT = """
You are tasked with transcribing the content of a table into markdown format. Your goal is to create a well-structured, readable markdown table that accurately represents the original content while adding appropriate formatting.
Answer uniquely with the parsed table. Do not include the fenced code blocks backticks.
"""
class VisionMDTableFormatter(TableFormatter):
    """
    Table formatter that crops each table out of the rasterized PDF page, sends
    the crop to a vision-capable chat model and stores the returned Markdown as
    the table block's text.
    """

    TABLE_MARKER_START = "[TABLE]"
    TABLE_MARKER_END = "[/TABLE]"
    CODE_BLOCK_PATTERN = r"^```.*$\n?"
    # Encoding used for the cropped table image. The data URL sent to the model
    # is derived from it so the declared media type always matches the bytes.
    IMAGE_FORMAT = "PNG"

    def __init__(self, model: Optional[BaseChatModel] = None):
        super().__init__(model)

    def _crop_table_image(self, table_element: TableBlock, file_path: str) -> str:
        """
        Crop the table region out of its PDF page and return it base64-encoded.

        Args:
            table_element: Table block providing `bbox` and `page_range`.
            file_path: Path of the PDF the block was extracted from.
        Returns:
            The cropped image, encoded as `IMAGE_FORMAT` and then base64.
        """
        assert table_element.bbox, "Table element must have coordinates."
        bbox = table_element.bbox
        page_number = table_element.page_range[0]
        # Page numbers are treated as 1-based: 0 is rejected here.
        assert page_number, "Table element must have a page number."

        # Rasterize only the page holding the table rather than the whole PDF
        # for every single table.
        pages = convert_from_path(
            file_path, first_page=page_number, last_page=page_number
        )

        # NOTE(review): the crop box is passed to PIL as-is, so bbox is assumed
        # to be in pixels of the rasterized page. The layout detector in this
        # code base stores coordinates divided by image width/height -- confirm
        # which convention TableBlock.bbox follows.
        box = (
            bbox.top_left.x,
            bbox.top_left.y,
            bbox.bottom_right.x,
            bbox.bottom_right.y,
        )
        table_image = pages[0].crop(box)
        return self.process_file([table_image], image_format=self.IMAGE_FORMAT)[0]

    def _wrap_table(self, formatted_table: str) -> str:
        """Surround the model output with the table start/end markers."""
        return (
            f"{self.TABLE_MARKER_START}\n"
            f"{formatted_table}\n"
            f"{self.TABLE_MARKER_END}\n\n"
        )

    def _build_message(self, table_image: str) -> HumanMessage:
        """Build the prompt + image message sent to the vision model."""
        media_type = f"image/{self.IMAGE_FORMAT.lower()}"
        return HumanMessage(
            content=[
                {"type": "text", "text": TABLE_OCR_PROMPT},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{media_type};base64,{table_image}"},
                },
            ],
        )

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """
        Asynchronously format every TableBlock of the document.

        Args:
            document: The document whose `content` blocks are processed.
            file_path: Path to the source PDF; required to crop the tables.
        Returns:
            The same document object with `content` replaced by the new list.
        Raises:
            ValueError: If no model was provided.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the VisionMDTableFormatter.")
        print("Formatting tables using VisionMDTableFormatter (async)...")
        assert (
            file_path
        ), "A file path is needed to format tables using VisionMDTableFormatter."
        if not isinstance(file_path, str):
            file_path = str(file_path)

        formatted_elements = []
        for block in document.content:
            if isinstance(block, TableBlock):
                block = await self.aformat_table(block, file_path)
            formatted_elements.append(block)
        document.content = formatted_elements
        return document

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """
        Synchronously format every TableBlock of the document.

        Args:
            document: The document whose `content` blocks are processed.
            file_path: Path to the source PDF; required to crop the tables.
        Returns:
            The same document object with `content` replaced by the new list.
        Raises:
            ValueError: If no model was provided.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the VisionMDTableFormatter.")
        # The original message said "(async)" here although this is the sync path.
        print("Formatting tables using VisionMDTableFormatter (sync)...")
        assert (
            file_path
        ), "A file path is needed to format tables using VisionMDTableFormatter."
        if not isinstance(file_path, str):
            file_path = str(file_path)

        formatted_elements = []
        for block in document.content:
            if isinstance(block, TableBlock):
                block = self.format_table(block, file_path)
            formatted_elements.append(block)
        document.content = formatted_elements
        return document

    async def aformat_table(
        self, table_element: TableBlock, file_path: str
    ) -> TableBlock:
        """
        Asynchronously replace a table block's text with the vision model's
        Markdown transcription of its cropped image.
        """
        table_image64 = self._crop_table_image(table_element, file_path)
        formatted_table = await self.avision_extract(table_image64)
        table_element.text = self._wrap_table(formatted_table)
        return table_element

    def format_table(self, table_element: TableBlock, file_path: str) -> TableBlock:
        """
        Synchronously replace a table block's text with the vision model's
        Markdown transcription of its cropped image.
        """
        table_image64 = self._crop_table_image(table_element, file_path)
        formatted_table = self.vision_extract(table_image64)
        table_element.text = self._wrap_table(formatted_table)
        return table_element

    def process_file(self, images: List[Image.Image], image_format="PNG") -> List[str]:
        """
        Convert a list of PIL images to base64 encoded images.

        Args:
            images: Images to encode.
            image_format: Format name handed to `Image.save`.
        Returns:
            One base64 string per input image, in the same order.
        Raises:
            ValueError: If saving or encoding any image fails; the original
                exception is attached as the cause.
        """
        try:
            images_base64 = []
            for image in images:
                buffered = BytesIO()
                image.save(buffered, format=image_format)
                images_base64.append(
                    base64.b64encode(buffered.getvalue()).decode("utf-8")
                )
            return images_base64
        except Exception as e:
            raise ValueError(f"Error processing PDF file: {str(e)}") from e

    async def avision_extract(self, table_image: str) -> str:
        """
        Asynchronously send image data to the language model for processing.

        Args:
            table_image: Base64 string of the cropped table image.
        Returns:
            The model response content, converted to `str`.
        """
        assert (
            self.model
        ), "A model is needed to use the VisionMDTableFormatter (async)."
        response = await self.model.ainvoke([self._build_message(table_image)])
        return str(response.content)

    def vision_extract(self, table_image: str) -> str:
        """
        Synchronously send image data to the language model for processing.

        Args:
            table_image: Base64 string of the cropped table image.
        Returns:
            The model response content, converted to `str`.
        """
        assert self.model, "A model is needed to use the VisionMDTableFormatter (sync)."
        response = self.model.invoke([self._build_message(table_image)])
        return str(response.content)
================================================
FILE: libs/megaparse/src/megaparse/layout_detection/layout_detector.py
================================================
import logging
import os
import pathlib
import uuid
from typing import Any, List
import numpy as np
import onnxruntime as rt
from megaparse.configs.auto import DeviceEnum
from megaparse.layout_detection.output import LayoutDetectionOutput
from megaparse.utils.onnx import get_providers
from megaparse_sdk.schema.document import BBOX, Point2D
from onnxtr.models.engine import EngineConfig
from onnxtr.models.preprocessor import PreProcessor
from PIL import Image, ImageDraw
from PIL.Image import Image as PILImage
logger = logging.getLogger("megaparse")
# Human-readable name for each integer class index predicted by the layout
# model. Used by `LayoutDetector._save_layout` to caption the drawn boxes.
# NOTE(review): the names look like the DocLayNet categories, matching the
# "yolov10s-doclaynet" model name -- confirm against the model's training labels.
LABEL_MAP = {
0: "Caption",
1: "Footnote",
2: "Formula",
3: "List-item",
4: "Page-footer",
5: "Page-header",
6: "Picture",
7: "Section-header",
8: "Table",
9: "Text",
10: "Title",
}
# Per-model settings, keyed by the `model_name` accepted by `LayoutDetector`.
default_cfg: dict[str, dict[str, Any]] = {
"yolov10s-doclaynet": {
# NOTE(review): "mean" and "std" are not read by the code visible in this
# module -- presumably normalisation constants; confirm before relying on them.
"mean": (0.5, 0.5, 0.5),
"std": (1.0, 1.0, 1.0),
# No 8-bit weights are registered: asking for `load_in_8_bit=True` trips the
# "Model path not found" assertion in `LayoutDetector.__init__`.
"url_8_bit": None,
# Unpacked by `LayoutDetector.__init__` as (batch_size, required_width, required_height).
"input_shape": (1, 1024, 1024),
# ONNX weights located in the `models` directory next to this module.
"url": pathlib.Path(__file__).parent.joinpath("models/yolov10s-doclaynet.onnx"),
}
}
class LayoutDetector:
    """
    Runs an ONNX layout-detection model on page images and returns, per page,
    the detected regions as `LayoutDetectionOutput` objects whose bounding
    boxes are expressed as fractions of the page width and height.
    """

    def __init__(
        self,
        device: DeviceEnum = DeviceEnum.CPU,
        threshold: float = 0.1,
        preserve_aspect_ratio: bool = True,
        model_name: str = "yolov10s-doclaynet",
        load_in_8_bit: bool = False,
    ):
        """
        Args:
            device: Device used to select the ONNX Runtime execution providers.
            threshold: Detections scoring below this value are discarded.
            preserve_aspect_ratio: Forwarded to the pre-processor and used when
                mapping boxes back to the original page.
            model_name: Key into `default_cfg`.
            load_in_8_bit: Use the model's "url_8_bit" entry instead of "url".
        """
        model_config = default_cfg[model_name]
        self.device = device
        general_options = rt.SessionOptions()
        providers = get_providers(self.device)
        self.threshold = threshold
        self.batch_size, self.required_width, self.required_height = model_config[
            "input_shape"
        ]
        self.preserve_aspect_ratio = preserve_aspect_ratio
        self.pre_processor = PreProcessor(
            output_size=(self.required_width, self.required_height),
            batch_size=self.batch_size,
            preserve_aspect_ratio=self.preserve_aspect_ratio,
        )

        model_path = (
            model_config.get("url_8_bit") if load_in_8_bit else model_config.get("url")
        )
        assert model_path, f"Model path not found for {model_name}"

        # InferenceSession has no `engine_config` parameter: the session
        # options and providers only take effect when passed through
        # `sess_options` / `providers`. The previous call wrapped them in an
        # EngineConfig, so the selected device was never applied.
        self.model = rt.InferenceSession(
            str(model_path),
            sess_options=general_options,
            providers=providers,
        )

    def __call__(
        self, img_pages: list[PILImage], output_dir: str | None = None
    ) -> List[List[LayoutDetectionOutput]]:
        """
        Detect layout regions on every page image.

        Args:
            img_pages: Page images; each must convert to a 3-D array.
            output_dir: When set, annotated page images are written there.
        Returns:
            One list of detections per input page, in input order.
        Raises:
            ValueError: If any page is not a 3-D array once converted.
        """
        pages = [np.array(img) for img in img_pages]

        # Dimension check
        if any(page.ndim != 3 for page in pages):
            raise ValueError(
                "incorrect input shape: all pages are expected to be multi-channel 2D images."
            )

        processed_batches = self.pre_processor(pages)
        processed_batches = np.array(processed_batches)
        # Drop the size-1 axis at position 1, leaving a 4-D array.
        # NOTE(review): presumably the per-batch axis, given the configured
        # batch size of 1 -- confirm against the pre-processor output.
        processed_batches = processed_batches.squeeze(1)
        # Move the last axis to position 1 (presumably NHWC -> NCHW).
        processed_batches = processed_batches.transpose(0, 3, 1, 2)

        # One model run per page; each run is fed a single-item batch.
        pred_batches = np.array(
            [
                self.model.run(None, {"images": np.expand_dims(batch, axis=0)})
                for batch in processed_batches
            ]
        )
        pred_batches = np.concatenate(pred_batches, axis=0)
        pred_batches = pred_batches.squeeze(1)

        processed_preds = []
        for page, pred in zip(pages, pred_batches, strict=True):
            img_h, img_w = page.shape[:2]
            processed_preds.append(self.extract_bboxes_from_page(pred, img_h, img_w))

        if output_dir:
            self._save_layout(pages=pages, preds=processed_preds, output_dir=output_dir)

        return processed_preds

    def extract_bboxes_from_page(
        self, preds: np.ndarray, img_h: int, img_w: int
    ) -> List[LayoutDetectionOutput]:
        """
        Turn the raw predictions of one page into filtered detections.

        Args:
            preds: Array of shape (300, 6); each row is unpacked as
                (x1, y1, x2, y2, score, class index).
            img_h: Height of the original page image in pixels.
            img_w: Width of the original page image in pixels.
        Returns:
            Detections scoring at least `self.threshold`, reduced by `topK`,
            with box coordinates divided by the page width/height.
        """
        results = []
        assert preds.shape == (300, 6)

        scale_h = img_h / self.required_height
        scale_w = img_w / self.required_width

        # Loop-invariant correction applied when the pre-processor kept the
        # page aspect ratio: x is stretched on portrait pages, y is shrunk on
        # landscape pages.
        ratio = img_h / img_w
        x_factor = ratio if ratio > 1 else 1
        y_divisor = ratio if ratio < 1 else 1

        for det in preds:
            x1, y1, x2, y2, score, cls_idx = det
            if score < self.threshold:
                continue

            # Rescale the bounding box coordinates to the original dimensions
            x1 *= scale_w
            x2 *= scale_w
            y1 *= scale_h
            y2 *= scale_h

            if self.preserve_aspect_ratio:
                x1 = x1 * x_factor
                x2 = x2 * x_factor
                y1 = y1 / y_divisor
                y2 = y2 / y_divisor

            # Clamp to the page before normalising.
            x1 = max(0, min(x1, img_w))
            x2 = max(0, min(x2, img_w))
            y1 = max(0, min(y1, img_h))
            y2 = max(0, min(y2, img_h))

            results.append(
                LayoutDetectionOutput(
                    bbox_id=uuid.uuid4(),
                    bbox=BBOX(
                        top_left=Point2D(x=x1 / img_w, y=y1 / img_h),
                        bottom_right=Point2D(x=x2 / img_w, y=y2 / img_h),
                    ),
                    prob=score,
                    label=int(cls_idx),
                )
            )

        # Only the top-K filter is applied; `nms` exists but is not called here.
        return self.topK(results)

    def nms(
        self,
        raw_bboxes: List[LayoutDetectionOutput],
        iou_threshold: float = 0.9,  # FIXME: thresh Configurable in constructor
    ) -> List[LayoutDetectionOutput]:
        """
        Non-Maximum Suppression (NMS) algorithm.

        Args:
            raw_bboxes (list): Detections to filter. The list is sorted and
                reordered in place as a side effect.
            iou_threshold (float): A box is dropped when its IoU with an
                already kept, higher-scoring box exceeds this value.
        Returns:
            list: The kept detections, ordered by decreasing probability.
        """
        raw_bboxes.sort(key=lambda x: x.prob, reverse=True)

        # Kept boxes are compacted at the front: positions [0, current_index).
        current_index = 0
        for index in range(len(raw_bboxes)):
            drop = any(
                raw_bboxes[index].bbox.iou(raw_bboxes[prev_index].bbox) > iou_threshold
                for prev_index in range(current_index)
            )
            if not drop:
                raw_bboxes[current_index], raw_bboxes[index] = (
                    raw_bboxes[index],
                    raw_bboxes[current_index],
                )
                current_index += 1

        return raw_bboxes[:current_index]

    def topK(
        self, detectResult: List[LayoutDetectionOutput], topK: int = 50
    ) -> List[LayoutDetectionOutput]:
        """
        Keep at most `topK` detections.

        Args:
            detectResult: Detections to filter.
            topK: Maximum number of detections to keep.
        Returns:
            `detectResult` itself (order untouched) when it already holds at
            most `topK` items, otherwise a new list with the `topK`
            highest-probability detections in decreasing order.
        """
        if len(detectResult) <= topK:
            return detectResult
        ranked = sorted(detectResult, key=lambda x: x.prob, reverse=True)
        # max() keeps a negative topK returning an empty list, as before.
        return ranked[: max(topK, 0)]

    def _save_layout(
        self,
        pages: list[np.ndarray],
        preds: list[list[LayoutDetectionOutput]],
        output_dir: str,
    ):
        """
        Write one annotated PNG per page (`page_<index>.png`) into `output_dir`,
        drawing each detection's box with its label and confidence.
        """
        os.makedirs(output_dir, exist_ok=True)
        for i, (page, layout) in enumerate(zip(pages, preds, strict=True)):
            image = Image.fromarray(page)
            draw = ImageDraw.Draw(image)
            img_w, img_h = image.size

            for detection in layout:
                # Stored coordinates are fractions of the page; scale to pixels.
                x_min, y_min, x_max, y_max = detection.bbox.to_numpy()
                bbox = x_min * img_w, y_min * img_h, x_max * img_w, y_max * img_h
                confidence = detection.prob
                label = LABEL_MAP.get(detection.label, "Unknown")

                draw.rectangle(bbox, outline="red", width=2)
                draw.text(
                    (bbox[0], bbox[1]),
                    f"{label} ({confidence:.2f})",
                    fill="red",
                )

            image.save(os.path.join(output_dir, f"page_{i}.png"))
================================================
FILE: libs/megaparse/src/megaparse/layout_detection/models/yolov10s-doclaynet.onnx
================================================
[File too large to display: 27.9 MB]
================================================
FILE: libs/megaparse/src/megaparse/layout_detection/output.py
================================================
from uuid import UUID
from megaparse_sdk.schema.document import BBOX
from pydantic import BaseModel
# One region detected by the layout model on a page.
class LayoutDetectionOutput(BaseModel):
# Identifier of the detection; the layout detector assigns a fresh uuid4.
bbox_id: UUID
# Bounding box; the layout detector stores coordinates divided by the page
# width/height.
bbox: BBOX
# Detection score reported by the model for this region.
prob: float
# Integer class index predicted by the model (see LABEL_MAP in layout_detector.py).
label: int
================================================
FILE: libs/megaparse/src/megaparse/megaparse.py
================================================
import logging
import warnings
from pathlib import Path
from typing import IO, BinaryIO, List
import pypdfium2 as pdfium
from megaparse_sdk.schema import document
from megaparse_sdk.schema.extensions import FileExtension
from megaparse_sdk.schema.parser_config import StrategyEnum
from megaparse.configs.auto import MegaParseConfig
from megaparse.exceptions.base import ParsingException
from megaparse.formatter.base import BaseFormatter
from megaparse.layout_detection.layout_detector import LayoutDetector
from megaparse.models.page import Page, PageDimension
from megaparse.parser.doctr_parser import DoctrParser
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse.utils.strategy import (
determine_global_strategy,
get_page_strategy,
)
logger = logging.getLogger("megaparse")
class MegaParse:
    """
    Entry point that parses a file into text.

    PDFs (unless the FAST strategy is requested) are rasterized, run through
    text detection and layout detection, then parsed either with the doctr
    pipeline or with the unstructured parser depending on the strategy computed
    for the document. Every other input goes straight to the unstructured parser.
    """

    def __init__(
        self,
        formatters: List[BaseFormatter] | None = None,
        # NOTE(review): this default instance is created once at definition
        # time and shared by every MegaParse built without an explicit config.
        config: MegaParseConfig = MegaParseConfig(),
        unstructured_strategy: StrategyEnum = StrategyEnum.AUTO,
    ) -> None:
        """
        Args:
            formatters: Optional formatters applied, in order, to the parsed document.
            config: Parsing configuration (device, doctr settings, thresholds).
            unstructured_strategy: Strategy handed to the unstructured parser.
        """
        self.config = config
        self.formatters = formatters
        self.doctr_parser = DoctrParser(
            text_det_config=self.config.doctr_config.text_det_config,
            text_reco_config=self.config.doctr_config.text_reco_config,
            device=self.config.device,
            straighten_pages=self.config.doctr_config.straighten_pages,
            detect_orientation=self.config.doctr_config.detect_orientation,
            detect_language=self.config.doctr_config.detect_language,
        )
        # NOTE(review): the layout detector is built with its default device,
        # not `config.device` -- confirm whether that is intended.
        self.layout_model = LayoutDetector()
        # Built once; a throwaway default-constructed parser used to be
        # created first and immediately overwritten.
        self.unstructured_parser = UnstructuredParser(unstructured_strategy)

    def validate_input(
        self,
        file_path: Path | str | None = None,
        file: IO[bytes] | None = None,
        file_extension: str | FileExtension | None = None,
    ) -> FileExtension:
        """
        Check that exactly one input source is given and resolve its extension.

        Args:
            file_path: Path of the file to parse; its suffix gives the extension.
            file: Binary stream to parse; rewound to position 0 when used.
            file_extension: Required together with `file`; ignored with `file_path`.
        Returns:
            The resolved `FileExtension`.
        Raises:
            ValueError: If neither or both sources are given, if `file` comes
                without an extension, or if the extension is not supported.
        """
        if not (file_path or file):
            raise ValueError("Either file_path or file should be provided")
        if file_path and file:
            raise ValueError("Only one of file_path or file should be provided")

        if file_path and file is None:
            if isinstance(file_path, str):
                file_path = Path(file_path)
            file_extension = file_path.suffix
        elif file and file_path is None:
            if not file_extension:
                raise ValueError(
                    "file_extension should be provided when given file argument"
                )
            file.seek(0)
        else:
            raise ValueError("Either provider a file_path or file")

        if isinstance(file_extension, str):
            try:
                file_extension = FileExtension(file_extension)
            except ValueError:
                raise ValueError(f"Unsupported file extension: {file_extension}")
        return file_extension

    def extract_page_strategies(
        self, pdfium_document: pdfium.PdfDocument, rast_scale: int = 2
    ) -> List[Page]:
        """
        Rasterize every page, run text detection and pick a strategy per page.

        Args:
            pdfium_document: The opened PDF.
            rast_scale: Scale factor used when rendering each page.
        Returns:
            One `Page` per PDF page, in document order, with `text_detections`
            and `strategy` filled in.
        """
        pages = []
        for i, pdfium_page in enumerate(pdfium_document):
            rasterized_page = pdfium_page.render(scale=rast_scale)
            expected_width = pdfium_page.get_width() * rast_scale
            assert abs(expected_width - rasterized_page.width) <= 1, (
                f"Widths do not match within a margin of 1: "
                f"{expected_width} != {rasterized_page.width}"
            )
            # Exactly one entry per page: the previous version repeated this
            # append, so every page was detected and recognised twice.
            pages.append(
                Page(
                    strategy=StrategyEnum.AUTO,
                    text_detections=None,
                    rasterized=rasterized_page.to_pil(),
                    page_size=PageDimension(
                        width=pdfium_page.get_width() * rast_scale,
                        height=pdfium_page.get_height() * rast_scale,
                    ),
                    page_index=i,
                    pdfium_elements=pdfium_page,
                )
            )

        # Get text detection for each page
        pages = self.doctr_parser.get_text_detections(pages)

        # Get strategy per page
        for page in pages:
            page.strategy = get_page_strategy(
                page.pdfium_elements,
                page.text_detections,
                threshold=self.config.auto_config.page_threshold,
            )
        return pages

    def load(
        self,
        file_path: Path | str | None = None,
        file: BinaryIO | None = None,
        file_extension: str | FileExtension = "",
        strategy: StrategyEnum = StrategyEnum.AUTO,
    ) -> str:
        """
        Parse a file synchronously and return the result as a string.

        Args:
            file_path: Path of the file to parse (exclusive with `file`).
            file: Binary stream to parse (exclusive with `file_path`).
            file_extension: Extension of the stream; required with `file`.
            strategy: Requested strategy; FAST skips the PDF pipeline.
        Returns:
            The parsed (and, if formatters are set, formatted) document as `str`.
        Raises:
            ValueError: If the inputs are invalid (see `validate_input`).
            ParsingException: If anything fails inside the PDF pipeline.
        """
        file_extension = self.validate_input(
            file=file, file_path=file_path, file_extension=file_extension
        )
        if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:
            self.unstructured_parser.strategy = strategy
            return str(
                self.unstructured_parser.convert(
                    file_path=file_path, file=file, file_extension=file_extension
                )
            )

        opened_file = None
        try:
            if file_path:
                opened_file = open(file_path, "rb")
                file = opened_file

            assert file is not None, "No File provided"
            pdfium_document = pdfium.PdfDocument(file)

            # Rasterize pages, run text detection and derive the strategy
            pages = self.extract_page_strategies(pdfium_document)
            strategy = determine_global_strategy(
                pages, self.config.auto_config.document_threshold
            )

            # Run the layout model on the rasterized pages
            assert all(p.rasterized for p in pages)
            layout_result = self.layout_model([p.rasterized for p in pages])  # type: ignore

            if strategy == StrategyEnum.HI_RES:
                logger.debug("Using doctr for text recognition")
                parsed_document = self.doctr_parser.get_text_recognition(
                    pages, layout_result
                )
            else:
                logger.debug("Using Unstructured Parser")
                self.unstructured_parser.strategy = StrategyEnum.FAST
                parsed_document = self.unstructured_parser.convert(
                    file=file, file_extension=file_extension
                )

            # additional attributes
            parsed_document.file_name = str(file_path) if file_path else None
            parsed_document.metadata = pdfium_document.get_metadata_dict()

            # Format -> TODO: should be generic
            # NOTE(review): `file_path` is not forwarded to the formatters,
            # although the table formatters accept one -- confirm.
            if self.formatters:
                for formatter in self.formatters:
                    if isinstance(parsed_document, str):
                        warnings.warn(
                            f"The last step returned a string, the {formatter.__class__} and following will not be applied",
                            stacklevel=2,
                        )
                        break
                    parsed_document = formatter.format(parsed_document)

            if not isinstance(parsed_document, str):
                return str(parsed_document)
            return parsed_document
        except Exception as e:
            logger.exception(f"Error occured while parsing {file}: {e}")
            # Chain explicitly so the original traceback is kept as the cause.
            raise ParsingException(
                f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}"
            ) from e
        finally:
            if opened_file:
                opened_file.close()

    async def aload(
        self,
        file_path: Path | str | None = None,
        file: BinaryIO | None = None,
        file_extension: str | FileExtension = "",
        strategy: StrategyEnum = StrategyEnum.AUTO,
    ) -> str | document.Document:
        """
        Parse a file, awaiting the unstructured parser and the formatters.

        Args:
            file_path: Path of the file to parse (exclusive with `file`).
            file: Binary stream to parse (exclusive with `file_path`).
            file_extension: Extension of the stream; required with `file`.
            strategy: Requested strategy; FAST skips the PDF pipeline.
        Returns:
            A string on the non-PDF / FAST path; otherwise the parsed document
            as returned by the last step (not converted to `str`).
        Raises:
            ValueError: If the inputs are invalid (see `validate_input`).
            ParsingException: If anything fails inside the PDF pipeline.
        """
        file_extension = self.validate_input(
            file=file, file_path=file_path, file_extension=file_extension
        )
        if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:
            self.unstructured_parser.strategy = strategy
            parsed_document = await self.unstructured_parser.aconvert(
                file_path=file_path, file=file, file_extension=file_extension
            )
            return str(parsed_document)

        opened_file = None
        try:
            if file_path:
                opened_file = open(file_path, "rb")
                file = opened_file

            assert file is not None, "No File provided"
            pdfium_document = pdfium.PdfDocument(file)

            # Determine strategy
            pages = self.extract_page_strategies(pdfium_document)
            strategy = determine_global_strategy(
                pages, self.config.auto_config.document_threshold
            )

            # Run layout model
            assert all(p.rasterized for p in pages)
            layout_result = self.layout_model([p.rasterized for p in pages])  # type: ignore

            if strategy == StrategyEnum.HI_RES:
                logger.info("Using Doctr for text recognition")
                parsed_document = self.doctr_parser.get_text_recognition(
                    pages, layout_result
                )
            else:
                logger.info("Switching to Unstructured Parser")
                self.unstructured_parser.strategy = StrategyEnum.FAST
                parsed_document = await self.unstructured_parser.aconvert(
                    file=file, file_extension=file_extension
                )

            parsed_document.file_name = str(file_path) if file_path else None
            parsed_document.metadata = pdfium_document.get_metadata_dict()

            if self.formatters:
                for formatter in self.formatters:
                    if isinstance(parsed_document, str):
                        warnings.warn(
                            f"The last step returned a string, the {formatter.__class__} and following will not be applied",
                            stacklevel=2,
                        )
                        break
                    parsed_document = await formatter.aformat(parsed_document)
            return parsed_document
        except Exception as e:
            # Same logging and chaining as the synchronous `load`.
            logger.exception(f"Error occured while parsing {file}: {e}")
            raise ParsingException(
                f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}"
            ) from e
        finally:
            if opened_file:
                opened_file.close()
================================================
FILE: libs/megaparse/src/megaparse/models/page.py
================================================
from typing import List
from megaparse_sdk.schema.document import TextDetection
from megaparse_sdk.schema.parser_config import StrategyEnum
from PIL.Image import Image as PILImage
from pydantic import BaseModel, ConfigDict
from pypdfium2._helpers.page import PdfPage
class PageDimension(BaseModel):
"""
Width and height of a page.
"""
# MegaParse.extract_page_strategies fills both with the pdfium page size
# multiplied by the rasterization scale.
width: float
height: float
class Page(BaseModel):
"""
A single PDF page together with the intermediate results computed for it.
"""
# Parsing strategy for the page; created as AUTO and later replaced by the
# result of get_page_strategy.
strategy: StrategyEnum
# Text detection result; None until DoctrParser.get_text_detections has run.
text_detections: TextDetection | None = None
# Rendered page as a PIL image.
rasterized: PILImage | None = None
# Size of the page (see PageDimension).
page_size: PageDimension
# Position of the page within the document, as enumerated from the PDF.
page_index: int
# The underlying pypdfium2 page object.
pdfium_elements: PdfPage
# PILImage and PdfPage are not pydantic types, so arbitrary types must be allowed.
model_config = ConfigDict(arbitrary_types_allowed=True)
class GatewayDocument(BaseModel):
"""
A class to represent a Gateway MegaParse Document, which is a container of pages.
"""
# Name of the file the pages belong to.
file_name: str
# The pages of the document.
pages: List[Page]
================================================
FILE: libs/megaparse/src/megaparse/parser/__init__.py
================================================
from .base import BaseParser
__all__ = ["BaseParser"]
================================================
FILE: libs/megaparse/src/megaparse/parser/base.py
================================================
from abc import ABC, abstractmethod
from pathlib import Path
from typing import IO
from megaparse_sdk.schema.document import Document
from megaparse_sdk.schema.extensions import FileExtension
class BaseParser(ABC):
    """Common base for the parsers [Unstructured, LlamaParse, MegaParseVision]."""

    # Extensions the concrete parser accepts; subclasses override this.
    supported_extensions = []

    def check_supported_extension(
        self, file_extension: FileExtension | None, file_path: str | Path | None = None
    ):
        """
        Validate that the input's extension is supported by this parser.

        Args:
            file_extension: Extension of the input, if already known.
            file_path: Path used to derive the extension when none is given.
        Raises:
            ValueError: If neither argument is provided, or if the extension is
                not listed in `supported_extensions`.
        """
        if not (file_extension or file_path):
            raise ValueError(
                f"Either file_path or file_extension must be provided for {self.__class__.__name__}"
            )

        if not file_extension:
            # The guard above guarantees a usable file_path here.
            path = Path(file_path) if isinstance(file_path, str) else file_path
            file_extension = FileExtension(path.suffix)

        if file_extension and file_extension not in self.supported_extensions:
            raise ValueError(
                f"Unsupported file extension {file_extension.value} for {self.__class__.__name__}"
            )

    @abstractmethod
    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> Document:
        """
        Asynchronously parse the given input into a Document.

        Args:
            file_path (str | Path): Path of the file to parse.
            file (IO[bytes]): Binary stream to parse.
            file_extension (FileExtension): Extension of the input.
            **kwargs: Extra, parser-specific options.
        Returns:
            Document: The parsed document.
        Raises:
            NotImplementedError: If a subclass does not override this method.
        """
        raise NotImplementedError("Subclasses should implement this method")

    @abstractmethod
    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> Document:
        """
        Synchronously parse the given input into a Document.

        Args:
            file_path (str | Path): Path of the file to parse.
            file (IO[bytes]): Binary stream to parse.
            file_extension (FileExtension): Extension of the input.
            **kwargs: Extra, parser-specific options.
        Returns:
            Document: The parsed document.
        Raises:
            NotImplementedError: If a subclass does not override this method.
        """
        raise NotImplementedError("Subclasses should implement this method")
================================================
FILE: libs/megaparse/src/megaparse/parser/builder.py
================================================
from megaparse_sdk.schema.parser_config import ParseFileConfig
from megaparse.parser.base import BaseParser
from megaparse.parser.llama import LlamaParser
from megaparse.parser.megaparse_vision import MegaParseVision
from megaparse.parser.unstructured_parser import UnstructuredParser
# Registry mapping the `method` value of a ParseFileConfig to the parser class
# that ParserBuilder.build instantiates.
parser_dict: dict[str, type] = {
"unstructured": UnstructuredParser,
"llama_parser": LlamaParser,
"megaparse_vision": MegaParseVision,
}
class ParserBuilder:
    """Creates parser instances from a ParseFileConfig."""

    def build(self, config: ParseFileConfig) -> BaseParser:
        """
        Instantiate the parser selected by `config.method`.

        Args:
            config (ParseFileConfig): Its `method` picks the parser class from
                `parser_dict`; all dumped fields are passed to the constructor
                as keyword arguments.
        Returns:
            BaseParser: The built parser.
        Raises:
            KeyError: If `config.method` is not a key of `parser_dict`.
        """
        parser_cls = parser_dict[config.method]
        constructor_kwargs = config.model_dump()
        return parser_cls(**constructor_kwargs)
================================================
FILE: libs/megaparse/src/megaparse/parser/doctr_parser.py
================================================
import logging
import uuid
from typing import Any, Dict, List, Tuple, Type
from uuid import UUID
import numpy as np
import onnxruntime as rt
from megaparse_sdk.schema.document import (
BBOX,
Block,
BlockLayout,
BlockType,
CaptionBlock,
FooterBlock,
HeaderBlock,
ImageBlock,
ListElementBlock,
Point2D,
SubTitleBlock,
TableBlock,
TextBlock,
TextDetection,
TitleBlock,
UndefinedBlock,
)
from megaparse_sdk.schema.document import Document as MPDocument
from megaparse_sdk.schema.extensions import FileExtension
from onnxtr.io import Document
from onnxtr.models import detection_predictor, recognition_predictor
from onnxtr.models._utils import get_language
from onnxtr.models.engine import EngineConfig
from onnxtr.models.predictor.base import _OCRPredictor
from onnxtr.utils.geometry import detach_scores
from onnxtr.utils.repr import NestedObject
from megaparse.configs.auto import DeviceEnum, TextDetConfig, TextRecoConfig
from megaparse.layout_detection.output import LayoutDetectionOutput
from megaparse.models.page import Page
from megaparse.utils.onnx import get_providers
logger = logging.getLogger("megaparse")
# Maps an integer class index to the Block subclass used to represent it.
# NOTE(review): the indices appear to line up with LABEL_MAP in
# layout_detection/layout_detector.py (0 Caption ... 10 Title), with
# Footnote (1), Formula (2) and Text (9) all mapped to TextBlock -- confirm
# against the code that consumes this map.
block_cls_map: Dict[int, Type[Block]] = {
0: CaptionBlock,
1: TextBlock,
2: TextBlock,
3: ListElementBlock,
4: FooterBlock,
5: HeaderBlock,
6: ImageBlock,
7: SubTitleBlock,
8: TableBlock,
9: TextBlock,
10: TitleBlock,
}
class DoctrParser(NestedObject, _OCRPredictor):
supported_extensions = [FileExtension.PDF]
def __init__(
    self,
    text_det_config: TextDetConfig = TextDetConfig(),
    text_reco_config: TextRecoConfig = TextRecoConfig(),
    device: DeviceEnum = DeviceEnum.CPU,
    straighten_pages: bool = False,
    detect_orientation: bool = False,
    detect_language: bool = False,
    **kwargs,
):
    """
    Build the detection and recognition predictors on the requested device.

    Args:
        text_det_config: Settings of the text detection predictor.
        text_reco_config: Settings of the text recognition predictor.
        device: Device used to select the ONNX Runtime providers.
        straighten_pages: Forwarded to the base OCR predictor.
        detect_orientation: Forwarded to the base OCR predictor and stored.
        detect_language: Stored on the instance.
        **kwargs: Forwarded to the base OCR predictor.
    """
    self.device = device

    # One engine configuration shared by the base predictor and both models.
    shared_engine_cfg = EngineConfig(
        session_options=rt.SessionOptions(),
        providers=get_providers(self.device),
    )

    det_cfg = text_det_config
    _OCRPredictor.__init__(
        self,
        det_cfg.assume_straight_pages,
        straighten_pages,
        det_cfg.preserve_aspect_ratio,
        det_cfg.symmetric_pad,
        detect_orientation,
        clf_engine_cfg=shared_engine_cfg,
        **kwargs,
    )

    self.det_predictor = detection_predictor(
        arch=det_cfg.det_arch,
        assume_straight_pages=det_cfg.assume_straight_pages,
        preserve_aspect_ratio=det_cfg.preserve_aspect_ratio,
        symmetric_pad=det_cfg.symmetric_pad,
        batch_size=det_cfg.batch_size,
        load_in_8_bit=det_cfg.load_in_8_bit,
        engine_cfg=shared_engine_cfg,
    )

    # NOTE(review): the recognition predictor takes `load_in_8_bit` from the
    # *detection* config -- confirm whether TextRecoConfig was meant instead.
    self.reco_predictor = recognition_predictor(
        arch=text_reco_config.reco_arch,
        batch_size=text_reco_config.batch_size,
        load_in_8_bit=det_cfg.load_in_8_bit,
        engine_cfg=shared_engine_cfg,
    )

    self.detect_orientation = detect_orientation
    self.detect_language = detect_language
def get_text_detections(self, pages: list[Page], **kwargs) -> List[Page]:
    """
    Run text detection on every page and store the result on the page.

    Args:
        pages: Pages whose `rasterized` image is analysed.
        **kwargs: Forwarded to the detection predictor.
    Returns:
        The same `Page` objects, each with `text_detections` set.
    Raises:
        ValueError: If any rasterized page is not a 3-D array once converted.
    """
    rasterized_pages = [np.array(page.rasterized) for page in pages]

    # Dimension check
    if any(page.ndim != 3 for page in rasterized_pages):
        raise ValueError(
            "incorrect input shape: all pages are expected to be multi-channel 2D images."
        )

    origin_page_shapes = [page.shape[:2] for page in rasterized_pages]

    # Localize text elements
    loc_preds, out_maps = self.det_predictor(
        rasterized_pages, return_maps=True, **kwargs
    )

    # Binarised segmentation maps, used for orientation detection and
    # page straightening below.
    seg_maps = [
        np.where(
            out_map > self.det_predictor.model.postprocessor.bin_thresh,
            255,
            0,
        ).astype(np.uint8)
        for out_map in out_maps
    ]

    if self.detect_orientation:
        general_pages_orientations, origin_pages_orientations = (
            self._get_orientations(rasterized_pages, seg_maps)
        )
        orientations = [
            {"value": orientation_page, "confidence": None}
            for orientation_page in origin_pages_orientations
        ]
    else:
        orientations = None
        general_pages_orientations = None
        origin_pages_orientations = None

    if self.straighten_pages:
        rasterized_pages = self._straighten_pages(
            rasterized_pages,
            seg_maps,
            general_pages_orientations,
            origin_pages_orientations,
        )
        # update page shapes after straightening
        origin_page_shapes = [page.shape[:2] for page in rasterized_pages]

        # Forward again to get predictions on the straightened images. The
        # detector must receive the image arrays, not the `Page` objects that
        # were passed here before.
        loc_preds = self.det_predictor(rasterized_pages, **kwargs)  # type: ignore[assignment]

    # Detach objectness scores from loc_preds
    loc_preds, objectness_scores = detach_scores(loc_preds)  # type: ignore[arg-type]

    # Apply hooks to loc_preds if any
    for hook in self.hooks:
        loc_preds = hook(loc_preds)

    for page_index, (rast_page, loc_pred, objectness_score, page) in enumerate(
        zip(rasterized_pages, loc_preds, objectness_scores, pages, strict=True)
    ):
        # Each prediction row is split into two 2-value points.
        block_layouts = [
            BlockLayout(
                bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()),
                objectness_score=score,
                block_type=BlockType.TEXT,
            )
            for bbox, score in zip(loc_pred, objectness_score, strict=True)
        ]
        page.text_detections = TextDetection(
            bboxes=block_layouts,
            page_index=page_index,
            dimensions=rast_page.shape[:2],
            orientation=orientations[page_index] if orientations is not None else 0,
            origin_page_shape=origin_page_shapes[page_index],
        )
    return pages
def get_text_recognition(
    self, pages: List[Page], layout: List[List[LayoutDetectionOutput]], **kwargs
) -> MPDocument:
    """Recognize the text inside previously detected boxes and build the document.

    Args:
        pages: Pages whose ``text_detections`` were already computed. The
            rasterized image and the detection results are read from each page.
        layout: Per-page layout detections, forwarded to the final conversion
            step that assigns a block class to every recognized line.
        **kwargs: Forwarded to the recognition predictor.

    Returns:
        The ``MPDocument`` built from the recognized text.

    Raises:
        AssertionError: If ``pages`` is empty or any page has no text detections.
    """
    # Every page is dereferenced in the loop below, so the precondition must
    # hold for all pages, not just one. The `pages and` part keeps the
    # historical behavior of rejecting an empty page list.
    assert pages and all(
        page.text_detections is not None for page in pages
    ), "Text detections should be computed before running text recognition"

    rasterized_pages = []
    loc_preds = []
    objectness_scores = []
    orientations = []
    origin_page_shapes = []
    for page in pages:
        page_loc_pred = page.text_detections.get_loc_preds()  # type: ignore
        if page_loc_pred.shape[0] == 0:
            # Normalize "no detection" to an empty (0, 4) array
            page_loc_pred = np.zeros((0, 4))
        rasterized_pages.append(np.array(page.rasterized))
        loc_preds.append(page_loc_pred)  # type: ignore
        objectness_scores.append(page.text_detections.get_objectness_scores())  # type: ignore
        orientations.append(page.text_detections.get_orientations())  # type: ignore
        origin_page_shapes.append(page.text_detections.get_origin_page_shapes())  # type: ignore

    # Crop images
    crops, loc_preds = self._prepare_crops(
        rasterized_pages,
        loc_preds,  # type: ignore[arg-type]
        channels_last=True,
        assume_straight_pages=self.assume_straight_pages,
        assume_horizontal=self._page_orientation_disabled,
    )

    # Rectify crop orientation and get crop orientation predictions
    crop_orientations: Any = []
    if not self.assume_straight_pages:
        crops, loc_preds, _crop_orientations = self._rectify_crops(crops, loc_preds)
        crop_orientations = [
            {"value": orientation[0], "confidence": orientation[1]}
            for orientation in _crop_orientations
        ]

    # Identify character sequences (crops of all pages are flattened into one batch)
    word_preds = self.reco_predictor(
        [crop for page_crops in crops for crop in page_crops], **kwargs
    )
    if not crop_orientations:
        # No rectification happened: report a neutral orientation per word
        crop_orientations = [{"value": 0, "confidence": None} for _ in word_preds]

    boxes, text_preds, crop_orientations = self._process_predictions(
        loc_preds, word_preds, crop_orientations
    )

    if self.detect_language:
        languages = [
            get_language(" ".join([item[0] for item in text_pred]))
            for text_pred in text_preds
        ]
        languages_dict = [
            {"value": lang[0], "confidence": lang[1]} for lang in languages
        ]
    else:
        languages_dict = None

    # FIXME : Not good return type we want :(
    out = self.doc_builder(
        rasterized_pages,
        boxes,
        objectness_scores,
        text_preds,
        origin_page_shapes,
        crop_orientations,
        orientations,
        languages_dict,
    )
    return self.__to_elements_list(out, layout)
def _get_block_cls(
    self,
    coordinates: tuple[float, float, float, float],
    layout: List[LayoutDetectionOutput],
    threshold: float = 0.6,
) -> Tuple[UUID | None, Type[Block]]:
    """Find the layout detection that contains a text line and return its class.

    The first detection (in ``layout`` order) for which the share of the line
    box covered by the detection box exceeds ``threshold`` wins.

    Args:
        coordinates: Line box as ``(x1, y1, x2, y2)``.
        layout: Layout detections of the page the line belongs to.
        threshold: Minimum ``intersection_area / line_area`` ratio for a match.

    Returns:
        ``(det.bbox_id, block_cls_map[det.label])`` for the first matching
        detection, otherwise a freshly generated id with ``UndefinedBlock``.

    Raises:
        AssertionError: If either box has inverted corners.
    """
    x1, y1, x2, y2 = coordinates
    detection_area = max(0, x2 - x1) * max(0, y2 - y1)

    for det in layout:
        X1, Y1, X2, Y2 = det.bbox.to_numpy()
        assert x1 <= x2 and y1 <= y2, "bbox1 coordinates are invalid"
        assert X1 <= X2 and Y1 <= Y2, "bbox2 coordinates are invalid"

        if detection_area <= 0:
            # A zero-area line box has no meaningful coverage ratio; skipping
            # it avoids a division by zero and falls through to UndefinedBlock.
            continue

        # Intersection rectangle of the line box and the detection box
        inter_width = max(0, min(x2, X2) - max(x1, X1))
        inter_height = max(0, min(y2, Y2) - max(y1, Y1))
        inter_area = inter_width * inter_height

        if inter_area / detection_area > threshold:
            return (det.bbox_id, block_cls_map[det.label])

    return (uuid.uuid4(), UndefinedBlock)
def __to_elements_list(
    self, doctr_document: Document, layouts: List[List[LayoutDetectionOutput]]
) -> MPDocument:
    """Convert the OCR document into an ``MPDocument`` using the page layouts.

    For each page, every recognized line is assigned to a layout detection via
    ``_get_block_cls``. Lines sharing the same block id are merged into one
    block (text joined with newlines, bounding boxes united). Detections with
    label 6 or 8 are added as text-less blocks. The blocks of a page are then
    ordered by the ``y`` of their top-left corner.

    Raises:
        ValueError: If an OCR block holds both lines and artefacts, or (from
            ``zip(strict=True)``) if the number of pages and layouts differ.
    """
    all_blocks = []

    for page_idx, (ocr_page, page_layout) in enumerate(
        zip(doctr_document.pages, layouts, strict=True)
    ):
        blocks_by_id = {}

        for ocr_block in ocr_page.blocks:
            if len(ocr_block.lines) and len(ocr_block.artefacts) > 0:
                raise ValueError(
                    "Block should not contain both lines and artefacts"
                )

            for text_line in ocr_block.lines:
                # Bounding box of the line = extent of all its word boxes
                word_boxes = [word.geometry for word in text_line.words]
                x0 = min(box[0][0] for box in word_boxes)
                y0 = min(box[0][1] for box in word_boxes)
                x1 = max(box[1][0] for box in word_boxes)
                y1 = max(box[1][1] for box in word_boxes)

                block_id, block_cls = self._get_block_cls(
                    coordinates=(x0, y0, x1, y1), layout=page_layout
                )

                if block_id not in blocks_by_id:
                    # First line of this block: only text-like classes are kept
                    if issubclass(block_cls, TextBlock):
                        blocks_by_id[block_id] = block_cls(
                            text=text_line.render(),
                            bbox=BBOX(
                                top_left=Point2D(x=x0, y=y0),
                                bottom_right=Point2D(x=x1, y=y1),
                            ),
                            metadata={},
                            page_range=(page_idx, page_idx),
                        )
                    continue

                # Block already started: append the line and grow its box
                existing = blocks_by_id[block_id]
                bbx0, bby0, bbx1, bby1 = existing.bbox.to_numpy()
                existing.text += "\n" + text_line.render()
                existing.bbox = BBOX(
                    top_left=Point2D(x=min(x0, bbx0), y=min(y0, bby0)),
                    bottom_right=Point2D(x=max(x1, bbx1), y=max(y1, bby1)),
                )

        # We add the Image Blocks to the MPDocument with the right order
        for det in page_layout:
            if det.label not in (6, 8):
                continue
            x0, y0, x1, y1 = det.bbox.to_numpy()
            image_cls = block_cls_map[det.label]
            blocks_by_id[uuid.uuid4()] = image_cls(
                bbox=BBOX(
                    top_left=Point2D(x=x0, y=y0),
                    bottom_right=Point2D(x=x1, y=y1),
                ),
                metadata={},
                page_range=(page_idx, page_idx),
            )

        # Top-to-bottom order within the page (stable for equal y)
        page_blocks = list(blocks_by_id.values())
        page_blocks.sort(key=lambda blk: blk.bbox.top_left.y)
        all_blocks.extend(page_blocks)

    return MPDocument(
        metadata={},
        content=all_blocks,
        detection_origin="doctr",
    )
================================================
FILE: libs/megaparse/src/megaparse/parser/entity.py
================================================
from enum import Enum
from typing import List, Optional
class TagEnum(str, Enum):
    """Tag names that can label an element of a parsed file.

    Members are plain strings (``str`` mixin), so they compare equal to their
    literal value.
    """

    # Each member's value is identical to its name.
    TABLE = "TABLE"
    TOC = "TOC"
    HEADER = "HEADER"
    IMAGE = "IMAGE"
class SupportedModel(Enum):
    """Model families accepted by the parser, with their allowed release suffixes.

    Each member's value is ``(model_name, supported_releases)``. A
    ``supported_releases`` of ``None`` means any suffix (or none) is accepted.
    """

    GPT_4O = ("gpt-4o", None)
    GPT_4O_TURBO = ("gpt-4o-turbo", None)
    CLAUDE_3_5_SONNET = ("claude-3-5-sonnet", ["latest", "20241022"])
    CLAUDE_3_OPUS = ("claude-3-opus", ["latest", "20240229"])

    def __init__(self, model_name: str, supported_releases: Optional[List[str]]):
        # Enum unpacks the member's tuple value into these two arguments
        self.model_name = model_name
        self.supported_releases = supported_releases

    @classmethod
    def is_supported(cls, model_name: str) -> bool:
        """Tell whether ``model_name`` denotes a supported model and release.

        Only the first member (in declaration order) whose name is a prefix of
        ``model_name`` is considered. What follows the prefix, minus leading
        dashes, is the release: it must be listed in the member's
        ``supported_releases`` unless that list is ``None``.
        """
        family = next(
            (member for member in cls if model_name.startswith(member.model_name)),
            None,
        )
        if family is None:
            return False
        if family.supported_releases is None:
            return True

        suffix = model_name[len(family.model_name) :].lstrip("-")
        # A restricted family requires an explicit, listed release
        return bool(suffix) and suffix in family.supported_releases
================================================
FILE: libs/megaparse/src/megaparse/parser/llama.py
================================================
from pathlib import Path
from typing import IO, List
from llama_index.core.schema import Document as LlamaDocument
from llama_parse import LlamaParse as _LlamaParse
from llama_parse.utils import Language, ResultType
from megaparse_sdk.schema.document import BBOX, Point2D, TextBlock
from megaparse_sdk.schema.document import Document as MPDocument
from megaparse_sdk.schema.extensions import FileExtension
from megaparse.parser import BaseParser
class LlamaParser(BaseParser):
    """Parser that sends a PDF to LlamaParse and wraps the result in an ``MPDocument``."""

    supported_extensions = [FileExtension.PDF]

    def __init__(
        self,
        api_key: str,
        verbose=True,
        language: Language = Language.FRENCH,
        parsing_instruction: str | None = None,
        **kwargs,
    ) -> None:
        """Store the LlamaParse settings; extra keyword arguments are ignored.

        A missing or empty ``parsing_instruction`` is replaced by a default one.
        """
        self.api_key = api_key
        self.verbose = verbose
        self.language = language
        self.parsing_instruction = (
            parsing_instruction
            or """Do not take into account the page breaks (no --- between pages),
do not repeat the header and the footer so the tables are merged if needed. Keep the same format for similar tables."""
        )

    def _build_client(self, result_type: ResultType) -> _LlamaParse:
        """Create a LlamaParse client configured from this parser's settings."""
        return _LlamaParse(
            api_key=self.api_key,
            result_type=result_type,
            gpt4o_mode=True,
            verbose=self.verbose,
            language=self.language,
            parsing_instruction=self.parsing_instruction,
        )

    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: None | FileExtension = None,
        **kwargs,
    ) -> MPDocument:
        """Asynchronously parse the file at ``file_path`` (``file`` is not used).

        Raises:
            ValueError: If ``file_path`` is missing.
        """
        if not file_path:
            raise ValueError("File_path should be provided to run LlamaParser")
        self.check_supported_extension(file_extension, file_path)

        client = self._build_client(ResultType.MD)
        parsed_docs: List[LlamaDocument] = await client.aload_data(str(file_path))
        return self.__to_elements_list__(parsed_docs)

    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: None | FileExtension = None,
        **kwargs,
    ) -> MPDocument:
        """Synchronously parse the file at ``file_path`` (``file`` is not used).

        Raises:
            ValueError: If ``file_path`` is missing.
        """
        if not file_path:
            raise ValueError("File_path should be provided to run LlamaParser")
        self.check_supported_extension(file_extension, file_path)

        # NOTE(review): the sync path requests JSON while `aconvert` requests
        # Markdown — confirm this difference is intended.
        client = self._build_client(ResultType.JSON)
        parsed_docs: List[LlamaDocument] = client.load_data(str(file_path))
        return self.__to_elements_list__(parsed_docs)

    def __to_elements_list__(self, llama_doc: List[LlamaDocument]) -> MPDocument:
        """Wrap each returned document in a full-page ``TextBlock``.

        Document ``i`` gets ``page_range=(i, i + 1)`` and a box spanning
        (0, 0) to (1, 1).
        """
        blocks = [
            TextBlock(
                text=doc.text,
                metadata={},
                page_range=(index, index + 1),
                bbox=BBOX(
                    top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)
                ),
            )
            for index, doc in enumerate(llama_doc)
        ]
        return MPDocument(
            metadata={},
            detection_origin="llamaparse",
            content=blocks,
        )
================================================
FILE: libs/megaparse/src/megaparse/parser/megaparse_vision.py
================================================
import asyncio
import base64
import re
from io import BytesIO
from pathlib import Path
from typing import IO, List
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage
from megaparse_sdk.schema.document import BBOX, Block, Point2D, TextBlock
from megaparse_sdk.schema.document import Document as MPDocument
from megaparse_sdk.schema.extensions import FileExtension
from pdf2image import convert_from_path
from megaparse.parser import BaseParser
from megaparse.parser.entity import SupportedModel, TagEnum
# BASE_OCR_PROMPT = """
# Transcribe the content of this file into markdown. Be mindful of the formatting.
# Add formatting if you think it is not clear.
# Do not include page breaks and merge content of tables if it is continued in the next page.
# Add tags around what you identify as a table [TABLE], header - complete chain of characters that are repeated at each start of pages - [HEADER], table of content [TOC] in the format '[tag] ... [/tag]'
# Return only the parsed content.
# """
# Active prompt: sent as the text part of every multimodal message built by
# MegaParseVision (send_to_mlm / asend_to_mlm), ahead of the page images.
# The [TABLE] / [HEADER] / [TOC] tags it asks for are stripped again by
# MegaParseVision.get_cleaned_content. This is runtime text: any edit changes
# what the model is asked to do.
BASE_OCR_PROMPT = """
You are tasked with transcribing and formatting the content of a file into markdown. Your goal is to create a well-structured, readable markdown document that accurately represents the original content while adding appropriate formatting and tags.
Follow these instructions to complete the task:
1. Carefully read through the entire file content.
2. Transcribe the content into markdown format, paying close attention to the existing formatting and structure.
3. If you encounter any unclear formatting in the original content, use your judgment to add appropriate markdown formatting to improve readability and structure.
4. For tables, headers, and table of contents, add the following tags:
- Tables: Enclose the entire table in [TABLE] and [/TABLE] tags. Merge content of tables if it is continued in the next page.
- Headers (complete chain of characters repeated at the start of each page): Enclose in [HEADER] and [/HEADER] tags inside the markdown file.
- Table of contents: Enclose in [TOC] and [/TOC] tags
5. When transcribing tables:
- If a table continues across multiple pages, merge the content into a single, cohesive table.
- Use proper markdown table formatting with pipes (|) and hyphens (-) for table structure.
6. Do not include page breaks in your transcription.
7. Maintain the logical flow and structure of the document, ensuring that sections and subsections are properly formatted using markdown headers (# for main headers, ## for subheaders, etc.).
8. Use appropriate markdown syntax for other formatting elements such as bold, italic, lists, and code blocks as needed.
10. Return only the parsed content in markdown format, including the specified tags for tables, headers, and table of contents.
"""
class MegaParseVision(BaseParser):
    """PDF parser that transcribes rendered page images with a vision chat model.

    Pages are rendered to images, sent to the model in batches together with
    ``BASE_OCR_PROMPT``, and the answers are joined, cleaned of their tags and
    returned as one ``TextBlock`` spanning the whole document.
    """

    supported_extensions = [FileExtension.PDF]

    def __init__(self, model: BaseChatModel, **kwargs):
        """Store the chat model after checking its name when it exposes one.

        Raises:
            ValueError: If ``model.model_name`` exists and is not accepted by
                ``SupportedModel.is_supported``.
        """
        if hasattr(model, "model_name"):
            if not SupportedModel.is_supported(model.model_name):
                raise ValueError(
                    f"Invald model name, MegaParse vision only supports model that have vision capabilities. "
                    f"{model.model_name} is not supported."
                )
        self.model = model
        # Raw model answers of the last conversion, one per batch
        self.parsed_chunks: list[str] | None = None

    def process_file(self, file_path: str, image_format: str = "PNG") -> List[str]:
        """
        Process a PDF file and convert its pages to base64 encoded images.

        :param file_path: Path to the PDF file
        :param image_format: Format to save the images (default: PNG)
        :return: List of base64 encoded images
        :raises ValueError: If rendering or encoding fails (original error chained)
        """
        try:
            images = convert_from_path(file_path)
            images_base64 = []
            for image in images:
                buffered = BytesIO()
                image.save(buffered, format=image_format)
                image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
                images_base64.append(image_base64)
            return images_base64
        except Exception as e:
            # Chain the cause so the underlying failure stays visible
            raise ValueError(f"Error processing PDF file: {str(e)}") from e

    def get_element(self, tag: TagEnum, chunk: str):
        """Return the stripped contents of every ``[tag]...[/tag]`` span in ``chunk``.

        Prints a notice and returns an empty list when the tag does not occur.
        """
        pattern = rf"\[{tag.value}\]([\s\S]*?)\[/{tag.value}\]"
        all_elmts = re.findall(pattern, chunk)
        if not all_elmts:
            print(f"No {tag.value} found in the chunk")
            return []
        return [elmt.strip() for elmt in all_elmts]

    @staticmethod
    def _guess_mime_type(image_base64: str) -> str:
        """Infer the image MIME type from the start of its base64 payload.

        The prefixes are the base64 encodings of the formats' magic bytes.
        Unknown payloads keep the historical ``image/jpeg`` label.
        """
        known_prefixes = (
            ("iVBORw0KGg", "image/png"),  # \x89PNG\r\n\x1a\n
            ("/9j/", "image/jpeg"),  # \xff\xd8\xff
            ("R0lGOD", "image/gif"),  # GIF8
            ("UklGR", "image/webp"),  # RIFF
        )
        for prefix, mime_type in known_prefixes:
            if image_base64.startswith(prefix):
                return mime_type
        return "image/jpeg"

    def _build_message(self, images_data: List[str]) -> HumanMessage:
        """Build the prompt message: the OCR instructions followed by the images.

        Each image is embedded as a data URL whose media type matches the
        actual encoding. ``process_file`` produces PNG by default, so a fixed
        ``image/jpeg`` label would misdescribe the payload.
        """
        images_prompt = [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{self._guess_mime_type(image_data)};base64,{image_data}"
                },
            }
            for image_data in images_data
        ]
        return HumanMessage(
            content=[
                {"type": "text", "text": BASE_OCR_PROMPT},
                *images_prompt,
            ],
        )

    async def asend_to_mlm(self, images_data: List[str]) -> str:
        """
        Send images to the language model for processing.

        :param images_data: List of base64 encoded images
        :return: Processed content as a string
        """
        response = await self.model.ainvoke([self._build_message(images_data)])
        return str(response.content)

    def send_to_mlm(self, images_data: List[str]) -> str:
        """
        Send images to the language model for processing.

        :param images_data: List of base64 encoded images
        :return: Processed content as a string
        """
        response = self.model.invoke([self._build_message(images_data)])
        return str(response.content)

    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        batch_size: int = 3,
        **kwargs,
    ) -> MPDocument:
        """
        Parse a PDF file and process its content using the language model.

        :param file_path: Path to the PDF file (required; ``file`` is not used)
        :param batch_size: Number of pages sent to the model per request;
            all requests are awaited concurrently
        :return: Document holding the cleaned transcription
        :raises ValueError: If ``file_path`` is missing
        """
        if not file_path:
            raise ValueError("File_path should be provided to run MegaParseVision")

        if isinstance(file_path, Path):
            file_path = str(file_path)

        self.check_supported_extension(file_extension, file_path)

        pdf_base64 = self.process_file(file_path)
        n_pages = len(pdf_base64)
        tasks = [
            self.asend_to_mlm(pdf_base64[i : i + batch_size])
            for i in range(0, len(pdf_base64), batch_size)
        ]
        self.parsed_chunks = await asyncio.gather(*tasks)
        responses = self.get_cleaned_content("\n".join(self.parsed_chunks))
        return self.__to_elements_list__(responses, n_pages=n_pages)

    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        batch_size: int = 3,
        **kwargs,
    ) -> MPDocument:
        """
        Parse a PDF file and process its content using the language model.

        :param file_path: Path to the PDF file (required; ``file`` is not used)
        :param batch_size: Number of pages to process at a time
        :return: Document holding the cleaned transcription
        :raises ValueError: If ``file_path`` is missing
        """
        if not file_path:
            raise ValueError("File_path should be provided to run MegaParseVision")

        if isinstance(file_path, Path):
            file_path = str(file_path)

        self.check_supported_extension(file_extension, file_path)

        pdf_base64 = self.process_file(file_path)
        n_pages = len(pdf_base64)
        chunks = [
            pdf_base64[i : i + batch_size]
            for i in range(0, len(pdf_base64), batch_size)
        ]
        self.parsed_chunks = []
        for chunk in chunks:
            response = self.send_to_mlm(chunk)
            self.parsed_chunks.append(response)
        responses = self.get_cleaned_content("\n".join(self.parsed_chunks))
        return self.__to_elements_list__(responses, n_pages)

    def get_cleaned_content(self, parsed_file: str) -> str:
        """
        Get cleaned parsed file without any tags defined in TagEnum.

        This method removes all tags from TagEnum from the parsed file, formats the content,
        and handles the HEADER tag specially by keeping only the first occurrence.

        Args:
            parsed_file (str): The parsed file content with tags.

        Returns:
            str: The cleaned content without TagEnum tags.
        """
        tag_pattern = "|".join(map(re.escape, TagEnum.__members__.values()))
        tag_regex = rf"\[({tag_pattern})\](.*?)\[/\1\]"

        # handle the HEADER tag specially
        header_pattern = rf"\[{TagEnum.HEADER.value}\](.*?)\[/{TagEnum.HEADER.value}\]"
        headers = re.findall(header_pattern, parsed_file, re.DOTALL)
        if headers:
            first_header = headers[0].strip()
            # Remove all HEADER tags and their content
            parsed_file = re.sub(header_pattern, "", parsed_file, flags=re.DOTALL)
            # Add the first header back at the beginning
            parsed_file = f"{first_header}\n{parsed_file}"

        # Remove all other tags, keeping what they enclose
        def remove_tag(match):
            return match.group(2)

        cleaned_content = re.sub(tag_regex, remove_tag, parsed_file, flags=re.DOTALL)
        # Drop markdown code-fence lines
        cleaned_content = re.sub(r"^```.*$\n?", "", cleaned_content, flags=re.MULTILINE)
        # Collapse blank-line runs to a single blank line
        cleaned_content = re.sub(r"\n\s*\n", "\n\n", cleaned_content)
        # Re-join table rows that were separated by a blank line
        cleaned_content = cleaned_content.replace("|\n\n|", "|\n|")
        cleaned_content = cleaned_content.strip()
        return cleaned_content

    def __to_elements_list__(self, mpv_doc: str, n_pages: int) -> MPDocument:
        """Wrap the whole transcription in a single ``TextBlock`` covering pages
        ``0`` to ``n_pages - 1`` with a box spanning (0, 0) to (1, 1)."""
        list_blocks: List[Block] = [
            TextBlock(
                text=mpv_doc,
                metadata={},
                page_range=(0, n_pages - 1),
                bbox=BBOX(top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)),
            )
        ]
        return MPDocument(
            metadata={},
            detection_origin="megaparse_vision",
            content=list_blocks,
        )
================================================
FILE: libs/megaparse/src/megaparse/parser/unstructured_parser.py
================================================
import warnings
from pathlib import Path
from typing import IO, Dict, List
from dotenv import load_dotenv
from langchain_core.language_models.chat_models import BaseChatModel
from megaparse_sdk.schema.document import (
BBOX,
Block,
FooterBlock,
HeaderBlock,
ImageBlock,
Point2D,
SubTitleBlock,
TableBlock,
TextBlock,
TitleBlock,
)
from megaparse_sdk.schema.document import (
Document as MPDocument,
)
from megaparse_sdk.schema.extensions import FileExtension
from megaparse_sdk.schema.parser_config import StrategyEnum
from unstructured.documents.elements import Element
from unstructured.partition.auto import partition
from megaparse.parser import BaseParser
# Import-time side effect: load variables from a local .env file (if any)
# into the process environment before the parser is used.
load_dotenv()
class UnstructuredParser(BaseParser):
    """Parser that partitions a file with ``unstructured`` and maps the
    resulting elements to MegaParse blocks."""

    supported_extensions = [
        FileExtension.PDF,
        FileExtension.DOCX,
        FileExtension.TXT,
        FileExtension.OTF,
        FileExtension.EPUB,
        FileExtension.HTML,
        FileExtension.XML,
        FileExtension.CSV,
        FileExtension.XLSX,
        FileExtension.XLS,
        FileExtension.PPTX,
        FileExtension.MD,
        FileExtension.MARKDOWN,
    ]

    def __init__(
        self, strategy=StrategyEnum.AUTO, model: BaseChatModel | None = None, **kwargs
    ):
        """Store the partition strategy and an optional chat model.

        Extra keyword arguments are accepted and ignored.
        """
        self.strategy = strategy
        self.model = model

    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> MPDocument:
        """Partition the input (path or binary stream) and return it as an ``MPDocument``."""
        self.check_supported_extension(file_extension, file_path)
        # Partition the input into unstructured elements
        elements = partition(
            filename=str(file_path) if file_path else None,
            file=file,
            strategy=self.strategy,
            content_type=file_extension.mimetype if file_extension else None,
        )
        return self.__to_mp_document(elements)

    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> MPDocument:
        """Async-compatible wrapper: warns, then runs the synchronous ``convert``."""
        self.check_supported_extension(file_extension, file_path)
        warnings.warn(
            "The UnstructuredParser is a sync parser, please use the sync convert method",
            UserWarning,
            stacklevel=2,
        )
        return self.convert(file_path, file, file_extension, **kwargs)

    def __to_mp_document(self, elements: List[Element]) -> MPDocument:
        """Convert the elements to blocks, dropping those with no block mapping."""
        text_blocks = []
        for element in elements:
            block = self.__convert_element_to_block(element)
            if block:
                text_blocks.append(block)
        return MPDocument(
            content=text_blocks, metadata={}, detection_origin="unstructured"
        )

    @staticmethod
    def _bbox_from_metadata(metadata) -> BBOX | None:
        """Return the axis-aligned box enclosing the element's corner points.

        Returns ``None`` when the element carries no coordinates.
        """
        coordinates = metadata.coordinates
        if not coordinates or not coordinates.points:
            return None
        # Use the extrema of all corners instead of fixed indices: unstructured
        # lists corners as top-left, bottom-left, bottom-right, top-right, so
        # points[3] is the top-right corner, not the bottom-right one.
        xs = [point[0] for point in coordinates.points]
        ys = [point[1] for point in coordinates.points]
        return BBOX(
            top_left=Point2D(x=min(xs), y=min(ys)),
            bottom_right=Point2D(x=max(xs), y=max(ys)),
        )

    def __convert_element_to_block(self, element: Element) -> Block | None:
        """Map one unstructured element to its MegaParse block.

        Only the block matching ``element.category`` is built. Categories
        without a mapping yield ``None``.
        """
        block_classes: Dict[str, type[Block]] = {
            "Title": TitleBlock,
            "Subtitle": SubTitleBlock,
            "Header": HeaderBlock,
            "Footer": FooterBlock,
            "NarrativeText": TextBlock,
            # FIXME: @chloedia, list item need to be handled differently in ListBlock
            "ListItem": TextBlock,
            "Table": TableBlock,
            "Image": ImageBlock,
            "Formula": TextBlock,
            "FigureCaption": TextBlock,
            "Address": TextBlock,
            "EmailAddress": TextBlock,
            "CodeSnippet": TextBlock,
            "UncategorizedText": TextBlock,
        }
        block_cls = block_classes.get(element.category)
        if block_cls is None:
            return None

        metadata = element.metadata
        page_range = (
            (metadata.page_number, metadata.page_number)
            if metadata.page_number
            else None
        )

        extra_fields = {}
        if block_cls is SubTitleBlock:
            # Subtitles additionally carry their nesting depth (0 when unknown)
            category_depth = metadata.category_depth
            extra_fields["depth"] = category_depth if category_depth else 0

        return block_cls(
            text=element.text,
            metadata={},
            page_range=page_range,
            bbox=self._bbox_from_metadata(metadata),
            **extra_fields,
        )
================================================
FILE: libs/megaparse/src/megaparse/predictor/layout_predictor.py
================================================
from PIL import Image
from unstructured_inference.inference.layout import PageLayout
from unstructured_inference.models.base import get_model
from unstructured_inference.visualize import draw_bbox
def extract_layout(
    page_number: int,
    page_image: Image.Image,
    model_name: str = "yolox",
    show: bool = True,
) -> PageLayout:
    """Detect the layout of one page image.

    Args:
        page_number: Page number passed on to ``PageLayout.from_image``.
        page_image: Image of the page.
        model_name: Name of the detection model loaded with ``get_model``.
        show: When true (the default, matching the historical behavior), draw
            every detected element in red and open the annotated image in a
            viewer. Pass ``False`` for batch or headless use.

    Returns:
        The ``PageLayout`` holding the detected elements.
    """
    layout_model = get_model(model_name)
    parsed_page = PageLayout.from_image(
        image=page_image,
        number=page_number,
        detection_model=layout_model,
        element_extraction_model=None,
        fixed_layout=None,
    )

    if show:
        # Debug visualization only: the annotated image is never returned
        for el in parsed_page.elements:
            page_image = draw_bbox(page_image, el, color="red", details=False)
        page_image.show()

    return parsed_page
================================================
FILE: libs/megaparse/src/megaparse/utils/extract_metadata.py
================================================
from typing import Any, Dict
import pypdfium2 as pdfium
def get_doc_metdata(pdfium_document: pdfium.PdfDocument) -> Dict[str, Any]:
    """Placeholder for document metadata extraction.

    Not implemented yet: the argument is ignored and ``None`` is returned,
    despite the ``Dict[str, Any]`` annotation.
    """
    return None
================================================
FILE: libs/megaparse/src/megaparse/utils/onnx.py
================================================
import logging
from typing import List
import onnxruntime as rt
from megaparse.configs.auto import DeviceEnum
# Module logger, registered under the "megaparse" logger name.
logger = logging.getLogger("megaparse")
def get_providers(device: DeviceEnum) -> List[str]:
    """Return the onnxruntime execution provider list for ``device``.

    The providers reported by onnxruntime are logged first. CUDA and CoreML
    are returned only if onnxruntime lists them; CPU is returned without a
    check.

    Raises:
        ValueError: If the requested accelerated provider is unavailable, or
            if ``device`` is none of CUDA, CoreML, CPU.
    """
    available = rt.get_available_providers()
    logger.info("Available providers: %s", available)

    def require(provider: str) -> List[str]:
        # Accelerated providers must be present in this onnxruntime build
        if provider in available:
            return [provider]
        raise ValueError(
            f"onnxruntime can't find {provider} in list of available providers"
        )

    if device == DeviceEnum.CUDA:
        return require("CUDAExecutionProvider")
    if device == DeviceEnum.COREML:
        return require("CoreMLExecutionProvider")
    if device == DeviceEnum.CPU:
        return ["CPUExecutionProvider"]
    raise ValueError("device not in (CUDA,CoreML,CPU)")
================================================
FILE: libs/megaparse/src/megaparse/utils/strategy.py
================================================
from typing import List
import numpy as np
from megaparse.models.page import Page
from megaparse_sdk.schema.document import TextDetection
from megaparse_sdk.schema.parser_config import StrategyEnum
from pypdfium2._helpers.page import PdfPage
def get_page_strategy(
    pdfium_page: PdfPage, onnxtr_page: TextDetection | None, threshold: float
) -> StrategyEnum:
    """Pick the parsing strategy of a page from how well two text maps agree.

    One mask is painted from the boxes of the page objects of type 1 reported
    by pdfium, the other from the detector's boxes in ``onnxtr_page``. Their
    intersection-over-union is compared with ``threshold``.

    Returns:
        ``StrategyEnum.FAST`` when there is no detection result or when the
        IoU reaches ``threshold``; ``StrategyEnum.HI_RES`` when the IoU is
        below it (an IoU of 0 is used when both masks are empty).
    """
    if onnxtr_page is None:
        return StrategyEnum.FAST

    def clamp(value: int, upper: int) -> int:
        # Keep an index inside [0, upper]
        return max(0, min(upper, value))

    # Collect the boxes of type-1 page objects. In pdfium that is the text
    # object type, so these are embedded text boxes (not images).
    embedded_boxes = [
        obj.get_pos()
        for obj in pdfium_page.get_objects()
        if obj.type == 1  # type: ignore
    ]

    p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height())
    canvas_shape = (int(p_height), int(p_width))

    # Mask of embedded text. pdfium boxes are (left, bottom, right, top) with
    # the origin at the bottom, so rows are flipped: row = p_height - y.
    pdfium_mask = np.zeros(canvas_shape)
    for coords in embedded_boxes:
        row_start = clamp(int(p_height - coords[3]), p_height)
        col_start = clamp(int(coords[0]), p_width)
        row_end = clamp(int(p_height - coords[1]), p_height)
        col_end = clamp(int(coords[2]), p_width)
        pdfium_mask[row_start:row_end, col_start:col_end] = 1

    # Mask of detected text. Detector boxes are scaled by the page size,
    # i.e. treated as relative coordinates.
    onnxtr_mask = np.zeros(canvas_shape)
    for block in onnxtr_page.bboxes:
        rel_x0, rel_y0 = block.bbox[0]
        rel_x1, rel_y1 = block.bbox[1]
        col_start = clamp(int(rel_x0 * p_width), int(p_width))
        row_start = clamp(int(rel_y0 * p_height), int(p_height))
        col_end = clamp(int(rel_x1 * p_width), int(p_width))
        row_end = clamp(int(rel_y1 * p_height), int(p_height))
        onnxtr_mask[row_start:row_end, col_start:col_end] = 1

    overlap_px = np.sum(np.logical_and(pdfium_mask, onnxtr_mask))
    covered_px = np.sum(np.logical_or(pdfium_mask, onnxtr_mask))
    iou = overlap_px / covered_px if covered_px != 0 else 0

    # Low agreement means the embedded text does not match what is visible
    return StrategyEnum.HI_RES if iou < threshold else StrategyEnum.FAST
def determine_global_strategy(pages: List[Page], threshold: float) -> StrategyEnum:
    """Derive a document-level strategy from the per-page strategies.

    Args:
        pages: Pages whose ``strategy`` attribute has already been set.
        threshold: Fraction of ``HI_RES`` pages that must be strictly exceeded
            for the whole document to be treated as ``HI_RES``.

    Returns:
        ``StrategyEnum.HI_RES`` when the share of ``HI_RES`` pages is greater
        than ``threshold``; ``StrategyEnum.FAST`` otherwise, including when
        ``pages`` is empty.
    """
    total_pages = len(pages)
    if total_pages == 0:
        # No page requested HI_RES; returning early also avoids dividing by zero.
        return StrategyEnum.FAST

    hi_res_pages = sum(1 for page in pages if page.strategy == StrategyEnum.HI_RES)
    if hi_res_pages / total_pages > threshold:
        return StrategyEnum.HI_RES
    return StrategyEnum.FAST
================================================
FILE: libs/megaparse/tests/__init__.py
================================================
================================================
FILE: libs/megaparse/tests/certs/client-cert.pem
================================================
-----BEGIN CERTIFICATE-----
MIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw
gZ0xHjAcBgNVBAoTFW1rY2VydCBkZXZlbG9wbWVudCBDQTE5MDcGA1UECwwwYW1p
bmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChhbWluZSBkaXJob3Vzc2kpMUAw
PgYDVQQDDDdta2NlcnQgYW1pbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChh
bWluZSBkaXJob3Vzc2kpMB4XDTI0MTExOTEwNDgwN1oXDTI3MDIxOTEwNDgwN1ow
ZDEnMCUGA1UEChMebWtjZXJ0IGRldmVsb3BtZW50IGNlcnRpZmljYXRlMTkwNwYD
VQQLDDBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhv
dXNzaSkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC2fDlGlKYIj8bp
tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5
KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH
qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN
gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8
ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT
WWVVcNfJAgMBAAGjgZowgZcwDgYDVR0PAQH/BAQDAgWgMCcGA1UdJQQgMB4GCCsG
AQUFBwMCBggrBgEFBQcDAQYIKwYBBQUHAwQwHwYDVR0jBBgwFoAUV2w3gvQM5La1
2fk80tJXoM/14l4wOwYDVR0RBDQwMoIJbG9jYWxob3N0gRNtZWdhcGFyc2VAcXVp
dnIuYXBwhxAAAAAAAAAAAAAAAAAAAAABMA0GCSqGSIb3DQEBCwUAA4IBgQAYq4VZ
6spwGvcqg8kCOghu6o54UPYo/NLzh3oYewJnDJ+2XD786TpTgjZMGA6Ms+det6oV
HdT5s77VFgJiJloHlD0fpKkRxjzyBOk5/bQcCKkTMBVfgJbMoAfa2gq+/7zxmLcn
AmNg7BkmsTtHWPsLyN3rYI4dkkDKWkxp8Sezm9WPEa9OGJDJSYf4Dq9pN1lUoP1p
vxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW
0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9
ze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr
drdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7
/E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw=
-----END CERTIFICATE-----
================================================
FILE: libs/megaparse/tests/certs/client-key.pem
================================================
-----BEGIN PRIVATE KEY-----
MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp
tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5
KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH
qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN
gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8
ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT
WWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU
QYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj
rGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj
BkEzj7CbaAGtW9vTamPJG/+wgaaBcPhplQk4cD2mjdaMLfGQXNXiYgp09kf0hJ2k
0QbsQBC85bMSfmPAsoTRLxi94S12at3SABgF0oOCy9FZs/sWsdJRI6nbfvZ3C4xo
8y+rH7Yaej7AYK+jbU3Uk/1473cuCAnNKg65UyU4+gECgYEA2/ZQYRDU3JWNHQGy
dJXZRl6hSFCw9y9RUc/QjcRs+VlnXE5UK1eLwfcKh0YYRhIWSE8z3mZmK09M/FG0
xbU4qIZbDYcAI2nCiUeT8HmTjVSPMS1oWZrt7rh00gcyoLQt2TUS3bo2tsmdPyWW
OgEiYfb4MoG/KCdYlACE6O4GMMECgYEA1GIMIHM2x4B1wgLnKeI3X2wYWuYCHtFB
Px56GUFTZytBsHghxtovVlLh88FNS5rthvXuE0FHE9RljKhZaNgqrPOrlAZSuv18
vK7RmG/NPJl2osbs677a/xoxNuVkfrRcxl4cvYOBL5huHo1D5sOitGFW+IlscgWY
nWzXlY7AYQkCgYA6H96hp7b4CzTc42Pq1uYxaDQqTdhVmVVdzxKHQ86gHXXouHIZ
eereeI95q5YifgkRVoyYSmrZKv1m95hTXk34inhpHLF2qi3T5Ow88YOCJ0QndJ5M
f1o8aNXF4k0IllQ/P30axmhK6P/6fc4yybXyOTbg8dQ3oh4QDgsRGkTcgQKBgQCG
qLgJpyN3cPK5FYAeJUl4nh//GlED2yekbp15/9py0pFu42x/GX3kHN8Y31oz8sJh
zPKrkLsRTp0ohuFRwaWlTUZfr3arCugY9jr8jP6zSpZW9QvpGXTfRGsp5F5Im/Eq
8ScF3ih91gcUJfuEiExUVFeBdBinXvb58bXrJLzDiQKBgG+Z06uj2dWxtK4nqJvP
HllTocAGVm+fEmupVsLU6ksVVrOl8O9TapMbY8pUj9J5oBYJvY+KFGoIoxYwhZrz
4NqY7iv8w+LQ7mQIwcQ4B67pDAQMJZTShR5v57FlAZldP5UpE5ASt22isBW31sYI
1OaXIqrCA/V43NydDezh0ylQ
-----END PRIVATE KEY-----
================================================
FILE: libs/megaparse/tests/conftest.py
================================================
from pathlib import Path
from typing import IO
import pytest_asyncio
from httpx import ASGITransport, AsyncClient
from langchain_community.document_loaders import PlaywrightURLLoader
from langchain_core.documents import Document
from megaparse.api.app import app, get_playwright_loader, parser_builder_dep
from megaparse.parser.base import BaseParser
from megaparse_sdk.schema.document import Document as MPDocument
from megaparse_sdk.schema.document import TextBlock
from megaparse_sdk.schema.extensions import FileExtension
class FakeParserBuilder:
    def build(self, *args, **kwargs) -> BaseParser:
        """
        Create a stub parser for tests; every argument is accepted and ignored.

        Returns:
            BaseParser: A parser whose ``convert`` and ``aconvert`` print a
            message and return a fixed, canned document.
        """

        def _canned_document() -> MPDocument:
            # Built anew on every call so results are never shared objects.
            return MPDocument(
                file_name="Fake file",
                content=[TextBlock(text="Fake conversion result", metadata={})],
                metadata={},
                detection_origin="fakeparser",
            )

        class FakeParser(BaseParser):
            def convert(
                self,
                file_path: str | Path | None = None,
                file: IO[bytes] | None = None,
                file_extension: None | FileExtension = None,
                **kwargs,
            ) -> MPDocument:
                # Synchronous stub: inputs are ignored.
                print("Fake parser is converting the file")
                return _canned_document()

            async def aconvert(
                self,
                file_path: str | Path | None = None,
                file: IO[bytes] | None = None,
                file_extension: None | FileExtension = None,
                **kwargs,
            ) -> MPDocument:
                # Asynchronous stub: same output as ``convert``.
                print("Fake parser is converting the file")
                return _canned_document()

        return FakeParser()
@pytest_asyncio.fixture(scope="function")
async def test_client():
    """Yield an HTTP client bound to the app with fake dependencies installed.

    The parser builder and the Playwright loader dependencies are overridden
    with in-memory fakes for the duration of the test. The overrides are
    reset in a ``finally`` block so that a failure while opening or closing
    the client cannot leak them into later tests.

    Yields:
        AsyncClient: Client that sends requests to ``app`` through an ASGI
        transport, using ``http://test`` as the base URL.
    """
    print("Setting up test_client fixture")

    def fake_parser_builder():
        return FakeParserBuilder()

    def fake_playwright_loader():
        class FakePlaywrightLoader(PlaywrightURLLoader):
            async def aload(self):
                # Return canned content instead of loading any URL.
                return [Document(page_content="Fake website content")]

        return FakePlaywrightLoader(urls=[], remove_selectors=["header", "footer"])

    app.dependency_overrides[parser_builder_dep] = fake_parser_builder
    app.dependency_overrides[get_playwright_loader] = fake_playwright_loader
    try:
        async with AsyncClient(
            transport=ASGITransport(app=app),  # type: ignore
            base_url="http://test",
        ) as ac:
            yield ac
    finally:
        # Always restore the app to its un-overridden state.
        app.dependency_overrides = {}
================================================
FILE: libs/megaparse/tests/data/grt_example/MegaFake_report.md
================================================
| My Mega fake report | #1756394 | 31/05/2024 |
|---------------------|----------|------------|
## Why Mega Parse might be the best ?
### Introduction
Mega Parse is a state-of-the-art document parser designed to convert various document formats such as PDF, DOCX, PPTX, and more into Markdown (MD) format, making them ready for Retrieval-Augmented Generation (RAG) ingestion. In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, be
gitextract_ylqgqesz/ ├── .aws/ │ └── task_definition.json ├── .flake8 ├── .gitattributes ├── .github/ │ └── workflows/ │ ├── CI.yml │ ├── build-and-deploy.yml │ ├── build-gpu.yml │ ├── release-please.yml │ └── test-build-docker.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── .release-please-manifest.json ├── .vscode/ │ ├── extensions.json │ ├── launch.json │ └── settings.json ├── CHANGELOG.md ├── Dockerfile ├── Dockerfile.gpu ├── LICENSE ├── Makefile ├── Pipfile ├── README.md ├── benchmark/ │ ├── process_single_doc.py │ └── test_quality_sim.py ├── docker-compose.dev.yml ├── docker-compose.yml ├── docs/ │ └── archive.txt ├── evaluations/ │ └── script.py ├── libs/ │ ├── megaparse/ │ │ ├── .python-version │ │ ├── CHANGELOG.md │ │ ├── README.md │ │ ├── bench.md │ │ ├── examples/ │ │ │ ├── parse_file_fast.py │ │ │ ├── parse_file_mp.py │ │ │ └── parse_file_unstructured.py │ │ ├── program.prof │ │ ├── pyproject.toml │ │ ├── src/ │ │ │ └── megaparse/ │ │ │ ├── __init__.py │ │ │ ├── api/ │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ ├── exceptions/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── megaparse_exceptions.py │ │ │ │ └── models/ │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ │ ├── configs/ │ │ │ │ └── auto.py │ │ │ ├── examples/ │ │ │ │ ├── parse_file.py │ │ │ │ └── parsing_process.py │ │ │ ├── exceptions/ │ │ │ │ └── base.py │ │ │ ├── formatter/ │ │ │ │ ├── base.py │ │ │ │ ├── structured_formatter/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── custom_structured_formatter.py │ │ │ │ └── table_formatter/ │ │ │ │ ├── __init__.py │ │ │ │ ├── llm_table_formatter.py │ │ │ │ └── vision_table_formatter.py │ │ │ ├── layout_detection/ │ │ │ │ ├── layout_detector.py │ │ │ │ ├── models/ │ │ │ │ │ └── yolov10s-doclaynet.onnx │ │ │ │ └── output.py │ │ │ ├── megaparse.py │ │ │ ├── models/ │ │ │ │ └── page.py │ │ │ ├── parser/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── builder.py │ │ │ │ ├── doctr_parser.py │ │ │ │ ├── entity.py │ │ │ │ ├── llama.py │ 
│ │ │ ├── megaparse_vision.py │ │ │ │ └── unstructured_parser.py │ │ │ ├── predictor/ │ │ │ │ └── layout_predictor.py │ │ │ └── utils/ │ │ │ ├── extract_metadata.py │ │ │ ├── onnx.py │ │ │ └── strategy.py │ │ └── tests/ │ │ ├── __init__.py │ │ ├── certs/ │ │ │ ├── client-cert.pem │ │ │ └── client-key.pem │ │ ├── conftest.py │ │ ├── data/ │ │ │ └── grt_example/ │ │ │ └── MegaFake_report.md │ │ ├── pdf/ │ │ │ ├── test_detect_ocr.py │ │ │ ├── test_pdf_processing.py │ │ │ └── test_pdfium_parser.py │ │ ├── supported_docs/ │ │ │ ├── Sway.epub │ │ │ ├── file-sample_500kB.odt │ │ │ ├── file_example_XLSX_50.xlsx │ │ │ ├── file_example_XLS_50.xls │ │ │ ├── sample.csv │ │ │ ├── sample.docx │ │ │ ├── sample.markdown │ │ │ ├── sample.md │ │ │ ├── sample.otf │ │ │ ├── sample.pptx │ │ │ ├── sample.txt │ │ │ ├── sample.xml │ │ │ └── sample_complexe.html │ │ ├── test_endpoints.py │ │ ├── test_import.py │ │ └── test_parsers.py │ └── megaparse_sdk/ │ ├── CHANGELOG.md │ ├── README.md │ ├── __init__.py │ ├── examples/ │ │ └── usage_example.py │ ├── megaparse_sdk/ │ │ ├── __init__.py │ │ ├── client.py │ │ ├── config.py │ │ ├── endpoints/ │ │ │ ├── __init__.py │ │ │ ├── file_upload.py │ │ │ └── url_upload.py │ │ ├── schema/ │ │ │ ├── __init__.py │ │ │ ├── document.py │ │ │ ├── extensions.py │ │ │ ├── languages.py │ │ │ ├── mp_exceptions.py │ │ │ ├── mp_inputs.py │ │ │ ├── mp_outputs.py │ │ │ ├── parser_config.py │ │ │ └── supported_models.py │ │ └── utils/ │ │ └── load_ssl.py │ ├── pyproject.toml │ └── tests/ │ ├── README.md │ ├── certs/ │ │ ├── client-cert.pem │ │ ├── client-key.pem │ │ └── rootCA.pem │ └── test_nats_client.py ├── pyproject.toml └── release-please-config.json
SYMBOL INDEX (270 symbols across 56 files)
FILE: benchmark/process_single_doc.py
function process_file (line 11) | async def process_file(megaparse: MegaParse, file_path: str | Path):
function test_process_file (line 24) | async def test_process_file(file: str | Path):
FILE: benchmark/test_quality_sim.py
function jaccard_similarity (line 9) | def jaccard_similarity(str1, str2):
function compare_files (line 24) | def compare_files(file_name):
function main (line 43) | def main():
FILE: libs/megaparse/examples/parse_file_fast.py
class File (line 9) | class File:
function list_files_in_directory (line 15) | def list_files_in_directory(directory_path: str) -> dict[str, list[File]]:
function main (line 36) | def main():
FILE: libs/megaparse/examples/parse_file_mp.py
class File (line 10) | class File:
function list_files_in_directory (line 16) | def list_files_in_directory(directory_path: str) -> dict[str, list[File]]:
function main (line 37) | def main():
FILE: libs/megaparse/examples/parse_file_unstructured.py
class File (line 9) | class File:
function list_files_in_directory (line 15) | def list_files_in_directory(directory_path: str) -> dict[str, list[File]]:
function main (line 36) | def main():
FILE: libs/megaparse/src/megaparse/api/app.py
function parser_builder_dep (line 36) | def parser_builder_dep():
function get_playwright_loader (line 40) | def get_playwright_loader():
function healthz (line 45) | def healthz():
function _check_free_memory (line 49) | def _check_free_memory() -> bool:
function parse_file (line 62) | async def parse_file(
function upload_url (line 122) | async def upload_url(
FILE: libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py
class HTTPModelNotSupported (line 4) | class HTTPModelNotSupported(HTTPException):
method __init__ (line 5) | def __init__(
class HTTPFileNotFound (line 13) | class HTTPFileNotFound(HTTPException):
method __init__ (line 14) | def __init__(
class HTTPDownloadError (line 21) | class HTTPDownloadError(HTTPException):
method __init__ (line 22) | def __init__(self, file_name, message="Failed to download the file"):
class HTTPParsingException (line 27) | class HTTPParsingException(HTTPException):
method __init__ (line 28) | def __init__(self, file_name, message="Failed to parse the file"):
class ParsingException (line 33) | class ParsingException(Exception):
method __init__ (line 36) | def __init__(self, message="An error occurred during parsing"):
FILE: libs/megaparse/src/megaparse/api/models/base.py
class MarkDownType (line 4) | class MarkDownType(str, Enum):
FILE: libs/megaparse/src/megaparse/configs/auto.py
class TextDetConfig (line 7) | class TextDetConfig(BaseModel):
class AutoStrategyConfig (line 16) | class AutoStrategyConfig(BaseModel):
class TextRecoConfig (line 21) | class TextRecoConfig(BaseModel):
class DeviceEnum (line 26) | class DeviceEnum(str, Enum):
class DoctrConfig (line 32) | class DoctrConfig(BaseModel):
class MegaParseConfig (line 40) | class MegaParseConfig(BaseSettings):
FILE: libs/megaparse/src/megaparse/examples/parse_file.py
class MyCustomFormat (line 7) | class MyCustomFormat(BaseModel):
function main (line 13) | def main():
FILE: libs/megaparse/src/megaparse/examples/parsing_process.py
function get_strategy_page (line 30) | def get_strategy_page(
function validate_input (line 86) | def validate_input(
function _generate_crops (line 118) | def _generate_crops(
function _prepare_crops (line 143) | def _prepare_crops(
function _process_predictions (line 169) | def _process_predictions(
function main (line 189) | def main():
FILE: libs/megaparse/src/megaparse/exceptions/base.py
class ParsingException (line 1) | class ParsingException(Exception):
method __init__ (line 4) | def __init__(self, message="An error occurred during parsing"):
FILE: libs/megaparse/src/megaparse/formatter/base.py
class BaseFormatter (line 9) | class BaseFormatter(ABC):
method __init__ (line 22) | def __init__(self, model: BaseChatModel | None = None):
method format (line 25) | def format(
method aformat (line 30) | async def aformat(
FILE: libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py
class StructuredFormatter (line 9) | class StructuredFormatter(BaseFormatter):
method __init__ (line 10) | def __init__(self, model: BaseChatModel, output_model: type[BaseModel]):
method aformat (line 14) | async def aformat(
method format (line 21) | def format(
FILE: libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py
class CustomStructuredFormatter (line 8) | class CustomStructuredFormatter(StructuredFormatter):
method format (line 9) | def format(
method aformat (line 45) | async def aformat(
FILE: libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py
class TableFormatter (line 7) | class TableFormatter(BaseFormatter):
method format (line 8) | def format(
method aformat (line 13) | async def aformat(
FILE: libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py
class SimpleMDTableFormatter (line 12) | class SimpleMDTableFormatter(TableFormatter):
method __init__ (line 21) | def __init__(self, model: Optional[BaseChatModel] = None):
method aformat (line 24) | async def aformat(
method format (line 34) | def format(
method format_table (line 62) | def format_table(
FILE: libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py
class VisionMDTableFormatter (line 19) | class VisionMDTableFormatter(TableFormatter):
method __init__ (line 28) | def __init__(self, model: Optional[BaseChatModel] = None):
method _crop_table_image (line 31) | def _crop_table_image(self, table_element: TableBlock, file_path: str)...
method aformat (line 55) | async def aformat(
method format (line 80) | def format(
method aformat_table (line 105) | async def aformat_table(
method format_table (line 123) | def format_table(self, table_element: TableBlock, file_path: str) -> T...
method process_file (line 139) | def process_file(self, images: List[Image.Image], image_format="PNG") ...
method avision_extract (line 154) | async def avision_extract(self, table_image: str) -> str:
method vision_extract (line 175) | def vision_extract(self, table_image: str) -> str:
FILE: libs/megaparse/src/megaparse/layout_detection/layout_detector.py
class LayoutDetector (line 45) | class LayoutDetector:
method __init__ (line 46) | def __init__(
method __call__ (line 81) | def __call__(
method extract_bboxes_from_page (line 115) | def extract_bboxes_from_page(
method nms (line 165) | def nms(
method topK (line 199) | def topK(
method _save_layout (line 211) | def _save_layout(
FILE: libs/megaparse/src/megaparse/layout_detection/output.py
class LayoutDetectionOutput (line 7) | class LayoutDetectionOutput(BaseModel):
FILE: libs/megaparse/src/megaparse/megaparse.py
class MegaParse (line 26) | class MegaParse:
method __init__ (line 27) | def __init__(
method validate_input (line 48) | def validate_input(
method extract_page_strategies (line 80) | def extract_page_strategies(
method load (line 145) | def load(
method aload (line 223) | async def aload(
FILE: libs/megaparse/src/megaparse/models/page.py
class PageDimension (line 10) | class PageDimension(BaseModel):
class Page (line 19) | class Page(BaseModel):
class GatewayDocument (line 34) | class GatewayDocument(BaseModel):
FILE: libs/megaparse/src/megaparse/parser/base.py
class BaseParser (line 9) | class BaseParser(ABC):
method check_supported_extension (line 14) | def check_supported_extension(
method aconvert (line 30) | async def aconvert(
method convert (line 53) | def convert(
FILE: libs/megaparse/src/megaparse/parser/builder.py
class ParserBuilder (line 15) | class ParserBuilder:
method build (line 16) | def build(self, config: ParseFileConfig) -> BaseParser:
FILE: libs/megaparse/src/megaparse/parser/doctr_parser.py
class DoctrParser (line 58) | class DoctrParser(NestedObject, _OCRPredictor):
method __init__ (line 61) | def __init__(
method get_text_detections (line 110) | def get_text_detections(self, pages: list[Page], **kwargs) -> List[Page]:
method get_text_recognition (line 188) | def get_text_recognition(
method _get_block_cls (line 261) | def _get_block_cls(
method __to_elements_list (line 291) | def __to_elements_list(
FILE: libs/megaparse/src/megaparse/parser/entity.py
class TagEnum (line 5) | class TagEnum(str, Enum):
class SupportedModel (line 14) | class SupportedModel(Enum):
method __init__ (line 20) | def __init__(self, model_name: str, supported_releases: Optional[List[...
method is_supported (line 25) | def is_supported(cls, model_name: str) -> bool:
FILE: libs/megaparse/src/megaparse/parser/llama.py
class LlamaParser (line 14) | class LlamaParser(BaseParser):
method __init__ (line 17) | def __init__(
method aconvert (line 34) | async def aconvert(
method convert (line 58) | def convert(
method __to_elements_list__ (line 82) | def __to_elements_list__(self, llama_doc: List[LlamaDocument]) -> MPDo...
FILE: libs/megaparse/src/megaparse/parser/megaparse_vision.py
class MegaParseVision (line 57) | class MegaParseVision(BaseParser):
method __init__ (line 60) | def __init__(self, model: BaseChatModel, **kwargs):
method process_file (line 71) | def process_file(self, file_path: str, image_format: str = "PNG") -> L...
method get_element (line 91) | def get_element(self, tag: TagEnum, chunk: str):
method asend_to_mlm (line 99) | async def asend_to_mlm(self, images_data: List[str]) -> str:
method send_to_mlm (line 122) | def send_to_mlm(self, images_data: List[str]) -> str:
method aconvert (line 145) | async def aconvert(
method convert (line 178) | def convert(
method get_cleaned_content (line 214) | def get_cleaned_content(self, parsed_file: str) -> str:
method __to_elements_list__ (line 253) | def __to_elements_list__(self, mpv_doc: str, n_pages: int) -> MPDocument:
FILE: libs/megaparse/src/megaparse/parser/unstructured_parser.py
class UnstructuredParser (line 32) | class UnstructuredParser(BaseParser):
method __init__ (line 49) | def __init__(
method convert (line 55) | def convert(
method aconvert (line 73) | async def aconvert(
method __to_mp_document (line 88) | def __to_mp_document(self, elements: List[Element]) -> MPDocument:
method __convert_element_to_block (line 98) | def __convert_element_to_block(self, element: Element) -> Block | None:
FILE: libs/megaparse/src/megaparse/predictor/layout_predictor.py
function extract_layout (line 7) | def extract_layout(
FILE: libs/megaparse/src/megaparse/utils/extract_metadata.py
function get_doc_metdata (line 6) | def get_doc_metdata(pdfium_document: pdfium.PdfDocument) -> Dict[str, Any]:
FILE: libs/megaparse/src/megaparse/utils/onnx.py
function get_providers (line 10) | def get_providers(device: DeviceEnum) -> List[str]:
FILE: libs/megaparse/src/megaparse/utils/strategy.py
function get_page_strategy (line 10) | def get_page_strategy(
function determine_global_strategy (line 66) | def determine_global_strategy(pages: List[Page], threshold: float) -> St...
FILE: libs/megaparse/tests/conftest.py
class FakeParserBuilder (line 15) | class FakeParserBuilder:
method build (line 16) | def build(self, *args, **kwargs) -> BaseParser:
function test_client (line 62) | async def test_client():
FILE: libs/megaparse/tests/pdf/test_detect_ocr.py
function test_hi_res_strategy (line 16) | def test_hi_res_strategy(hi_res_pdf):
function test_fast_strategy (line 31) | def test_fast_strategy(native_pdf):
FILE: libs/megaparse/tests/pdf/test_pdf_processing.py
function native_pdf (line 16) | def native_pdf() -> Path:
function scanned_pdf (line 22) | def scanned_pdf() -> Path:
function test_async_megaparse_pdf_processor_file_path (line 34) | async def test_async_megaparse_pdf_processor_file_path(pdf_name, request):
function test_sync_megaparse_pdf_processor_file_path (line 42) | def test_sync_megaparse_pdf_processor_file_path(pdf_name, request):
function test_megaparse_pdf_processor_file (line 51) | async def test_megaparse_pdf_processor_file(pdf_name, request):
function test_strategy_native (line 59) | def test_strategy_native(native_pdf):
function test_strategy_scanned (line 74) | def test_strategy_scanned(scanned_pdf):
FILE: libs/megaparse/tests/pdf/test_pdfium_parser.py
function test_pdfium (line 6) | def test_pdfium():
FILE: libs/megaparse/tests/test_endpoints.py
function test_parse_file_endpoint (line 5) | async def test_parse_file_endpoint(test_client):
function test_parse_url_endpoint (line 23) | async def test_parse_url_endpoint(test_client):
FILE: libs/megaparse/tests/test_import.py
function test_load (line 6) | def test_load():
FILE: libs/megaparse/tests/test_parsers.py
function test_sync_parser (line 18) | def test_sync_parser(parser, extension):
FILE: libs/megaparse_sdk/examples/usage_example.py
function main (line 7) | async def main():
FILE: libs/megaparse_sdk/megaparse_sdk/__init__.py
class MegaParseSDK (line 6) | class MegaParseSDK:
method __init__ (line 7) | def __init__(self, api_key: str | None = None, base_url: str | None = ...
method close (line 12) | async def close(self):
FILE: libs/megaparse_sdk/megaparse_sdk/client.py
class MegaParseClient (line 40) | class MegaParseClient:
method __init__ (line 41) | def __init__(
method request (line 57) | async def request(self, method: str, endpoint: str, **kwargs: Any) -> ...
method close (line 71) | async def close(self):
class ClientState (line 75) | class ClientState(enum.Enum):
class MegaParseNATSClient (line 84) | class MegaParseNATSClient:
method __init__ (line 85) | def __init__(self, config: ClientNATSConfig):
method _get_nc (line 97) | async def _get_nc(self):
method __aenter__ (line 109) | async def __aenter__(self: Self) -> Self:
method __aexit__ (line 124) | async def __aexit__(
method parse_url (line 133) | async def parse_url(self, url: str):
method parse_file (line 137) | async def parse_file(
method _send_req (line 158) | async def _send_req(self, inp: MPInput) -> str | Document:
method _send_req_inner (line 171) | async def _send_req_inner(self, inp: MPInput):
method _handle_mp_output (line 181) | def _handle_mp_output(self, response: MPOutput) -> str | Document:
method aclose (line 200) | async def aclose(self):
FILE: libs/megaparse_sdk/megaparse_sdk/config.py
class MegaParseSDKConfig (line 5) | class MegaParseSDKConfig(BaseSettings):
class SSLConfig (line 17) | class SSLConfig(BaseModel):
class ClientNATSConfig (line 23) | class ClientNATSConfig(BaseSettings):
FILE: libs/megaparse_sdk/megaparse_sdk/endpoints/file_upload.py
class UploadFileConfig (line 11) | class UploadFileConfig(BaseModel):
class FileUpload (line 20) | class FileUpload:
method __init__ (line 21) | def __init__(self, client: MegaParseClient):
method upload (line 24) | async def upload(
FILE: libs/megaparse_sdk/megaparse_sdk/endpoints/url_upload.py
class URLUpload (line 6) | class URLUpload:
method __init__ (line 7) | def __init__(self, client: MegaParseClient):
method upload (line 10) | async def upload(self, url: str, max_retries: int = 3) -> Response:
FILE: libs/megaparse_sdk/megaparse_sdk/schema/document.py
class Point2D (line 10) | class Point2D(NamedTuple):
class BlockType (line 15) | class BlockType(str, Enum):
class BBOX (line 19) | class BBOX(NamedTuple):
method to_numpy (line 23) | def to_numpy(self):
method iou (line 28) | def iou(self, other: Self):
class BlockLayout (line 44) | class BlockLayout(BaseModel):
class TextDetection (line 50) | class TextDetection:
method __init__ (line 59) | def __init__(
method __repr__ (line 73) | def __repr__(self) -> str:
method render (line 76) | def render(
method get_loc_preds (line 104) | def get_loc_preds(self) -> np.ndarray:
method get_objectness_scores (line 114) | def get_objectness_scores(self) -> np.ndarray:
method get_origin_page_shapes (line 124) | def get_origin_page_shapes(self) -> np.ndarray:
method get_orientations (line 134) | def get_orientations(self) -> np.ndarray:
class Block (line 145) | class Block(BaseModel):
method validate_range (line 161) | def validate_range(cls, value):
class TextBlock (line 172) | class TextBlock(Block):
method __str__ (line 180) | def __str__(self):
class UndefinedBlock (line 184) | class UndefinedBlock(TextBlock):
class TitleBlock (line 193) | class TitleBlock(TextBlock):
method __str__ (line 199) | def __str__(self):
class SubTitleBlock (line 203) | class SubTitleBlock(TextBlock):
method __str__ (line 210) | def __str__(self):
class CaptionBlock (line 215) | class CaptionBlock(TextBlock):
class ImageBlock (line 223) | class ImageBlock(Block):
method __str__ (line 231) | def __str__(self) -> str:
class TableBlock (line 235) | class TableBlock(ImageBlock):
method __str__ (line 241) | def __str__(self):
class ListElementBlock (line 245) | class ListElementBlock(TextBlock):
class ListBlock (line 254) | class ListBlock(Block):
method __str__ (line 264) | def __str__(self):
class HeaderBlock (line 271) | class HeaderBlock(TextBlock):
method __str__ (line 277) | def __str__(self):
class FooterBlock (line 281) | class FooterBlock(TextBlock):
method __str__ (line 287) | def __str__(self):
class SectionBlock (line 291) | class SectionBlock(Block):
method __str__ (line 301) | def __str__(self):
class TOCItem (line 307) | class TOCItem(BaseModel):
method validate_range (line 313) | def validate_range(cls, value):
method __str__ (line 321) | def __str__(self):
class TOC (line 331) | class TOC(BaseModel):
method text (line 335) | def text(self) -> str:
method __str__ (line 338) | def __str__(self):
class Document (line 342) | class Document(BaseModel):
method __str__ (line 355) | def __str__(self) -> str:
method clean (line 369) | def clean(self):
FILE: libs/megaparse_sdk/megaparse_sdk/schema/extensions.py
class FileExtension (line 4) | class FileExtension(str, Enum):
method __new__ (line 9) | def __new__(cls, value: str, mimetype: str):
method mimetype (line 39) | def mimetype(self) -> str:
FILE: libs/megaparse_sdk/megaparse_sdk/schema/languages.py
class Language (line 4) | class Language(str, Enum):
FILE: libs/megaparse_sdk/megaparse_sdk/schema/mp_exceptions.py
class ModelNotSupported (line 1) | class ModelNotSupported(Exception):
method __init__ (line 2) | def __init__(
class MemoryLimitExceeded (line 9) | class MemoryLimitExceeded(Exception):
method __init__ (line 10) | def __init__(self, message="The service is under high memory pressure"):
class InternalServiceError (line 14) | class InternalServiceError(Exception):
method __init__ (line 15) | def __init__(self, message="Internal service error occured"):
class DownloadError (line 19) | class DownloadError(Exception):
method __init__ (line 20) | def __init__(self, message="Failed to download the file"):
class ParsingException (line 24) | class ParsingException(Exception):
method __init__ (line 25) | def __init__(self, message="An error occurred during parsing"):
FILE: libs/megaparse_sdk/megaparse_sdk/schema/mp_inputs.py
class FileInput (line 10) | class FileInput(BaseModel):
method decode_data (line 16) | def decode_data(cls, value):
method serialize_data (line 27) | def serialize_data(self, data: bytes, _info):
class MPParseType (line 31) | class MPParseType(str, Enum):
class ParseFileInput (line 36) | class ParseFileInput(BaseModel):
class ParseUrlInput (line 42) | class ParseUrlInput(BaseModel):
class MPInput (line 47) | class MPInput(BaseModel):
FILE: libs/megaparse_sdk/megaparse_sdk/schema/mp_outputs.py
class MPErrorType (line 9) | class MPErrorType(Enum):
class ParseError (line 17) | class ParseError(BaseModel):
class MPOutputType (line 22) | class MPOutputType(str, Enum):
class MPOutput (line 27) | class MPOutput(BaseModel):
FILE: libs/megaparse_sdk/megaparse_sdk/schema/parser_config.py
class ParserType (line 10) | class ParserType(str, Enum):
class StrategyEnum (line 18) | class StrategyEnum(str, Enum):
class ParseFileConfig (line 26) | class ParseFileConfig(BaseModel):
FILE: libs/megaparse_sdk/megaparse_sdk/schema/supported_models.py
class SupportedModel (line 4) | class SupportedModel(str, Enum):
method __str__ (line 24) | def __str__(self):
method is_supported (line 28) | def is_supported(cls, model_name: str) -> bool:
method get_supported_models (line 33) | def get_supported_models(cls) -> list[str]:
FILE: libs/megaparse_sdk/megaparse_sdk/utils/load_ssl.py
function load_ssl_cxt (line 6) | def load_ssl_cxt(ssl_config: SSLConfig):
FILE: libs/megaparse_sdk/tests/test_nats_client.py
function ssl_config (line 36) | def ssl_config() -> SSLConfig:
function nc_config (line 45) | def nc_config(ssl_config: SSLConfig) -> ClientNATSConfig:
function nats_service (line 61) | async def nats_service(nc_config: ClientNATSConfig):
function test_client_state_transition (line 76) | async def test_client_state_transition(nc_config: ClientNATSConfig):
function test_client_parse_file (line 89) | async def test_client_parse_file(nats_service: Client, nc_config: Client...
function test_client_parse_url (line 105) | async def test_client_parse_url(nats_service: Client, nc_config: ClientN...
function test_client_parse_timeout (line 120) | async def test_client_parse_timeout(nats_service: Client, ssl_config: SS...
function test_client_parse_timeout_retry (line 142) | async def test_client_parse_timeout_retry(nats_service: Client, ssl_conf...
function test_client_parse_file_excp (line 178) | async def test_client_parse_file_excp(
Condensed preview — 123 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (526K chars).
[
{
"path": ".aws/task_definition.json",
"chars": 2121,
"preview": "{\n \"taskDefinitionArn\": \"arn:aws:ecs:eu-west-1:253053805092:task-definition/megaparse-task:2\",\n \"containerDefiniti"
},
{
"path": ".flake8",
"chars": 157,
"preview": "[flake8]\n; Minimal configuration for Flake8 to work with Black.\nmax-line-length = 100\nignore = E101,E111,E112,E221,E222,"
},
{
"path": ".gitattributes",
"chars": 50,
"preview": "*.ipynb linguist-vendored\n*.html linguist-vendored"
},
{
"path": ".github/workflows/CI.yml",
"chars": 2417,
"preview": "name: Run tests\n\non:\n pull_request:\n workflow_dispatch:\n\nenv:\n NATS_TOKEN: test\n\njobs:\n test:\n name: Run tests on"
},
{
"path": ".github/workflows/build-and-deploy.yml",
"chars": 1652,
"preview": "name: Build Docker image and push ECR\n\non:\n push:\n tags:\n - \"v*\"\n branches: [main]\n\nenv:\n AWS_REGION: eu-we"
},
{
"path": ".github/workflows/build-gpu.yml",
"chars": 1687,
"preview": "name: Build docker GPU and push ECR\n\non:\n push:\n tags:\n - \"v*\"\n branches: [main]\n\nenv:\n AWS_REGION: eu-west"
},
{
"path": ".github/workflows/release-please.yml",
"chars": 1901,
"preview": "on:\n push:\n branches:\n - main\n\npermissions:\n contents: write\n pull-requests: write\n\nname: release-please\n\njob"
},
{
"path": ".github/workflows/test-build-docker.yml",
"chars": 780,
"preview": "on:\n pull_request:\n branches:\n - main\n\nname: Test build docker\njobs:\n build-docker:\n runs-on: ubuntu-latest"
},
{
"path": ".gitignore",
"chars": 298,
"preview": "/output\n/input\n.env\n__pycache__/\ndist/**\nmegaparse.egg-info/\n*.pyc\nbuild/*\nENV\nvenv\n*/evaluations/*\n*/cdp/*\n*.pkl\n\n!mega"
},
{
"path": ".pre-commit-config.yaml",
"chars": 927,
"preview": "repos:\n - repo: https://github.com/pre-commit/pre-commit-hooks\n rev: v4.6.0\n hooks:\n - id: check-added-large"
},
{
"path": ".python-version",
"chars": 7,
"preview": "3.11.9\n"
},
{
"path": ".release-please-manifest.json",
"chars": 67,
"preview": "{\n \"libs/megaparse\": \"0.0.55\",\n \"libs/megaparse_sdk\": \"0.1.12\"\n}\n"
},
{
"path": ".vscode/extensions.json",
"chars": 242,
"preview": "{\n \"recommendations\": [\n \"dbaeumer.vscode-eslint\",\n \"charliermarsh.ruff\",\n \"knisterpeter.vscode-github\",\n \""
},
{
"path": ".vscode/launch.json",
"chars": 1196,
"preview": "{\n \"version\": \"0.2.0\",\n \"configurations\": [\n {\n \"name\": \"Python: Remote Attach\",\n \"ty"
},
{
"path": ".vscode/settings.json",
"chars": 981,
"preview": "{\n \"editor.formatOnSave\": true,\n \"editor.formatOnSaveMode\": \"file\",\n \"files.exclude\": {\n \"**/__pycache__\": true,\n "
},
{
"path": "CHANGELOG.md",
"chars": 13104,
"preview": "# Changelog\n\n## [0.0.46](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.45...megaparse-v0.0.46) (2024-11-21"
},
{
"path": "Dockerfile",
"chars": 1133,
"preview": "FROM python:3.11.10-slim-bullseye\n\nWORKDIR /app\n\n# Install runtime dependencies\nRUN apt-get update && apt-get upgrade &&"
},
{
"path": "Dockerfile.gpu",
"chars": 1775,
"preview": "FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu20.04\n\nWORKDIR /app\n\nENV UV_COMPILE_BYTECODE=1\nENV UV_NO_CACHE=1\nENV DEBIAN_FR"
},
{
"path": "LICENSE",
"chars": 11357,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "Makefile",
"chars": 647,
"preview": ".DEFAULT_TARGET=help\n\n## help: Display list of commands\n.PHONY: help\nhelp:\n\t@echo \"Available commands:\"\n\t@sed -n 's|^##|"
},
{
"path": "Pipfile",
"chars": 139,
"preview": "[[source]]\nurl = \"https://pypi.org/simple\"\nverify_ssl = true\nname = \"pypi\"\n\n[packages]\n\n[dev-packages]\n\n[requires]\npytho"
},
{
"path": "README.md",
"chars": 3304,
"preview": "# MegaParse - Your Parser for every type of documents\n\n<div align=\"center\">\n <img src=\"https://raw.githubusercontent."
},
{
"path": "benchmark/process_single_doc.py",
"chars": 1436,
"preview": "import asyncio\nimport time\nfrom pathlib import Path\n\nimport numpy as np\nfrom megaparse import MegaParse\n\nN_TRY = 1\n\n\nasy"
},
{
"path": "benchmark/test_quality_sim.py",
"chars": 1935,
"preview": "import os\nimport difflib\nfrom pathlib import Path\n\nauto_dir = Path(\"benchmark/auto\")\nhi_res_dir = Path(\"benchmark/hi_res"
},
{
"path": "docker-compose.dev.yml",
"chars": 504,
"preview": "version: \"3.8\"\n\nservices:\n megaparse:\n build:\n context: .\n dockerfile: Dockerfile\n cache_from:\n "
},
{
"path": "docker-compose.yml",
"chars": 462,
"preview": "version: \"3.8\"\n\nservices:\n megaparse:\n image: megaparse:latest\n pull_policy: if_not_present\n container_name: m"
},
{
"path": "docs/archive.txt",
"chars": 542,
"preview": "### (Optional) Use LlamaParse for Improved Results\n\n1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) "
},
{
"path": "evaluations/script.py",
"chars": 2371,
"preview": "import difflib\nimport os\n\nfrom langchain_openai import ChatOpenAI\nfrom megaparse.megaparse import MegaParse\nfrom megapar"
},
{
"path": "libs/megaparse/.python-version",
"chars": 6,
"preview": "3.11.9"
},
{
"path": "libs/megaparse/CHANGELOG.md",
"chars": 6933,
"preview": "# Changelog\n\n## [0.0.55](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.54...megaparse-v0.0.55) (2025-02-14"
},
{
"path": "libs/megaparse/README.md",
"chars": 289,
"preview": "# MegaParse CORE\n\n- Core package of megaparse\n\n> **Note:** The test files in `tests/pdf/ocr` and `tests/pdf/native` come"
},
{
"path": "libs/megaparse/bench.md",
"chars": 4883,
"preview": "------------\nUNSTRUCTURED(HI-RES):\n------------\n\nfolder: cdp\n cdp_etiquette.pdf parsing took: 2.10s\nfolder: scan"
},
{
"path": "libs/megaparse/examples/parse_file_fast.py",
"chars": 1793,
"preview": "import os\nfrom dataclasses import dataclass\nfrom time import perf_counter\n\nfrom unstructured.partition.auto import parti"
},
{
"path": "libs/megaparse/examples/parse_file_mp.py",
"chars": 1738,
"preview": "import os\nfrom dataclasses import dataclass\nfrom time import perf_counter\n\nfrom megaparse import MegaParse\nfrom megapars"
},
{
"path": "libs/megaparse/examples/parse_file_unstructured.py",
"chars": 1655,
"preview": "import os\nfrom dataclasses import dataclass\nfrom time import perf_counter\n\nfrom unstructured.partition.auto import parti"
},
{
"path": "libs/megaparse/pyproject.toml",
"chars": 1580,
"preview": "[project]\nname = \"megaparse\"\nversion = \"0.0.55\"\nauthors = [\n { name = \"Stan Girard\", email = \"stan@quivr.app\" },\n "
},
{
"path": "libs/megaparse/src/megaparse/__init__.py",
"chars": 58,
"preview": "from .megaparse import MegaParse\n\n__all__ = [\"MegaParse\"]\n"
},
{
"path": "libs/megaparse/src/megaparse/api/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "libs/megaparse/src/megaparse/api/app.py",
"chars": 5132,
"preview": "import io\nimport os\nimport tempfile\nfrom typing import Any, Optional\n\nimport httpx\nimport psutil\nimport uvicorn\nfrom fas"
},
{
"path": "libs/megaparse/src/megaparse/api/exceptions/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py",
"chars": 1202,
"preview": "from fastapi import HTTPException\n\n\nclass HTTPModelNotSupported(HTTPException):\n def __init__(\n self,\n "
},
{
"path": "libs/megaparse/src/megaparse/api/models/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "libs/megaparse/src/megaparse/api/models/base.py",
"chars": 552,
"preview": "from enum import Enum\n\n\nclass MarkDownType(str, Enum):\n \"\"\"Markdown type enumeration.\"\"\"\n\n TITLE = \"Title\"\n SUB"
},
{
"path": "libs/megaparse/src/megaparse/configs/auto.py",
"chars": 1329,
"preview": "from enum import Enum\n\nfrom pydantic import BaseModel\nfrom pydantic_settings import BaseSettings, SettingsConfigDict\n\n\nc"
},
{
"path": "libs/megaparse/src/megaparse/examples/parse_file.py",
"chars": 665,
"preview": "from pathlib import Path\n\nfrom megaparse.megaparse import MegaParse\nfrom pydantic import BaseModel, Field\n\n\nclass MyCust"
},
{
"path": "libs/megaparse/src/megaparse/examples/parsing_process.py",
"chars": 13418,
"preview": "from pathlib import Path\nfrom typing import IO, Any, List, Tuple\n\nimport numpy as np\nimport onnxruntime as rt\nimport pyp"
},
{
"path": "libs/megaparse/src/megaparse/exceptions/base.py",
"chars": 236,
"preview": "class ParsingException(Exception):\n \"\"\"Exception raised for errors in the parsing process.\"\"\"\n\n def __init__(self,"
},
{
"path": "libs/megaparse/src/megaparse/formatter/base.py",
"chars": 1183,
"preview": "from abc import ABC\nfrom pathlib import Path\nfrom typing import Union\n\nfrom langchain_core.language_models.chat_models i"
},
{
"path": "libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py",
"chars": 851,
"preview": "from pathlib import Path\n\nfrom langchain_core.language_models.chat_models import BaseChatModel\nfrom megaparse.formatter."
},
{
"path": "libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py",
"chars": 2902,
"preview": "from pathlib import Path\n\nfrom megaparse.formatter.structured_formatter import StructuredFormatter\nfrom megaparse_sdk.sc"
},
{
"path": "libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py",
"chars": 539,
"preview": "from pathlib import Path\n\nfrom megaparse.formatter.base import BaseFormatter\nfrom megaparse_sdk.schema.document import D"
},
{
"path": "libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py",
"chars": 3685,
"preview": "import re\nimport warnings\nfrom pathlib import Path\nfrom typing import Optional\n\nfrom langchain_core.language_models.chat"
},
{
"path": "libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py",
"chars": 7238,
"preview": "import base64\nfrom io import BytesIO\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom langchain_core.lan"
},
{
"path": "libs/megaparse/src/megaparse/layout_detection/layout_detector.py",
"chars": 8017,
"preview": "import logging\nimport os\nimport pathlib\nimport uuid\nfrom typing import Any, List\n\nimport numpy as np\nimport onnxruntime "
},
{
"path": "libs/megaparse/src/megaparse/layout_detection/output.py",
"chars": 207,
"preview": "from uuid import UUID\n\nfrom megaparse_sdk.schema.document import BBOX\nfrom pydantic import BaseModel\n\n\nclass LayoutDetec"
},
{
"path": "libs/megaparse/src/megaparse/megaparse.py",
"chars": 11687,
"preview": "import logging\nimport warnings\nfrom pathlib import Path\nfrom typing import IO, BinaryIO, List\n\nimport pypdfium2 as pdfiu"
},
{
"path": "libs/megaparse/src/megaparse/models/page.py",
"chars": 904,
"preview": "from typing import List\n\nfrom megaparse_sdk.schema.document import TextDetection\nfrom megaparse_sdk.schema.parser_config"
},
{
"path": "libs/megaparse/src/megaparse/parser/__init__.py",
"chars": 55,
"preview": "from .base import BaseParser\n\n__all__ = [\"BaseParser\"]\n"
},
{
"path": "libs/megaparse/src/megaparse/parser/base.py",
"chars": 2525,
"preview": "from abc import ABC, abstractmethod\nfrom pathlib import Path\nfrom typing import IO\n\nfrom megaparse_sdk.schema.document i"
},
{
"path": "libs/megaparse/src/megaparse/parser/builder.py",
"chars": 901,
"preview": "from megaparse_sdk.schema.parser_config import ParseFileConfig\n\nfrom megaparse.parser.base import BaseParser\nfrom megapa"
},
{
"path": "libs/megaparse/src/megaparse/parser/doctr_parser.py",
"chars": 13389,
"preview": "import logging\nimport uuid\nfrom typing import Any, Dict, List, Tuple, Type\nfrom uuid import UUID\n\nimport numpy as np\nimp"
},
{
"path": "libs/megaparse/src/megaparse/parser/entity.py",
"chars": 1249,
"preview": "from enum import Enum\nfrom typing import List, Optional\n\n\nclass TagEnum(str, Enum):\n \"\"\"Possible tags for the element"
},
{
"path": "libs/megaparse/src/megaparse/parser/llama.py",
"chars": 3400,
"preview": "from pathlib import Path\nfrom typing import IO, List\n\nfrom llama_index.core.schema import Document as LlamaDocument\nfrom"
},
{
"path": "libs/megaparse/src/megaparse/parser/megaparse_vision.py",
"chars": 10617,
"preview": "import asyncio\nimport base64\nimport re\nfrom io import BytesIO\nfrom pathlib import Path\nfrom typing import IO, List\n\nfrom"
},
{
"path": "libs/megaparse/src/megaparse/parser/unstructured_parser.py",
"chars": 13989,
"preview": "import warnings\nfrom pathlib import Path\nfrom typing import IO, Dict, List\n\nfrom dotenv import load_dotenv\nfrom langchai"
},
{
"path": "libs/megaparse/src/megaparse/predictor/layout_predictor.py",
"chars": 798,
"preview": "from PIL import Image\nfrom unstructured_inference.inference.layout import PageLayout\nfrom unstructured_inference.models."
},
{
"path": "libs/megaparse/src/megaparse/utils/extract_metadata.py",
"chars": 144,
"preview": "from typing import Any, Dict\n\nimport pypdfium2 as pdfium\n\n\ndef get_doc_metdata(pdfium_document: pdfium.PdfDocument) -> D"
},
{
"path": "libs/megaparse/src/megaparse/utils/onnx.py",
"chars": 972,
"preview": "import logging\nfrom typing import List\n\nimport onnxruntime as rt\nfrom megaparse.configs.auto import DeviceEnum\n\nlogger ="
},
{
"path": "libs/megaparse/src/megaparse/utils/strategy.py",
"chars": 2374,
"preview": "from typing import List\n\nimport numpy as np\nfrom megaparse.models.page import Page\nfrom megaparse_sdk.schema.document im"
},
{
"path": "libs/megaparse/tests/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "libs/megaparse/tests/certs/client-cert.pem",
"chars": 1675,
"preview": "-----BEGIN CERTIFICATE-----\nMIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw\ngZ0xHjAcBgNVBAoTFW1rY2VydCB"
},
{
"path": "libs/megaparse/tests/certs/client-key.pem",
"chars": 1704,
"preview": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp\ntlDYh8ooc56Zt+R1HF1GcqF0Gv+"
},
{
"path": "libs/megaparse/tests/conftest.py",
"chars": 2877,
"preview": "from pathlib import Path\nfrom typing import IO\n\nimport pytest_asyncio\nfrom httpx import ASGITransport, AsyncClient\nfrom "
},
{
"path": "libs/megaparse/tests/data/grt_example/MegaFake_report.md",
"chars": 10093,
"preview": "| My Mega fake report | #1756394 | 31/05/2024 |\n|---------------------|----------|------------|\n\n## Why Mega Parse might"
},
{
"path": "libs/megaparse/tests/pdf/test_detect_ocr.py",
"chars": 1282,
"preview": "import os\n\nimport pypdfium2\nimport pytest\nfrom megaparse.megaparse import MegaParse\nfrom megaparse.utils.strategy import"
},
{
"path": "libs/megaparse/tests/pdf/test_pdf_processing.py",
"chars": 2429,
"preview": "from pathlib import Path\n\nimport pypdfium2\nimport pytest\nfrom megaparse.configs.auto import (\n DeviceEnum,\n MegaPa"
},
{
"path": "libs/megaparse/tests/pdf/test_pdfium_parser.py",
"chars": 299,
"preview": "from pathlib import Path\n\nimport pypdfium2 as pdfium\n\n\ndef test_pdfium():\n # scanned pdf\n p = Path(\"./tests/pdf/ml"
},
{
"path": "libs/megaparse/tests/supported_docs/sample.csv",
"chars": 258,
"preview": "Name,Description\nMegaParse,\"MegaParse is the best parser, even with accents like é, è, and ñ.\"\nOtherParse,\"OtherParse is"
},
{
"path": "libs/megaparse/tests/supported_docs/sample.markdown",
"chars": 1304,
"preview": "# The Difficulty of Parsing Files\n\nParsing files can be a challenging task due to several factors:\n\n## 1. File Format Va"
},
{
"path": "libs/megaparse/tests/supported_docs/sample.md",
"chars": 1304,
"preview": "# The Difficulty of Parsing Files\n\nParsing files can be a challenging task due to several factors:\n\n## 1. File Format Va"
},
{
"path": "libs/megaparse/tests/supported_docs/sample.txt",
"chars": 1261,
"preview": "Lorem ipsum \n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \n\nVestibulum neque massa, "
},
{
"path": "libs/megaparse/tests/supported_docs/sample.xml",
"chars": 553,
"preview": "<?xml version=\"1.0\"?>\n<customers>\n <customer id=\"55000\">\n <name>Charter Group</name>\n <address>\n <st"
},
{
"path": "libs/megaparse/tests/supported_docs/sample_complexe.html",
"chars": 235948,
"preview": "\n<!-- saved from url=(0065)https://demo.borland.com/testsite/stadyn_largepagewithimages.html -->\n<html><head><meta http-"
},
{
"path": "libs/megaparse/tests/test_endpoints.py",
"chars": 931,
"preview": "import pytest\n\n\n@pytest.mark.asyncio\nasync def test_parse_file_endpoint(test_client):\n # Simulate a request to the pa"
},
{
"path": "libs/megaparse/tests/test_import.py",
"chars": 256,
"preview": "import pytest\nfrom megaparse import MegaParse\n\n\n@pytest.mark.skip(\"slow test\")\ndef test_load():\n megaparse = MegaPars"
},
{
"path": "libs/megaparse/tests/test_parsers.py",
"chars": 1184,
"preview": "import os\n\nimport pytest\nfrom megaparse.parser.doctr_parser import DoctrParser\nfrom megaparse.parser.llama import LlamaP"
},
{
"path": "libs/megaparse_sdk/CHANGELOG.md",
"chars": 3617,
"preview": "# Changelog\n\n## [0.1.12](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.11...megaparse-sdk-v0.1.12) (20"
},
{
"path": "libs/megaparse_sdk/README.md",
"chars": 1960,
"preview": "## MegaParse SDK\n\nWelcome to the MegaParse SDK! This SDK allows you to easily interact with the MegaParse API to upload "
},
{
"path": "libs/megaparse_sdk/__init__.py",
"chars": 1,
"preview": "\n"
},
{
"path": "libs/megaparse_sdk/examples/usage_example.py",
"chars": 963,
"preview": "import asyncio\nimport os\n\nfrom megaparse.sdk.megaparse_sdk import MegaParseSDK\n\n\nasync def main():\n api_key = str(os."
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/__init__.py",
"chars": 434,
"preview": "from .client import MegaParseClient\nfrom .endpoints.file_upload import FileUpload\nfrom .endpoints.url_upload import URLU"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/client.py",
"chars": 6948,
"preview": "import asyncio\nimport enum\nimport logging\nimport os\nfrom io import BytesIO\nfrom pathlib import Path\nfrom types import Tr"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/config.py",
"chars": 1056,
"preview": "from pydantic import BaseModel, FilePath\nfrom pydantic_settings import BaseSettings, SettingsConfigDict\n\n\nclass MegaPars"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/endpoints/__init__.py",
"chars": 1,
"preview": "\n"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/endpoints/file_upload.py",
"chars": 1494,
"preview": "from typing import Optional\n\nfrom httpx import Response\nfrom pydantic import BaseModel\n\nfrom megaparse_sdk.client import"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/endpoints/url_upload.py",
"chars": 449,
"preview": "from httpx import Response\n\nfrom megaparse_sdk.client import MegaParseClient\n\n\nclass URLUpload:\n def __init__(self, c"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/schema/__init__.py",
"chars": 1,
"preview": "\n"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/schema/document.py",
"chars": 10368,
"preview": "import uuid\nfrom enum import Enum\nfrom typing import Any, Dict, List, Literal, NamedTuple, Optional, Self, Tuple\n\nimport"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/schema/extensions.py",
"chars": 1126,
"preview": "from enum import Enum\n\n\nclass FileExtension(str, Enum):\n \"\"\"Supported file extension enumeration.\"\"\"\n\n _mimetype: "
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/schema/languages.py",
"chars": 1709,
"preview": "from enum import Enum\n\n\nclass Language(str, Enum):\n BAZA = \"abq\"\n ADYGHE = \"ady\"\n AFRIKAANS = \"af\"\n ANGIKA ="
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/schema/mp_exceptions.py",
"chars": 739,
"preview": "class ModelNotSupported(Exception):\n def __init__(\n self,\n message: str = \"The requested model is not s"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/schema/mp_inputs.py",
"chars": 1360,
"preview": "import base64\nfrom enum import Enum\nfrom typing import Literal, Union\n\nfrom pydantic import BaseModel, Field, field_seri"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/schema/mp_outputs.py",
"chars": 679,
"preview": "from enum import Enum, auto\nfrom typing import Dict\n\nfrom pydantic import BaseModel, Field\n\nfrom megaparse_sdk.schema.do"
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/schema/parser_config.py",
"chars": 780,
"preview": "from enum import Enum\nfrom typing import Optional\n\nfrom pydantic import BaseModel\n\nfrom .languages import Language\nfrom "
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/schema/supported_models.py",
"chars": 1074,
"preview": "from enum import Enum\n\n\nclass SupportedModel(str, Enum):\n \"\"\"Supported models enumeration.\"\"\"\n\n # OpenAI Models\n "
},
{
"path": "libs/megaparse_sdk/megaparse_sdk/utils/load_ssl.py",
"chars": 383,
"preview": "import ssl\n\nfrom megaparse_sdk.config import SSLConfig\n\n\ndef load_ssl_cxt(ssl_config: SSLConfig):\n context = ssl.SSLC"
},
{
"path": "libs/megaparse_sdk/pyproject.toml",
"chars": 558,
"preview": "[project]\nname = \"megaparse-sdk\"\nversion = \"0.1.12\"\ndescription = \"Megaparse SDK\"\ndependencies = [\n \"python-dotenv>=1"
},
{
"path": "libs/megaparse_sdk/tests/README.md",
"chars": 0,
"preview": ""
},
{
"path": "libs/megaparse_sdk/tests/certs/client-cert.pem",
"chars": 1675,
"preview": "-----BEGIN CERTIFICATE-----\nMIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw\ngZ0xHjAcBgNVBAoTFW1rY2VydCB"
},
{
"path": "libs/megaparse_sdk/tests/certs/client-key.pem",
"chars": 1704,
"preview": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp\ntlDYh8ooc56Zt+R1HF1GcqF0Gv+"
},
{
"path": "libs/megaparse_sdk/tests/certs/rootCA.pem",
"chars": 1809,
"preview": "-----BEGIN CERTIFICATE-----\nMIIFCzCCA3OgAwIBAgIQESt0eck2KvFrAMyiDyceujANBgkqhkiG9w0BAQsFADCB\nnTEeMBwGA1UEChMVbWtjZXJ0IGR"
},
{
"path": "libs/megaparse_sdk/tests/test_nats_client.py",
"chars": 6463,
"preview": "import asyncio\nimport logging\nfrom pathlib import Path\n\nimport nats\nimport pytest\nimport pytest_asyncio\nfrom megaparse_s"
},
{
"path": "pyproject.toml",
"chars": 2200,
"preview": "[project]\nname = \"megaparse-monorepo\"\nversion = \"0.0.1\"\ndescription = \"Megaparse monorepo\"\nauthors = [\n { name = \"Sta"
},
{
"path": "release-please-config.json",
"chars": 546,
"preview": "{\n \"$schema\": \"https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json\",\n \"separate-pull-"
}
]
// ... and 9 more files (download for full content)
About this extraction
This page contains the full source code of the QuivrHQ/MegaParse GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 123 files (28.4 MB), approximately 139.7k tokens, and a symbol index with 270 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.