Repository: QuivrHQ/MegaParse
Branch: main
Commit: ba9a24aec950
Files: 123
Total size: 28.4 MB
Directory structure:
gitextract_ylqgqesz/
├── .aws/
│ └── task_definition.json
├── .flake8
├── .gitattributes
├── .github/
│ └── workflows/
│ ├── CI.yml
│ ├── build-and-deploy.yml
│ ├── build-gpu.yml
│ ├── release-please.yml
│ └── test-build-docker.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── .release-please-manifest.json
├── .vscode/
│ ├── extensions.json
│ ├── launch.json
│ └── settings.json
├── CHANGELOG.md
├── Dockerfile
├── Dockerfile.gpu
├── LICENSE
├── Makefile
├── Pipfile
├── README.md
├── benchmark/
│ ├── process_single_doc.py
│ └── test_quality_sim.py
├── docker-compose.dev.yml
├── docker-compose.yml
├── docs/
│ └── archive.txt
├── evaluations/
│ └── script.py
├── libs/
│ ├── megaparse/
│ │ ├── .python-version
│ │ ├── CHANGELOG.md
│ │ ├── README.md
│ │ ├── bench.md
│ │ ├── examples/
│ │ │ ├── parse_file_fast.py
│ │ │ ├── parse_file_mp.py
│ │ │ └── parse_file_unstructured.py
│ │ ├── program.prof
│ │ ├── pyproject.toml
│ │ ├── src/
│ │ │ └── megaparse/
│ │ │ ├── __init__.py
│ │ │ ├── api/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ ├── exceptions/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── megaparse_exceptions.py
│ │ │ │ └── models/
│ │ │ │ ├── __init__.py
│ │ │ │ └── base.py
│ │ │ ├── configs/
│ │ │ │ └── auto.py
│ │ │ ├── examples/
│ │ │ │ ├── parse_file.py
│ │ │ │ └── parsing_process.py
│ │ │ ├── exceptions/
│ │ │ │ └── base.py
│ │ │ ├── formatter/
│ │ │ │ ├── base.py
│ │ │ │ ├── structured_formatter/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── custom_structured_formatter.py
│ │ │ │ └── table_formatter/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── llm_table_formatter.py
│ │ │ │ └── vision_table_formatter.py
│ │ │ ├── layout_detection/
│ │ │ │ ├── layout_detector.py
│ │ │ │ ├── models/
│ │ │ │ │ └── yolov10s-doclaynet.onnx
│ │ │ │ └── output.py
│ │ │ ├── megaparse.py
│ │ │ ├── models/
│ │ │ │ └── page.py
│ │ │ ├── parser/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── builder.py
│ │ │ │ ├── doctr_parser.py
│ │ │ │ ├── entity.py
│ │ │ │ ├── llama.py
│ │ │ │ ├── megaparse_vision.py
│ │ │ │ └── unstructured_parser.py
│ │ │ ├── predictor/
│ │ │ │ └── layout_predictor.py
│ │ │ └── utils/
│ │ │ ├── extract_metadata.py
│ │ │ ├── onnx.py
│ │ │ └── strategy.py
│ │ └── tests/
│ │ ├── __init__.py
│ │ ├── certs/
│ │ │ ├── client-cert.pem
│ │ │ └── client-key.pem
│ │ ├── conftest.py
│ │ ├── data/
│ │ │ └── grt_example/
│ │ │ └── MegaFake_report.md
│ │ ├── pdf/
│ │ │ ├── test_detect_ocr.py
│ │ │ ├── test_pdf_processing.py
│ │ │ └── test_pdfium_parser.py
│ │ ├── supported_docs/
│ │ │ ├── Sway.epub
│ │ │ ├── file-sample_500kB.odt
│ │ │ ├── file_example_XLSX_50.xlsx
│ │ │ ├── file_example_XLS_50.xls
│ │ │ ├── sample.csv
│ │ │ ├── sample.docx
│ │ │ ├── sample.markdown
│ │ │ ├── sample.md
│ │ │ ├── sample.otf
│ │ │ ├── sample.pptx
│ │ │ ├── sample.txt
│ │ │ ├── sample.xml
│ │ │ └── sample_complexe.html
│ │ ├── test_endpoints.py
│ │ ├── test_import.py
│ │ └── test_parsers.py
│ └── megaparse_sdk/
│ ├── CHANGELOG.md
│ ├── README.md
│ ├── __init__.py
│ ├── examples/
│ │ └── usage_example.py
│ ├── megaparse_sdk/
│ │ ├── __init__.py
│ │ ├── client.py
│ │ ├── config.py
│ │ ├── endpoints/
│ │ │ ├── __init__.py
│ │ │ ├── file_upload.py
│ │ │ └── url_upload.py
│ │ ├── schema/
│ │ │ ├── __init__.py
│ │ │ ├── document.py
│ │ │ ├── extensions.py
│ │ │ ├── languages.py
│ │ │ ├── mp_exceptions.py
│ │ │ ├── mp_inputs.py
│ │ │ ├── mp_outputs.py
│ │ │ ├── parser_config.py
│ │ │ └── supported_models.py
│ │ └── utils/
│ │ └── load_ssl.py
│ ├── pyproject.toml
│ └── tests/
│ ├── README.md
│ ├── certs/
│ │ ├── client-cert.pem
│ │ ├── client-key.pem
│ │ └── rootCA.pem
│ └── test_nats_client.py
├── pyproject.toml
└── release-please-config.json
================================================
FILE CONTENTS
================================================
================================================
FILE: .aws/task_definition.json
================================================
{
"taskDefinitionArn": "arn:aws:ecs:eu-west-1:253053805092:task-definition/megaparse-task:2",
"containerDefinitions": [
{
"name": "megaparse",
"image": "quay.io/unstructured-io/unstructured-api:latest",
"cpu": 0,
"portMappings": [
{
"containerPort": 8000,
"hostPort": 8000,
"protocol": "tcp"
}
],
"essential": true,
"environment": [
{
"name": "UNSTRUCTURED_HI_RES_MODEL_NAME",
"value": "detectron2_onnx"
},
{
"name": "UNSTRUCTURED_PARALLEL_MODE_ENABLED",
"value": "false"
}
],
"mountPoints": [],
"volumesFrom": [],
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": "/ecs/megaparse",
"awslogs-region": "eu-west-1",
"awslogs-stream-prefix": "ecs"
}
},
"systemControls": []
}
],
"family": "megaparse-task",
"executionRoleArn": "arn:aws:iam::253053805092:role/megaparse-ecsTaskExecutionRole",
"networkMode": "awsvpc",
"revision": 2,
"volumes": [],
"status": "ACTIVE",
"requiresAttributes": [
{
"name": "com.amazonaws.ecs.capability.logging-driver.awslogs"
},
{
"name": "ecs.capability.execution-role-awslogs"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.19"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.18"
},
{
"name": "ecs.capability.task-eni"
}
],
"placementConstraints": [],
"compatibilities": [
"EC2",
"FARGATE"
],
"requiresCompatibilities": [
"FARGATE"
],
"cpu": "2048",
"memory": "8192",
"tags": []
}
================================================
FILE: .flake8
================================================
[flake8]
; Minimal configuration for Flake8 to work with Black.
; Line-length limit. Note that E501 (line too long) is also listed in `ignore`
; below, so pycodestyle itself will not report over-long lines.
max-line-length = 100
; Checks silenced project-wide:
;   E101/E111/E112  indentation checks
;   E221/E222       multiple spaces around an operator
;   E501            line too long
;   E711/E712       comparison to None / True / False
;   W503/W504       line break before / after a binary operator
;   F401            module imported but unused
;   BLK100          flake8-black "Black would make changes"
ignore = E101,E111,E112,E221,E222,E501,E711,E712,W503,W504,F401,BLK100
================================================
FILE: .gitattributes
================================================
# Mark notebooks and HTML files as vendored so GitHub Linguist leaves them out
# of the repository's language statistics.
*.ipynb linguist-vendored
*.html linguist-vendored
================================================
FILE: .github/workflows/CI.yml
================================================
# Test workflow: installs system packages and a local NATS server, then runs
# the megaparse-sdk test suite on every Python version in the matrix.
name: Run tests

on:
  pull_request:
  workflow_dispatch:

env:
  # Token the local NATS server is started with (see "Set up NATS arguments").
  NATS_TOKEN: test

jobs:
  test:
    name: Run tests on Python ${{ matrix.python-version }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11", "3.12"]
    steps:
      - name: 👀 Checkout code
        uses: actions/checkout@v4
        with:
          submodules: true

      # actions/cache v1/v2 have been retired by GitHub; v4 is the supported major.
      # NOTE(review): hashFiles() only matches files inside GITHUB_WORKSPACE, so the
      # hash of /etc/apt/sources.list presumably evaluates to an empty string and the
      # key is effectively constant per OS — confirm whether that is intended.
      - name: Setup apt cache
        uses: actions/cache@v4
        with:
          path: /var/cache/apt/archives
          key: ${{ runner.os }}-apt-${{ hashFiles('/etc/apt/sources.list') }}

      - name: 😭 Install system dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y \
            netcat-traditional \
            unzip \
            libgeos-dev \
            libcurl4-openssl-dev \
            libssl-dev \
            binutils \
            curl \
            git \
            autoconf \
            automake \
            build-essential \
            libtool \
            gcc \
            libmagic-dev \
            poppler-utils \
            tesseract-ocr \
            libreoffice \
            libpq-dev \
            pandoc

      - name: 🔽 Install the latest version of rye
        uses: eifinger/setup-rye@v4
        with:
          enable-cache: true

      - name: 📌 Pin Python version
        run: rye pin ${{ matrix.python-version }}

      - name: 🔽 Download and Install NATS Server
        run: |
          curl -L https://github.com/nats-io/nats-server/releases/download/v2.10.22/nats-server-v2.10.22-linux-amd64.zip -o nats-server.zip
          unzip nats-server.zip -d nats-server && sudo cp nats-server/nats-server-v2.10.22-linux-amd64/nats-server /usr/bin

      # Started in the background; output goes to nats.log for the check below.
      - name: 🛠️ Set up NATS arguments
        run: |
          nohup nats-server \
            --addr 0.0.0.0 \
            --port 4222 \
            --auth "$NATS_TOKEN" > nats.log 2>&1 &

      # Poll the port for up to ~10 seconds instead of relying on a single
      # fixed one-second sleep, which can race with server start-up.
      - name: 🔍 Verify NATS Server is Running
        run: |
          for attempt in $(seq 1 10); do
            if nc -zv localhost 4222; then
              echo "✅ NATS Server is running on port 4222."
              exit 0
            fi
            sleep 1
          done
          echo "❌ Failed to start NATS Server."
          cat nats.log
          exit 1

      - name: 🔨 Sync dependencies
        run: |
          UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock

      # Only the megaparse-sdk package is tested here.
      - name: 🚀 Run tests
        run: |
          rye test -p megaparse-sdk
================================================
FILE: .github/workflows/build-and-deploy.yml
================================================
# Builds the image from ./Dockerfile and pushes it to the public ECR registry,
# tagged with the commit SHA and with `latest`.
name: Build Docker image and push ECR

on:
  push:
    tags:
      - "v*"
    branches: [main]

env:
  # Only ECR_REPOSITORY is referenced by the steps below; the other values are
  # not read anywhere in this workflow.
  AWS_REGION: eu-west-1
  ECR_REPOSITORY: quivrhq/megaparse
  ECS_CLUSTER: megaparse
  ECS_TASK_DEFINITION: .aws/task_definition.json
  CONTAINER_NAME: megaparse

permissions:
  contents: read

jobs:
  deploy:
    name: build docker
    runs-on: ubuntu-latest
    environment: production
    outputs:
      imageoutput: ${{ steps.build-image.outputs.imageoutput }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # NOTE(review): the region is hard-coded to us-east-1 rather than using
      # env.AWS_REGION — presumably because the login below targets the public
      # ECR registry; confirm before changing.
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2
        with:
          registry-type: public

      - name: Build, tag, and push image to Amazon ECR
        id: build-image
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          IMAGE_TAG: ${{ github.sha }}
        # Expansions are quoted so an unexpected character in any value cannot
        # split the docker arguments.
        run: |
          # Build a docker container and push it to ECR
          docker build -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" .
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
          # Tag the image as 'latest' and push
          docker tag "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" "$ECR_REGISTRY/$ECR_REPOSITORY:latest"
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:latest"
          echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"
================================================
FILE: .github/workflows/build-gpu.yml
================================================
# Builds the image from ./Dockerfile.gpu on the GPU runner group and pushes it
# to the public ECR registry, tagged with the commit SHA and with `latest`.
name: Build docker GPU and push ECR

on:
  push:
    tags:
      - "v*"
    branches: [main]

env:
  # Only ECR_REPOSITORY is referenced by the steps below; the other values are
  # not read anywhere in this workflow.
  AWS_REGION: eu-west-1
  ECR_REPOSITORY: quivrhq/megaparse-gpu
  ECS_CLUSTER: megaparse
  ECS_TASK_DEFINITION: .aws/task_definition.json
  CONTAINER_NAME: megaparse

permissions:
  contents: read

jobs:
  deploy:
    name: Build docker-gpu
    runs-on:
      group: big-boy-gpu
    environment: production
    outputs:
      imageoutput: ${{ steps.build-image.outputs.imageoutput }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # NOTE(review): the region is hard-coded to us-east-1 rather than using
      # env.AWS_REGION — presumably because the login below targets the public
      # ECR registry; confirm before changing.
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2
        with:
          registry-type: public

      - name: Build, tag, and push image to Amazon ECR
        id: build-image
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          IMAGE_TAG: ${{ github.sha }}
        # Expansions are quoted so an unexpected character in any value cannot
        # split the docker arguments.
        run: |
          # Build a docker container and push it to ECR
          docker build -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" -f Dockerfile.gpu .
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
          # Tag the image as 'latest' and push
          docker tag "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" "$ECR_REGISTRY/$ECR_REPOSITORY:latest"
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:latest"
          echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"
================================================
FILE: .github/workflows/release-please.yml
================================================
# Runs release-please on every push to main and, when it reports that a release
# was created for a package, builds and publishes that package with rye.
name: release-please

on:
  push:
    branches:
      - main

permissions:
  contents: write
  pull-requests: write

jobs:
  release-please:
    runs-on: ubuntu-latest
    outputs:
      # Per-package outputs of the release step, keyed by package path.
      release_created: ${{ steps.release.outputs['libs/megaparse--release_created'] }}
      release_created_sdk: ${{ steps.release.outputs['libs/megaparse_sdk--release_created'] }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for tags and releases

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      # The action now lives under the googleapis organisation; the former
      # google-github-actions location is no longer maintained.
      - name: Run release-please
        id: release
        uses: googleapis/release-please-action@v4
        with:
          token: ${{ secrets.RELEASE_PLEASE_TOKEN }}

  # Publishes libs/megaparse when release-please created a release for it.
  deploy-megaparse:
    if: needs.release-please.outputs.release_created == 'true'
    needs: release-please
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # Same setup-rye major as the test workflow (CI.yml).
      - name: Install Rye
        uses: eifinger/setup-rye@v4
        with:
          enable-cache: true

      - name: Rye Sync
        run: rye sync --no-lock

      - name: Rye Build
        run: cd libs/megaparse && rye build

      - name: Rye Publish
        run: cd libs/megaparse && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes

  # Publishes libs/megaparse_sdk when release-please created a release for it.
  deploy-sdk:
    if: needs.release-please.outputs.release_created_sdk == 'true'
    needs: release-please
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install Rye
        uses: eifinger/setup-rye@v4
        with:
          enable-cache: true

      - name: Rye Sync
        run: cd libs/megaparse_sdk && rye sync --no-lock

      - name: Rye Build
        run: cd libs/megaparse_sdk && rye build

      - name: Rye Publish
        run: cd libs/megaparse_sdk && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes
================================================
FILE: .github/workflows/test-build-docker.yml
================================================
# Pull-request check: builds each Dockerfile in the matrix without pushing the
# resulting image, using the GitHub Actions cache for build layers.
name: Test build docker

on:
  pull_request:
    branches: [main]

jobs:
  build-docker:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        dockerfile:
          - Dockerfile
          - Dockerfile.gpu
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
        with:
          platforms: all

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # The matrix entry is used both as the Dockerfile path and as the image tag.
      - name: Build Docker image with caching
        uses: docker/build-push-action@v4
        with:
          context: .
          file: ${{ matrix.dockerfile }}
          push: false
          tags: quivrhq/megaparse:${{ matrix.dockerfile }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
================================================
FILE: .gitignore
================================================
# Root-level input/output directories
/output
/input
# Local environment file
.env
# Python bytecode, build and packaging output
__pycache__/
dist/**
megaparse.egg-info/
*.pyc
build/*
# Virtual environments
ENV
venv
# Generated data one directory below the root, and pickles anywhere
*/evaluations/*
*/cdp/*
*.pkl
# Negation: keep this report tracked even if a pattern above matches it.
# NOTE(review): the path starts at megaparse/, while the tree in this repository
# places the package under libs/megaparse/ — confirm the rule still matches a file.
!megaparse/tests/output_tests/MegaFake_report.md
*.DS_Store
.tool-versions
megaparse/sdk/examples/only_pdfs/*
# Profiling output
**/profile/
**/prof/
.ropeproject/
# Benchmark result directories
benchmark/hi_res/*
benchmark/auto/*
================================================
FILE: .pre-commit-config.yaml
================================================
# pre-commit hooks: generic file hygiene, manifest validation, Ruff lint/format
# and mypy type checking.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: check-added-large-files
        args:
          - "--maxkb=5000"
      - id: check-toml
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: check-merge-conflict
      - id: detect-private-key
      - id: check-case-conflict

  - repo: https://github.com/pre-commit/pre-commit
    rev: v3.6.2
    hooks:
      - id: validate_manifest

  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.5.1
    hooks:
      # Run the linter.
      - id: ruff
        args:
          - --fix
        additional_dependencies: []
      # Run the formatter.
      - id: ruff-format
        additional_dependencies: []

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.10.1
    hooks:
      - id: mypy
        name: mypy
        additional_dependencies:
          - "types-aiofiles"
================================================
FILE: .python-version
================================================
3.11.9
================================================
FILE: .release-please-manifest.json
================================================
{
"libs/megaparse": "0.0.55",
"libs/megaparse_sdk": "0.1.12"
}
================================================
FILE: .vscode/extensions.json
================================================
{
"recommendations": [
"dbaeumer.vscode-eslint",
"charliermarsh.ruff",
"knisterpeter.vscode-github",
"github.vscode-pull-request-github",
"ms-python.python",
"ms-python.vscode-pylance",
"ms-python.debugpy"
]
}
================================================
FILE: .vscode/launch.json
================================================
{
    // VS Code debug configurations. All entries use the "debugpy" debugger type,
    // matching the ms-python.debugpy extension recommended in extensions.json;
    // the older "python" type is deprecated.
    "version": "0.2.0",
    "configurations": [
        {
            // Attach to a debug server already listening on localhost:5678.
            // NOTE(review): localRoot points at ${workspaceFolder}/backend, which
            // is not part of this repository's tree — confirm the mapping.
            "name": "Python: Remote Attach",
            "type": "debugpy",
            "request": "attach",
            "connect": {
                "host": "localhost",
                "port": 5678
            },
            "pathMappings": [
                {
                    "localRoot": "${workspaceFolder}/backend",
                    "remoteRoot": "."
                }
            ],
            "justMyCode": true
        },
        {
            // NOTE(review): the program path below is under backend/, which is
            // not part of this repository's tree — confirm it still exists.
            "name": "Python: Debug Test Script",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/backend/test_process_file_and_notify.py",
            "console": "integratedTerminal",
            "justMyCode": false
        },
        {
            // Launch the file currently open in the editor, loading variables
            // from the workspace .env file.
            "name": "Python: Debug",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false,
            "env": {
                "PYTHONPATH": "${workspaceFolder}/backend:${env:PYTHONPATH}"
            },
            "envFile": "${workspaceFolder}/.env"
        }
    ]
}
================================================
FILE: .vscode/settings.json
================================================
{
"editor.formatOnSave": true,
"editor.formatOnSaveMode": "file",
"files.exclude": {
"**/__pycache__": true,
"**/.benchmarks/": true,
"**/.cache/": true,
"**/.pytest_cache/": true,
"**/.next/": true,
"**/build/": true,
"**/.docusaurus/": true,
"**/node_modules/": true
},
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.fixAll": "explicit"
}
},
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.autoTestDiscoverOnSaveEnabled": true,
"python.analysis.autoImportCompletions": true,
"python.analysis.typeCheckingMode": "basic",
"python.analysis.diagnosticSeverityOverrides": {
"reportMissingImports": "error",
"reportUnusedImport": "warning",
"reportGeneralTypeIssues": "warning"
},
"makefile.configureOnOpen": false
}
================================================
FILE: CHANGELOG.md
================================================
# Changelog
## [0.0.46](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.45...megaparse-v0.0.46) (2024-11-21)
### Features
* refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334))
## [0.0.45](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.44...megaparse-v0.0.45) (2024-11-19)
### Bug Fixes
* small fixes from backlogs ([#128](https://github.com/QuivrHQ/MegaParse/issues/128)) ([954554c](https://github.com/QuivrHQ/MegaParse/commit/954554c5abaa7b0513e9ff3f6bbaff393d36cf03))
## [0.0.44](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.43...megaparse-v0.0.44) (2024-11-18)
### Bug Fixes
* fixing the wrong passing of arguments to the parse_file endpoint ([#123](https://github.com/QuivrHQ/MegaParse/issues/123)) ([9105672](https://github.com/QuivrHQ/MegaParse/commit/9105672abc0942f26785e494053112d486e8d2d9))
## [0.0.43](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.42...megaparse-v0.0.43) (2024-11-14)
### Features
* increase the robustness of megaparse ([#121](https://github.com/QuivrHQ/MegaParse/issues/121)) ([d21d8bb](https://github.com/QuivrHQ/MegaParse/commit/d21d8bb77bd8e687b1a951db6b81653e4e47a8bb))
### Bug Fixes
* uvicorn version ([#127](https://github.com/QuivrHQ/MegaParse/issues/127)) ([ceaba3d](https://github.com/QuivrHQ/MegaParse/commit/ceaba3df2951be27e6a4835e5784917a62867896))
* version requirements ([#126](https://github.com/QuivrHQ/MegaParse/issues/126)) ([a10d502](https://github.com/QuivrHQ/MegaParse/commit/a10d502f1b3576690cebe33b656d2480a24defe3))
## [0.0.42](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.41...megaparse-v0.0.42) (2024-11-08)
### Features
* **sdk:** new version ([e377cd6](https://github.com/QuivrHQ/MegaParse/commit/e377cd6df98b3ea9265788a4d907b43bde796196))
## [0.0.41](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.40...megaparse-v0.0.41) (2024-11-08)
### Bug Fixes
* add megaparse url env variable ([#118](https://github.com/QuivrHQ/MegaParse/issues/118)) ([132c2eb](https://github.com/QuivrHQ/MegaParse/commit/132c2ebd13177fd116c4e710a4b1c864a9fa04bb))
## [0.0.40](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.39...megaparse-v0.0.40) (2024-11-08)
### Bug Fixes
* sdk version ([#116](https://github.com/QuivrHQ/MegaParse/issues/116)) ([8bfeb4a](https://github.com/QuivrHQ/MegaParse/commit/8bfeb4a52326a5f645d3ed20e113153dc19bf012))
## [0.0.39](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.38...megaparse-v0.0.39) (2024-11-08)
### Bug Fixes
* add_logs ([#114](https://github.com/QuivrHQ/MegaParse/issues/114)) ([63c9236](https://github.com/QuivrHQ/MegaParse/commit/63c9236590016ee4c210174e746e96ff2b654480))
## [0.0.38](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.37...megaparse-v0.0.38) (2024-11-07)
### Bug Fixes
* env roots, imports root ([#112](https://github.com/QuivrHQ/MegaParse/issues/112)) ([a04230d](https://github.com/QuivrHQ/MegaParse/commit/a04230dc2de9e0bb0bde39ab66b2208f80743922))
## [0.0.37](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.36...megaparse-v0.0.37) (2024-11-07)
### Features
* bump megaparse-sdk version to 0.1.1 ([ed3fdfb](https://github.com/QuivrHQ/MegaParse/commit/ed3fdfb10498c95d4f9a510df3a2913e0dfc3c23))
## [0.0.36](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.35...megaparse-v0.0.36) (2024-11-07)
### Features
* **readme:** update ([9d571b7](https://github.com/QuivrHQ/MegaParse/commit/9d571b7c71db610e7a0b08045ad98994ecf71baa))
## [0.0.35](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.34...megaparse-v0.0.35) (2024-11-07)
### Bug Fixes
* unnecessary dep and readme ([#107](https://github.com/QuivrHQ/MegaParse/issues/107)) ([b80aaa3](https://github.com/QuivrHQ/MegaParse/commit/b80aaa3a894b2bd2c7d7f518919c41af5c99219f))
## [0.0.34](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.33...megaparse-v0.0.34) (2024-11-07)
### Features
* megaparse-sdk-cherry ([#105](https://github.com/QuivrHQ/MegaParse/issues/105)) ([ad44aa3](https://github.com/QuivrHQ/MegaParse/commit/ad44aa34999596e156c78f91adab97bce7ceeb0e))
## [0.0.33](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.32...megaparse-v0.0.33) (2024-11-01)
### Bug Fixes
* readme ([#99](https://github.com/QuivrHQ/MegaParse/issues/99)) ([b3b80a3](https://github.com/QuivrHQ/MegaParse/commit/b3b80a3a599bbd4bec8ed79bb9ef44c8c7c92789))
## [0.0.32](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.31...megaparse-v0.0.32) (2024-11-01)
### Features
* **api:** megaparse under api ([#93](https://github.com/QuivrHQ/MegaParse/issues/93)) ([2edf44b](https://github.com/QuivrHQ/MegaParse/commit/2edf44bd8c09ac7127db74206e463ebe29c68998))
### Bug Fixes
* api call error & tests ([#98](https://github.com/QuivrHQ/MegaParse/issues/98)) ([6bf1ce8](https://github.com/QuivrHQ/MegaParse/commit/6bf1ce8c6ed0e4f1e81577973a0fc71f61b10776))
## [0.0.31](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.30...megaparse-v0.0.31) (2024-08-20)
### Features
* **pytorch:** cpu only removed ([#88](https://github.com/QuivrHQ/MegaParse/issues/88)) ([6b2fcfa](https://github.com/QuivrHQ/MegaParse/commit/6b2fcfa4413b8a72d398aab57f277dd28ab69c2f))
## [0.0.30](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.29...megaparse-v0.0.30) (2024-08-20)
### Features
* **pytorch:** cpu only optional ([#86](https://github.com/QuivrHQ/MegaParse/issues/86)) ([e5d8806](https://github.com/QuivrHQ/MegaParse/commit/e5d8806ee6182de250352ce65ac6cd57c1093494))
## [0.0.29](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.28...megaparse-v0.0.29) (2024-08-18)
### Bug Fixes
* **building:** version not working ([#83](https://github.com/QuivrHQ/MegaParse/issues/83)) ([c5e73f6](https://github.com/QuivrHQ/MegaParse/commit/c5e73f6c821424ef277ddd15ddb5b2df48ff7ab2))
## [0.0.28](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.27...megaparse-v0.0.28) (2024-08-16)
### Features
* **rye:** added package manager ([#81](https://github.com/QuivrHQ/MegaParse/issues/81)) ([a3a50a3](https://github.com/QuivrHQ/MegaParse/commit/a3a50a3f27d3d9b4d6de4f3415472f8e52710656))
## [0.0.27](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.26...megaparse-v0.0.27) (2024-08-16)
### Features
* **unstructured:** increased version ([#78](https://github.com/QuivrHQ/MegaParse/issues/78)) ([eb49cf5](https://github.com/QuivrHQ/MegaParse/commit/eb49cf5e79cd7a38c8212b315a4b64860c35a7b7))
## [0.0.26](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.25...megaparse-v0.0.26) (2024-08-16)
### Bug Fixes
* **pycrypto:** being used by an old version of pdfplumber ([#76](https://github.com/QuivrHQ/MegaParse/issues/76)) ([d28f88c](https://github.com/QuivrHQ/MegaParse/commit/d28f88ceb2a722b15c84738f395b3ff4c818a365))
## [0.0.25](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.24...megaparse-v0.0.25) (2024-08-16)
### Features
* **rye:** implemented ([#74](https://github.com/QuivrHQ/MegaParse/issues/74)) ([1e9ad8e](https://github.com/QuivrHQ/MegaParse/commit/1e9ad8e0000f28c709d915219fe62c0dbe7fa812))
## [0.0.24](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.23...megaparse-v0.0.24) (2024-07-30)
### Features
* async load ([#71](https://github.com/QuivrHQ/MegaParse/issues/71)) ([fbc3e1b](https://github.com/QuivrHQ/MegaParse/commit/fbc3e1b5f504eee9757e15592169ddad9b069f03))
## [0.0.23](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.22...megaparse-v0.0.23) (2024-07-30)
### Features
* megaparse 0.0.22 ([071fd4d](https://github.com/QuivrHQ/MegaParse/commit/071fd4da2e8f0abb58fc66c3cdd87c4ee5cda4d6))
## 0.0.20 (2024-07-10)
## What's Changed
* add: resolve multiple page problem on llama parse by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/61
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.19...v0.0.20
## 0.0.19 (2024-06-28)
## What's Changed
* add: choose unstructured strategy by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/57
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.18...v0.0.19
## 0.0.18 (2024-06-28)
## What's Changed
* fix: add __init__.py by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/54
* fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/56
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.17...v0.0.18
## 0.0.17 (2024-06-27)
## What's Changed
* markdown by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/48
* fix:Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/49
* fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/50
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.16...v0.0.17
## 0.0.16 (2024-06-27)
## What's Changed
* Fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/47
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.15...v0.0.16
## 0.0.15 (2024-06-26)
## What's Changed
* add: llm megaparser by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/42
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.14...v0.0.15
## 0.0.14 (2024-06-24)
## What's Changed
* fix: remove nest asycio by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/40
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.13...v0.0.14
## 0.0.13 (2024-06-24)
## What's Changed
* fix: use aload_data by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/38
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.12...v0.0.13
## 0.0.12 (2024-06-18)
## What's Changed
* fix:delete markdownify dependency by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/33
* fix: fake fix README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/34
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.11...v0.0.12
## 0.0.11 (2024-06-17)
## What's Changed
* Fix OpenAI key error. Add docstrings. Polish code by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/24
* Fix DOCX reader. Add input tests by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/25
* add: xlsx convertor by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/29
* add: convert_tab by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/31
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.10...v0.0.11
## 0.0.10 (2024-06-04)
## What's Changed
* Change from LiteralString to Literal (typing) by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/21
* chore: Add Dockerfile and Makefile for project setup by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/23
## New Contributors
* @dSupertramp made their first contribution in https://github.com/QuivrHQ/MegaParse/pull/21
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.9...v0.0.10
## 0.0.9 (2024-06-04)
## What's Changed
* chore: Update README.md to include optional use of LlamaParse for improved results by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/19
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.8...v0.0.9
## 0.0.8 (2024-06-04)
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.7...v0.0.8
## 0.0.7 (2024-06-03)
## What's Changed
* feat: Update benchmark results in README.md by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/15
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.6...v0.0.7
## 0.0.6 (2024-06-03)
## What's Changed
* add: gpt cleaner for header and footer by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/13
## New Contributors
* @chloedia made their first contribution in https://github.com/QuivrHQ/MegaParse/pull/13
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.5...v0.0.6
## 0.0.5 (2024-06-02)
## What's Changed
* feat: Add instructions for installing poppler and tesseract by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/10
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.4...v0.0.5
## 0.0.4 (2024-06-02)
## What's Changed
* add: baseline evaluation by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/7
* Add support for Unstructured Parser, improve Table and Image Parsing, and add TOC and Hyperlinks for Docx by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/9
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.3...v0.0.4
## 0.0.3 (2024-05-30)
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.2...v0.0.3
## 0.0.2 (2024-05-30)
## What's Changed
* feat: Megaparse example and working by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/2
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.1...v0.0.2
## 0.0.2 (2024-05-30)
**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.1...v0.0.2
================================================
FILE: Dockerfile
================================================
# CPU image: installs system libraries for document handling, the locked Python
# dependencies, Playwright browsers and NLTK data, then serves the MegaParse API
# with uvicorn on port 8000.
FROM python:3.11.10-slim-bullseye

WORKDIR /app

# Install runtime dependencies.
# `apt-get upgrade` needs -y: without it apt asks for confirmation and aborts
# the non-interactive build whenever upgrades are pending.
RUN apt-get update && apt-get upgrade -y && apt-get install -y \
    libgeos-dev \
    libcurl4-openssl-dev \
    libssl-dev \
    binutils \
    curl \
    git \
    autoconf \
    automake \
    build-essential \
    libtool \
    python-dev \
    wget \
    gcc \
    # Additional dependencies for document handling
    libmagic-dev \
    poppler-utils \
    tesseract-ocr \
    libreoffice \
    libpq-dev \
    pandoc && \
    rm -rf /var/lib/apt/lists/* && apt-get clean

# Copy only the dependency manifests first so the install layers below stay
# cached until one of these files changes.
COPY requirements.lock pyproject.toml README.md ./
COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/
COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/

RUN pip install uv
RUN uv pip install --no-cache --system -r requirements.lock

RUN playwright install --with-deps

# Run the NLTK downloader module. The previous `python3 - -m ...` form made
# Python read an (empty) script from stdin and treat `-m nltk.downloader all`
# as script arguments, so nothing was downloaded.
RUN python3 -m nltk.downloader all

COPY . .

# Install the two workspace packages from the copied sources.
RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk

EXPOSE 8000
CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000"]
================================================
FILE: Dockerfile.gpu
================================================
# GPU image: CUDA/cuDNN base with Python 3.11 from the deadsnakes PPA, the
# locked Python dependencies, Playwright browsers and NLTK data, plus the two
# workspace packages.
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu20.04

WORKDIR /app

ENV UV_COMPILE_BYTECODE=1
ENV UV_NO_CACHE=1
# Keep apt from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive

# Install runtime dependencies
RUN apt-get update && apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update && apt-get install -y \
    python3.11 \
    python3.11-dev \
    libgeos-dev \
    libcurl4-openssl-dev \
    libssl-dev \
    binutils \
    curl \
    git \
    autoconf \
    automake \
    libtool \
    python3-pip \
    build-essential \
    wget \
    gcc \
    # Additional dependencies for document handling
    libmagic-dev \
    poppler-utils \
    tesseract-ocr \
    libreoffice \
    libpq-dev \
    pandoc && \
    rm -rf /var/lib/apt/lists/* && apt-get clean

# Make `python3` resolve to the 3.11 interpreter installed above.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
    update-alternatives --set python3 /usr/bin/python3.11

# Copy only the dependency manifests first so the install layers below stay
# cached until one of these files changes.
COPY requirements.lock pyproject.toml README.md ./
COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/
COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/

# Install uv with its standalone installer and expose it on PATH.
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
RUN uv pip install --no-cache --system -r requirements.lock

RUN playwright install --with-deps

# Run the NLTK downloader module. The previous `python3 - -m ...` form made
# Python read an (empty) script from stdin and treat `-m nltk.downloader all`
# as script arguments, so nothing was downloaded.
RUN python3 -m nltk.downloader all

# FIXME: causes runtime link issues with onnxruntime_pybind_state.cc:507 unstructured
# RUN python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
# RUN python3 -c "import nltk; nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')"

COPY . .

# Install the two workspace packages from the copied sources.
RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: Makefile
================================================
# `.DEFAULT_GOAL` is the special variable GNU make reads to pick the target run
# by a bare `make`; `.DEFAULT_TARGET` was just an ordinary, unused variable.
.DEFAULT_GOAL := help

# Lines starting with `##` are the help text: `make help` prints them with the
# leading `##` removed and aligned on the `:` separator.

## help: Display list of commands
.PHONY: help
help:
	@echo "Available commands:"
	@sed -n 's|^##||p' $(MAKEFILE_LIST) | column -t -s ':' | sed -e 's|^| |'

## dev: Start development environment
.PHONY: dev
dev:
	DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up --build

## dev-build: Build development environment without cache
.PHONY: dev-build
dev-build:
	DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml build --no-cache
	DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up

## prod: Build and start production environment
.PHONY: prod
prod:
	docker compose -f docker-compose.yml up --build
================================================
FILE: Pipfile
================================================
# Package index used to resolve dependencies.
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
# No runtime or development packages are declared in this file.
[packages]
[dev-packages]
# Python version this Pipfile targets.
[requires]
python_version = "3.11"
================================================
FILE: README.md
================================================
# MegaParse - Your Parser for every type of documents
MegaParse is a powerful and versatile parser that can handle various types of documents with ease. Whether you're dealing with text, PDFs, PowerPoint presentations, or Word documents, MegaParse has got you covered. Its focus is on parsing with no information loss.
## Key Features 🎯
- **Versatile Parser**: MegaParse is a powerful and versatile parser that can handle various types of documents with ease.
- **No Information Loss**: Focus on having no information loss during parsing.
- **Fast and Efficient**: Designed with speed and efficiency at its core.
- **Wide File Compatibility**: Supports Text, PDF, Powerpoint presentations, Excel, CSV, Word documents.
- **Open Source**: Freedom is beautiful, and so is MegaParse. Open source and free to use.
## Support
- Files: ✅ PDF ✅ Powerpoint ✅ Word
- Content: ✅ Tables ✅ TOC ✅ Headers ✅ Footers ✅ Images
### Example
https://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a7509bc8d4f3
## Installation
required python version >= 3.11
```bash
pip install megaparse
```
## Usage
1. Add your OpenAI or Anthropic API key to the .env file
2. Install poppler on your computer (images and PDFs)
3. Install tesseract on your computer (images and PDFs)
4. If you have a mac, you also need to install libmagic ```brew install libmagic```
Use MegaParse as it is :
```python
from megaparse import MegaParse
from langchain_openai import ChatOpenAI
megaparse = MegaParse()
response = megaparse.load("./test.pdf")
print(response)
```
### Use MegaParse Vision
```python
import os

from langchain_openai import ChatOpenAI
from megaparse.parser.megaparse_vision import MegaParseVision
model = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) # type: ignore
parser = MegaParseVision(model=model)
response = parser.convert("./test.pdf")
print(response)
```
**Note**: The models supported by MegaParse Vision are the multimodal ones, such as claude 3.5, claude 4, gpt-4o and gpt-4.
## Use as an API
There is a Makefile for you; simply run:
```make dev```
at the root of the project and you are good to go.
See localhost:8000/docs for more info on the different endpoints !
## BenchMark
| Parser | similarity_ratio |
| ----------------------------- | ---------------- |
| megaparse_vision | 0.87 |
| unstructured_with_check_table | 0.77 |
| unstructured | 0.59 |
| llama_parser | 0.33 |
_Higher is better_
Note: Want to evaluate and compare your MegaParse module with ours? Add your config in ```evaluations/script.py``` and then run ```python evaluations/script.py```. If it scores better, open a PR; let's go higher together.
## In Construction 🚧
- Improve table checker
- Create Checkers to add **modular postprocessing** ⚙️
- Add Structured output, **let's get computer talking** 🤖
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=QuivrHQ/MegaParse&type=Date)](https://star-history.com/#QuivrHQ/MegaParse&Date)
================================================
FILE: benchmark/process_single_doc.py
================================================
import asyncio
import time
from pathlib import Path
import numpy as np
from megaparse import MegaParse
# Number of parse attempts that test_process_file launches together via asyncio.gather.
N_TRY = 1
async def process_file(megaparse: MegaParse, file_path: str | Path):
    """Time one asynchronous parse of ``file_path``.

    Args:
        megaparse: Object exposing an awaitable ``aload(file_path=...)``.
        file_path: Document to parse.

    Returns:
        Elapsed seconds measured with ``time.perf_counter``, or ``None`` when
        the parse raised (the exception is printed, not propagated).
    """
    started_at = time.perf_counter()
    try:
        await megaparse.aload(file_path=file_path)
    except Exception as e:
        print(f"Exception occured: {e}")
        return None
    return time.perf_counter() - started_at
async def test_process_file(file: str | Path):
    """Parse ``file`` N_TRY times concurrently and print timing statistics.

    Failed attempts (``process_file`` returned ``None``) are counted as errors
    and excluded from the statistics. When no attempt succeeds, only the error
    count is printed: the NumPy reductions below are undefined on an empty
    array (``np.max``/``np.min`` raise ``ValueError``).

    Args:
        file: Path of the document to benchmark.
    """
    # parser = UnstructuredParser(strategy=StrategyEnum.HI_RES)
    megaparse = MegaParse()
    attempts = [process_file(megaparse, file) for _ in range(N_TRY)]
    results = await asyncio.gather(*attempts)

    durations = np.array([t for t in results if t is not None])
    n_errors = len(results) - durations.size
    print(f"All errors : {n_errors}")

    if durations.size == 0:
        print("No successful run: no timing statistics to report.")
        return

    print(f"Average time taken: {durations.mean()}")
    print(f"Median time taken: {np.median(durations)}")
    print(f"Standard deviation of time taken: {np.std(durations)}")
    print(f"Max time taken: {np.max(durations)}")
    print(f"Min time taken: {np.min(durations)}")
if __name__ == "__main__":
    import sys

    # The default is the original developer-local sample document. Pass a path
    # as the first command-line argument to benchmark a different file.
    default_path = "/Users/amine/data/quivr/parsing/scanned/machine.pdf"
    document_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    asyncio.run(test_process_file(document_path))
================================================
FILE: benchmark/test_quality_sim.py
================================================
import os
import difflib
from pathlib import Path
# Directories holding the Markdown outputs to compare; compare_files reads
# "<name>.md" from each of them, and main() enumerates hi_res_dir.
auto_dir = Path("benchmark/auto")
hi_res_dir = Path("benchmark/hi_res")
def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated words of two strings.

    Two empty strings are treated as identical (1). Otherwise the result is
    ``|A & B| / |A | B|`` over the two word sets, or 0 when neither string
    contains any word.
    """
    if len(str1) == len(str2) == 0:
        return 1

    tokens_a = set(str1.split())
    tokens_b = set(str2.split())
    combined = tokens_a | tokens_b
    if not combined:
        return 0
    return len(tokens_a & tokens_b) / len(combined)
def compare_files(file_name):
    """Return the similarity ratio between the two parsed versions of one document.

    Reads ``<file_name>.md`` from ``auto_dir`` and from ``hi_res_dir`` and
    compares their contents with ``difflib.SequenceMatcher``.

    Args:
        file_name: Document name without the ``.md`` extension.

    Returns:
        1 when both files are empty, otherwise the ``SequenceMatcher`` ratio.

    Raises:
        FileNotFoundError: If either Markdown file does not exist.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    auto_content = (auto_dir / f"{file_name}.md").read_text(encoding="utf-8")
    hi_res_content = (hi_res_dir / f"{file_name}.md").read_text(encoding="utf-8")

    if len(auto_content) == 0 and len(hi_res_content) == 0:
        return 1

    similarity = difflib.SequenceMatcher(None, auto_content, hi_res_content).ratio()
    # similarity = jaccard_similarity(auto_content, hi_res_content)
    return similarity
def main():
    """Compare every Markdown file in ``hi_res_dir`` with its ``auto_dir`` counterpart.

    Prints the average similarity, the share of documents strictly above the
    0.9 threshold, and every document that did not pass.
    """
    pass_threshold = 0.9

    # compare_files always opens "<stem>.md", so entries with another extension
    # (e.g. OS metadata files) would only crash or duplicate a comparison.
    names = sorted(
        Path(entry).stem for entry in os.listdir(hi_res_dir) if entry.endswith(".md")
    )
    print(f"Comparing {len(names)} files...")
    if not names:
        # Nothing to average: avoid dividing by zero below.
        print("No files to compare.")
        return

    similarity_dict = {name: compare_files(name) for name in names}

    avg_similarity = sum(similarity_dict.values()) / len(similarity_dict)
    print(f"\nAverage similarity: {avg_similarity}\n")

    n_passed = sum(score > pass_threshold for score in similarity_dict.values())
    pass_rate = n_passed / len(similarity_dict)
    print(f"Pass rate: {pass_rate}\n")

    print("Under 0.9 similarity documents:")
    print("-------------------------------")
    for file_name, similarity in similarity_dict.items():
        # List exactly the documents that did not count as passing, including
        # a score equal to the threshold.
        if similarity <= pass_threshold:
            print(f"{file_name}: {similarity}")


if __name__ == "__main__":
    main()
================================================
FILE: docker-compose.dev.yml
================================================
# Development stack: builds the image from the local Dockerfile and runs the
# API with auto-reload against the bind-mounted working tree.
version: "3.8"

services:
  megaparse:
    image: megaparse:latest
    container_name: megaparse
    build:
      context: .
      dockerfile: Dockerfile
      cache_from:
        - megaparse:latest
      args:
        - DEV_MODE=true
    # Lets the container reach services running on the Docker host.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    # Mount the repository over /app so code changes are picked up by --reload.
    volumes:
      - ./:/app/
    command: >
      /bin/bash -c "python -m uvicorn megaparse.api.app:app --host 0.0.0.0 --log-level info --reload --port 8000"
    restart: always
    ports:
      - "8000:8000"
================================================
FILE: docker-compose.yml
================================================
# Production stack: runs the prebuilt megaparse image.
version: "3.8"

services:
  megaparse:
    image: megaparse:latest
    pull_policy: if_not_present
    container_name: megaparse
    # Lets the container reach services running on the Docker host.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    # Probe the port uvicorn actually listens on (8000, see `command`).
    # `-f` makes curl exit non-zero on HTTP error statuses, so a failing
    # endpoint is reported as unhealthy instead of healthy.
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://localhost:8000/healthz" ]
    # NOTE(review): `--reload` is a development option; confirm it is wanted here.
    command: >
      /bin/bash -c "python -m uvicorn megaparse.api.app:app --host 0.0.0.0 --log-level info --reload --port 8000 --loop uvloop"
    restart: always
    ports:
      - 8000:8000
================================================
FILE: docs/archive.txt
================================================
### (Optional) Use LlamaParse for Improved Results
1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key.
2. Change the parser to LlamaParser
```python
from megaparse import MegaParse
from langchain_openai import ChatOpenAI
from megaparse.parser.llama_parser import LlamaParser
parser = LlamaParser(api_key = os.getenv("LLAMA_CLOUD_API_KEY"))
megaparse = MegaParse(parser)
response = megaparse.load("./test.pdf")
print(response)
megaparse.save("./test.md") #saves the last processed doc in md format
```
================================================
FILE: evaluations/script.py
================================================
import difflib
import os
from langchain_openai import ChatOpenAI
from megaparse.megaparse import MegaParse
from megaparse.parser.llama import LlamaParser
from megaparse.parser.megaparse_vision import MegaParseVision
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.schema.parser_config import StrategyEnum
if __name__ == "__main__":
    # Benchmark script: scores each configured parser against a reference
    # Markdown rendering of the same PDF, prints a results table, and writes
    # that table into README.md.
    print("---Launching evaluations script---")
    model = ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY")))  # type: ignore
    parser_dict = {
        "unstructured": UnstructuredParser(strategy=StrategyEnum.AUTO, model=None),
        "unstructured_with_check_table": UnstructuredParser(
            strategy=StrategyEnum.AUTO,
            model=model,
        ),
        "llama_parser": LlamaParser(api_key=str(os.getenv("LLAMA_CLOUD_API_KEY"))),
        "megaparse_vision": MegaParseVision(model=model),
    }

    base_pdf_path = "tests/data/MegaFake_report.pdf"
    base_md_path = "tests/data/grt_example/MegaFake_report.md"
    with open(base_md_path, "r", encoding="utf-8") as f:
        base_md = f.read()

    score_dict = {}
    for method, parser in parser_dict.items():
        print(f"Method: {method}")
        # NOTE(review): `parser` is never passed to MegaParse, so every method
        # is scored with the same default pipeline. Confirm how the configured
        # parser is meant to be injected.
        megaparse = MegaParse()
        result = megaparse.load(file_path=base_pdf_path)
        score_dict[method] = difflib.SequenceMatcher(None, base_md, result).ratio()
        print(f"Score for method {method}: {score_dict[method]}")

    # Sort the results
    sorted_score = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)

    # Generate a table with the results
    benchmark_results = "| Parser | similarity_ratio |\n|---|---|\n"
    for parser, score in sorted_score:
        benchmark_results += f"| {parser} | {score:.2f} |\n"
    print(benchmark_results)

    # Update README.md file. It contains non-ASCII characters, so read and
    # write it as UTF-8 explicitly rather than with the platform default.
    with open("README.md", "r", encoding="utf-8") as readme_file:
        readme_content = readme_file.read()

    # NOTE(review): both markers are empty strings in this view, which makes
    # the table get inserted at the very top of the README. Confirm the
    # intended marker text.
    start_marker = ""
    end_marker = ""
    marker_position = readme_content.find(start_marker)
    start_index = marker_position + len(start_marker)
    # Look for the end marker only after the start marker.
    end_index = (
        readme_content.find(end_marker, start_index) if marker_position != -1 else -1
    )

    if marker_position == -1 or end_index == -1:
        # A missing marker would otherwise produce negative slice indices and
        # silently corrupt the README.
        print("Benchmark markers not found in README.md; leaving it unchanged.")
    else:
        updated_readme_content = (
            readme_content[:start_index]
            + "\n"
            + benchmark_results
            + readme_content[end_index:]
        )
        with open("README.md", "w", encoding="utf-8") as readme_file:
            readme_file.write(updated_readme_content)
================================================
FILE: libs/megaparse/.python-version
================================================
3.11.9
================================================
FILE: libs/megaparse/CHANGELOG.md
================================================
# Changelog
## [0.0.55](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.54...megaparse-v0.0.55) (2025-02-14)
### Features
* remove tensorrt ([#230](https://github.com/QuivrHQ/MegaParse/issues/230)) ([8b8abbc](https://github.com/QuivrHQ/MegaParse/commit/8b8abbc6a2a1b33d4e921d55d2519b773ec062c8))
## [0.0.54](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.53...megaparse-v0.0.54) (2025-02-11)
### Features
* add_layout_detection ([#220](https://github.com/QuivrHQ/MegaParse/issues/220)) ([2d2d0b4](https://github.com/QuivrHQ/MegaParse/commit/2d2d0b42bba4c883db423568e932eda42edd60d7))
## [0.0.53](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.52...megaparse-v0.0.53) (2025-01-16)
### Features
* modular parser and formatter v0 ([#175](https://github.com/QuivrHQ/MegaParse/issues/175)) ([1f4dcf8](https://github.com/QuivrHQ/MegaParse/commit/1f4dcf88a5901c5a2682cb79284a0dbb08034cb2))
* Text detection in auto strategy ([#209](https://github.com/QuivrHQ/MegaParse/issues/209)) ([03c7ada](https://github.com/QuivrHQ/MegaParse/commit/03c7ada1dc245e13ef41ffd6fa3a8ed869269d37))
* type strategy output ([#216](https://github.com/QuivrHQ/MegaParse/issues/216)) ([deb8765](https://github.com/QuivrHQ/MegaParse/commit/deb8765a4df8917a4857f51a02025243192d5cf8))
### Bug Fixes
* Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38))
* add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228))
* logging error ([#218](https://github.com/QuivrHQ/MegaParse/issues/218)) ([a2170d7](https://github.com/QuivrHQ/MegaParse/commit/a2170d7c711a5d7a0531f03aa9576937ddd6576e))
* megaparse.load & add tests ([#202](https://github.com/QuivrHQ/MegaParse/issues/202)) ([13c2677](https://github.com/QuivrHQ/MegaParse/commit/13c2677bdadb4ba985a1abf9bafeb70548ab59f9))
* Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) ([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3))
* sync convert to parsers ([#186](https://github.com/QuivrHQ/MegaParse/issues/186)) ([fbb7d36](https://github.com/QuivrHQ/MegaParse/commit/fbb7d365fbaf710a687fdc6becacd6d301c09707))
## [0.0.52](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.51...megaparse-v0.0.52) (2024-12-16)
### Bug Fixes
* hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473))
## [0.0.51](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.50...megaparse-v0.0.51) (2024-12-16)
### Features
* updating langchain version ([#187](https://github.com/QuivrHQ/MegaParse/issues/187)) ([0f1f597](https://github.com/QuivrHQ/MegaParse/commit/0f1f5977df147e6b8c65d55445ccd86ef6f1a862))
## [0.0.50](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.49...megaparse-v0.0.50) (2024-12-13)
### Features
* small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) ([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609))
## [0.0.49](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.48...megaparse-v0.0.49) (2024-12-12)
### Features
* custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29))
* faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036))
## [0.0.48](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.47...megaparse-v0.0.48) (2024-12-03)
### Features
* Update imports and parsers in README.md ([#156](https://github.com/QuivrHQ/MegaParse/issues/156)) ([33e0303](https://github.com/QuivrHQ/MegaParse/commit/33e0303821691c4b1fc821e6b33b874bd332d430))
## [0.0.47](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.46...megaparse-v0.0.47) (2024-11-21)
### Features
* refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334))
* release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925))
## [0.0.22](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.21...megaparse-v0.0.22) (2024-07-24)
### Features
* Add instructions for installing poppler and tesseract ([#10](https://github.com/QuivrHQ/MegaParse/issues/10)) ([3399552](https://github.com/QuivrHQ/MegaParse/commit/3399552bc8be705f6d34306743388a96d099eebc))
* Add MegaParse class to __init__.py ([84c0d64](https://github.com/QuivrHQ/MegaParse/commit/84c0d648ef1ddf048ec911210d89be155443dc72))
* Add support for Unstructured Parser, improve Table and Image Parsing, and add TOC and Hyperlinks for Docx ([#9](https://github.com/QuivrHQ/MegaParse/issues/9)) ([4934776](https://github.com/QuivrHQ/MegaParse/commit/493477672cef9fe22b0ab56ced1d5572104e1914))
* base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0))
* base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648))
* Update benchmark results in README.md ([#15](https://github.com/QuivrHQ/MegaParse/issues/15)) ([1dfcb4c](https://github.com/QuivrHQ/MegaParse/commit/1dfcb4ce19467f7fb8137e10e5f5fbf35e563df0))
### Bug Fixes
* add __init__.py ([a5b8de9](https://github.com/QuivrHQ/MegaParse/commit/a5b8de9e1e01ef681ac2ef59a6e111ae7bd6cf70))
* change name ([6b36437](https://github.com/QuivrHQ/MegaParse/commit/6b36437787f048d36d69c3b06c2d59f7dc7a741f))
* PR Comments ([a0ab0ba](https://github.com/QuivrHQ/MegaParse/commit/a0ab0baa5dd9aae644baef55348f1af28a6776a7))
* remove nest asycio ([22195a2](https://github.com/QuivrHQ/MegaParse/commit/22195a27e9dc3583bf1fbde2a95e9fbecc8d96a4))
* use aload_data ([e5c73fe](https://github.com/QuivrHQ/MegaParse/commit/e5c73fefcbf09bb12810adc6d4412f7742c42089))
## [0.0.21](https://github.com/QuivrHQ/MegaParse/compare/v0.0.20...v0.0.21) (2024-07-24)
### Features
* base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0))
* base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648))
================================================
FILE: libs/megaparse/README.md
================================================
# MegaParse CORE
- Core package of megaparse
> **Note:** The test files in `tests/pdf/ocr` and `tests/pdf/native` come from SAFEDOCS (CC-MAIN-2021-31-PDF-UNTRUNCATED). You can find more information [here](https://digitalcorpora.org/corpora/file-corpora/cc-main-2021-31-pdf-untruncated/).
================================================
FILE: libs/megaparse/bench.md
================================================
------------
UNSTRUCTURED(HI-RES):
------------
folder: cdp
cdp_etiquette.pdf parsing took: 2.10s
folder: scanned-tables
POZIBILAN 2022.pdf parsing took: 78.72s
Banco Popilar Number 2.pdf parsing took: 94.44s
folder: native
00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 3.25s
0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 39.75s
0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 25.02s
folder: scanned
machine.pdf parsing took: 54.29s
medical.pdf parsing took: 76.11s
les_americains.pdf parsing took: 643.84s
agency.pdf parsing took: 114.19s
clark.pdf parsing took: 27.89s
tables_ocr.pdf parsing took: 81.21s
folder: rich
language_learning.pdf parsing took: 2.60s
dites nous tout....pdf parsing took: 1.62s
------------
UNSTRUCTURED(FAST):
------------
folder: cdp
cdp_etiquette.pdf parsing took: 0.05s
folder: scanned-tables
POZIBILAN 2022.pdf: can't parse
Banco Popilar Number 2.pdf: can't parse
folder: native
00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.07s
0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 0.86s
0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 0.24s
folder: scanned
machine.pdf parsing took: 0.02s
medical.pdf parsing took: 0.04s
les_americains.pdf parsing took: 5.90s
agency.pdf: can't parse
clark.pdf: can't parse
tables_ocr.pdf: can't parse
folder: rich
language_learning.pdf: can't parse
dites nous tout....pdf parsing took: 0.02s
------------
Megaparse (
strategy = AUTO
Config = {
provider=COREML,
det_arch: str = "fast_base"
det_batch_size: int = 2
assume_straight_pages: bool = True
preserve_aspect_ratio: bool = True
symmetric_pad: bool = True
load_in_8_bit: bool = False
reco_arch: str = "crnn_vgg16_bn"
rec_batch_size: int = 512
}
)
------------
folder: cdp
cdp_etiquette.pdf parsing took: 1.71s
folder: scanned-tables
POZIBILAN 2022.pdf parsing took: 17.76s
Banco Popilar Number 2.pdf parsing took: 19.25s
folder: native
00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.96s
0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 12.57s
0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 1.53s
folder: scanned
machine.pdf parsing took: 9.90s
medical.pdf parsing took: 13.09s
les_americains.pdf parsing took: 139.53s
agency.pdf parsing took: 10.73s
clark.pdf parsing took: 10.69s
tables_ocr.pdf parsing took: 15.58s
folder: rich
language_learning.pdf parsing took: 1.74s
dites nous tout....pdf parsing took: 0.64s
----
| Type | PDF Name | Unstructured(HI-RES) | Unstructured(FAST) | Megaparse( w/ doctr COREML) |
|------------------|-----------------------------------|---------------------|----------------------|--------------------|
| **cdp** | cdp_etiquette.pdf | 2.10s | 0.05s (bad parsing) | 1.71s |
| **scanned-tables** | POZIBILAN 2022.pdf | 78.72s | can't parse | 17.76s |
| **scanned-tables** | Banco Popilar Number 2.pdf | 94.44s | can't parse | 19.25s |
| **native** | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf | 3.25s | 0.07s | 0.96s |
| **native** | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf | 39.75s | 0.86s | 12.57s |
| **native** | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf | 25.02s | 0.24s | 1.53s |
| **scanned** | machine.pdf | 54.29s | 0.02s | 9.90s |
| **scanned** | medical.pdf | 76.11s | 0.04s | 13.09s |
| **scanned** | les_americains.pdf | 643.84s | 5.90s | 139.53s |
| **scanned** | agency.pdf | 114.19s | can't parse | 10.73s |
| **scanned** | clark.pdf | 27.89s | can't parse | 10.69s |
| **scanned** | tables_ocr.pdf | 81.21s | can't parse | 15.58s |
| **rich** | language_learning.pdf | 2.60s | can't parse | 1.74s |
| **rich** | dites nous tout....pdf | 1.62s | 0.02s | 0.64s |
================================================
FILE: libs/megaparse/examples/parse_file_fast.py
================================================
import os
from dataclasses import dataclass
from time import perf_counter
from unstructured.partition.auto import partition
@dataclass
class File:
    """Record describing one file found while walking a directory tree."""

    # Path of the file: the walked directory joined with the file name.
    file_path: str
    # File name, extension included.
    file_name: str
    # Extension as returned by os.path.splitext (leading dot included, e.g. ".pdf").
    file_extension: str
def list_files_in_directory(directory_path: str) -> dict[str, list[File]]:
    """Group the files found under ``directory_path`` by containing folder name.

    The tree is walked recursively with ``os.walk``. Every visited directory
    whose base name is non-empty becomes a key, mapped to the ``File`` records
    of the files located directly inside it.

    Notes:
        * A directory whose base name is empty (for example the root when
          ``directory_path`` ends with a path separator) is skipped.
        * Two directories sharing the same base name share one key; the one
          visited last wins.

    Args:
        directory_path: Root of the tree to walk.

    Returns:
        Mapping of folder base name to the list of files it contains.
    """
    files_by_folder = {}
    for current_dir, _subdirs, names in os.walk(directory_path):
        folder = os.path.basename(current_dir)
        if not folder:
            continue
        files_by_folder[folder] = [
            File(
                file_path=os.path.join(current_dir, name),
                file_name=name,
                file_extension=os.path.splitext(name)[1],
            )
            for name in names
        ]
    return files_by_folder
def main():
    """Time unstructured's ``fast`` strategy on every PDF of the benchmark folder.

    For each folder returned by ``list_files_in_directory`` the folder name is
    printed, then every ``.pdf`` file is partitioned and its wall-clock parsing
    time is printed. Files for which ``partition`` returns no element are
    reported as "can't parse".
    """
    folder_path = "/Users/amine/data/quivr/parsing/"
    list_files = list_files_in_directory(folder_path)
    for folder_name, files in list_files.items():
        print(f"folder: {folder_name}")
        for file in files:
            if file.file_extension != ".pdf":
                continue
            s = perf_counter()
            elements = partition(
                filename=file.file_path,
                strategy="fast",
            )
            if len(elements) == 0:
                print(f"\t{file.file_name}: can't parse ")
                continue
            e = perf_counter()
            print(f"\t {file.file_name} parsing took: {e-s:.2f}s")


if __name__ == "__main__":
    # main() only prints its results and returns nothing, so there is no value to keep.
    main()
================================================
FILE: libs/megaparse/examples/parse_file_mp.py
================================================
import os
from dataclasses import dataclass
from time import perf_counter
from megaparse import MegaParse
from megaparse.configs.auto import DeviceEnum, MegaParseConfig
@dataclass
class File:
    """Record describing one file found while walking a directory tree."""

    # Path of the file: the walked directory joined with the file name.
    file_path: str
    # File name, extension included.
    file_name: str
    # Extension as returned by os.path.splitext (leading dot included, e.g. ".pdf").
    file_extension: str
def list_files_in_directory(directory_path: str) -> dict[str, list[File]]:
    """Group the files found under ``directory_path`` by containing folder name.

    The tree is walked recursively with ``os.walk``. Every visited directory
    whose base name is non-empty becomes a key, mapped to the ``File`` records
    of the files located directly inside it.

    Notes:
        * A directory whose base name is empty (for example the root when
          ``directory_path`` ends with a path separator) is skipped.
        * Two directories sharing the same base name share one key; the one
          visited last wins.

    Args:
        directory_path: Root of the tree to walk.

    Returns:
        Mapping of folder base name to the list of files it contains.
    """
    files_by_folder = {}
    for current_dir, _subdirs, names in os.walk(directory_path):
        folder = os.path.basename(current_dir)
        if not folder:
            continue
        files_by_folder[folder] = [
            File(
                file_path=os.path.join(current_dir, name),
                file_name=name,
                file_extension=os.path.splitext(name)[1],
            )
            for name in names
        ]
    return files_by_folder
def main():
    """Time MegaParse, configured for the CoreML device, on every benchmark PDF.

    For each folder returned by ``list_files_in_directory`` the folder name is
    printed, then every ``.pdf`` file is loaded with a single shared
    ``MegaParse`` instance and its wall-clock parsing time is printed. Files
    whose result has length zero are reported as "can't parse".
    """
    folder_path = "/Users/amine/data/quivr/parsing/"
    list_files = list_files_in_directory(folder_path)
    mp = MegaParse(config=MegaParseConfig(device=DeviceEnum.COREML))
    for folder_name, files in list_files.items():
        print(f"folder: {folder_name}")
        for pdf in (f for f in files if f.file_extension == ".pdf"):
            started = perf_counter()
            result = mp.load(pdf.file_path)
            if len(result) == 0:
                print(f"\t{pdf.file_name}: can't parse ")
                continue
            elapsed = perf_counter() - started
            print(f"\t {pdf.file_name} parsing took: {elapsed:.2f}s")


if __name__ == "__main__":
    els = main()
================================================
FILE: libs/megaparse/examples/parse_file_unstructured.py
================================================
import os
from dataclasses import dataclass
from time import perf_counter
from unstructured.partition.auto import partition
@dataclass
class File:
    """Record describing one file found while walking a directory tree."""

    # Path of the file: the walked directory joined with the file name.
    file_path: str
    # File name, extension included.
    file_name: str
    # Extension as returned by os.path.splitext (leading dot included, e.g. ".pdf").
    file_extension: str
def list_files_in_directory(directory_path: str) -> dict[str, list[File]]:
    """Group the files found under ``directory_path`` by containing folder name.

    The tree is walked recursively with ``os.walk``. Every visited directory
    whose base name is non-empty becomes a key, mapped to the ``File`` records
    of the files located directly inside it.

    Notes:
        * A directory whose base name is empty (for example the root when
          ``directory_path`` ends with a path separator) is skipped.
        * Two directories sharing the same base name share one key; the one
          visited last wins.

    Args:
        directory_path: Root of the tree to walk.

    Returns:
        Mapping of folder base name to the list of files it contains.
    """
    files_by_folder = {}
    for current_dir, _subdirs, names in os.walk(directory_path):
        folder = os.path.basename(current_dir)
        if not folder:
            continue
        files_by_folder[folder] = [
            File(
                file_path=os.path.join(current_dir, name),
                file_name=name,
                file_extension=os.path.splitext(name)[1],
            )
            for name in names
        ]
    return files_by_folder
def main():
    """Time unstructured's ``hi_res`` strategy on every PDF of the benchmark folder.

    For each folder returned by ``list_files_in_directory`` the folder name is
    printed, then every ``.pdf`` file is partitioned and its wall-clock parsing
    time is printed. The parsed elements themselves are discarded.
    """
    folder_path = "/Users/amine/data/quivr/parsing/"
    list_files = list_files_in_directory(folder_path)
    for folder_name, files in list_files.items():
        print(f"folder: {folder_name}")
        for file in files:
            if file.file_extension != ".pdf":
                continue
            s = perf_counter()
            _ = partition(
                filename=file.file_path,
                strategy="hi_res",
            )
            e = perf_counter()
            print(f"\t {file.file_name} parsing took: {e-s:.2f}s")


if __name__ == "__main__":
    # main() only prints its results and returns nothing, so there is no value to keep.
    main()
================================================
FILE: libs/megaparse/pyproject.toml
================================================
[project]
name = "megaparse"
version = "0.0.55"
authors = [
    { name = "Stan Girard", email = "stan@quivr.app" },
    { name = "Chloé Daems", email = "chloe@quivr.app" },
    { name = "Amine Dirhoussi", email = "amine@quivr.app" },
    { name = "Jacopo Chevallard", email = "jacopo@quivr.app" },
]
readme = "README.md"
requires-python = ">= 3.11"
dependencies = [
    "megaparse-sdk",
    "pycryptodome>=3.21.0",
    "pdfplumber>=0.11.0",
    "backoff>=2.2.1",
    "pypdf>=5.0.1",
    "psutil>=6.1.0",
    "numpy<=2.0.0",
    "playwright>=1.47.0",
    "langchain-anthropic>=0.1.23",
    "python-magic>=0.4.27",
    "unstructured[all-docs]==0.15.0",
    "langchain>=0.3,<0.4",
    "langchain-community>=0.3,<0.4",
    "langchain-openai>=0.1.21",
    "langchain-core>=0.3,<0.4",
    "llama-parse>=0.4.0",
    "pydantic-settings>=2.6.1",
    # ONNX runtime / OCR stack: GPU-capable packages on x86_64, CPU-only onnxtr elsewhere.
    # NOTE(review): onnxruntime and onnxruntime-gpu are both requested on x86_64 and
    # both ship the `onnxruntime` module; confirm that installing the two together is intended.
    "onnxruntime==1.20.0; platform_machine == 'x86_64'",
    "onnxruntime-gpu==1.20.0; platform_machine == 'x86_64'",
    "onnxtr[gpu-headless]>=0.6.0; platform_machine == 'x86_64'",
    "onnxtr[cpu]>=0.6.0; platform_machine != 'x86_64'",
    "pypdfium2>=4.30.0",
]

[project.optional-dependencies]
# Extra packages used by the HTTP API.
api = [
    "python-dotenv>=1.0.0",
    "uvloop>=0.18.0",
    "pydantic-settings>=2.6.1",
    "uvicorn>=0.32.0",
    "fastapi>=0.115.2",
    "ratelimit>=2.2.1",
]

[build-system]
requires = ["hatchling==1.26.3"]
build-backend = "hatchling.build"

[tool.rye]
managed = true
dev-dependencies = []
universal = true

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.build.targets.wheel]
# The API code lives inside the package (src/megaparse/api); the repository has
# no src/api directory, so only src/megaparse is listed.
packages = ["src/megaparse"]
================================================
FILE: libs/megaparse/src/megaparse/__init__.py
================================================
from .megaparse import MegaParse
__all__ = ["MegaParse"]
================================================
FILE: libs/megaparse/src/megaparse/api/__init__.py
================================================
================================================
FILE: libs/megaparse/src/megaparse/api/app.py
================================================
import io
import os
import tempfile
from typing import Any, Optional
import httpx
import psutil
import uvicorn
from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
from langchain_anthropic import ChatAnthropic
from langchain_community.document_loaders import PlaywrightURLLoader
from langchain_openai import ChatOpenAI
from llama_parse.utils import Language
from megaparse_sdk.schema.document import Document
from megaparse_sdk.schema.parser_config import (
ParserType,
StrategyEnum,
)
from megaparse_sdk.schema.supported_models import SupportedModel
from megaparse import MegaParse
from megaparse.api.exceptions.megaparse_exceptions import (
HTTPDownloadError,
HTTPFileNotFound,
HTTPModelNotSupported,
HTTPParsingException,
ParsingException,
)
from megaparse.parser.builder import ParserBuilder
# FastAPI application object; the route decorators below register on it.
app = FastAPI()
# Module-level Playwright loader handed out by get_playwright_loader. It starts with
# no URL; upload_url overwrites its `urls` attribute on every call.
playwright_loader = PlaywrightURLLoader(urls=[], remove_selectors=["header", "footer"])
def parser_builder_dep():
    """Dependency provider returning a new ``ParserBuilder`` on each call."""
    return ParserBuilder()
def get_playwright_loader():
    """Dependency provider returning the module-level ``playwright_loader``."""
    return playwright_loader
@app.get("/healthz")
def healthz():
    """Health-check endpoint; always returns ``{"status": "ok"}``."""
    return {"status": "ok"}
def _check_free_memory() -> bool:
    """Tell whether enough memory is available to accept a request.

    The minimum is read, in megabytes, from the ``MEMORY_FREE_MINIMUM_MB``
    environment variable and defaults to 2048 (2GB).

    Returns:
        ``True`` when the available memory is strictly above the minimum,
        ``False`` otherwise.
    """
    mem = psutil.virtual_memory()
    minimum_mb = int(os.environ.get("MEMORY_FREE_MINIMUM_MB", 2048))
    minimum_bytes = minimum_mb * 1024 * 1024
    return mem.available > minimum_bytes
@app.post(
    "/v1/file",
)
async def parse_file(
    file: UploadFile = File(...),
    method: ParserType = Form(ParserType.UNSTRUCTURED),
    strategy: StrategyEnum = Form(StrategyEnum.AUTO),
    check_table: bool = Form(False),
    language: Language = Form(Language.ENGLISH),
    parsing_instruction: Optional[str] = Form(None),
    model_name: Optional[SupportedModel] = Form(SupportedModel.GPT_4O),
    parser_builder=Depends(parser_builder_dep),
) -> dict[str, str | Document]:
    """Parse an uploaded file with MegaParse and return the result.

    Args:
        file: The uploaded file; its filename supplies the file extension.
        method, strategy, language, parsing_instruction, parser_builder:
            Accepted but currently not forwarded to the parser (see the
            FIXME below).
        check_table: When true together with ``model_name``, a chat model
            client is instantiated.
        model_name: Name of the chat model; must start with ``gpt`` or
            ``claude`` when ``check_table`` is set.

    Returns:
        A dict with a ``message`` and the parsed ``result``.

    Raises:
        HTTPException: 503 when free memory is too low, 400 on ``ValueError``
            during parsing, 500 on any other parsing error.
        HTTPModelNotSupported: 501 for an unsupported ``model_name``.
        HTTPFileNotFound: 404 when the upload carries no filename.
        HTTPParsingException: 500 when parsing raises ``ParsingException``.
    """
    if not _check_free_memory():
        raise HTTPException(
            status_code=503, detail="Service unavailable due to low memory"
        )
    # NOTE: the model is instantiated but not passed to MegaParse below, which
    # is built with its defaults.
    model = None
    if model_name and check_table:
        if model_name.startswith("gpt"):
            model = ChatOpenAI(model=model_name, api_key=os.getenv("OPENAI_API_KEY"))  # type: ignore
        elif model_name.startswith("claude"):
            model = ChatAnthropic(
                model_name=model_name,
                api_key=os.getenv("ANTHROPIC_API_KEY"),  # type: ignore
                timeout=60,
                stop=None,
            )
        else:
            raise HTTPModelNotSupported()
    # parser_config = ParseFileConfig( #FIXME
    #     method=method,
    #     strategy=strategy,
    #     llm_model_name=SupportedModel(model_name) if model_name and check_table else None,
    #     language=language,
    #     parsing_instruction=parsing_instruction,
    # )
    # Validate the filename outside the try block: HTTPFileNotFound is itself an
    # Exception, so raising it inside would be caught by the generic handler
    # below and turned into a 500 instead of the intended 404.
    if not file.filename:
        raise HTTPFileNotFound("No filename provided")
    try:
        # parser = parser_builder.build(parser_config)
        megaparse = MegaParse()
        _, extension = os.path.splitext(file.filename)
        file_bytes = await file.read()
        file_stream = io.BytesIO(file_bytes)
        result = await megaparse.aload(file=file_stream, file_extension=extension)
        return {"message": "File parsed successfully", "result": result}
    except ParsingException as e:
        print(e)
        raise HTTPParsingException(file.filename)
    except ValueError as e:
        print(e)
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        print(e)
        raise HTTPException(status_code=500, detail=str(e))
@app.post(
    "/v1/url",
)
async def upload_url(
    url: str, playwright_loader=Depends(get_playwright_loader)
) -> dict[str, Any]:
    """Parse the content behind ``url``.

    URLs ending in ``.pdf`` are downloaded to a temporary file and parsed with
    MegaParse; any other URL is rendered through the Playwright loader and the
    text of all returned pages is concatenated.

    Args:
        url: Address of the PDF or web page to parse.
        playwright_loader: Loader used for non-PDF URLs; its ``urls``
            attribute is overwritten with ``[url]``.

    Returns:
        A dict with a ``message`` and the parsed ``result``.

    Raises:
        HTTPDownloadError: When the PDF download does not return status 200,
            or when no content could be extracted from the web page.
        HTTPParsingException: When parsing the PDF raises ``ParsingException``.
    """
    playwright_loader.urls = [url]
    if url.endswith(".pdf"):
        ## Download the file
        async with httpx.AsyncClient() as client:
            response = await client.get(url)
            if response.status_code != 200:
                raise HTTPDownloadError(url)
        # NamedTemporaryFile does not insert a dot before the suffix, so it has
        # to be part of it for the file to carry a real ".pdf" extension.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(response.content)
        # The file is closed at this point, so its content is fully flushed
        # before the parser opens it by name.
        try:
            megaparse = MegaParse()
            result = await megaparse.aload(temp_file.name)
            return {"message": "File parsed successfully", "result": result}
        except ParsingException as exc:
            raise HTTPParsingException(url) from exc
        finally:
            # delete=False leaves the file on disk; remove it once parsed.
            os.unlink(temp_file.name)
    else:
        data = await playwright_loader.aload()
        # Now turn the data into a string
        extracted_content = "".join(page.page_content for page in data)
        if not extracted_content:
            raise HTTPDownloadError(
                url,
                message="Failed to extract content from the website. Valid URL example : https://www.quivr.com",
            )
        return {
            "message": "Website content parsed successfully",
            "result": extracted_content,
        }
# When executed as a script, serve the app with uvicorn on all interfaces, port 8000.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
================================================
FILE: libs/megaparse/src/megaparse/api/exceptions/__init__.py
================================================
================================================
FILE: libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py
================================================
from fastapi import HTTPException
class HTTPModelNotSupported(HTTPException):
    """HTTP 501 error signalling that the requested model is not supported."""

    def __init__(
        self,
        detail: str = "The requested model is not supported yet.",
        headers: dict | None = None,
    ):
        # Status code is fixed to 501; detail and headers are passed through.
        super().__init__(status_code=501, detail=detail, headers=headers)
class HTTPFileNotFound(HTTPException):
    """HTTP 404 error signalling that the upload has no usable filename."""

    def __init__(
        self,
        message="The UploadFile.filename does not exist and is needed for this operation",
    ):
        # Status code is fixed to 404; the message becomes the response detail.
        super().__init__(status_code=404, detail=message)
class HTTPDownloadError(HTTPException):
    """HTTP 400 error reporting a failed download.

    The response detail is ``"<file_name> : <message>"``.
    """

    def __init__(self, file_name, message="Failed to download the file"):
        super().__init__(status_code=400, detail=f"{file_name} : {message}")
class HTTPParsingException(HTTPException):
    """HTTP 500 error reporting a failed parse.

    The response detail is ``"<file_name> : <message>"``.
    """

    def __init__(self, file_name, message="Failed to parse the file"):
        super().__init__(status_code=500, detail=f"{file_name} : {message}")
class ParsingException(Exception):
    """Error raised when the parsing process fails.

    The message is kept on the ``message`` attribute and is also the
    exception's single argument.

    NOTE(review): ``megaparse/exceptions/base.py`` defines a separate class
    with the same name; an ``except`` clause on one does not catch the other.
    Confirm which of the two the parsers actually raise.
    """

    def __init__(self, message="An error occurred during parsing"):
        super().__init__(message)
        self.message = message
================================================
FILE: libs/megaparse/src/megaparse/api/models/__init__.py
================================================
================================================
FILE: libs/megaparse/src/megaparse/api/models/base.py
================================================
from enum import Enum
class MarkDownType(str, Enum):
    """Markdown type enumeration.

    Members are also ``str`` instances (the class derives from ``str``), so
    each one compares equal to its label, e.g. ``MarkDownType.TITLE == "Title"``.
    """

    TITLE = "Title"
    SUBTITLE = "Subtitle"
    HEADER = "Header"
    FOOTER = "Footer"
    NARRATIVE_TEXT = "NarrativeText"
    LIST_ITEM = "ListItem"
    TABLE = "Table"
    PAGE_BREAK = "PageBreak"
    IMAGE = "Image"
    FORMULA = "Formula"
    FIGURE_CAPTION = "FigureCaption"
    ADDRESS = "Address"
    EMAIL_ADDRESS = "EmailAddress"
    CODE_SNIPPET = "CodeSnippet"
    PAGE_NUMBER = "PageNumber"
    # Two fallback labels. NOTE(review): how DEFAULT and UNDEFINED differ is not
    # visible here; confirm with the code that assigns them.
    DEFAULT = "Default"
    UNDEFINED = "Undefined"
================================================
FILE: libs/megaparse/src/megaparse/configs/auto.py
================================================
from enum import Enum
from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict
class TextDetConfig(BaseModel):
    """Settings for the text-detection model.

    NOTE(review): the field names match keyword arguments of onnxtr's
    ``detection_predictor``; confirm against the code that consumes this config.
    """

    # Name of the detection architecture.
    det_arch: str = "fast_base"
    # Batch size for detection.
    batch_size: int = 2
    assume_straight_pages: bool = True
    preserve_aspect_ratio: bool = True
    symmetric_pad: bool = True
    # presumably selects an 8-bit model variant — TODO confirm
    load_in_8_bit: bool = False
class AutoStrategyConfig(BaseModel):
    """Thresholds for the automatic strategy selection.

    NOTE(review): presumably ``page_threshold`` is applied per page and
    ``document_threshold`` to the document as a whole when choosing between
    the fast and hi-res strategies; confirm against the code that reads them.
    """

    page_threshold: float = 0.6
    document_threshold: float = 0.2
class TextRecoConfig(BaseModel):
    """Settings for the text-recognition model.

    NOTE(review): the field names match keyword arguments of onnxtr's
    ``recognition_predictor``; confirm against the code that consumes this config.
    """

    # Name of the recognition architecture.
    reco_arch: str = "crnn_vgg16_bn"
    # Batch size for recognition.
    batch_size: int = 512
class DeviceEnum(str, Enum):
    """Device identifiers accepted by ``MegaParseConfig.device``.

    Members are also ``str`` instances, so each compares equal to its
    lowercase value.
    """

    CPU = "cpu"
    CUDA = "cuda"
    COREML = "coreml"
class DoctrConfig(BaseModel):
    """OCR settings: three feature toggles plus the nested detection and
    recognition configurations."""

    # Toggles, all off by default.
    straighten_pages: bool = False
    detect_orientation: bool = False
    detect_language: bool = False
    # Nested model settings, defaulting to the defaults of each config class.
    text_det_config: TextDetConfig = TextDetConfig()
    text_reco_config: TextRecoConfig = TextRecoConfig()
class MegaParseConfig(BaseSettings):
    """
    Configuration for Megaparse.

    Being a ``BaseSettings`` subclass, values can come from the environment:
    variables are prefixed with ``MEGAPARSE_``, nested fields are addressed
    with a ``__`` delimiter, and the ``.env.local`` and ``.env`` files are
    declared as env files. Unknown settings are ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="MEGAPARSE_",
        env_file=(".env.local", ".env"),
        env_nested_delimiter="__",
        extra="ignore",
        use_enum_values=True,
    )
    # OCR (doctr) settings.
    doctr_config: DoctrConfig = DoctrConfig()
    # Thresholds for the automatic strategy.
    auto_config: AutoStrategyConfig = AutoStrategyConfig()
    # Device to run on; CPU by default.
    device: DeviceEnum = DeviceEnum.CPU
================================================
FILE: libs/megaparse/src/megaparse/examples/parse_file.py
================================================
from pathlib import Path
from megaparse.megaparse import MegaParse
from pydantic import BaseModel, Field
class MyCustomFormat(BaseModel):
    """Example output schema with a title, a problem and a solution.

    In this file it is only referenced by the commented-out formatter code
    in ``main``.
    """

    title: str = Field(description="The title of the document.")
    problem: str = Field(description="The problem statement.")
    solution: str = Field(description="The solution statement.")
def main():
    """Parse a sample PDF with a default ``MegaParse`` instance and print the result."""
    # model = ChatOpenAI(name="gpt-4o")
    # formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)
    parser = MegaParse()
    sample_pdf = Path("./tests/pdf/ocr/0168127.pdf")
    print(parser.load(file_path=sample_pdf))


if __name__ == "__main__":
    main()
================================================
FILE: libs/megaparse/src/megaparse/examples/parsing_process.py
================================================
from pathlib import Path
from typing import IO, Any, List, Tuple
import numpy as np
import onnxruntime as rt
import pypdfium2 as pdfium
from megaparse.configs.auto import (
AutoStrategyConfig,
DeviceEnum,
TextDetConfig,
TextRecoConfig,
)
from megaparse.models.page import Page, PageDimension
from megaparse.parser.doctr_parser import DoctrParser
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.schema.document import BBOX, BlockLayout, BlockType, TextDetection
from megaparse_sdk.schema.extensions import FileExtension
from megaparse_sdk.schema.parser_config import StrategyEnum
from onnxtr.models import detection_predictor, recognition_predictor
from onnxtr.models.builder import DocumentBuilder
from onnxtr.models.engine import EngineConfig
from onnxtr.utils.geometry import (
detach_scores,
extract_crops,
extract_rcrops,
)
from pypdfium2._helpers.page import PdfPage
def get_strategy_page(
    pdfium_page: PdfPage, onnxtr_page: TextDetection, page_threshold: float = 0.6
) -> StrategyEnum:
    """Pick the parsing strategy for one page from two text-coverage masks.

    Two binary masks of shape ``(page height, page width)`` are built: one
    covering the positions of the pdfium page objects of type 1, the other
    covering the boxes listed in ``onnxtr_page.bboxes``. Their
    intersection-over-union (IoU) is then compared with ``page_threshold``.

    Args:
        pdfium_page: The page as loaded by pypdfium2.
        onnxtr_page: Detection result for the same page; box coordinates are
            multiplied by the page width/height, i.e. treated as relative.
        page_threshold: IoU below which the page is sent to hi-res parsing.

    Returns:
        ``StrategyEnum.HI_RES`` when the IoU is below ``page_threshold``,
        ``StrategyEnum.FAST`` otherwise, including when both masks are empty.
    """
    # assert (
    #     p_width == onnxtr_page.dimensions[1]
    #     and p_height == onnxtr_page.dimensions[0]
    # ), "Page dimensions do not match"
    p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height())

    # Positions of the page objects of type 1.
    # NOTE(review): 1 is presumably pdfium's text object type (FPDF_PAGEOBJ_TEXT);
    # the previous comment spoke of images. Confirm.
    text_coords = [
        obj.get_pos() for obj in pdfium_page.get_objects() if obj.type == 1
    ]

    pdfium_canva = np.zeros((p_height, p_width))
    for left, bottom, right, top in text_coords:
        # Positions are (left, bottom, right, top). Array rows grow downwards,
        # so the vertical coordinates are flipped with the page height.
        row0 = max(0, min(p_height, int(p_height - top)))
        col0 = max(0, min(p_width, int(left)))
        row1 = max(0, min(p_height, int(p_height - bottom)))
        col1 = max(0, min(p_width, int(right)))
        pdfium_canva[row0:row1, col0:col1] = 1

    onnxtr_canva = np.zeros((p_height, p_width))
    for block in onnxtr_page.bboxes:
        x0, y0 = block.bbox[0]
        x1, y1 = block.bbox[1]
        x0 = max(0, min(int(x0 * p_width), p_width))
        y0 = max(0, min(int(y0 * p_height), p_height))
        x1 = max(0, min(int(x1 * p_width), p_width))
        y1 = max(0, min(int(y1 * p_height), p_height))
        onnxtr_canva[y0:y1, x0:x1] = 1

    union_area = np.sum(np.logical_or(pdfium_canva, onnxtr_canva))
    if union_area == 0:
        # Neither source found text. Dividing 0 by 0 would give nan (with a
        # numpy RuntimeWarning), which never compares below the threshold, so
        # the outcome was already FAST; return it explicitly instead.
        return StrategyEnum.FAST
    iou = np.sum(np.logical_and(pdfium_canva, onnxtr_canva)) / union_area
    if iou < page_threshold:
        return StrategyEnum.HI_RES
    return StrategyEnum.FAST
def validate_input(
    file_path: Path | str | None = None,
    file: IO[bytes] | None = None,
    file_extension: str | FileExtension | None = None,
) -> FileExtension:
    """Check the input arguments and resolve the file extension.

    Exactly one of ``file_path`` and ``file`` must be given. With
    ``file_path`` the extension is taken from the path suffix (any
    ``file_extension`` argument is overridden). With ``file`` the
    ``file_extension`` argument is required and the stream is rewound to
    position 0.

    Args:
        file_path: Path of the file to parse.
        file: Binary stream of the file to parse.
        file_extension: Extension of ``file``, as a string or ``FileExtension``.

    Returns:
        The resolved ``FileExtension``.

    Raises:
        ValueError: When neither or both inputs are given, when
            ``file_extension`` is missing for a stream, or when the extension
            is not a valid ``FileExtension``.
    """
    if not (file_path or file):
        raise ValueError("Either file_path or file should be provided")
    if file_path and file:
        raise ValueError("Only one of file_path or file should be provided")
    if file_path and file is None:
        # Path() accepts both str and Path, so no type check is needed.
        file_extension = Path(file_path).suffix
    elif file and file_path is None:
        if not file_extension:
            raise ValueError(
                "file_extension should be provided when given file argument"
            )
        file.seek(0)
    else:
        # Reached when the unused argument is falsy without being None
        # (for example file_path="" together with a file).
        raise ValueError("Either provide a file_path or file")
    if isinstance(file_extension, str):
        try:
            file_extension = FileExtension(file_extension)
        except ValueError as err:
            raise ValueError(f"Unsupported file extension: {file_extension}") from err
    return file_extension
def _generate_crops(
    pages: list[np.ndarray],
    loc_preds: list[np.ndarray],
    channels_last: bool,
    assume_straight_pages: bool = False,
    assume_horizontal: bool = False,
) -> list[list[np.ndarray]]:
    """Cut the detected boxes out of each page.

    Pages and box arrays are paired positionally; only the first four columns
    of each box array are used as coordinates.

    Args:
        pages: Page images.
        loc_preds: One array of boxes per page.
        channels_last: Forwarded to the crop extraction helpers.
        assume_straight_pages: When true ``extract_crops`` is used, otherwise
            ``extract_rcrops``.
        assume_horizontal: Forwarded to ``extract_rcrops`` only.

    Returns:
        For each page, the list of crops.
    """
    crops = []
    for page, boxes in zip(pages, loc_preds, strict=False):
        coords = boxes[:, :4]
        if assume_straight_pages:
            page_crops = extract_crops(page, coords, channels_last=channels_last)
        else:
            page_crops = extract_rcrops(
                page,
                coords,
                channels_last=channels_last,
                assume_horizontal=assume_horizontal,
            )
        crops.append(page_crops)
    return crops
def _prepare_crops(
    pages: list[np.ndarray],
    loc_preds: list[np.ndarray],
    channels_last: bool,
    assume_straight_pages: bool = False,
    assume_horizontal: bool = False,
) -> tuple[list[list[np.ndarray]], list[np.ndarray]]:
    """Generate the crops of every page and drop the zero-sized ones.

    A crop is kept only when all of its dimensions are strictly positive; the
    matching rows of the page's box array are kept accordingly, so crops and
    boxes stay aligned.

    Returns:
        ``(crops, loc_preds)`` with, per page, the kept crops and kept boxes.
    """
    all_crops = _generate_crops(
        pages, loc_preds, channels_last, assume_straight_pages, assume_horizontal
    )
    kept_crops = []
    kept_boxes = []
    for page_crops, boxes in zip(all_crops, loc_preds, strict=False):
        # Avoid sending zero-sized crops
        keep_mask = [all(dim > 0 for dim in crop.shape) for crop in page_crops]
        kept_crops.append(
            [crop for crop, keep in zip(page_crops, keep_mask, strict=False) if keep]
        )
        kept_boxes.append(boxes[keep_mask])
    return kept_crops, kept_boxes
def _process_predictions(
loc_preds: list[np.ndarray],
word_preds: list[tuple[str, float]],
crop_orientations: list[dict[str, Any]],
) -> tuple[list[np.ndarray], list[list[tuple[str, float]]], list[list[dict[str, Any]]]]:
text_preds = []
crop_orientation_preds = []
if len(loc_preds) > 0:
# Text & crop orientation predictions at page level
_idx = 0
for page_boxes in loc_preds:
text_preds.append(word_preds[_idx : _idx + page_boxes.shape[0]])
crop_orientation_preds.append(
crop_orientations[_idx : _idx + page_boxes.shape[0]]
)
_idx += page_boxes.shape[0]
return loc_preds, text_preds, crop_orientation_preds
def main():
    """Walk through the PDF parsing pipeline step by step on a sample file.

    Stages, in order: rasterize the pages with pypdfium2, run onnxtr text
    detection, pick a strategy per page with ``get_strategy_page``, select a
    parser, then parse and print the result.

    NOTE(review): ``get_providers`` and ``need_hi_res`` are called below but
    are neither imported nor defined in this module. With the hard-coded
    ``StrategyEnum.AUTO`` the detection branch is taken, so the script raises
    ``NameError`` as written. Confirm where these helpers should come from.
    """
    file_path = Path("./tests/pdf/sample_pdf.pdf")
    strategy = StrategyEnum.AUTO
    device = DeviceEnum.COREML
    ocr_parser = DoctrParser()
    default_parser = UnstructuredParser(strategy=StrategyEnum.FAST)
    file_extension = validate_input(file_path=file_path)
    with open(file_path, "rb") as file:
        pdfium_document = pdfium.PdfDocument(file)
        # One array per page, in document order.
        rasterized_pages: list[np.ndarray] = [
            np.array(page.render().to_pil(scale=2)) for page in pdfium_document
        ]
        ##-----------------------------------
        ## GET PAGES
        ##-----------------------------------
        mp_pages = []
        if strategy == StrategyEnum.FAST:
            parsed_document = default_parser.convert(
                file=file,
                file_extension=file_extension,
            )
        else:
            text_det_config = TextDetConfig()
            general_options = rt.SessionOptions()
            # NOTE(review): get_providers is not imported or defined in this module.
            providers = get_providers(device=device)
            engine_config = EngineConfig(
                session_options=general_options,
                providers=providers,
            )
            det_predictor = detection_predictor(
                arch=text_det_config.det_arch,
                assume_straight_pages=text_det_config.assume_straight_pages,
                preserve_aspect_ratio=text_det_config.preserve_aspect_ratio,
                symmetric_pad=text_det_config.symmetric_pad,
                batch_size=text_det_config.batch_size,
                load_in_8_bit=text_det_config.load_in_8_bit,
                engine_cfg=engine_config,
            )
            if any(page.ndim != 3 for page in rasterized_pages):
                raise ValueError(
                    "incorrect input shape: all pages are expected to be multi-channel 2D images."
                )
            # Both stay None in this walkthrough (see the FIXMEs below).
            orientations = None
            general_pages_orientations = None
            # Localize text elements
            loc_preds, out_maps = det_predictor(rasterized_pages, return_maps=True)
            # FIXME: For simplicity we do not care about page orientation rn
            # FIXME: similaly we don't care about straighten page
            # Detach objectness scores from loc_preds
            loc_preds, objectness_scores = detach_scores(loc_preds)  # type: ignore[arg-type]
            # FIXME: Do not care about hooks here
            # # Apply hooks to loc_preds if any
            # for hook in hooks:
            #     loc_preds = hook(loc_preds)
            # Wrap the raw detections of each page into a TextDetection.
            all_pages_layouts = []
            for page_index, (page, loc_pred, objectness_score) in enumerate(
                zip(rasterized_pages, loc_preds, objectness_scores, strict=True)
            ):
                block_layouts = []
                for bbox, score in zip(loc_pred, objectness_score, strict=True):
                    block_layouts.append(
                        BlockLayout(
                            # First two values and remaining values of the box
                            # become the two points of the BBOX.
                            bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()),
                            objectness_score=score,
                            block_type=BlockType.TEXT,
                        )
                    )
                all_pages_layouts.append(
                    TextDetection(
                        bboxes=block_layouts,
                        page_index=page_index,
                        dimensions=page.shape[:2],
                        # general_pages_orientations is None above, so this is 0.
                        orientation=general_pages_orientations[page_index]
                        if general_pages_orientations is not None
                        else 0,
                    )
                )
            for pdfium_page, onnxtr_page, rasterized_page in zip(
                pdfium_document, all_pages_layouts, rasterized_pages, strict=True
            ):
                # NOTE: this rebinds the outer `strategy`; after the loop it holds
                # the strategy chosen for the last page.
                strategy = get_strategy_page(pdfium_page, onnxtr_page)
                mp_pages.append(
                    Page(
                        strategy=strategy,
                        text_detections=onnxtr_page,
                        rasterized=rasterized_page,
                        page_size=PageDimension(
                            width=pdfium_page.get_width(),
                            height=pdfium_page.get_height(),
                        ),
                        page_index=onnxtr_page.page_index,
                        pdfium_elements=pdfium_page,
                    )
                )
        ##-----------------------------------
        ## GET PARSER BASE ON CHOSE STRATEGY
        ##-----------------------------------
        if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:
            parser = default_parser
        elif strategy == StrategyEnum.HI_RES:
            parser = ocr_parser
        else:
            # NOTE(review): need_hi_res is not imported or defined in this module.
            if need_hi_res(mp_pages, AutoStrategyConfig()):
                parser = ocr_parser
            else:
                parser = default_parser
        ##-----------------------------------
        ## PARSE FILE
        ##-----------------------------------
        if isinstance(parser, UnstructuredParser):
            parsed_document = parser.convert(
                file=file,
                pages=mp_pages,
                file_extension=file_extension,
            )
        else:
            # OCR path, spelled out step by step instead of calling ocr_parser.
            origin_page_shapes: List[Tuple[int, int]] = [
                (page.shape[0], page.shape[1]) for page in rasterized_pages
            ]
            reco_config = TextRecoConfig()
            reco_predictor = recognition_predictor(
                arch=reco_config.reco_arch,
                batch_size=reco_config.batch_size,
                # NOTE: the 8-bit flag is read from the detection config.
                load_in_8_bit=text_det_config.load_in_8_bit,
                engine_cfg=engine_config,
            )
            # Crop images
            crops, loc_preds = _prepare_crops(
                rasterized_pages,
                loc_preds,  # type: ignore[arg-type]
                channels_last=True,
                assume_straight_pages=True,  # FIXME: To change
                assume_horizontal=True,  # FIXME: To change
            )
            # Rectify crop orientation and get crop orientation predictions
            crop_orientations: Any = []
            # Identify character sequences
            word_preds = reco_predictor(
                [crop for page_crops in crops for crop in page_crops]
            )
            # No orientation prediction is run, so every crop gets value 0.
            if not crop_orientations:
                crop_orientations = [
                    {"value": 0, "confidence": None} for _ in word_preds
                ]
            boxes, text_preds, crop_orientations = _process_predictions(
                loc_preds, word_preds, crop_orientations
            )
            doc_builder = DocumentBuilder()
            parsed_document = doc_builder(
                rasterized_pages,
                boxes,
                objectness_scores,
                text_preds,
                origin_page_shapes,
                crop_orientations,
                orientations,
                None,
            )
    print(parsed_document)


if __name__ == "__main__":
    main()
================================================
FILE: libs/megaparse/src/megaparse/exceptions/base.py
================================================
class ParsingException(Exception):
    """Error raised when the parsing process fails.

    The message is kept on the ``message`` attribute and is also the
    exception's single argument.
    """

    def __init__(self, message="An error occurred during parsing"):
        super().__init__(message)
        self.message = message
================================================
FILE: libs/megaparse/src/megaparse/formatter/base.py
================================================
from abc import ABC
from pathlib import Path
from typing import Union
from langchain_core.language_models.chat_models import BaseChatModel
from megaparse_sdk.schema.document import Document
class BaseFormatter(ABC):
    """
    Base class for formatters that post-process a parsed ``Document``.

    Attributes
    ----------
    model : BaseChatModel | None
        Optional chat model stored for use by subclasses.

    Methods
    -------
    format(document, file_path=None) -> Document | str
        Synchronous entry point.
    aformat(document, file_path=None) -> Document | str
        Asynchronous entry point.

    Both methods raise ``NotImplementedError`` here and are meant to be
    overridden. They are not declared abstract, so the class itself can
    still be instantiated.
    """

    def __init__(self, model: BaseChatModel | None = None):
        self.model = model

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Union[Document, str]:
        """Format ``document``; to be implemented by subclasses."""
        raise NotImplementedError("Subclasses should implement this method")

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Union[Document, str]:
        """Asynchronously format ``document``; to be implemented by subclasses."""
        raise NotImplementedError("Subclasses should implement this method")
================================================
FILE: libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py
================================================
from pathlib import Path
from langchain_core.language_models.chat_models import BaseChatModel
from megaparse.formatter.base import BaseFormatter
from megaparse_sdk.schema.document import Document
from pydantic import BaseModel
class StructuredFormatter(BaseFormatter):
    """Base class for formatters that produce output following a pydantic model.

    Unlike ``BaseFormatter``, a chat model is required, together with the
    pydantic class describing the expected output. Both ``format`` and
    ``aformat`` raise ``NotImplementedError`` here.
    """

    def __init__(self, model: BaseChatModel, output_model: type[BaseModel]):
        super().__init__(model)
        # Pydantic class describing the structure of the output.
        self.output_model = output_model

    async def aformat(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:  # FIXME: Return a structured output of type BaseModel ?
        """Asynchronously format ``document``; to be implemented by subclasses."""
        raise NotImplementedError()

    def format(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:  # FIXME: Return a structured output of type BaseModel ?
        """Format ``document``; to be implemented by subclasses."""
        raise NotImplementedError()
================================================
FILE: libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py
================================================
from pathlib import Path
from megaparse.formatter.structured_formatter import StructuredFormatter
from megaparse_sdk.schema.document import Document
from pydantic import BaseModel
class CustomStructuredFormatter(StructuredFormatter):
    """Structure a document into ``output_model`` with the chat model.

    The document is converted to text with ``str(document)`` and sent to the
    model bound to ``output_model`` through ``with_structured_output``; the
    result is returned as a JSON string.
    """

    def _build_prompt(self, document: Document) -> str:
        """Validate the formatter state and return the prompt for ``document``.

        Raises:
            ValueError: When no model is set, when the document text is empty,
                or when no output model is set.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the CustomStructuredFormatter.")
        print("Formatting text using CustomStructuredFormatter...")
        text = str(document)
        # A length can never be negative; the check has to test for emptiness
        # to reject empty documents as the error message states.
        if len(text) == 0:
            raise ValueError(
                "A non empty text is needed to format text using CustomStructuredFormatter."
            )
        if not self.output_model:
            raise ValueError(
                "An output model is needed to structure text using CustomStructuredFormatter."
            )
        return f"Parse the text in a structured format: {text}"

    def format(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:
        """
        Structure the document using an AI language model.

        Args:
            document: The document to format.
            file_path: Accepted for interface compatibility; not used.

        Returns:
            The structured output serialized as JSON.

        Raises:
            ValueError: When the model, the text or the output model is missing.
        """
        prompt = self._build_prompt(document)
        structured_model = self.model.with_structured_output(self.output_model)  # type: ignore
        formatted_text = structured_model.invoke(prompt)
        assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel."
        return formatted_text.model_dump_json()

    async def aformat(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:
        """
        Asynchronously structure the document using an AI language model.

        Args:
            document: The document to format.
            file_path: Accepted for interface compatibility; not used.

        Returns:
            The structured output serialized as JSON.

        Raises:
            ValueError: When the model, the text or the output model is missing.
        """
        prompt = self._build_prompt(document)
        structured_model = self.model.with_structured_output(self.output_model)  # type: ignore
        formatted_text = await structured_model.ainvoke(prompt)
        assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel."
        return formatted_text.model_dump_json()
================================================
FILE: libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py
================================================
from pathlib import Path
from megaparse.formatter.base import BaseFormatter
from megaparse_sdk.schema.document import Document
class TableFormatter(BaseFormatter):
    """Base class for formatters that return a ``Document``.

    Narrows the return type of ``format`` / ``aformat`` to ``Document``; both
    raise ``NotImplementedError`` here.
    """

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """Format ``document``; to be implemented by subclasses."""
        raise NotImplementedError("Subclasses should implement this method")

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """Asynchronously format ``document``; to be implemented by subclasses."""
        raise NotImplementedError("Subclasses should implement this method")
================================================
FILE: libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py
================================================
import re
import warnings
from pathlib import Path
from typing import Optional
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate
from megaparse.formatter.table_formatter import TableFormatter
from megaparse_sdk.schema.document import Document, TableBlock
class SimpleMDTableFormatter(TableFormatter):
    """
    Formatter rewriting every ``TableBlock`` of a document as a Markdown
    table produced by a chat model.
    """

    TABLE_MARKER_START = "[TABLE]"
    TABLE_MARKER_END = "[/TABLE]"
    CODE_BLOCK_PATTERN = r"^```.*$\n?"

    def __init__(self, model: Optional[BaseChatModel] = None):
        super().__init__(model)

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """Warn that this formatter is synchronous, then delegate to ``format``."""
        warnings.warn(
            "The SimpleMDTableFormatter is a sync formatter, please use the sync format method",
            UserWarning,
            stacklevel=2,
        )
        return self.format(document=document, file_path=file_path)

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """
        Format the table blocks of a document.

        Blocks are visited in order. Each ``TableBlock`` is formatted with the
        text of the previously formatted table (empty for the first one) as
        extra context; other blocks are kept as they are. The document's
        ``content`` is replaced and the same document is returned.

        Args:
            document: The document whose tables are formatted.
            file_path: Not used.
        Returns:
            The document, with formatted tables.
        Raises:
            ValueError: When no model is set.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the SimpleMDTableFormatter.")
        print("Formatting tables using SimpleMDTableFormatter...")
        last_table_text = ""
        new_content = []
        for block in document.content:
            if isinstance(block, TableBlock):
                block = self.format_table(block, last_table_text)
                last_table_text = block.text
            new_content.append(block)
        document.content = new_content
        return document

    def format_table(
        self, table_element: TableBlock, previous_table: str
    ) -> TableBlock:
        """
        Format one table into Markdown using the chat model.

        Lines starting with a code fence are removed from the model answer,
        which is then wrapped between the table markers and stored in
        ``table_element.text``.

        Args:
            table_element: The table to format; modified in place.
            previous_table: Text of the previously formatted table.
        Returns:
            The same table element, with its text replaced.
        """
        assert self.model is not None, "Model is not set."
        instructions = (
            "You are an expert in markdown tables. Transform the following parsed table into a "
            "markdown table. Provide just the table in pure markdown, nothing else.\n"
            "\n{text}\n\n"
            "\n{previous_table}\n"
        )
        prompt = ChatPromptTemplate.from_messages([("human", instructions)])
        variables = {
            "text": table_element.text,
            "previous_table": previous_table,
        }
        answer = (prompt | self.model).invoke(variables)
        without_fences = re.sub(
            self.CODE_BLOCK_PATTERN, "", str(answer.content), flags=re.MULTILINE
        )
        wrapped = "\n".join(
            [self.TABLE_MARKER_START, without_fences, self.TABLE_MARKER_END]
        )
        table_element.text = wrapped + "\n\n"
        return table_element
================================================
FILE: libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py
================================================
import base64
from io import BytesIO
from pathlib import Path
from typing import List, Optional
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage
from megaparse.formatter.table_formatter import TableFormatter
from megaparse_sdk.schema.document import Document, TableBlock
from pdf2image import convert_from_path
from PIL import Image
# Text instruction placed in the same human message as the cropped table image
# when VisionMDTableFormatter queries the model.
TABLE_OCR_PROMPT = """
You are tasked with transcribing the content of a table into markdown format. Your goal is to create a well-structured, readable markdown table that accurately represents the original content while adding appropriate formatting.
Answer uniquely with the parsed table. Do not include the fenced code blocks backticks.
"""
class VisionMDTableFormatter(TableFormatter):
    """
    Table formatter that re-transcribes tables from an image crop of the source PDF.

    For every ``TableBlock`` the page holding the table is rendered, cropped to the
    block's bounding box and sent to a vision-capable chat model. The block text is
    then replaced by the model answer wrapped in ``[TABLE]`` / ``[/TABLE]`` markers.
    """

    TABLE_MARKER_START = "[TABLE]"
    TABLE_MARKER_END = "[/TABLE]"
    CODE_BLOCK_PATTERN = r"^```.*$\n?"

    def __init__(self, model: Optional[BaseChatModel] = None):
        """Hand the (optional) chat model over to the parent ``TableFormatter``."""
        super().__init__(model)

    def _crop_table_image(self, table_element: TableBlock, file_path: str) -> str:
        """
        Crop the table region out of its PDF page and return it base64-encoded (PNG).

        Args:
            table_element: Table block providing ``bbox`` and ``page_range``.
            file_path: Path of the PDF the block was extracted from.

        Returns:
            The base64 string of the cropped image.
        """
        bbox = table_element.bbox
        assert bbox, "Table element must have coordinates."
        page_number = table_element.page_range[0]
        assert page_number, "Table element must have a page number."

        # Render only the page that contains the table (pdf2image pages are 1-based)
        # instead of rasterizing the whole document for every single table.
        pages = convert_from_path(
            file_path, first_page=page_number, last_page=page_number
        )

        # NOTE(review): PIL's crop() works in pixels of the rendered page; confirm that
        # the bbox stored on TableBlock is expressed in that space (not normalized).
        box = (
            bbox.top_left.x,
            bbox.top_left.y,
            bbox.bottom_right.x,
            bbox.bottom_right.y,
        )
        table_image = pages[0].crop(box)

        return self.process_file([table_image])[0]

    def _wrap_table(self, table_markdown: str) -> str:
        """Surround a Markdown table with the start / end table markers."""
        return (
            f"{self.TABLE_MARKER_START}\n"
            f"{table_markdown}\n"
            f"{self.TABLE_MARKER_END}\n\n"
        )

    def _build_vision_message(self, table_image: str) -> HumanMessage:
        """
        Build the human message (prompt + image) sent to the vision model.

        The image is declared as PNG because that is the format ``process_file``
        encodes with by default; a mismatching MIME type is rejected by some providers.
        """
        image_prompt = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{table_image}"},
        }
        return HumanMessage(
            content=[
                {"type": "text", "text": TABLE_OCR_PROMPT},
                image_prompt,
            ],
        )

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """
        Asynchronously rewrite every table block of ``document``.

        Args:
            document: The document whose ``content`` blocks are scanned for tables.
            file_path: Path of the source PDF; required to crop the table images.

        Returns:
            The same document with ``content`` replaced by the processed block list.

        Raises:
            ValueError: If no model was provided to the formatter.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the VisionMDTableFormatter.")
        print("Formatting tables using VisionMDTableFormatter (async)...")
        assert (
            file_path
        ), "A file path is needed to format tables using VisionMDTableFormatter."
        if not isinstance(file_path, str):
            file_path = str(file_path)

        formatted_elements = []
        for block in document.content:
            if isinstance(block, TableBlock):
                formatted_table = await self.aformat_table(block, file_path)
                formatted_elements.append(formatted_table)
            else:
                formatted_elements.append(block)
        document.content = formatted_elements
        return document

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """
        Synchronously rewrite every table block of ``document``.

        Args:
            document: The document whose ``content`` blocks are scanned for tables.
            file_path: Path of the source PDF; required to crop the table images.

        Returns:
            The same document with ``content`` replaced by the processed block list.

        Raises:
            ValueError: If no model was provided to the formatter.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the VisionMDTableFormatter.")
        print("Formatting tables using VisionMDTableFormatter (sync)...")
        assert (
            file_path
        ), "A file path is needed to format tables using VisionMDTableFormatter."
        if not isinstance(file_path, str):
            file_path = str(file_path)

        formatted_elements = []
        for block in document.content:
            if isinstance(block, TableBlock):
                formatted_table = self.format_table(block, file_path)
                formatted_elements.append(formatted_table)
            else:
                formatted_elements.append(block)
        document.content = formatted_elements
        return document

    async def aformat_table(
        self, table_element: TableBlock, file_path: str
    ) -> TableBlock:
        """
        Asynchronously transcribe one table with the vision model.

        The block is modified in place (its ``text`` is replaced) and returned.
        """
        table_image64 = self._crop_table_image(table_element, file_path)
        formatted_table = await self.avision_extract(table_image64)
        table_element.text = self._wrap_table(formatted_table)
        return table_element

    def format_table(self, table_element: TableBlock, file_path: str) -> TableBlock:
        """
        Synchronously transcribe one table with the vision model.

        The block is modified in place (its ``text`` is replaced) and returned.
        """
        table_image64 = self._crop_table_image(table_element, file_path)
        formatted_table = self.vision_extract(table_image64)
        table_element.text = self._wrap_table(formatted_table)
        return table_element

    def process_file(self, images: List[Image.Image], image_format="PNG") -> List[str]:
        """
        Convert a list of PIL images to base64 encoded strings.

        Args:
            images: Images to encode.
            image_format: Format passed to ``Image.save`` (PNG by default).

        Returns:
            One base64 string per input image, in the same order.

        Raises:
            ValueError: If any image cannot be saved or encoded.
        """
        try:
            images_base64 = []
            for image in images:
                buffered = BytesIO()
                image.save(buffered, format=image_format)
                images_base64.append(
                    base64.b64encode(buffered.getvalue()).decode("utf-8")
                )
            return images_base64
        except Exception as e:
            # Keep the original cause attached for debugging.
            raise ValueError(f"Error processing PDF file: {str(e)}") from e

    async def avision_extract(self, table_image: str) -> str:
        """
        Asynchronously send a base64 table image to the model.

        Returns:
            The content of the model response converted to ``str``.
        """
        assert (
            self.model
        ), "A model is needed to use the VisionMDTableFormatter (async)."
        response = await self.model.ainvoke([self._build_vision_message(table_image)])
        return str(response.content)

    def vision_extract(self, table_image: str) -> str:
        """
        Synchronously send a base64 table image to the model.

        Returns:
            The content of the model response converted to ``str``.
        """
        assert self.model, "A model is needed to use the VisionMDTableFormatter (sync)."
        response = self.model.invoke([self._build_vision_message(table_image)])
        return str(response.content)
================================================
FILE: libs/megaparse/src/megaparse/layout_detection/layout_detector.py
================================================
import logging
import os
import pathlib
import uuid
from typing import Any, List
import numpy as np
import onnxruntime as rt
from megaparse.configs.auto import DeviceEnum
from megaparse.layout_detection.output import LayoutDetectionOutput
from megaparse.utils.onnx import get_providers
from megaparse_sdk.schema.document import BBOX, Point2D
from onnxtr.models.engine import EngineConfig
from onnxtr.models.preprocessor import PreProcessor
from PIL import Image, ImageDraw
from PIL.Image import Image as PILImage
logger = logging.getLogger("megaparse")

# Human-readable name for each integer class index stored in a detection's
# ``label`` (used when drawing the detected layout).
LABEL_MAP = dict(
    enumerate(
        [
            "Caption",
            "Footnote",
            "Formula",
            "List-item",
            "Page-footer",
            "Page-header",
            "Picture",
            "Section-header",
            "Table",
            "Text",
            "Title",
        ]
    )
)

# Per-model settings, keyed by model name. "url" points to the ONNX file shipped
# next to this module; "url_8_bit" is the quantized variant (none is available).
default_cfg: dict[str, dict[str, Any]] = {
    "yolov10s-doclaynet": dict(
        mean=(0.5, 0.5, 0.5),
        std=(1.0, 1.0, 1.0),
        url_8_bit=None,
        input_shape=(1, 1024, 1024),
        url=pathlib.Path(__file__).parent / "models/yolov10s-doclaynet.onnx",
    )
}
class LayoutDetector:
    """
    Page layout detector backed by an ONNX object-detection model.

    Calling an instance with a list of page images returns, for each page, the
    detected regions (bounding box, score and class index).
    """

    def __init__(
        self,
        device: DeviceEnum = DeviceEnum.CPU,
        threshold: float = 0.1,
        preserve_aspect_ratio: bool = True,
        model_name: str = "yolov10s-doclaynet",
        load_in_8_bit: bool = False,
    ):
        """
        Load the model and build the image pre-processor.

        Args:
            device: Device used to select the onnxruntime execution providers.
            threshold: Minimum score for a detection to be kept.
            preserve_aspect_ratio: Forwarded to the pre-processor and taken into
                account when mapping boxes back to the page.
            model_name: Key of ``default_cfg`` describing the model to load.
            load_in_8_bit: Load the ``url_8_bit`` model instead of ``url``.
        """
        model_config = default_cfg[model_name]
        self.device = device
        self.threshold = threshold
        self.batch_size, self.required_width, self.required_height = model_config[
            "input_shape"
        ]
        self.preserve_aspect_ratio = preserve_aspect_ratio
        self.pre_processor = PreProcessor(
            output_size=(self.required_width, self.required_height),
            batch_size=self.batch_size,
            preserve_aspect_ratio=self.preserve_aspect_ratio,
        )

        model_path = (
            model_config.get("url_8_bit") if load_in_8_bit else model_config.get("url")
        )
        assert model_path, f"Model path not found for {model_name}"

        # onnxruntime has no `engine_config` argument (unknown keywords are silently
        # ignored), so the session options and providers must be passed explicitly
        # for the requested device to be honoured.
        self.model = rt.InferenceSession(
            str(model_path),
            sess_options=rt.SessionOptions(),
            providers=get_providers(self.device),
        )

    def __call__(
        self, img_pages: list[PILImage], output_dir: str | None = None
    ) -> List[List[LayoutDetectionOutput]]:
        """
        Run layout detection on a list of page images.

        Args:
            img_pages: One PIL image per page.
            output_dir: When given, an annotated PNG is written there for each page.

        Returns:
            One list of detections per input page, in input order.

        Raises:
            ValueError: If a page is not a 3-dimensional (H, W, C) array.
        """
        # Nothing to detect: avoid feeding empty batches to the model.
        if not img_pages:
            return []

        pages = [np.array(img) for img in img_pages]
        # Dimension check
        if any(page.ndim != 3 for page in pages):
            raise ValueError(
                "incorrect input shape: all pages are expected to be multi-channel 2D images."
            )

        processed_batches = self.pre_processor(pages)
        processed_batches = np.array(processed_batches)
        processed_batches = processed_batches.squeeze(1)  # Horrendus
        # Channels-last -> channels-first, as fed to the model below.
        processed_batches = processed_batches.transpose(0, 3, 1, 2)

        # The model is run one page at a time.
        pred_batches = np.array(
            [
                self.model.run(None, {"images": np.expand_dims(batch, axis=0)})
                for batch in processed_batches
            ]
        )
        pred_batches = np.concatenate(pred_batches, axis=0)
        pred_batches = pred_batches.squeeze(1)  # Horrendus

        processed_preds = []
        for page, pred in zip(pages, pred_batches, strict=True):
            img_h, img_w = page.shape[:2]
            bboxes = self.extract_bboxes_from_page(pred, img_h, img_w)
            processed_preds.append(bboxes)

        if output_dir:
            self._save_layout(pages=pages, preds=processed_preds, output_dir=output_dir)
        return processed_preds

    def extract_bboxes_from_page(
        self, preds: np.ndarray, img_h: int, img_w: int
    ) -> List[LayoutDetectionOutput]:
        """
        Convert the raw model output of one page into detection objects.

        Args:
            preds: Array of shape (300, 6); each row is
                ``x1, y1, x2, y2, score, class index``.
            img_h: Height of the original page image, in pixels.
            img_w: Width of the original page image, in pixels.

        Returns:
            The detections scoring at least ``self.threshold``, limited by
            :meth:`topK`, with boxes divided by the page width / height.
        """
        results = []
        assert preds.shape == (300, 6)
        scale_h = img_h / self.required_height
        scale_w = img_w / self.required_width

        for det in preds:
            # Rescale the bounding box coordinates to the original dimensions
            x1, y1, x2, y2, score, cls_idx = det
            if score < self.threshold:
                continue
            x1 *= scale_w
            x2 *= scale_w
            y1 *= scale_h
            y2 *= scale_h

            if self.preserve_aspect_ratio:
                # Compensate for the resize performed with the aspect ratio preserved.
                ratio = img_h / img_w
                x1 = x1 * (ratio if ratio > 1 else 1)
                x2 = x2 * (ratio if ratio > 1 else 1)
                y1 = y1 / (ratio if ratio < 1 else 1)
                y2 = y2 / (ratio if ratio < 1 else 1)

            # Clamp to the page.
            x1 = max(0, min(x1, img_w))
            x2 = max(0, min(x2, img_w))
            y1 = max(0, min(y1, img_h))
            y2 = max(0, min(y2, img_h))

            results.append(
                LayoutDetectionOutput(
                    bbox_id=uuid.uuid4(),
                    bbox=BBOX(
                        top_left=Point2D(x=x1 / img_w, y=y1 / img_h),
                        bottom_right=Point2D(x=x2 / img_w, y=y2 / img_h),
                    ),
                    prob=score,
                    label=int(cls_idx),
                )
            )

        return self.topK(results)  # alternative post-processing: self.nms(results)

    def nms(
        self,
        raw_bboxes: List[LayoutDetectionOutput],
        iou_threshold: float = 0.9,  # FIXME: thresh Configurable in constructor
    ) -> List[LayoutDetectionOutput]:
        """
        Non-Maximum Suppression (NMS) algorithm.

        Args:
            raw_bboxes: Detections to filter. The list is sorted and reordered
                in place.
            iou_threshold: A box is dropped when its IoU with an already kept,
                higher-scoring box exceeds this value.

        Returns:
            The kept detections, sorted by decreasing score.
        """
        raw_bboxes.sort(key=lambda x: x.prob, reverse=True)

        # raw_bboxes[:current_index] always holds the boxes kept so far.
        current_index = 0
        for index in range(len(raw_bboxes)):
            drop = False
            for prev_index in range(current_index):
                iou = raw_bboxes[index].bbox.iou(raw_bboxes[prev_index].bbox)
                if iou > iou_threshold:
                    drop = True
                    break
            if not drop:
                raw_bboxes[current_index], raw_bboxes[index] = (
                    raw_bboxes[index],
                    raw_bboxes[current_index],
                )
                current_index += 1
        return raw_bboxes[:current_index]

    def topK(
        self, detectResult: List[LayoutDetectionOutput], topK: int = 50
    ) -> List[LayoutDetectionOutput]:
        """
        Keep at most ``topK`` detections, preferring the highest scores.

        When there are no more than ``topK`` detections the input list is returned
        unchanged (and unsorted); otherwise a new list sorted by decreasing score.
        """
        if len(detectResult) <= topK:
            return detectResult
        return sorted(detectResult, key=lambda x: x.prob, reverse=True)[:topK]

    def _save_layout(
        self,
        pages: list[np.ndarray],
        preds: list[list[LayoutDetectionOutput]],
        output_dir: str,
    ):
        """
        Draw the detections on each page and save them as ``page_<i>.png``.

        Args:
            pages: Page images as arrays.
            preds: Detections of each page, aligned with ``pages``.
            output_dir: Destination directory (created if missing).
        """
        os.makedirs(output_dir, exist_ok=True)
        for i, (page, layout) in enumerate(zip(pages, preds, strict=True)):
            image = Image.fromarray(page)
            draw = ImageDraw.Draw(image)
            img_w, img_h = image.size
            for detection in layout:
                # Stored boxes are relative to the page size: back to pixels.
                x_min, y_min, x_max, y_max = detection.bbox.to_numpy()
                bbox = x_min * img_w, y_min * img_h, x_max * img_w, y_max * img_h
                confidence = detection.prob
                category = detection.label
                label = LABEL_MAP.get(category, "Unknown")
                draw.rectangle(bbox, outline="red", width=2)
                # assert bbox[2] <= image.width
                # assert bbox[3] <= image.height
                draw.text(
                    (bbox[0], bbox[1]),
                    f"{label} ({confidence:.2f})",
                    fill="red",
                )
            image.save(os.path.join(output_dir, f"page_{i}.png"))
================================================
FILE: libs/megaparse/src/megaparse/layout_detection/models/yolov10s-doclaynet.onnx
================================================
[File too large to display: 27.9 MB]
================================================
FILE: libs/megaparse/src/megaparse/layout_detection/output.py
================================================
from uuid import UUID
from megaparse_sdk.schema.document import BBOX
from pydantic import BaseModel
class LayoutDetectionOutput(BaseModel):
# One region detected by the layout model on a page.
# Unique identifier of the detection (LayoutDetector assigns a fresh uuid4).
bbox_id: UUID
# Bounding box of the region; LayoutDetector divides the pixel coordinates by the
# page width / height before storing them.
bbox: BBOX
# Detection score reported by the model.
prob: float
# Integer class index of the region (resolved through LABEL_MAP when drawing).
label: int
================================================
FILE: libs/megaparse/src/megaparse/megaparse.py
================================================
import logging
import warnings
from pathlib import Path
from typing import IO, BinaryIO, List
import pypdfium2 as pdfium
from megaparse_sdk.schema import document
from megaparse_sdk.schema.extensions import FileExtension
from megaparse_sdk.schema.parser_config import StrategyEnum
from megaparse.configs.auto import MegaParseConfig
from megaparse.exceptions.base import ParsingException
from megaparse.formatter.base import BaseFormatter
from megaparse.layout_detection.layout_detector import LayoutDetector
from megaparse.models.page import Page, PageDimension
from megaparse.parser.doctr_parser import DoctrParser
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse.utils.strategy import (
determine_global_strategy,
get_page_strategy,
)
logger = logging.getLogger("megaparse")


class MegaParse:
    """
    High-level entry point that turns a file into parsed text.

    PDF files are rasterized, analysed page by page to pick a parsing strategy, run
    through the layout model and finally parsed either with the doctr-based parser
    (``HI_RES``) or with the unstructured parser. Non-PDF files, or any file when
    the ``FAST`` strategy is requested, go straight to the unstructured parser.
    """

    def __init__(
        self,
        formatters: List[BaseFormatter] | None = None,
        config: MegaParseConfig | None = None,
        unstructured_strategy: StrategyEnum = StrategyEnum.AUTO,
    ) -> None:
        """
        Args:
            formatters: Optional formatters applied, in order, to the parsed document.
            config: Parsing configuration; a fresh ``MegaParseConfig()`` when omitted.
            unstructured_strategy: Strategy given to the unstructured parser.
        """
        # Build the default per instance: a default evaluated in the signature
        # would be a single object shared by every MegaParse instance.
        self.config = config if config is not None else MegaParseConfig()
        self.formatters = formatters
        self.doctr_parser = DoctrParser(
            text_det_config=self.config.doctr_config.text_det_config,
            text_reco_config=self.config.doctr_config.text_reco_config,
            device=self.config.device,
            straighten_pages=self.config.doctr_config.straighten_pages,
            detect_orientation=self.config.doctr_config.detect_orientation,
            detect_language=self.config.doctr_config.detect_language,
        )
        # NOTE(review): the layout model is built with its default device and does
        # not follow self.config.device — confirm whether this is intended.
        self.layout_model = LayoutDetector()
        self.unstructured_parser = UnstructuredParser(unstructured_strategy)

    def validate_input(
        self,
        file_path: Path | str | None = None,
        file: IO[bytes] | None = None,
        file_extension: str | FileExtension | None = None,
    ) -> FileExtension:
        """
        Check the input arguments and resolve the file extension.

        Exactly one of ``file_path`` and ``file`` must be given. With a path the
        extension is taken from its suffix; with a file object ``file_extension``
        is mandatory and the file is rewound to its start.

        Returns:
            The resolved ``FileExtension``.

        Raises:
            ValueError: On an invalid argument combination or unsupported extension.
        """
        if not (file_path or file):
            raise ValueError("Either file_path or file should be provided")
        if file_path and file:
            raise ValueError("Only one of file_path or file should be provided")

        if file_path and file is None:
            if isinstance(file_path, str):
                file_path = Path(file_path)
            file_extension = file_path.suffix
        elif file and file_path is None:
            if not file_extension:
                raise ValueError(
                    "file_extension should be provided when given file argument"
                )
            file.seek(0)
        else:
            raise ValueError("Either provider a file_path or file")

        if isinstance(file_extension, str):
            try:
                file_extension = FileExtension(file_extension)
            except ValueError as e:
                raise ValueError(
                    f"Unsupported file extension: {file_extension}"
                ) from e
        return file_extension

    def extract_page_strategies(
        self, pdfium_document: pdfium.PdfDocument, rast_scale: int = 2
    ) -> List[Page]:
        """
        Rasterize each page, run text detection and pick a strategy per page.

        Args:
            pdfium_document: The opened PDF.
            rast_scale: Scale factor used when rendering the pages.

        Returns:
            One ``Page`` per PDF page, with ``text_detections`` and ``strategy`` set.
        """
        pages = []
        for i, pdfium_page in enumerate(pdfium_document):
            rasterized_page = pdfium_page.render(scale=rast_scale)
            assert (
                abs(pdfium_page.get_width() * rast_scale - rasterized_page.width) <= 1
            ), (
                f"Widths do not match within a margin of 1: "
                f"{pdfium_page.get_width() * rast_scale} != {rasterized_page.width}"
            )
            # Exactly one Page per PDF page: downstream results are aligned by index.
            pages.append(
                Page(
                    strategy=StrategyEnum.AUTO,
                    text_detections=None,
                    rasterized=rasterized_page.to_pil(),
                    page_size=PageDimension(
                        width=pdfium_page.get_width() * rast_scale,
                        height=pdfium_page.get_height() * rast_scale,
                    ),
                    page_index=i,
                    pdfium_elements=pdfium_page,
                )
            )

        # Get text detection for each page
        pages = self.doctr_parser.get_text_detections(pages)

        # Get strategy per page
        for page in pages:
            page.strategy = get_page_strategy(
                page.pdfium_elements,
                page.text_detections,
                threshold=self.config.auto_config.page_threshold,
            )
        return pages

    def load(
        self,
        file_path: Path | str | None = None,
        file: BinaryIO | None = None,
        file_extension: str | FileExtension = "",
        strategy: StrategyEnum = StrategyEnum.AUTO,
    ) -> str:
        """
        Parse a file synchronously and return its text.

        Args:
            file_path: Path of the file to parse (exclusive with ``file``).
            file: Binary file object to parse (requires ``file_extension``).
            file_extension: Extension of ``file``; ignored when a path is given.
            strategy: ``FAST`` sends PDFs directly to the unstructured parser;
                otherwise the strategy of a PDF is determined from its pages.

        Returns:
            The parsed document converted to ``str``.

        Raises:
            ValueError: If the input arguments are invalid.
            ParsingException: If anything fails while parsing a PDF.
        """
        file_extension = self.validate_input(
            file=file, file_path=file_path, file_extension=file_extension
        )
        if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:
            self.unstructured_parser.strategy = strategy
            return str(
                self.unstructured_parser.convert(
                    file_path=file_path, file=file, file_extension=file_extension
                )
            )

        opened_file = None
        try:
            if file_path:
                opened_file = open(file_path, "rb")
                file = opened_file
            assert file is not None, "No File provided"

            pdfium_document = pdfium.PdfDocument(file)
            # Rasterize pages and extract text detections
            pages = self.extract_page_strategies(pdfium_document)
            strategy = determine_global_strategy(
                pages, self.config.auto_config.document_threshold
            )

            # Extract layout model
            assert all(p.rasterized for p in pages)
            layout_result = self.layout_model([p.rasterized for p in pages])  # type: ignore

            if strategy == StrategyEnum.HI_RES:
                logger.debug("Using doctr for text recognition")
                parsed_document = self.doctr_parser.get_text_recognition(
                    pages, layout_result
                )
            else:
                logger.debug("Using Unstructured Parser")
                self.unstructured_parser.strategy = StrategyEnum.FAST
                parsed_document = self.unstructured_parser.convert(
                    file=file, file_extension=file_extension
                )

            # additional attributes
            parsed_document.file_name = str(file_path) if file_path else None
            parsed_document.metadata = pdfium_document.get_metadata_dict()

            # Format -> TODO: should be generic
            if self.formatters:
                for formatter in self.formatters:
                    if isinstance(parsed_document, str):
                        warnings.warn(
                            f"The last step returned a string, the {formatter.__class__} and following will not be applied",
                            stacklevel=2,
                        )
                        break
                    # Forward the path: some formatters need the source file.
                    parsed_document = formatter.format(
                        parsed_document, file_path=file_path
                    )

            if not isinstance(parsed_document, str):
                return str(parsed_document)
            return parsed_document
        except Exception as e:
            logger.exception(f"Error occured while parsing {file}: {e}")
            raise ParsingException(
                f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}"
            ) from e
        finally:
            if opened_file:
                opened_file.close()

    async def aload(
        self,
        file_path: Path | str | None = None,
        file: BinaryIO | None = None,
        file_extension: str | FileExtension = "",
        strategy: StrategyEnum = StrategyEnum.AUTO,
    ) -> str | document.Document:
        """
        Parse a file asynchronously.

        Same inputs as :meth:`load`. The unstructured parser and the formatters are
        awaited through their async methods.

        Returns:
            A ``str`` for files handled directly by the unstructured parser;
            otherwise the parsed document as produced by the last formatter.

        Raises:
            ValueError: If the input arguments are invalid.
            ParsingException: If anything fails while parsing a PDF.
        """
        file_extension = self.validate_input(
            file=file, file_path=file_path, file_extension=file_extension
        )
        if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:
            self.unstructured_parser.strategy = strategy
            parsed_document = await self.unstructured_parser.aconvert(
                file_path=file_path, file=file, file_extension=file_extension
            )
            return str(parsed_document)

        opened_file = None
        try:
            if file_path:
                opened_file = open(file_path, "rb")
                file = opened_file
            assert file is not None, "No File provided"

            pdfium_document = pdfium.PdfDocument(file)
            # Determine strategy
            pages = self.extract_page_strategies(pdfium_document)
            strategy = determine_global_strategy(
                pages, self.config.auto_config.document_threshold
            )

            # Run layout model
            assert all(p.rasterized for p in pages)
            layout_result = self.layout_model([p.rasterized for p in pages])  # type: ignore

            if strategy == StrategyEnum.HI_RES:
                logger.info("Using Doctr for text recognition")
                parsed_document = self.doctr_parser.get_text_recognition(
                    pages, layout_result
                )
            else:
                logger.info("Switching to Unstructured Parser")
                self.unstructured_parser.strategy = StrategyEnum.FAST
                parsed_document = await self.unstructured_parser.aconvert(
                    file=file, file_extension=file_extension
                )

            parsed_document.file_name = str(file_path) if file_path else None
            parsed_document.metadata = pdfium_document.get_metadata_dict()

            if self.formatters:
                for formatter in self.formatters:
                    if isinstance(parsed_document, str):
                        warnings.warn(
                            f"The last step returned a string, the {formatter.__class__} and following will not be applied",
                            stacklevel=2,
                        )
                        break
                    # Forward the path: some formatters need the source file.
                    parsed_document = await formatter.aformat(
                        parsed_document, file_path=file_path
                    )
            return parsed_document
        except Exception as e:
            logger.exception(f"Error occured while parsing {file}: {e}")
            raise ParsingException(
                f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}"
            ) from e
        finally:
            if opened_file:
                opened_file.close()
================================================
FILE: libs/megaparse/src/megaparse/models/page.py
================================================
from typing import List
from megaparse_sdk.schema.document import TextDetection
from megaparse_sdk.schema.parser_config import StrategyEnum
from PIL.Image import Image as PILImage
from pydantic import BaseModel, ConfigDict
from pypdfium2._helpers.page import PdfPage
class PageDimension(BaseModel):
"""
Width and height of a page.

MegaParse.extract_page_strategies fills both fields with the PDF page size
multiplied by the rasterization scale.
"""
width: float
height: float
class Page(BaseModel):
"""
A single PDF page together with the artefacts computed for it during parsing.
"""
# Parsing strategy of the page (created as AUTO, then set from get_page_strategy).
strategy: StrategyEnum
# Text boxes found on the page; None until text detection has been run.
text_detections: TextDetection | None = None
# Rendered image of the page (PIL), if available.
rasterized: PILImage | None = None
# Size of the page, see PageDimension.
page_size: PageDimension
# Position of the page in the document (index from enumerate, starting at 0).
page_index: int
# Underlying pypdfium2 page object.
pdfium_elements: PdfPage
# PIL images and pdfium pages are not pydantic types, hence arbitrary types are allowed.
model_config = ConfigDict(arbitrary_types_allowed=True)
class GatewayDocument(BaseModel):
"""
A class to represent a Gateway MegaParse Document, which is a container of pages.
"""
# Name of the file the pages belong to.
file_name: str
# Pages of the document.
pages: List[Page]
================================================
FILE: libs/megaparse/src/megaparse/parser/__init__.py
================================================
from .base import BaseParser
__all__ = ["BaseParser"]
================================================
FILE: libs/megaparse/src/megaparse/parser/base.py
================================================
from abc import ABC, abstractmethod
from pathlib import Path
from typing import IO
from megaparse_sdk.schema.document import Document
from megaparse_sdk.schema.extensions import FileExtension
class BaseParser(ABC):
    """Mother Class for all the parsers [Unstructured, LlamaParse, MegaParseVision]"""

    # Extensions accepted by the parser; concrete parsers are expected to override it.
    supported_extensions = []

    def check_supported_extension(
        self, file_extension: FileExtension | None, file_path: str | Path | None = None
    ):
        """
        Ensure the parser can handle the given file.

        When only ``file_path`` is provided, the extension is derived from the
        path suffix before being checked against ``supported_extensions``.

        Args:
            file_extension: Extension of the file, if already known.
            file_path: Path of the file, used when ``file_extension`` is missing.

        Raises:
            ValueError: If neither argument is provided, or if the extension is
                not listed in ``supported_extensions``.
        """
        if not (file_extension or file_path):
            raise ValueError(
                f"Either file_path or file_extension must be provided for {self.__class__.__name__}"
            )

        if not file_extension and file_path:
            if isinstance(file_path, str):
                file_path = Path(file_path)
            file_extension = FileExtension(file_path.suffix)

        if not file_extension:
            return
        if file_extension not in self.supported_extensions:
            raise ValueError(
                f"Unsupported file extension {file_extension.value} for {self.__class__.__name__}"
            )

    @abstractmethod
    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> Document:
        """
        Asynchronously parse the given file into a ``Document``.

        Args:
            file_path (str | Path | None): Path of the file to parse.
            file (IO[bytes] | None): Binary file object to parse.
            file_extension (FileExtension | None): Extension of the file.
            **kwargs: Additional keyword arguments for the conversion process.

        Returns:
            Document: The result of the conversion process.

        Raises:
            NotImplementedError: If the method is not implemented by a subclass.
        """
        raise NotImplementedError("Subclasses should implement this method")

    @abstractmethod
    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> Document:
        """
        Synchronously parse the given file into a ``Document``.

        Args:
            file_path (str | Path | None): Path of the file to parse.
            file (IO[bytes] | None): Binary file object to parse.
            file_extension (FileExtension | None): Extension of the file.
            **kwargs: Additional keyword arguments for the conversion process.

        Returns:
            Document: The result of the conversion process.

        Raises:
            NotImplementedError: If the method is not implemented by a subclass.
        """
        raise NotImplementedError("Subclasses should implement this method")
================================================
FILE: libs/megaparse/src/megaparse/parser/builder.py
================================================
from megaparse_sdk.schema.parser_config import ParseFileConfig
from megaparse.parser.base import BaseParser
from megaparse.parser.llama import LlamaParser
from megaparse.parser.megaparse_vision import MegaParseVision
from megaparse.parser.unstructured_parser import UnstructuredParser
# Registry mapping a parser method name to the parser class that
# ParserBuilder.build instantiates for it.
parser_dict: dict[str, type] = {
"unstructured": UnstructuredParser,
"llama_parser": LlamaParser,
"megaparse_vision": MegaParseVision,
}
class ParserBuilder:
    """Factory creating parser instances from a ``ParseFileConfig``."""

    def build(self, config: ParseFileConfig) -> BaseParser:
        """
        Instantiate the parser selected by the configuration.

        The parser class is looked up in ``parser_dict`` with ``config.method`` and
        constructed with every field of the configuration as keyword arguments.

        Args:
            config (ParseFileConfig): The configuration describing the parser.

        Returns:
            BaseParser: The built parser.

        Raises:
            KeyError: If ``config.method`` is not a key of ``parser_dict``.
        """
        parser_cls = parser_dict[config.method]
        init_kwargs = config.model_dump()
        return parser_cls(**init_kwargs)
================================================
FILE: libs/megaparse/src/megaparse/parser/doctr_parser.py
================================================
import logging
import uuid
from typing import Any, Dict, List, Tuple, Type
from uuid import UUID
import numpy as np
import onnxruntime as rt
from megaparse_sdk.schema.document import (
BBOX,
Block,
BlockLayout,
BlockType,
CaptionBlock,
FooterBlock,
HeaderBlock,
ImageBlock,
ListElementBlock,
Point2D,
SubTitleBlock,
TableBlock,
TextBlock,
TextDetection,
TitleBlock,
UndefinedBlock,
)
from megaparse_sdk.schema.document import Document as MPDocument
from megaparse_sdk.schema.extensions import FileExtension
from onnxtr.io import Document
from onnxtr.models import detection_predictor, recognition_predictor
from onnxtr.models._utils import get_language
from onnxtr.models.engine import EngineConfig
from onnxtr.models.predictor.base import _OCRPredictor
from onnxtr.utils.geometry import detach_scores
from onnxtr.utils.repr import NestedObject
from megaparse.configs.auto import DeviceEnum, TextDetConfig, TextRecoConfig
from megaparse.layout_detection.output import LayoutDetectionOutput
from megaparse.models.page import Page
from megaparse.utils.onnx import get_providers
logger = logging.getLogger("megaparse")
# Maps an integer layout class index to the Block subclass used to represent it.
# NOTE(review): the keys appear to follow the label indices of the layout detector
# (LABEL_MAP in layout_detection/layout_detector.py), in which case 1 (Footnote),
# 2 (Formula) and 9 (Text) all become TextBlock — confirm against the caller.
block_cls_map: Dict[int, Type[Block]] = {
0: CaptionBlock,
1: TextBlock,
2: TextBlock,
3: ListElementBlock,
4: FooterBlock,
5: HeaderBlock,
6: ImageBlock,
7: SubTitleBlock,
8: TableBlock,
9: TextBlock,
10: TitleBlock,
}
class DoctrParser(NestedObject, _OCRPredictor):
supported_extensions = [FileExtension.PDF]
def __init__(
    self,
    text_det_config: TextDetConfig = TextDetConfig(),
    text_reco_config: TextRecoConfig = TextRecoConfig(),
    device: DeviceEnum = DeviceEnum.CPU,
    straighten_pages: bool = False,
    detect_orientation: bool = False,
    detect_language: bool = False,
    **kwargs,
):
    """
    Build the text detection and text recognition predictors.

    Args:
        text_det_config: Settings of the detection model (architecture, batch
            size, page geometry assumptions, 8-bit loading).
        text_reco_config: Settings of the recognition model (architecture,
            batch size).
        device: Device used to select the onnxruntime execution providers.
        straighten_pages: Forwarded to the base OCR predictor.
        detect_orientation: Forwarded to the base OCR predictor and stored.
        detect_language: Stored for later use by the parser.
        **kwargs: Extra arguments forwarded to the base OCR predictor.
    """
    self.device = device

    # One engine configuration shared by every ONNX model created below.
    engine_cfg = EngineConfig(
        session_options=rt.SessionOptions(),
        providers=get_providers(self.device),
    )

    straight_pages = text_det_config.assume_straight_pages
    keep_ratio = text_det_config.preserve_aspect_ratio
    sym_pad = text_det_config.symmetric_pad
    # NOTE(review): the recognition model also takes its 8-bit flag from the
    # detection config — confirm this is intended.
    use_8_bit = text_det_config.load_in_8_bit

    _OCRPredictor.__init__(
        self,
        straight_pages,
        straighten_pages,
        keep_ratio,
        sym_pad,
        detect_orientation,
        clf_engine_cfg=engine_cfg,
        **kwargs,
    )

    self.det_predictor = detection_predictor(
        arch=text_det_config.det_arch,
        assume_straight_pages=straight_pages,
        preserve_aspect_ratio=keep_ratio,
        symmetric_pad=sym_pad,
        batch_size=text_det_config.batch_size,
        load_in_8_bit=use_8_bit,
        engine_cfg=engine_cfg,
    )
    self.reco_predictor = recognition_predictor(
        arch=text_reco_config.reco_arch,
        batch_size=text_reco_config.batch_size,
        load_in_8_bit=use_8_bit,
        engine_cfg=engine_cfg,
    )

    self.detect_orientation = detect_orientation
    self.detect_language = detect_language
def get_text_detections(self, pages: list[Page], **kwargs) -> List[Page]:
    """
    Run text detection on every page and store the result on the page.

    Args:
        pages: Pages whose ``rasterized`` image is analysed.
        **kwargs: Extra arguments forwarded to the detection predictor.

    Returns:
        The same ``pages`` list, each page having ``text_detections`` set.

    Raises:
        ValueError: If a rasterized page is not a 3-dimensional array.
    """
    rasterized_pages = [np.array(page.rasterized) for page in pages]
    # Dimension check
    if any(page.ndim != 3 for page in rasterized_pages):
        raise ValueError(
            "incorrect input shape: all pages are expected to be multi-channel 2D images."
        )
    origin_page_shapes = [page.shape[:2] for page in rasterized_pages]

    # Localize text elements
    loc_preds, out_maps = self.det_predictor(
        rasterized_pages, return_maps=True, **kwargs
    )

    # Binarized segmentation maps, used to detect the page orientation
    seg_maps = [
        np.where(
            out_map > self.det_predictor.model.postprocessor.bin_thresh,
            255,
            0,
        ).astype(np.uint8)
        for out_map in out_maps
    ]
    if self.detect_orientation:
        general_pages_orientations, origin_pages_orientations = (
            self._get_orientations(rasterized_pages, seg_maps)
        )
        orientations = [
            {"value": orientation_page, "confidence": None}
            for orientation_page in origin_pages_orientations
        ]
    else:
        orientations = None
        general_pages_orientations = None
        origin_pages_orientations = None

    if self.straighten_pages:
        rasterized_pages = self._straighten_pages(
            rasterized_pages,
            seg_maps,
            general_pages_orientations,
            origin_pages_orientations,
        )
        # update page shapes after straightening
        origin_page_shapes = [page.shape[:2] for page in rasterized_pages]
        # Forward again on the straightened images: the predictor expects image
        # arrays, not the Page models held in `pages`.
        loc_preds = self.det_predictor(rasterized_pages, **kwargs)  # type: ignore[assignment]

    # Detach objectness scores from loc_preds
    loc_preds, objectness_scores = detach_scores(loc_preds)  # type: ignore[arg-type]

    # Apply hooks to loc_preds if any
    for hook in self.hooks:
        loc_preds = hook(loc_preds)

    for page_index, (rast_page, loc_pred, objectness_score, page) in enumerate(
        zip(rasterized_pages, loc_preds, objectness_scores, pages, strict=True)
    ):
        block_layouts = []
        for bbox, score in zip(loc_pred, objectness_score, strict=True):
            block_layouts.append(
                BlockLayout(
                    bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()),
                    objectness_score=score,
                    block_type=BlockType.TEXT,
                )
            )
        page.text_detections = TextDetection(
            bboxes=block_layouts,
            page_index=page_index,
            dimensions=rast_page.shape[:2],
            orientation=orientations[page_index] if orientations is not None else 0,
            origin_page_shape=origin_page_shapes[page_index],
        )
    return pages
def get_text_recognition(
    self, pages: List[Page], layout: List[List[LayoutDetectionOutput]], **kwargs
) -> MPDocument:
    """Recognize text inside previously detected boxes and build an MPDocument.

    Args:
        pages: Pages whose ``text_detections`` attribute has already been set.
        layout: Layout detections, one list per page, forwarded to
            ``__to_elements_list`` together with the doc builder output.
        **kwargs: Forwarded to ``self.reco_predictor``.

    Returns:
        The document returned by ``__to_elements_list``.

    Raises:
        AssertionError: If ``pages`` is empty or at least one page has no
            text detections.
    """
    # Every page is dereferenced in the loop below, so *all* of them must
    # carry detections (checking only "any" let a partially prepared input
    # through and failed later with an opaque AttributeError).
    assert pages and all(
        page.text_detections is not None for page in pages
    ), "Text detections should be computed before running text recognition"
    rasterized_pages = []
    loc_preds = []
    objectness_scores = []
    orientations = []
    origin_page_shapes = []
    for page in pages:
        page_loc_pred = page.text_detections.get_loc_preds()  # type: ignore
        if page_loc_pred.shape[0] == 0:
            # Normalize "no detection" to an empty (0, 4) array.
            page_loc_pred = np.zeros((0, 4))
        rasterized_pages.append(np.array(page.rasterized))
        loc_preds.append(page_loc_pred)  # type: ignore
        objectness_scores.append(page.text_detections.get_objectness_scores())  # type: ignore
        orientations.append(page.text_detections.get_orientations())  # type: ignore
        origin_page_shapes.append(page.text_detections.get_origin_page_shapes())  # type: ignore
    # Crop images
    crops, loc_preds = self._prepare_crops(
        rasterized_pages,
        loc_preds,  # type: ignore[arg-type]
        channels_last=True,
        assume_straight_pages=self.assume_straight_pages,
        assume_horizontal=self._page_orientation_disabled,
    )
    # Rectify crop orientation and get crop orientation predictions
    crop_orientations: Any = []
    if not self.assume_straight_pages:
        crops, loc_preds, _crop_orientations = self._rectify_crops(crops, loc_preds)
        crop_orientations = [
            {"value": orientation[0], "confidence": orientation[1]}
            for orientation in _crop_orientations
        ]
    # Identify character sequences (crops of all pages are flattened into one batch)
    word_preds = self.reco_predictor(
        [crop for page_crops in crops for crop in page_crops], **kwargs
    )
    if not crop_orientations:
        # No rectification happened: report a neutral orientation per word.
        crop_orientations = [{"value": 0, "confidence": None} for _ in word_preds]
    boxes, text_preds, crop_orientations = self._process_predictions(
        loc_preds, word_preds, crop_orientations
    )
    if self.detect_language:
        languages = [
            get_language(" ".join([item[0] for item in text_pred]))
            for text_pred in text_preds
        ]
        languages_dict = [
            {"value": lang[0], "confidence": lang[1]} for lang in languages
        ]
    else:
        languages_dict = None
    # FIXME : Not good return type we want :(
    out = self.doc_builder(
        rasterized_pages,
        boxes,
        objectness_scores,
        text_preds,
        origin_page_shapes,
        crop_orientations,
        orientations,
        languages_dict,
    )
    return self.__to_elements_list(out, layout)
def _get_block_cls(
    self,
    coordinates: tuple[float, float, float, float],
    layout: List[LayoutDetectionOutput],
    threshold: float = 0.6,
) -> Tuple[UUID | None, Type[Block]]:
    """Find the layout detection that contains most of a text box.

    Args:
        coordinates: ``(x1, y1, x2, y2)`` of the text box to classify.
        layout: Layout detections of the page, scanned in order.
        threshold: Minimum share of the text box area that must lie inside
            a detection for that detection to be selected.

    Returns:
        ``(det.bbox_id, block_cls_map[det.label])`` for the first detection
        whose overlap ratio exceeds ``threshold``; otherwise a fresh UUID
        paired with ``UndefinedBlock``.

    Raises:
        AssertionError: If either box has its corners in the wrong order.
    """
    x1, y1, x2, y2 = coordinates
    # Area of the text box itself; loop-invariant, so computed once.
    detection_area = max(0, x2 - x1) * max(0, y2 - y1)
    for det in layout:
        X1, Y1, X2, Y2 = det.bbox.to_numpy()
        assert x1 <= x2 and y1 <= y2, "bbox1 coordinates are invalid"
        assert X1 <= X2 and Y1 <= Y2, "bbox2 coordinates are invalid"
        # Intersection rectangle of the two boxes (empty -> zero area).
        inter_width = max(0, min(x2, X2) - max(x1, X1))
        inter_height = max(0, min(y2, Y2) - max(y1, Y1))
        inter_area = inter_width * inter_height
        # A degenerate (zero-area) text box cannot overlap anything; guarding
        # here avoids a division by zero.
        if detection_area > 0 and inter_area / detection_area > threshold:
            return (det.bbox_id, block_cls_map[det.label])
    return (uuid.uuid4(), UndefinedBlock)
def __to_elements_list(
    self, doctr_document: Document, layouts: List[List[LayoutDetectionOutput]]
) -> MPDocument:
    """Convert a doctr document into an MPDocument using the page layouts.

    For each page, every recognized line is assigned to a layout detection
    through ``_get_block_cls``. Lines that map to the same id are merged into
    one block (texts joined by a newline, bounding boxes unioned). Detections
    labelled 6 or 8 are then added as their own blocks, and the blocks of the
    page are ordered by the ``y`` of their top-left corner.

    Raises:
        ValueError: If a doctr block holds both lines and artefacts.
    """
    all_blocks = []
    page_pairs = zip(doctr_document.pages, layouts, strict=True)
    for page_number, (page, layout) in enumerate(page_pairs):
        page_blocks = {}
        span = (page_number, page_number)
        for block in page.blocks:
            if len(block.lines) > 0 and len(block.artefacts) > 0:
                raise ValueError(
                    "Block should not contain both lines and artefacts"
                )
            for line in block.lines:
                # Bounding box of the line = envelope of its word geometries.
                geometries = [word.geometry for word in line.words]
                x0 = min(geom[0][0] for geom in geometries)
                y0 = min(geom[0][1] for geom in geometries)
                x1 = max(geom[1][0] for geom in geometries)
                y1 = max(geom[1][1] for geom in geometries)
                block_id, block_cls = self._get_block_cls(
                    coordinates=(x0, y0, x1, y1), layout=layout
                )
                if block_id in page_blocks:
                    # Extend the block already opened for this detection.
                    existing = page_blocks[block_id]
                    bbx0, bby0, bbx1, bby1 = existing.bbox.to_numpy()
                    existing.text += "\n" + line.render()
                    existing.bbox = BBOX(
                        top_left=Point2D(x=min(x0, bbx0), y=min(y0, bby0)),
                        bottom_right=Point2D(x=max(x1, bbx1), y=max(y1, bby1)),
                    )
                elif issubclass(block_cls, TextBlock):
                    # Lines whose class is not a TextBlock subclass are dropped.
                    page_blocks[block_id] = block_cls(
                        text=line.render(),
                        bbox=BBOX(
                            top_left=Point2D(x=x0, y=y0),
                            bottom_right=Point2D(x=x1, y=y1),
                        ),
                        metadata={},
                        page_range=span,
                    )
        # We add the Image Blocks to the MPDocument with the right order
        for det in layout:
            if det.label not in (6, 8):
                continue
            x0, y0, x1, y1 = det.bbox.to_numpy()
            page_blocks[uuid.uuid4()] = block_cls_map[det.label](
                bbox=BBOX(
                    top_left=Point2D(x=x0, y=y0),
                    bottom_right=Point2D(x=x1, y=y1),
                ),
                metadata={},
                page_range=span,
            )
        all_blocks.extend(
            sorted(page_blocks.values(), key=lambda blk: blk.bbox.top_left.y)
        )
    return MPDocument(
        metadata={},
        content=all_blocks,
        detection_origin="doctr",
    )
================================================
FILE: libs/megaparse/src/megaparse/parser/entity.py
================================================
from enum import Enum
from typing import List, Optional
class TagEnum(str, Enum):
    """Tags that may wrap a section of the parsed file.

    Members are plain strings, so they compare equal to their literal value.
    """

    # Tabular content
    TABLE = "TABLE"
    # Table of contents
    TOC = "TOC"
    # Page header
    HEADER = "HEADER"
    # Image
    IMAGE = "IMAGE"
class SupportedModel(Enum):
    """Model families accepted by the vision parser.

    Each member's value is ``(model_name, supported_releases)``, where
    ``supported_releases`` is either ``None`` (any suffix is accepted) or the
    list of release suffixes that are accepted.
    """

    GPT_4O = ("gpt-4o", None)
    GPT_4O_TURBO = ("gpt-4o-turbo", None)
    CLAUDE_3_5_SONNET = ("claude-3-5-sonnet", ["latest", "20241022"])
    CLAUDE_3_OPUS = ("claude-3-opus", ["latest", "20240229"])

    def __init__(self, model_name: str, supported_releases: Optional[List[str]]):
        self.model_name = model_name
        self.supported_releases = supported_releases

    @classmethod
    def is_supported(cls, model_name: str) -> bool:
        """Tell whether ``model_name`` belongs to a supported family and release.

        The first member whose name is a prefix of ``model_name`` decides the
        outcome; later members are not consulted.
        """
        for candidate in cls:
            prefix = candidate.model_name
            if not model_name.startswith(prefix):
                continue
            if candidate.supported_releases is None:
                # Family without release restrictions.
                return True
            # Whatever follows the family name, minus the separating dashes.
            suffix = model_name[len(prefix) :].lstrip("-")
            return bool(suffix) and suffix in candidate.supported_releases
        return False
================================================
FILE: libs/megaparse/src/megaparse/parser/llama.py
================================================
from pathlib import Path
from typing import IO, List
from llama_index.core.schema import Document as LlamaDocument
from llama_parse import LlamaParse as _LlamaParse
from llama_parse.utils import Language, ResultType
from megaparse_sdk.schema.document import BBOX, Point2D, TextBlock
from megaparse_sdk.schema.document import Document as MPDocument
from megaparse_sdk.schema.extensions import FileExtension
from megaparse.parser import BaseParser
class LlamaParser(BaseParser):
    """Parser that delegates PDF parsing to the LlamaParse service."""

    supported_extensions = [FileExtension.PDF]

    def __init__(
        self,
        api_key: str,
        verbose=True,
        language: Language = Language.FRENCH,
        parsing_instruction: str | None = None,
        **kwargs,
    ) -> None:
        """Store the LlamaParse settings.

        :param api_key: API key passed to the LlamaParse client
        :param verbose: Verbosity flag passed to the LlamaParse client
        :param language: Language passed to the LlamaParse client
        :param parsing_instruction: Custom instruction; a default one is used
            when this is ``None`` or empty
        """
        self.api_key = api_key
        self.verbose = verbose
        self.language = language
        self.parsing_instruction = parsing_instruction or (
            "Do not take into account the page breaks (no --- between pages),\n"
            "do not repeat the header and the footer so the tables are merged if needed. "
            "Keep the same format for similar tables."
        )

    def _build_client(self) -> _LlamaParse:
        """Create the LlamaParse client shared by the sync and async paths."""
        # Both entry points put ``page.text`` into a TextBlock, so both request
        # markdown. The sync path used to request ResultType.JSON, which did
        # not match the async path.
        return _LlamaParse(
            api_key=self.api_key,
            result_type=ResultType.MD,
            gpt4o_mode=True,
            verbose=self.verbose,
            language=self.language,
            parsing_instruction=self.parsing_instruction,
        )

    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: None | FileExtension = None,
        **kwargs,
    ) -> MPDocument:
        """Parse ``file_path`` asynchronously.

        :param file_path: Path of the file to parse (required)
        :param file: Unused by this parser
        :param file_extension: Extension forwarded to ``check_supported_extension``
        :raises ValueError: If ``file_path`` is not provided
        """
        if not file_path:
            raise ValueError("File_path should be provided to run LlamaParser")
        self.check_supported_extension(file_extension, file_path)
        documents: List[LlamaDocument] = await self._build_client().aload_data(
            str(file_path)
        )
        return self.__to_elements_list__(documents)

    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: None | FileExtension = None,
        **kwargs,
    ) -> MPDocument:
        """Parse ``file_path`` synchronously.

        :param file_path: Path of the file to parse (required)
        :param file: Unused by this parser
        :param file_extension: Extension forwarded to ``check_supported_extension``
        :raises ValueError: If ``file_path`` is not provided
        """
        if not file_path:
            raise ValueError("File_path should be provided to run LlamaParser")
        self.check_supported_extension(file_extension, file_path)
        documents: List[LlamaDocument] = self._build_client().load_data(
            str(file_path)
        )
        return self.__to_elements_list__(documents)

    def __to_elements_list__(self, llama_doc: List[LlamaDocument]) -> MPDocument:
        """Wrap each returned LlamaParse document in a full-page TextBlock."""
        list_blocks = [
            TextBlock(
                text=page.text,
                metadata={},
                page_range=(i, i + 1),
                # No geometry is available here: use the unit box.
                bbox=BBOX(top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)),
            )
            for i, page in enumerate(llama_doc)
        ]
        return MPDocument(
            metadata={},
            detection_origin="llamaparse",
            content=list_blocks,
        )
================================================
FILE: libs/megaparse/src/megaparse/parser/megaparse_vision.py
================================================
import asyncio
import base64
import re
from io import BytesIO
from pathlib import Path
from typing import IO, List
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage
from megaparse_sdk.schema.document import BBOX, Block, Point2D, TextBlock
from megaparse_sdk.schema.document import Document as MPDocument
from megaparse_sdk.schema.extensions import FileExtension
from pdf2image import convert_from_path
from megaparse.parser import BaseParser
from megaparse.parser.entity import SupportedModel, TagEnum
# BASE_OCR_PROMPT = """
# Transcribe the content of this file into markdown. Be mindful of the formatting.
# Add formatting if you think it is not clear.
# Do not include page breaks and merge content of tables if it is continued in the next page.
# Add tags around what you identify as a table [TABLE], header - complete chain of characters that are repeated at each start of pages - [HEADER], table of content [TOC] in the format '[tag] ... [/tag]'
# Return only the parsed content.
# """
# Instruction text sent to the vision model ahead of the page images. It asks
# for a markdown transcription in which tables, repeated page headers and the
# table of contents are wrapped in [TABLE], [HEADER] and [TOC] tags.
# NOTE(review): the step numbering jumps from 8 to 10 — confirm whether a step
# was dropped or the last item should be renumbered.
BASE_OCR_PROMPT = """
You are tasked with transcribing and formatting the content of a file into markdown. Your goal is to create a well-structured, readable markdown document that accurately represents the original content while adding appropriate formatting and tags.
Follow these instructions to complete the task:
1. Carefully read through the entire file content.
2. Transcribe the content into markdown format, paying close attention to the existing formatting and structure.
3. If you encounter any unclear formatting in the original content, use your judgment to add appropriate markdown formatting to improve readability and structure.
4. For tables, headers, and table of contents, add the following tags:
- Tables: Enclose the entire table in [TABLE] and [/TABLE] tags. Merge content of tables if it is continued in the next page.
- Headers (complete chain of characters repeated at the start of each page): Enclose in [HEADER] and [/HEADER] tags inside the markdown file.
- Table of contents: Enclose in [TOC] and [/TOC] tags
5. When transcribing tables:
- If a table continues across multiple pages, merge the content into a single, cohesive table.
- Use proper markdown table formatting with pipes (|) and hyphens (-) for table structure.
6. Do not include page breaks in your transcription.
7. Maintain the logical flow and structure of the document, ensuring that sections and subsections are properly formatted using markdown headers (# for main headers, ## for subheaders, etc.).
8. Use appropriate markdown syntax for other formatting elements such as bold, italic, lists, and code blocks as needed.
10. Return only the parsed content in markdown format, including the specified tags for tables, headers, and table of contents.
"""
class MegaParseVision(BaseParser):
    """PDF parser that sends page images to a vision-capable chat model."""

    supported_extensions = [FileExtension.PDF]

    def __init__(self, model: BaseChatModel, **kwargs):
        """
        :param model: Chat model used for transcription. When it exposes a
            ``model_name`` attribute, that name must be accepted by
            ``SupportedModel.is_supported``.
        :raises ValueError: If the model name is not supported
        """
        if hasattr(model, "model_name"):
            if not SupportedModel.is_supported(model.model_name):
                raise ValueError(
                    f"Invald model name, MegaParse vision only supports model that have vision capabilities. "
                    f"{model.model_name} is not supported."
                )
        self.model = model
        # Raw model responses of the last conversion, one per batch of pages.
        self.parsed_chunks: list[str] | None = None

    def process_file(self, file_path: str, image_format: str = "PNG") -> List[str]:
        """
        Process a PDF file and convert its pages to base64 encoded images.
        :param file_path: Path to the PDF file
        :param image_format: Format to save the images (default: PNG)
        :return: List of base64 encoded images
        :raises ValueError: If the conversion or the encoding fails
        """
        try:
            images = convert_from_path(file_path)
            images_base64 = []
            for image in images:
                buffered = BytesIO()
                image.save(buffered, format=image_format)
                image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
                images_base64.append(image_base64)
            return images_base64
        except Exception as e:
            # Chain the original exception so its traceback is not lost.
            raise ValueError(f"Error processing PDF file: {str(e)}") from e

    def get_element(self, tag: TagEnum, chunk: str):
        """
        Extract the stripped contents of every ``[tag]...[/tag]`` section.
        :param tag: Tag to look for
        :param chunk: Text to search
        :return: List of section contents (empty when the tag is absent)
        """
        pattern = rf"\[{tag.value}\]([\s\S]*?)\[/{tag.value}\]"
        all_elmts = re.findall(pattern, chunk)
        if not all_elmts:
            print(f"No {tag.value} found in the chunk")
            return []
        return [elmt.strip() for elmt in all_elmts]

    @staticmethod
    def _image_data_url(image_data: str) -> str:
        """Build a data URL whose media type matches the encoded image."""
        # ``process_file`` encodes pages as PNG by default, yet the URL used to
        # always announce JPEG. Base64 of the PNG file signature starts with
        # "iVBORw0KGg"; anything else keeps the previous "image/jpeg" label.
        mime = "image/png" if image_data.startswith("iVBORw0KGg") else "image/jpeg"
        return f"data:{mime};base64,{image_data}"

    def _build_message(self, images_data: List[str]) -> HumanMessage:
        """Assemble the prompt plus images message shared by sync and async calls."""
        images_prompt = [
            {
                "type": "image_url",
                "image_url": {"url": self._image_data_url(image_data)},
            }
            for image_data in images_data
        ]
        return HumanMessage(
            content=[
                {"type": "text", "text": BASE_OCR_PROMPT},
                *images_prompt,
            ],
        )

    async def asend_to_mlm(self, images_data: List[str]) -> str:
        """
        Send images to the language model for processing.
        :param images_data: List of base64 encoded images
        :return: Processed content as a string
        """
        response = await self.model.ainvoke([self._build_message(images_data)])
        return str(response.content)

    def send_to_mlm(self, images_data: List[str]) -> str:
        """
        Send images to the language model for processing.
        :param images_data: List of base64 encoded images
        :return: Processed content as a string
        """
        response = self.model.invoke([self._build_message(images_data)])
        return str(response.content)

    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        batch_size: int = 3,
        **kwargs,
    ) -> MPDocument:
        """
        Parse a PDF file and process its content using the language model.
        :param file_path: Path to the PDF file
        :param batch_size: Number of pages sent to the model per request;
            all requests are awaited concurrently
        :return: Document holding the cleaned transcription
        :raises ValueError: If ``file_path`` is not provided
        """
        if not file_path:
            raise ValueError("File_path should be provided to run MegaParseVision")
        if isinstance(file_path, Path):
            file_path = str(file_path)
        self.check_supported_extension(file_extension, file_path)
        pdf_base64 = self.process_file(file_path)
        n_pages = len(pdf_base64)
        tasks = [
            self.asend_to_mlm(pdf_base64[i : i + batch_size])
            for i in range(0, len(pdf_base64), batch_size)
        ]
        self.parsed_chunks = await asyncio.gather(*tasks)
        responses = self.get_cleaned_content("\n".join(self.parsed_chunks))
        return self.__to_elements_list__(responses, n_pages=n_pages)

    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        batch_size: int = 3,
        **kwargs,
    ) -> MPDocument:
        """
        Parse a PDF file and process its content using the language model.
        :param file_path: Path to the PDF file
        :param batch_size: Number of pages to process at a time
        :return: Document holding the cleaned transcription
        :raises ValueError: If ``file_path`` is not provided
        """
        if not file_path:
            raise ValueError("File_path should be provided to run MegaParseVision")
        if isinstance(file_path, Path):
            file_path = str(file_path)
        self.check_supported_extension(file_extension, file_path)
        pdf_base64 = self.process_file(file_path)
        n_pages = len(pdf_base64)
        chunks = [
            pdf_base64[i : i + batch_size]
            for i in range(0, len(pdf_base64), batch_size)
        ]
        self.parsed_chunks = []
        for chunk in chunks:
            response = self.send_to_mlm(chunk)
            self.parsed_chunks.append(response)
        responses = self.get_cleaned_content("\n".join(self.parsed_chunks))
        return self.__to_elements_list__(responses, n_pages)

    def get_cleaned_content(self, parsed_file: str) -> str:
        """
        Get cleaned parsed file without any tags defined in TagEnum.
        This method removes all tags from TagEnum from the parsed file, formats the content,
        and handles the HEADER tag specially by keeping only the first occurrence.
        Args:
            parsed_file (str): The parsed file content with tags.
        Returns:
            str: The cleaned content without TagEnum tags.
        """
        tag_pattern = "|".join(map(re.escape, TagEnum.__members__.values()))
        tag_regex = rf"\[({tag_pattern})\](.*?)\[/\1\]"
        # handle the HEADER tag specially
        header_pattern = rf"\[{TagEnum.HEADER.value}\](.*?)\[/{TagEnum.HEADER.value}\]"
        headers = re.findall(header_pattern, parsed_file, re.DOTALL)
        if headers:
            first_header = headers[0].strip()
            # Remove all HEADER tags and their content
            parsed_file = re.sub(header_pattern, "", parsed_file, flags=re.DOTALL)
            # Add the first header back at the beginning
            parsed_file = f"{first_header}\n{parsed_file}"

        # Remove all other tags, keeping the text they enclose
        def remove_tag(match):
            return match.group(2)

        cleaned_content = re.sub(tag_regex, remove_tag, parsed_file, flags=re.DOTALL)
        # Drop markdown code-fence lines
        cleaned_content = re.sub(r"^```.*$\n?", "", cleaned_content, flags=re.MULTILINE)
        # Collapse blank-line runs into a single blank line
        cleaned_content = re.sub(r"\n\s*\n", "\n\n", cleaned_content)
        # Re-join table rows separated by a blank line
        cleaned_content = cleaned_content.replace("|\n\n|", "|\n|")
        cleaned_content = cleaned_content.strip()
        return cleaned_content

    def __to_elements_list__(self, mpv_doc: str, n_pages: int) -> MPDocument:
        """Wrap the whole transcription in a single TextBlock covering all pages."""
        list_blocks: List[Block] = [
            TextBlock(
                text=mpv_doc,
                metadata={},
                page_range=(0, n_pages - 1),
                bbox=BBOX(top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)),
            )
        ]
        return MPDocument(
            metadata={},
            detection_origin="megaparse_vision",
            content=list_blocks,
        )
================================================
FILE: libs/megaparse/src/megaparse/parser/unstructured_parser.py
================================================
import warnings
from pathlib import Path
from typing import IO, Dict, List
from dotenv import load_dotenv
from langchain_core.language_models.chat_models import BaseChatModel
from megaparse_sdk.schema.document import (
BBOX,
Block,
FooterBlock,
HeaderBlock,
ImageBlock,
Point2D,
SubTitleBlock,
TableBlock,
TextBlock,
TitleBlock,
)
from megaparse_sdk.schema.document import (
Document as MPDocument,
)
from megaparse_sdk.schema.extensions import FileExtension
from megaparse_sdk.schema.parser_config import StrategyEnum
from unstructured.documents.elements import Element
from unstructured.partition.auto import partition
from megaparse.parser import BaseParser
load_dotenv()
class UnstructuredParser(BaseParser):
    """Parser built on ``unstructured.partition.auto.partition``."""

    supported_extensions = [
        FileExtension.PDF,
        FileExtension.DOCX,
        FileExtension.TXT,
        FileExtension.OTF,
        FileExtension.EPUB,
        FileExtension.HTML,
        FileExtension.XML,
        FileExtension.CSV,
        FileExtension.XLSX,
        FileExtension.XLS,
        FileExtension.PPTX,
        FileExtension.MD,
        FileExtension.MARKDOWN,
    ]

    # Block class emitted for each element category. Categories missing from
    # this table are dropped from the resulting document.
    _BLOCK_CLASSES: Dict[str, type] = {
        "Title": TitleBlock,
        "Subtitle": SubTitleBlock,
        "Header": HeaderBlock,
        "Footer": FooterBlock,
        "NarrativeText": TextBlock,
        # FIXME: @chloedia, list item need to be handled differently in ListBlock
        "ListItem": TextBlock,
        "Table": TableBlock,
        "Image": ImageBlock,
        "Formula": TextBlock,
        "FigureCaption": TextBlock,
        "Address": TextBlock,
        "EmailAddress": TextBlock,
        "CodeSnippet": TextBlock,
        "UncategorizedText": TextBlock,
    }

    def __init__(
        self, strategy=StrategyEnum.AUTO, model: BaseChatModel | None = None, **kwargs
    ):
        """Store the partition strategy and the optional chat model."""
        self.strategy = strategy
        self.model = model

    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> MPDocument:
        """Partition the input and convert the elements to an MPDocument.

        Args:
            file_path: Path of the file to parse, if any.
            file: Binary stream of the file to parse, if any.
            file_extension: Extension used for the support check and, when
                given, to pass its mimetype as ``content_type``.
        """
        self.check_supported_extension(file_extension, file_path)
        elements = partition(
            filename=str(file_path) if file_path else None,
            file=file,
            strategy=self.strategy,
            content_type=file_extension.mimetype if file_extension else None,
        )
        return self.__to_mp_document(elements)

    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> MPDocument:
        """Async facade over ``convert``; emits a ``UserWarning`` and runs it synchronously."""
        self.check_supported_extension(file_extension, file_path)
        warnings.warn(
            "The UnstructuredParser is a sync parser, please use the sync convert method",
            UserWarning,
            stacklevel=2,
        )
        return self.convert(file_path, file, file_extension, **kwargs)

    def __to_mp_document(self, elements: List[Element]) -> MPDocument:
        """Convert elements to blocks, skipping those without a mapped category."""
        text_blocks = []
        for element in elements:
            block = self.__convert_element_to_block(element)
            if block:
                text_blocks.append(block)
        return MPDocument(
            content=text_blocks, metadata={}, detection_origin="unstructured"
        )

    def __convert_element_to_block(self, element: Element) -> Block | None:
        """Build the block matching ``element.category``.

        Only the block that is actually returned gets constructed (the previous
        implementation instantiated one block per known category for every
        element and discarded all but one).

        Returns:
            The block for the element, or ``None`` for an unmapped category.
        """
        block_cls = self._BLOCK_CLASSES.get(element.category)
        if block_cls is None:
            return None

        metadata = element.metadata
        page_range = (
            (metadata.page_number, metadata.page_number)
            if metadata.page_number
            else None
        )

        bbox = None
        coordinates = metadata.coordinates
        if coordinates and coordinates.points:
            points = coordinates.points
            # NOTE(review): points[3] is used as the bottom-right corner, as in
            # the original code — confirm against the point ordering that
            # unstructured documents (it may be points[2]).
            bbox = BBOX(
                top_left=Point2D(x=points[0][0], y=points[0][1]),
                bottom_right=Point2D(x=points[3][0], y=points[3][1]),
            )

        extra = {}
        if block_cls is SubTitleBlock:
            # Only sub-titles carry a depth; default to 0 when it is missing.
            extra["depth"] = metadata.category_depth or 0

        return block_cls(
            text=element.text,
            metadata={},
            page_range=page_range,
            bbox=bbox,
            **extra,
        )
================================================
FILE: libs/megaparse/src/megaparse/predictor/layout_predictor.py
================================================
from PIL import Image
from unstructured_inference.inference.layout import PageLayout
from unstructured_inference.models.base import get_model
from unstructured_inference.visualize import draw_bbox
def extract_layout(
    page_number: int, page_image: Image.Image, model_name: str = "yolox"
) -> PageLayout:
    """Detect the layout of a page image and display it with red boxes.

    Args:
        page_number: Number given to the resulting ``PageLayout``.
        page_image: Image of the page to analyse.
        model_name: Name passed to ``get_model`` to obtain the detection model.

    Returns:
        The ``PageLayout`` built from the image.
    """
    detector = get_model(model_name)
    layout = PageLayout.from_image(
        image=page_image,
        number=page_number,
        detection_model=detector,
        element_extraction_model=None,
        fixed_layout=None,
    )
    # Draw every detected element in red, then show the annotated image.
    annotated = page_image
    for element in layout.elements:
        annotated = draw_bbox(annotated, element, color="red", details=False)
    annotated.show()
    return layout
================================================
FILE: libs/megaparse/src/megaparse/utils/extract_metadata.py
================================================
from typing import Any, Dict
import pypdfium2 as pdfium
def get_doc_metdata(pdfium_document: pdfium.PdfDocument) -> Dict[str, Any]:
    """Placeholder for document metadata extraction.

    Not implemented yet: the function has no body besides this docstring and
    therefore returns ``None`` despite the annotated return type.
    """
================================================
FILE: libs/megaparse/src/megaparse/utils/onnx.py
================================================
import logging
from typing import List
import onnxruntime as rt
from megaparse.configs.auto import DeviceEnum
logger = logging.getLogger("megaparse")
def get_providers(device: DeviceEnum) -> List[str]:
    """Return the onnxruntime execution provider list for ``device``.

    Raises:
        ValueError: If the provider required by CUDA or CoreML is not
            reported by onnxruntime, or if ``device`` is none of CUDA,
            CoreML and CPU.
    """
    available = rt.get_available_providers()
    logger.info("Available providers: %s", available)

    # Accelerated devices need their provider to be reported as available.
    accelerated = (
        (DeviceEnum.CUDA, "CUDAExecutionProvider"),
        (DeviceEnum.COREML, "CoreMLExecutionProvider"),
    )
    for candidate, provider in accelerated:
        if device == candidate:
            if provider not in available:
                raise ValueError(
                    f"onnxruntime can't find {provider} in list of available providers"
                )
            return [provider]

    if device == DeviceEnum.CPU:
        return ["CPUExecutionProvider"]
    raise ValueError("device not in (CUDA,CoreML,CPU)")
================================================
FILE: libs/megaparse/src/megaparse/utils/strategy.py
================================================
from typing import List
import numpy as np
from megaparse.models.page import Page
from megaparse_sdk.schema.document import TextDetection
from megaparse_sdk.schema.parser_config import StrategyEnum
from pypdfium2._helpers.page import PdfPage
def get_page_strategy(
    pdfium_page: PdfPage, onnxtr_page: TextDetection | None, threshold: float
) -> StrategyEnum:
    """Choose a parsing strategy by comparing embedded objects with detected text.

    Two binary masks of the page size are painted: one from the positions of
    the pdfium page objects whose ``type`` is 1, one from the detected text
    boxes. Their intersection-over-union is compared with ``threshold``.

    Returns:
        ``StrategyEnum.FAST`` when there is no detection or when the IoU is at
        least ``threshold``; ``StrategyEnum.HI_RES`` otherwise.
    """
    if onnxtr_page is None:
        return StrategyEnum.FAST

    def clamp(value: int, upper: int) -> int:
        # Keep an index inside [0, upper].
        return max(0, min(upper, value))

    # Positions of the page objects of type 1
    # (presumably pdfium text objects — TODO confirm the constant).
    object_positions = [
        obj.get_pos()
        for obj in pdfium_page.get_objects()
        if obj.type == 1  # type: ignore
    ]

    p_width = int(pdfium_page.get_width())
    p_height = int(pdfium_page.get_height())

    pdfium_canva = np.zeros((p_height, p_width))
    for pos in object_positions:
        # pos is read as (left, bottom, right, top); rows are counted from the
        # top of the canvas, hence the flip against the page height.
        left, bottom, right, top = pos[0], pos[1], pos[2], pos[3]
        row_start = clamp(int(p_height - top), p_height)
        col_start = clamp(int(left), p_width)
        row_end = clamp(int(p_height - bottom), p_height)
        col_end = clamp(int(right), p_width)
        pdfium_canva[row_start:row_end, col_start:col_end] = 1

    onnxtr_canva = np.zeros((p_height, p_width))
    for block in onnxtr_page.bboxes:
        # Box corners are scaled by the page size before being painted.
        rel_x0, rel_y0 = block.bbox[0]
        rel_x1, rel_y1 = block.bbox[1]
        col_start = clamp(int(rel_x0 * p_width), p_width)
        row_start = clamp(int(rel_y0 * p_height), p_height)
        col_end = clamp(int(rel_x1 * p_width), p_width)
        row_end = clamp(int(rel_y1 * p_height), p_height)
        onnxtr_canva[row_start:row_end, col_start:col_end] = 1

    overlap = np.sum(np.logical_and(pdfium_canva, onnxtr_canva))
    covered = np.sum(np.logical_or(pdfium_canva, onnxtr_canva))
    iou = overlap / covered if covered != 0 else 0
    return StrategyEnum.HI_RES if iou < threshold else StrategyEnum.FAST
def determine_global_strategy(pages: List[Page], threshold: float) -> StrategyEnum:
    """Pick one strategy for a whole document from its per-page strategies.

    Args:
        pages: Pages whose ``strategy`` attribute has already been set.
        threshold: Fraction of HI_RES pages that must be strictly exceeded
            for the document to be treated as HI_RES.

    Returns:
        ``StrategyEnum.HI_RES`` when the share of HI_RES pages is greater than
        ``threshold``; ``StrategyEnum.FAST`` otherwise, including when
        ``pages`` is empty.
    """
    if not pages:
        # No page asked for HI_RES; also avoids dividing by zero below.
        return StrategyEnum.FAST

    hi_res_count = sum(1 for page in pages if page.strategy == StrategyEnum.HI_RES)
    if hi_res_count / len(pages) > threshold:
        return StrategyEnum.HI_RES
    return StrategyEnum.FAST
================================================
FILE: libs/megaparse/tests/__init__.py
================================================
================================================
FILE: libs/megaparse/tests/certs/client-cert.pem
================================================
-----BEGIN CERTIFICATE-----
MIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw
gZ0xHjAcBgNVBAoTFW1rY2VydCBkZXZlbG9wbWVudCBDQTE5MDcGA1UECwwwYW1p
bmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChhbWluZSBkaXJob3Vzc2kpMUAw
PgYDVQQDDDdta2NlcnQgYW1pbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChh
bWluZSBkaXJob3Vzc2kpMB4XDTI0MTExOTEwNDgwN1oXDTI3MDIxOTEwNDgwN1ow
ZDEnMCUGA1UEChMebWtjZXJ0IGRldmVsb3BtZW50IGNlcnRpZmljYXRlMTkwNwYD
VQQLDDBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhv
dXNzaSkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC2fDlGlKYIj8bp
tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5
KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH
qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN
gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8
ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT
WWVVcNfJAgMBAAGjgZowgZcwDgYDVR0PAQH/BAQDAgWgMCcGA1UdJQQgMB4GCCsG
AQUFBwMCBggrBgEFBQcDAQYIKwYBBQUHAwQwHwYDVR0jBBgwFoAUV2w3gvQM5La1
2fk80tJXoM/14l4wOwYDVR0RBDQwMoIJbG9jYWxob3N0gRNtZWdhcGFyc2VAcXVp
dnIuYXBwhxAAAAAAAAAAAAAAAAAAAAABMA0GCSqGSIb3DQEBCwUAA4IBgQAYq4VZ
6spwGvcqg8kCOghu6o54UPYo/NLzh3oYewJnDJ+2XD786TpTgjZMGA6Ms+det6oV
HdT5s77VFgJiJloHlD0fpKkRxjzyBOk5/bQcCKkTMBVfgJbMoAfa2gq+/7zxmLcn
AmNg7BkmsTtHWPsLyN3rYI4dkkDKWkxp8Sezm9WPEa9OGJDJSYf4Dq9pN1lUoP1p
vxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW
0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9
ze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr
drdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7
/E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw=
-----END CERTIFICATE-----
================================================
FILE: libs/megaparse/tests/certs/client-key.pem
================================================
-----BEGIN PRIVATE KEY-----
MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp
tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5
KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH
qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN
gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8
ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT
WWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU
QYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj
rGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj
BkEzj7CbaAGtW9vTamPJG/+wgaaBcPhplQk4cD2mjdaMLfGQXNXiYgp09kf0hJ2k
0QbsQBC85bMSfmPAsoTRLxi94S12at3SABgF0oOCy9FZs/sWsdJRI6nbfvZ3C4xo
8y+rH7Yaej7AYK+jbU3Uk/1473cuCAnNKg65UyU4+gECgYEA2/ZQYRDU3JWNHQGy
dJXZRl6hSFCw9y9RUc/QjcRs+VlnXE5UK1eLwfcKh0YYRhIWSE8z3mZmK09M/FG0
xbU4qIZbDYcAI2nCiUeT8HmTjVSPMS1oWZrt7rh00gcyoLQt2TUS3bo2tsmdPyWW
OgEiYfb4MoG/KCdYlACE6O4GMMECgYEA1GIMIHM2x4B1wgLnKeI3X2wYWuYCHtFB
Px56GUFTZytBsHghxtovVlLh88FNS5rthvXuE0FHE9RljKhZaNgqrPOrlAZSuv18
vK7RmG/NPJl2osbs677a/xoxNuVkfrRcxl4cvYOBL5huHo1D5sOitGFW+IlscgWY
nWzXlY7AYQkCgYA6H96hp7b4CzTc42Pq1uYxaDQqTdhVmVVdzxKHQ86gHXXouHIZ
eereeI95q5YifgkRVoyYSmrZKv1m95hTXk34inhpHLF2qi3T5Ow88YOCJ0QndJ5M
f1o8aNXF4k0IllQ/P30axmhK6P/6fc4yybXyOTbg8dQ3oh4QDgsRGkTcgQKBgQCG
qLgJpyN3cPK5FYAeJUl4nh//GlED2yekbp15/9py0pFu42x/GX3kHN8Y31oz8sJh
zPKrkLsRTp0ohuFRwaWlTUZfr3arCugY9jr8jP6zSpZW9QvpGXTfRGsp5F5Im/Eq
8ScF3ih91gcUJfuEiExUVFeBdBinXvb58bXrJLzDiQKBgG+Z06uj2dWxtK4nqJvP
HllTocAGVm+fEmupVsLU6ksVVrOl8O9TapMbY8pUj9J5oBYJvY+KFGoIoxYwhZrz
4NqY7iv8w+LQ7mQIwcQ4B67pDAQMJZTShR5v57FlAZldP5UpE5ASt22isBW31sYI
1OaXIqrCA/V43NydDezh0ylQ
-----END PRIVATE KEY-----
================================================
FILE: libs/megaparse/tests/conftest.py
================================================
from pathlib import Path
from typing import IO
import pytest_asyncio
from httpx import ASGITransport, AsyncClient
from langchain_community.document_loaders import PlaywrightURLLoader
from langchain_core.documents import Document
from megaparse.api.app import app, get_playwright_loader, parser_builder_dep
from megaparse.parser.base import BaseParser
from megaparse_sdk.schema.document import Document as MPDocument
from megaparse_sdk.schema.document import TextBlock
from megaparse_sdk.schema.extensions import FileExtension
class FakeParserBuilder:
    """Test double for the parser builder; hands out a parser with a canned result."""

    def build(self, *args, **kwargs) -> BaseParser:
        """
        Build a fake parser, ignoring every argument.

        Returns:
            BaseParser: A parser whose ``convert`` and ``aconvert`` both print a
            message and return the same hard-coded document.
        """

        def _canned_document() -> MPDocument:
            # Single source for the document returned by both entry points.
            return MPDocument(
                file_name="Fake file",
                content=[TextBlock(text="Fake conversion result", metadata={})],
                metadata={},
                detection_origin="fakeparser",
            )

        class FakeParser(BaseParser):
            def convert(
                self,
                file_path: str | Path | None = None,
                file: IO[bytes] | None = None,
                file_extension: None | FileExtension = None,
                **kwargs,
            ) -> MPDocument:
                print("Fake parser is converting the file")
                return _canned_document()

            async def aconvert(
                self,
                file_path: str | Path | None = None,
                file: IO[bytes] | None = None,
                file_extension: None | FileExtension = None,
                **kwargs,
            ) -> MPDocument:
                print("Fake parser is converting the file")
                return _canned_document()

        return FakeParser()
@pytest_asyncio.fixture(scope="function")
async def test_client():
    """Yield an ``AsyncClient`` bound to ``app`` with its dependencies stubbed.

    The parser-builder and Playwright-loader dependencies are overridden
    before the client is created; the overrides are reset to an empty dict
    once the client context has exited.
    """
    print("Setting up test_client fixture")

    class FakePlaywrightLoader(PlaywrightURLLoader):
        async def aload(self):
            return [Document(page_content="Fake website content")]

    def fake_parser_builder():
        return FakeParserBuilder()

    def fake_playwright_loader():
        return FakePlaywrightLoader(urls=[], remove_selectors=["header", "footer"])

    app.dependency_overrides.update(
        {
            parser_builder_dep: fake_parser_builder,
            get_playwright_loader: fake_playwright_loader,
        }
    )

    transport = ASGITransport(app=app)  # type: ignore
    async with AsyncClient(transport=transport, base_url="http://test") as client:
        yield client
    app.dependency_overrides = {}
================================================
FILE: libs/megaparse/tests/data/grt_example/MegaFake_report.md
================================================
| My Mega fake report | #1756394 | 31/05/2024 |
|---------------------|----------|------------|
## Why Mega Parse might be the best ?
### Introduction
Mega Parse is a state-of-the-art document parser designed to convert various document formats such as PDF, DOCX, PPTX, and more into Markdown (MD) format, making them ready for Retrieval-Augmented Generation (RAG) ingestion. In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, benefits, and comparative performance of Mega Parse, illustrating why it stands out as a superior tool in the realm of document parsing.
### Features of Mega Parse
Mega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises.
**Multiple Format Support:** Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered.
**High-Speed Processing:** One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion.
**Markdown Output:** Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount.
Accuracy: Accuracy in text extraction and formatting is a critical aspect of any document parser. Mega Parse ensures high accuracy, maintaining the integrity and structure of the original documents. This is particularly important for documents that contain complex formatting and embedded elements.
Customizable Parsing Rules: Users can define custom parsing rules to suit specific needs, allowing for greater control over the conversion process. This flexibility ensures that Mega Parse can be adapted to a wide variety of use cases.
Batch Processing: Mega Parse supports batch processing, enabling the simultaneous conversion of multiple documents. This feature is particularly useful for organizations dealing with large volumes of documents, as it streamlines the workflow and saves time.
Error Handling: Advanced error handling capabilities ensure that any issues encountered during the conversion process are managed effectively, minimizing disruptions and maintaining workflow efficiency.
# Benefits of Mega Parse
The implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents.
**Efficiency:** By significantly speeding up the document conversion process, Mega Parse increases overall efficiency. This is especially beneficial for industries that handle large volumes of documents on a daily basis, such as legal firms, financial institutions, and research organizations.
**Versatility:** Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task.
**Enhanced Knowledge Management:** Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but
also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information.
Improved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks.
Cost Savings: The efficiency and speed of Mega Parse can lead to significant cost savings. Reduced processing times and improved workflow efficiency mean that resources can be allocated more effectively, ultimately lowering operational costs.
Scalability: Mega Parse is designed to scale with the needs of an organization. As document volumes grow, Mega Parse can handle the increased load without compromising performance, making it a future-proof solution for document management.
# Comparative Performance
The following table provides a comprehensive comparative analysis of Mega Parse against other document parsers based on fictional performance metrics. This comparison highlights the strengths of Mega Parse in various key areas.
| Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D |
|---------------------|------------------|----------------|----------------|----------------|----------------|
| Supported Formats | PDF, DOCX, PPTX | PDF, DOCX | DOCX, PPTX | PDF, PPTX | PDF, DOCX, XLSX|
| Conversion Speed (pages/min) | 120 | 90 | 100 | 85 | 95 |
| **Accuracy Rate (%)** | 98 | 95 | 93 | 90 | 92 |
| **Output Format** | Markdown | HTML | Markdown | Plain Text | HTML |
| **Error Rate (%)** | 1 | 3 | 4 | 5 | 3 |
| **Ease of Use** | High | Medium | High | Medium | Medium |
| **Integration Capability** | Excellent | Good | Good | Fair | Good |
| **Batch Processing** | Yes | No | Yes | No | Yes |
| **Custom Parsing Rules** | Yes | Limited | Yes | No | Limited |
| **Multilingual Support** | Yes | Yes | No | Yes | Yes |
| **OCR (Optical Character Recognition)** | Yes | No | Yes | No | Yes |
| **Price (per user/month)** | $30 | $25 | $20 | $15 | $18 |
| **Customer Support Rating (out of 5)** | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 |
| **Free Trial Available** | Yes | Yes | No | Yes | No |
| **Cloud Integration** | Yes | No | Yes | Yes | No |
| **Security Features** | Advanced | Basic | Advanced | Basic | Intermediate |
| **User Community Size** | Large | Medium | Medium | Small | Medium |
| **Monthly Updates** | Yes | Yes | No | Yes | No |
| **Mobile App Availability** | Yes | No | Yes | No | Yes |
| **Platform Compatibility** | Windows, Mac, Linux | Windows, Mac | Windows | Mac, Linux | Windows, Linux |
| **Data Privacy Compliance** | High | Medium | High | Low | Medium |
| **AI-Driven Enhancements** | Yes | No | Yes | No | Yes |
| **File Size Limit (per document)** | 1GB | 500MB | 750MB | 200MB | 500MB |
| **User Training Resources** | Extensive | Moderate | Extensive | Limited | Moderate |
| **API Access** | Yes | No | Yes | No | Yes |
| **Customizable Output Templates** | Yes | Limited | Yes | No | Yes |
| **Collaboration Features** | Yes | No | Yes | No | Limited |
| **Document Version Control** | Yes | No | Yes | No | Yes |
| **Import/Export Options** | Extensive | Moderate | Extensive | Limited | Moderate |
| Feedback Mechanism | Yes | No | Yes | No | Yes |
*Note: All data presented in this table is fictional and for illustrative purposes only.*
## Conclusion
Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence.
================================================
FILE: libs/megaparse/tests/pdf/test_detect_ocr.py
================================================
import os
import pypdfium2
import pytest
from megaparse.megaparse import MegaParse
from megaparse.utils.strategy import determine_global_strategy
from megaparse_sdk.schema.parser_config import StrategyEnum
# Directory listings are taken at import time, relative to the current working
# directory, and feed the parametrize decorators of the tests below.
ocr_pdfs = os.listdir("./tests/pdf/ocr")
native_pdfs = os.listdir("./tests/pdf/native")
# A single MegaParse instance shared by every test in this module.
megaparse = MegaParse()
@pytest.mark.parametrize("hi_res_pdf", ocr_pdfs)
def test_hi_res_strategy(hi_res_pdf):
    """Every PDF listed from ``./tests/pdf/ocr`` must resolve to HI_RES."""
    if hi_res_pdf == "0168004.pdf":
        pytest.skip("Skip 0168004.pdf as it is flaky currently")

    pdf_doc = pypdfium2.PdfDocument(f"./tests/pdf/ocr/{hi_res_pdf}")
    try:
        pages = megaparse.extract_page_strategies(pdf_doc)
        strategy = determine_global_strategy(
            pages, megaparse.config.auto_config.document_threshold
        )
    finally:
        # Release the document even when extraction raises, so handles do not
        # pile up across the parametrized cases.
        pdf_doc.close()

    assert strategy == StrategyEnum.HI_RES
@pytest.mark.parametrize("native_pdf", native_pdfs)
def test_fast_strategy(native_pdf):
    """Every PDF listed from ``./tests/pdf/native`` must resolve to FAST."""
    if native_pdf == "0168029.pdf":
        pytest.skip("Skip 0168029.pdf as it is too long to process")

    pdf_doc = pypdfium2.PdfDocument(f"./tests/pdf/native/{native_pdf}")
    try:
        pages = megaparse.extract_page_strategies(pdf_doc)
        strategy = determine_global_strategy(
            pages, megaparse.config.auto_config.document_threshold
        )
    finally:
        # Release the document even when extraction raises, so handles do not
        # pile up across the parametrized cases.
        pdf_doc.close()

    assert strategy == StrategyEnum.FAST
================================================
FILE: libs/megaparse/tests/pdf/test_pdf_processing.py
================================================
from pathlib import Path
import pypdfium2
import pytest
from megaparse.configs.auto import (
DeviceEnum,
MegaParseConfig,
)
from megaparse.megaparse import MegaParse
from megaparse.utils.strategy import determine_global_strategy
from megaparse_sdk.schema.extensions import FileExtension
from megaparse_sdk.schema.parser_config import StrategyEnum
@pytest.fixture
def native_pdf() -> Path:
    """Relative path of the sample PDF used by the "native" test cases."""
    return Path("./tests/pdf/sample_native.pdf")
@pytest.fixture
def scanned_pdf() -> Path:
    """Relative path of the sample PDF used by the "scanned" test cases."""
    return Path("./tests/pdf/sample_pdf.pdf")
# def test_get_default_processors_megaparse():
# megaparse = MegaParse()
# assert type(megaparse.parser) is UnstructuredParser
@pytest.mark.asyncio
@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
async def test_async_megaparse_pdf_processor_file_path(pdf_name, request):
    """``aload`` given a file path returns a non-empty result for both sample PDFs."""
    # The parametrized value is a fixture name; resolve it to the fixture's value.
    pdf_path = request.getfixturevalue(pdf_name)
    # NOTE(review): the device is pinned to COREML here — confirm this is
    # intended for runners where CoreML is not available.
    config = MegaParseConfig(device=DeviceEnum.COREML)
    parsed = await MegaParse(config=config).aload(file_path=pdf_path)
    assert len(str(parsed)) > 0
@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
def test_sync_megaparse_pdf_processor_file_path(pdf_name, request):
    """``load`` given a file path returns a non-empty result for both sample PDFs."""
    # The parametrized value is a fixture name; resolve it to the fixture's value.
    pdf_path = request.getfixturevalue(pdf_name)
    parsed = MegaParse().load(file_path=pdf_path)
    assert len(parsed) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
async def test_megaparse_pdf_processor_file(pdf_name, request):
    """``aload`` given an open binary file returns a non-empty result."""
    # The parametrized value is a fixture name; resolve it to the fixture's value.
    pdf_path = request.getfixturevalue(pdf_name)
    parser = MegaParse()
    with open(pdf_path, "rb") as stream:
        # A raw stream carries no file name, so the extension is passed explicitly.
        parsed = await parser.aload(file=stream, file_extension=FileExtension.PDF)
        assert len(str(parsed)) > 0
def test_strategy_native(native_pdf):
    """The native sample PDF must resolve to the FAST strategy."""
    processor = MegaParse()
    pdf_doc = pypdfium2.PdfDocument(native_pdf)
    try:
        pages = processor.extract_page_strategies(pdf_doc)
        strategy = determine_global_strategy(
            pages, processor.config.auto_config.document_threshold
        )
    finally:
        # Close the document even if extraction raises; previously a failure
        # before the trailing close() left it open.
        pdf_doc.close()

    assert strategy == StrategyEnum.FAST
def test_strategy_scanned(scanned_pdf):
    """The scanned sample PDF must resolve to the HI_RES strategy."""
    processor = MegaParse()
    pdf_doc = pypdfium2.PdfDocument(scanned_pdf)
    try:
        pages = processor.extract_page_strategies(pdf_doc)
        strategy = determine_global_strategy(
            pages, processor.config.auto_config.document_threshold
        )
    finally:
        # Close the document even if extraction raises; previously a failure
        # before the trailing close() left it open.
        pdf_doc.close()

    assert strategy == StrategyEnum.HI_RES
================================================
FILE: libs/megaparse/tests/pdf/test_pdfium_parser.py
================================================
from pathlib import Path
import pypdfium2 as pdfium
def test_pdfium():
    """Smoke test: every page object of ``mlbook.pdf`` can be enumerated."""
    # scanned pdf
    pdf_path = Path("./tests/pdf/mlbook.pdf")
    document = pdfium.PdfDocument(pdf_path)
    try:
        # Building the list forces iteration over every page and every object.
        objs = [obj for page in document for obj in page.get_objects()]
    finally:
        # Close the document even when enumeration raises.
        document.close()
================================================
FILE: libs/megaparse/tests/supported_docs/sample.csv
================================================
Name,Description
MegaParse,"MegaParse is the best parser, even with accents like é, è, and ñ."
OtherParse,"OtherParse is a decent parser, but it struggles with accents."
RandomParse,"RandomParse is another parser, but it often fails with special characters."
================================================
FILE: libs/megaparse/tests/supported_docs/sample.markdown
================================================
# The Difficulty of Parsing Files
Parsing files can be a challenging task due to several factors:
## 1. File Format Variability
Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely.
## 2. Inconsistent Data
Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms.
## 3. Large File Sizes
Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets.
## 4. Encoding Issues
Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption.
## 5. Nested Structures
Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data.
## Conclusion
Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies.
================================================
FILE: libs/megaparse/tests/supported_docs/sample.md
================================================
# The Difficulty of Parsing Files
Parsing files can be a challenging task due to several factors:
## 1. File Format Variability
Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely.
## 2. Inconsistent Data
Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms.
## 3. Large File Sizes
Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets.
## 4. Encoding Issues
Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption.
## 5. Nested Structures
Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data.
## Conclusion
Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies.
================================================
FILE: libs/megaparse/tests/supported_docs/sample.txt
================================================
Lorem ipsum
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio.
Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.
Maecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus.
Maecenas non lorem quis tellus placerat varius.
Nulla facilisi.
Aenean congue fringilla justo ut aliquam.
Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis.
Morbi viverra semper lorem nec molestie.
Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.
https://github.com/QuivrHQ/MegaParse
================================================
FILE: libs/megaparse/tests/supported_docs/sample.xml
================================================
Charter Group100 MainFraminghamMA01701720 ProspectFraminghamMA01701120 RidgeMA01760
================================================
FILE: libs/megaparse/tests/supported_docs/sample_complexe.html
================================================
Large HTML page with images
When to load CSS
Large HTML page
with Images
This page shall test if the recorder generates
scripts with requests in correct order. It includes images in sequential order: stadyn_image1.gif
through stadyn_image10.gif
stadyn_image1
stadyn_image2
Open Financial Exchange
Specification 1.0
February 14, 1997
1997 CheckFree Corp., Intuit Inc., Microsoft Corp.
All rights reserved
1. Overview 51.1 Introduction 51.1.1 Design Principles 51.2 Open Financial Exchange at
a Glance 71.2.1 Data Transport 71.2.2 Request and Response Model 81.3 Conventions 92.
Structure 102.1 HTTP Headers 102.2 Open Financial Exchange Headers 112.2.1 The Meaning of
Version Numbers 122.3 SGML Details 122.3.1 Compliance 122.3.2 Special Characters 122.4
Open Financial Exchange SGML Structure 132.4.1 Overview 132.4.2 Top Level 132.4.3 Messages
132.4.4 Message Sets and Version Control 142.4.5 Transactions 152.5 The Signon Message Set
162.5.1 Signon <SONRQ> <SONRS> 162.5.2 PIN Change <PINCHRQ>
<PINCHRS> 192.5.3 Examples 202.6 External Data Support 202.7 Extensions to Open
Financial Exchange 213. Common Aggregates, Elements, and Data Types 223.1 Common
Aggregates 223.1.1 Identifying Financial Institutions and Accounts 223.1.2 Balance Records
<BAL> 223.1.3 Error Reporting <STATUS> 233.2 Common Elements 243.2.1 Financial
Institution Transaction ID <FITID> 243.2.2 Server-Assigned ID <SRVRTID>
243.2.3 Client-Assigned Transaction UID <TRNUID> 253.2.4 Token <TOKEN> 253.2.5
Transaction Amount <TRNAMT> 253.2.6 Memo <MEMO> 253.2.7 Date Start and Date
End <DTSTART> <DTEND> 263.3 Common data types 263.3.1 Dates and Times 263.3.2
Amounts, Prices, and Quantities 283.3.3 Language 283.3.4 Basic data types 284. Security
294.1 Security Solutions 294.1.1 Determining Security Levels <OFXSEC>
<TRANSPSEC> 294.2 Channel-Level Security 304.2.1 Security Requirements 304.2.2 Using
SSL 3.0 in Open Financial Exchange 304.3 Application-Level Security 314.3.1 Requirements
for Application-Layer Security 314.3.2 Using Application-level Encryption in Open
Financial Exchange 325. International Support 335.1 Language and Encoding 335.2 Currency
<CURDEF> <CURRENCY> <ORIGCURRENCY> 335.3 Country-Specific Tag Values
346. Data Synchronization 356.1 Overview 356.2 Background 356.3 Data Synchronization
Approach 366.4 Data Synchronization Specifics 376.5 Conflict Detection and Resolution
396.6 Synchronization vs. Refresh 406.7 Typical Server Architecture for Synchronization
416.8 Typical Client Processing of Synchronization Results 436.9 Simultaneous Connections
446.10 Synchronization Alternatives 446.10.1 Lite Synchronization 446.10.2 Relating
Synchronization and Error Recovery 456.11 Examples 467. FI Profile 487.1 Overview 487.1.1
Message Sets 487.1.2 Version Control 497.1.3 Batching and Routing 497.2 Profile Request
507.3 Profile Response 517.3.1 Message Set 527.3.2 Signon Realms 537.3.3 Status Codes
537.4 Profile Message Set Profile Information 548. Activation & Account Information
558.1 Overview 558.2 Approaches to User Sign-Up with Open Financial Exchange 558.3 Users
and Accounts 568.4 Enrollment and Password Acquisition <ENROLLRQ> <ENROLLRS>
568.4.1 User IDs 578.4.2 Enrollment Request 578.4.3 Enrollment Response 598.4.4 Enrollment
Status Codes 598.4.5 Examples 608.5 Account Information 608.5.1 Request <ACCTINFORQ>
618.5.2 Response <ACCTINFORS> 618.5.3 Account Information Aggregate <ACCTINFO>
628.5.4 Status Codes 628.5.5 Examples 638.6 Service Activation 638.6.1 Activation Request
and Response 648.6.2 Service Activation Synchronization 668.6.3 Examples 668.7 Name and
Address Changes <CHGUSERINFORQ> <CHGUSERINFORS> 678.7.1 <CHGUSERINFORQ>
678.7.2 <CHGUSERINFORS> 688.7.3 Status Codes 688.8 Signup Message Set Profile
Information 699. Customer to FI Communication 709.1 The E-Mail Message Set 709.2 E-Mail
Messages 709.2.1 Regular vs. Specialized E-Mail 719.2.2 Basic <MAIL> Aggregate
719.2.3 E-Mail <MAILRQ> <MAILRS> 719.2.4 E-Mail Synchronization
<MAILSYNCRQ> <MAILSYNCRS> 729.2.5 Example 739.3 Get HTML Page 749.3.1 MIME Get
Request and Response <GETMIMERQ> <GETMIMERS> 749.3.2 Example 759.4 E-Mail
Message Set Profile Information 7610. Recurring Transactions 7710.1 Creating a Recurring
Model 7710.2 Recurring Instructions <RECURRINST> 7710.2.1 Values for <FREQ>
7810.2.2 Examples 7910.3 Retrieving Transactions Generated by a Recurring Model 8010.4
Modifying and Canceling Individual Transactions 8010.5 Modifying and Canceling Recurring
Models 8010.5.1 Examples 81
Open Financial Exchange is a broad-based framework for exchanging
financial data and instructions between customers and their financial institutions. It
allows institutions to connect directly to their customers without requiring an
intermediary.
Open Financial Exchange is an open specification that anyone can
implement: any financial institution, transaction processor, software developer or other
party. It uses widely accepted open standards for data formatting (such as SGML),
connectivity (such as TCP/IP and HTTP), and security (such as SSL).
Open Financial Exchange defines the request and response messages used
by each financial service as well as the common framework and infrastructure to support
the communication of those messages. This specification does not describe any specific
product implementation.
The following principles were used in designing Open Financial Exchange:
l BroadRange of Financial Activities -
Open Financial Exchange provides support for a broad range of financial
activities. Open Financial Exchange 1.0 specifies the following services:
n Bank statement download
n Credit card statement download
n Funds transfers including
recurring transfers
n Consumer payments, including
recurring payments
n Business payments, including
recurring payments
n Brokerage and mutual fund
statement download, including transaction history, current holdings and balances
l BroadRange of Financial Institutions -
Open Financial Exchange supports communication with a broad range of
financial institutions (FIs), including:
n Banks
n Brokerage houses
n Merchants
n Processors
n Financial advisors
n Government agencies
l BroadRange of Front-End applications -
Open Financial Exchange supports a broad range of front-end applications
covering all types of financial activities running on all types of platforms, including
Web-based applications.
l Extensible - Open Financial Exchange has been
designed to allow the easy addition of new services. Future versions will include support
for many new services.
l Open - This specification is publicly available.
You can build client and server applications using the Open Financial Exchange protocols
independent of any specific technology, product, or company.
l Multiple Client Support - Open Financial Exchange
allows a user to use multiple client applications to access the same data at a financial
institution. With the popularity of the World Wide Web, customers are increasingly more
likely to use multiple applications-either desktop-based or Web-based-to perform financial
activities. For example, a customer can track personal finances at home with a desktop
application and occasionally pay bills while at work with a Web-based application. The use
of data synchronization to support multiple clients is a key innovation in Open Financial
Exchange.
l Robust - Open Financial Exchange will be used for
executing important financial transactions and for communicating important financial
information. Assuring users that transactions are executed and information is correct is
crucial. Open Financial Exchange provides robust protocols for error recovery.
l Secure - Open Financial Exchange provides a
framework for building secure online financial services. In Open Financial Exchange,
security encompasses authentication of the parties involved, as well as secrecy and
integrity of the information being exchanged.
l Batch & Interactive - The design of request and
response messages in Open Financial Exchange is for use in either batch or interactive
style of communication. Open Financial Exchange provides for applying a single
authentication context to multiple requests in order to reduce the overhead of user
authentication.
l InternationalSupport - Open Financial Exchange is designed to supply financial services throughout
the world. It supports multiple currencies, country-specific extensions, and different
forms of encoding such as UNICODE.
l Platform Independent -Open Financial Exchange can
be implemented on a wide variety of front-end client devices, including those running
Windows 3.1, Windows 95, Windows NT, Macintosh, or UNIX. It also supports a wide variety
of Web-based environments, including those using HTML, Java, JavaScript, or ActiveX.
Similarly on the back-end, Open Financial Exchange can be implemented on a wide variety of
server systems, including those running UNIX, Windows NT, or OS/2.
* Transport Independent - Open Financial Exchange is
independent of the data communication protocol used to transport the messages between the
client and server computers. Open Financial Exchange 1.0 will use HTTP.
The design of Open Financial Exchange is as a client and server system.
An end-user uses a client application to communicate with a server at a financial
institution. The form of communication is requests from the client to the server and
responses from the server back to the client.
Open Financial Exchange uses the Internet Protocol (IP) suite to provide
the communication channel between a client and a server. IP protocols are the foundation
of the public Internet and a private network can also use them.
Clients use the Hypertext Transfer Protocol (HTTP) to communicate with an
Open Financial Exchange server. The same HTTP protocol is used throughout the World Wide Web.
In principle, a financial institution can use any off-the-shelf web server to implement
its support for Open Financial Exchange.
To communicate by means of Open Financial Exchange over the Internet,
the client must establish an Internet connection. This connection can be a dial-up
Point-to-Point Protocol (PPP) connection to an Internet Service Provider (ISP) or a
connection over a local area network that has a gateway to the Internet.
Clients use the HTTP POST command to send a request to the previously
acquired Uniform Resource Locator (URL) for the desired financial institution. The URL
presumably identifies a Common Gateway Interface (CGI) or other process on an FI server
that can accept Open Financial Exchange requests and produce a response.
The POST identifies the data as being of type application/x-ofx. Use
application/x-ofx as the return type as well. Fill in other fields per the HTTP 1.0 spec.
Here is a typical request:
A blank line defines the separation between the HTTP headers and the
start of the actual Open Financial Exchange data. A blank line also separates the Open
Financial Exchange headers and the actual response. (See Chapter 2 for more information.)
The structure of a response is similar to the request, with the first
line containing the standard HTTP result, as shown next. The content length is given in
bytes.
The basis for Open Financial Exchange is the request and response model.
One or more requests can be batched in a single file. This file typically includes a
signon request and one or more service-specific requests. An FI server will process all of
the requests and return a single response file. This batch model lends itself to Internet
transport as well as other off-line transports. Both requests and responses are plain text
files, formatted using a grammar based on Standard Generalized Markup Language (SGML).
Open Financial Exchange is syntactically similar to HyperText Markup Language (HTML),
featuring tags to identify and delimit the data. The use of a tagged data format allows
Open Financial Exchange to evolve over time while continuing to support older clients and
servers.
Here is a simplified example of an Open Financial Exchange request file.
(This example does not show the Open Financial Exchange headers and the indentation is
only for readability.) For complete details, see the more complete examples throughout
this specification.
<OFX>  <!-- Begin request data -->
  <SIGNONMSGSRQV1>
    <SONRQ>  <!-- Begin signon -->
      <DTCLIENT>19961029101000  <!-- Oct. 29, 1996, 10:10:00 am -->
      <USERID>123-45-6789  <!-- User ID (that is, SSN) -->
      <USERPASS>MyPassword  <!-- Password (SSL encrypts whole) -->
      <LANGUAGE>ENG  <!-- Language used for text -->
      <FI>  <!-- ID of receiving institution -->
        <ORG>NCH  <!-- Name of ID owner -->
        <FID>1001  <!-- Actual ID -->
      </FI>
      <APPID>MyApp
      <APPVER>0500
    </SONRQ>  <!-- End of signon -->
  </SIGNONMSGSRQV1>
  <BANKMSGSRQV1>
    <STMTTRNRQ>  <!-- First request in file -->
      <TRNUID>1001
      <STMTRQ>  <!-- Begin statement request -->
        <BANKACCTFROM>  <!-- Identify the account -->
          <BANKID>121099999  <!-- Routing transit or other FI ID -->
          <ACCTID>999988  <!-- Account number -->
          <ACCTTYPE>CHECKING  <!-- Account type -->
        </BANKACCTFROM>  <!-- End of account ID -->
        <INCTRAN>  <!-- Begin include transaction -->
          <INCLUDE>Y  <!-- Include transactions -->
        </INCTRAN>  <!-- End of include transaction -->
      </STMTRQ>  <!-- End of statement request -->
    </STMTTRNRQ>  <!-- End of first request -->
  </BANKMSGSRQV1>
</OFX>  <!-- End of request data -->
The response format follows a similar structure. Although a response such as a statement
response contains all of the details of each transaction, each element is identified using
tags.
The key rule of Open Financial Exchange syntax is that each tag is
either an element or an aggregate. Data follows its element tag. An aggregate tag begins a
compound tag sequence, which must end with a matching tag; for example, <AGGREGATE>
... </AGGREGATE>.
The actual file Open Financial Exchange sends is without any extra white
space between tags.
The conventions used in the detailed descriptions include:
Required tags are in bold. Regular face indicates tags that are
optional. Required means that a client will always include a tag in a request, and a
server must always include a tag in a response.
Italic shows a required or optional aggregate from a set of
possible aggregates.
Required tags occur once unless noted as one or more in the description,
in which case the specification allows multiple occurrences.
Optional tags occur once if present unless noted as zero or more in the
description, in which case the specification allows multiple occurrences.
Allowable specific values are listed, where applicable.
A-n or N-n, specify those values that take general
alphanumeric or pure numeric type values, where n indicates the maximum size.
References to certain common value types, such as a dollar amount, are by
name. Chapter 3 lists value types that can be referenced by name.
Tag
Description
<REQUIREDTAG>
Required tag (1 or more)
<REQUIREDTAG2>
Required tag that occurs only once
<OPTIONALTAG>
Optional tag; this particular one can occur multiple times
(0 or more)
<SPECIFIC>
Values are A, B, and C
<ALPHAVALUE>
Takes an alphanumeric value up to 32 characters, A-32
This chapter describes the basic structure of an Open Financial Exchange
request and response. Structure includes headers, basic syntax, and the Signon request and
response. This chapter also describes how Open Financial Exchange encodes external data,
such as bit maps.
Open Financial Exchange data consists of some headers plus one or more
Open Financial Exchange data blocks. Each block consists of a signon message and zero or
more additional messages. When sent over the internet using HTTP, standard HTTP and
multi-part MIME headers and formats surround the Open Financial Exchange data. A simple
file that contained only Open Financial Exchange data would have the following form:
HTTP headers
MIME type application/x-ofx
Open Financial Exchange headers
Open Financial Exchange SGML block 1
A more complex file that contained multiple Open Financial Exchange data
blocks and additional Open Financial Exchange data would have this form:
HTTP headers
MIME type multipart/x-mixed-replace; boundary =--boundary-
---boundary---
MIME type application/x-ofx
Open Financial Exchange headers
Open Financial Exchange SGML block 1
Open Financial Exchange SGML block 2
---boundary---
MIME type image/jpeg
FI logo
Data delivered by way of HTTP places the standard HTTP result code on
the first line. HTTP defines a number of status codes. Servers can return any standard
HTTP result. However, FIs should expect clients to collapse these codes into the following
three cases:
Code
Meaning
Action
200
OK
The request was processed and a valid Open Financial
Exchange result is returned.
400s
Bad request
The request was invalid and was not processed. Clients will
report an internal error to the user.
500s
Server error
The server is unavailable. Clients should advise the user
to retry shortly.
NOTE: Open Financial Exchange returns a code 400 only if it cannot parse the
file. Open Financial Exchange handles content errors such as wrong PIN, or invalid
account, by returning a valid Open Financial Exchange response along with code 200.
Open Financial Exchange requires the following HTTP standard headers:
Code
Value
Explanation
Content-type
application/x-ofx
The MIME type for Open Financial Exchange
Content-length
length
Length of the data after removing HTTP headers
When responding with multi-part MIME, the main type will be
multipart/x-mixed-replace;
one of the parts will use application/x-ofx.
The intent of Open Financial Exchange is for use with a variety of
transports and to provide sufficient version control capabilities for future expansion. To
support this goal, the contents of an Open Financial Exchange file consist of a simple set
of headers followed by contents defined by that header. "File format" means the
entire content after removal of any transport headers. For the HTTP transport described in
this document, that means the content without the HTTP and MIME headers.
The Open Financial Exchange headers are in a simple tag:value
syntax and terminated by a blank line. Open Financial Exchange always sends headers
unencrypted, even if there is application-level encryption in use for the remaining
contents. The first entry will always be OFXHEADER with a version number. This entry will
help identify the contents as an Open Financial Exchange file, and provides the version of
the Open Financial Exchange headers that follow (not of the content itself). For example:
OFXHEADER:100
This document defines version 1.0 of the headers to contain at least the
following additional tags:
The data tag identifies the contents as being in OFX SGML form. VERSION
identifies the version type as OFXSGML data. In the case of OFXSGML, it translates to the
version of the Document Type Definition (DTD) that it uses for parsing. The ENCODING and
CHARSET tags define the interpretation of the character data. See Chapter 5,
"International Support" for more information on these tags. Chapter 4 describes
the security tag. A future version of this specification will define compression.
Open Financial Exchange uses OLDFILEUID and NEWFILEUID to support error
recovery. They are not present when clients are not requesting error recovery. (See
Chapter 6, "Data Synchronization")
A blank line follows the last tag. Then (for type OFXSGML), the
SGML-readable data begins with the <OFX> tag.
NOTE: Here, VERSION provides the overall version of the DTD.
The <OFX> block describes the specific message set versions used, shown later in
this chapter.
The OFXHEADER value should only change its major number if an existing
client is unable to process the new header. This can occur because of a complete syntax
change in a header, or a significant change in the semantics of an existing tag - not the
entire response. You can add new tags as long as clients can function without
understanding them.
You should add new values for a data tag only when you introduce an
entirely new syntax. In the case of OFXSGML, a new syntax would have to be non-SGML
compliant to warrant a new data value. It is possible that there will be more than one
syntax in use at the same time to meet different needs.
The intent of the header version tag is to identify syntactic changes.
In the case of OFXSGML, this corresponds to the DTD. Purely for identification purposes,
each change will increment the minor number of the version tag. If you introduce an
incompatible change so that an older DTD can not parse the file, the major number will
change. See the general discussion of message sets and version control, later in this
chapter.
SGML is the basis for Open Financial Exchange. There is a DTD that
formally defines the SGML wire format. However, Open Financial Exchange is not completely
SGML-compliant because the specification allows unrecognized tags to be present. It
requires clients and servers to skip over the unrecognized material. That is, if
<XYZ>qqq</XYZ> appeared and a client or server cannot recognize <XYZ>,
the server should ignore that tag and its enclosed data. A fully-compliant SGML parser
would not validate an Open Financial Exchange document if it contained any tags
that the DTD does not define.
Although SGML is the basis for the specification, and the specification
is largely compliant with SGML, do not assume Open Financial Exchange supports any SGML
features not documented in this specification. The intent is to allow parsing to be as
simple as possible, while retaining compatibility with the SGML world.
The following characters are special to SGML. Use the given alternative
sequence to represent them:
Character
Escape sequence
< (less than)
&lt;
> (greater than)
&gt;
& (ampersand)
&amp;
For example, the string "AT&amp;T" encodes "AT&T."
A special case applies in specific tags that can accept HTML-formatted strings, such as
e-mail records. These accept SGML marked section syntax to hide the HTML from the Open
Financial Exchange parser. Strings must be prefixed with "<![ CDATA [" and
suffixed with "]]>." Within these bounds, treat the above characters literally
without an escape. See Chapter 9 for an example.
Open Financial Exchange hierarchically organizes request and response
blocks:
Top Level <OFX>
Message Set and Version <XXXMSGSVn>
Synchronization Wrappers <YYYSYNCRQ>, <YYYSYNCRS>
Transaction Wrappers <YYYTRNRQ>, <YYYTRNRS>
Specific requests and responses
The following sections describe each of these levels.
An Open Financial Exchange request or response has the following
top-level form:
Tag
Description
<OFX>
Opening tag
... Open Financial Exchange requests or responses ...
0 or more transaction requests and responses inside
appropriate message set aggregates
</OFX>
Closing tag for the Open Financial Exchange record
This chapter specifies the order of requests and responses.
A single file can contain multiple <OFX> ... </OFX> blocks. A typical use
of multiple blocks is to request in a single file information associated with different
users.
A message is the unit of work in Open Financial Exchange. It refers to a
request and response pair, and the status codes associated with that response. For
example, the message to download a bank statement consists of the request <STMTRQ>
and the response <STMTRS>. In addition, with the exception of the signon message,
each message includes a transaction wrapper. These aggregates add a transaction
unique ID <TRNUID>, and for responses, a <STATUS> aggregate, to the basic
request and response.
For messages subject to synchronization (see Chapter 6), a third layer
of aggregates is also part of a message definition: a synchronization request and
response. These add a token and, in some cases, other information to the transactions.
Open Financial Exchange uses the following naming where the XXX
message includes:
Basic request <XXXRQ> and response <XXXRS>
Transaction wrapper <XXXTRNRQ> and <XXXTRNRS>
If needed, synchronization wrapper <XXXSYNCRQ> and
<XXXSYNCRS>
In a few cases, a small number of related basic requests and responses
share a transaction and synchronization wrapper. The term message will still apply to each
request and response; only the naming scheme will not hold in those cases.
Message sets are collections of messages. Generally they form all or
part of what a user would consider a service, something for which they might have
signed up, such as "banking." Message sets are the basis of version control,
routing, and security. They are also the basis for the required ordering in Open Financial
Exchange files.
Within an Open Financial Exchange block, Open Financial Exchange
organizes messages by message set. A message set can appear at most once within an Open
Financial Exchange block. All messages from a message set must be from the same version of
that message set.
For each message set of XXX and version n, there exists an
aggregate named <XXXMSGSVn>. (Compare with <XXXMSGSETVn>
in Chapter 7.) All of the messages from that message set must be inside the appropriate
message set aggregate. In the following example, the Open Financial Exchange block
contains a signon request inside the signon message set, and two statement requests and a
transfer request inside the bank message set.
Other than the signon message, each request is made as a transaction.
Transactions contain a client-assigned globally unique ID, optional client-supplied
pass-back data, and then the record for the specific request. A transaction similarly
wraps each response. The response transaction returns the client ID sent in the request,
along with a status message, the pass-back data if present, and the specific response
record. This technique allows a client to track responses against requests.
The <STATUS> aggregate, defined in Chapter 3, provides feedback on
the processing of the request. If the <SEVERITY> of the status is ERROR, the server
provides no specific response record. Otherwise, the response will be complete even though
some warning might have occurred.
Clients can send additional information in <CLTCOOKIE> that
servers will return in the response. This allows clients that do not maintain state, and
thus do not save TRNUIDs, to cause some additional descriptive information to be present
in the response. For example, a client might identify a request as relating to a user or a
spouse.
In some countries some transactions require a customer-supplied
authorization number for each transaction. In those countries, the <TAN> element
provides the means to pass this information to servers. As Open Financial Exchange is
implemented in each country, the specification will define the specific requirements for
the use of <TAN> in each country.
A typical request is as follows:
Tag
Description
<XXXTRNRQ>
Transaction-request aggregate
<TRNUID>
Client-assigned globally unique ID for this transaction, trnuid
<CLTCOOKIE>
Data to be echoed in the transaction response, A-32
<TAN>
Transaction authorization number; used in some countries
with some types of transactions. Country-specific documentation will define messages that
require a TAN, A-80
specific request
Aggregate for the specific request
</XXXTRNRQ>
A typical response is as follows:
Tag
Description
<XXXTRNRS>
Transaction-response aggregate
<TRNUID>
Client-assigned globally unique ID for this transaction, trnuid
<CLTCOOKIE>
Client-provided data, REQUIRED if provided in
request, A-32
The Signon message set includes the signon message and the PIN change
message, and must appear in that order. The <SIGNONMSGSRQV1> and
<SIGNONMSGSRSV1> aggregates wrap the message.
The signon record identifies and authenticates a user to an FI. It also
includes information about the application making the request, because some services might
be appropriate only for certain clients. Every Open Financial Exchange request contains
exactly one <SONRQ>. Every response must contain exactly one <SONRS> record.
Use of Open Financial Exchange presumes that FIs authenticate each
customer and then give the customer access to one or more accounts or services. If
passwords are specific to individual services or accounts, a separate Open Financial
Exchange request will be made for each distinct user ID or password required. This will
not necessarily be in a manner visible to the user. Note that some situations, such as
joint accounts or business accounts, will have multiple user IDs and multiple passwords
that can access the same account.
FIs assign user IDs for the customer. It can be the customer's social
security number, but the client will not make any assumptions about the syntax of the ID,
add check-digits, or do similar processing.
To improve server efficiency in handling a series of Open Financial
Exchange request files sent over a short period of time, clients can request that a server
return a <USERKEY> in the signon response. If the server provides a user key, clients
will send the <USERKEY> instead of the user ID and password in subsequent
sessions, until the <USERKEY> expires. This allows servers to authenticate
subsequent requests more quickly.
The client returns <SESSCOOKIE> if it sent one in a previous
<SONRS>. Servers can use this value to track client usage but cannot assume that all
requests come from a single client, nor can they deny service if they did not expect the
returned cookie. Use of a backup file, for example, would lead to an unexpected
<SESSCOOKIE> value that should nevertheless not stop a user from connecting.
Servers can request that a consumer change his or her password by
returning status code 15000. Servers should keep in mind that only one status code can be
returned. If the current signon response status should be 15500 (invalid ID or password),
the request to change password will need to wait until an otherwise successful signon is
achieved.
Record Request <SONRQ>
Tag
Description
<SONRQ>
Record-request aggregate
<DTCLIENT>
Date and time of the request from the client computer, datetime
<USERID>
User identification string. Use <USERID> &
<USERPASS>, or <USERKEY>, but not both; A-32
<USERPASS>
User password on server - either <USERID> &
<USERPASS> are used, or <USERKEY>, but not both; A-32
<USERKEY>
Login using previously authenticated context - use
<USERID> & <USERPASS>, or <USERKEY>, but not both; A-64
<GENUSERKEY>
Request server to return a USERKEY for future use, Boolean
<LANGUAGE>
Requested language for text responses, language
<SESSCOOKIE>
Session cookie, value received in previous <SONRS>,
not sent if first login or if none sent by FI, A-1000
<FI>
Financial-Institution-identification aggregate
</FI>
<APPID>
ID of client application, A-5
<APPVER>
Version of client application, N-4 (6.00 encoded as
0600)
</SONRQ>
Response <SONRS>
Tag
Description
<SONRS>
Record-response aggregate
<STATUS>
Status aggregate, see list of possible code values
<DTSERVER>
Date and time of the server response, datetime
<USERKEY>
Use this user key instead of USERID and USERPASS for
subsequent requests. TSKEYEXPIRE can limit lifetime
<TSKEYEXPIRE>
Date and time that USERKEY expires
<LANGUAGE>
Language used in text responses, language
<DTPROFUP>
Date and time of last update to profile information for any
service supported by this FI (see Chapter 7), datetime
<DTACCTUP>
Date and time of last update to account information (see
Chapter 8), datetime
<FI>
Financial-Institution-identification aggregate
</FI>
<SESSCOOKIE>
Session cookie that the client should return on the next
<SONRQ>, A-1000
</SONRS>
List of status code values for the <CODE> element of <STATUS>:
Value
Meaning
0
Success (INFO)
2000
General error (ERROR)
15000
Must change PIN (INFO)
15500
Signon (for example, user ID or password) invalid (ERROR)
15501
Customer account already in use (ERROR)
15502
PIN Lockout (ERROR)
Financial Institution ID <FI>
Some service providers support multiple FIs, and assign each FI an ID.
The signon allows clients to pass this information along, so that providers will know to
which FI the user is actually doing a signon.
Tag
Description
<FI>
FI-record aggregate
<ORG>
Organization defining this FI name space, A-32
<FID>
Financial Institution ID (unique within <ORG>), A-32
The signon sends a request to change a customer password as a separate
request. The transaction request <PINCHTRNRQ> aggregate contains <PINCHRQ>.
Responses are also inside transaction responses <PINCHTRNRS>.
Password changes pose a special problem for error recovery. If the
client does not receive a response, it does not know whether the password change was
successful or not. Open Financial Exchange recommends that servers accept either the old
password or the new password on the connection following the one containing a password
change. The password used becomes the new password.
Tag
Description
<PINCHRQ>
PIN-change-request aggregate
<USERID>
User identification string. Often a social security number,
but if so, does not include any check digits, A-32
<NEWUSERPASS>
New user password, A-32
</PINCHRQ>
Tag
Description
<PINCHRS>
PIN-change-response aggregate
<USERID>
User identification string. Often a social security number,
but if so, does not include any check digits, A-32
<DTCHANGED>
Date and time the password was changed, datetime
</PINCHRS>
Status Code Values for the <CODE> Element of
<STATUS>
Some data, such as binary data, cannot be easily sent directly within
SGML. For these situations, the specification will define a tag that contains a reference
to some external data. The way that clients pick up the external data depends on the
transport used. For the HTTP-based transport described in this document, servers can send
the data in one of two ways:
Send the same response, using multi-part MIME types to separate the
response into basic
Open Financial Exchange and one or more external data files
Client can make a separate HTTP get against the supplied URL, if it
really needs the data
For example, to retrieve a logo, a <GETMIMERS> might answer a
<GETMIMERQ> as follows:
An organization that provides a customized client and server that
communicate by means of
Open Financial Exchange might wish to add new requests and responses or even specific
elements to existing requests and responses. To ensure that each organization can extend
the specification without the risk of conflict, Open Financial Exchange defines a style of
tag naming that lets each organization have its own name space.
Organizations can register a specific tag name prefix. (The specific
procedure or organization to manage this registration will be detailed at a later time.)
If an organization registers "ABC," then they can safely add new tags named
<ABC.SOMETHING> without
Colliding with another party wishing to extend the specification
Confusing a client or server that does not support the extension
The extensions are not considered proprietary. An organization is free
to publish their extensions and encourage client and server implementers to support them.
All tag names that do not contain a period (.) are reserved for use in
future versions of the core
Open Financial Exchange specification.
Open Financial Exchange does not provide a universal space for
identifying financial institutions, accounts, or types of accounts. The way to identify an
FI and an account at that FI depends on the service. For information about
service-specific ID aggregates, see Chapters 11, 12, and 13 on banking, payments, and
investments.
Several responses allow FIs to send an arbitrary set of balance
information as part of a response, for example a bank statement download. FIs might want
to send information on outstanding balances, payment dates, interest rates, and so forth.
Balances can report the date the given balance reflects in <DTASOF>.
Tag
Description
<BAL>
Balance-response aggregate
<NAME>
Balance name, A-20
<DESC>
Balance description, A-80
<BALTYPE>
Balance type.
DOLLAR = dollar (value formatted DDDDcc)
PERCENT = percentage (value formatted XXXX.YYYY)
NUMBER = number (value formatted as is)
<VALUE>
Balance value.
Interpretation depends on <BALTYPE> field, N-20
<CURRENCY>
If dollar formatting, can optionally include currency
To provide as much feedback as possible to clients and their users, Open
Financial Exchange defines a <STATUS> aggregate. The most important element is the
code that identifies the error. Each response defines the codes it uses. Codes 0 through
2999 have common meanings in all Open Financial Exchange transactions. Codes from 3000 and
up have meanings specific to each transaction.
Tag
Description
<STATUS>
Error-reporting aggregate.
<CODE>
Error code, N-6
<SEVERITY>
Severity of the error:
INFO = Informational only
WARN = Some problem with the request occurred but valid response still present
ERROR = A problem severe enough that response could not be made
<MESSAGE>
A textual explanation from the FI. Note that clients will
generally have messages of their own for each error ID. Use this tag only to provide more
details or for the General errors.
</STATUS>
stadyn_image6
stadyn_image7
For general errors, the server can respond with one of the following <CODE>
values. However, not all codes are possible in a specific context.
Code
Meaning
0
Success (INFO)
2000
General error (ERROR)
2021
Unsupported version (ERROR)
NOTE: Clients will generally have error messages based on <CODE>.
Therefore, do not use <MESSAGE> to replace that text. Use <MESSAGE> only to
explain an error not well described by one of the defined CODEs, or to provide some
additional information.
This section defines elements used in several services of Open Financial
Exchange. The format of the value is either alphanumeric (A-n) or numeric
(N-n) with a maximum length n; or as a named type. Section 3.3 describes the
named types.
An FI assigns an <FITID> to uniquely identify a transaction. Its
primary purpose is to allow a client to detect duplicate responses. Open Financial
Exchange intends <FITID> for use in statement download applications, where every
transaction requires a unique ID; not just those that are client-originated or
server-originated.
FITIDs must be unique within the scope of the requested transactions
(that is, within an account) but need not be sequential or even increasing. Clients should
be aware that FITIDs are not unique across FIs. If a client performs the same type of
request within the same scope at two different FIs, clients will need to use FI + account
+ <FITID> as a unique key in a client database.
Usage: Bank statement download, investment statement download
A <SRVRTID> is a server-assigned ID. It should remain constant
throughout the lifetime of the object on the server. The client will consider the SRVRTID
as its "receipt" or confirmation and will use this ID in any subsequent requests
to change, delete, or inquire about this object.
Where the context allows, it is possible for a server to use the same value
for a given server object for both <SRVRTID> and <FITID>, but the client
will not know this. SRVRTIDs need be unique only within the scope of the requests and
responses they apply to, such as an account number. Like <FITID>, a <SRVRTID>
is not unique across FIs and clients might need to use FI + <SRVRTID> if a client
requires a unique key.
Open Financial Exchange uses TRNUIDs to identify transactions,
specifically <XXXTRNRQ>. Clients expect a server to return the same
<TRNUID> in the corresponding response and can use this to match up requests and
responses. Servers can use TRNUIDs to reject duplicate requests. Because multiple clients
might be generating requests to the same server, transaction IDs need to be unique across
clients. Thus, <TRNUID> must be a globally unique ID.
The Open Software Foundation Distributed Computing Environment standards
specify a 36-character hexadecimal encoding of a 128-bit number and an algorithm to
generate it. Clients are free to use their own algorithm, to use smaller TRNUIDs, or to
relax the uniqueness requirements if in their particular application it makes sense.
However, it is RECOMMENDED that clients allow for the full 36 characters in
responses to work better with other clients.
Open Financial Exchange uses <TOKEN> as part of data
synchronization requests to identify the point in history that the client has already
received data, and in responses to identify the server's current end of history. See
Chapter 6, "Data Synchronization," for more information.
<TOKEN> is unique within an FI and the scope of the
synchronization request. For example, if the synchronization request includes an account
ID, the <TOKEN> needs be unique only within an account. Servers are free to use a
<TOKEN> that is unique across the entire FI. Clients must save separate
<TOKEN>s for each account, FI, and type of synchronization request.
Open Financial Exchange uses these tags in requests to provide guidance
to the FI about the range of response that is desired. It also uses them in responses to
let clients know what the FI was actually able to produce.
In requests, the following rules apply:
If <DTSTART> is absent, the client is requesting all available
history (up to the <DTEND>, if specified). Otherwise, it indicates the inclusive
date and time in history where the client expects servers to start sending information.
If <DTEND> is absent, the client is requesting all available
history (starting from <DTSTART>, if specified). Otherwise, it indicates the exclusive
date and time in history where the client expects servers to stop sending information.
In responses, the following rules apply:
<DTSTART> is the date and time where the server began looking
for information, not necessarily the date of the earliest returned information. If the
response <DTSTART> is later than the requested <DTSTART>, clients can infer
that the user has not signed on frequently enough to ensure that the client has retrieved
all information. If the user has been calling frequently enough, <DTSTART> in the
response will match <DTSTART> in the request.
<DTEND> is the date and time that, if used by the client as the
next requested <DTSTART>, it would pick up exactly where the current response left
off. It is the exclusive date and time in history where the server stopped looking
for information, based on the request <DTEND> rules.
In all cases, servers are REQUIRED to use a "system add
datetime" as the basis for deciding which details match the requested date range. For
example, if an FI posts a transaction dated Jan 3 to a user's account on Jan 5, and a
client connects on Jan 4 and again on Jan 6, the server is REQUIRED to return that
Jan 3 dated transaction to the client when it calls on Jan 6.
Usage: Bank statement download, investment statement download
There is one format for representing dates, times, and time zones. The
complete form is:
YYYYMMDDHHMMSS.XXX[gmt offset:tz name]
For example, "19961005132200.124[-5:EST]" represents October
5, 1996, at 1:22 and 124 milliseconds p.m., in Eastern Standard Time. This is the same as
6:22 p.m. Greenwich Mean Time (GMT).
Tags specified as type date and generally starting with the
letters "DT" will accept a fully formatted date-time-timezone as specified
above. They will also accept values with fields omitted from the right. They assume the
following defaults if a field is missing:
YYYYMMDD: 12:00 midnight (the start of the day), GMT
YYYYMMDDHHMMSS: GMT
YYYYMMDDHHMMSS.XXX: GMT
YYYYMMDDHHMMSS.XXX[-0500:EST]: Fully qualified
Open Financial Exchange identifies elements that require a time as
having type timestamp and their tag name will start with the prefix TS. The
timezone and milliseconds are still optional, and will default to GMT.
Take care when specifying an ending date without a time. If the last
transaction returned for a bank statement download was Jan 5 1996 10:46 a.m. and if the
<DTEND> was given as just Jan 5, the transactions on Jan 5 would be resent. If
results are only available daily, then just using dates and not times will work correctly.
NOTE: Open Financial Exchange does not require servers or
clients to use the full precision specified. However, they are REQUIRED to
accept any of these forms without complaint.
Some services extend the general notion of a date by adding
special values, such as "TODAY." These special values are called "smart
dates." Specific requests indicate when to use these extra values, and list the tag
as having a special data type.
Time Zone Issues
Several issues arise when a customer and the FI are not in the same time
zone, or when a customer moves a computer into new time zones. In addition, it is
generally unsafe to assume that computer users have correctly set their time or timezone.
Although most transactions are not sensitive to the exact time, they
often are sensitive to the date. In some cases, time zone errors lead to actions occurring
on a different date than intended by the customer. For this reason, servers should always
use a complete local time plus GMT offset in any datetime values in a response. If a
customer's request is for 5 p.m. EST, and a server in Europe responds with 1 a.m. MET the
next day, a smart client can choose to warn the customer about the date shift.
Clients that maintain local state, especially of long-lived server
objects, should be careful how they store datetime values. If a customer initiates a
repeating transaction for 5 p.m. EST, then moves to a new time zone, the customer might
have intended that the transaction remain 5 p.m. in the new local time, requiring a change
request to be sent to the server. If, however, they intended it to remain fixed in server
time, this would require a change in the local time stored in the client.
Unless otherwise noted in the specification, Open Financial Exchange
always signs amounts and quantities from the perspective of the customer. Some typically
negative amounts:
Investment buy amount, investment sell quantity
Bank statement debit amounts, checks, fees
Credit card purchases
Margin balance (unless the FI owes the client money)
Some typically positive amounts:
Investment sell amount, investment buy quantity
Bank statement credits
Credit card payments
Ledger balance (unless the account is overdrawn)
Amount: All amount-valued tags are sent with a decimal point or
comma, as in "XXXX.XX." There should not be any punctuation separating
thousands, millions, and so forth. The maximum value accepted depends on the client.
Quantity: Use decimal notation.
Unitprice: Use decimal notation. Unless specifically noted,
prices should always be positive.
Rate: Use decimal notation, with the rate specified out of 100%.
For example, 5.2 is 5.2%.
Some services define special values, such as INFLATION, which you can
use instead of a designated value. Open Financial Exchange refers to these as "smart
types," and identifies them in the specification.
Open Financial Exchange carries financial information over the Internet
in such a way to provide privacy, message integrity, and authentication for applications
at the appropriate level. Each service within Open Financial Exchange requires a certain
level of security. Online banking and payments require strong secrecy, whereas stock
quotations consist of publicly available information and consequently have a much weaker
secrecy requirement.
Some Internet protocols, such as HTTPS (which uses Secure Socket Layer
version 3, SSLv3), provide channel-level security. When the security requirement exceeds
that provided by the channel, you must use an application-level protocol.
Secure Socket Layer version 3 (SSLv3) provides channel-level security in
Open Financial Exchange. SSLv3 provides confidentiality, message integrity, and implicit
authentication. In Open Financial Exchange 1.0, channel-level security using SSLv3 is the
primary form of security.
Open Financial Exchange provides a method to exchange financial
information over public networks. This necessitates strong security facilities and careful
protocol design. The most commonly used facility, and trusted method for accomplishing
many of these goals is SSL. The following sub-sections describe the most prominent
requirements for security and how Secure Socket Layer (SSL) addresses these.
Privacy, Authentication, and Message Integrity
SSL provides a range of strong encryption methods for ensuring
confidentiality, and strong measures to ensure that messages are not altered as they
propagate over the Internet. User authentication is usually addressed at the application
layer, not within SSL. Servers are configured with public key certificates that client
application software verifies. This provides some measure of server authentication.
Testing certificate revocation lists is not commonly performed. However, as these
facilities emerge, client software will be written to support this need.
Facilities for Authorization
Open Financial Exchange messages typically provide user ID and password
so that a service provider can authenticate the user. Once a system authenticates a user,
the service provider must ensure that the user is authorized to perform the requested
actions. For example, the service provider must decide if the specified user is authorized
to perform a transfer from the specified account. The service provider must also determine
whether the user has exceeded allowed limits on withdrawals, whether the activity on this
account is unusual given past history, and other context-sensitive issues.
SSL version 3.0 provides a set of widely and commonly accepted methods
for securing Internet transactions. These common methods within SSL are called
CipherSuites. You can secure applications appropriately within SSL by specifying an
ordered sequence of preferred CipherSuites (highest preference listed first). Servers
select the strongest supported CipherSuite from the list provided by the client.
NOTE: Passing username and password pairs in a weakly
encrypted channel exposes this information to cryptographic attack. When implementing Open
Financial Exchange, use the strongest available ciphers.
You should not use the following CipherSuites because they are
vulnerable to man-in-the-middle attacks during Open Financial Exchange message exchanges:
While strong channel-level security is sufficient for the current suite
of Open Financial Exchange transactions, there are features that channel security does not
provide. These include (but are not limited to) data signing, non-repudiation, rational
certificate management and revocation, and trust proxy. Where the trust model for an
application requires such features to conduct the transaction safely, Open Financial
Exchange stipulates the use of an application-level protocol. A future implementation
guide will publish this protocol.
The standard method for providing application-level security is to rely
upon the RSA Public Key Cryptography Standard (PKCS) message format. The PKCS #7 standard
specifies a message format that is both cryptographically strong and flexible enough to
provide sufficient facilities for evolution.
RSA Public Key Cryptography Standard #7 (PKCS#7) defines a rich set of
message formats for securely exchanging information over public networks. These message
formats provide for encrypting data using a combination of cryptographic techniques to
leverage the manageability of public key cryptography. They also bring the speed of block
ciphers into a hybrid, which exploits the best properties of each.
PKCS#7 message encryption provides privacy. A digitally signed message
(or applying HMAC) ensures message integrity.
Use one of the following to define PKCS#7 messages: Data, Digitally
Signed-Data, Enveloped-Data, or Digitally Signed and Enveloped-Data (also referred to as
Sealed-Data). Open Financial Exchange can use Digested-Data, which digests application
data before it embeds data within an Enveloped-Data object. However, it should not
transmit this data over public networks without encryption applied.
Facilities for Authorization
As stated previously in section 4.2, authentication and
authorization are the responsibility of the service provider. Open Financial Exchange
messages contain the information to enable authentication and authorization decisions.
With application-level security that uses a digitally signed format, the verification of
that signature provides an additional method of authenticating the user.
Open Financial Exchange applications requiring a sophisticated trust
model require more facilities than those provided by SSL. If an Open Financial Exchange
application requires only point-to-point security, SSL version 3.0 provides adequate
facilities for message security. However, if the application requires more directed,
specific forms of security, then use the appropriate PKCS#7 message formats for the
application. An example of this might be a stock trading application issuing orders whose
values demand that the security level be high, and where Open Financial Exchange treats
the message with special handling instructions.
Recommended cryptographic techniques for Open Financial Exchange
application security are:
RC4 for bulk encryption (using 40 bits for exportable applications, 128
for North America)
RSA encryption of bulk encryption keys and digital signatures
SHA-1 as a secure hash algorithm
In the absence of digital signatures, Open Financial Exchange
applications should utilize the HMAC keyed MAC algorithm, using SHA-1 as a secure hash
function.
When you set the tags for application-layer security — which determine
whether to use PKCS#7 message format — in the FI profile, the application software uses
these facilities.
Most of the content in Open Financial Exchange is language-neutral.
However, some error messages, balance descriptions, and similar tags contain text meant to
appear to the financial institution customers. There are also cases, such as e-mail
records, where customers need to send text in other languages. To support world-wide
languages, Open Financial Exchange must identify the basic text encoding, specific
character set, and the specific language.
The outer Open Financial Exchange headers specify the encoding and
character set, as described in Chapter 2. Current encoding values are ASCII and UNICODE. For
ASCII, character set values are code pages. Unicode ignores the character set per se
although it still requires the syntax. Clients identify the language in the signon
request. Open Financial Exchange specifies languages by three-letter codes as defined in
ISO-639. Servers report their supported languages in the profile (see Chapter 7). If a
server cannot support the requested language, it must return an error and not process
the rest of the transactions.
In each transaction involving amounts, responses include a default
currency identification, <CURDEF>. The values are based on the ISO 4217 three-letter
currency identifiers.
Within each transaction, specific parts of the response might need to
report a different currency. Where appropriate, aggregates will include an optional
<CURRENCY> aggregate. The scope of a <CURRENCY> aggregate is everything within
the same aggregate that the <CURRENCY> aggregate appears in, including nested
aggregates, unless overridden by a nested <CURRENCY> aggregate. For example,
specifying a <CURRENCY> aggregate in an investment statement detail means that the
unit price, transaction total, commission, and all other amounts are in terms of the given
currency, not the default currency.
Note that there is no way for two or more individual elements that
represent amounts — and are directly part of the same aggregate — to have different
currencies. For example, there is no way in a statement download to have a different
currency for the <LEDGERBAL> and the <AVAILBAL>, because they are both
directly members of <STMTRS>. In most cases, you can use the optional <BAL>
records to overcome this limitation, which do accept individual <CURRENCY>
aggregates.
The default currency for a request is the currency of the source
account. For example, the currency for <BANKACCTFROM>.
The <CURRATE> should be the one in effect throughout the scope of
the <CURRENCY> aggregate. It is not necessarily the current rate. Note that the
<CURRATE> needs to take into account the choice of the FI for formatting of amounts
(that is, where the decimal is) in both default and overriding currency, so that a client
can do math. This can mean that the rate is adjusted by orders of magnitude (up or down)
from what is commonly reported in newspapers.
Tag
Description
<CURRENCY>or
<ORIGCURRENCY>
Currency aggregate
<CURSYM>
ISO 4217 3-letter currency identifier, A-3
<CURRATE>
Ratio of <CURDEF> currency to <CURSYM>
currency, in decimal form
</CURRENCY>or </ORIGCURRENCY>
In some cases, Open Financial Exchange will define transaction responses so that
amounts have been converted to the home currency. However, Open Financial Exchange will
allow FIs to optionally report the original amount and the original (foreign) currency. In
these cases, transactions include a specific tag for the original amount, and then a
<ORIGCURRENCY> tag to report the details of the foreign currency.
Again, <CURRENCY> means that Open Financial Exchange has not converted
amounts, whereas <ORIGCURRENCY> means that Open Financial Exchange has
already converted amounts.
Some of the tags in Open Financial Exchange have values that are
country-specific. For example, <USPRODUCTTYPE> is only useful within the United
States. Open Financial Exchange will extend in each country as needed to provide tags that
accept values useful to that country. Clients in other countries that do not know about
these tags will simply skip them.
In some cases, a tag value represents a fundamental way of identifying
something, yet there does not exist a world-wide standard for such identification.
Examples include bank accounts and securities. In these cases, it is important that Open
Financial Exchange defines a single, extensible approach for identification. For example,
CUSIPs are used within the U.S., but not in other countries. However, CUSIPs are
fundamental to relating investment securities, holdings, and transactions. Thus, a
security ID consists of a two-part aggregate: one to identify the naming scheme, and one
to provide a value. Open Financial Exchange will define valid naming schemes as necessary
for each country.
Currently, some systems provide only limited support for error recovery
and no support for backup files or multiple clients. The Open Financial Exchange data
synchronization approach described in this chapter handles all of these situations.
Open Financial Exchange defines a powerful form of data synchronization
between clients and servers.
Open Financial Exchange data synchronization addresses the following
problems:
Error recovery
Use of multiple client applications
Restoring from a backup file
Multiple data files (for example, one copy at home, another at work).
This chapter first provides a brief background of error recovery
problems and then presents the basic strategy used in Open Financial Exchange to perform
data synchronization. Each Open Financial Exchange service includes specific details for
data synchronization requests and responses.
Most of the information in this chapter concerns data synchronization,
since it is a relatively new concept. The final section in this chapter discusses
alternatives to full synchronization, and summarizes the options for each.
When a client begins a connection with a server for which the connection
does not successfully complete, there are two main problems:
Unconfirmed requests
If a client does not receive a response to work it initiates, it has no
way of knowing whether the server processed the request. It also will not have any
server-supplied information about the request, such as a server ID number.
Unsolicited data
Some banking protocols allow a server to send data to the client
whenever the client makes a connection. This specification assumes that the first client
that calls in after the unsolicited data is available for download receives the data. If
the connection fails, this information would be forever lost to the client. Examples of
unsolicited data include updates in the status of a bill payment and e-mail responses.
Unsolicited data presents problems beyond error recovery. Because the
first client that connects to a server is the only one to receive unsolicited data, this
situation precludes use of multiple clients without a data synchronization method. For
example, if a user has a computer at work and one at home, and wants to perform online
banking from both computers, a bank server could send unsolicited data to one but not the
other.
An even greater problem occurs when a user resorts to a backup copy of
the client data file. This backup file will be missing recent unsolicited data with no way
to retrieve it from the server once the server sends it.
A simple solution is to make sure that clients can always obtain
information from the server for a reasonable length of time. Clients can request recent
responses — whether due to client-initiated work or other status changes on the server — by
supplying the previous endpoint in the response history. Servers always supply a new
endpoint whenever they supply responses.
If a client connection fails — or a client receives a response, but
crashes before updating its database — the client will not save the new endpoint. On the
next synchronization request, the server sends the same information (plus any further
status changes).
If a user switches to a backup file, again the client will use the older
endpoint in the synchronization request.
If multiple clients are in use, each will send requests based on its own
endpoint, so that both clients will obtain complete information from the server. This is
one reason why Open Financial Exchange responses carry enough information from the request
to enable them to be processed on their own. The diagram below shows the relationship
between clients and servers.
Open Financial Exchange relieves the server from maintaining any special
error-recovery state information. However, Open Financial Exchange requires the server to
maintain a history of individual responses it would have sent and a way to identify a
position in the history. This ID could be a timestamp, or be based on its existing state
information.
NOTE: Open Financial Exchange does not require servers to
store these responses based on individual connections. Also, not all requests are subject
to synchronization. For example, Open Financial Exchange does not require servers to store
statement-download responses separately for data synchronization.
Open Financial Exchange does synchronization separately for each type of
response. In addition, a synchronization request might include further identifying
information, such as a specific account number. This specification defines the additional
information for each synchronization request.
Each Open Financial Exchange service will identify the specific
responses that are subject to data synchronization. For example, a bank statement-download
is a read-only operation from the server. A client can request again if it fails;
consequently, there is no special data synchronization for this type of response.
The basis for synchronization is a token as defined by the
<TOKEN> tag. The server is free to create a token in any way it wishes. The client
simply holds the token for possible use in a future synchronization request.
The server can derive a token from one of the following:
Timestamp
Sequential number
Unique non-sequential number
Other convenient values for a server
Clients will send a <TOKEN> of zero on their first synchronization
request. Servers should send all available history, allowing a new client to know about
work done by other clients. If a user's account has never been used with Open Financial
Exchange, the server returns no history.
The server can use different types of tokens for different types of
responses, if suitable for the server.
Tokens will be subject to a maximum size; see Chapter 3, "Common
Aggregates, Elements, and Data Types." Tokens need to be unique only with respect to
a specific type of synchronization request and the additional information in that request.
For example, a bill payment synchronization request takes an account number; therefore, a
token needs to be unique only within payments for a specific account.
Servers will not have infinite history available, so synchronization
responses include a <LOSTSYNC> element with a value of Y (yes) if the old token in
the synchronization request was older than available history. This tag allows clients to
alert users that some responses have been lost.
NOTE: A token is unrelated to a <TRNUID>,
<SRVRTID>, or <FITID>. Each of these serves a specific purpose, and has its
own scope and lifetime.
A <SRVRTID> is not appropriate as a <TOKEN> for bill
payment. A single payment has a single <SRVRTID>, but it can undergo several state
changes over its life and thus have several entries in the token history.
There are three different ways a client and a server can conduct their
requests and responses:
Explicit synchronization - A client can request synchronization without
sending any (other) Open Financial Exchange requests. Clients will send a specific
synchronization request, including the current token for that request. The response will
be a set of individual responses more recent than the given token, along with a new token.
Synchronization with new requests - A client can request synchronization
as part of the response to any new requests it has. It gives the old token. The response
should include responses to the new requests plus any others that became available since
the old token, along with a new token. An aggregate contains the requests so that the
server can process the new requests and update the token as an atomic action.
New requests without synchronization - A client can make new requests
without providing the old token. In this case, it expects just responses to the new
requests. A subsequent request for synchronization will cause the server to send the same
response again, because the client did not update its token.
Each request and response that requires data synchronization will define
a synchronization aggregate. The aggregate tells the server which particular kind of data
it should synchronize. By convention, these aggregates always have SYNC as part of their
tag names, for example, <PMTSYNCRQ>. You can use these aggregates on their
own to perform explicit synchronization, or as wrappers around one or more new
transactions. For example, you can use <PMTSYNCRQ> aggregates to request
synchronization in combination with new work. You can use the <PMTTRNRQ> by itself
if you do not require synchronization.
Some clients can choose to perform an explicit synchronization before
sending any new requests (with or without synchronization). This practice allows clients
to be up-to-date and possibly interact with the user before sending any new requests.
Other clients can simply send new requests as part of the synchronization request.
If a client synchronizes in one file, then sends new work inside a
synchronization request in a second file, there is a small chance that additional
responses become available between the two connections. There is an even smaller chance
that these would be conflicting requests, such as modifications to the same object.
However, some clients and some requests might require absolute control, so that the user
can be certain that they are changing known data. To support this, synchronization
requests can optionally specify <REJECTIFMISSING>. The tag tells a server that it
should reject all enclosed requests if the supplied <TOKEN> is out of date before
considering the new requests. That is, if any new responses became available, whether
related to the incoming requests or not (but part of the scope of the synchronization
request), the server should immediately reject the requests. It should still return the
new responses. A client can then try again until it finds a stable window to submit the
work. See section 6.5 for more information about conflict detection and resolution.
The password change request and response present a special problem. See
section 2.5.2 for further information.
Conflicts arise whenever two or more users or servers modify the same
data. This can happen to any object that has a <SRVRTID> that supports change or
delete requests. For example, one spouse and the other might independently modify the same
recurring bill payment model. From a server perspective, there is usually no way to
distinguish between the same user making two intended changes and two separate users
making perhaps unintended changes. Therefore, Open Financial Exchange provides enough
tools to allow clients to carefully detect and resolve conflicts. Open Financial Exchange
requires only that a server process atomically all requests in a single <OFX> block.
A careful client will always synchronize before sending any new
requests. If any responses come back that could affect a user's pending requests, the
client can ask the user whether it should still send those pending requests. Because there
is a small chance for additional server actions to occur between the initial
synchronization request and sending the user's pending requests, extremely careful clients
can use the <REJECTIFMISSING> option. Clients can iterate sending pending requests
inside a synchronization request with <REJECTIFMISSING> and testing the responses to
see if they conflict with pending requests. A client can continue to do this until a
window of time exists wherein the client is the only agent trying to modify the server. In
reality, this will almost always succeed on the first try.
There are some situations, and some types of clients, where it is
preferable for a client to ask a server to send everything it knows, rather than just
receive a set of changes. One such situation is a client that has not connected
often enough and has lost synchronization. An example of "type" might be a
completely stateless client, such as a web browser. This choice is made during client
implementation. Open Financial Exchange does not require a client to refresh just because
it has lost synchronization.
Clients will mainly want to refresh lists of long-lived objects on the
server; generally objects with a <SRVRTID>. For example, Open Financial Exchange
Payments has both individual payments and models of recurring payments.
A brand new client, or a client that lost synchronization, might want to
learn about in-progress payments by doing a synchronization refresh of the payment
requests. It would almost certainly want to do a synchronization refresh of the recurring
payment models, because these often live for months or years.
A client might not perform a synchronization refresh on e-mail
responses.
A client can request a refresh by using the <REFRESH> tag with
value of Y instead of the <TOKEN> tag. Server descriptions detail the exact behavior
that servers should follow. However, the general rule is that servers should send
responses that emulate a client creating or adding each of the objects governed by the
particular synchronization request.
In these cases, you can set <TRNUID> to zero; the standard value
for server-generated responses.
There is no need to recreate a stream of responses that emulate the
entire history of the object, just an add response that reflects the current state. For
example, if you create a model and then modify it three times, even if this history would
have been available for a regular synchronization, servers should only send a single add
that reflects the current state.
A client that just wants the current token, without refresh or
synchronization, makes requests with <TOKENONLY> and a value of Y.
In all cases, servers should send the current ending <TOKEN> for
the synchronization request in refresh responses. This allows a client to perform regular
synchronization requests in the future.
The following table summarizes the options in a client synchronization
request:
Tag
Description
<TOKEN>
Previous value of <TOKEN> received for this type of
synchronization request from server; 0 for first-time requests; token
<TOKENONLY>
Request for just the current <TOKEN> without the
history, Boolean
<REFRESH>
Request for refresh of current state, Boolean
<REJECTIFMISSING>
If Y, do not process requests if client <TOKEN> is
out of date, Boolean
NOTE: Open Financial Exchange requires exactly one of <TOKEN>,
<TOKENONLY>, or <REFRESH>.
This section describes how an FI can approach supporting synchronization
based on the assumption that modifications to an existing financial server will be kept to
a minimum.
The simplest approach is to create a history database separate from the
existing server. This history could consist of the actual Open Financial Exchange
transaction responses (<TRNRS> aggregates) that are available to a synchronization
request. The history database could index records by token, response type, and any other
identifying information for that type, such as account number.
The diagram below shows a high-level model of the Open Financial
Exchange architecture for a financial institution. Notice that the diagram shows the
presence of a history journal.
The server adds responses to the history journal for any action that
takes place on the existing server. This is true whether the Open Financial Exchange
requests initiate the action or, in the case of recurring payments, it happens
automatically on the server. Once added to the history journal, the server can forget
them.
The areas of the Open Financial Exchange server that process
synchronization requests need only search this history database for matching responses
that are more recent than the incoming token.
For a refresh request, an Open Financial Exchange server would access
the actual bank server to obtain the current state rather than recent history.
Periodically the bank server would purge the history server of older
entries.
Only requests that are subject to synchronization need to have entries
in the history database. Statement downloads do not involve synchronization; therefore,
the FI server should not add these responses to the history database. Since statement
downloads are usually the largest in space and the most frequent, eliminating these saves
much of the space a response history might otherwise require.
More sophisticated implementations can save even more space. The history
database could save responses in a coded binary form that is more compact than the full
Open Financial Exchange response format. Some FIs might have much or all of the necessary
data already in their servers; consequently, they would not require new data. An FI could
regenerate synchronization responses rather than recall them from a database.
The diagram below shows a general flowchart of what an Open Financial
Exchange client would do with the results of a synchronization request. Most requests and
responses subject to data synchronization contain both <TRNUID> and <SRVRTID>.
It is increasingly common that a server can get simultaneous or
overlapping requests from the same user over two different computers. Open Financial
Exchange requires a server to process each set of requests sent in a file as an atomic
action. Servers can deal with the problems that arise with simultaneous use in two ways:
Allow simultaneous connections, ensure each is processed atomically, and
use the data synchronization mechanism to bring the two clients up to date. This is the
preferred method.
Lock out all but one user at a time, returning the error code for
multiple users.
Although it is RECOMMENDED that Open Financial Exchange servers
implement full synchronization as described in this chapter, an alternate approach,
"lite synchronization," could be easier for some servers to support. This
approach focuses only on error recovery and does not provide any support for multiple
clients, multiple data files, or use of backup files. The approach is to preserve the
message sets while simplifying the implementation.
In addition, some clients might prefer to use response-file based error
recovery with all servers, even if the client and some server both support full
synchronization. This section first describes lite synchronization, and then explains the
rules that clients and servers use to decide how to communicate.
Lite synchronization requires servers to accept all synchronization
messages, but does not require them to keep any history or tokens. Responses need only be
sent once and then the server can forget them. Responses to client requests, whether or
not they are made inside a synchronization request, are processed normally. Responses that
represent server-initiated work, such as payment responses that arise from recurring
payments, are sent only in response to synchronization requests. A server does not have to
hold responses in case a second client makes a synchronization request.
Because full synchronization supports error recovery, an alternative is
needed for lite synchronization. Servers using lite synchronization keep a copy of the
entire response file they last sent. Clients requesting that servers prepare for error
recovery generate a globally unique ID for each file they send. In the OFX headers, there
are two tags associated with error recovery:
OLDFILEUID - UID of the last request and response that was successfully
received and processed by the client
NEWFILEUID - UID of the current file
The format of these is the same as used with <TRNUID> as
documented in Chapter 3.
Servers use the following rules:
If these tags are absent, the client is not requesting error recovery
protection for this file. The server does not need to save a copy of the response.
If the NEWFILEUID matches a file saved on the server, then the client is
in error recovery. The server must ignore the new requests in this file and instead send
back the matching saved response file.
If the OLDFILEUID matches a file saved on the server, then OLDFILEUID is
a file that the client has successfully processed and the server can delete it. The client
is also requesting that the response for the current file be saved under the NEWFILEUID
for possible error recovery.
A server will never need to save more than one file per client data
file, but because of possible multi-client or multi-datafile usage, it might need to save
several files for a given user. A server should save as long as possible, but not
indefinitely. A server cannot recognize an error recovery attempt if it comes after it has
purged the error recovery file. A server would process it as a new request. In this case,
a server should recognize duplicate transaction UIDs for client-initiated work, such as
payments, and then reject them individually. Server-generated responses would be lost to
the client.
For a server accustomed to sending unsolicited responses, lite
synchronization should closely match the current response-file based implementation. The
only difference is that a server should hold the unsolicited responses until the client
makes the first appropriate synchronization request, rather than automatically adding them
to any response file. Once added, the server can mark them as delivered, relying on error
recovery to ensure actual delivery.
Client and server developers should first decide whether they will
support full synchronization or not. If they can, then they can support response-file
error recovery as well, or they can rely on synchronization to perform error recovery. If
they adopt only lite synchronization, Open Financial Exchange requires response-file error
recovery. A server describes each of these choices in its server profile records. The
following combinations are valid:
Full synchronization with response-file error recovery
Full synchronization without separate response-file error recovery
Lite synchronization with response-file error recovery
Clients request response-file error recovery by including the old and
new session UIDs in the header. If they are absent, servers need not save the response
file for error recovery. Clients request synchronization by using those synchronization
requests defined throughout this specification.
Here is an example of full synchronization using bill payment as the
service. Open Financial Exchange Payments provides two different synchronization requests
and responses, each with their own token; one for payment requests and one for repeating
payment model requests. See Chapter 12 for full details.
These simplified examples are shown without the outer <OFX> layer,
<SIGNON>, and so forth. Client A requests synchronization:
<PMTSYNCRQ>
<TOKEN>123
<BANKACCTFROM>
<BANKID>121000248
<ACCTID>123456789
<ACCTTYPE>CHECKING
</BANKACCTFROM>
</PMTSYNCRQ>
The server sends in response:
<PMTSYNCRS>
<TOKEN>125
<LOSTSYNC>N
<BANKACCTFROM>
<BANKID>121000248
<ACCTID>123456789
<ACCTTYPE>CHECKING
</BANKACCTFROM>
<PMTTRNRS>
<STATUS>
... status details
</STATUS>
<TRNUID>123
<PMTRS>
... details on a payment response
</PMTRS>
</PMTTRNRS>
<PMTTRNRS>
<STATUS>
... status details
</STATUS>
<TRNUID>546
<PMTRS>
... details on another payment response
</PMTRS>
</PMTTRNRS>
</PMTSYNCRS>
Client A was missing two payment responses, which the server provides.
At this point, client A is synchronized with the server. Client A now makes a new payment
request, and includes a synchronization update as part of the request. This update avoids
having to re-synchronize the expected response at a later time.
<PMTSYNCRQ>
<TOKEN>125
<BANKACCTFROM>
<BANKID>121000248
<ACCTID>123456789
<ACCTTYPE>CHECKING
</BANKACCTFROM>
<PMTTRNRQ>
<TRNUID>12345
<PMTRQ>
... details of a new payment request
</PMTRQ>
</PMTTRNRQ>
</PMTSYNCRQ>
The response to this new request:
<PMTSYNCRS>
<TOKEN>126
<LOSTSYNC>N
<BANKACCTFROM>
<BANKID>121000248
<ACCTID>123456789
<ACCTTYPE>CHECKING
</BANKACCTFROM>
<PMTTRNRS>
... details on a payment response to the new request
</PMTTRNRS>
</PMTSYNCRS>
The client now knows that the server has processed the payments request
it just made, and that nothing else has happened on the server since it last synchronized
with the server.
Assume client B was synchronized with respect to payments for this
account up through token 125. If it called in now and synchronized - with or without making
additional requests - it would pick up the payment response associated with token 126. It
records the same information that was in client A, which would give both clients a
complete picture of payment status.
Open Financial Exchange clients use the profile to learn the
capabilities of an Open Financial Exchange server. This information includes general
properties such as account types supported, user password requirements, specific messages
supported, and how the client should batch requests and where to send the requests. A
client obtains a portion of the profile when a user first selects an FI. The client
obtains the remaining information prior to sending any actual requests to that FI. The
server uses a timestamp to indicate whether the server has updated the profile, and the
client checks periodically to see if it should obtain a new profile.
In more detail, a profile response contains the following sections,
which a client can request independently:
Message Sets - list of services and any general attributes of those
services. Message sets are collections of messages that are related functionally. They are
generally subsets of what users see as a service.
Signon realms - FIs can require different signons (user ID and/or
password) for different message sets. Because there can only be one signon per <OFX>
block, a client needs to know which signon the server requires and then provide the right
signon for the right batch of messages.
The profile message is itself a message set. In files, Open Financial
Exchange uses the <PROFMSGSV1> aggregate to identify this profile message set.
The following sections describe the general use of profile information.
A message set is a collection of related messages. For example, Chapter
11, "Banking," defines several message sets: statement download, credit card
statement download, intrabank transfers, and so forth. A server routes all of the messages
in a message set to a single URL and merges their versions together.
Clients and servers generally use message sets as the granularity to
decide what functionality they will support. A "banking" server can choose to
support the statement download and intrabank transfer message sets, but not the wire
transfer message set. Attributes are available in many cases to further define how Open
Financial Exchange supports a message set.
Each portion of the Open Financial Exchange specification that defines
messages also defines the message set to which the messages belong. This includes
what additional attributes are available for those messages, and whether Open Financial
Exchange requires the message set or it is optional.
Message sets are the basis of version control. Over time there will be
new versions of the message sets, and at any given time servers will likely want to
support more than one version of a message set. Clients should also be capable of
supporting as many versions as possible. Through the profile, clients discover which
versions are supported for each message set. Considering the client capabilities, it
exchanges messages at the highest common level for each message set.
For the Open Financial Exchange-SGML data format, there is a single DTD
for all message sets. Its version advances when any syntactic change is made to any
of the message sets. (It is possible to make a semantic change that would not even
require a change in syntax. A change in rules, for example, would change the version
of the message set without changing the DTD.) A single DTD cannot have two different
definitions for the same aggregate. There are limitations to how a server that uses true
DTD-based parsing can handle multiple versions of a message at the same time.
To allow FIs to set up different servers for different message sets,
different versions, or to directly route some messages to third party processors, message
sets define the URL to which a server sends messages in that message set. Each version of
a message set can have a different URL. In the common case where many or all message sets
are sent to a single URL, clients will consolidate messages across compatible message
sets. Clients can consolidate when:
A profile request indicates which profile components a client desires.
It also indicates what the client's routing capability is. Profiles returned by the FI
must be compatible with the requested routing style, or it returns an error.
Profile requests are not subject to synchronization. Use the
<PROFTRNRQ> transaction tag.
Tag
Description
<PROFRQ>
Profile-request aggregate
<CLIENTROUTING>
Identifies client routing capabilities, see table below
<DTPROFUP>
Date and time client last received a profile update
</PROFRQ>
Tag
Description
NONE
Client cannot perform any routing. All URLs must be the
same. All message sets share a single signon realm.
SERVICE
Client can perform limited routing. See details below.
MSGSET
Client can route at the message-set level. Each message set
may have a different URL and/or signon realm.
The intent of the SERVICE option for client routing is to support clients that can
route bill payment messages to a separate URL from the rest of the messages. Because the
exact mapping of message sets to the general concept of bill payment can vary by client
and by locale, this specification does not provide precise rules for the SERVICE option.
Each client will define its requirements.
An aggregate describes each message set supported by an FI. Message sets
in turn contain an aggregate for each version of the message set that is supported. For a
message set named XXX, the convention is to name the outer aggregate <XXXMSGSET>
and the tag for each version <XXXMSGSETVn>. The reason for message
set-specific aggregates is that the set of attributes depends on the message set. These
can change from version to version, so there are version-specific aggregates as well.
The general form of the response is:
Tag
Description
<XXXMSGSET>
Service aggregate
<XXXMSGSETVn>
Version-of-message-set aggregate, 1 or more
</XXXMSGSETVn>
</XXXMSGSET>
The <XXXMSGSETVn> aggregate has the following form:
Tag
Description
<XXXMSGSETVn>
Message-set-version aggregate
<MSGSETCORE>
Common message set information
</MSGSETCORE>
message-set specific
Zero or more attributes specific to this version of this
message set, as defined by each message set
</XXXMSGSETVn>
The common message set information <MSGSETCORE> is as follows:
Tag
Description
<MSGSETCORE>
Common-message-set-information aggregate
<VER>
Version number, N-5 (version 1.0 formatted as 100)
<URL>
URL where messages in this set are to be sent
<OFXSEC>
Security level required for this message set; see Chapter 4
<TRANSPSEC>
Y if transport security must be used, N if not used; Boolean
A signon realm identifies a set of messages that can be accessed using
the same password. Realms are used to disassociate signons from specific services,
allowing FIs to require different signons for different message sets. In practice, FIs
will want to use the absolute minimum number of realms possible to reduce the user's
workload.
The profile message set functions the same way as all other message
sets; therefore, it contains a profile description for that message set. Because
<PROFMSGSET> is always part of a message set response, it is described here. Servers
that support profile information must include the <PROFMSGSET> as part of the
profile response <MSGSETLIST>. There are no attributes, but the aggregate must be
present to indicate support for the message set.
Tag
Description
<PROFMSGSET>
Message-set-profile-information aggregate
<PROFMSGSETV1>
Opening tag for V1 of the message set profile information
The Signup message set defines three messages to help users get set up
with their FI:
Enrollment - informs FI that a user wants to use Open Financial Exchange
and requests that a password be returned
Accounts - asks the FI to return a list of accounts, and the services
supported for each account
Activation - allows a client to tell the FI which services a user wants
on each account
There is also a message to request name and address changes.
Clients use the account information request on a regular basis to look
for changes in a user's account information. A timestamp is part of the request so that a
server has only to report new changes. Account activation requests are subject to data
synchronization, and will allow multiple clients to learn how the other clients have been
enabled.
In Open Financial Exchange files, the <SIGNUPMSGSV1> aggregate
identifies the Signup message.
The message sets in this chapter are designed to allow both FIs and
clients to support a variety of sign-up procedures. There are four basic steps a user
needs to go through to complete the sign-up:
Select the FI. Open Financial Exchange does not define this step
or provide message sets to support it. Client developers and FIs can let a user browse or
search this information on a web site, or might define additional message sets to do this
within the client. At the conclusion of this step, the client will have some minimal
profile information about the FI, including the set of services supported and the URL to
use for the next step.
Enrollment and password acquisition. In this step, the user
identifies and authenticates itself to the FI without a password. In return, the
user obtains a password (possibly temporary) to use with Open Financial Exchange. FIs can
perform this entire step over the telephone, through a combination of telephone requests
and a mailed response, or at the FI web site. FIs can also use the Open Financial Exchange
enrollment message to do this by means of the client. The response can contain a temporary
password or users can wait for a mailed welcome letter containing the password.
Account Information. In this step, the user obtains a list of
accounts available for use with Open Financial Exchange, and which specific services are
available for each account. Even if users have enrolled over the telephone, clients will
still use this message set to help users properly set up the accounts within the client.
Clients periodically check back with the FI for updates.
Service Activation. The last step is to activate specific services
on specific accounts. The activation messages support this step. Synchronization is
applied to these messages to ensure that other clients are aware of activated services.
The combination of media-interface through which an FI accomplishes
these steps can vary. FIs might wish to do steps two through four over the telephone.
Clients will still use Open Financial Exchange messages in steps 3 and 4 to automatically
set up the client based on the choices made by the user over the phone. Other FIs might
wish to have the entire user experience occur within the client. Either way, the Open
Financial Exchange sign-up messages support the process.
To support the widest possible set of FIs, Open Financial Exchange
assumes that individual users and accounts are in a many-to-many relationship. Consider a
household with three accounts:
Checking 1 - held individually by one spouse
Checking 2 - held jointly by both
Checking 3 - held individually by the other spouse
Checking 2 should be available to either spouse, and the spouse holding
Checking 1 should be able to see both Checking 1 and 2.
Open Financial Exchange expects FIs to give each user their own user ID
and password. Each user will go through the enrollment step separately. A given account
need only be activated once for a service; not once for each user. Clients will use the
account information and activation messages to combine information about jointly-held
accounts.
If an FI prefers to have a single user ID and password per household or
per master account, they will have to make this clear to users through the enrollment
process. It is up to the FI to assign a single user ID and password that can access all
three of the checking accounts described above.
The main purpose of the enrollment message is to communicate a user's
intent to access the FI by way of Open Financial Exchange and to acquire a password for
future use with Open Financial Exchange. Some FIs might return a user ID and an initial
password in the enrollment response, while others will send them by way of regular mail.
NOTE: Because the server does not know the user ID and
password when the client sends the enrollment request, the <SONRQ> will not contain
a valid user ID or password. The enrollment message accepts standard user identification
information.
Enrollment requests are not subject to synchronization. If the client
does not receive a response, it will simply re-request the enrollment. If a user
successfully enrolls from another client before the first client obtains a response, the
server should respond to subsequent requests from the first client with status code:
The Open Financial Exchange <SONRQ> requires a user ID to uniquely
identify a user to an FI. Many FIs in the United States use social security numbers (SSNs)
as the ID. Others create IDs that are unrelated to the users' SSNs. FIs can have
existing user IDs that they use for other online activities that they wish to use for Open
Financial Exchange as well. They might also create new IDs specifically for Open Financial
Exchange. Finally, some FIs might assign IDs while others might allow users to create
them.
Because users do not usually know either their Open Financial Exchange
sign-on user ID or their password at time of enrollment, the enrollment response is
designed to return both. The enrollment request allows users to optionally provide a user
ID, which an FI can interpret as their existing online ID or a suggestion for what their
new user ID should be. It is recommended that the enrollment process explains ID syntax to
users.
The enrollment request captures enough information to identify and
authenticate a user as being legitimate and as having a relationship with the FI.
FIs might require that an account number be entered as part of the
identification process. However, this is discouraged since the account information request
is designed to automatically obtain all account information, avoiding the effort and
potential mistakes of a user-supplied account number.
It is RECOMMENDED that FIs provide detailed specifications for
IDs and passwords along with information about the services available when a user is
choosing an FI.
Tag
Description
<ENROLLRQ>
Enrollment-request aggregate
<FIRSTNAME>
First name of user
<MIDDLENAME>
Middle name of user
<LASTNAME>
Last name of user
<ADDR1>
Address line 1
<ADDR2>
Address line 2
<ADDR3>
Address line 3
<CITY>
City
<STATE>
State or province
<POSTALCODE>
Postal code
<COUNTRY>
3-letter country code from ISO/DIS-3166
<DAYPHONE>
Daytime telephone number
<EVEPHONE>
Evening telephone number
<EMAIL>
Electronic e-mail address
<USERID>
Actual user ID if already known, or preferred user ID if
user can pick
<TAXID>
ID used for tax purposes (such as SSN), may be same as user
ID
<SECURITYNAME>
Mother's maiden name or equivalent
<DATEBIRTH>
Date of birth
<ACCTFROM>
An account description aggregate for one existing account
at the FI, for identification purposes only. Can be <BANKACCTFROM>,
<INVACCTFROM>, etc.
</ACCTFROM>
</ENROLLRQ>
This enrollment request is intended for use only by individuals. Business enrollment
will be defined in a later release.
The main purpose of the enrollment response is to acknowledge the
request. In those cases where FIs permit delivery of an ID and a temporary password, the
response also provides for this. Otherwise the server will send the real response to the
user by way of regular mail, electronic mail, or over the telephone. If enrollment is
successful, but the server does not return the ID and password in the response, a server
is REQUIRED to use status code 10 and provide some information to the user by means of the
<MESSAGE> element in the <STATUS> aggregate about what to expect next.
Tag
Description
<ENROLLRS>
Enrollment-response aggregate
<TEMPPASS>
Temporary password
<USERID>
User ID
<DTEXPIRE>
Time the temporary password expires (if <TEMPPASS>
included)
Account information requests ask a server to identify and describe all
of the accounts accessible by the signed-on user. The definition of "all" is up to
the FI. At a minimum, it is RECOMMENDED that a server include information about all
accounts that it can activate for one or more Open Financial Exchange services. To give
the user a complete picture of his relationship with an FI, FIs can give information on
other accounts, even if those accounts are available only for limited Open Financial
Exchange services.
Some service providers will not have any prior knowledge of any user
account information. The profile allows these servers to report this, and clients will
then know to ask users for account information rather than reading it from the server.
Clients can perform several tasks for users with this account
information. First, the information helps a client set up a user for online services by
giving it a precise list of its account information and available services for each.
Clients can set up their own internal state as well as prepare service activation requests
with no further typing by users. This can eliminate data entry mistakes in account
numbers, routing transit numbers, and so forth.
Second, FIs can provide limited information on accounts that would not
ordinarily be suitable to Open Financial Exchange services. For example, a balance-only
statement download would be useful for certificates of deposits even though a customer or
an FI might not want or allow CDs to be used for full statement download.
For each account, there is one <ACCTINFO> aggregate returned. The
aggregate includes one service-specific account information aggregate for each available
service on that account. That, in turn, provides the service-specific account
identification. Common to each service-specific account information aggregate is the
<SVCSTATUS> tag, which indicates the status of this service on this account.
A server should return joint accounts (accounts for which more than one
user ID can be used to access the account) for either user. Clients that wish to have a
unified view will aggregate the results and remove duplicates before making specific
requests involving joint accounts.
Requests and responses include a <DTACCTUP> element. Responses
contain the last time a server updated the information. Clients can OPTIONALLY send
this in a subsequent request, and servers are REQUIRED to compare this to the
current modification time and only send information if it is more recent. The server sends
the entire account information response if the client's time is older; there is no attempt
to incrementally update specific account information.
URL to request the logo for the account (actual logos
should be included via multi-part MIME in the response file if requested), URL
<XXXACCTINFO>
Service-specific account information, defined in each
service chapter, one or more allowed
<XXXACCTFROM>
Service-specific account identification
</XXXACCTFROM>
<SVCSTATUS>
AVAIL = Available, but not yet requested
PEND = Requested, but not yet available
ACTIVE = In use
</XXXACCTINFO>
</ACCTINFO>
NOTE: A server uses the <DESC> field to convey the FI's preferred name
for the account, such as "PowerChecking." It should not include the account
number.
Clients inform FIs that they wish to start, modify, or terminate a
service for an account by sending service activation requests. These are subject to data
synchronization, and servers should send responses to inform clients of any changes, even
if the changes originated on the server.
Clients use these records during the initial user sign-up process. Once
a client learns about the available accounts and services (by using the account
information request above, or by having a user directly enter the required information),
it sends a series of service ADD requests.
If a user changes any of the identifying information about an account,
the client sends a service activation request containing both the old and the new account
information. Servers should interpret this as a change in the account, not a request to
transfer the service between two existing accounts, and all account-based information such
as synchronization tokens should continue. If a user or FI is reporting that service
should be moved between two existing accounts, service must be terminated for the old
account and started for the new account. The new account will have reset token histories,
as with any new service.
Each service to be added, changed, or removed is contained in its own
request because the same real-world account might require different <ACCTFROM>
aggregates depending on the type of service.
Service activation requests are subject to the standard data
synchronization protocol. The scope of these requests and the <TOKEN> is the
user-ID. The request and response tags are <ACCTSYNCRQ> and <ACCTSYNCRS>.
Users may request that an FI update the official name, address, phone,
and e-mail information using the <CHGUSERINFORQ>. Only the fields that should be
changed are sent. The response reports all of the current values. For security reasons,
some of the fields in the <ENROLLRQ> cannot be changed online, such as tax ID.
The transaction tags are <CHGUSERINFOTRNRQ> and
<CHGUSERINFOTRNRS>. These messages are subject to synchronization, using
<CHGUSERINFOSYNCRQ> and <CHGUSERINFOSYNCRS>.
A server must include the following aggregates as part of the profile
<MSGSETLIST> response, since every server must support at least the account
information and service activation messages. In the <ENROLLPROF> aggregate, servers
indicate how enrollment should proceed: via the client, a given web page, or a text
message directing users to some other method (such as a phone call).
Tag
Description
<SIGNUPMSGSET>
Signup-message-set-profile-information aggregate
<SIGNUPMSGSETV1>
Opening tag for V1 of the message set profile information
<MSGSETCORE>
Common message set information, defined in the profile
chapter
</MSGSETCORE>
Enrollment options - only one of <CLIENTENROLL>,
<WEBENROLL>, or <OTHERENROLL> is allowed
<CLIENTENROLL>
Client-based enrollment supported
<ACCTREQUIRED>
Y if account number is required as part of enrollment Boolean
</CLIENTENROLL>
<WEBENROLL>
Web-based enrollment supported
<URL>
URL to start enrollment process
</WEBENROLL>
<OTHERENROLL>
Some other enrollment process
<MESSAGE>
Message to give to consumer about what to do next (e.g. a
phone number) A-80
</OTHERENROLL>
<CHGUSERINFO>
Y if server supports client-based user information changes
<AVAILACCTS>
Y if server can provide information on accounts with
SVCSTATUS available, N means client should expect to ask user for specific account
information Boolean
The e-mail message set includes two messages: generic e-mail and generic
MIME requests by way of URLs. In Open Financial Exchange files, the message set name is
EMAILMSGSV1.
Open Financial Exchange allows consumers and FIs to exchange messages.
The message body is in HTML so that FIs can provide some graphic structure to the message.
Keep in mind that, as with regular World Wide Web browsing, an Open Financial Exchange
client might not support some or all of the HTML formatting, so the text of the message
must be clear on its own. Clients can request that graphics (the images referenced in an
<IMG> tag) be sent as part of the response file, or clients can separately request
those elements. If a server sends images, it should use the standard procedure for
incorporating external data as described in Chapter 2. Servers are not required to support
HTML or to send images, even if the client asks.
A user or an FI can originate a message. E-mail messages are subject to
data synchronization so that a server can send a response again if it is lost or if it is
used by multiple clients.
Because e-mail messages cannot be replied to immediately, the response
should just echo back the original message (so that data synchronization will get this
original e-mail message to other clients). When the FI is ready to reply, it should
generate an unsolicited response (<TRNUID>0) and the client will pick this up during
synchronization.
Several services with Open Financial Exchange define e-mail requests and
responses that contain additional information specific to that service. To simplify
implementation for both clients and servers, this section defines a <MAIL> aggregate
that Open Financial Exchange uses in all e-mail requests and responses. For regular
e-mail, the only additional information is an account from aggregate and whether to
include images in the e-mail response or not.
Body of message, HTML-encoded or plain text depending on
<USEHTML>, A-10000
</MSGBODY>
End of message
<INCIMAGES>
Include images in response, Boolean
<USEHTML>
Y if client wants an HTML response, N if client wants plain
text, Boolean
</MAIL>
If using HTML for the message body, clients and servers are REQUIRED to wrap the
desired HTML in an SGML marked section to protect the HTML markup: <![ CDATA [ ... html
... ]]>. See the example.
E-mail is subject to synchronization. The transaction tag is
<MAILTRNRQ> / <MAILTRNRS> and the synchronization tag is <MAILSYNCRQ> /
<MAILSYNCRS>.
Tag
Description
<MAILRQ>
E-mail-message-request aggregate
<MAIL>
Core e-mail aggregate
</MAIL>
</MAILRQ>
In a response, the <TRNUID> is zero if this is an unsolicited message. Otherwise,
it should contain the <TRNUID> of the user's original message. It is RECOMMENDED
that servers include the <MESSAGE> of the user's message as part of the reply
<MESSAGE>. The <MESSAGE> contents can include carriage returns to identify
desired line breaks.
Open Financial Exchange uses data synchronization to collect responses
that could have been lost due to communication problems, or that the servers previously
sent to a different client or data file. All messages sent to the signed-on user ID are
covered by a single <TOKEN>. Note that this synchronization action expects only the
basic <MAILRS> responses. Specialized e-mail is received by means of their own
synchronization requests.
Tag
Description
<MAILSYNCRQ>
E-mail-synchronization-request aggregate
<TOKEN>
Client history marker
<INCIMAGES>
Include images in response, Boolean
<USEHTML>
Y if client wants an HTML response, N if client wants plain
text, Boolean
In this example, a consumer requests information from customer service
about the checking statement just downloaded. This example omits the <OFX> top level
and the signon <SONRQ>. This example uses HTML for the message body, and so it must
protect the HTML content in an SGML CDATA marked section. The request:
<MAILTRNRQ>
<TRNUID>54321
<MAILRQ>
<MAIL>
<USERID>123456789
<FROM>James Hackleman
<TO>Noelani Federal Savings
<SUBJECT>What do I need to earn interest?
<DTCREATED>19960305
<MSGBODY><![ CDATA [<HTML><BODY>I didn't earn any interest this month. Can you please tell me what I need to do to earn interest on this account?</BODY></HTML>
]]></MSGBODY>
<INCIMAGES>N
<USEHTML>Y
</MAIL>
</MAILRQ>
</MAILTRNRQ>The response from the FI:
<MAILTRNRS>
<TRNUID>54321
<STATUS>
<CODE>0
<SEVERITY>INFO
</STATUS>
<MAILRS>
<MAIL>
<USERID>123456789
<DTCREATED>19960307
<FROM>Noelani Federal Savings
<TO>James Hackleman
<SUBJECT>Re: What do I need to earn interest?
<MSGBODY>><![ CDATA [<HTML><BODY>You need to maintain $1000 in this account to earn interest. Because your balance was only $750 this month, no interest was earned. You could also switch to our new Checking Extra plan that always pays interest. Call us or check our web page http://www.fi.com/check-plans.html for more information.
Sincerely,
Customer Service Department
Original message:
I didn't earn any interest this month. Can you please tell me what I need to do to earn interest on this account?</BODY></HTML>
]]></MSGBODY>
<INCIMAGES>N
<USEHTML>Y
</MAIL>
</MAILRS>
</MAILTRNRS>
Example of Synchronization Involving E-Mail
In the following example the client did not receive the reply to the
message sent in the previous example, so its <TOKEN> is one less than the server's.
The server replies by giving the current <TOKEN> and the missed response.
<MAILSYNCRQ>
<TOKEN>101
</MAILSYNCRQ>
<MAILSYNCRS>
<TOKEN>102
<MAILTRNRS>
<TRNUID>54321
<STATUS>
<CODE>0
<SEVERITY>INFO
</STATUS>
<MAILRS>
... contents of e-mail message response as shown in previous example
</MAILRS>
</MAILTRNRS>
</MAILSYNCRS>
Some responses contain values that are URLs, intended to be separately
fetched by clients if desired. Clients can use their own HTTP libraries to perform this
fetch outside of the Open Financial Exchange specification. However, to insulate clients
against changes in transport technology, and to allow for fetches that require the
protection of an authenticated signon by a specific user, Open Financial Exchange defines
a transaction roughly equivalent to an HTTP Get. Any MIME type can be retrieved, including
images as well as HTML pages.
The following table lists the components of a request:
Tag
Description
<GETMIMERQ>
Get-MIME-request aggregate
<URL>
URL, URL
</GETMIMERQ>
The response simply echoes back the URL. The actual response, whether HTML, an image,
or some other type, is always sent as a separate part of the file using multi-part MIME.
If either or both of the messages in the e-mail message set are
supported, the following aggregate must be included in the profile <MSGSETLIST>
response.
Tag
Description
<EMAILMSGSET>
E-mail-message-set-profile-information aggregate
<EMAILMSGSETV1>
Opening tag for V1 of the message set profile information
<MSGSETCORE>
Common message set information, defined in the profile
chapter
Open Financial Exchange enables users to automate transactions that
occur on a regular basis. Recurring transactions are useful when a customer has payments
or transfers, for example, that repeat at regular intervals. The customer can create a
"model" at the server for automatic generation of these instructions. The model
in turn creates payments or transfers until it is canceled or expires. After the user
creates a recurring model at the server, the server can relieve the user from the burden
of creating these transactions; it generates the transactions on its own, based on the
operating parameters of the model.
The client must provide the following information to create a model:
Type of transaction generated by the model (payment or transfer)
Frequency of recurring transaction
Total number of recurring transactions to generate
Service-specific information, such as transfer date, payment amount,
payee address
The model creates each transaction some time before its due date,
usually thirty days. This allows the user to retrieve the transactions in advance of
posting. This also gives the user the opportunity to modify or cancel individual
transactions without changing the recurring model itself.
When a model is created, it can generate several transactions
immediately. The model does not automatically return responses for the newly created
transactions. It only returns a response to the request that was made to create the model.
For this reason, clients should send a synchronization request along with the request to
create a model. This allows the server to return the newly created transaction responses,
as well as the response to the request to set up a new model.
The Recurring Instructions aggregate is used to specify the schedule for
a repeating instruction. It is passed to the server when a recurring transfer or payment
model is first created.
Tag
Description
<RECURRINST>
Recurring-Instructions aggregate
<FREQ>
Frequency, see section 10.2.1
<NINSTS>
Number of instructions
If this tag
is absent, the schedule is open-ended, N-3
The following example illustrates the creation of a repeating payment.
The payment repeats on a monthly basis for 12 months. All payments are for $395. The
request:
.
.
.
<RECPMTRQ>
<RECURRINST>
<FREQ>MONTHLY
<NINSTS>12
</RECURRINST>
<PMTINFO>
<BANKACCTFROM>
<BANKID>555432180
<ACCTID>763984
<ACCTTYPE>CHECKING
</BANKACCTFROM>
<TRNAMT>395.00
<PAYEEID>77810
<PAYACCT>444-78-97572
<DTDUE>19971115
<MEMO>Auto loan payment
</PMTINFO>
</RECPMTRQ>
.
.
.
The response includes the <RECSRVRTID>
that the client can use
to cancel or modify the model:
.
.
.
<RECPMTRS>
<RECSRVRTID>387687138
<RECURRINST>
<FREQ>MONTHLY
<NINSTS>12
</RECURRINST>
<PMTINFO>
<BANKACCTFROM>
<BANKID>555432180
<ACCTID>763984
<ACCTTYPE>CHECKING
</BANKACCTFROM>
<TRNAMT>395.00
<PAYEEID>77810
<PAYACCT>444-78-97572
<DTDUE>19971115
<MEMO>Auto loan payment
</PMTINFO>
</RECPMTRS>
.
.
.
Once created, a recurring model independently generates instructions.
Since the client has not directly generated these transactions, the client has no record
of their creation. To enable users to modify and/or cancel pending instructions, the
client must use data synchronization in order to retrieve these transactions.
The client has two purposes for synchronizing state with the server with
respect to recurring models:
Retrieve any added, modified, or canceled recurring models
Retrieve any added, modified, or canceled transactions generated by any
models
The client must be able to synchronize with the state of any models at
the server, as well as the state of any transactions generated by the server.
Once created and retrieved by the customer, recurring payments and
transfers are almost identical to customer-created payments or transfers. As with ordinary
payments or transfers, you can cancel or modify transactions individually. However,
because servers generate these transfers, they are different in the following respects:
Recurring transactions must be retrieved as part of a synchronization
request.
Recurring transactions are related to a model. A server can modify or
cancel transactions if the model is modified or canceled.
A recurring model can be modified or canceled. When a model is modified,
all transactions that it generates in the future will change as well. The client can
indicate whether transactions that have been generated, but have not been sent, should be
modified as well. The actual elements within a transaction that can be modified differ by
service. See the recurring sections within the Banking and Payments chapters for details.
A user can cancel a model immediately or at a future date. If a user
cancels the model immediately, the client cancels any transactions that it has not yet
sent. If the client schedules the cancel for a future date, the client will not cancel
pending transactions.
Canceling a recurring payment model requires the client to pass the
<RECSRVRTID> of the model. The client requests that pending payments also be
canceled. The server cancels the model immediately and notifies the client that both the
model and any scheduled payments were canceled. The request:
The server also cancels any payments that have been generated but not
executed. In the example shown above, the client would not learn of this immediately. To
receive notification that the model and all generated payments were canceled, the client
would need to include a synchronization request in the file. The following example
illustrates this alternate approach. The request file now includes a synchronization
request:
.
.
.
<RECPMTCANCRQ>
<RECSRVRTID>387687138
<CANPENDING>Y
</RECPMTCANCRQ>
<PMTSYNCRQ>
<TOKEN>12345
<BANKACCTFROM>
<BANKID>123432123
<ACCTID>516273
<ACCTTYPE>CHECKING
</BANKACCTFROM>
</PMTSYNCRQ>
.
.
.
The response file now contains two responses
(assuming one payment was pending),
one for the canceled model and one for the canceled payment.
.
.
.
<RECPMTCANCRS>
<RECSRVRTID>387687138
<CANPENDING>Y
</RECPMTCANCRS>
<PMTSYNCRS>
<TOKEN>3247989384
<BANKACCTFROM>
<BANKID>123432123
<ACCTID>516273
<ACCTTYPE>CHECKING
</BANKACCTFROM>
<PMTTRNRS>
<TRNUID>10103
<STATUS>
<CODE>0
<SEVERITY>INFO
</STATUS>
<PMTCANCRS>
<SRVRTID>1030155
</PMTCANCRS>
</PMTTRNRS>
</PMTSYNCRS>
.
.
.
stadyn_image10
================================================
FILE: libs/megaparse/tests/test_endpoints.py
================================================
import pytest
@pytest.mark.asyncio
async def test_parse_file_endpoint(test_client):
    """POST a sample PDF to ``/v1/file`` and check the success response."""
    form_fields = {
        "method": "unstructured",
        "strategy": "auto",
        "language": "en",
        "check_table": False,
    }
    # Simulate a request to the parse endpoint
    with open("./tests/pdf/sample_pdf.pdf", "rb") as pdf_handle:
        upload = {"file": ("test.pdf", pdf_handle)}
        response = await test_client.post("/v1/file", files=upload, data=form_fields)

    assert response.status_code == 200
    payload = response.json()
    assert payload["message"] == "File parsed successfully"
@pytest.mark.asyncio
async def test_parse_url_endpoint(test_client):
    """POST a URL to ``/v1/url`` and check the exact JSON body returned."""
    expected_body = {
        "message": "Website content parsed successfully",
        "result": "Fake website content",
    }
    response = await test_client.post("/v1/url?url=https://www.quivr.com")
    assert response.status_code == 200
    assert response.json() == expected_body
================================================
FILE: libs/megaparse/tests/test_import.py
================================================
import pytest
from megaparse import MegaParse
@pytest.mark.skip("slow test")
def test_load():
    """Load the dummy PDF with MegaParse and compare the extracted text."""
    parser = MegaParse()
    parsed = parser.load("./tests/data/dummy.pdf")
    print(parsed)
    assert parsed.strip("\n") == "Dummy PDF download"
================================================
FILE: libs/megaparse/tests/test_parsers.py
================================================
import os
import pytest
from megaparse.parser.doctr_parser import DoctrParser
from megaparse.parser.llama import LlamaParser
from megaparse.parser.megaparse_vision import MegaParseVision
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.schema.extensions import FileExtension
# Parser classes that test_sync_parser is parametrized over.
PARSER_LIST = [
    UnstructuredParser,
    # DoctrParser,
]
@pytest.mark.parametrize("parser", PARSER_LIST)
@pytest.mark.parametrize("extension", list(FileExtension))
def test_sync_parser(parser, extension):
    """Run each parser against one sample file per known file extension.

    Supported extensions must yield a non-empty result; unsupported ones
    must make ``convert`` raise ``ValueError``. The test fails outright when
    no sample file exists for the extension.
    """
    directory = "./tests/supported_docs"

    # Pick the first file (in os.walk order) whose name ends with the extension.
    file_path = None
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(extension.value):
                file_path = os.path.join(root, file)
                break
        if file_path is not None:
            break

    if file_path is None:
        pytest.fail(f"No file with extension {extension.value} found in {directory}")

    myparser = parser()
    if extension not in myparser.supported_extensions:
        with pytest.raises(ValueError):
            myparser.convert(file_path)
        return

    response = myparser.convert(file_path)
    assert response
    assert len(str(response)) > 0
================================================
FILE: libs/megaparse_sdk/CHANGELOG.md
================================================
# Changelog
## [0.1.12](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.11...megaparse-sdk-v0.1.12) (2025-02-13)
### Features
* add layout detection ([#228](https://github.com/QuivrHQ/MegaParse/issues/228)) ([77f7040](https://github.com/QuivrHQ/MegaParse/commit/77f7040c9c221a17effce089be7ec575cdd83468))
## [0.1.11](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.10...megaparse-sdk-v0.1.11) (2025-02-11)
### Features
* add_layout_detection ([#220](https://github.com/QuivrHQ/MegaParse/issues/220)) ([2d2d0b4](https://github.com/QuivrHQ/MegaParse/commit/2d2d0b42bba4c883db423568e932eda42edd60d7))
* Text detection in auto strategy ([#209](https://github.com/QuivrHQ/MegaParse/issues/209)) ([03c7ada](https://github.com/QuivrHQ/MegaParse/commit/03c7ada1dc245e13ef41ffd6fa3a8ed869269d37))
### Bug Fixes
* Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38))
* add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228))
* Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) ([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3))
## [0.1.10](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.9...megaparse-sdk-v0.1.10) (2024-12-16)
### Bug Fixes
* hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473))
## [0.1.9](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.8...megaparse-sdk-v0.1.9) (2024-12-13)
### Features
* small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) ([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609))
## [0.1.8](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.7...megaparse-sdk-v0.1.8) (2024-12-12)
### Features
* custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29))
* faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036))
## [0.1.7](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.6...megaparse-sdk-v0.1.7) (2024-11-25)
### Bug Fixes
* Update README.md ([#154](https://github.com/QuivrHQ/MegaParse/issues/154)) ([a103393](https://github.com/QuivrHQ/MegaParse/commit/a1033938184e20c24b0e54ee0db088b28075fd14))
## [0.1.6](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.5...megaparse-sdk-v0.1.6) (2024-11-25)
### Features
* megaparse sdk tests ([#148](https://github.com/QuivrHQ/MegaParse/issues/148)) ([e030285](https://github.com/QuivrHQ/MegaParse/commit/e0302853fc2c1526b8e912bf3ef85b970a5b89bc))
## [0.1.5](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.4...megaparse-sdk-v0.1.5) (2024-11-21)
### Features
* refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334))
* release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925))
================================================
FILE: libs/megaparse_sdk/README.md
================================================
## MegaParse SDK
Welcome to the MegaParse SDK! This SDK allows you to easily interact with the MegaParse API to upload URLs and files for processing.
### Installation
To install the MegaParse SDK, use pip:
```sh
pip install megaparse-sdk
```
### Usage
Here is an example of how to use the MegaParse SDK:
#### Uploading URLs
```python
import asyncio
import os
from megaparse_sdk import MegaParseSDK
async def upload_url():
api_key = str(os.getenv("MEGAPARSE_API_KEY"))
megaparse = MegaParseSDK(api_key)
url = "https://www.quivr.com"
# Upload a URL
url_response = await megaparse.url.upload(url)
print(f"\n----- URL Response : {url} -----\n")
print(url_response)
await megaparse.close()
if __name__ == "__main__":
asyncio.run(upload_url())
```
#### Uploading Files
```python
import asyncio
import os
from megaparse_sdk import MegaParseSDK
async def upload_file():
api_key = str(os.getenv("MEGAPARSE_API_KEY"))
megaparse = MegaParseSDK(api_key)
file_path = "your/file/path.pdf"
# Upload a file
response = await megaparse.file.upload(
file_path=file_path,
method="unstructured", # unstructured, llama_parser, megaparse_vision
strategy="auto",
)
print(f"\n----- File Response : {file_path} -----\n")
print(response)
await megaparse.close()
if __name__ == "__main__":
asyncio.run(upload_file())
```
### Features
- **Upload URLs**: Easily upload URLs for processing.
- **Upload Files**: Upload files with different processing methods and strategies.
### Getting Started
1. **Set up your API key**: Make sure to set the `MEGAPARSE_API_KEY` environment variable with your MegaParse API key.
2. **Run the example**: Use the provided example to see how to upload URLs and files.
For more details, refer to the [usage example](examples/usage_example.py).
We hope you find the MegaParse SDK useful for your projects!
Enjoy, _Quivr Team_ !
================================================
FILE: libs/megaparse_sdk/__init__.py
================================================
================================================
FILE: libs/megaparse_sdk/examples/usage_example.py
================================================
import asyncio
import os
from megaparse.sdk.megaparse_sdk import MegaParseSDK
async def main():
    """Upload one local file through the MegaParse SDK and print the response.

    The API key is read from the ``MEGAPARSE_API_KEY`` environment variable.
    The SDK client is always closed, even if the upload fails.
    """
    # Pass the raw environment value: wrapping it in str() would turn a
    # missing key into the literal string "None" and send that as the API key.
    api_key = os.getenv("MEGAPARSE_API_KEY")
    megaparse = MegaParseSDK(api_key)

    # url = "https://www.quivr.com"

    # # Upload a URL
    # url_response = await megaparse.url.upload(url)
    # print(f"\n----- URL Response : {url} -----\n")
    # print(url_response)

    # file_path = "megaparse/sdk/pdf/MegaFake_report.pdf"
    file_path = (
        "megaparse/sdk/examples/only_pdfs/4 The Language of Medicine 2024.07.21.pdf"
    )

    try:
        # Upload a file
        response = await megaparse.file.upload(
            file_path=file_path,
            method="unstructured",  # type: ignore  # unstructured, llama_parser, megaparse_vision
            strategy="auto",  # type: ignore # fast, auto, hi_res
        )
        print(f"\n----- File Response : {file_path} -----\n")
        print(response)
    finally:
        # Release the HTTP session whether or not the upload succeeded.
        await megaparse.close()


if __name__ == "__main__":
    asyncio.run(main())
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/__init__.py
================================================
from .client import MegaParseClient
from .endpoints.file_upload import FileUpload
from .endpoints.url_upload import URLUpload
class MegaParseSDK:
    """Entry point that bundles one HTTP client with the upload endpoints.

    Attributes:
        client: The shared ``MegaParseClient``.
        file: ``FileUpload`` endpoint bound to ``client``.
        url: ``URLUpload`` endpoint bound to ``client``.
    """

    def __init__(self, api_key: str | None = None, base_url: str | None = None):
        """Create the client and bind both endpoints to it.

        Args:
            api_key: Forwarded to ``MegaParseClient``.
            base_url: Forwarded to ``MegaParseClient``.
        """
        shared_client = MegaParseClient(api_key, base_url)
        self.client = shared_client
        self.file = FileUpload(shared_client)
        self.url = URLUpload(shared_client)

    async def close(self):
        """Close the underlying client."""
        await self.client.close()
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/client.py
================================================
import asyncio
import enum
import logging
import os
from io import BytesIO
from pathlib import Path
from types import TracebackType
from typing import Any, Self
import httpx
import nats
from nats.errors import NoRespondersError, TimeoutError
from megaparse_sdk.config import ClientNATSConfig, MegaParseSDKConfig
from megaparse_sdk.schema.document import Document
from megaparse_sdk.schema.mp_exceptions import (
DownloadError,
InternalServiceError,
MemoryLimitExceeded,
ModelNotSupported,
ParsingException,
)
from megaparse_sdk.schema.mp_inputs import (
FileInput,
MPInput,
ParseFileConfig,
ParseFileInput,
ParseUrlInput,
)
from megaparse_sdk.schema.mp_outputs import (
MPErrorType,
MPOutput,
MPOutputType,
)
from megaparse_sdk.utils.load_ssl import load_ssl_cxt
logger = logging.getLogger("megparse_sdk")
class MegaParseClient:
    """Async HTTP client for the MegaParse API with simple retry handling.

    Defaults (URL, API key, timeout, retry count) come from
    ``MegaParseSDKConfig``; explicit constructor arguments take precedence.
    """

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str | None = None,
    ):
        """Build the underlying ``httpx.AsyncClient``.

        Args:
            api_key: API key sent as the ``x-api-key`` header. Falls back to
                the configured key; no header is sent when neither is set.
            base_url: Base URL prepended to every endpoint. Falls back to the
                configured URL.
        """
        config = MegaParseSDKConfig()
        self.base_url = base_url or config.url
        self.api_key = api_key or config.api_key
        self.max_retries = config.max_retries
        # Only attach the API-key header when a key is actually available.
        headers = {"x-api-key": self.api_key} if self.api_key else None
        self.session = httpx.AsyncClient(headers=headers, timeout=config.timeout)

    async def request(self, method: str, endpoint: str, **kwargs: Any) -> Any:
        """Send a request and return the decoded JSON body.

        The request is attempted up to ``max_retries`` times. HTTP error
        statuses and transport errors trigger a retry after an exponential
        backoff of ``2**attempt`` seconds.

        Args:
            method: HTTP method name.
            endpoint: Path (and optional query string) appended to ``base_url``.
            **kwargs: Passed through to ``httpx.AsyncClient.request``.

        Returns:
            The result of ``response.json()``.

        Raises:
            RuntimeError: When every attempt failed. The last httpx error is
                attached as ``__cause__``.
        """
        url = f"{self.base_url}{endpoint}"
        last_error: Exception | None = None
        for attempt in range(self.max_retries):
            try:
                response = await self.session.request(method, url, **kwargs)
                response.raise_for_status()
                return response.json()
            except (httpx.HTTPStatusError, httpx.RequestError) as exc:
                # Keep the failure so the final error can report its cause.
                last_error = exc
                logger.warning(
                    "Request to %s failed (attempt %d/%d): %s",
                    url,
                    attempt + 1,
                    self.max_retries,
                    exc,
                )
                if attempt < self.max_retries - 1:
                    await asyncio.sleep(2**attempt)  # Exponential backoff

        raise RuntimeError(f"Can't send request to the server: {url}") from last_error

    async def close(self):
        """Close the underlying HTTP session."""
        await self.session.aclose()
class ClientState(enum.Enum):
    """Lifecycle states of a client instance.

    UNOPENED: First state of the client.
    OPENED: Client has either sent a request, or is within a `with` block.
    CLOSED: Client has either exited the `with` block, or `close()` called.
    """

    UNOPENED = 1
    OPENED = 2
    CLOSED = 3
class MegaParseNATSClient:
def __init__(self, config: ClientNATSConfig):
self.nc_config = config
self.max_retries = self.nc_config.max_retries
self.backoff = self.nc_config.backoff
if self.nc_config.ssl_config:
self.ssl_ctx = load_ssl_cxt(self.nc_config.ssl_config)
else:
self.ssl_ctx = None
# Client connection
self._state = ClientState.UNOPENED
self._nc = None
async def _get_nc(self):
if self._nc is None:
self._nc = await nats.connect(
self.nc_config.endpoint,
tls=self.ssl_ctx,
connect_timeout=self.nc_config.connect_timeout,
reconnect_time_wait=self.nc_config.reconnect_time_wait,
max_reconnect_attempts=self.nc_config.max_reconnect_attempts,
)
return self._nc
return self._nc
async def __aenter__(self: Self) -> Self:
if self._state != ClientState.UNOPENED:
msg = {
ClientState.OPENED: "Cannot open a client instance more than once.",
ClientState.CLOSED: (
"Cannot reopen a client instance, client was closed."
),
}[self._state]
raise RuntimeError(msg)
self._state = ClientState.OPENED
await self._get_nc()
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None = None,
exc_value: BaseException | None = None,
traceback: TracebackType | None = None,
) -> None:
self._state = ClientState.CLOSED
await self.aclose()
async def parse_url(self, url: str):
url_inp = ParseUrlInput(url=url)
return await self._send_req(MPInput(input=url_inp))
async def parse_file(
self, file: Path | BytesIO, file_name: str | None = None
) -> str | Document:
if isinstance(file, Path):
with open(file, "rb") as f:
data = f.read()
file_name = os.path.basename(file)
else:
file.seek(0)
data = file.read()
if file_name is None:
raise ValueError("please provide file_name if passing ByteIO stream")
file_input = ParseFileInput(
file_input=FileInput(file_name=file_name, file_size=len(data), data=data),
parse_config=ParseFileConfig(),
)
inp = MPInput(input=file_input)
return await self._send_req(inp)
async def _send_req(self, inp: MPInput) -> str | Document:
logger.debug(f"Sending {inp} to megaparse service.")
for attempt in range(self.max_retries):
try:
return await self._send_req_inner(inp)
except (TimeoutError, NoRespondersError) as e:
logger.error(f"Sending req error: {e}. Retrying for {attempt} time")
if attempt < self.max_retries - 1:
logger.debug(f"Backoff for {2**self.backoff}s")
await asyncio.sleep(2**self.backoff)
raise ParsingException
async def _send_req_inner(self, inp: MPInput):
nc = await self._get_nc()
raw_response = await nc.request(
self.nc_config.subject,
inp.model_dump_json().encode("utf-8"),
timeout=self.nc_config.timeout,
)
response = MPOutput.model_validate_json(raw_response.data.decode("utf-8"))
return self._handle_mp_output(response)
def _handle_mp_output(self, response: MPOutput) -> str | Document:
if response.output_type == MPOutputType.PARSE_OK:
assert response.result, "Parsing OK but response is None"
return response.result
elif response.output_type == MPOutputType.PARSE_ERR:
assert response.err, "Parsing OK but response is None"
match response.err.mp_err_code:
case MPErrorType.MEMORY_LIMIT:
raise MemoryLimitExceeded
case MPErrorType.INTERNAL_SERVER_ERROR:
raise InternalServiceError
case MPErrorType.MODEL_NOT_SUPPORTED:
raise ModelNotSupported
case MPErrorType.DOWNLOAD_ERROR:
raise DownloadError
case MPErrorType.PARSING_ERROR:
raise ParsingException
raise ValueError(f"unknown service response type: {response}")
async def aclose(self):
nc = await self._get_nc()
await nc.close()
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/config.py
================================================
from pydantic import BaseModel, FilePath
from pydantic_settings import BaseSettings, SettingsConfigDict
class MegaParseSDKConfig(BaseSettings):
    """Settings for the HTTP-based MegaParse SDK client.

    Declared as pydantic settings with the environment prefix
    ``MEGAPARSE_SDK_``.
    """

    model_config = SettingsConfigDict(env_prefix="MEGAPARSE_SDK_")

    # API key; optional.
    api_key: str | None = None
    # Base URL of the MegaParse service.
    url: str = "https://megaparse.tooling.quivr.app"
    # Timeout handed to the HTTP client.
    timeout: int = 600
    # Maximum number of attempts per request.
    max_retries: int = 3
class SSLConfig(BaseModel):
    """File locations of the SSL material; the CA certificate is optional."""

    ssl_key_file: FilePath
    ssl_cert_file: FilePath
    ca_cert_file: FilePath | None = None
class ClientNATSConfig(BaseSettings):
    """Settings for the NATS-based MegaParse client.

    Declared as pydantic settings with the environment prefix
    ``MEGAPARSE_NATS_``, the env files ``.env.local`` and ``.env``, and ``__``
    as the nested delimiter; unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="MEGAPARSE_NATS_",
        env_file=(".env.local", ".env"),
        env_nested_delimiter="__",
        extra="ignore",
    )

    # Subject that parse requests are published to.
    subject: str = "parsing"
    # Server endpoint used for the connection.
    endpoint: str = "https://tests@nats.tooling.quivr.app:4222"
    # Per-request timeout.
    timeout: float = 20
    # Maximum number of attempts per request.
    max_retries: int = 5
    # Retry sleep exponent: the client waits 2**backoff between attempts.
    backoff: float = 3
    # Connection / reconnection tuning forwarded to the NATS connect call.
    connect_timeout: int = 5
    reconnect_time_wait: int = 1
    max_reconnect_attempts: int = 20
    # Optional SSL material; no SSL context is built when unset.
    ssl_config: SSLConfig | None = None
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/endpoints/__init__.py
================================================
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/endpoints/file_upload.py
================================================
from typing import Optional
from httpx import Response
from pydantic import BaseModel
from megaparse_sdk.client import MegaParseClient
from megaparse_sdk.schema.languages import Language
from megaparse_sdk.schema.parser_config import ParserType, StrategyEnum
class UploadFileConfig(BaseModel):
    """Form fields sent alongside a file upload to ``/v1/file``."""

    method: ParserType
    strategy: StrategyEnum
    check_table: bool
    language: Language
    parsing_instruction: str | None = None
    model_name: str = "gpt-4o"
class FileUpload:
    """File-upload endpoint wrapper around a ``MegaParseClient``."""

    def __init__(self, client: MegaParseClient):
        """Remember the client used to send requests."""
        self.client = client

    async def upload(
        self,
        file_path: str,
        method: ParserType = ParserType.UNSTRUCTURED,
        strategy: StrategyEnum = StrategyEnum.AUTO,
        check_table: bool = False,
        language: Language = Language.ENGLISH,
        parsing_instruction: Optional[str] = None,
        model_name: str = "gpt-4o",
    ) -> Response:
        """Upload the file at ``file_path`` to ``/v1/file`` for parsing.

        The options are validated through ``UploadFileConfig`` and sent as
        form data; the file is sent as the multipart field ``file`` with
        ``file_path`` as its name.

        Returns:
            Whatever ``client.request`` returns for the POST.
        """
        options = UploadFileConfig(
            method=method,
            strategy=strategy,
            check_table=check_table,
            language=language,
            parsing_instruction=parsing_instruction,
            model_name=model_name,
        )
        with open(file_path, "rb") as handle:
            multipart = {"file": (file_path, handle)}
            # The request must complete while the file is still open.
            return await self.client.request(
                "POST",
                "/v1/file",
                files=multipart,
                data=options.model_dump(mode="json"),
            )
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/endpoints/url_upload.py
================================================
from httpx import Response
from megaparse_sdk.client import MegaParseClient
class URLUpload:
    """URL-upload endpoint wrapper around a ``MegaParseClient``."""

    def __init__(self, client: MegaParseClient):
        """Remember the client used to send requests."""
        self.client = client

    async def upload(self, url: str, max_retries: int = 3) -> Response:
        """Ask the service to parse the content at ``url``.

        Args:
            url: Address of the page to parse. It is percent-encoded before
                being placed in the query string, so characters such as
                ``?``, ``&`` and ``#`` inside it no longer break the request.
            max_retries: Currently unused by this method; kept so existing
                callers keep working.

        Returns:
            Whatever ``client.request`` returns for the POST.
        """
        from urllib.parse import quote

        # Keep ":" and "/" readable so plain URLs produce the same endpoint
        # string as before; everything else that is reserved gets escaped.
        encoded_url = quote(url, safe=":/")
        endpoint = f"/v1/url?url={encoded_url}"
        headers = {"accept": "application/json"}
        response = await self.client.request("POST", endpoint, headers=headers, data="")
        return response
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/schema/__init__.py
================================================
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/schema/document.py
================================================
import uuid
from enum import Enum
from typing import Any, Dict, List, Literal, NamedTuple, Optional, Self, Tuple
import numpy as np
from PIL import Image, ImageDraw
from pydantic import BaseModel, Field, field_validator
class Point2D(NamedTuple):
x: float
y: float
class BlockType(str, Enum):
TEXT = "text"
class BBOX(NamedTuple):
top_left: Point2D
bottom_right: Point2D
def to_numpy(self):
return np.array(
[self.top_left.x, self.top_left.y, self.bottom_right.x, self.bottom_right.y]
)
def iou(self, other: Self):
x1 = max(self.top_left.x, other.top_left.x)
y1 = max(self.top_left.y, other.top_left.y)
x2 = min(self.bottom_right.x, other.bottom_right.x)
y2 = min(self.bottom_right.y, other.bottom_right.y)
intersection = max(0, x2 - x1) * max(0, y2 - y1)
area_self = (self.bottom_right.x - self.top_left.x) * (
self.bottom_right.y - self.top_left.y
)
area_other = (other.bottom_right.x - other.top_left.x) * (
other.bottom_right.y - other.top_left.y
)
union = area_self + area_other - intersection
return intersection / union
class BlockLayout(BaseModel):
    """A detected block: its bounding box, objectness score and block type."""

    bbox: BBOX
    objectness_score: float
    block_type: BlockType
class TextDetection:
    """Detected blocks for a single page, with page-level metadata.

    Attributes:
        bboxes: Detected blocks on the page.
        page_index: Index of the page.
        dimensions: Page dimensions; unpacked as two values by ``render``.
        orientation: Page orientation value, repeated per block by
            ``get_orientations``.
        origin_page_shape: Original page shape, repeated per block by
            ``get_origin_page_shapes``.
    """

    __slots__ = [
        "bboxes",
        "page_index",
        "dimensions",
        "orientation",
        "origin_page_shape",
    ]

    def __init__(
        self,
        bboxes: List[BlockLayout],
        page_index: int,
        dimensions: Tuple[int, ...],
        orientation: Tuple[int, float] | Literal[0],
        origin_page_shape,
    ):
        """Store the detection results for one page."""
        self.bboxes = bboxes
        self.page_index = page_index
        self.dimensions = dimensions
        self.orientation = orientation
        self.origin_page_shape = origin_page_shape

    def __repr__(self) -> str:
        """Return a debug representation named after this class."""
        return f"TextDetection(bboxes={self.bboxes}, page_index={self.page_index}, dimensions={self.dimensions}, orientation={self.orientation})"

    def render(
        self, page_array: np.ndarray, output_path: Optional[str] = "page_layout.png"
    ):
        """
        Render the page layout with bounding boxes on the original page image.

        Args:
            page_array (np.ndarray): The original page image as a NumPy array.
            output_path (Optional[str]): The path to save the rendered image.
                When falsy, the image is not saved.

        Returns:
            The PIL image with the bounding boxes drawn on it.
        """
        # Convert the NumPy array to a PIL image
        image = Image.fromarray(page_array)
        draw = ImageDraw.Draw(image)

        # NOTE(review): x is scaled by `height` and y by `width` below; this is
        # only right if `dimensions` is ordered (height, width) — confirm.
        width, height = self.dimensions

        # Draw each bounding box
        for block in self.bboxes:
            bbox = block.bbox
            top_left = (bbox[0][0] * height, bbox[0][1] * width)
            bottom_right = (bbox[1][0] * height, bbox[1][1] * width)
            draw.rectangle([top_left, bottom_right], outline="red", width=2)

        if output_path:
            # Save the image
            image.save(output_path)
            print(f"Page layout saved to {output_path}")
        return image

    def get_loc_preds(self) -> np.ndarray:
        """
        Get the location predictions of the bounding boxes.

        Returns:
            np.ndarray: One ``[x0, y0, x1, y1]`` row per block.
        """
        loc_preds = np.array([block.bbox.to_numpy() for block in self.bboxes])
        return loc_preds

    def get_objectness_scores(self) -> np.ndarray:
        """
        Get the objectness scores of the bounding boxes.

        Returns:
            np.ndarray: One objectness score per block.
        """
        objectness_scores = np.array([block.objectness_score for block in self.bboxes])
        return objectness_scores

    def get_origin_page_shapes(self) -> np.ndarray:
        """
        Get the original page shapes.

        Returns:
            np.ndarray: The original page shape repeated once per block.
        """
        origin_page_shapes = np.array([self.origin_page_shape for _ in self.bboxes])
        return origin_page_shapes

    def get_orientations(self) -> np.ndarray:
        """
        Get the orientations of the bounding boxes.

        Returns:
            np.ndarray: The page orientation repeated once per block.
        """
        orientations = np.array([self.orientation for _ in self.bboxes])
        return orientations
class Block(BaseModel):
    """
    Base model for document blocks: id, metadata, optional bounding box and
    optional page range.
    """

    block_id: Optional[uuid.UUID] = Field(default_factory=uuid.uuid4)
    metadata: Dict[str, Any]  # FIXME: TBD @Amine
    # (x0, y0), (x1, y1): coordinates are given as relative positions to the
    # page they are in.
    bbox: Optional[BBOX] = None
    # (start_page, end_page)
    page_range: Optional[Tuple[int, int]] = Field(default=None)

    @field_validator("page_range")
    def validate_range(cls, value):
        """
        Reject a page range whose start page comes after its end page.

        A missing range (None) is accepted as is.
        """
        if value is not None:
            first_page, last_page = value
            if first_page > last_page:
                raise ValueError(
                    "The first value of the page range must be less than the second value"
                )
        return value
class TextBlock(Block):
    """
    A class to represent a text block.

    Its string form is the raw `text`.
    """

    text: str

    def __str__(self):
        return self.text
class UndefinedBlock(TextBlock):
    """
    A class to represent a text block of undefined type.

    Adds nothing to TextBlock; it only provides a distinct type.
    """

    pass
class TitleBlock(TextBlock):
    """
    A title block, rendered as a level-1 Markdown heading.
    """

    def __str__(self):
        return "# " + self.text
class SubTitleBlock(TextBlock):
    """
    A subtitle block, rendered as a Markdown heading whose level follows
    `depth` (depth + 1, capped at 6).
    """

    depth: int = 0

    def __str__(self):
        # Markdown headings stop at level 6.
        hashes = "#" * min(self.depth + 1, 6)
        return hashes + " " + self.text
class CaptionBlock(TextBlock):
    """
    A class to represent a caption block.

    Adds nothing to TextBlock; it only provides a distinct type.
    """

    pass
class ImageBlock(Block):
    """
    An image block with optional text and a caption (default "unknown").

    Its string form is a placeholder that shows the caption.
    """

    text: Optional[str] = None
    caption: Optional[str] = "unknown"

    def __str__(self) -> str:
        return "[Image: {}]".format(self.caption)
class TableBlock(ImageBlock):
    """
    A table block: rendered as its text when available, otherwise as a
    placeholder that shows the caption.
    """

    def __str__(self):
        return self.text or "[Table : {}]".format(self.caption)
class ListElementBlock(TextBlock):
    """
    A class to represent a list element.

    `depth` is the nesting level; ListBlock indents each element by two
    spaces per level when rendering.
    """

    depth: int = 0
class ListBlock(Block):
    """
    A list block, rendered as a Markdown bullet list.
    """

    list_elements: List[ListElementBlock]

    # TODO: add a pydantic function to compute the attribute
    def __str__(self):
        rendered = []
        for item in self.list_elements:
            # Two spaces of indentation per nesting level.
            indent = " " * (2 * item.depth)
            rendered.append(f"{indent}* {item.text}")
        return "\n".join(rendered)
class HeaderBlock(TextBlock):
    """
    A header block, rendered between two rules of '=' as wide as the text.
    """

    def __str__(self):
        rule = "=" * len(self.text)
        return "\n\n".join((rule, self.text, rule))
class FooterBlock(TextBlock):
    """
    A footer block, rendered between two rules of '=' as wide as the text.
    """

    def __str__(self):
        rule = "=" * len(self.text)
        return "\n\n".join((rule, self.text, rule))
class SectionBlock(Block):
    """
    A section block: a title, a depth and the blocks it contains.

    Its string form joins the string forms of its content blocks; the title
    is not included.
    """

    title: str
    depth: int
    content: List[Block]

    def __str__(self):
        return "\n".join(map(str, self.content))
class TOCItem(BaseModel):
    """
    One table-of-contents entry: a title, a nesting depth and a page range.
    """

    title: str
    depth: int
    page_range: Tuple[int, int] = Field(...)  # (start_page, end_page)

    @field_validator("page_range")
    def validate_range(cls, value):
        """
        Reject a page range whose start page comes after its end page.

        A single-page range (start == end) is valid: `__str__` renders it as
        "page N", and `Block.validate_range` accepts it as well. The previous
        `>=` check rejected it, which made that rendering branch unreachable.
        """
        start, end = value
        if start > end:
            raise ValueError(
                "The first value of the page range must be less than the second value"
            )
        return value

    def __str__(self):
        start_page, end_page = self.page_range
        page_info = (
            f"page {start_page}"
            if start_page == end_page
            else f"pages {start_page}-{end_page}"
        )
        # Two spaces of indentation per nesting level.
        return f"{' ' * (2 * self.depth)}* {self.title} ({page_info})"
class TOC(BaseModel):
    """
    A table of contents: an ordered list of TOCItem entries.
    """

    content: List[TOCItem]

    @property
    def text(self) -> str:
        """One rendered line per entry, joined with newlines."""
        return "\n".join(map(str, self.content))

    def __str__(self):
        return self.text
class Document(BaseModel):
    """
    A class to represent a document
    """

    file_name: Optional[str] = None
    table_of_contents: Optional[TOC] = None
    content: List[Block]
    detection_origin: str
    metadata: Dict[str, Any]

    def __str__(self) -> str:
        lines = []
        # If there's a table of contents, include it
        if self.table_of_contents:
            lines.append("Table of Contents:")
            # Use the TOC's own string-building property
            lines.append(self.table_of_contents.text)
        # Print each block's text representation
        lines.extend(str(block) + "\n" for block in self.content)
        return "\n".join(lines)

    def clean(self):
        """
        Clean the document content in place by:
            - Merging consecutive ListElementBlock items into one ListBlock
            - Merging a CaptionBlock adjacent to an ImageBlock into the
              image's `caption`

        Depth assignment, section creation and TOC creation are not
        implemented here.
        """
        i = 0
        list_elements_stack = []
        # Walk every block, including the last one: stopping one short used
        # to drop list elements buffered near the end of the document.
        while i < len(self.content):
            current = self.content[i]
            if isinstance(current, ListElementBlock):
                # Buffer the element; it is re-inserted as part of a ListBlock.
                list_elements_stack.append(current)
                self.content.pop(i)
                continue
            if list_elements_stack:
                # The run of list elements ended: put it back as one ListBlock
                # and revisit `current` on the next iteration.
                self.content.insert(
                    i, ListBlock(list_elements=list_elements_stack, metadata={})
                )
                list_elements_stack = []
                i += 1
                continue
            following = self.content[i + 1] if i + 1 < len(self.content) else None
            if isinstance(current, ImageBlock) and isinstance(
                following, CaptionBlock
            ):
                current.caption = str(following)
                self.content.pop(i + 1)
            elif isinstance(current, CaptionBlock) and isinstance(
                following, ImageBlock
            ):
                following.caption = str(current)
                self.content.pop(i)
            i += 1
        if list_elements_stack:
            # The document ended with list elements: flush them instead of
            # silently discarding them.
            self.content.append(
                ListBlock(list_elements=list_elements_stack, metadata={})
            )
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/schema/extensions.py
================================================
from enum import Enum
class FileExtension(str, Enum):
    """
    Supported file extension enumeration.

    Each member's value is the extension (with its leading dot); the matching
    MIME type is available through the `mimetype` property.
    """

    _mimetype: str

    def __new__(cls, value: str, mimetype: str):
        # Members are declared as (extension, mimetype) tuples; only the
        # extension becomes the enum value.
        obj = str.__new__(cls, value)
        obj._value_ = value
        obj._mimetype = mimetype
        return obj

    PDF = (".pdf", "application/pdf")
    DOCX = (
        ".docx",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
    TXT = (".txt", "text/plain")
    # `OTF` is a misnomer for the ".odt" extension; it is kept so existing
    # callers keep working, and `ODT` is provided as an alias of it.
    OTF = (".odt", "application/vnd.oasis.opendocument.text")
    ODT = (".odt", "application/vnd.oasis.opendocument.text")
    # The registered EPUB media type is "application/epub+zip".
    EPUB = (".epub", "application/epub+zip")
    HTML = (".html", "text/html")
    XML = (".xml", "application/xml")
    CSV = (".csv", "text/csv")
    XLSX = (
        ".xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    XLS = (".xls", "application/vnd.ms-excel")
    PPTX = (
        ".pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )
    MD = (".md", "text/markdown")
    MARKDOWN = (".markdown", "text/markdown")

    @property
    def mimetype(self) -> str:
        """MIME type associated with this extension."""
        return self._mimetype
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/schema/languages.py
================================================
from enum import Enum
class Language(str, Enum):
    """
    String-valued enumeration of language codes.
    """

    # NOTE(review): several codes are not plain ISO 639 codes (e.g. "ch_sim",
    # "rs_cyrillic"); presumably they follow the identifiers of the
    # underlying OCR/parsing engine — confirm which one.
    BAZA = "abq"
    ADYGHE = "ady"
    AFRIKAANS = "af"
    ANGIKA = "ang"
    ARABIC = "ar"
    ASSAMESE = "as"
    AVAR = "ava"
    AZERBAIJANI = "az"
    BELARUSIAN = "be"
    BULGARIAN = "bg"
    BIHARI = "bh"
    BHOJPURI = "bho"
    BENGALI = "bn"
    BOSNIAN = "bs"
    SIMPLIFIED_CHINESE = "ch_sim"
    TRADITIONAL_CHINESE = "ch_tra"
    CHECHEN = "che"
    CZECH = "cs"
    WELSH = "cy"
    DANISH = "da"
    DARGWA = "dar"
    GERMAN = "de"
    ENGLISH = "en"
    SPANISH = "es"
    ESTONIAN = "et"
    PERSIAN_FARSI = "fa"
    FRENCH = "fr"
    IRISH = "ga"
    GOAN_KONKANI = "gom"
    HINDI = "hi"
    CROATIAN = "hr"
    HUNGARIAN = "hu"
    INDONESIAN = "id"
    INGUSH = "inh"
    ICELANDIC = "is"
    ITALIAN = "it"
    JAPANESE = "ja"
    KABARDIAN = "kbd"
    KANNADA = "kn"
    KOREAN = "ko"
    KURDISH = "ku"
    LATIN = "la"
    LAK = "lbe"
    LEZGHIAN = "lez"
    LITHUANIAN = "lt"
    LATVIAN = "lv"
    MAGAHI = "mah"
    MAITHILI = "mai"
    MAORI = "mi"
    MONGOLIAN = "mn"
    MARATHI = "mr"
    MALAY = "ms"
    MALTESE = "mt"
    NEPALI = "ne"
    NEWARI = "new"
    DUTCH = "nl"
    NORWEGIAN = "no"
    OCCITAN = "oc"
    PALI = "pi"
    POLISH = "pl"
    PORTUGUESE = "pt"
    ROMANIAN = "ro"
    RUSSIAN = "ru"
    SERBIAN_CYRILLIC = "rs_cyrillic"
    SERBIAN_LATIN = "rs_latin"
    NAGPURI = "sck"
    SLOVAK = "sk"
    SLOVENIAN = "sl"
    ALBANIAN = "sq"
    SWEDISH = "sv"
    SWAHILI = "sw"
    TAMIL = "ta"
    TABASSARAN = "tab"
    TELUGU = "te"
    THAI = "th"
    TAJIK = "tjk"
    TAGALOG = "tl"
    TURKISH = "tr"
    UYGHUR = "ug"
    UKRAINIAN = "uk"
    URDU = "ur"
    UZBEK = "uz"
    VIETNAMESE = "vi"
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/schema/mp_exceptions.py
================================================
class ModelNotSupported(Exception):
    """Error signalling that the requested model is not supported."""

    def __init__(
        self,
        message: str = "The requested model is not supported yet.",
    ):
        Exception.__init__(self, message)
class MemoryLimitExceeded(Exception):
    """Error signalling that the service is under high memory pressure."""

    def __init__(self, message="The service is under high memory pressure"):
        Exception.__init__(self, message)
class InternalServiceError(Exception):
    """Error signalling an internal failure of the service."""

    # The default message previously misspelled "occurred" as "occured".
    def __init__(self, message="Internal service error occurred"):
        super().__init__(message)
class DownloadError(Exception):
    """Error signalling that a file could not be downloaded."""

    def __init__(self, message="Failed to download the file"):
        Exception.__init__(self, message)
class ParsingException(Exception):
    """Error signalling a failure that occurred during parsing."""

    def __init__(self, message="An error occurred during parsing"):
        Exception.__init__(self, message)
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/schema/mp_inputs.py
================================================
import base64
from enum import Enum
from typing import Literal, Union
from pydantic import BaseModel, Field, field_serializer, field_validator
from .parser_config import ParseFileConfig
class FileInput(BaseModel):
    """
    A file payload: name, size and raw bytes.

    `data` is accepted either as bytes or as a Base64 string, and is
    serialized back to a Base64 string.
    """

    file_name: str
    file_size: int
    data: bytes

    @field_validator("data", mode="before")
    def decode_data(cls, value):
        """Decode a Base64 string into bytes; pass any other value through."""
        if not isinstance(value, str):
            return value
        try:
            decoded = base64.b64decode(value)
        except Exception:
            raise ValueError("Invalid Base64 encoding for the 'data' field.")
        return decoded

    # TODO: this is slow !!! Move to reading bytes directly from bucket storage
    # append bytes with CRC32
    @field_serializer("data", return_type=str)
    def serialize_data(self, data: bytes, _info):
        """Encode the raw bytes as a UTF-8 Base64 string."""
        encoded = base64.b64encode(data)
        return encoded.decode("utf-8")
class MPParseType(str, Enum):
    """Kind of parse request; used as the discriminator of MPInput.input."""

    PARSE_FILE = "parse_file"
    PARSE_URL = "parse_url"
class ParseFileInput(BaseModel):
    """Request to parse a file: the file payload plus its parser configuration."""

    # Fixed to PARSE_FILE so the MPInput union can discriminate on it.
    mp_parse_type: Literal[MPParseType.PARSE_FILE] = MPParseType.PARSE_FILE
    file_input: FileInput
    parse_config: ParseFileConfig
class ParseUrlInput(BaseModel):
    """Request to parse the content found at a URL."""

    # Fixed to PARSE_URL so the MPInput union can discriminate on it.
    mp_parse_type: Literal[MPParseType.PARSE_URL] = MPParseType.PARSE_URL
    url: str
class MPInput(BaseModel):
    """
    Envelope for a parse request.

    `input` is either a ParseFileInput or a ParseUrlInput, selected by the
    `mp_parse_type` discriminator field.
    """

    input: Union[ParseFileInput, ParseUrlInput] = Field(
        ..., discriminator="mp_parse_type"
    )
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/schema/mp_outputs.py
================================================
from enum import Enum, auto
from typing import Dict
from pydantic import BaseModel, Field
from megaparse_sdk.schema.document import Document
class MPErrorType(Enum):
    """Error categories carried by ParseError; values are assigned by auto()."""

    MEMORY_LIMIT = auto()
    INTERNAL_SERVER_ERROR = auto()
    MODEL_NOT_SUPPORTED = auto()
    DOWNLOAD_ERROR = auto()
    PARSING_ERROR = auto()
class ParseError(BaseModel):
    """Error details of a parse response: an error category and a message."""

    mp_err_code: MPErrorType
    message: str
class MPOutputType(str, Enum):
    """Outcome of a parse request: success or error."""

    PARSE_OK = "parse_file_ok"
    PARSE_ERR = "parse_file_err"
class MPOutput(BaseModel):
    """
    Envelope for a parse response.

    `result` has no default and must always be supplied (it may be None);
    `err` defaults to None and `metadata` to an empty dict.
    """

    output_type: MPOutputType
    result: str | Document | None
    err: ParseError | None = None
    metadata: Dict[str, str] = Field(default_factory=dict)
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/schema/parser_config.py
================================================
from enum import Enum
from typing import Optional
from pydantic import BaseModel
from .languages import Language
from .supported_models import SupportedModel
class ParserType(str, Enum):
    """Parser type enumeration."""

    UNSTRUCTURED = "unstructured"
    LLAMA_PARSER = "llama_parser"
    MEGAPARSE_VISION = "megaparse_vision"
class StrategyEnum(str, Enum):
    """Method to use for the conversion"""

    FAST = "fast"
    AUTO = "auto"
    HI_RES = "hi_res"
class ParseFileConfig(BaseModel):
    """
    Options for parsing a file.

    Every field has a default: GPT-4 model, the "unstructured" parser, the
    "auto" strategy, no table check, English, and no parsing instruction.
    """

    llm_model_name: SupportedModel = SupportedModel.GPT_4
    method: ParserType = ParserType.UNSTRUCTURED
    strategy: StrategyEnum = StrategyEnum.AUTO
    check_table: bool = False
    language: Language = Language.ENGLISH
    parsing_instruction: Optional[str] = None
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/schema/supported_models.py
================================================
from enum import Enum
class SupportedModel(str, Enum):
    """Supported models enumeration."""

    # OpenAI Models
    GPT_4 = "gpt-4"
    GPT_4_TURBO = "gpt-4-turbo"
    GPT_3_5_TURBO = "gpt-3.5-turbo"
    GPT_4O = "gpt-4o"
    GPT_4O_MINI = "gpt-4o-mini"
    # Anthropic Models
    CLAUDE_3_5_SONNET_LATEST = "claude-3-5-sonnet-latest"
    CLAUDE_3_5_SONNET = "claude-3-5-sonnet-20241022"
    CLAUDE_3_5_HAIKU = "claude-3-5-haiku-20241022"
    CLAUDE_3_5_HAIKU_LATEST = "claude-3-5-haiku-latest"
    CLAUDE_3_OPUS = "claude-3-opus-20240229"
    CLAUDE_3_OPUS_LATEST = "claude-3-opus-latest"
    CLAUDE_3_SONNET = "claude-3-sonnet-20240229"
    CLAUDE_3_HAIKU = "claude-3-haiku-20240307"

    def __str__(self):
        # Render as the bare model identifier rather than "SupportedModel.X".
        return self._value_

    @classmethod
    def is_supported(cls, model_name: str) -> bool:
        """Tell whether `model_name` matches one of the enumerated models."""
        return any(model_name == member for member in cls.__members__.values())

    @classmethod
    def get_supported_models(cls) -> list[str]:
        """Return every enumerated model, in declaration order."""
        return [*cls.__members__.values()]
================================================
FILE: libs/megaparse_sdk/megaparse_sdk/utils/load_ssl.py
================================================
import ssl
from megaparse_sdk.config import SSLConfig
def load_ssl_cxt(ssl_config: SSLConfig):
    """
    Build a client-side TLS context from `ssl_config`.

    The CA file is loaded only when one is configured; the client
    certificate chain and key are always loaded.
    """
    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ca_file = ssl_config.ca_cert_file
    if ca_file:
        ctx.load_verify_locations(cafile=ca_file)
    ctx.load_cert_chain(
        certfile=ssl_config.ssl_cert_file,
        keyfile=ssl_config.ssl_key_file,
    )
    return ctx
================================================
FILE: libs/megaparse_sdk/pyproject.toml
================================================
[project]
name = "megaparse-sdk"
version = "0.1.12"
description = "Megaparse SDK"
dependencies = [
    "python-dotenv>=1.0.0",
    "pycryptodome>=3.21.0",
    "psutil>=6.1.0",
    "httpx>=0.27.0",
    "nats-py>=2.9.0",
    "loguru>=0.7.2",
    # The schema modules import pydantic and use the v2-only
    # `field_validator` / `field_serializer` API, so it must be declared.
    # NOTE(review): audit the other SDK modules (e.g. config) for further
    # undeclared imports.
    "pydantic>=2.0.0",
]
readme = "README.md"
requires-python = ">= 3.11"
[build-system]
# The build backend is pinned to an exact hatchling release.
requires = ["hatchling==1.26.3"]
build-backend = "hatchling.build"

[tool.rye]
managed = true
dev-dependencies = []
universal = true

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.build.targets.wheel]
# The wheel ships the top-level `megaparse_sdk` package directory.
packages = ["megaparse_sdk"]
================================================
FILE: libs/megaparse_sdk/tests/README.md
================================================
================================================
FILE: libs/megaparse_sdk/tests/certs/client-cert.pem
================================================
-----BEGIN CERTIFICATE-----
MIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw
gZ0xHjAcBgNVBAoTFW1rY2VydCBkZXZlbG9wbWVudCBDQTE5MDcGA1UECwwwYW1p
bmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChhbWluZSBkaXJob3Vzc2kpMUAw
PgYDVQQDDDdta2NlcnQgYW1pbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChh
bWluZSBkaXJob3Vzc2kpMB4XDTI0MTExOTEwNDgwN1oXDTI3MDIxOTEwNDgwN1ow
ZDEnMCUGA1UEChMebWtjZXJ0IGRldmVsb3BtZW50IGNlcnRpZmljYXRlMTkwNwYD
VQQLDDBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhv
dXNzaSkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC2fDlGlKYIj8bp
tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5
KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH
qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN
gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8
ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT
WWVVcNfJAgMBAAGjgZowgZcwDgYDVR0PAQH/BAQDAgWgMCcGA1UdJQQgMB4GCCsG
AQUFBwMCBggrBgEFBQcDAQYIKwYBBQUHAwQwHwYDVR0jBBgwFoAUV2w3gvQM5La1
2fk80tJXoM/14l4wOwYDVR0RBDQwMoIJbG9jYWxob3N0gRNtZWdhcGFyc2VAcXVp
dnIuYXBwhxAAAAAAAAAAAAAAAAAAAAABMA0GCSqGSIb3DQEBCwUAA4IBgQAYq4VZ
6spwGvcqg8kCOghu6o54UPYo/NLzh3oYewJnDJ+2XD786TpTgjZMGA6Ms+det6oV
HdT5s77VFgJiJloHlD0fpKkRxjzyBOk5/bQcCKkTMBVfgJbMoAfa2gq+/7zxmLcn
AmNg7BkmsTtHWPsLyN3rYI4dkkDKWkxp8Sezm9WPEa9OGJDJSYf4Dq9pN1lUoP1p
vxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW
0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9
ze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr
drdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7
/E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw=
-----END CERTIFICATE-----
================================================
FILE: libs/megaparse_sdk/tests/certs/client-key.pem
================================================
-----BEGIN PRIVATE KEY-----
MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp
tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5
KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH
qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN
gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8
ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT
WWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU
QYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj
rGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj
BkEzj7CbaAGtW9vTamPJG/+wgaaBcPhplQk4cD2mjdaMLfGQXNXiYgp09kf0hJ2k
0QbsQBC85bMSfmPAsoTRLxi94S12at3SABgF0oOCy9FZs/sWsdJRI6nbfvZ3C4xo
8y+rH7Yaej7AYK+jbU3Uk/1473cuCAnNKg65UyU4+gECgYEA2/ZQYRDU3JWNHQGy
dJXZRl6hSFCw9y9RUc/QjcRs+VlnXE5UK1eLwfcKh0YYRhIWSE8z3mZmK09M/FG0
xbU4qIZbDYcAI2nCiUeT8HmTjVSPMS1oWZrt7rh00gcyoLQt2TUS3bo2tsmdPyWW
OgEiYfb4MoG/KCdYlACE6O4GMMECgYEA1GIMIHM2x4B1wgLnKeI3X2wYWuYCHtFB
Px56GUFTZytBsHghxtovVlLh88FNS5rthvXuE0FHE9RljKhZaNgqrPOrlAZSuv18
vK7RmG/NPJl2osbs677a/xoxNuVkfrRcxl4cvYOBL5huHo1D5sOitGFW+IlscgWY
nWzXlY7AYQkCgYA6H96hp7b4CzTc42Pq1uYxaDQqTdhVmVVdzxKHQ86gHXXouHIZ
eereeI95q5YifgkRVoyYSmrZKv1m95hTXk34inhpHLF2qi3T5Ow88YOCJ0QndJ5M
f1o8aNXF4k0IllQ/P30axmhK6P/6fc4yybXyOTbg8dQ3oh4QDgsRGkTcgQKBgQCG
qLgJpyN3cPK5FYAeJUl4nh//GlED2yekbp15/9py0pFu42x/GX3kHN8Y31oz8sJh
zPKrkLsRTp0ohuFRwaWlTUZfr3arCugY9jr8jP6zSpZW9QvpGXTfRGsp5F5Im/Eq
8ScF3ih91gcUJfuEiExUVFeBdBinXvb58bXrJLzDiQKBgG+Z06uj2dWxtK4nqJvP
HllTocAGVm+fEmupVsLU6ksVVrOl8O9TapMbY8pUj9J5oBYJvY+KFGoIoxYwhZrz
4NqY7iv8w+LQ7mQIwcQ4B67pDAQMJZTShR5v57FlAZldP5UpE5ASt22isBW31sYI
1OaXIqrCA/V43NydDezh0ylQ
-----END PRIVATE KEY-----
================================================
FILE: libs/megaparse_sdk/tests/certs/rootCA.pem
================================================
-----BEGIN CERTIFICATE-----
MIIFCzCCA3OgAwIBAgIQESt0eck2KvFrAMyiDyceujANBgkqhkiG9w0BAQsFADCB
nTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMTkwNwYDVQQLDDBhbWlu
ZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhvdXNzaSkxQDA+
BgNVBAMMN21rY2VydCBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFt
aW5lIGRpcmhvdXNzaSkwHhcNMjQxMTE5MTAwMTA5WhcNMzQxMTE5MTAwMTA5WjCB
nTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMTkwNwYDVQQLDDBhbWlu
ZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhvdXNzaSkxQDA+
BgNVBAMMN21rY2VydCBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFt
aW5lIGRpcmhvdXNzaSkwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCw
6TX1kvqVMb8ZUQVT/vuDsedmbYgSFn68yJRlmE9BsqG7TLQHl2Kw6VQqZBSIkeZG
CypmUysX/3qrvICeArIdmmsrWUTDYPoauw/a/RY0I07rALj3YR0Y7039Hxf/UPT9
xlUtnM2NafkZyp6WRjEN0N4ETvJDIbUQiosiiPilxhwRbJURhT/JPskaw+OM2Sw5
dFAT20zkYC5VIc4wJBFLAMG0XzI6Sy/4wI1WdRBXd2UMpQU4u7TyD0RB4mnHorV6
kXjtLKD/KWSrSG1nnum9SB9eVatbRD+TUgoclwAKedrlCDEM4EsXVVuUuYCizQNb
+H3BSPfj1upUW5eKfgAyB+8r4QGf2yCY9O8NMMrJ1K5Qv4vSuWAU2tZqAyE8Z4Ke
UtHsl/M0zIvIKwyki2N/rieL/m6lTzS3dwSf9vv7eePEvxd8SBClSF07MUzyxkZ5
UYNxaK5t2ZRADZ6n/9/hAQsMscCkHiX1N2ypBFV+86Pr78BC48JgIyCMwuiBN4sC
AwEAAaNFMEMwDgYDVR0PAQH/BAQDAgIEMBIGA1UdEwEB/wQIMAYBAf8CAQAwHQYD
VR0OBBYEFFdsN4L0DOS2tdn5PNLSV6DP9eJeMA0GCSqGSIb3DQEBCwUAA4IBgQBj
KosfLfW/ZH80NM16pvpyRF3mCi+q+I+P8zrfilMYJBH4EEdEGAUgTO5do1kJXeel
Wky+FNxaP6KCNiT+0amypKg+yjBlnqLKVdnEgR5s12ZfmerV59stx1A/c/bYMEAS
re6xskBkowP2cVQHAC2dy/0Ov+lZsiNaPV2bQx6KUJurveebUQsH3uF3ZEhnUVQ6
rt5+JGY4x9Tr1YMhvHqEDTrsipPdDB1MyW1SnCkqSXrz+DPXGd8BW0O0hpM5la81
J+rfZGinbcUgXM6JMLIHDxLc4Xxzm4NijFzXhbR3XPXqEwsnZOuxcYYFgUGs3FwS
4ro+34a/O4uKS2KV8wsUWj/tWD2rLpduDgag4WSipCvWtaNve8gPdUiyPxUqxyoZ
aFAFg/izXwmRntogJtV0Zvo3fqAaQQDl8t2s21IIx0wmgHzgmkswb5OwFg3dOn/S
lmaH8v7FCBP7jHx/NCPTT5Sy/1EMRATmhFDUZ8Bod/TIlV3e+FCVqlX3kBBRbAU=
-----END CERTIFICATE-----
================================================
FILE: libs/megaparse_sdk/tests/test_nats_client.py
================================================
import asyncio
import logging
from pathlib import Path
import nats
import pytest
import pytest_asyncio
from megaparse_sdk.client import ClientState, MegaParseNATSClient
from megaparse_sdk.config import ClientNATSConfig, SSLConfig
from megaparse_sdk.schema.mp_exceptions import (
DownloadError,
InternalServiceError,
MemoryLimitExceeded,
ModelNotSupported,
ParsingException,
)
from megaparse_sdk.schema.mp_inputs import MPInput, ParseFileInput, ParseUrlInput
from megaparse_sdk.schema.mp_outputs import (
MPErrorType,
MPOutput,
MPOutputType,
ParseError,
)
from nats.aio.client import Client
logger = logging.getLogger(__name__)
# NATS endpoint and subject shared by every test in this module.
NATS_URL = "nats://test@127.0.0.1:4222"
NATS_SUBJECT = "parsing"
# Certificate paths are relative, so they resolve against the directory the
# tests are run from.
SSL_CERT_FILE = "./tests/certs/client-cert.pem"
SSL_KEY_FILE = "./tests/certs/client-key.pem"
CA_CERT_FILE = "./tests/certs/rootCA.pem"
@pytest.fixture(scope="session")
def ssl_config() -> SSLConfig:
    """Session-wide SSLConfig pointing at the test certificates."""
    return SSLConfig(
        ca_cert_file=CA_CERT_FILE,
        ssl_key_file=SSL_KEY_FILE,
        ssl_cert_file=SSL_CERT_FILE,
    )
@pytest.fixture(scope="session")
def nc_config(ssl_config: SSLConfig) -> ClientNATSConfig:
    """Session-wide client configuration with short timeouts and one retry."""
    return ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.5,
        max_retries=1,
        backoff=-1,
        connect_timeout=1,
        reconnect_time_wait=1,
        max_reconnect_attempts=1,
    )
@pytest_asyncio.fixture(scope="function")
async def nats_service(nc_config: ClientNATSConfig):
    """
    Yield a NATS connection that plays the server side in a test, and drain
    it once the test is done.
    """
    # TODO: fix TLS handshake to work in CI
    # ssl_config = load_ssl_cxt(nc_config.ssl_config)
    # No `tls` argument is passed until the TODO above is resolved. The
    # previous `tls=ssl_config` referred to the module-level `ssl_config`
    # fixture function, not an `ssl.SSLContext`, because the local
    # assignment is commented out.
    nc = await nats.connect(
        nc_config.endpoint,
        connect_timeout=nc_config.connect_timeout,
        reconnect_time_wait=nc_config.reconnect_time_wait,
        max_reconnect_attempts=nc_config.max_reconnect_attempts,
    )
    yield nc
    await nc.drain()
@pytest.mark.asyncio
async def test_client_state_transition(nc_config: ClientNATSConfig):
    """
    The client goes UNOPENED -> OPENED -> CLOSED around `async with`, and
    entering an already-closed client raises RuntimeError.
    """
    mpc = MegaParseNATSClient(nc_config)
    assert mpc._state == ClientState.UNOPENED
    async with mpc:
        assert mpc._state == ClientState.OPENED
    assert mpc._state == ClientState.CLOSED
    # A closed client cannot be reopened.
    with pytest.raises(RuntimeError):
        async with mpc:
            pass
@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_file(nats_service: Client, nc_config: ClientNATSConfig):
    """
    `parse_file` sends a ParseFileInput and returns the `result` of a
    PARSE_OK reply.
    """

    async def message_handler(msg):
        # Fake worker: check the request type, then reply with a fixed result.
        parsed_input = MPInput.model_validate_json(msg.data.decode("utf-8")).input
        assert isinstance(parsed_input, ParseFileInput)
        output = MPOutput(output_type=MPOutputType.PARSE_OK, result="test")
        await nats_service.publish(msg.reply, output.model_dump_json().encode("utf-8"))

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=message_handler)
    file_path = Path("./tests/pdf/sample_table.pdf")
    async with MegaParseNATSClient(nc_config) as mp_client:
        resp = await mp_client.parse_file(file=file_path)
        assert resp == "test"
@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_url(nats_service: Client, nc_config: ClientNATSConfig):
    """
    `parse_url` sends a ParseUrlInput and returns the `result` of a
    PARSE_OK reply.
    """

    async def message_handler(msg):
        # Fake worker: check the request type, then reply with a fixed result.
        parsed_input = MPInput.model_validate_json(msg.data.decode("utf-8")).input
        assert isinstance(parsed_input, ParseUrlInput)
        output = MPOutput(output_type=MPOutputType.PARSE_OK, result="url")
        await nats_service.publish(msg.reply, output.model_dump_json().encode("utf-8"))

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=message_handler)
    async with MegaParseNATSClient(nc_config) as mp_client:
        resp = await mp_client.parse_url(url="this://this")
        assert resp == "url"
@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_timeout(nats_service: Client, ssl_config: SSLConfig):
    """
    `parse_file` raises ParsingException when the worker never replies
    within the configured timeout.
    """
    nc_config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.1,
        max_retries=1,
        backoff=1,
    )

    async def service(msg):
        # Fake worker: sleep past the client timeout and never reply.
        await asyncio.sleep(2 * nc_config.timeout)

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=service)
    file_path = Path("./tests/pdf/sample_table.pdf")
    with pytest.raises(ParsingException):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)
@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_timeout_retry(nats_service: Client, ssl_config: SSLConfig):
    """
    With `max_retries=2` and a worker that never replies, the request is
    sent twice before ParsingException is raised.
    """
    nc_config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.1,
        max_retries=2,
        backoff=-5,
    )
    msgs = []

    async def service(msg):
        # Fake worker: record the request, then sleep past the client timeout.
        msgs.append(msg)
        await asyncio.sleep(2 * nc_config.timeout)

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=service)
    file_path = Path("./tests/pdf/sample_table.pdf")
    with pytest.raises(ParsingException):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)
    # One message per attempt.
    assert len(msgs) == 2
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.parametrize(
    "mp_error_type, exception_class",
    [
        ("MEMORY_LIMIT", MemoryLimitExceeded),
        ("INTERNAL_SERVER_ERROR", InternalServiceError),
        ("MODEL_NOT_SUPPORTED", ModelNotSupported),
        ("DOWNLOAD_ERROR", DownloadError),
        ("PARSING_ERROR", ParsingException),
    ],
)
async def test_client_parse_file_excp(
    nats_service: Client, nc_config: ClientNATSConfig, mp_error_type, exception_class
):
    """
    A PARSE_ERR reply carrying a given MPErrorType makes `parse_file` raise
    the matching exception class.
    """

    async def message_handler(msg):
        # Fake worker: reply with an error of the parametrized type.
        parsed_input = MPInput.model_validate_json(msg.data.decode("utf-8")).input
        assert isinstance(parsed_input, ParseFileInput)
        err = ParseError(mp_err_code=MPErrorType[mp_error_type], message="")
        output = MPOutput(
            output_type=MPOutputType.PARSE_ERR,
            err=err,
            result=None,
        )
        await nats_service.publish(msg.reply, output.model_dump_json().encode("utf-8"))

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=message_handler)
    file_path = Path("./tests/pdf/sample_table.pdf")
    with pytest.raises(exception_class):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)
================================================
FILE: pyproject.toml
================================================
[project]
name = "megaparse-monorepo"
version = "0.0.1"
description = "Megaparse monorepo"
authors = [
    { name = "Stan Girard", email = "stan@quivr.app" },
    { name = "Chloé Daems", email = "chloe@quivr.app" },
    { name = "Amine Dirhoussi", email = "amine@quivr.app" },
    { name = "Jacopo Chevallard", email = "jacopo@quivr.app" },
]
readme = "README.md"
requires-python = ">= 3.11"
dependencies = [
    "packaging>=22.0",
]

[build-system]
# The build backend is pinned to an exact hatchling release.
requires = ["hatchling==1.26.3"]
build-backend = "hatchling.build"

[tool.rye]
python = ">= 3.11"
managed = true
universal = true
dev-dependencies = [
    "mypy>=1.11.1",
    "pre-commit>=3.8.0",
    "ipykernel>=6.29.5",
    "ruff>=0.6.0",
    "flake8>=7.1.1",
    "flake8-black>=0.3.6",
    "pytest-asyncio>=0.23.8",
    "pytest>=8.3.3",
    "pytest-xdist>=3.6.1",
    "pytest-cov>=5.0.0",
    "pytest-profiling>=1.8.1",
]

[tool.rye.workspace]
# Every directory under libs/ is a workspace member.
members = ["libs/*"]

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.build.targets.wheel]
# NOTE(review): the megaparse sources live under libs/megaparse/src/megaparse;
# confirm that a top-level src/megaparse directory exists for this path.
packages = ["src/megaparse"]
[tool.ruff]
line-length = 88
exclude = [".git", "__pycache__", ".mypy_cache", ".pytest_cache"]

[tool.ruff.lint]
select = [
    "E", # pycodestyle errors
    "W", # pycodestyle warnings
    "F", # pyflakes
    "I", # isort
    "C", # flake8-comprehensions
    "B", # flake8-bugbear
]
ignore = [
    "B904", # raise without `from` inside an except clause
    "B006", # mutable argument default
    "E501", # line too long, handled by black
    "B008", # do not perform function calls in argument defaults
    "C901", # too complex
]

[tool.ruff.lint.isort]
order-by-type = true
relative-imports-order = "closest-to-furthest"
extra-standard-library = ["typing"]
section-order = [
    "future",
    "standard-library",
    "third-party",
    "first-party",
    "local-folder",
]
known-first-party = []
[tool.pytest.ini_options]
addopts = "--tb=short -ra -v"
# Async fixtures share one session-wide event loop by default.
asyncio_default_fixture_loop_scope = "session"
filterwarnings = ["ignore::DeprecationWarning"]
# NOTE(review): the `base` marker text refers to quivr-core; presumably these
# markers were copied from another project — confirm they apply here.
markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "base: these tests require quivr-core with extra `base` to be installed",
    "tika: these tests require a tika server to be running",
    "unstructured: these tests require `unstructured` dependency",
]
================================================
FILE: release-please-config.json
================================================
{
"$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json",
"separate-pull-requests": true,
"include-v-in-tag": true,
"bump-patch-for-minor-pre-major": true,
"include-component-in-tag": true,
"packages": {
"libs/megaparse": {
"release-type": "python",
"package-name": "megaparse",
"changelog-notes-type": "github"
},
"libs/megaparse_sdk": {
"release-type": "python",
"package-name": "megaparse-sdk",
"changelog-notes-type": "github"
}
}
}