Repository: QuivrHQ/MegaParse Branch: main Commit: ba9a24aec950 Files: 123 Total size: 28.4 MB Directory structure: gitextract_ylqgqesz/ ├── .aws/ │ └── task_definition.json ├── .flake8 ├── .gitattributes ├── .github/ │ └── workflows/ │ ├── CI.yml │ ├── build-and-deploy.yml │ ├── build-gpu.yml │ ├── release-please.yml │ └── test-build-docker.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── .release-please-manifest.json ├── .vscode/ │ ├── extensions.json │ ├── launch.json │ └── settings.json ├── CHANGELOG.md ├── Dockerfile ├── Dockerfile.gpu ├── LICENSE ├── Makefile ├── Pipfile ├── README.md ├── benchmark/ │ ├── process_single_doc.py │ └── test_quality_sim.py ├── docker-compose.dev.yml ├── docker-compose.yml ├── docs/ │ └── archive.txt ├── evaluations/ │ └── script.py ├── libs/ │ ├── megaparse/ │ │ ├── .python-version │ │ ├── CHANGELOG.md │ │ ├── README.md │ │ ├── bench.md │ │ ├── examples/ │ │ │ ├── parse_file_fast.py │ │ │ ├── parse_file_mp.py │ │ │ └── parse_file_unstructured.py │ │ ├── program.prof │ │ ├── pyproject.toml │ │ ├── src/ │ │ │ └── megaparse/ │ │ │ ├── __init__.py │ │ │ ├── api/ │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ ├── exceptions/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── megaparse_exceptions.py │ │ │ │ └── models/ │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ │ ├── configs/ │ │ │ │ └── auto.py │ │ │ ├── examples/ │ │ │ │ ├── parse_file.py │ │ │ │ └── parsing_process.py │ │ │ ├── exceptions/ │ │ │ │ └── base.py │ │ │ ├── formatter/ │ │ │ │ ├── base.py │ │ │ │ ├── structured_formatter/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── custom_structured_formatter.py │ │ │ │ └── table_formatter/ │ │ │ │ ├── __init__.py │ │ │ │ ├── llm_table_formatter.py │ │ │ │ └── vision_table_formatter.py │ │ │ ├── layout_detection/ │ │ │ │ ├── layout_detector.py │ │ │ │ ├── models/ │ │ │ │ │ └── yolov10s-doclaynet.onnx │ │ │ │ └── output.py │ │ │ ├── megaparse.py │ │ │ ├── models/ │ │ │ │ └── page.py │ │ │ ├── parser/ │ │ │ │ ├── __init__.py 
│ │ │ │ ├── base.py │ │ │ │ ├── builder.py │ │ │ │ ├── doctr_parser.py │ │ │ │ ├── entity.py │ │ │ │ ├── llama.py │ │ │ │ ├── megaparse_vision.py │ │ │ │ └── unstructured_parser.py │ │ │ ├── predictor/ │ │ │ │ └── layout_predictor.py │ │ │ └── utils/ │ │ │ ├── extract_metadata.py │ │ │ ├── onnx.py │ │ │ └── strategy.py │ │ └── tests/ │ │ ├── __init__.py │ │ ├── certs/ │ │ │ ├── client-cert.pem │ │ │ └── client-key.pem │ │ ├── conftest.py │ │ ├── data/ │ │ │ └── grt_example/ │ │ │ └── MegaFake_report.md │ │ ├── pdf/ │ │ │ ├── test_detect_ocr.py │ │ │ ├── test_pdf_processing.py │ │ │ └── test_pdfium_parser.py │ │ ├── supported_docs/ │ │ │ ├── Sway.epub │ │ │ ├── file-sample_500kB.odt │ │ │ ├── file_example_XLSX_50.xlsx │ │ │ ├── file_example_XLS_50.xls │ │ │ ├── sample.csv │ │ │ ├── sample.docx │ │ │ ├── sample.markdown │ │ │ ├── sample.md │ │ │ ├── sample.otf │ │ │ ├── sample.pptx │ │ │ ├── sample.txt │ │ │ ├── sample.xml │ │ │ └── sample_complexe.html │ │ ├── test_endpoints.py │ │ ├── test_import.py │ │ └── test_parsers.py │ └── megaparse_sdk/ │ ├── CHANGELOG.md │ ├── README.md │ ├── __init__.py │ ├── examples/ │ │ └── usage_example.py │ ├── megaparse_sdk/ │ │ ├── __init__.py │ │ ├── client.py │ │ ├── config.py │ │ ├── endpoints/ │ │ │ ├── __init__.py │ │ │ ├── file_upload.py │ │ │ └── url_upload.py │ │ ├── schema/ │ │ │ ├── __init__.py │ │ │ ├── document.py │ │ │ ├── extensions.py │ │ │ ├── languages.py │ │ │ ├── mp_exceptions.py │ │ │ ├── mp_inputs.py │ │ │ ├── mp_outputs.py │ │ │ ├── parser_config.py │ │ │ └── supported_models.py │ │ └── utils/ │ │ └── load_ssl.py │ ├── pyproject.toml │ └── tests/ │ ├── README.md │ ├── certs/ │ │ ├── client-cert.pem │ │ ├── client-key.pem │ │ └── rootCA.pem │ └── test_nats_client.py ├── pyproject.toml └── release-please-config.json ================================================ FILE CONTENTS ================================================ ================================================ FILE: .aws/task_definition.json 
================================================ { "taskDefinitionArn": "arn:aws:ecs:eu-west-1:253053805092:task-definition/megaparse-task:2", "containerDefinitions": [ { "name": "megaparse", "image": "quay.io/unstructured-io/unstructured-api:latest", "cpu": 0, "portMappings": [ { "containerPort": 8000, "hostPort": 8000, "protocol": "tcp" } ], "essential": true, "environment": [ { "name": "UNSTRUCTURED_HI_RES_MODEL_NAME", "value": "detectron2_onnx" }, { "name": "UNSTRUCTURED_PARALLEL_MODE_ENABLED", "value": "false" } ], "mountPoints": [], "volumesFrom": [], "logConfiguration": { "logDriver": "awslogs", "options": { "awslogs-group": "/ecs/megaparse", "awslogs-region": "eu-west-1", "awslogs-stream-prefix": "ecs" } }, "systemControls": [] } ], "family": "megaparse-task", "executionRoleArn": "arn:aws:iam::253053805092:role/megaparse-ecsTaskExecutionRole", "networkMode": "awsvpc", "revision": 2, "volumes": [], "status": "ACTIVE", "requiresAttributes": [ { "name": "com.amazonaws.ecs.capability.logging-driver.awslogs" }, { "name": "ecs.capability.execution-role-awslogs" }, { "name": "com.amazonaws.ecs.capability.docker-remote-api.1.19" }, { "name": "com.amazonaws.ecs.capability.docker-remote-api.1.18" }, { "name": "ecs.capability.task-eni" } ], "placementConstraints": [], "compatibilities": [ "EC2", "FARGATE" ], "requiresCompatibilities": [ "FARGATE" ], "cpu": "2048", "memory": "8192", "tags": [] } ================================================ FILE: .flake8 ================================================ [flake8] ; Minimal configuration for Flake8 to work with Black. 
max-line-length = 100 ignore = E101,E111,E112,E221,E222,E501,E711,E712,W503,W504,F401,BLK100 ================================================ FILE: .gitattributes ================================================ *.ipynb linguist-vendored *.html linguist-vendored ================================================ FILE: .github/workflows/CI.yml ================================================ name: Run tests on: pull_request: workflow_dispatch: env: NATS_TOKEN: test jobs: test: name: Run tests on Python ${{ matrix.python-version }} runs-on: ubuntu-latest strategy: matrix: python-version: ["3.11", "3.12"] steps: - name: 👀 Checkout code uses: actions/checkout@v2 with: submodules: true - name: Setup apt cache uses: actions/cache@v2 with: path: /var/cache/apt/archives key: ${{ runner.os }}-apt-${{ hashFiles('/etc/apt/sources.list') }} - name: 😭 Install system dependencies run: | sudo apt-get update && sudo apt-get install -y \ netcat-traditional \ unzip \ libgeos-dev \ libcurl4-openssl-dev \ libssl-dev \ binutils \ curl \ git \ autoconf \ automake \ build-essential \ libtool \ gcc \ libmagic-dev \ poppler-utils \ tesseract-ocr \ libreoffice \ libpq-dev \ pandoc - name: 🔽 Install the latest version of rye uses: eifinger/setup-rye@v4 with: enable-cache: true - name: 📌 Pin Python version run: rye pin ${{ matrix.python-version }} - name: 🔽 Download and Install NATS Server run: | curl -L https://github.com/nats-io/nats-server/releases/download/v2.10.22/nats-server-v2.10.22-linux-amd64.zip -o nats-server.zip unzip nats-server.zip -d nats-server && sudo cp nats-server/nats-server-v2.10.22-linux-amd64/nats-server /usr/bin - name: 🛠️ Set up NATS arguments run: | nohup nats-server \ --addr 0.0.0.0 \ --port 4222 \ --auth "$NATS_TOKEN" > nats.log 2>&1 & - name: 🔍 Verify NATS Server is Running run: | sleep 1 # Give the server some time to start if nc -zv localhost 4222; then echo "✅ NATS Server is running on port 4222." else echo "❌ Failed to start NATS Server." 
cat nats.log exit 1 fi - name: 🔨 Sync dependencies run: | UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock - name: 🚀 Run tests run: | rye test -p megaparse-sdk ================================================ FILE: .github/workflows/build-and-deploy.yml ================================================ name: Build Docker image and push ECR on: push: tags: - "v*" branches: [main] env: AWS_REGION: eu-west-1 ECR_REPOSITORY: quivrhq/megaparse ECS_CLUSTER: megaparse ECS_TASK_DEFINITION: .aws/task_definition.json CONTAINER_NAME: megaparse permissions: contents: read jobs: deploy: name: build docker runs-on: ubuntu-latest environment: production outputs: imageoutput: ${{ steps.build-image.outputs.imageoutput }} steps: - name: Checkout uses: actions/checkout@v3 - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: us-east-1 - name: Login to Amazon ECR id: login-ecr uses: aws-actions/amazon-ecr-login@v1 with: registry-type: public - name: Build, tag, and push image to Amazon ECR id: build-image env: ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} IMAGE_TAG: ${{ github.sha }} run: | # Build a docker container and push it to ECR docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 
docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG # Tag the image as 'latest' and push docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT ================================================ FILE: .github/workflows/build-gpu.yml ================================================ name: Build docker GPU and push ECR on: push: tags: - "v*" branches: [main] env: AWS_REGION: eu-west-1 ECR_REPOSITORY: quivrhq/megaparse-gpu ECS_CLUSTER: megaparse ECS_TASK_DEFINITION: .aws/task_definition.json CONTAINER_NAME: megaparse permissions: contents: read jobs: deploy: name: Build docker-gpu runs-on: group: big-boy-gpu environment: production outputs: imageoutput: ${{ steps.build-image.outputs.imageoutput }} steps: - name: Checkout uses: actions/checkout@v3 - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: us-east-1 - name: Login to Amazon ECR id: login-ecr uses: aws-actions/amazon-ecr-login@v1 with: registry-type: public - name: Build, tag, and push image to Amazon ECR id: build-image env: ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} IMAGE_TAG: ${{ github.sha }} run: | # Build a docker container and push it to ECR docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f Dockerfile.gpu . 
docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG # Tag the image as 'latest' and push docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT ================================================ FILE: .github/workflows/release-please.yml ================================================ on: push: branches: - main permissions: contents: write pull-requests: write name: release-please jobs: release-please: runs-on: ubuntu-latest outputs: release_created: ${{ steps.release.outputs['libs/megaparse--release_created'] }} release_created_sdk: ${{ steps.release.outputs['libs/megaparse_sdk--release_created'] }} steps: - name: Checkout repository uses: actions/checkout@v3 with: fetch-depth: 0 # Fetch all history for tags and releases - name: Setup Python uses: actions/setup-python@v4 with: python-version: "3.11" - name: Run release-please id: release uses: google-github-actions/release-please-action@v4 with: token: ${{ secrets.RELEASE_PLEASE_TOKEN }} deploy-megaparse: if: needs.release-please.outputs.release_created == 'true' needs: release-please runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Install Rye uses: eifinger/setup-rye@v2 with: enable-cache: true - name: Rye Sync run: rye sync --no-lock - name: Rye Build run: cd libs/megaparse && rye build - name: Rye Publish run: cd libs/megaparse && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes deploy-sdk: if: needs.release-please.outputs.release_created_sdk == 'true' needs: release-please runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Install Rye uses: eifinger/setup-rye@v2 with: enable-cache: true - name: Rye Sync run: cd libs/megaparse_sdk && rye sync --no-lock - name: Rye Build run: cd libs/megaparse_sdk && rye build - name: Rye Publish run: cd libs/megaparse_sdk && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes 
================================================ FILE: .github/workflows/test-build-docker.yml ================================================ on: pull_request: branches: - main name: Test build docker jobs: build-docker: runs-on: ubuntu-latest strategy: matrix: dockerfile: [Dockerfile, Dockerfile.gpu] steps: - name: Checkout repository uses: actions/checkout@v3 - name: Set up QEMU uses: docker/setup-qemu-action@v3 with: platforms: all - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build Docker image with caching uses: docker/build-push-action@v4 with: context: . file: ${{ matrix.dockerfile }} push: false tags: quivrhq/megaparse:${{ matrix.dockerfile }} cache-from: type=gha cache-to: type=gha,mode=max ================================================ FILE: .gitignore ================================================ /output /input .env __pycache__/ dist/** megaparse.egg-info/ *.pyc build/* ENV venv */evaluations/* */cdp/* *.pkl !megaparse/tests/output_tests/MegaFake_report.md *.DS_Store .tool-versions megaparse/sdk/examples/only_pdfs/* **/profile/ **/prof/ .ropeproject/ benchmark/hi_res/* benchmark/auto/* ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.6.0 hooks: - id: check-added-large-files args: ["--maxkb=5000"] - id: check-toml - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - id: check-merge-conflict - id: detect-private-key - id: check-case-conflict - repo: https://github.com/pre-commit/pre-commit rev: v3.6.2 hooks: - id: validate_manifest - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.5.1 hooks: # Run the linter. - id: ruff args: [--fix] additional_dependencies: [] # Run the formatter. 
- id: ruff-format additional_dependencies: [] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.10.1 hooks: - id: mypy name: mypy additional_dependencies: ["types-aiofiles"] ================================================ FILE: .python-version ================================================ 3.11.9 ================================================ FILE: .release-please-manifest.json ================================================ { "libs/megaparse": "0.0.55", "libs/megaparse_sdk": "0.1.12" } ================================================ FILE: .vscode/extensions.json ================================================ { "recommendations": [ "dbaeumer.vscode-eslint", "charliermarsh.ruff", "knisterpeter.vscode-github", "github.vscode-pull-request-github", "ms-python.python", "ms-python.vscode-pylance", "ms-python.debugpy" ] } ================================================ FILE: .vscode/launch.json ================================================ { "version": "0.2.0", "configurations": [ { "name": "Python: Remote Attach", "type": "python", "request": "attach", "connect": { "host": "localhost", "port": 5678 }, "pathMappings": [ { "localRoot": "${workspaceFolder}/backend", "remoteRoot": "." 
} ], "justMyCode": true }, { "name": "Python: Debug Test Script", "type": "python", "request": "launch", "program": "${workspaceFolder}/backend/test_process_file_and_notify.py", "console": "integratedTerminal", "justMyCode": false }, { "name": "Python: Debug", "type": "debugpy", "request": "launch", "program": "${file}", "console": "integratedTerminal", "justMyCode": false, "env": { "PYTHONPATH": "${workspaceFolder}/backend:${env:PYTHONPATH}" }, "envFile": "${workspaceFolder}/.env" } ] } ================================================ FILE: .vscode/settings.json ================================================ { "editor.formatOnSave": true, "editor.formatOnSaveMode": "file", "files.exclude": { "**/__pycache__": true, "**/.benchmarks/": true, "**/.cache/": true, "**/.pytest_cache/": true, "**/.next/": true, "**/build/": true, "**/.docusaurus/": true, "**/node_modules/": true }, "[python]": { "editor.defaultFormatter": "charliermarsh.ruff", "editor.formatOnSave": true, "editor.codeActionsOnSave": { "source.organizeImports": "explicit", "source.fixAll": "explicit" } }, "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "python.testing.autoTestDiscoverOnSaveEnabled": true, "python.analysis.autoImportCompletions": true, "python.analysis.typeCheckingMode": "basic", "python.analysis.diagnosticSeverityOverrides": { "reportMissingImports": "error", "reportUnusedImport": "warning", "reportGeneralTypeIssues": "warning" }, "makefile.configureOnOpen": false } ================================================ FILE: CHANGELOG.md ================================================ # Changelog ## [0.0.46](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.45...megaparse-v0.0.46) (2024-11-21) ### Features * refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334)) ## 
[0.0.45](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.44...megaparse-v0.0.45) (2024-11-19) ### Bug Fixes * small fixes from backlogs ([#128](https://github.com/QuivrHQ/MegaParse/issues/128)) ([954554c](https://github.com/QuivrHQ/MegaParse/commit/954554c5abaa7b0513e9ff3f6bbaff393d36cf03)) ## [0.0.44](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.43...megaparse-v0.0.44) (2024-11-18) ### Bug Fixes * fixing the wrong passing of arguments to the parse_file endpoint ([#123](https://github.com/QuivrHQ/MegaParse/issues/123)) ([9105672](https://github.com/QuivrHQ/MegaParse/commit/9105672abc0942f26785e494053112d486e8d2d9)) ## [0.0.43](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.42...megaparse-v0.0.43) (2024-11-14) ### Features * increase the robustness of megaparse ([#121](https://github.com/QuivrHQ/MegaParse/issues/121)) ([d21d8bb](https://github.com/QuivrHQ/MegaParse/commit/d21d8bb77bd8e687b1a951db6b81653e4e47a8bb)) ### Bug Fixes * uvicorn version ([#127](https://github.com/QuivrHQ/MegaParse/issues/127)) ([ceaba3d](https://github.com/QuivrHQ/MegaParse/commit/ceaba3df2951be27e6a4835e5784917a62867896)) * version requirements ([#126](https://github.com/QuivrHQ/MegaParse/issues/126)) ([a10d502](https://github.com/QuivrHQ/MegaParse/commit/a10d502f1b3576690cebe33b656d2480a24defe3)) ## [0.0.42](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.41...megaparse-v0.0.42) (2024-11-08) ### Features * **sdk:** new version ([e377cd6](https://github.com/QuivrHQ/MegaParse/commit/e377cd6df98b3ea9265788a4d907b43bde796196)) ## [0.0.41](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.40...megaparse-v0.0.41) (2024-11-08) ### Bug Fixes * add megaparse url env variable ([#118](https://github.com/QuivrHQ/MegaParse/issues/118)) ([132c2eb](https://github.com/QuivrHQ/MegaParse/commit/132c2ebd13177fd116c4e710a4b1c864a9fa04bb)) ## [0.0.40](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.39...megaparse-v0.0.40) (2024-11-08) 
### Bug Fixes * sdk version ([#116](https://github.com/QuivrHQ/MegaParse/issues/116)) ([8bfeb4a](https://github.com/QuivrHQ/MegaParse/commit/8bfeb4a52326a5f645d3ed20e113153dc19bf012)) ## [0.0.39](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.38...megaparse-v0.0.39) (2024-11-08) ### Bug Fixes * add_logs ([#114](https://github.com/QuivrHQ/MegaParse/issues/114)) ([63c9236](https://github.com/QuivrHQ/MegaParse/commit/63c9236590016ee4c210174e746e96ff2b654480)) ## [0.0.38](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.37...megaparse-v0.0.38) (2024-11-07) ### Bug Fixes * env roots, imports root ([#112](https://github.com/QuivrHQ/MegaParse/issues/112)) ([a04230d](https://github.com/QuivrHQ/MegaParse/commit/a04230dc2de9e0bb0bde39ab66b2208f80743922)) ## [0.0.37](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.36...megaparse-v0.0.37) (2024-11-07) ### Features * bump megaparse-sdk version to 0.1.1 ([ed3fdfb](https://github.com/QuivrHQ/MegaParse/commit/ed3fdfb10498c95d4f9a510df3a2913e0dfc3c23)) ## [0.0.36](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.35...megaparse-v0.0.36) (2024-11-07) ### Features * **readme:** update ([9d571b7](https://github.com/QuivrHQ/MegaParse/commit/9d571b7c71db610e7a0b08045ad98994ecf71baa)) ## [0.0.35](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.34...megaparse-v0.0.35) (2024-11-07) ### Bug Fixes * unnecessary dep and readme ([#107](https://github.com/QuivrHQ/MegaParse/issues/107)) ([b80aaa3](https://github.com/QuivrHQ/MegaParse/commit/b80aaa3a894b2bd2c7d7f518919c41af5c99219f)) ## [0.0.34](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.33...megaparse-v0.0.34) (2024-11-07) ### Features * megaparse-sdk-cherry ([#105](https://github.com/QuivrHQ/MegaParse/issues/105)) ([ad44aa3](https://github.com/QuivrHQ/MegaParse/commit/ad44aa34999596e156c78f91adab97bce7ceeb0e)) ## [0.0.33](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.32...megaparse-v0.0.33) (2024-11-01) ### 
Bug Fixes * readme ([#99](https://github.com/QuivrHQ/MegaParse/issues/99)) ([b3b80a3](https://github.com/QuivrHQ/MegaParse/commit/b3b80a3a599bbd4bec8ed79bb9ef44c8c7c92789)) ## [0.0.32](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.31...megaparse-v0.0.32) (2024-11-01) ### Features * **api:** megaparse under api ([#93](https://github.com/QuivrHQ/MegaParse/issues/93)) ([2edf44b](https://github.com/QuivrHQ/MegaParse/commit/2edf44bd8c09ac7127db74206e463ebe29c68998)) ### Bug Fixes * api call error & tests ([#98](https://github.com/QuivrHQ/MegaParse/issues/98)) ([6bf1ce8](https://github.com/QuivrHQ/MegaParse/commit/6bf1ce8c6ed0e4f1e81577973a0fc71f61b10776)) ## [0.0.31](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.30...megaparse-v0.0.31) (2024-08-20) ### Features * **pytorch:** cpu only removed ([#88](https://github.com/QuivrHQ/MegaParse/issues/88)) ([6b2fcfa](https://github.com/QuivrHQ/MegaParse/commit/6b2fcfa4413b8a72d398aab57f277dd28ab69c2f)) ## [0.0.30](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.29...megaparse-v0.0.30) (2024-08-20) ### Features * **pytorch:** cpu only optional ([#86](https://github.com/QuivrHQ/MegaParse/issues/86)) ([e5d8806](https://github.com/QuivrHQ/MegaParse/commit/e5d8806ee6182de250352ce65ac6cd57c1093494)) ## [0.0.29](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.28...megaparse-v0.0.29) (2024-08-18) ### Bug Fixes * **building:** version not working ([#83](https://github.com/QuivrHQ/MegaParse/issues/83)) ([c5e73f6](https://github.com/QuivrHQ/MegaParse/commit/c5e73f6c821424ef277ddd15ddb5b2df48ff7ab2)) ## [0.0.28](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.27...megaparse-v0.0.28) (2024-08-16) ### Features * **rye:** added package manager ([#81](https://github.com/QuivrHQ/MegaParse/issues/81)) ([a3a50a3](https://github.com/QuivrHQ/MegaParse/commit/a3a50a3f27d3d9b4d6de4f3415472f8e52710656)) ## 
[0.0.27](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.26...megaparse-v0.0.27) (2024-08-16) ### Features * **unstructured:** increased version ([#78](https://github.com/QuivrHQ/MegaParse/issues/78)) ([eb49cf5](https://github.com/QuivrHQ/MegaParse/commit/eb49cf5e79cd7a38c8212b315a4b64860c35a7b7)) ## [0.0.26](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.25...megaparse-v0.0.26) (2024-08-16) ### Bug Fixes * **pycrypto:** being used by an old version of pdfplumber ([#76](https://github.com/QuivrHQ/MegaParse/issues/76)) ([d28f88c](https://github.com/QuivrHQ/MegaParse/commit/d28f88ceb2a722b15c84738f395b3ff4c818a365)) ## [0.0.25](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.24...megaparse-v0.0.25) (2024-08-16) ### Features * **rye:** implemented ([#74](https://github.com/QuivrHQ/MegaParse/issues/74)) ([1e9ad8e](https://github.com/QuivrHQ/MegaParse/commit/1e9ad8e0000f28c709d915219fe62c0dbe7fa812)) ## [0.0.24](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.23...megaparse-v0.0.24) (2024-07-30) ### Features * async load ([#71](https://github.com/QuivrHQ/MegaParse/issues/71)) ([fbc3e1b](https://github.com/QuivrHQ/MegaParse/commit/fbc3e1b5f504eee9757e15592169ddad9b069f03)) ## [0.0.23](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.22...megaparse-v0.0.23) (2024-07-30) ### Features * megaparse 0.0.22 ([071fd4d](https://github.com/QuivrHQ/MegaParse/commit/071fd4da2e8f0abb58fc66c3cdd87c4ee5cda4d6)) ## 0.0.20 (2024-07-10) ## What's Changed * add: resolve multiple page problem on llama parse by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/61 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.19...v0.0.20 ## 0.0.19 (2024-06-28) ## What's Changed * add: choose unstructured strategy by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/57 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.18...v0.0.19 ## 0.0.18 (2024-06-28) ## What's Changed * fix: add __init__.py by 
@chloedia in https://github.com/QuivrHQ/MegaParse/pull/54 * fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/56 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.17...v0.0.18 ## 0.0.17 (2024-06-27) ## What's Changed * markdown by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/48 * fix:Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/49 * fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/50 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.16...v0.0.17 ## 0.0.16 (2024-06-27) ## What's Changed * Fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/47 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.15...v0.0.16 ## 0.0.15 (2024-06-26) ## What's Changed * add: llm megaparser by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/42 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.14...v0.0.15 ## 0.0.14 (2024-06-24) ## What's Changed * fix: remove nest asycio by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/40 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.13...v0.0.14 ## 0.0.13 (2024-06-24) ## What's Changed * fix: use aload_data by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/38 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.12...v0.0.13 ## 0.0.12 (2024-06-18) ## What's Changed * fix:delete markdownify dependency by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/33 * fix: fake fix README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/34 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.11...v0.0.12 ## 0.0.11 (2024-06-17) ## What's Changed * Fix OpenAI key error. Add docstrings. Polish code by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/24 * Fix DOCX reader. 
Add input tests by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/25 * add: xlsx convertor by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/29 * add: convert_tab by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/31 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.10...v0.0.11 ## 0.0.10 (2024-06-04) ## What's Changed * Change from LiteralString to Literal (typing) by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/21 * chore: Add Dockerfile and Makefile for project setup by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/23 ## New Contributors * @dSupertramp made their first contribution in https://github.com/QuivrHQ/MegaParse/pull/21 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.9...v0.0.10 ## 0.0.9 (2024-06-04) ## What's Changed * chore: Update README.md to include optional use of LlamaParse for improved results by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/19 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.8...v0.0.9 ## 0.0.8 (2024-06-04) **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.7...v0.0.8 ## 0.0.7 (2024-06-03) ## What's Changed * feat: Update benchmark results in README.md by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/15 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.6...v0.0.7 ## 0.0.6 (2024-06-03) ## What's Changed * add: gpt cleaner for header and footer by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/13 ## New Contributors * @chloedia made their first contribution in https://github.com/QuivrHQ/MegaParse/pull/13 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.5...v0.0.6 ## 0.0.5 (2024-06-02) ## What's Changed * feat: Add instructions for installing poppler and tesseract by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/10 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.4...v0.0.5 ## 0.0.4 (2024-06-02) ## 
What's Changed * add: baseline evaluation by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/7 * Add support for Unstructured Parser, improve Table and Image Parsing, and add TOC and Hyperlinks for Docx by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/9 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.3...v0.0.4 ## 0.0.3 (2024-05-30) **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.2...v0.0.3 ## 0.0.2 (2024-05-30) ## What's Changed * feat: Megaparse example and working by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/2 **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.1...v0.0.2 ## 0.0.2 (2024-05-30) **Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.1...v0.0.2 ================================================ FILE: Dockerfile ================================================ FROM python:3.11.10-slim-bullseye WORKDIR /app # Install runtime dependencies RUN apt-get update && apt-get upgrade && apt-get install -y \ libgeos-dev \ libcurl4-openssl-dev \ libssl-dev \ binutils \ curl \ git \ autoconf \ automake \ build-essential \ libtool \ python-dev \ build-essential \ wget \ gcc \ # Additional dependencies for document handling libmagic-dev \ poppler-utils \ tesseract-ocr \ libreoffice \ libpq-dev \ pandoc && \ rm -rf /var/lib/apt/lists/* && apt-get clean COPY requirements.lock pyproject.toml README.md ./ COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/ COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/ RUN pip install uv RUN uv pip install --no-cache --system -r requirements.lock RUN playwright install --with-deps RUN python3 - -m nltk.downloader all COPY . . 
RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk EXPOSE 8000 CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000"] ================================================ FILE: Dockerfile.gpu ================================================ FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu20.04 WORKDIR /app ENV UV_COMPILE_BYTECODE=1 ENV UV_NO_CACHE=1 ENV DEBIAN_FRONTEND=noninteractive # Install runtime dependencies RUN apt-get update && apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && apt-get install -y \ python3.11 \ python3.11-dev \ libgeos-dev \ libcurl4-openssl-dev \ libssl-dev \ binutils \ curl \ git \ autoconf \ automake \ libtool \ python3-pip \ build-essential \ wget \ gcc \ # Additional dependencies for document handling libmagic-dev \ poppler-utils \ tesseract-ocr \ libreoffice \ libpq-dev \ pandoc && \ rm -rf /var/lib/apt/lists/* && apt-get clean RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ update-alternatives --set python3 /usr/bin/python3.11 COPY requirements.lock pyproject.toml README.md ./ COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/ COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/ RUN curl -LsSf https://astral.sh/uv/install.sh | sh ENV PATH="/root/.local/bin:$PATH" RUN uv pip install --no-cache --system -r requirements.lock RUN playwright install --with-deps RUN python3 - -m nltk.downloader all # FIXME: causes runtime link issues with onnxruntime_pybind_state.cc:507 unstructured # RUN python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ # RUN python3 -c "import nltk; nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')" COPY . . 
RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================ FILE: Makefile ================================================ .DEFAULT_TARGET=help ## help: Display list of commands .PHONY: help help: @echo "Available commands:" @sed -n 's|^##||p' $(MAKEFILE_LIST) | column -t ':' | sed -e 's|^| |' ## dev: Start development environment .PHONY: dev dev: DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up --build ## dev-build: Build development environment without cache .PHONY: dev-build dev-build: DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml build --no-cache DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up ## prod: Build and start production environment .PHONY: prod prod: docker compose -f docker-compose.yml up --build ================================================ FILE: Pipfile ================================================ [[source]] url = "https://pypi.org/simple" verify_ssl = true name = "pypi" [packages] [dev-packages] [requires] python_version = "3.11" ================================================ FILE: README.md ================================================ # MegaParse - Your Parser for every type of documents
Quivr-logo
MegaParse is a powerful and versatile parser that can handle various types of documents with ease. Whether you're dealing with text, PDFs, PowerPoint presentations or Word documents, MegaParse has got you covered. Its focus is on having no information loss during parsing. ## Key Features 🎯 - **Versatile Parser**: MegaParse is a powerful and versatile parser that can handle various types of documents with ease. - **No Information Loss**: Focus on having no information loss during parsing. - **Fast and Efficient**: Designed with speed and efficiency at its core. - **Wide File Compatibility**: Supports Text, PDF, PowerPoint presentations, Excel, CSV, Word documents. - **Open Source**: Freedom is beautiful, and so is MegaParse. Open source and free to use. ## Support - Files: ✅ PDF ✅ PowerPoint ✅ Word - Content: ✅ Tables ✅ TOC ✅ Headers ✅ Footers ✅ Images ### Example https://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a7509bc8d4f3 ## Installation Requires Python >= 3.11. ```bash pip install megaparse ``` ## Usage 1. Add your OpenAI or Anthropic API key to the .env file 2. Install poppler on your computer (images and PDFs) 3. Install tesseract on your computer (images and PDFs) 4. If you are on a Mac, you also need to install libmagic ```brew install libmagic``` Use MegaParse as is: ```python from megaparse import MegaParse from langchain_openai import ChatOpenAI megaparse = MegaParse() response = megaparse.load("./test.pdf") print(response) ``` ### Use MegaParse Vision ```python from megaparse.parser.megaparse_vision import MegaParseVision model = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) # type: ignore parser = MegaParseVision(model=model) response = parser.convert("./test.pdf") print(response) ``` **Note**: The models supported by MegaParse Vision are the multimodal ones, such as Claude 3.5, Claude 4, GPT-4o and GPT-4.
async def process_file(megaparse: MegaParse, file_path: str | Path):
    """Time a single asynchronous parse of ``file_path``.

    Awaits ``megaparse.aload(file_path=...)`` and measures it with
    ``time.perf_counter``.

    Returns:
        The elapsed wall-clock time in seconds, or ``None`` when the parse
        raised; the exception is printed rather than propagated so that one
        failed run does not abort the whole benchmark.
    """
    started_at = time.perf_counter()
    try:
        # The parsed document itself is irrelevant here; only the duration is.
        await megaparse.aload(file_path=file_path)
    except Exception as e:
        print(f"Exception occured: {e}")
        return None
    return time.perf_counter() - started_at
def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are split on whitespace and compared as sets of words:
    ``len(intersection) / len(union)``.

    Two inputs that contain no words at all (empty or whitespace-only) have
    identical word sets and therefore score ``1.0``.

    Returns:
        A float between 0.0 (no shared word) and 1.0 (same word set).
    """
    words1 = set(str1.split())
    words2 = set(str2.split())
    union = words1 | words2
    if not union:
        # Neither side has any word: treat them as identical instead of
        # dividing by zero or reporting them as completely different.
        return 1.0
    return len(words1 & words2) / len(union)


def compare_files(file_name):
    """Return the similarity between the auto and hi_res outputs of a document.

    Reads ``<file_name>.md`` from both ``auto_dir`` and ``hi_res_dir`` and
    compares their contents with ``difflib.SequenceMatcher``.

    Returns:
        ``1`` when both files are empty, otherwise the ``ratio()`` of the
        sequence matcher.
    """
    auto_content = (auto_dir / f"{file_name}.md").read_text()
    hi_res_content = (hi_res_dir / f"{file_name}.md").read_text()

    if not auto_content and not hi_res_content:
        return 1

    # jaccard_similarity(auto_content, hi_res_content) is the alternative,
    # word-set based metric; the character-level ratio is the one in use.
    return difflib.SequenceMatcher(None, auto_content, hi_res_content).ratio()
def main():
    """Compare every hi_res output with its auto counterpart and print a report.

    Lists the files in ``hi_res_dir``, scores each one with ``compare_files``
    and prints the average similarity, the share of documents scoring above
    0.9, and the documents scoring below 0.9.

    When ``hi_res_dir`` contains no files there is nothing to average, so the
    function reports that and returns instead of dividing by zero.
    """
    files = os.listdir(hi_res_dir)
    print(f"Comparing {len(files)} files...")

    similarity_dict = {}
    for file in files:
        file_name = Path(file).stem
        similarity_dict[file_name] = compare_files(file_name)

    if not similarity_dict:
        # Without this guard both divisions below raise ZeroDivisionError.
        print("No files to compare.")
        return

    n_documents = len(similarity_dict)
    avg_similarity = sum(similarity_dict.values()) / n_documents
    print(f"\nAverage similarity: {avg_similarity}\n")

    pass_rate = (
        sum(similarity > 0.9 for similarity in similarity_dict.values())
        / n_documents
    )
    print(f"Pass rate: {pass_rate}\n")

    print("Under 0.9 similarity documents:")
    print("-------------------------------")
    for file_name, similarity in similarity_dict.items():
        if similarity < 0.9:
            print(f"{file_name}: {similarity}")


if __name__ == "__main__":
    main()
if __name__ == "__main__":
    # Benchmark each parser against a hand-written reference markdown and
    # publish the similarity table to stdout and to README.md.
    print("---Launching evaluations script---")
    model = ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY")))  # type: ignore
    parser_dict = {
        "unstructured": UnstructuredParser(strategy=StrategyEnum.AUTO, model=None),
        "unstructured_with_check_table": UnstructuredParser(
            strategy=StrategyEnum.AUTO,
            model=model,
        ),
        "llama_parser": LlamaParser(api_key=str(os.getenv("LLAMA_CLOUD_API_KEY"))),
        "megaparse_vision": MegaParseVision(model=model),
    }

    base_pdf_path = "tests/data/MegaFake_report.pdf"
    base_md_path = "tests/data/grt_example/MegaFake_report.md"
    with open(base_md_path, "r", encoding="utf-8") as f:
        base_md = f.read()

    score_dict = {}
    for method, parser in parser_dict.items():
        print(f"Method: {method}")
        # Run the parser under evaluation. The previous code built a default
        # MegaParse() here and never used `parser`, so every method received
        # the same score.
        # NOTE(review): assumes every parser exposes a synchronous
        # convert(path), as the README example does for MegaParseVision -
        # confirm for the other parsers.
        result = parser.convert(base_pdf_path)
        # str() keeps SequenceMatcher working whether convert() returns plain
        # text or a document object.
        score_dict[method] = difflib.SequenceMatcher(
            None, base_md, str(result)
        ).ratio()
        print(f"Score for method {method}: {score_dict[method]}")

    # Sort the results, best score first
    sorted_score = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)

    # Generate a table with the results
    benchmark_results = "| Parser | similarity_ratio |\n|---|---|\n"
    for parser, score in sorted_score:
        benchmark_results += f"| {parser} | {score:.2f} |\n"

    print(benchmark_results)

    # Update README.md file (read and written as UTF-8, like the reference
    # markdown above, so the result does not depend on the platform locale)
    with open("README.md", "r", encoding="utf-8") as readme_file:
        readme_content = readme_file.read()

    # NOTE(review): both markers are empty strings here, so find() returns 0
    # and the table is inserted at the very top of the README. This looks like
    # the marker text was lost - confirm the intended values.
    start_marker = ""
    end_marker = ""
    start_pos = readme_content.find(start_marker)
    end_index = readme_content.find(end_marker)

    if start_pos == -1 or end_index == -1:
        # str.find() returns -1 when a marker is absent; slicing with that
        # value would silently corrupt the README, so leave the file untouched.
        print("Benchmark markers not found in README.md; file left unchanged.")
    else:
        start_index = start_pos + len(start_marker)
        updated_readme_content = (
            readme_content[:start_index]
            + "\n"
            + benchmark_results
            + readme_content[end_index:]
        )

        with open("README.md", "w", encoding="utf-8") as readme_file:
            readme_file.write(updated_readme_content)
([#216](https://github.com/QuivrHQ/MegaParse/issues/216)) ([deb8765](https://github.com/QuivrHQ/MegaParse/commit/deb8765a4df8917a4857f51a02025243192d5cf8)) ### Bug Fixes * Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38)) * add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228)) * logging error ([#218](https://github.com/QuivrHQ/MegaParse/issues/218)) ([a2170d7](https://github.com/QuivrHQ/MegaParse/commit/a2170d7c711a5d7a0531f03aa9576937ddd6576e)) * megaparse.load & add tests ([#202](https://github.com/QuivrHQ/MegaParse/issues/202)) ([13c2677](https://github.com/QuivrHQ/MegaParse/commit/13c2677bdadb4ba985a1abf9bafeb70548ab59f9)) * Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) ([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3)) * sync convert to parsers ([#186](https://github.com/QuivrHQ/MegaParse/issues/186)) ([fbb7d36](https://github.com/QuivrHQ/MegaParse/commit/fbb7d365fbaf710a687fdc6becacd6d301c09707)) ## [0.0.52](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.51...megaparse-v0.0.52) (2024-12-16) ### Bug Fixes * hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473)) ## [0.0.51](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.50...megaparse-v0.0.51) (2024-12-16) ### Features * updating langchain version ([#187](https://github.com/QuivrHQ/MegaParse/issues/187)) ([0f1f597](https://github.com/QuivrHQ/MegaParse/commit/0f1f5977df147e6b8c65d55445ccd86ef6f1a862)) ## [0.0.50](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.49...megaparse-v0.0.50) 
(2024-12-13) ### Features * small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) ([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609)) ## [0.0.49](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.48...megaparse-v0.0.49) (2024-12-12) ### Features * custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29)) * faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036)) ## [0.0.48](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.47...megaparse-v0.0.48) (2024-12-03) ### Features * Update imports and parsers in README.md ([#156](https://github.com/QuivrHQ/MegaParse/issues/156)) ([33e0303](https://github.com/QuivrHQ/MegaParse/commit/33e0303821691c4b1fc821e6b33b874bd332d430)) ## [0.0.47](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.46...megaparse-v0.0.47) (2024-11-21) ### Features * refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334)) * release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925)) ## [0.0.22](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.21...megaparse-v0.0.22) (2024-07-24) ### Features * Add instructions for installing poppler and tesseract ([#10](https://github.com/QuivrHQ/MegaParse/issues/10)) ([3399552](https://github.com/QuivrHQ/MegaParse/commit/3399552bc8be705f6d34306743388a96d099eebc)) * Add MegaParse class to __init__.py ([84c0d64](https://github.com/QuivrHQ/MegaParse/commit/84c0d648ef1ddf048ec911210d89be155443dc72)) * Add support for Unstructured Parser, improve Table and Image Parsing, and 
add TOC and Hyperlinks for Docx ([#9](https://github.com/QuivrHQ/MegaParse/issues/9)) ([4934776](https://github.com/QuivrHQ/MegaParse/commit/493477672cef9fe22b0ab56ced1d5572104e1914)) * base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0)) * base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648)) * Update benchmark results in README.md ([#15](https://github.com/QuivrHQ/MegaParse/issues/15)) ([1dfcb4c](https://github.com/QuivrHQ/MegaParse/commit/1dfcb4ce19467f7fb8137e10e5f5fbf35e563df0)) ### Bug Fixes * add __init__.py ([a5b8de9](https://github.com/QuivrHQ/MegaParse/commit/a5b8de9e1e01ef681ac2ef59a6e111ae7bd6cf70)) * change name ([6b36437](https://github.com/QuivrHQ/MegaParse/commit/6b36437787f048d36d69c3b06c2d59f7dc7a741f)) * PR Comments ([a0ab0ba](https://github.com/QuivrHQ/MegaParse/commit/a0ab0baa5dd9aae644baef55348f1af28a6776a7)) * remove nest asycio ([22195a2](https://github.com/QuivrHQ/MegaParse/commit/22195a27e9dc3583bf1fbde2a95e9fbecc8d96a4)) * use aload_data ([e5c73fe](https://github.com/QuivrHQ/MegaParse/commit/e5c73fefcbf09bb12810adc6d4412f7742c42089)) ## [0.0.21](https://github.com/QuivrHQ/MegaParse/compare/v0.0.20...v0.0.21) (2024-07-24) ### Features * base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0)) * base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648)) ================================================ FILE: libs/megaparse/README.md ================================================ # MegaParse CORE - Core package of megaparse > **Note:** The test files in `tests/pdf/ocr` and `tests/pdf/native` come 
from SAFEDOCS (CC-MAIN-2021-31-PDF-UNTRUNCATED). You can find more information [here](https://digitalcorpora.org/corpora/file-corpora/cc-main-2021-31-pdf-untruncated/). ================================================ FILE: libs/megaparse/bench.md ================================================ ------------ UNSTRUCTURED(HI-RES): ------------ folder: cdp cdp_etiquette.pdf parsing took: 2.10s folder: scanned-tables POZIBILAN 2022.pdf parsing took: 78.72s Banco Popilar Number 2.pdf parsing took: 94.44s folder: native 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 3.25s 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 39.75s 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 25.02s folder: scanned machine.pdf parsing took: 54.29s medical.pdf parsing took: 76.11s les_americains.pdf parsing took: 643.84s agency.pdf parsing took: 114.19s clark.pdf parsing took: 27.89s tables_ocr.pdf parsing took: 81.21s folder: rich language_learning.pdf parsing took: 2.60s dites nous tout....pdf parsing took: 1.62s ------------ UNSTRUCTURED(FAST): ------------ folder: cdp cdp_etiquette.pdf parsing took: 0.05s folder: scanned-tables POZIBILAN 2022.pdf: can't parse Banco Popilar Number 2.pdf: can't parse folder: native 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.07s 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 0.86s 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 0.24s folder: scanned machine.pdf parsing took: 0.02s medical.pdf parsing took: 0.04s les_americains.pdf parsing took: 5.90s agency.pdf: can't parse clark.pdf: can't parse tables_ocr.pdf: can't parse folder: rich language_learning.pdf: can't parse dites nous tout....pdf parsing took: 0.02s ------------ Megaparse ( strategy = AUTO Config = { provider=COREML, det_arch: str = "fast_base" det_batch_size: int = 2 assume_straight_pages: bool = True preserve_aspect_ratio: bool = True symmetric_pad: bool = True load_in_8_bit: bool = False reco_arch: str = "crnn_vgg16_bn" rec_batch_size: int 
= 512 } ) ------------ folder: cdp cdp_etiquette.pdf parsing took: 1.71s folder: scanned-tables POZIBILAN 2022.pdf parsing took: 17.76s Banco Popilar Number 2.pdf parsing took: 19.25s folder: native 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.96s 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 12.57s 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 1.53s folder: scanned machine.pdf parsing took: 9.90s medical.pdf parsing took: 13.09s les_americains.pdf parsing took: 139.53s agency.pdf parsing took: 10.73s clark.pdf parsing took: 10.69s tables_ocr.pdf parsing took: 15.58s folder: rich language_learning.pdf parsing took: 1.74s dites nous tout....pdf parsing took: 0.64s ---- | Type | PDF Name | Unstructured(HI-RES) | Unstructured(FAST) | Megaparse( w/ doctr COREML) | |------------------|-----------------------------------|---------------------|----------------------|--------------------| | **cdp** | cdp_etiquette.pdf | 2.10s | 0.05s (bad parsing) | 1.71s | | **scanned-tables** | POZIBILAN 2022.pdf | 78.72s | can't parse | 17.76s | | **scanned-tables** | Banco Popilar Number 2.pdf | 94.44s | can't parse | 19.25s | | **native** | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf | 3.25s | 0.07s | 0.96s | | **native** | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf | 39.75s | 0.86s | 12.57s | | **native** | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf | 25.02s | 0.24s | 1.53s | | **scanned** | machine.pdf | 54.29s | 0.02s | 9.90s | | **scanned** | medical.pdf | 76.11s | 0.04s | 13.09s | | **scanned** | les_americains.pdf | 643.84s | 5.90s | 139.53s | | **scanned** | agency.pdf | 114.19s | can't parse | 10.73s | | **scanned** | clark.pdf | 27.89s | can't parse | 10.69s | | **scanned** | tables_ocr.pdf | 81.21s | can't parse | 15.58s | | **rich** | language_learning.pdf | 2.60s | can't parse | 1.74s | | **rich** | dites nous tout....pdf | 1.62s | 0.02s | 0.64s | ================================================ FILE: libs/megaparse/examples/parse_file_fast.py
def list_files_in_directory(directory_path: str) -> dict[str, list[File]]:
    """Walk ``directory_path`` and group the files found by folder base name.

    Args:
        directory_path: Root directory to walk recursively.

    Returns:
        A mapping from each visited folder's base name to the ``File``
        records for the files directly inside it. Folders whose base name
        is empty are skipped, and a later folder with the same base name
        replaces the earlier entry.
    """
    files_by_folder: dict[str, list[File]] = {}
    for current_dir, _subdirs, names in os.walk(directory_path):
        folder = os.path.basename(current_dir)
        if not folder:
            # os.path.basename() is empty when the walked path ends with a
            # separator; such folders are not recorded.
            continue
        files_by_folder[folder] = [
            File(
                file_path=os.path.join(current_dir, name),
                file_name=name,
                file_extension=os.path.splitext(name)[1],
            )
            for name in names
        ]
    return files_by_folder
def main():
    """Time ``partition(strategy="hi_res")`` on every PDF under the benchmark folder.

    The folder is a hard-coded local path. For each sub-folder returned by
    ``list_files_in_directory`` the folder name is printed, then one line per
    ``.pdf`` file with the wall-clock parsing time. Files with any other
    extension are ignored.

    Returns:
        None. Results are only printed.
    """
    folder_path = "/Users/amine/data/quivr/parsing/"

    list_files = list_files_in_directory(folder_path)

    for folder_name, files in list_files.items():
        print(f"folder: {folder_name}")
        for file in files:
            if file.file_extension != ".pdf":
                continue
            s = perf_counter()
            # Only the elapsed time matters here; the parsed elements are discarded.
            _ = partition(
                filename=file.file_path,
                strategy="hi_res",
            )
            e = perf_counter()
            print(f"\t {file.file_name} parsing took: {e-s:.2f}s")
def _check_free_memory() -> bool:
    """Tell whether enough memory is available to accept a request.

    The minimum is read from the ``MEMORY_FREE_MINIMUM_MB`` environment
    variable (in megabytes, default 2048).

    Returns:
        ``True`` when the available memory is strictly above the minimum,
        ``False`` when it is at or below it.
    """
    memory_stats = psutil.virtual_memory()
    minimum_mb = int(os.environ.get("MEMORY_FREE_MINIMUM_MB", 2048))
    minimum_bytes = minimum_mb * 1024 * 1024
    return not memory_stats.available <= minimum_bytes
@app.post(
    "/v1/url",
)
async def upload_url(
    url: str, playwright_loader=Depends(get_playwright_loader)
) -> dict[str, Any]:
    """Parse the document or web page located at ``url``.

    URLs ending in ``.pdf`` are downloaded to a temporary file and parsed
    with ``MegaParse``; any other URL is rendered through the Playwright
    loader and the text of all returned pages is concatenated.

    Args:
        url: Address of the PDF file or web page to parse.
        playwright_loader: Loader injected by FastAPI; its ``urls`` attribute
            is overwritten with ``[url]``.

    Returns:
        A dict with a human-readable ``"message"`` and the parsed ``"result"``.

    Raises:
        HTTPDownloadError: The PDF download did not return status 200, or no
            content could be extracted from the web page.
        HTTPParsingException: MegaParse raised ``ParsingException``.
    """
    playwright_loader.urls = [url]

    if url.endswith(".pdf"):
        # Download the file
        async with httpx.AsyncClient() as client:
            response = await client.get(url)
        if response.status_code != 200:
            raise HTTPDownloadError(url)

        # tempfile does not insert a dot before the suffix, so it has to be
        # part of the suffix itself; otherwise the path has no ".pdf"
        # extension for loaders that infer the file type from the path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(response.content)
            temp_path = temp_file.name
        # The handle is closed here, so the whole payload is flushed to disk
        # before the parser opens the file by name.
        try:
            megaparse = MegaParse()
            result = await megaparse.aload(temp_path)
            return {"message": "File parsed successfully", "result": result}
        except ParsingException as e:
            raise HTTPParsingException(url) from e
        finally:
            # delete=False leaves the file behind; remove it once parsing is
            # over, whatever the outcome.
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                pass
    else:
        data = await playwright_loader.aload()
        # Now turn the data into a string
        extracted_content = "".join(page.page_content for page in data)
        if not extracted_content:
            raise HTTPDownloadError(
                url,
                message="Failed to extract content from the website. Valid URL example : https://www.quivr.com",
            )
        return {
            "message": "Website content parsed successfully",
            "result": extracted_content,
        }
class MarkDownType(str, Enum):
    """Markdown type enumeration.

    The class mixes in ``str``, so every member is also a string and
    compares equal to its value (``MarkDownType.TITLE == "Title"``).
    Lookup by value works as well: ``MarkDownType("Table")`` returns
    ``MarkDownType.TABLE``.
    """

    TITLE = "Title"
    SUBTITLE = "Subtitle"
    HEADER = "Header"
    FOOTER = "Footer"
    NARRATIVE_TEXT = "NarrativeText"
    LIST_ITEM = "ListItem"
    TABLE = "Table"
    PAGE_BREAK = "PageBreak"
    IMAGE = "Image"
    FORMULA = "Formula"
    FIGURE_CAPTION = "FigureCaption"
    ADDRESS = "Address"
    EMAIL_ADDRESS = "EmailAddress"
    CODE_SNIPPET = "CodeSnippet"
    PAGE_NUMBER = "PageNumber"
    # Two distinct catch-all members; NOTE(review): the difference between
    # them is not visible in this module - confirm with the callers.
    DEFAULT = "Default"
    UNDEFINED = "Undefined"
""" model_config = SettingsConfigDict( env_prefix="MEGAPARSE_", env_file=(".env.local", ".env"), env_nested_delimiter="__", extra="ignore", use_enum_values=True, ) doctr_config: DoctrConfig = DoctrConfig() auto_config: AutoStrategyConfig = AutoStrategyConfig() device: DeviceEnum = DeviceEnum.CPU ================================================ FILE: libs/megaparse/src/megaparse/examples/parse_file.py ================================================ from pathlib import Path from megaparse.megaparse import MegaParse from pydantic import BaseModel, Field class MyCustomFormat(BaseModel): title: str = Field(description="The title of the document.") problem: str = Field(description="The problem statement.") solution: str = Field(description="The solution statement.") def main(): # model = ChatOpenAI(name="gpt-4o") # formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) megaparse = MegaParse() file_path = Path("./tests/pdf/ocr/0168127.pdf") result = megaparse.load(file_path=file_path) print(result) if __name__ == "__main__": main() ================================================ FILE: libs/megaparse/src/megaparse/examples/parsing_process.py ================================================ from pathlib import Path from typing import IO, Any, List, Tuple import numpy as np import onnxruntime as rt import pypdfium2 as pdfium from megaparse.configs.auto import ( AutoStrategyConfig, DeviceEnum, TextDetConfig, TextRecoConfig, ) from megaparse.models.page import Page, PageDimension from megaparse.parser.doctr_parser import DoctrParser from megaparse.parser.unstructured_parser import UnstructuredParser from megaparse_sdk.schema.document import BBOX, BlockLayout, BlockType, TextDetection from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum from onnxtr.models import detection_predictor, recognition_predictor from onnxtr.models.builder import DocumentBuilder from onnxtr.models.engine import 
def get_strategy_page(
    pdfium_page: PdfPage, onnxtr_page: TextDetection, page_threshold: float = 0.6
) -> StrategyEnum:
    """Pick the parsing strategy for one page from the overlap of two text masks.

    Two boolean masks of the page (``height x width`` in integer PDF units)
    are built: one from the bounding boxes of the text objects embedded in
    the PDF, one from the text boxes found by the detection model. Their
    intersection-over-union (IoU) is compared with ``page_threshold``.

    Args:
        pdfium_page: The pdfium page whose embedded objects are inspected.
        onnxtr_page: Detection result for the same page; each entry of
            ``bboxes`` exposes ``bbox`` as two ``(x, y)`` corners that are
            scaled by the page width and height here.
        page_threshold: Minimum IoU for which the embedded text is trusted.

    Returns:
        ``StrategyEnum.HI_RES`` when the IoU is below ``page_threshold``,
        otherwise ``StrategyEnum.FAST``. A page where neither source finds
        any text yields ``StrategyEnum.FAST``.
    """
    # NOTE: the page dimensions are not checked against onnxtr_page.dimensions.

    # Object type 1 is a text object (FPDF_PAGEOBJ_TEXT in the PDFium API).
    text_coords = [
        obj.get_pos() for obj in pdfium_page.get_objects() if obj.type == 1
    ]

    p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height())

    pdfium_canva = np.zeros((p_height, p_width), dtype=bool)
    for coords in text_coords:
        # coords is (left, bottom, right, top) with the origin at the bottom
        # of the page; rows of the mask grow downwards, so the vertical axis
        # is flipped:
        #
        # 0---l--------------R-> column
        # |
        # B (row0, col0)
        # |
        # T (row1, col1)
        # ^
        # row
        row0 = max(0, min(p_height, int(p_height - coords[3])))
        col0 = max(0, min(p_width, int(coords[0])))
        row1 = max(0, min(p_height, int(p_height - coords[1])))
        col1 = max(0, min(p_width, int(coords[2])))
        pdfium_canva[row0:row1, col0:col1] = True

    onnxtr_canva = np.zeros((p_height, p_width), dtype=bool)
    for block in onnxtr_page.bboxes:
        x0, y0 = block.bbox[0]
        x1, y1 = block.bbox[1]
        x0 = max(0, min(int(x0 * p_width), p_width))
        y0 = max(0, min(int(y0 * p_height), p_height))
        x1 = max(0, min(int(x1 * p_width), p_width))
        y1 = max(0, min(int(y1 * p_height), p_height))
        onnxtr_canva[y0:y1, x0:x1] = True

    union_area = int(np.sum(np.logical_or(pdfium_canva, onnxtr_canva)))
    if union_area == 0:
        # No text from either source: the IoU is undefined (0 / 0). Answer
        # explicitly instead of relying on a NaN comparison and a NumPy
        # RuntimeWarning.
        return StrategyEnum.FAST

    intersection_area = int(np.sum(np.logical_and(pdfium_canva, onnxtr_canva)))
    iou = intersection_area / union_area
    if iou < page_threshold:
        return StrategyEnum.HI_RES
    return StrategyEnum.FAST
and file is None: if isinstance(file_path, str): file_path = Path(file_path) file_extension = file_path.suffix elif file and file_path is None: if not file_extension: raise ValueError( "file_extension should be provided when given file argument" ) file.seek(0) else: raise ValueError("Either provider a file_path or file") if isinstance(file_extension, str): try: file_extension = FileExtension(file_extension) except ValueError: raise ValueError(f"Unsupported file extension: {file_extension}") return file_extension def _generate_crops( pages: list[np.ndarray], loc_preds: list[np.ndarray], channels_last: bool, assume_straight_pages: bool = False, assume_horizontal: bool = False, ) -> list[list[np.ndarray]]: if assume_straight_pages: crops = [ extract_crops(page, _boxes[:, :4], channels_last=channels_last) for page, _boxes in zip(pages, loc_preds, strict=False) ] else: crops = [ extract_rcrops( page, _boxes[:, :4], channels_last=channels_last, assume_horizontal=assume_horizontal, ) for page, _boxes in zip(pages, loc_preds, strict=False) ] return crops def _prepare_crops( pages: list[np.ndarray], loc_preds: list[np.ndarray], channels_last: bool, assume_straight_pages: bool = False, assume_horizontal: bool = False, ) -> tuple[list[list[np.ndarray]], list[np.ndarray]]: crops = _generate_crops( pages, loc_preds, channels_last, assume_straight_pages, assume_horizontal ) # Avoid sending zero-sized crops is_kept = [ [all(s > 0 for s in crop.shape) for crop in page_crops] for page_crops in crops ] crops = [ [crop for crop, _kept in zip(page_crops, page_kept, strict=False) if _kept] for page_crops, page_kept in zip(crops, is_kept, strict=False) ] loc_preds = [ _boxes[_kept] for _boxes, _kept in zip(loc_preds, is_kept, strict=False) ] return crops, loc_preds def _process_predictions( loc_preds: list[np.ndarray], word_preds: list[tuple[str, float]], crop_orientations: list[dict[str, Any]], ) -> tuple[list[np.ndarray], list[list[tuple[str, float]]], list[list[dict[str, Any]]]]: 
text_preds = [] crop_orientation_preds = [] if len(loc_preds) > 0: # Text & crop orientation predictions at page level _idx = 0 for page_boxes in loc_preds: text_preds.append(word_preds[_idx : _idx + page_boxes.shape[0]]) crop_orientation_preds.append( crop_orientations[_idx : _idx + page_boxes.shape[0]] ) _idx += page_boxes.shape[0] return loc_preds, text_preds, crop_orientation_preds def main(): file_path = Path("./tests/pdf/sample_pdf.pdf") strategy = StrategyEnum.AUTO device = DeviceEnum.COREML ocr_parser = DoctrParser() default_parser = UnstructuredParser(strategy=StrategyEnum.FAST) file_extension = validate_input(file_path=file_path) with open(file_path, "rb") as file: pdfium_document = pdfium.PdfDocument(file) rasterized_pages: list[np.ndarray] = [ np.array(page.render().to_pil(scale=2)) for page in pdfium_document ] ##----------------------------------- ## GET PAGES ##----------------------------------- mp_pages = [] if strategy == StrategyEnum.FAST: parsed_document = default_parser.convert( file=file, file_extension=file_extension, ) else: text_det_config = TextDetConfig() general_options = rt.SessionOptions() providers = get_providers(device=device) engine_config = EngineConfig( session_options=general_options, providers=providers, ) det_predictor = detection_predictor( arch=text_det_config.det_arch, assume_straight_pages=text_det_config.assume_straight_pages, preserve_aspect_ratio=text_det_config.preserve_aspect_ratio, symmetric_pad=text_det_config.symmetric_pad, batch_size=text_det_config.batch_size, load_in_8_bit=text_det_config.load_in_8_bit, engine_cfg=engine_config, ) if any(page.ndim != 3 for page in rasterized_pages): raise ValueError( "incorrect input shape: all pages are expected to be multi-channel 2D images." 
) orientations = None general_pages_orientations = None # Localize text elements loc_preds, out_maps = det_predictor(rasterized_pages, return_maps=True) # FIXME: For simplicity we do not care about page orientation rn # FIXME: similaly we don't care about straighten page # Detach objectness scores from loc_preds loc_preds, objectness_scores = detach_scores(loc_preds) # type: ignore[arg-type] # FIXME: Do not care about hooks here # # Apply hooks to loc_preds if any # for hook in hooks: # loc_preds = hook(loc_preds) all_pages_layouts = [] for page_index, (page, loc_pred, objectness_score) in enumerate( zip(rasterized_pages, loc_preds, objectness_scores, strict=True) ): block_layouts = [] for bbox, score in zip(loc_pred, objectness_score, strict=True): block_layouts.append( BlockLayout( bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()), objectness_score=score, block_type=BlockType.TEXT, ) ) all_pages_layouts.append( TextDetection( bboxes=block_layouts, page_index=page_index, dimensions=page.shape[:2], orientation=general_pages_orientations[page_index] if general_pages_orientations is not None else 0, ) ) for pdfium_page, onnxtr_page, rasterized_page in zip( pdfium_document, all_pages_layouts, rasterized_pages, strict=True ): strategy = get_strategy_page(pdfium_page, onnxtr_page) mp_pages.append( Page( strategy=strategy, text_detections=onnxtr_page, rasterized=rasterized_page, page_size=PageDimension( width=pdfium_page.get_width(), height=pdfium_page.get_height(), ), page_index=onnxtr_page.page_index, pdfium_elements=pdfium_page, ) ) ##----------------------------------- ## GET PARSER BASE ON CHOSE STRATEGY ##----------------------------------- if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST: parser = default_parser elif strategy == StrategyEnum.HI_RES: parser = ocr_parser else: if need_hi_res(mp_pages, AutoStrategyConfig()): parser = ocr_parser else: parser = default_parser ##----------------------------------- ## PARSE FILE 
class ParsingException(Exception):
    """Error raised when the parsing process fails.

    The text given at construction time is stored on ``message`` and is also
    the single argument handed to ``Exception``.
    """

    def __init__(self, message="An error occurred during parsing"):
        super().__init__(message)
        self.message = message
class BaseFormatter(ABC):
    """
    Common base of every formatter applied to a parsed ``Document``.

    Attributes
    ----------
    model : BaseChatModel | None
        Optional chat model made available to subclasses; the base class only
        stores it.

    Methods
    -------
    format(document, file_path=None) -> Document | str
        Synchronous formatting hook; raises ``NotImplementedError`` here.
    aformat(document, file_path=None) -> Document | str
        Asynchronous formatting hook; raises ``NotImplementedError`` here.
    """

    def __init__(self, model: BaseChatModel | None = None):
        self.model = model

    def format(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> Document | str:
        """Format ``document`` synchronously; subclasses must override."""
        raise NotImplementedError("Subclasses should implement this method")

    async def aformat(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> Document | str:
        """Format ``document`` asynchronously; subclasses must override."""
        raise NotImplementedError("Subclasses should implement this method")
class CustomStructuredFormatter(StructuredFormatter):
    """
    Formatter that asks the chat model to re-emit a document as an instance
    of ``output_model`` and returns that instance serialized as JSON.
    """

    def _build_prompt(self, document: Document) -> str:
        """
        Validate the formatter state and build the prompt for ``document``.

        Shared by ``format`` and ``aformat`` so both apply the same checks in
        the same order.

        Raises:
            ValueError: If no model is set, if ``str(document)`` is empty, or
                if no output model is set.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the CustomStructuredFormatter.")
        print("Formatting text using CustomStructuredFormatter...")
        text = str(document)
        # The previous guard was ``len(text) < 0``, which can never be true,
        # so empty documents were silently sent to the model.
        if not text:
            raise ValueError(
                "A non empty text is needed to format text using CustomStructuredFormatter."
            )
        if not self.output_model:
            raise ValueError(
                "An output model is needed to structure text using CustomStructuredFormatter."
            )
        return f"Parse the text in a structured format: {text}"

    def format(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:
        """
        Structure the document using an AI language model.

        Args:
            document: The parsed document; its ``str()`` form is sent to the model.
            file_path: Unused here; kept for interface compatibility.

        Returns:
            The structured output serialized with ``model_dump_json()``.

        Raises:
            ValueError: See ``_build_prompt``.
        """
        prompt = self._build_prompt(document)
        structured_model = self.model.with_structured_output(self.output_model)  # type: ignore
        formatted_text = structured_model.invoke(prompt)
        assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel."
        return formatted_text.model_dump_json()

    async def aformat(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:
        """
        Asynchronously structure the document using an AI language model.

        Args:
            document: The parsed document; its ``str()`` form is sent to the model.
            file_path: Unused here; kept for interface compatibility.

        Returns:
            The structured output serialized with ``model_dump_json()``.

        Raises:
            ValueError: See ``_build_prompt``.
        """
        prompt = self._build_prompt(document)
        structured_model = self.model.with_structured_output(self.output_model)  # type: ignore
        formatted_text = await structured_model.ainvoke(prompt)
        assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel."
        return formatted_text.model_dump_json()
Markdown format using llms. """ TABLE_MARKER_START = "[TABLE]" TABLE_MARKER_END = "[/TABLE]" CODE_BLOCK_PATTERN = r"^```.*$\n?" def __init__(self, model: Optional[BaseChatModel] = None): super().__init__(model) async def aformat( self, document: Document, file_path: Path | str | None = None ) -> Document: warnings.warn( "The SimpleMDTableFormatter is a sync formatter, please use the sync format method", UserWarning, stacklevel=2, ) return self.format(document=document, file_path=file_path) def format( self, document: Document, file_path: Path | str | None = None ) -> Document: """ Formats table elements within a list of elements. Args: elements: A list of Element objects. Returns: A list of Element objects with formatted tables. """ if not self.model: raise ValueError("A Model is needed to use the SimpleMDTableFormatter.") print("Formatting tables using SimpleMDTableFormatter...") table_stack = [] formatted_elements = [] for block in document.content: if isinstance(block, TableBlock): previous_table = table_stack[-1] if table_stack else "" formatted_table = self.format_table(block, previous_table) table_stack.append(formatted_table.text) formatted_elements.append(formatted_table) else: formatted_elements.append(block) document.content = formatted_elements return document def format_table( self, table_element: TableBlock, previous_table: str ) -> TableBlock: """ Formats a single table element into Markdown using an AI language model. Args: table_element: The table element to format. previous_table: The previously formatted table text. Returns: The formatted table element. """ assert self.model is not None, "Model is not set." prompt = ChatPromptTemplate.from_messages( [ ( "human", ( "You are an expert in markdown tables. Transform the following parsed table into a " "markdown table. 
class VisionMDTableFormatter(TableFormatter):
    """
    A formatter that converts table elements into Markdown format using a
    vision-capable chat model.

    Every ``TableBlock`` of the document is cropped out of the rendered PDF
    page, sent to the model as a base64 image together with
    ``TABLE_OCR_PROMPT``, and its ``text`` is replaced by the model answer
    wrapped between ``TABLE_MARKER_START`` and ``TABLE_MARKER_END``.
    """

    TABLE_MARKER_START = "[TABLE]"
    TABLE_MARKER_END = "[/TABLE]"
    CODE_BLOCK_PATTERN = r"^```.*$\n?"
    # Encoding used for table crops; the data-URL MIME type is derived from it
    # so the declared type always matches the actual bytes.
    IMAGE_FORMAT = "PNG"

    def __init__(self, model: Optional[BaseChatModel] = None):
        super().__init__(model)

    def _crop_table_image(self, table_element: TableBlock, file_path: str) -> str:
        """
        Helper method to crop the table portion of the PDF page and convert it
        to a base64 string.

        Only the page holding the table is rasterized instead of the whole
        document.
        """
        assert table_element.bbox, "Table element must have coordinates."
        bbox = table_element.bbox
        page_number = table_element.page_range[0]
        # Page numbers are 1-based here (the original code indexed
        # ``pages[page_number - 1]``), so 0 is rejected as well.
        assert page_number, "Table element must have a page number."

        pages = convert_from_path(
            file_path, first_page=page_number, last_page=page_number
        )

        # NOTE(review): the bbox values are passed to ``crop`` as-is, which
        # expects pixel coordinates of the rendered page — confirm that
        # TableBlock bboxes are not normalized to [0, 1].
        box = (
            bbox.top_left.x,
            bbox.top_left.y,
            bbox.bottom_right.x,
            bbox.bottom_right.y,
        )
        table_image = pages[0].crop(box)
        return self.process_file([table_image], image_format=self.IMAGE_FORMAT)[0]

    def _check_ready(self, file_path: Path | str | None, mode: str) -> str:
        """Validate model and file path, log the run, return the path as ``str``."""
        if not self.model:
            raise ValueError("A Model is needed to use the VisionMDTableFormatter.")
        print(f"Formatting tables using VisionMDTableFormatter ({mode})...")
        assert (
            file_path
        ), "A file path is needed to format tables using VisionMDTableFormatter."
        return file_path if isinstance(file_path, str) else str(file_path)

    def _wrap_table(self, table_element: TableBlock, formatted_table: str) -> TableBlock:
        """Store ``formatted_table`` between the table markers in the element's text."""
        table_element.text = (
            f"{self.TABLE_MARKER_START}\n"
            f"{formatted_table}\n"
            f"{self.TABLE_MARKER_END}\n\n"
        )
        return table_element

    def _build_message(self, table_image: str) -> HumanMessage:
        """Build the prompt-plus-image message for a base64 encoded table crop."""
        mime_type = f"image/{self.IMAGE_FORMAT.lower()}"
        image_prompt = {
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{table_image}"},
        }
        return HumanMessage(
            content=[
                {"type": "text", "text": TABLE_OCR_PROMPT},
                image_prompt,
            ],
        )

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """
        Asynchronously formats the table blocks of ``document``.

        Raises:
            ValueError: If no model is set.
            AssertionError: If ``file_path`` is missing.
        """
        file_path = self._check_ready(file_path, "async")
        formatted_elements = []
        for block in document.content:
            if isinstance(block, TableBlock):
                block = await self.aformat_table(block, file_path)
            formatted_elements.append(block)
        document.content = formatted_elements
        return document

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document:
        """
        Synchronously formats the table blocks of ``document``.

        Raises:
            ValueError: If no model is set.
            AssertionError: If ``file_path`` is missing.
        """
        file_path = self._check_ready(file_path, "sync")
        formatted_elements = []
        for block in document.content:
            if isinstance(block, TableBlock):
                block = self.format_table(block, file_path)
            formatted_elements.append(block)
        document.content = formatted_elements
        return document

    async def aformat_table(
        self, table_element: TableBlock, file_path: str
    ) -> TableBlock:
        """
        Asynchronously formats a table element into Markdown format using a
        Vision Model. The element is modified in place and returned.
        """
        table_image64 = self._crop_table_image(table_element, file_path)
        formatted_table = await self.avision_extract(table_image64)
        return self._wrap_table(table_element, formatted_table)

    def format_table(self, table_element: TableBlock, file_path: str) -> TableBlock:
        """
        Synchronously formats a table element into Markdown format using a
        Vision Model. The element is modified in place and returned.
        """
        table_image64 = self._crop_table_image(table_element, file_path)
        formatted_table = self.vision_extract(table_image64)
        return self._wrap_table(table_element, formatted_table)

    def process_file(self, images: List[Image.Image], image_format="PNG") -> List[str]:
        """
        Convert a list of PIL images to base64 encoded images.

        Raises:
            ValueError: If any image cannot be saved or encoded; the original
                exception is chained as the cause.
        """
        try:
            images_base64 = []
            for image in images:
                buffered = BytesIO()
                image.save(buffered, format=image_format)
                images_base64.append(
                    base64.b64encode(buffered.getvalue()).decode("utf-8")
                )
            return images_base64
        except Exception as e:
            raise ValueError(f"Error processing PDF file: {str(e)}") from e

    async def avision_extract(self, table_image: str) -> str:
        """
        Asynchronously send image data to the language model for processing.
        """
        assert (
            self.model
        ), "A model is needed to use the VisionMDTableFormatter (async)."
        response = await self.model.ainvoke([self._build_message(table_image)])
        return str(response.content)

    def vision_extract(self, table_image: str) -> str:
        """
        Synchronously send image data to the language model for processing.
        """
        assert self.model, "A model is needed to use the VisionMDTableFormatter (sync)."
        response = self.model.invoke([self._build_message(table_image)])
        return str(response.content)
class LayoutDetector:
    """
    Run the ONNX layout model registered in ``default_cfg`` on page images
    and return, per page, the detected regions as ``LayoutDetectionOutput``
    objects with bounding boxes normalized to the page size.
    """

    def __init__(
        self,
        device: DeviceEnum = DeviceEnum.CPU,
        threshold: float = 0.1,
        preserve_aspect_ratio: bool = True,
        model_name: str = "yolov10s-doclaynet",
        load_in_8_bit: bool = False,
    ):
        """
        Args:
            device: Device used to pick the onnxruntime execution providers.
            threshold: Detections scoring below this value are discarded.
            preserve_aspect_ratio: Forwarded to the pre-processor and used
                when mapping boxes back to the original page.
            model_name: Key of the model entry in ``default_cfg``.
            load_in_8_bit: Use the ``url_8_bit`` model entry instead of ``url``.

        Raises:
            KeyError: If ``model_name`` is not in ``default_cfg``.
            AssertionError: If the selected model entry has no path.
        """
        model_config = default_cfg[model_name]
        self.device = device
        self.threshold = threshold
        self.batch_size, self.required_width, self.required_height = model_config[
            "input_shape"
        ]
        self.preserve_aspect_ratio = preserve_aspect_ratio
        self.pre_processor = PreProcessor(
            output_size=(self.required_width, self.required_height),
            batch_size=self.batch_size,
            preserve_aspect_ratio=self.preserve_aspect_ratio,
        )

        model_path = (
            model_config.get("url_8_bit") if load_in_8_bit else model_config.get("url")
        )
        assert model_path, f"Model path not found for {model_name}"

        # InferenceSession has no ``engine_config`` parameter: the keyword used
        # previously fell into ``**kwargs`` and was ignored, so neither the
        # session options nor the providers chosen for ``device`` were applied.
        self.model = rt.InferenceSession(
            model_path,
            sess_options=rt.SessionOptions(),
            providers=get_providers(self.device),
        )

    def __call__(
        self, img_pages: list[PILImage], output_dir: str | None = None
    ) -> List[List[LayoutDetectionOutput]]:
        """
        Detect layout regions on every page image.

        Args:
            img_pages: Page images; each must convert to a 3-dimensional array.
            output_dir: When set, annotated pages are written there as PNG.

        Returns:
            One list of detections per input page, in input order.

        Raises:
            ValueError: If a page is not a multi-channel 2D image.
        """
        if not img_pages:
            # Nothing to run; the numpy reshaping below cannot handle an
            # empty batch.
            return []

        pages = [np.array(img) for img in img_pages]

        # Dimension check
        if any(page.ndim != 3 for page in pages):
            raise ValueError(
                "incorrect input shape: all pages are expected to be multi-channel 2D images."
            )

        processed_batches = np.array(self.pre_processor(pages))
        # Drop the per-batch axis (relies on the configured batch size of 1),
        # then reorder axes 0,3,1,2 — presumably channels-last to
        # channels-first for the model input; TODO confirm.
        processed_batches = processed_batches.squeeze(1)
        processed_batches = processed_batches.transpose(0, 3, 1, 2)

        # One inference call per page; each call gets a leading batch axis.
        pred_batches = np.array(
            [
                self.model.run(None, {"images": np.expand_dims(batch, axis=0)})
                for batch in processed_batches
            ]
        )
        # Collapse the extra axes added by the per-page calls so that each
        # entry is the prediction matrix of a single page.
        pred_batches = np.concatenate(pred_batches, axis=0)
        pred_batches = pred_batches.squeeze(1)

        processed_preds = []
        for page, pred in zip(pages, pred_batches, strict=True):
            img_h, img_w = page.shape[:2]
            processed_preds.append(self.extract_bboxes_from_page(pred, img_h, img_w))

        if output_dir:
            self._save_layout(pages=pages, preds=processed_preds, output_dir=output_dir)

        return processed_preds

    def extract_bboxes_from_page(
        self, preds: np.ndarray, img_h: int, img_w: int
    ) -> List[LayoutDetectionOutput]:
        """
        Turn the raw predictions of one page into normalized detections.

        Args:
            preds: Array of shape ``(300, 6)`` whose rows are unpacked as
                ``x1, y1, x2, y2, score, class index``.
            img_h: Height of the original page in pixels.
            img_w: Width of the original page in pixels.

        Returns:
            Detections scoring at least ``self.threshold``, limited by ``topK``.
        """
        results = []
        assert preds.shape == (300, 6)
        scale_h = img_h / self.required_height
        scale_w = img_w / self.required_width

        for det in preds:
            x1, y1, x2, y2, score, cls_idx = det
            if score < self.threshold:
                continue

            # Rescale the bounding box coordinates to the original dimensions
            x1 *= scale_w
            x2 *= scale_w
            y1 *= scale_h
            y2 *= scale_h

            if self.preserve_aspect_ratio:
                # Compensate for the aspect-ratio preserving resize: only the
                # axis that was not the limiting one needs a correction.
                ratio = img_h / img_w
                x_factor = ratio if ratio > 1 else 1
                y_factor = ratio if ratio < 1 else 1
                x1 = x1 * x_factor
                x2 = x2 * x_factor
                y1 = y1 / y_factor
                y2 = y2 / y_factor

            # Clamp to the page before normalizing to [0, 1].
            x1 = max(0, min(x1, img_w))
            x2 = max(0, min(x2, img_w))
            y1 = max(0, min(y1, img_h))
            y2 = max(0, min(y2, img_h))

            results.append(
                LayoutDetectionOutput(
                    bbox_id=uuid.uuid4(),
                    bbox=BBOX(
                        top_left=Point2D(x=x1 / img_w, y=y1 / img_h),
                        bottom_right=Point2D(x=x2 / img_w, y=y2 / img_h),
                    ),
                    prob=score,
                    label=int(cls_idx),
                )
            )

        # ``nms`` is available as an alternative filter; ``topK`` is the one in use.
        return self.topK(results)

    def nms(
        self,
        raw_bboxes: List[LayoutDetectionOutput],
        iou_threshold: float = 0.9,  # FIXME: thresh Configurable in constructor
    ) -> List[LayoutDetectionOutput]:
        """
        Non-Maximum Suppression (NMS) algorithm.

        Args:
            raw_bboxes (list): Detections to filter. The list is sorted and
                reordered in place.
            iou_threshold (float): A box is dropped when its IoU with an
                already kept, higher scoring box exceeds this value.

        Returns:
            list: The kept detections, ordered by decreasing ``prob``.
        """
        raw_bboxes.sort(key=lambda x: x.prob, reverse=True)

        # Kept boxes are compacted at the front: raw_bboxes[:current_index].
        current_index = 0
        for index in range(len(raw_bboxes)):
            drop = False
            for prev_index in range(current_index):
                iou = raw_bboxes[index].bbox.iou(raw_bboxes[prev_index].bbox)
                if iou > iou_threshold:
                    drop = True
                    break
            if not drop:
                raw_bboxes[current_index], raw_bboxes[index] = (
                    raw_bboxes[index],
                    raw_bboxes[current_index],
                )
                current_index += 1

        return raw_bboxes[:current_index]

    def topK(
        self, detectResult: List[LayoutDetectionOutput], topK: int = 50
    ) -> List[LayoutDetectionOutput]:
        """
        Keep at most ``topK`` detections with the highest ``prob``.

        When there are no more than ``topK`` detections the input list is
        returned unchanged (and unsorted); otherwise a new list sorted by
        decreasing ``prob`` is returned.
        """
        if len(detectResult) <= topK:
            return detectResult
        return sorted(detectResult, key=lambda x: x.prob, reverse=True)[:topK]

    def _save_layout(
        self,
        pages: list[np.ndarray],
        preds: list[list[LayoutDetectionOutput]],
        output_dir: str,
    ):
        """
        Draw the detections of every page and save them as
        ``page_<index>.png`` inside ``output_dir`` (created if missing).
        """
        os.makedirs(output_dir, exist_ok=True)
        for i, (page, layout) in enumerate(zip(pages, preds, strict=True)):
            image = Image.fromarray(page)
            draw = ImageDraw.Draw(image)
            img_w, img_h = image.size

            for detection in layout:
                # Boxes are stored normalized; scale them back to pixels.
                x_min, y_min, x_max, y_max = detection.bbox.to_numpy()
                bbox = x_min * img_w, y_min * img_h, x_max * img_w, y_max * img_h
                label = LABEL_MAP.get(detection.label, "Unknown")

                draw.rectangle(bbox, outline="red", width=2)
                draw.text(
                    (bbox[0], bbox[1]),
                    f"{label} ({detection.prob:.2f})",
                    fill="red",
                )

            image.save(os.path.join(output_dir, f"page_{i}.png"))
class MegaParse:
    """
    Entry point that turns a file into a parsed document.

    Non-PDF files, and PDFs parsed with the FAST strategy, go straight to the
    unstructured parser. Other PDFs are rasterized, a strategy is determined
    from the text detections, and the document is parsed either with the
    doctr parser (HI_RES) or with the unstructured parser. Configured
    formatters are then applied in order.
    """

    def __init__(
        self,
        formatters: List[BaseFormatter] | None = None,
        config: MegaParseConfig | None = None,
        unstructured_strategy: StrategyEnum = StrategyEnum.AUTO,
    ) -> None:
        """
        Args:
            formatters: Formatters applied, in order, to the parsed document.
            config: Parsing configuration; a fresh ``MegaParseConfig()`` is
                created when omitted so instances never share one object.
            unstructured_strategy: Strategy given to the unstructured parser.
        """
        self.config = config if config is not None else MegaParseConfig()
        self.formatters = formatters
        self.doctr_parser = DoctrParser(
            text_det_config=self.config.doctr_config.text_det_config,
            text_reco_config=self.config.doctr_config.text_reco_config,
            device=self.config.device,
            straighten_pages=self.config.doctr_config.straighten_pages,
            detect_orientation=self.config.doctr_config.detect_orientation,
            detect_language=self.config.doctr_config.detect_language,
        )
        self.layout_model = LayoutDetector()
        # Built once; a first, immediately overwritten instance was removed.
        self.unstructured_parser = UnstructuredParser(unstructured_strategy)

    def validate_input(
        self,
        file_path: Path | str | None = None,
        file: IO[bytes] | None = None,
        file_extension: str | FileExtension | None = None,
    ) -> FileExtension:
        """
        Check that exactly one of ``file_path`` / ``file`` is given and
        resolve the file extension.

        With ``file_path`` the extension is taken from the path suffix. With
        ``file`` an explicit ``file_extension`` is required and the stream is
        rewound to its start.

        Raises:
            ValueError: On missing or conflicting inputs, or when the
                extension is not a supported ``FileExtension``.
        """
        if not (file_path or file):
            raise ValueError("Either file_path or file should be provided")

        if file_path and file:
            raise ValueError("Only one of file_path or file should be provided")

        if file_path and file is None:
            if isinstance(file_path, str):
                file_path = Path(file_path)
            file_extension = file_path.suffix
        elif file and file_path is None:
            if not file_extension:
                raise ValueError(
                    "file_extension should be provided when given file argument"
                )
            file.seek(0)
        else:
            raise ValueError("Either provider a file_path or file")

        if isinstance(file_extension, str):
            try:
                file_extension = FileExtension(file_extension)
            except ValueError as err:
                raise ValueError(
                    f"Unsupported file extension: {file_extension}"
                ) from err
        return file_extension

    def extract_page_strategies(
        self, pdfium_document: pdfium.PdfDocument, rast_scale: int = 2
    ) -> List[Page]:
        """
        Rasterize every page of the PDF, run text detection on the pages and
        assign each page its parsing strategy.

        Args:
            pdfium_document: The opened PDF document.
            rast_scale: Scale factor used when rendering the pages.

        Returns:
            One ``Page`` per PDF page, in document order.
        """
        pages = []
        for i, pdfium_page in enumerate(pdfium_document):
            rasterized_page = pdfium_page.render(scale=rast_scale)
            assert (
                abs(pdfium_page.get_width() * rast_scale - rasterized_page.width) <= 1
            ), (
                f"Widths do not match within a margin of 1: "
                f"{pdfium_page.get_width() * rast_scale} != {rasterized_page.width}"
            )
            # Each page is appended exactly once; a duplicated ``append``
            # previously doubled every page of the document.
            pages.append(
                Page(
                    strategy=StrategyEnum.AUTO,
                    text_detections=None,
                    rasterized=rasterized_page.to_pil(),
                    page_size=PageDimension(
                        width=pdfium_page.get_width() * rast_scale,
                        height=pdfium_page.get_height() * rast_scale,
                    ),
                    page_index=i,
                    pdfium_elements=pdfium_page,
                )
            )

        # Get text detection for each page
        pages = self.doctr_parser.get_text_detections(pages)

        # Get strategy per page
        for page in pages:
            page.strategy = get_page_strategy(
                page.pdfium_elements,
                page.text_detections,
                threshold=self.config.auto_config.page_threshold,
            )
        return pages

    def load(
        self,
        file_path: Path | str | None = None,
        file: BinaryIO | None = None,
        file_extension: str | FileExtension = "",
        strategy: StrategyEnum = StrategyEnum.AUTO,
    ) -> str:
        """
        Parse a file synchronously and return the result as a string.

        Args:
            file_path: Path of the file to parse (exclusive with ``file``).
            file: Binary stream to parse (requires ``file_extension``).
            file_extension: Extension of ``file``.
            strategy: FAST forces the unstructured parser; for other values
                the strategy of a PDF is determined from its pages.

        Raises:
            ValueError: If the inputs are invalid (see ``validate_input``).
            ParsingException: If parsing a PDF fails; the original exception
                is chained as the cause.
        """
        file_extension = self.validate_input(
            file=file, file_path=file_path, file_extension=file_extension
        )
        if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:
            self.unstructured_parser.strategy = strategy
            return str(
                self.unstructured_parser.convert(
                    file_path=file_path, file=file, file_extension=file_extension
                )
            )

        opened_file = None
        try:
            if file_path:
                opened_file = open(file_path, "rb")
                file = opened_file

            assert file is not None, "No File provided"
            pdfium_document = pdfium.PdfDocument(file)

            # Rasterize pages and extract text detections
            pages = self.extract_page_strategies(pdfium_document)
            strategy = determine_global_strategy(
                pages, self.config.auto_config.document_threshold
            )

            # Extract layout model
            assert all(p.rasterized for p in pages)
            layout_result = self.layout_model([p.rasterized for p in pages])  # type: ignore

            if strategy == StrategyEnum.HI_RES:
                logger.debug("Using doctr for text recognition")
                parsed_document = self.doctr_parser.get_text_recognition(
                    pages, layout_result
                )
            else:
                logger.debug("Using Unstructured Parser")
                self.unstructured_parser.strategy = StrategyEnum.FAST
                parsed_document = self.unstructured_parser.convert(
                    file=file, file_extension=file_extension
                )

            # additional attributes
            parsed_document.file_name = str(file_path) if file_path else None
            parsed_document.metadata = pdfium_document.get_metadata_dict()

            # Format -> TODO: should be generic
            if self.formatters:
                for formatter in self.formatters:
                    if isinstance(parsed_document, str):
                        warnings.warn(
                            f"The last step returned a string, the {formatter.__class__} and following will not be applied",
                            stacklevel=2,
                        )
                        break
                    parsed_document = formatter.format(parsed_document)

            # ``str`` is a no-op when a formatter already produced a string.
            return str(parsed_document)
        except Exception as e:
            logger.exception(f"Error occured while parsing {file}: {e}")
            raise ParsingException(
                f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}"
            ) from e
        finally:
            if opened_file:
                opened_file.close()

    async def aload(
        self,
        file_path: Path | str | None = None,
        file: BinaryIO | None = None,
        file_extension: str | FileExtension = "",
        strategy: StrategyEnum = StrategyEnum.AUTO,
    ) -> str | document.Document:
        """
        Parse a file asynchronously.

        Same flow as ``load``, using the asynchronous parser and formatter
        methods. Unlike ``load``, the PDF branch returns the parsed document
        (or whatever the last formatter produced) without converting it to a
        string; the non-PDF / FAST branch returns a string.

        Raises:
            ValueError: If the inputs are invalid (see ``validate_input``).
            ParsingException: If parsing a PDF fails; the original exception
                is chained as the cause.
        """
        file_extension = self.validate_input(
            file=file, file_path=file_path, file_extension=file_extension
        )
        if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:
            self.unstructured_parser.strategy = strategy
            parsed_document = await self.unstructured_parser.aconvert(
                file_path=file_path, file=file, file_extension=file_extension
            )
            return str(parsed_document)

        opened_file = None
        try:
            if file_path:
                opened_file = open(file_path, "rb")
                file = opened_file

            assert file is not None, "No File provided"
            pdfium_document = pdfium.PdfDocument(file)

            # Determine strategy
            pages = self.extract_page_strategies(pdfium_document)
            strategy = determine_global_strategy(
                pages, self.config.auto_config.document_threshold
            )

            # Run layout model
            assert all(p.rasterized for p in pages)
            layout_result = self.layout_model([p.rasterized for p in pages])  # type: ignore

            if strategy == StrategyEnum.HI_RES:
                logger.info("Using Doctr for text recognition")
                parsed_document = self.doctr_parser.get_text_recognition(
                    pages, layout_result
                )
            else:
                logger.info("Switching to Unstructured Parser")
                self.unstructured_parser.strategy = StrategyEnum.FAST
                parsed_document = await self.unstructured_parser.aconvert(
                    file=file, file_extension=file_extension
                )

            parsed_document.file_name = str(file_path) if file_path else None
            parsed_document.metadata = pdfium_document.get_metadata_dict()

            if self.formatters:
                for formatter in self.formatters:
                    if isinstance(parsed_document, str):
                        warnings.warn(
                            f"The last step returned a string, the {formatter.__class__} and following will not be applied",
                            stacklevel=2,
                        )
                        break
                    parsed_document = await formatter.aformat(parsed_document)

            return parsed_document
        except Exception as e:
            raise ParsingException(
                f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}"
            ) from e
        finally:
            if opened_file:
                opened_file.close()
class BaseParser(ABC):
    """Mother Class for all the parsers [Unstructured, LlamaParse, MegaParseVision]"""

    # Extensions accepted by the parser; concrete parsers override this.
    supported_extensions = []

    def check_supported_extension(
        self, file_extension: FileExtension | None, file_path: str | Path | None = None
    ):
        """
        Ensure the file handed to this parser has a supported extension.

        When only ``file_path`` is given, the extension is derived from the
        path suffix.

        Raises:
            ValueError: If neither argument is provided, or if the resolved
                extension is not in ``supported_extensions``.
        """
        parser_name = self.__class__.__name__
        if not (file_extension or file_path):
            raise ValueError(
                f"Either file_path or file_extension must be provided for {parser_name}"
            )

        if file_path and not file_extension:
            path = Path(file_path) if isinstance(file_path, str) else file_path
            file_extension = FileExtension(path.suffix)

        is_unsupported = (
            bool(file_extension) and file_extension not in self.supported_extensions
        )
        if is_unsupported:
            raise ValueError(
                f"Unsupported file extension {file_extension.value} for {parser_name}"
            )

    @abstractmethod
    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> Document:
        """
        Asynchronously convert the given file into a ``Document``.

        Args:
            file_path (str | Path): The path to the file to be converted.
            file (IO[bytes]): Binary stream holding the file content.
            file_extension (FileExtension): Extension of the file.
            **kwargs: Additional keyword arguments for the conversion process.

        Returns:
            Document: The result of the conversion process.

        Raises:
            NotImplementedError: If the method is not implemented by a subclass.
        """
        raise NotImplementedError("Subclasses should implement this method")

    @abstractmethod
    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> Document:
        """
        Synchronously convert the given file into a ``Document``.

        Args:
            file_path (str | Path): The path to the file to be converted.
            file (IO[bytes]): Binary stream holding the file content.
            file_extension (FileExtension): Extension of the file.
            **kwargs: Additional keyword arguments for the conversion process.

        Returns:
            Document: The result of the conversion process.

        Raises:
            NotImplementedError: If the method is not implemented by a subclass.
        """
        raise NotImplementedError("Subclasses should implement this method")
""" return parser_dict[config.method](**config.model_dump()) ================================================ FILE: libs/megaparse/src/megaparse/parser/doctr_parser.py ================================================ import logging import uuid from typing import Any, Dict, List, Tuple, Type from uuid import UUID import numpy as np import onnxruntime as rt from megaparse_sdk.schema.document import ( BBOX, Block, BlockLayout, BlockType, CaptionBlock, FooterBlock, HeaderBlock, ImageBlock, ListElementBlock, Point2D, SubTitleBlock, TableBlock, TextBlock, TextDetection, TitleBlock, UndefinedBlock, ) from megaparse_sdk.schema.document import Document as MPDocument from megaparse_sdk.schema.extensions import FileExtension from onnxtr.io import Document from onnxtr.models import detection_predictor, recognition_predictor from onnxtr.models._utils import get_language from onnxtr.models.engine import EngineConfig from onnxtr.models.predictor.base import _OCRPredictor from onnxtr.utils.geometry import detach_scores from onnxtr.utils.repr import NestedObject from megaparse.configs.auto import DeviceEnum, TextDetConfig, TextRecoConfig from megaparse.layout_detection.output import LayoutDetectionOutput from megaparse.models.page import Page from megaparse.utils.onnx import get_providers logger = logging.getLogger("megaparse") block_cls_map: Dict[int, Type[Block]] = { 0: CaptionBlock, 1: TextBlock, 2: TextBlock, 3: ListElementBlock, 4: FooterBlock, 5: HeaderBlock, 6: ImageBlock, 7: SubTitleBlock, 8: TableBlock, 9: TextBlock, 10: TitleBlock, } class DoctrParser(NestedObject, _OCRPredictor): supported_extensions = [FileExtension.PDF] def __init__( self, text_det_config: TextDetConfig = TextDetConfig(), text_reco_config: TextRecoConfig = TextRecoConfig(), device: DeviceEnum = DeviceEnum.CPU, straighten_pages: bool = False, detect_orientation: bool = False, detect_language: bool = False, **kwargs, ): self.device = device general_options = rt.SessionOptions() providers = 
def get_text_detections(self, pages: list[Page], **kwargs) -> List[Page]:
    """
    Run text detection on every page and store the result on the page.

    Args:
        pages (list[Page]): Pages to process. Each page's ``rasterized``
            attribute is converted to a NumPy array and must be 3-dimensional.
        **kwargs: Forwarded to the detection predictor.

    Returns:
        List[Page]: The same ``pages`` list, with ``text_detections`` set on
        each page.

    Raises:
        ValueError: If any rasterized page is not a 3-dimensional array.
    """
    rasterized_pages = [np.array(page.rasterized) for page in pages]

    # Dimension check
    if any(page.ndim != 3 for page in rasterized_pages):
        raise ValueError(
            "incorrect input shape: all pages are expected to be multi-channel 2D images."
        )
    origin_page_shapes = [page.shape[:2] for page in rasterized_pages]

    # Localize text elements
    loc_preds, out_maps = self.det_predictor(
        rasterized_pages, return_maps=True, **kwargs
    )

    # Binarize the detector's output maps; they feed orientation detection
    # and page straightening below.
    seg_maps = [
        np.where(
            out_map > self.det_predictor.model.postprocessor.bin_thresh,
            255,
            0,
        ).astype(np.uint8)
        for out_map in out_maps
    ]
    if self.detect_orientation:
        general_pages_orientations, origin_pages_orientations = (
            self._get_orientations(rasterized_pages, seg_maps)
        )
        orientations = [
            {"value": orientation_page, "confidence": None}
            for orientation_page in origin_pages_orientations
        ]
    else:
        orientations = None
        general_pages_orientations = None
        origin_pages_orientations = None

    if self.straighten_pages:
        rasterized_pages = self._straighten_pages(
            rasterized_pages,
            seg_maps,
            general_pages_orientations,
            origin_pages_orientations,
        )
        # update page shapes after straightening
        origin_page_shapes = [page.shape[:2] for page in rasterized_pages]

        # Forward again to get predictions on the straightened pages. The
        # predictor must receive the image arrays (as in the first pass),
        # not the Page wrapper objects.
        loc_preds = self.det_predictor(rasterized_pages, **kwargs)  # type: ignore[assignment]

    # Detach objectness scores from loc_preds
    loc_preds, objectness_scores = detach_scores(loc_preds)  # type: ignore[arg-type]

    # Apply hooks to loc_preds if any
    for hook in self.hooks:
        loc_preds = hook(loc_preds)

    for page_index, (rast_page, loc_pred, objectness_score, page) in enumerate(
        zip(rasterized_pages, loc_preds, objectness_scores, pages, strict=True)
    ):
        # Each detection row holds the two box corners as its first four values.
        block_layouts = [
            BlockLayout(
                bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()),
                objectness_score=score,
                block_type=BlockType.TEXT,
            )
            for bbox, score in zip(loc_pred, objectness_score, strict=True)
        ]
        page.text_detections = TextDetection(
            bboxes=block_layouts,
            page_index=page_index,
            dimensions=rast_page.shape[:2],
            orientation=orientations[page_index]
            if orientations is not None
            else 0,
            origin_page_shape=origin_page_shapes[page_index],
        )
    return pages
List[List[LayoutDetectionOutput]], **kwargs ) -> MPDocument: assert any( page.text_detections is not None for page in pages ), "Text detections should be computed before running text recognition" rasterized_pages = [] loc_preds = [] objectness_scores = [] orientations = [] origin_page_shapes = [] for page in pages: page_loc_pred = page.text_detections.get_loc_preds() # type: ignore if page_loc_pred.shape[0] == 0: page_loc_pred = np.zeros((0, 4)) rasterized_pages.append(np.array(page.rasterized)) loc_preds.append(page_loc_pred) # type: ignore objectness_scores.append(page.text_detections.get_objectness_scores()) # type: ignore orientations.append(page.text_detections.get_orientations()) # type: ignore origin_page_shapes.append(page.text_detections.get_origin_page_shapes()) # type: ignore # Crop images crops, loc_preds = self._prepare_crops( rasterized_pages, loc_preds, # type: ignore[arg-type] channels_last=True, assume_straight_pages=self.assume_straight_pages, assume_horizontal=self._page_orientation_disabled, ) # Rectify crop orientation and get crop orientation predictions crop_orientations: Any = [] if not self.assume_straight_pages: crops, loc_preds, _crop_orientations = self._rectify_crops(crops, loc_preds) crop_orientations = [ {"value": orientation[0], "confidence": orientation[1]} for orientation in _crop_orientations ] # Identify character sequences word_preds = self.reco_predictor( [crop for page_crops in crops for crop in page_crops], **kwargs ) if not crop_orientations: crop_orientations = [{"value": 0, "confidence": None} for _ in word_preds] boxes, text_preds, crop_orientations = self._process_predictions( loc_preds, word_preds, crop_orientations ) if self.detect_language: languages = [ get_language(" ".join([item[0] for item in text_pred])) for text_pred in text_preds ] languages_dict = [ {"value": lang[0], "confidence": lang[1]} for lang in languages ] else: languages_dict = None # FIXME : Not good return type we want :( out = self.doc_builder( 
def _get_block_cls(
    self,
    coordinates: tuple[float, float, float, float],
    layout: List[LayoutDetectionOutput],
    threshold: float = 0.6,
) -> Tuple[UUID | None, Type[Block]]:
    """
    Find the layout detection that contains most of the given box.

    The first detection in ``layout`` whose intersection with the box covers
    more than ``threshold`` of the box's own area wins.

    Args:
        coordinates: Box to classify, as ``(x1, y1, x2, y2)``.
        layout: Layout detections to match against, tested in order.
        threshold: Minimum fraction of the box area that must lie inside a
            detection for it to match.

    Returns:
        ``(det.bbox_id, block class)`` of the matching detection, or a fresh
        UUID paired with ``UndefinedBlock`` when nothing matches.
    """
    x1, y1, x2, y2 = coordinates
    # Area of the box being classified; it does not depend on the detection.
    detection_area = max(0, x2 - x1) * max(0, y2 - y1)

    for det in layout:
        X1, Y1, X2, Y2 = det.bbox.to_numpy()
        assert x1 <= x2 and y1 <= y2, "bbox1 coordinates are invalid"
        assert X1 <= X2 and Y1 <= Y2, "bbox2 coordinates are invalid"

        # A zero-area box cannot be "mostly inside" anything; skipping it also
        # avoids dividing by zero in the ratio below.
        if detection_area <= 0:
            continue

        intersection_width = max(0, min(x2, X2) - max(x1, X1))
        intersection_height = max(0, min(y2, Y2) - max(y1, Y1))
        intersection_area = intersection_width * intersection_height

        if intersection_area / detection_area > threshold:
            return (det.bbox_id, block_cls_map[det.label])

    return (uuid.uuid4(), UndefinedBlock)
class SupportedModel(Enum):
    """Model families accepted by the parser, each with its allowed release suffixes."""

    GPT_4O = ("gpt-4o", None)
    GPT_4O_TURBO = ("gpt-4o-turbo", None)
    CLAUDE_3_5_SONNET = ("claude-3-5-sonnet", ["latest", "20241022"])
    CLAUDE_3_OPUS = ("claude-3-opus", ["latest", "20240229"])

    def __init__(self, model_name: str, supported_releases: Optional[List[str]]):
        # Each member's tuple value is unpacked into these two attributes.
        self.model_name = model_name
        self.supported_releases = supported_releases

    @classmethod
    def is_supported(cls, model_name: str) -> bool:
        """
        Tell whether ``model_name`` belongs to a supported model family.

        Only the first member (in declaration order) whose name is a prefix of
        ``model_name`` is considered. A member without a release list accepts
        any suffix; otherwise the text after the prefix (leading dashes
        stripped) must be non-empty and one of the listed releases.
        """
        family = next(
            (member for member in cls if model_name.startswith(member.model_name)),
            None,
        )
        if family is None:
            return False
        if family.supported_releases is None:
            return True

        suffix = model_name[len(family.model_name) :].lstrip("-")
        if not suffix:
            return False
        return suffix in family.supported_releases
class LlamaParser(BaseParser):
    """Parser backend that delegates PDF parsing to LlamaParse."""

    supported_extensions = [FileExtension.PDF]

    def __init__(
        self,
        api_key: str,
        verbose=True,
        language: Language = Language.FRENCH,
        parsing_instruction: str | None = None,
        **kwargs,
    ) -> None:
        """
        Store the settings used to build the LlamaParse client.

        Args:
            api_key: LlamaParse API key.
            verbose: Passed through to the LlamaParse client.
            language: Passed through to the LlamaParse client.
            parsing_instruction: Custom instruction; a default instruction is
                used when this is ``None`` or empty.
            **kwargs: Accepted and ignored.
        """
        self.api_key = api_key
        self.verbose = verbose
        self.language = language
        if parsing_instruction:
            self.parsing_instruction = parsing_instruction
        else:
            self.parsing_instruction = (
                "Do not take into account the page breaks (no --- between pages), "
                "do not repeat the header and the footer so the tables are merged if needed. \n"
                "Keep the same format for similar tables."
            )

    def _build_client(self) -> _LlamaParse:
        """Create the LlamaParse client shared by the sync and async paths."""
        # Both paths request markdown: the documents' ``text`` is copied
        # straight into TextBlock, so sync and async must use the same result type.
        return _LlamaParse(
            api_key=self.api_key,
            result_type=ResultType.MD,
            gpt4o_mode=True,
            verbose=self.verbose,
            language=self.language,
            parsing_instruction=self.parsing_instruction,
        )

    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: None | FileExtension = None,
        **kwargs,
    ) -> MPDocument:
        """
        Asynchronously parse the file at ``file_path`` with LlamaParse.

        Args:
            file_path: Path of the file to parse (required).
            file: Unused by this parser.
            file_extension: Explicit extension; inferred from ``file_path``
                when omitted.
            **kwargs: Accepted and ignored.

        Returns:
            MPDocument: One TextBlock per document returned by LlamaParse.

        Raises:
            ValueError: If ``file_path`` is missing or the extension is not
                supported.
        """
        if not file_path:
            raise ValueError("File_path should be provided to run LlamaParser")
        self.check_supported_extension(file_extension, file_path)

        llama_parser = self._build_client()
        documents: List[LlamaDocument] = await llama_parser.aload_data(str(file_path))

        return self.__to_elements_list__(documents)

    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: None | FileExtension = None,
        **kwargs,
    ) -> MPDocument:
        """
        Synchronously parse the file at ``file_path`` with LlamaParse.

        Args:
            file_path: Path of the file to parse (required).
            file: Unused by this parser.
            file_extension: Explicit extension; inferred from ``file_path``
                when omitted.
            **kwargs: Accepted and ignored.

        Returns:
            MPDocument: One TextBlock per document returned by LlamaParse.

        Raises:
            ValueError: If ``file_path`` is missing or the extension is not
                supported.
        """
        if not file_path:
            raise ValueError("File_path should be provided to run LlamaParser")
        self.check_supported_extension(file_extension, file_path)

        llama_parser = self._build_client()
        documents: List[LlamaDocument] = llama_parser.load_data(str(file_path))

        return self.__to_elements_list__(documents)

    def __to_elements_list__(self, llama_doc: List[LlamaDocument]) -> MPDocument:
        """Wrap each LlamaParse document in a TextBlock spanning the unit box."""
        list_blocks = [
            TextBlock(
                text=page.text,
                metadata={},
                page_range=(i, i + 1),
                bbox=BBOX(top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)),
            )
            for i, page in enumerate(llama_doc)
        ]
        return MPDocument(
            metadata={},
            detection_origin="llamaparse",
            content=list_blocks,
        )
class MegaParseVision(BaseParser):
    """PDF parser that renders pages to images and has a vision chat model transcribe them.

    Pages are sent to the model in batches together with ``BASE_OCR_PROMPT``;
    the tagged answers are joined, cleaned of tags and returned as a single
    ``TextBlock`` inside an ``MPDocument``.
    """

    supported_extensions = [FileExtension.PDF]

    def __init__(self, model: BaseChatModel, **kwargs):
        """Store the chat model after checking its name when one is exposed.

        Raises:
            ValueError: If the model has a ``model_name`` that
                ``SupportedModel.is_supported`` rejects.
        """
        if hasattr(model, "model_name") and not SupportedModel.is_supported(
            model.model_name
        ):
            raise ValueError(
                f"Invalid model name, MegaParse vision only supports model that have vision capabilities. "
                f"{model.model_name} is not supported."
            )
        self.model = model
        # Raw model answers of the last conversion, one entry per batch.
        self.parsed_chunks: list[str] | None = None

    def process_file(self, file_path: str, image_format: str = "PNG") -> List[str]:
        """
        Process a PDF file and convert its pages to base64 encoded images.

        :param file_path: Path to the PDF file
        :param image_format: Format to save the images (default: PNG)
        :return: List of base64 encoded images
        :raises ValueError: If rendering or encoding fails (original error chained)
        """
        try:
            images_base64 = []
            for image in convert_from_path(file_path):
                buffered = BytesIO()
                image.save(buffered, format=image_format)
                images_base64.append(
                    base64.b64encode(buffered.getvalue()).decode("utf-8")
                )
            return images_base64
        except Exception as e:
            # Chain the cause so the underlying failure stays visible.
            raise ValueError(f"Error processing PDF file: {str(e)}") from e

    def get_element(self, tag: TagEnum, chunk: str):
        """Return the stripped contents of every ``[tag]...[/tag]`` span in ``chunk``.

        Prints a notice and returns an empty list when the tag does not occur.
        """
        pattern = rf"\[{tag.value}\]([\s\S]*?)\[/{tag.value}\]"
        all_elmts = re.findall(pattern, chunk)
        if not all_elmts:
            print(f"No {tag.value} found in the chunk")
            return []
        return [elmt.strip() for elmt in all_elmts]

    def _build_message(
        self, images_data: List[str], image_format: str = "PNG"
    ) -> HumanMessage:
        """Build the prompt message: the OCR instructions followed by the images.

        The data-URL media type is derived from ``image_format`` so that it
        matches the bytes produced by ``process_file`` (PNG by default).
        """
        mime_type = f"image/{image_format.lower()}"
        images_prompt = [
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{image_data}"},
            }
            for image_data in images_data
        ]
        return HumanMessage(
            content=[
                {"type": "text", "text": BASE_OCR_PROMPT},
                *images_prompt,
            ],
        )

    async def asend_to_mlm(
        self, images_data: List[str], image_format: str = "PNG"
    ) -> str:
        """
        Send images to the language model for processing (async).

        :param images_data: List of base64 encoded images
        :param image_format: Format the images were encoded with (default: PNG)
        :return: Processed content as a string
        """
        message = self._build_message(images_data, image_format)
        response = await self.model.ainvoke([message])
        return str(response.content)

    def send_to_mlm(self, images_data: List[str], image_format: str = "PNG") -> str:
        """
        Send images to the language model for processing.

        :param images_data: List of base64 encoded images
        :param image_format: Format the images were encoded with (default: PNG)
        :return: Processed content as a string
        """
        message = self._build_message(images_data, image_format)
        response = self.model.invoke([message])
        return str(response.content)

    def _load_pages(
        self, file_path: str | Path | None, file_extension: FileExtension | None
    ) -> List[str]:
        """Validate the input and return the PDF pages as base64 images.

        :raises ValueError: If ``file_path`` is missing
        """
        if not file_path:
            raise ValueError("File_path should be provided to run MegaParseVision")
        if isinstance(file_path, Path):
            file_path = str(file_path)
        self.check_supported_extension(file_extension, file_path)
        return self.process_file(file_path)

    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        batch_size: int = 3,
        **kwargs,
    ) -> MPDocument:
        """
        Parse a PDF file and process its content using the language model.

        All batches are sent concurrently with ``asyncio.gather``.

        :param file_path: Path to the PDF file
        :param batch_size: Number of pages sent to the model per request
        :return: Document holding the cleaned transcription
        :raises ValueError: If ``file_path`` is missing
        """
        pdf_base64 = self._load_pages(file_path, file_extension)
        n_pages = len(pdf_base64)
        tasks = [
            self.asend_to_mlm(pdf_base64[i : i + batch_size])
            for i in range(0, n_pages, batch_size)
        ]
        self.parsed_chunks = await asyncio.gather(*tasks)
        responses = self.get_cleaned_content("\n".join(self.parsed_chunks))
        return self.__to_elements_list__(responses, n_pages=n_pages)

    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        batch_size: int = 3,
        **kwargs,
    ) -> MPDocument:
        """
        Parse a PDF file and process its content using the language model.

        Batches are sent one after another.

        :param file_path: Path to the PDF file
        :param batch_size: Number of pages sent to the model per request
        :return: Document holding the cleaned transcription
        :raises ValueError: If ``file_path`` is missing
        """
        pdf_base64 = self._load_pages(file_path, file_extension)
        n_pages = len(pdf_base64)
        self.parsed_chunks = [
            self.send_to_mlm(pdf_base64[i : i + batch_size])
            for i in range(0, n_pages, batch_size)
        ]
        responses = self.get_cleaned_content("\n".join(self.parsed_chunks))
        return self.__to_elements_list__(responses, n_pages)

    def get_cleaned_content(self, parsed_file: str) -> str:
        """
        Get cleaned parsed file without any tags defined in TagEnum.

        This method removes all tags from TagEnum from the parsed file, formats the content,
        and handles the HEADER tag specially by keeping only the first occurrence.

        Args:
            parsed_file (str): The parsed file content with tags.

        Returns:
            str: The cleaned content without TagEnum tags.
        """
        # Build the alternation from the tag values so it does not depend on
        # the enum members themselves being strings.
        tag_pattern = "|".join(re.escape(tag.value) for tag in TagEnum)
        tag_regex = rf"\[({tag_pattern})\](.*?)\[/\1\]"
        # handle the HEADER tag specially
        header_pattern = rf"\[{TagEnum.HEADER.value}\](.*?)\[/{TagEnum.HEADER.value}\]"
        headers = re.findall(header_pattern, parsed_file, re.DOTALL)
        if headers:
            first_header = headers[0].strip()
            # Remove all HEADER tags and their content
            parsed_file = re.sub(header_pattern, "", parsed_file, flags=re.DOTALL)
            # Add the first header back at the beginning
            parsed_file = f"{first_header}\n{parsed_file}"

        # Remove all other tags, keeping what they enclose
        cleaned_content = re.sub(
            tag_regex, lambda match: match.group(2), parsed_file, flags=re.DOTALL
        )
        # Drop markdown code-fence lines
        cleaned_content = re.sub(r"^```.*$\n?", "", cleaned_content, flags=re.MULTILINE)
        # Collapse runs of blank lines into a single blank line
        cleaned_content = re.sub(r"\n\s*\n", "\n\n", cleaned_content)
        # Re-join table rows that were separated by a blank line
        cleaned_content = cleaned_content.replace("|\n\n|", "|\n|")
        return cleaned_content.strip()

    def __to_elements_list__(self, mpv_doc: str, n_pages: int) -> MPDocument:
        """Wrap the cleaned transcription in a one-block ``MPDocument``.

        The block covers pages ``0`` to ``n_pages - 1`` (never below 0) and a
        bounding box spanning (0, 0) to (1, 1).
        """
        list_blocks: List[Block] = [
            TextBlock(
                text=mpv_doc,
                metadata={},
                page_range=(0, max(n_pages - 1, 0)),
                bbox=BBOX(top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)),
            )
        ]
        return MPDocument(
            metadata={},
            detection_origin="megaparse_vision",
            content=list_blocks,
        )
class UnstructuredParser(BaseParser):
    """Parser backed by ``unstructured.partition.auto.partition``.

    Partitioned elements are mapped to MegaParse blocks by their
    ``category``; elements with an unmapped category are dropped.
    """

    supported_extensions = [
        FileExtension.PDF,
        FileExtension.DOCX,
        FileExtension.TXT,
        FileExtension.OTF,
        FileExtension.EPUB,
        FileExtension.HTML,
        FileExtension.XML,
        FileExtension.CSV,
        FileExtension.XLSX,
        FileExtension.XLS,
        FileExtension.PPTX,
        FileExtension.MD,
        FileExtension.MARKDOWN,
    ]

    # Block class used for each unstructured category. "Subtitle" is handled
    # separately because SubTitleBlock also needs a depth.
    _BLOCK_CLASS_BY_CATEGORY: Dict[str, type] = {
        "Title": TitleBlock,
        "Header": HeaderBlock,
        "Footer": FooterBlock,
        "NarrativeText": TextBlock,
        # FIXME: @chloedia, list item need to be handled differently in ListBlock
        "ListItem": TextBlock,
        "Table": TableBlock,
        "Image": ImageBlock,
        "Formula": TextBlock,
        "FigureCaption": TextBlock,
        "Address": TextBlock,
        "EmailAddress": TextBlock,
        "CodeSnippet": TextBlock,
        "UncategorizedText": TextBlock,
    }

    def __init__(
        self, strategy=StrategyEnum.AUTO, model: BaseChatModel | None = None, **kwargs
    ):
        """Store the partition strategy and the optional chat model."""
        self.strategy = strategy
        self.model = model

    def convert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> MPDocument:
        """Partition the input with unstructured and convert it to an ``MPDocument``.

        Args:
            file_path: Path of the file to parse, if any.
            file: Binary stream to parse, if any.
            file_extension: Extension used for the support check and, when
                given, to pass its mimetype as ``content_type``.

        Returns:
            Document with one block per recognised element.
        """
        self.check_supported_extension(file_extension, file_path)
        elements = partition(
            filename=str(file_path) if file_path else None,
            file=file,
            strategy=self.strategy,
            content_type=file_extension.mimetype if file_extension else None,
        )
        return self.__to_mp_document(elements)

    async def aconvert(
        self,
        file_path: str | Path | None = None,
        file: IO[bytes] | None = None,
        file_extension: FileExtension | None = None,
        **kwargs,
    ) -> MPDocument:
        """Async facade over ``convert``; warns because the work runs synchronously."""
        self.check_supported_extension(file_extension, file_path)
        warnings.warn(
            "The UnstructuredParser is a sync parser, please use the sync convert method",
            UserWarning,
            stacklevel=2,
        )
        return self.convert(file_path, file, file_extension, **kwargs)

    def __to_mp_document(self, elements: List[Element]) -> MPDocument:
        """Convert the elements to blocks, skipping those without a mapping."""
        text_blocks = []
        for element in elements:
            block = self.__convert_element_to_block(element)
            if block:
                text_blocks.append(block)
        return MPDocument(
            content=text_blocks, metadata={}, detection_origin="unstructured"
        )

    def __convert_element_to_block(self, element: Element) -> Block | None:
        """Build the block matching ``element.category``.

        Only the block for the element's own category is constructed; the
        page range and bounding box are computed once and shared.

        Returns:
            The block, or ``None`` when the category has no mapping.
        """
        category = element.category
        is_subtitle = category == "Subtitle"
        block_cls = self._BLOCK_CLASS_BY_CATEGORY.get(category)
        if block_cls is None and not is_subtitle:
            return None

        metadata = element.metadata
        page_number = metadata.page_number
        page_range = (page_number, page_number) if page_number else None

        bbox = None
        coordinates = metadata.coordinates
        if coordinates and coordinates.points:
            points = coordinates.points
            # NOTE(review): points[3] is used as the bottom-right corner;
            # confirm against unstructured's point ordering (index 2 may be
            # the actual bottom-right).
            bbox = BBOX(
                top_left=Point2D(x=points[0][0], y=points[0][1]),
                bottom_right=Point2D(x=points[3][0], y=points[3][1]),
            )

        if is_subtitle:
            category_depth = metadata.category_depth
            return SubTitleBlock(
                text=element.text,
                depth=category_depth if category_depth else 0,
                metadata={},
                page_range=page_range,
                bbox=bbox,
            )
        return block_cls(
            text=element.text,
            metadata={},
            page_range=page_range,
            bbox=bbox,
        )
def _clamp_index(value: int, upper: int) -> int:
    """Clamp ``value`` into the inclusive range ``[0, upper]``."""
    return max(0, min(upper, value))


def get_page_strategy(
    pdfium_page: PdfPage, onnxtr_page: TextDetection | None, threshold: float
) -> StrategyEnum:
    """Choose the parsing strategy for one page.

    The area covered by the PDF's embedded text objects is compared with the
    area covered by the detected text boxes using intersection-over-union on
    two page-sized masks.

    Args:
        pdfium_page: The pdfium page to inspect.
        onnxtr_page: Text detection result for the page, or ``None``.
        threshold: IoU below which the page is considered to need HI_RES.

    Returns:
        ``StrategyEnum.FAST`` when no detection is available or the IoU
        reaches ``threshold``; ``StrategyEnum.HI_RES`` otherwise (including
        when both masks are empty and ``threshold`` is positive).
    """
    if onnxtr_page is None:
        return StrategyEnum.FAST

    # Collect the positions of the page's text objects
    # (object type 1 is PDFium's text page object).
    text_coords = [
        obj.get_pos()
        for obj in pdfium_page.get_objects()
        if obj.type == 1  # type: ignore
    ]

    p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height())

    pdfium_canva = np.zeros((p_height, p_width), dtype=bool)
    for coords in text_coords:
        # coords is (left, bottom, right, top) with the origin at the bottom
        # of the page; mask rows count from the top, hence the flip below.
        # 0---l--------------R-> y
        # |
        # B   (x0,y0)
        # |
        # T                 (x1,y1)
        # ^
        # x
        x0 = _clamp_index(int(p_height - coords[3]), p_height)
        y0 = _clamp_index(int(coords[0]), p_width)
        x1 = _clamp_index(int(p_height - coords[1]), p_height)
        y1 = _clamp_index(int(coords[2]), p_width)
        pdfium_canva[x0:x1, y0:y1] = True

    onnxtr_canva = np.zeros((p_height, p_width), dtype=bool)
    for block in onnxtr_page.bboxes:
        # Detection corners are scaled by the page size before indexing.
        x0, y0 = block.bbox[0]
        x1, y1 = block.bbox[1]
        x0 = _clamp_index(int(x0 * p_width), p_width)
        y0 = _clamp_index(int(y0 * p_height), p_height)
        x1 = _clamp_index(int(x1 * p_width), p_width)
        y1 = _clamp_index(int(y1 * p_height), p_height)
        onnxtr_canva[y0:y1, x0:x1] = True

    sum_intersection = np.sum(np.logical_and(pdfium_canva, onnxtr_canva))
    sum_union = np.sum(np.logical_or(pdfium_canva, onnxtr_canva))
    iou = sum_intersection / sum_union if sum_union != 0 else 0
    if iou < threshold:
        return StrategyEnum.HI_RES
    return StrategyEnum.FAST


def determine_global_strategy(pages: List[Page], threshold: float) -> StrategyEnum:
    """Choose the document-level strategy from the per-page strategies.

    Args:
        pages: Pages whose ``strategy`` attribute has been set.
        threshold: Fraction of HI_RES pages that must be exceeded for the
            whole document to use HI_RES.

    Returns:
        ``StrategyEnum.HI_RES`` when the HI_RES fraction is strictly greater
        than ``threshold``; ``StrategyEnum.FAST`` otherwise, and also for an
        empty page list.
    """
    if not pages:
        # No pages means no HI_RES pages; also avoids dividing by zero.
        return StrategyEnum.FAST
    count = sum(1 for page in pages if page.strategy == StrategyEnum.HI_RES)
    if count / len(pages) > threshold:
        return StrategyEnum.HI_RES
    return StrategyEnum.FAST
vxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW 0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9 ze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr drdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7 /E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw= -----END CERTIFICATE----- ================================================ FILE: libs/megaparse/tests/certs/client-key.pem ================================================ -----BEGIN PRIVATE KEY----- MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5 KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT WWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU QYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj rGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj BkEzj7CbaAGtW9vTamPJG/+wgaaBcPhplQk4cD2mjdaMLfGQXNXiYgp09kf0hJ2k 0QbsQBC85bMSfmPAsoTRLxi94S12at3SABgF0oOCy9FZs/sWsdJRI6nbfvZ3C4xo 8y+rH7Yaej7AYK+jbU3Uk/1473cuCAnNKg65UyU4+gECgYEA2/ZQYRDU3JWNHQGy dJXZRl6hSFCw9y9RUc/QjcRs+VlnXE5UK1eLwfcKh0YYRhIWSE8z3mZmK09M/FG0 xbU4qIZbDYcAI2nCiUeT8HmTjVSPMS1oWZrt7rh00gcyoLQt2TUS3bo2tsmdPyWW OgEiYfb4MoG/KCdYlACE6O4GMMECgYEA1GIMIHM2x4B1wgLnKeI3X2wYWuYCHtFB Px56GUFTZytBsHghxtovVlLh88FNS5rthvXuE0FHE9RljKhZaNgqrPOrlAZSuv18 vK7RmG/NPJl2osbs677a/xoxNuVkfrRcxl4cvYOBL5huHo1D5sOitGFW+IlscgWY nWzXlY7AYQkCgYA6H96hp7b4CzTc42Pq1uYxaDQqTdhVmVVdzxKHQ86gHXXouHIZ eereeI95q5YifgkRVoyYSmrZKv1m95hTXk34inhpHLF2qi3T5Ow88YOCJ0QndJ5M f1o8aNXF4k0IllQ/P30axmhK6P/6fc4yybXyOTbg8dQ3oh4QDgsRGkTcgQKBgQCG qLgJpyN3cPK5FYAeJUl4nh//GlED2yekbp15/9py0pFu42x/GX3kHN8Y31oz8sJh zPKrkLsRTp0ohuFRwaWlTUZfr3arCugY9jr8jP6zSpZW9QvpGXTfRGsp5F5Im/Eq 
class FakeParserBuilder:
    """Test double for the parser builder: always hands out a canned fake parser."""

    def build(self, *args, **kwargs) -> BaseParser:
        """
        Build a fake parser, ignoring any configuration passed in.

        Returns:
            BaseParser: A parser whose sync and async conversions print a
            notice and return the same fixed one-block document.
        """

        def canned_document() -> MPDocument:
            # Shared by both conversion methods so they stay in lockstep.
            print("Fake parser is converting the file")
            return MPDocument(
                file_name="Fake file",
                content=[TextBlock(text="Fake conversion result", metadata={})],
                metadata={},
                detection_origin="fakeparser",
            )

        class FakeParser(BaseParser):
            def convert(
                self,
                file_path: str | Path | None = None,
                file: IO[bytes] | None = None,
                file_extension: None | FileExtension = None,
                **kwargs,
            ) -> MPDocument:
                return canned_document()

            async def aconvert(
                self,
                file_path: str | Path | None = None,
                file: IO[bytes] | None = None,
                file_extension: None | FileExtension = None,
                **kwargs,
            ) -> MPDocument:
                return canned_document()

        return FakeParser()


@pytest_asyncio.fixture(scope="function")
async def test_client():
    """Yield an HTTP client bound to the app with parser and loader dependencies faked.

    The dependency overrides are reset once the client has been closed.
    """
    print("Setting up test_client fixture")

    class FakePlaywrightLoader(PlaywrightURLLoader):
        async def aload(self):
            return [Document(page_content="Fake website content")]

    def fake_parser_builder():
        return FakeParserBuilder()

    def fake_playwright_loader():
        return FakePlaywrightLoader(urls=[], remove_selectors=["header", "footer"])

    app.dependency_overrides.update(
        {
            parser_builder_dep: fake_parser_builder,
            get_playwright_loader: fake_playwright_loader,
        }
    )
    transport = ASGITransport(app=app)  # type: ignore
    async with AsyncClient(transport=transport, base_url="http://test") as ac:
        yield ac
    app.dependency_overrides = {}
In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, benefits, and comparative performance of Mega Parse, illustrating why it stands out as a superior tool in the realm of document parsing. ### Features of Mega Parse Mega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises. **Multiple Format Support:** Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered. **High-Speed Processing:** One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion. **Markdown Output:** Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount. Accuracy: Accuracy in text extraction and formatting is a critical aspect of any document parser. Mega Parse ensures high accuracy, maintaining the integrity and structure of the original documents. This is particularly important for documents that contain complex formatting and embedded elements. Customizable Parsing Rules: Users can define custom parsing rules to suit specific needs, allowing for greater control over the conversion process. This flexibility ensures that Mega Parse can be adapted to a wide variety of use cases. 
Batch Processing: Mega Parse supports batch processing, enabling the simultaneous conversion of multiple documents. This feature is particularly useful for organizations dealing with large volumes of documents, as it streamlines the workflow and saves time. Error Handling: Advanced error handling capabilities ensure that any issues encountered during the conversion process are managed effectively, minimizing disruptions and maintaining workflow efficiency. # Benefits of Mega Parse The implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents. **Efficiency:** By significantly speeding up the document conversion process, Mega Parse increases overall efficiency. This is especially beneficial for industries that handle large volumes of documents on a daily basis, such as legal firms, financial institutions, and research organizations. **Versatility:** Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task. **Enhanced Knowledge Management:** Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information. Improved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks. Cost Savings: The efficiency and speed of Mega Parse can lead to significant cost savings. Reduced processing times and improved workflow efficiency mean that resources can be allocated more effectively, ultimately lowering operational costs. 
Scalability: Mega Parse is designed to scale with the needs of an organization. As document volumes grow, Mega Parse can handle the increased load without compromising performance, making it a future-proof solution for document management. # Comparative Performance The following table provides a comprehensive comparative analysis of Mega Parse against other document parsers based on fictional performance metrics. This comparison highlights the strengths of Mega Parse in various key areas. | Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D | |---------------------|------------------|----------------|----------------|----------------|----------------| | Supported Formats | PDF, DOCX, PPTX | PDF, DOCX | DOCX, PPTX | PDF, PPTX | PDF, DOCX, XLSX| | Conversion Speed (pages/min) | 120 | 90 | 100 | 85 | 95 | | **Accuracy Rate (%)** | 98 | 95 | 93 | 90 | 92 | | **Output Format** | Markdown | HTML | Markdown | Plain Text | HTML | | **Error Rate (%)** | 1 | 3 | 4 | 5 | 3 | | **Ease of Use** | High | Medium | High | Medium | Medium | | **Integration Capability** | Excellent | Good | Good | Fair | Good | | **Batch Processing** | Yes | No | Yes | No | Yes | | **Custom Parsing Rules** | Yes | Limited | Yes | No | Limited | | **Multilingual Support** | Yes | Yes | No | Yes | Yes | | **OCR (Optical Character Recognition)** | Yes | No | Yes | No | Yes | | **Price (per user/month)** | $30 | $25 | $20 | $15 | $18 | | **Customer Support Rating (out of 5)** | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 | | **Free Trial Available** | Yes | Yes | No | Yes | No | | **Cloud Integration** | Yes | No | Yes | Yes | No | | **Security Features** | Advanced | Basic | Advanced | Basic | Intermediate | | **User Community Size** | Large | Medium | Medium | Small | Medium | | **Monthly Updates** | Yes | Yes | No | Yes | No | | **Mobile App Availability** | Yes | No | Yes | No | Yes | | **Platform Compatibility** | Windows, Mac, Linux | Windows, Mac | Windows | Mac, Linux | Windows, Linux | | **Data 
Privacy Compliance** | High | Medium | High | Low | Medium | | **AI-Driven Enhancements** | Yes | No | Yes | No | Yes | | **File Size Limit (per document)** | 1GB | 500MB | 750MB | 200MB | 500MB | | **User Training Resources** | Extensive | Moderate | Extensive | Limited | Moderate | | **API Access** | Yes | No | Yes | No | Yes | | **Customizable Output Templates** | Yes | Limited | Yes | No | Yes | | **Collaboration Features** | Yes | No | Yes | No | Limited | | **Document Version Control** | Yes | No | Yes | No | Yes | | **Import/Export Options** | Extensive | Moderate | Extensive | Limited | Moderate | | Feedback Mechanism | Yes | No | Yes | No | Yes | *Note: All data presented in this table is fictional and for illustrative purposes only.* ## Conclusion Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence. 
================================================ FILE: libs/megaparse/tests/pdf/test_detect_ocr.py ================================================ import os import pypdfium2 import pytest from megaparse.megaparse import MegaParse from megaparse.utils.strategy import determine_global_strategy from megaparse_sdk.schema.parser_config import StrategyEnum ocr_pdfs = os.listdir("./tests/pdf/ocr") native_pdfs = os.listdir("./tests/pdf/native") megaparse = MegaParse() @pytest.mark.parametrize("hi_res_pdf", ocr_pdfs) def test_hi_res_strategy(hi_res_pdf): if hi_res_pdf == "0168004.pdf": pytest.skip("Skip 0168004.pdf as it is flaky currently") pdf_doc = pypdfium2.PdfDocument(f"./tests/pdf/ocr/{hi_res_pdf}") pages = megaparse.extract_page_strategies(pdf_doc) assert ( determine_global_strategy( pages, megaparse.config.auto_config.document_threshold ) == StrategyEnum.HI_RES ) @pytest.mark.parametrize("native_pdf", native_pdfs) def test_fast_strategy(native_pdf): if native_pdf == "0168029.pdf": pytest.skip("Skip 0168029.pdf as it is too long to process") pdf_doc = pypdfium2.PdfDocument(f"./tests/pdf/native/{native_pdf}") pages = megaparse.extract_page_strategies(pdf_doc) assert ( determine_global_strategy( pages, megaparse.config.auto_config.document_threshold ) == StrategyEnum.FAST ) ================================================ FILE: libs/megaparse/tests/pdf/test_pdf_processing.py ================================================ from pathlib import Path import pypdfium2 import pytest from megaparse.configs.auto import ( DeviceEnum, MegaParseConfig, ) from megaparse.megaparse import MegaParse from megaparse.utils.strategy import determine_global_strategy from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum @pytest.fixture def native_pdf() -> Path: p = Path("./tests/pdf/sample_native.pdf") return p @pytest.fixture def scanned_pdf() -> Path: p = Path("./tests/pdf/sample_pdf.pdf") return p # def 
test_get_default_processors_megaparse(): # megaparse = MegaParse() # assert type(megaparse.parser) is UnstructuredParser @pytest.mark.asyncio @pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"]) async def test_async_megaparse_pdf_processor_file_path(pdf_name, request): pdf = request.getfixturevalue(pdf_name) processor = MegaParse(config=MegaParseConfig(device=DeviceEnum.COREML)) result = await processor.aload(file_path=pdf) assert len(str(result)) > 0 @pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"]) def test_sync_megaparse_pdf_processor_file_path(pdf_name, request): pdf = request.getfixturevalue(pdf_name) processor = MegaParse() result = processor.load(file_path=pdf) assert len(result) > 0 @pytest.mark.asyncio @pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"]) async def test_megaparse_pdf_processor_file(pdf_name, request): pdf = request.getfixturevalue(pdf_name) processor = MegaParse() with open(pdf, "rb") as f: result = await processor.aload(file=f, file_extension=FileExtension.PDF) assert len(str(result)) > 0 def test_strategy_native(native_pdf): processor = MegaParse() pdf_doc = pypdfium2.PdfDocument(native_pdf) pages = processor.extract_page_strategies(pdf_doc) assert ( determine_global_strategy( pages, processor.config.auto_config.document_threshold ) == StrategyEnum.FAST ) pdf_doc.close() def test_strategy_scanned(scanned_pdf): processor = MegaParse() pdf_doc = pypdfium2.PdfDocument(scanned_pdf) pages = processor.extract_page_strategies(pdf_doc) assert ( determine_global_strategy( pages, processor.config.auto_config.document_threshold ) == StrategyEnum.HI_RES ) pdf_doc.close() ================================================ FILE: libs/megaparse/tests/pdf/test_pdfium_parser.py ================================================ from pathlib import Path import pypdfium2 as pdfium def test_pdfium(): # scanned pdf p = Path("./tests/pdf/mlbook.pdf") document = pdfium.PdfDocument(p) objs = [] for page in document: 
for obj in page.get_objects(): objs.append(obj) document.close() ================================================ FILE: libs/megaparse/tests/supported_docs/sample.csv ================================================ Name,Description MegaParse,"MegaParse is the best parser, even with accents like é, è, and ñ." OtherParse,"OtherParse is a decent parser, but it struggles with accents." RandomParse,"RandomParse is another parser, but it often fails with special characters." ================================================ FILE: libs/megaparse/tests/supported_docs/sample.markdown ================================================ # The Difficulty of Parsing Files Parsing files can be a challenging task due to several factors: ## 1. File Format Variability Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely. ## 2. Inconsistent Data Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms. ## 3. Large File Sizes Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets. ## 4. Encoding Issues Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption. ## 5. Nested Structures Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data. ## Conclusion Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies. 
================================================ FILE: libs/megaparse/tests/supported_docs/sample.md ================================================ # The Difficulty of Parsing Files Parsing files can be a challenging task due to several factors: ## 1. File Format Variability Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely. ## 2. Inconsistent Data Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms. ## 3. Large File Sizes Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets. ## 4. Encoding Issues Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption. ## 5. Nested Structures Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data. ## Conclusion Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies. ================================================ FILE: libs/megaparse/tests/supported_docs/sample.txt ================================================ Lorem ipsum Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. 
Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla. Maecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus. Maecenas non lorem quis tellus placerat varius. Nulla facilisi. Aenean congue fringilla justo ut aliquam. Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. Morbi viverra semper lorem nec molestie. Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate. https://github.com/QuivrHQ/MegaParse ================================================ FILE: libs/megaparse/tests/supported_docs/sample.xml ================================================ Charter Group
100 Main Framingham MA 01701
720 Prospect Framingham MA 01701
120 Ridge MA 01760
================================================ FILE: libs/megaparse/tests/supported_docs/sample_complexe.html ================================================ Large HTML page with images

When to load CSS

Large HTML page with Images


This page shall test if the recorder generates scripts with requests in correct order. It includes images in sequential order: stadyn_image1.gif through stadyn_image10.gif


stadyn_image1.gif (6512 bytes)
stadyn_image1

stadyn_image2.gif (5983 bytes)
stadyn_image2

Open Financial Exchange
Specification 1.0

February 14, 1997

1997 CheckFree Corp., Intuit Inc., Microsoft Corp. All rights reserved


Chapters 1 - 10




stadyn_image3.gif (6537 bytes)
stadyn_image3

stadyn_image4.gif (6028 bytes)
stadyn_image4

stadyn_image5.gif (4068 bytes)
stadyn_image5

 

 

Contents

1. Overview 51.1 Introduction 51.1.1 Design Principles 51.2 Open Financial Exchange at a Glance 71.2.1 Data Transport 71.2.2 Request and Response Model 81.3 Conventions 92. Structure 102.1 HTTP Headers 102.2 Open Financial Exchange Headers 112.2.1 The Meaning of Version Numbers 122.3 SGML Details 122.3.1 Compliance 122.3.2 Special Characters 122.4 Open Financial Exchange SGML Structure 132.4.1 Overview 132.4.2 Top Level 132.4.3 Messages 132.4.4 Message Sets and Version Control 142.4.5 Transactions 152.5 The Signon Message Set 162.5.1 Signon <SONRQ> <SONRS> 162.5.2 PIN Change <PINCHRQ> <PINCHRS> 192.5.3 Examples 202.6 External Data Support 202.7 Extensions to Open Financial Exchange 213. Common Aggregates, Elements, and Data Types 223.1 Common Aggregates 223.1.1 Identifying Financial Institutions and Accounts 223.1.2 Balance Records <BAL> 223.1.3 Error Reporting <STATUS> 233.2 Common Elements 243.2.1 Financial Institution Transaction ID <FITID> 243.2.2 Server-Assigned ID <SRVRTID> 243.2.3 Client-Assigned Transaction UID <TRNUID> 253.2.4 Token <TOKEN> 253.2.5 Transaction Amount <TRNAMT> 253.2.6 Memo <MEMO> 253.2.7 Date Start and Date End <DTSTART> <DTEND> 263.3 Common data types 263.3.1 Dates and Times 263.3.2 Amounts, Prices, and Quantities 283.3.3 Language 283.3.4 Basic data types 284. Security 294.1 Security Solutions 294.1.1 Determining Security Levels <OFXSEC> <TRANSPSEC> 294.2 Channel-Level Security 304.2.1 Security Requirements 304.2.2 Using SSL 3.0 in Open Financial Exchange 304.3 Application-Level Security 314.3.1 Requirements for Application-Layer Security 314.3.2 Using Application-level Encryption in Open Financial Exchange 325. International Support 335.1 Language and Encoding 335.2 Currency <CURDEF> <CURRENCY> <ORIGCURRENCY> 335.3 Country-Specific Tag Values 346. Data Synchronization 356.1 Overview 356.2 Background 356.3 Data Synchronization Approach 366.4 Data Synchronization Specifics 376.5 Conflict Detection and Resolution 396.6 Synchronization vs. 
Refresh 406.7 Typical Server Architecture for Synchronization 416.8 Typical Client Processing of Synchronization Results 436.9 Simultaneous Connections 446.10 Synchronization Alternatives 446.10.1 Lite Synchronization 446.10.2 Relating Synchronization and Error Recovery 456.11 Examples 467. FI Profile 487.1 Overview 487.1.1 Message Sets 487.1.2 Version Control 497.1.3 Batching and Routing 497.2 Profile Request 507.3 Profile Response 517.3.1 Message Set 527.3.2 Signon Realms 537.3.3 Status Codes 537.4 Profile Message Set Profile Information 548. Activation & Account Information 558.1 Overview 558.2 Approaches to User Sign-Up with Open Financial Exchange 558.3 Users and Accounts 568.4 Enrollment and Password Acquisition <ENROLLRQ> <ENROLLRS> 568.4.1 User IDs 578.4.2 Enrollment Request 578.4.3 Enrollment Response 598.4.4 Enrollment Status Codes 598.4.5 Examples 608.5 Account Information 608.5.1 Request <ACCTINFORQ> 618.5.2 Response <ACCTINFORS> 618.5.3 Account Information Aggregate <ACCTINFO> 628.5.4 Status Codes 628.5.5 Examples 638.6 Service Activation 638.6.1 Activation Request and Response 648.6.2 Service Activation Synchronization 668.6.3 Examples 668.7 Name and Address Changes <CHGUSERINFORQ> <CHGUSERINFORS> 678.7.1 <CHGUSERINFORQ> 678.7.2 <CHGUSERINFORS> 688.7.3 Status Codes 688.8 Signup Message Set Profile Information 699. Customer to FI Communication 709.1 The E-Mail Message Set 709.2 E-Mail Messages 709.2.1 Regular vs. Specialized E-Mail 719.2.2 Basic <MAIL> Aggregate 719.2.3 E-Mail <MAILRQ> <MAILRS> 719.2.4 E-Mail Synchronization <MAILSYNCRQ> <MAILSYNCRS> 729.2.5 Example 739.3 Get HTML Page 749.3.1 MIME Get Request and Response <GETMIMERQ> <GETMIMERS> 749.3.2 Example 759.4 E-Mail Message Set Profile Information 7610. 
Recurring Transactions 7710.1 Creating a Recurring Model 7710.2 Recurring Instructions <RECURRINST> 7710.2.1 Values for <FREQ> 7810.2.2 Examples 7910.3 Retrieving Transactions Generated by a Recurring Model 8010.4 Modifying and Canceling Individual Transactions 8010.5 Modifying and Canceling Recurring Models 8010.5.1 Examples 81

  1. Overview
  1.1 Introduction

Open Financial Exchange is a broad-based framework for exchanging financial data and instructions between customers and their financial institutions. It allows institutions to connect directly to their customers without requiring an intermediary.

Open Financial Exchange is an open specification that anyone can implement: any financial institution, transaction processor, software developer or other party. It uses widely accepted open standards for data formatting (such as SGML), connectivity (such as TCP/IP and HTTP), and security (such as SSL).

Open Financial Exchange defines the request and response messages used by each financial service as well as the common framework and infrastructure to support the communication of those messages. This specification does not describe any specific product implementation.

  1.1.1 Design Principles

The following principles were used in designing Open Financial Exchange:

l Broad Range of Financial Activities - Open Financial Exchange provides support for a broad range of financial activities. Open Financial Exchange 1.0 specifies the following services:

n Bank statement download

n Credit card statement download

n Funds transfers including recurring transfers

n Consumer payments, including recurring payments

n Business payments, including recurring payments

n Brokerage and mutual fund statement download, including transaction history, current holdings and balances

l Broad Range of Financial Institutions - Open Financial Exchange supports communication with a broad range of financial institutions (FIs), including:

n Banks

n Brokerage houses

n Merchants

n Processors

n Financial advisors

n Government agencies

l Broad Range of Front-End applications - Open Financial Exchange supports a broad range of front-end applications covering all types of financial activities running on all types of platforms, including Web-based applications.

l Extensible - Open Financial Exchange has been designed to allow the easy addition of new services. Future versions will include support for many new services.

l Open - This specification is publicly available. You can build client and server applications using the Open Financial Exchange protocols independent of any specific technology, product, or company.

l Multiple Client Support - Open Financial Exchange allows a user to use multiple client applications to access the same data at a financial institution. With the popularity of the World Wide Web, customers are increasingly likely to use multiple applications—either desktop-based or Web-based—to perform financial activities. For example, a customer can track personal finances at home with a desktop application and occasionally pay bills while at work with a Web-based application. The use of data synchronization to support multiple clients is a key innovation in Open Financial Exchange.

l Robust - Open Financial Exchange will be used for executing important financial transactions and for communicating important financial information. Assuring users that transactions are executed and information is correct is crucial. Open Financial Exchange provides robust protocols for error recovery.

l Secure - Open Financial Exchange provides a framework for building secure online financial services. In Open Financial Exchange, security encompasses authentication of the parties involved, as well as secrecy and integrity of the information being exchanged.

l Batch & Interactive - The design of request and response messages in Open Financial Exchange is for use in either batch or interactive style of communication. Open Financial Exchange provides for applying a single authentication context to multiple requests in order to reduce the overhead of user authentication.

l International Support - Open Financial Exchange is designed to supply financial services throughout the world. It supports multiple currencies, country-specific extensions, and different forms of encoding such as UNICODE.

l Platform Independent - Open Financial Exchange can be implemented on a wide variety of front-end client devices, including those running Windows 3.1, Windows 95, Windows NT, Macintosh, or UNIX. It also supports a wide variety of Web-based environments, including those using HTML, Java, JavaScript, or ActiveX. Similarly on the back-end, Open Financial Exchange can be implemented on a wide variety of server systems, including those running UNIX, Windows NT, or OS/2.

l Transport Independent - Open Financial Exchange is independent of the data communication protocol used to transport the messages between the client and server computers. Open Financial Exchange 1.0 will use HTTP.

  1.2 Open Financial Exchange at a Glance

Open Financial Exchange is designed as a client and server system. An end-user uses a client application to communicate with a server at a financial institution. Communication takes the form of requests from the client to the server and responses from the server back to the client.

Open Financial Exchange uses the Internet Protocol (IP) suite to provide the communication channel between a client and a server. IP protocols are the foundation of the public Internet and a private network can also use them.

  1.2.1 Data Transport

Clients use the HyperText Transfer Protocol (HTTP) to communicate with an Open Financial Exchange server. The same HTTP protocol is used throughout the World Wide Web. In principle, a financial institution can use any off-the-shelf web server to implement its support for Open Financial Exchange.

To communicate by means of Open Financial Exchange over the Internet, the client must establish an Internet connection. This connection can be a dial-up Point-to-Point Protocol (PPP) connection to an Internet Service Provider (ISP) or a connection over a local area network that has a gateway to the Internet.

Clients use the HTTP POST command to send a request to the previously acquired Uniform Resource Locator (URL) for the desired financial institution. The URL presumably identifies a Common Gateway Interface (CGI) or other process on an FI server that can accept Open Financial Exchange requests and produce a response.

The POST identifies the data as being of type application/x-ofx. Use application/x-ofx as the return type as well. Fill in other fields per the HTTP 1.0 spec. Here is a typical request:

POST http://www.fi.com/ofx.cgi HTTP/1.0
User-Agent:MyApp 5.0
Content-Type: application/x-ofx
Content-Length: 1032

OFXHEADER:100
DATA:OFXSGML
VERSION:100
SECURITY:1
ENCODING:USASCII

<OFX>
... Open Financial Exchange requests ...
</OFX>

A blank line defines the separation between the HTTP headers and the start of the actual Open Financial Exchange data. A blank line also separates the Open Financial Exchange headers and the actual response. (See Chapter 2 for more information.)

The structure of a response is similar to the request, with the first line containing the standard HTTP result, as shown next. The content length is given in bytes.

HTTP 1.0 200 OK
Content-Type: application/x-ofx
Content-Length: 8732

OFXHEADER:100
DATA:OFXSGML
VERSION:100
SECURITY:1
ENCODING:USASCII

<OFX>
... Open Financial Exchange responses ...
</OFX>
  1.2.2 Request and Response Model

The basis for Open Financial Exchange is the request and response model. One or more requests can be batched in a single file. This file typically includes a signon request and one or more service-specific requests. An FI server will process all of the requests and return a single response file. This batch model lends itself to Internet transport as well as other off-line transports. Both requests and responses are plain text files, formatted using a grammar based on Standard Generalized Markup Language (SGML). Open Financial Exchange is syntactically similar to HyperText Markup Language (HTML), featuring tags to identify and delimit the data. The use of a tagged data format allows Open Financial Exchange to evolve over time while continuing to support older clients and servers.

Here is a simplified example of an Open Financial Exchange request file. (This example does not show the Open Financial Exchange headers and the indentation is only for readability.) For complete details, see the more complete examples throughout this specification.

<OFX> <!-- Begin request data --> <SIGNONMSGSRQV1> <SONRQ> <!-- Begin signon --> <DTCLIENT>19961029101000 <!-- Oct. 29, 1996, 10:10:00 am --> <USERID>123-45-6789 <!-- User ID (that is, SSN) --> <USERPASS>MyPassword <!-- Password (SSL encrypts whole) --> <LANGUAGE>ENG <!-- Language used for text --> <FI> <!-- ID of receiving institution --> <ORG>NCH <!-- Name of ID owner --> <FID>1001 <!-- Actual ID --> </FI> <APPID>MyApp <APPVER>0500 </SONRQ> <!-- End of signon --> </SIGNONMSGSRQV1> <BANKMSGSRQV1> <STMTTRNRQ> <!-- First request in file --> <TRNUID>1001 <STMTRQ> <!-- Begin statement request --> <BANKACCTFROM> <!-- Identify the account --> <BANKID>121099999 <!-- Routing transit or other FI ID --> <ACCTID>999988 <!-- Account number --> <ACCTTYPE>CHECKING <!-- Account type --> </BANKACCTFROM> <!-- End of account ID --> <INCTRAN> <!-- Begin include transaction --> <INCLUDE>Y <!-- Include transactions --> </INCTRAN> <!-- End of include transaction --> </STMTRQ> <!-- End of statement request --> </STMTTRNRQ> <!-- End of first request --> </BANKMSGSRQV1></OFX> <!-- End of request data -->

The response format follows a similar structure. Although a response such as a statement response contains all of the details of each transaction, each element is identified using tags.

The key rule of Open Financial Exchange syntax is that each tag is either an element or an aggregate. Data follows its element tag. An aggregate tag begins a compound tag sequence, which must end with a matching tag; for example, <AGGREGATE> ... </AGGREGATE>.

The actual file that Open Financial Exchange sends contains no extra white space between tags.

  1.3 Conventions

The conventions used in the detailed descriptions include:

Tag Description
<REQUIREDTAG> Required tag (1 or more)
<REQUIREDTAG2> Required tag that occurs only once
<OPTIONALTAG> Optional tag; this particular one can occur multiple times (0 or more)
<SPECIFIC> Values are A, B, and C
<ALPHAVALUE> Takes an alphanumeric value up to 32 characters, A-32
  2. Structure

This chapter describes the basic structure of an Open Financial Exchange request and response. Structure includes headers, basic syntax, and the Signon request and response. This chapter also describes how Open Financial Exchange encodes external data, such as bit maps.

Open Financial Exchange data consists of some headers plus one or more Open Financial Exchange data blocks. Each block consists of a signon message and zero or more additional messages. When sent over the internet using HTTP, standard HTTP and multi-part MIME headers and formats surround the Open Financial Exchange data. A simple file that contained only Open Financial Exchange data would have the following form:

HTTP headers
MIME type application/x-ofx
Open Financial Exchange headers
Open Financial Exchange SGML block 1

A more complex file that contained multiple Open Financial Exchange data blocks and additional Open Financial Exchange data would have this form:

HTTP headers
MIME type multipart/x-mixed-replace; boundary =--boundary-
---boundary---
MIME type application/x-ofx
	Open Financial Exchange headers
	Open Financial Exchange SGML block 1
	Open Financial Exchange SGML block 2
---boundary---
	MIME type image/jpeg
		FI logo
  2.1 HTTP Headers

Data delivered by way of HTTP places the standard HTTP result code on the first line. HTTP defines a number of status codes. Servers can return any standard HTTP result. However, FIs should expect clients to collapse these codes into the following three cases:

Code Meaning Action
200 OK The request was processed and a valid Open Financial Exchange result is returned.
400s Bad request The request was invalid and was not processed. Clients will report an internal error to the user.
500s Server error The server is unavailable. Clients should advise the user to retry shortly.

NOTE: Open Financial Exchange returns a code 400 only if it cannot parse the file. Open Financial Exchange handles content errors such as wrong PIN, or invalid account, by returning a valid Open Financial Exchange response along with code 200.

Open Financial Exchange requires the following HTTP standard headers:

Code Value Explanation
Content-type application/x-ofx The MIME type for Open Financial Exchange
Content-length length Length of the data after removing HTTP headers


When responding with multi-part MIME, the main type will be multipart/x-mixed-replace;
one of the parts will use application/x-ofx.

  2.2 Open Financial Exchange Headers

Open Financial Exchange is intended for use with a variety of transports and to provide sufficient version control capabilities for future expansion. To support this goal, the contents of an Open Financial Exchange file consist of a simple set of headers followed by contents defined by that header. "File format" means the entire content after removal of any transport headers. For the HTTP transport described in this document, this means the content without the HTTP and MIME headers.

The Open Financial Exchange headers are in a simple tag:value syntax and terminated by a blank line. Open Financial Exchange always sends headers unencrypted, even if there is application-level encryption in use for the remaining contents. The first entry will always be OFXHEADER with a version number. This entry will help identify the contents as an Open Financial Exchange file, and provides the version of the Open Financial Exchange headers that follow (not of the content itself). For example:

OFXHEADER:100

This document defines version 1.0 of the headers to contain at least the following additional tags:

DATA:OFXSGML
VERSION:100
SECURITY:
ENCODING:
CHARSET:
COMPRESSION:
OLDFILEUID:
NEWFILEUID:

The data tag identifies the contents as being in OFX SGML form. VERSION identifies the version type as OFXSGML data. In the case of OFXSGML, it translates to the version of the Document Type Definition (DTD) that it uses for parsing. The ENCODING and CHARSET tags define the interpretation of the character data. See Chapter 5, "International Support" for more information on these tags. Chapter 4 describes the security tag. A future version of this specification will define compression.

Open Financial Exchange uses OLDFILEUID and NEWFILEUID to support error recovery. They are not present when clients are not requesting error recovery. (See Chapter 6, "Data Synchronization")

A blank line follows the last tag. Then (for type OFXSGML), the SGML-readable data begins with the <OFX> tag.

NOTE: Here, VERSION provides the overall version of the DTD. The <OFX> block describes the specific message set versions used, shown later in this chapter.

  1. The Meaning of Version Numbers

The OFXHEADER value should only change its major number if an existing client is unable to process the new header. This can occur because of a complete syntax change in a header, or a significant change in the semantics of an existing tag — not the entire response. You can add new tags as long as clients can function without understanding them.

You should add new values for a data tag only when you introduce an entirely new syntax. In the case of OFXSGML, a new syntax would have to be non-SGML compliant to warrant a new data value. It is possible that there will be more than one syntax in use at the same time to meet different needs.

The intent of the header version tag is to identify syntactic changes. In the case of OFXSGML, this corresponds to the DTD. Purely for identification purposes, each change will increment the minor number of the version tag. If you introduce an incompatible change so that an older DTD can not parse the file, the major number will change. See the general discussion of message sets and version control, later in this chapter.

  1. SGML Details
  2. Compliance

SGML is the basis for Open Financial Exchange. There is a DTD that formally defines the SGML wire format. However, Open Financial Exchange is not completely SGML-compliant because the specification allows unrecognized tags to be present. It requires clients and servers to skip over the unrecognized material. That is, if <XYZ>qqq</XYZ> appeared and a client or server cannot recognize <XYZ>, it should ignore that tag and its enclosed data. A fully-compliant SGML parser would not validate an Open Financial Exchange document if it contained any tags that the DTD does not define.

Although SGML is the basis for the specification, and the specification is largely compliant with SGML, do not assume Open Financial Exchange supports any SGML features not documented in this specification. The intent is to allow parsing to be as simple as possible, while retaining compatibility with the SGML world.

  1. Special Characters

The following characters are special to SGML. Use the given alternative sequence to represent them:

Character Escape sequence
< (less than) &lt;
> (greater than) &gt;
& (ampersand) &amp;

For example, the string "AT&amp;T" encodes "AT&T."

A special case applies in specific tags that can accept HTML-formatted strings, such as e-mail records. These accept SGML marked section syntax to hide the HTML from the Open Financial Exchange parser. You must prefix strings with "<![CDATA[" and suffix them with "]]>". Within these bounds, treat the above characters literally without an escape. See Chapter 9 for an example.

  1. Open Financial Exchange SGML Structure
  2. Overview

Open Financial Exchange hierarchically organizes request and response blocks:

Top Level <OFX>
Message Set and Version <XXXMSGSVn>
Synchronization Wrappers <YYYSYNCRQ>, <YYYSYNCRS>
Transaction Wrappers <YYYTRNRQ>, <YYYTRNRS>
Specific requests and responses

The following sections describe each of these levels.

  1. Top Level

An Open Financial Exchange request or response has the following top-level form:

Tag Description
<OFX> Opening tag
... Open Financial Exchange requests or responses ... 0 or more transaction requests and responses inside appropriate message set aggregates
</OFX> Closing tag for the Open Financial Exchange record

This chapter specifies the order of requests and responses.

A single file can contain multiple <OFX> ... </OFX> blocks. A typical use of multiple blocks is to request in a single file information associated with different users.

  1. Messages

A message is the unit of work in Open Financial Exchange. It refers to a request and response pair, and the status codes associated with that response. For example, the message to download a bank statement consists of the request <STMTRQ> and the response <STMTRS>. In addition, with the exception of the signon message, each message includes a transaction wrapper. These aggregates add a transaction unique ID <TRNUID>, and for responses, a <STATUS> aggregate, to the basic request and response.

For messages subject to synchronization (see Chapter 6), a third layer of aggregates is also part of a message definition: a synchronization request and response. These add a token and, in some cases, other information to the transactions.

Open Financial Exchange uses the following naming where the XXX message includes:

In a few cases, a small number of related basic requests and responses share a transaction and synchronization wrapper. The term message will still apply to each request and response; only the naming scheme will not hold in those cases.

  1. Message Sets and Version Control

Message sets are collections of messages. Generally they form all or part of what a user would consider a service, something for which they might have signed up, such as "banking." Message sets are the basis of version control, routing, and security. They are also the basis for the required ordering in Open Financial Exchange files.

Within an Open Financial Exchange block, Open Financial Exchange organizes messages by message set. A message set can appear at most once within an Open Financial Exchange block. All messages from a message set must be from the same version of that message set.

For each message set of XXX and version n, there exists an aggregate named <XXXMSGSVn>. (Compare with <XXXMSGSETVn> in Chapter 7.) All of the messages from that message set must be inside the appropriate message set aggregate. In the following example, the Open Financial Exchange block contains a signon request inside the signon message set, and two statement requests and a transfer request inside the bank message set.

<OFX>
	<SIGNONMSGSRQV1>	<!-- Signon message set -->
		<SONRQ>				<!-- Signon message -->
		...
		</SONRQ>
	</SIGNONMSGSRQV1>

	<BANKMSGSRQV1>		<!-- Banking message set -->
		<STMTTRNRQ>		<!-- Statement request -->
		...
		</STMTTRNRQ>
		<STMTTRNRQ>		<!-- Another stmt request -->
		...
		</STMTTRNRQ>
		<INTRATRNRQ>		<!-- Intra-bank transfer request -->
		...
		</INTRATRNRQ>
	</BANKMSGSRQV1>
</OFX>

Message sets, if used at all, must appear in the following order:

The definition of each message set can further prescribe an order of its messages within that message set.

  1. Transactions

Other than the signon message, each request is made as a transaction. Transactions contain a client-assigned globally unique ID, optional client-supplied pass-back data, and then the record for the specific request. A transaction similarly wraps each response. The response transaction returns the client ID sent in the request, along with a status message, the pass-back data if present, and the specific response record. This technique allows a client to track responses against requests.

The <STATUS> aggregate, defined in Chapter 3, provides feedback on the processing of the request. If the <SEVERITY> of the status is ERROR, the server provides no specific response record. Otherwise, the response will be complete even though some warning might have occurred.

Clients can send additional information in <CLTCOOKIE> that servers will return in the response. This allows clients that do not maintain state, and thus do not save TRNUIDs, to cause some additional descriptive information to be present in the response. For example, a client might identify a request as relating to a user or a spouse.

In some countries some transactions require a customer-supplied authorization number for each transaction. In those countries, the <TAN> element provides the means to pass this information to servers. As Open Financial Exchange is implemented in each country, the specification will define the specific requirements for the use of <TAN> in each country.

A typical request is as follows:

Tag Description
<XXXTRNRQ> Transaction-request aggregate
<TRNUID> Client-assigned globally unique ID for this transaction, trnuid
<CLTCOOKIE> Data to be echoed in the transaction response A-32
<TAN> Transaction authorization number; used in some countries with some types of transactions. Country-specific documentation will define messages that require a TAN, A-80
specific request Aggregate for the specific request
</XXXTRNRQ>  

A typical response is as follows:

Tag Description
<XXXTRNRS> Transaction-response aggregate
<TRNUID> Client-assigned globally unique ID for this transaction, trnuid
<CLTCOOKIE> Client-provided data, REQUIRED if provided in request, A-32
<STATUS> Status aggregate
</STATUS>  
response record Aggregate for the specific response
</XXXTRNRS>  
  1. The Signon Message Set

The Signon message set includes the signon message and the PIN change message, and must appear in that order. The <SIGNONMSGSRQV1> and <SIGNONMSGSRSV1> aggregates wrap the message.

  1. Signon <SONRQ> <SONRS>

The signon record identifies and authenticates a user to an FI. It also includes information about the application making the request, because some services might be appropriate only for certain clients. Every Open Financial Exchange request contains exactly one <SONRQ>. Every response must contain exactly one <SONRS> record.

Use of Open Financial Exchange presumes that FIs authenticate each customer and then give the customer access to one or more accounts or services. If passwords are specific to individual services or accounts, a separate Open Financial Exchange request will be made for each distinct user ID or password required. This will not necessarily be in a manner visible to the user. Note that some situations, such as joint accounts or business accounts, will have multiple user IDs and multiple passwords that can access the same account.

FIs assign user IDs for the customer. It can be the customer's social security number, but the client will not make any assumptions about the syntax of the ID, add check-digits, or do similar processing.

To improve server efficiency in handling a series of Open Financial Exchange request files sent over a short period of time, clients can request that a server return a <USERKEY> in the signon response. If the server provides a user key, clients will send the <USERKEY> instead of the user ID and password in subsequent sessions, until the <USERKEY> expires. This allows servers to authenticate subsequent requests more quickly.

The client returns <SESSCOOKIE> if the server sent one in a previous <SONRS>. Servers can use this value to track client usage but cannot assume that all requests come from a single client, nor can they deny service if they did not expect the returned cookie. Use of a backup file, for example, would lead to an unexpected <SESSCOOKIE> value that should nevertheless not stop a user from connecting.

Servers can request that a consumer change his or her password by returning status code 15000. Servers should keep in mind that only one status code can be returned. If the current signon response status should be 15500 (invalid ID or password), the request to change password will need to wait until an otherwise successful signon is achieved.

  1. Record Request <SONRQ>
Tag Description
<SONRQ> Record-request aggregate
<DTCLIENT> Date and time of the request from the client computer, datetime
<USERID> User identification string. Use <USERID> & <USERPASS>, or <USERKEY>, but not both; A-32
<USERPASS> User password on server - either <USERID> & <USERPASS> are used, or <USERKEY>, but not both; A-32
<USERKEY> Login using previously authenticated context - use <USERID> & <USERPASS>, or <USERKEY>, but not both; A-64
<GENUSERKEY> Request server to return a USERKEY for future use, Boolean
<LANGUAGE> Requested language for text responses, language
<SESSCOOKIE> Session cookie, value received in previous <SONRS>, not sent if first login or if none sent by FI A-1000
<FI> Financial-Institution-identification aggregate
</FI>  
<APPID> ID of client application, A-5
<APPVER> Version of client application, N-4 (6.00 encoded as 0600)
</SONRQ>  
  1. Response <SONRS>
Tag Description
<SONRS> Record-response aggregate
<STATUS> Status aggregate, see list of possible code values
<DTSERVER> Date and time of the server response, datetime
<USERKEY> Use this user key instead of USERID and USERPASS for subsequent requests. TSKEYEXPIRE can limit lifetime
<TSKEYEXPIRE> Date and time that USERKEY expires
<LANGUAGE> Language used in text responses, language
<DTPROFUP> Date and time of last update to profile information for any service supported by this FI (see Chapter 7), datetime
<DTACCTUP> Date and time of last update to account information (see Chapter 8), datetime
<FI> Financial-Institution-identification aggregate
</FI>  
<SESSCOOKIE> Session cookie that the client should return on the next <SONRQ>, A-1000
</SONRS>  

List of status code values for the <CODE> element of <STATUS>:

Value Meaning
0 Success (INFO)
2000 General error (ERROR)
15000 Must change PIN (INFO)
15500 Signon (for example, user ID or password) invalid (ERROR)
15501 Customer account already in use (ERROR)
15502 PIN Lockout (ERROR)
  1. Financial Institution ID <FI>

Some service providers support multiple FIs, and assign each FI an ID. The signon allows clients to pass this information along, so that providers will know to which FI the user is actually doing a signon.

Tag Description
<FI> FI-record aggregate
<ORG> Organization defining this FI name space, A-32
<FID> Financial Institution ID (unique within <ORG>), A-32
</FI>  
  1. PIN Change <PINCHRQ> <PINCHRS>

The signon sends a request to change a customer password as a separate request. The transaction request <PINCHTRNRQ> aggregate contains <PINCHRQ>. Responses are also inside transaction responses <PINCHTRNRS>.

Password changes pose a special problem for error recovery. If the client does not receive a response, it does not know whether the password change was successful or not. Open Financial Exchange recommends that servers accept either the old password or the new password on the connection following the one containing a password change. The password used becomes the new password.

Tag Description
<PINCHRQ> PIN-change-request aggregate
<USERID> User identification string. Often a social security number, but if so, does not include any check digits, A-32
<NEWUSERPASS> New user password, A-32
</PINCHRQ>  
Tag Description
<PINCHRS> PIN-change-response aggregate
<USERID> User identification string. Often a social security number, but if so, does not include any check digits, A-32
<DTCHANGED> Date and time the password was changed, datetime
</PINCHRS>  
  1. Status Code Values for the <CODE> Element of <STATUS>
Value Meaning
0 Success (INFO)
2000 General error (ERROR)
15503 Could not change PIN (ERROR)
  1. Examples

User requests a password change:

<PINCHTRNRQ>
	<TRNUID>888
	<PINCHRQ>
		<USERID>123456789
		<NEWUSERPASS>5321
	</PINCHRQ>
</PINCHTRNRQ>

The server responds with:

<PINCHTRNRS>
	<TRNUID>888
	<STATUS>
		<CODE>0
		<SEVERITY>INFO
	</STATUS>
	<PINCHRS>
		<USERID>123456789
	</PINCHRS>
</PINCHTRNRS>
  1. External Data Support

Some data, such as binary data, cannot be easily sent directly within SGML. For these situations, the specification will define a tag that contains a reference to some external data. The way that clients pick up the external data depends on the transport used. For the HTTP-based transport described in this document, servers can send the data in one of two ways:

For example, to retrieve a logo, a <GETMIMERS> might answer a <GETMIMERQ> as follows:

<GETMIMERS>
	<URL>https://www.fi.com/xxx/yyy/zzz.jpg
</GETMIMERS>

If the file sent includes the same response using multi-part MIME, clients will assume it has the local file, zzz.jpg.

  1. Extensions to Open Financial Exchange

An organization that provides a customized client and server that communicate by means of Open Financial Exchange might wish to add new requests and responses or even specific elements to existing requests and responses. To ensure that each organization can extend the specification without the risk of conflict, Open Financial Exchange defines a style of tag naming that lets each organization have its own name space.

Organizations can register a specific tag name prefix. (The specific procedure or organization to manage this registration will be detailed at a later time.) If an organization registers "ABC," then they can safely add new tags named <ABC.SOMETHING> without risk of conflicting with tags added by other organizations.

The extensions are not considered proprietary. An organization is free to publish their extensions and encourage client and server implementers to support them.

All tag names that do not contain a period (.) are reserved for use in future versions of the core Open Financial Exchange specification.

  1. Common Aggregates, Elements, and Data Types
  2. Common Aggregates

This section describes aggregates used in more than one service of Open Financial Exchange (for example, investments and payments).

  1. Identifying Financial Institutions and Accounts

Open Financial Exchange does not provide a universal space for identifying financial institutions, accounts, or types of accounts. The way to identify an FI and an account at that FI depends on the service. For information about service-specific ID aggregates, see Chapters 11, 12, and 13 on banking, payments, and investments.

  1. Balance Records <BAL>

Several responses allow FIs to send an arbitrary set of balance information as part of a response, for example a bank statement download. FIs might want to send information on outstanding balances, payment dates, interest rates, and so forth. Balances can report the date the given balance reflects in <DTASOF>.

Tag Description
<BAL> Balance-response aggregate
<NAME> Balance name, A-20
<DESC> Balance description, A-80
<BALTYPE> Balance type.
DOLLAR = dollar (value formatted DDDDcc)
PERCENT = percentage (value formatted XXXX.YYYY)
NUMBER = number (value formatted as is)
<VALUE> Balance value.
Interpretation depends on <BALTYPE> field, N-20
<CURRENCY> If dollar formatting, can optionally include currency
<DTASOF> Effective date of the given balance, datetime
</BAL>  
  1. Error Reporting <STATUS>

To provide as much feedback as possible to clients and their users, Open Financial Exchange defines a <STATUS> aggregate. The most important element is the code that identifies the error. Each response defines the codes it uses. Codes 0 through 2999 have common meanings in all Open Financial Exchange transactions. Codes from 3000 and up have meanings specific to each transaction.

Tag Description
<STATUS> Error-reporting aggregate.
<CODE> Error code, N-6
<SEVERITY> Severity of the error:
INFO = Informational only
WARN = Some problem with the request occurred but valid response still present
ERROR = A problem severe enough that response could not be made
<MESSAGE> A textual explanation from the FI. Note that clients will generally have messages of their own for each error ID. Use this tag only to provide more details or for the General errors.
</STATUS>  

 

stadyn_image6.gif (4356 bytes)
stadyn_image6

stadyn_image7.gif (1702 bytes)
stadyn_image7

For general errors, the server can respond with one of the following <CODE> values. However, not all codes are possible in a specific context.

Code Meaning
0 Success (INFO)
2000 General error (ERROR)
2021 Unsupported version (ERROR)

NOTE: Clients will generally have error messages based on <CODE>. Therefore, do not use <MESSAGE> to replace that text. Use <MESSAGE> only to explain an error not well described by one of the defined CODEs, or to provide some additional information.

  1. Common Elements

This section defines elements used in several services of Open Financial Exchange. The format of the value is either alphanumeric (A-n) or numeric (N-n) with a maximum length n; or as a named type. Section 3.3 describes the named types.

  1. Financial Institution Transaction ID <FITID>

Format: A-255

An FI assigns an <FITID> to uniquely identify a transaction. Its primary purpose is to allow a client to detect duplicate responses. Open Financial Exchange intends <FITID> for use in statement download applications, where every transaction requires a unique ID; not just those that are client-originated or server-originated.

FITIDs must be unique within the scope of the requested transactions (that is, within an account) but need not be sequential or even increasing. Clients should be aware that FITIDs are not unique across FIs. If a client performs the same type of request within the same scope at two different FIs, clients will need to use FI + account + <FITID> as a unique key in a client database.

Usage: Bank statement download, investment statement download

  1. Server-Assigned ID <SRVRTID>

Format: A-10

A <SRVRTID> is a server-assigned ID. It should remain constant throughout the lifetime of the object on the server. The client will consider the SRVRTID as its "receipt" or confirmation and will use this ID in any subsequent requests to change, delete, or inquire about this object.

Where the context allows, it is possible for a server to use the same value for a given server object for both <SRVRTID> and <FITID>, but the client will not know this. SRVRTIDs need be unique only within the scope of the requests and responses they apply to, such as an account number. Like <FITID>, a <SRVRTID> is not unique across FIs and clients might need to use FI + <SRVRTID> if a client requires a unique key.

Usage: Payments, Banking

  1. Client-Assigned Transaction UID <TRNUID>

Format: A-36

Open Financial Exchange uses TRNUIDs to identify transactions, specifically <XXXTRNRQ>. Clients expect a server to return the same <TRNUID> in the corresponding response and can use this to match up requests and responses. Servers can use TRNUIDs to reject duplicate requests. Because multiple clients might be generating requests to the same server, transaction IDs need to be unique across clients. Thus, <TRNUID> must be a globally unique ID.

The Open Software Foundation Distributed Computing Environment standards specify a 36-character hexadecimal encoding of a 128-bit number and an algorithm to generate it. Clients are free to use their own algorithm, to use smaller TRNUIDs, or to relax the uniqueness requirements if in their particular application it makes sense. However, it is RECOMMENDED that clients allow for the full 36 characters in responses to work better with other clients.

Usage: All services

  1. Token <TOKEN>

Format: A-10

Open Financial Exchange uses <TOKEN> as part of data synchronization requests to identify the point in history that the client has already received data, and in responses to identify the server's current end of history. See Chapter 6, "Data Synchronization," for more information.

<TOKEN> is unique within an FI and the scope of the synchronization request. For example, if the synchronization request includes an account ID, the <TOKEN> needs to be unique only within an account. Servers are free to use a <TOKEN> that is unique across the entire FI. Clients must save separate <TOKEN>s for each account, FI, and type of synchronization request.

Usage: All synchronization requests and responses

  1. Transaction Amount <TRNAMT>

Format: Amount

Open Financial Exchange uses <TRNAMT> in any request or response that reports the total amount of an individual transaction.

Usage: Bank statement download, investment statement download, payments

  1. Memo <MEMO>

Format: A-255

A <MEMO> provides additional information about a transaction.

Usage: Bank statement download, investment statement download, payments, transfers

  1. Date Start and Date End <DTSTART> <DTEND>

Format: Datetime

Open Financial Exchange uses these tags in requests to provide guidance to the FI about the range of response that is desired. It also uses them in responses to let clients know what the FI was actually able to produce.

In requests, the following rules apply:

In responses, the following rules apply:

In all cases, servers are REQUIRED to use a "system add datetime" as the basis for deciding which details match the requested date range. For example, if an FI posts a transaction dated Jan 3 to a user's account on Jan 5, and a client connects on Jan 4 and again on Jan 6, the server is REQUIRED to return that Jan 3 dated transaction to the client when it calls on Jan 6.

Usage: Bank statement download, investment statement download

  1. Common data types
  2. Dates and Times
  3. Basic Format

There is one format for representing dates, times, and time zones. The complete form is:

YYYYMMDDHHMMSS.XXX[gmt offset:tz name]

For example, "19961005132200.124[-5:EST]" represents October 5, 1996, at 1:22 and 124 milliseconds p.m., in Eastern Standard Time. This is the same as 6:22 p.m. Greenwich Mean Time (GMT).

Tags specified as type date and generally starting with the letters "DT" will accept a fully formatted date-time-timezone as specified above. They will also accept values with fields omitted from the right. They assume the following defaults if a field is missing:

Open Financial Exchange identifies elements that require a time as having type timestamp and their tag name will start with the prefix TS. The timezone and milliseconds are still optional, and will default to GMT.

Take care when specifying an ending date without a time. If the last transaction returned for a bank statement download was Jan 5 1996 10:46 a.m. and if the <DTEND> was given as just Jan 5, the transactions on Jan 5 would be resent. If results are only available daily, then just using dates and not times will work correctly.

NOTE: Open Financial Exchange does not require servers or clients to use the full precision specified. However, they are REQUIRED to accept any of these forms without complaint.

Some services extend the general notion of a date by adding special values, such as "TODAY." These special values are called "smart dates." Specific requests indicate when to use these extra values, and list the tag as having a special data type.

  1. Time Zone Issues

Several issues arise when a customer and the FI are not in the same time zone, or when a customer moves a computer into new time zones. In addition, it is generally unsafe to assume that computer users have correctly set their time or timezone.

Although most transactions are not sensitive to the exact time, they often are sensitive to the date. In some cases, time zone errors lead to actions occurring on a different date than intended by the customer. For this reason, servers should always use a complete local time plus GMT offset in any datetime values in a response. If a customer's request is for 5 p.m. EST, and a server in Europe responds with 1 a.m. MET the next day, a smart client can choose to warn the customer about the date shift.

Clients that maintain local state, especially of long-lived server objects, should be careful how they store datetime values. If a customer initiates a repeating transaction for 5 p.m. EST, then moves to a new time zone, the customer might have intended that the transaction remain 5 p.m. in the new local time, requiring a change request to be sent to the server. If, however, they intended it to remain fixed in server time, this would require a change in the local time stored in the client.

  1. Amounts, Prices, and Quantities
  2. Positive and Negative Signs

Unless otherwise noted in the specification, Open Financial Exchange always signs amounts and quantities from the perspective of the customer. Some typically negative amounts:

Some typically positive amounts:

Amount: All amount-valued tags are sent with a decimal point or comma, as in "XXXX.XX." There should not be any punctuation separating thousands, millions, and so forth. The maximum value accepted depends on the client.

Quantity: Use decimal notation.

Unitprice: Use decimal notation. Unless specifically noted, prices should always be positive.

Rate: Use decimal notation, with the rate specified out of 100%. For example, 5.2 is 5.2%.

Some services define special values, such as INFLATION, which you can use instead of a designated value. Open Financial Exchange refers to these as "smart types," and identifies them in the specification.

  1. Language

Open Financial Exchange identifies human-readable language — for such things as status messages and e-mail — with a three-letter code based on ISO-639.

  1. Basic data types

Boolean: Y = yes or true, N = no or false.

URL: String form of a World Wide Web Uniform Resource Location. It should be fully qualified including protocol, host, and path.

  1. Security
  2. Security Solutions

Open Financial Exchange carries financial information over the Internet in such a way to provide privacy, message integrity, and authentication for applications at the appropriate level. Each service within Open Financial Exchange requires a certain level of security. Online banking and payments require strong secrecy, whereas stock quotations consist of publicly available information and consequently have a much weaker secrecy requirement.

Some Internet protocols, such as HTTPS (which uses Secure Socket Layer version 3, SSLv3), provide channel-level security. When the security requirement exceeds that provided by the channel, you must use an application-level protocol.

To address these various needs, Open Financial Exchange allows a range of security solutions. Open Financial Exchange 1.0 supports online banking and payment functions for which strong channel security is currently appropriate. Future releases will support a wider array of services, some of which will require more elaborate trust models. Application-level protection will secure the latter.

Open Financial Exchange security properties include:

  1. Determining Security Levels <OFXSEC> <TRANSPSEC>

Two elements in the FI profile, <OFXSEC> and <TRANSPSEC>, contain the security level a client should use to communicate with a server.

The valid values for <OFXSEC> are as follows:

Type Description
NONE No application level security
APPSEC Use application level security

The <TRANSPSEC> element value is Boolean. If the value is YES, use channel-level security.

  1. Channel-Level Security

Secure Socket Layer version 3 (SSLv3) provides channel-level security in Open Financial Exchange. SSLv3 provides confidentiality, message integrity, and implicit authentication. In Open Financial Exchange 1.0, channel-level security using SSLv3 is the primary form of security.

  1. Security Requirements

Open Financial Exchange provides a method to exchange financial information over public networks. This necessitates strong security facilities and careful protocol design. The most commonly used and trusted facility for accomplishing many of these goals is SSL. The following sub-sections describe the most prominent requirements for security and how Secure Socket Layer (SSL) addresses these.

  1. Privacy, Authentication, and Message Integrity

SSL provides a range of strong encryption methods for ensuring confidentiality, and strong measures to ensure that messages are not altered as they propagate over the Internet. User authentication is usually addressed at the application layer, not within SSL. Servers are configured with public key certificates that client application software verifies. This provides some measure of server authentication. Testing certificate revocation lists is not commonly performed. However, as these facilities emerge, client software will be written to support this need.

  1. Facilities for Authorization

Open Financial Exchange messages typically provide user ID and password so that a service provider can authenticate the user. Once a system authenticates a user, the service provider must ensure that the user is authorized to perform the requested actions. For example, the service provider must decide if the specified user is authorized to perform a transfer from the specified account. The service provider must also determine whether the user has exceeded allowed limits on withdrawals, whether the activity on this account is unusual given past history, and other context-sensitive issues.

  1. Using SSL 3.0 in Open Financial Exchange

SSL version 3.0 provides a set of widely and commonly accepted methods for securing Internet transactions. These common methods within SSL are called CipherSuites. You can secure applications appropriately within SSL by specifying an ordered sequence of preferred CipherSuites (highest preference listed first). Servers select the strongest supported CipherSuite from the list provided by the client.

NOTE: Passing username and password pairs in a weakly encrypted channel exposes this information to cryptographic attack. When implementing Open Financial Exchange, use the strongest available ciphers.

You should not use the following CipherSuites because they are vulnerable to man-in-the-middle attacks during Open Financial Exchange message exchanges:

  • SSL_DH_anon_EXPORT_WITH_RC4_40_MD5
  • SSL_DH_anon_WITH_RC4_128_MD5
  • SSL_DH_anon_EXPORT_WITH_DES40_CBC_SHA
  • SSL_DH_anon_WITH_DES_CBC_SHA
  • SSL_DH_anon_WITH_3DES_EDE_CBC_SHA

Setting tags to enable channel-level security in the FI profile advises the Open Financial Exchange application to use this security method. Usually, the service provider of the Web server configures the allowed CipherSuites within SSL.

    1. Application-Level Security

    While strong channel-level security is sufficient for the current suite of Open Financial Exchange transactions, there are features that channel security does not provide. These include (but are not limited to) data signing, non-repudiation, rational certificate management and revocation, and trust proxy. Where the trust model for an application requires such features to conduct the transaction safely, Open Financial Exchange stipulates the use of an application-level protocol. A future implementation guide will publish this protocol.

    The standard method for providing application-level security is to rely upon the RSA Public Key Cryptography Standard (PKCS) message format. The PKCS #7 standard specifies a message format that is both cryptographically strong and flexible enough to provide sufficient facilities for evolution.

    1. Requirements for Application-Layer Security
    2. Privacy, Authentication, and Message Integrity

    RSA Public Key Cryptography Standard #7 (PKCS#7) defines a rich set of message formats for securely exchanging information over public networks. These message formats provide for encrypting data using a combination of cryptographic techniques: they pair the manageability of public key cryptography with the speed of block ciphers in a hybrid that exploits the best properties of each.

    PKCS#7 message encryption provides privacy. Digitally signing a message (or applying HMAC) ensures message integrity.

    Use one of the following to define PKCS#7 messages: Data, Digitally Signed-Data, Enveloped-Data, or Digitally Signed and Enveloped-Data (also referred to as Sealed-Data). Open Financial Exchange can use Digested-Data, which digests application data before it embeds data within an Enveloped-Data object. However, it should not transmit this data over public networks without encryption applied.

    1. Facilities for Authorization

    As stated previously in section 4.2, authentication and authorization are the responsibility of the service provider. Open Financial Exchange messages contain the information to enable authentication and authorization decisions. With application-level security that uses a digitally signed format, the verification of that signature provides an additional method of authenticating the user.

    1. Using Application-level Encryption in Open Financial Exchange

    Open Financial Exchange applications requiring a sophisticated trust model require more facilities than those provided by SSL. If an Open Financial Exchange application requires only point-to-point security, SSL version 3.0 provides adequate facilities for message security. However, if the application requires more directed, specific forms of security, then use the appropriate PKCS#7 message formats for the application. An example of this might be a stock trading application issuing orders whose values demand that the security level be high, and where Open Financial Exchange treats the message with special handling instructions.

    Recommended cryptographic techniques for Open Financial Exchange application security are:

    In the absence of digital signatures, Open Financial Exchange applications should utilize the HMAC keyed MAC algorithm, using SHA-1 as a secure hash function.

    When you set the tags for application-layer security—which determines whether to use PKCS#7 message format—in the FI profile, the application software uses these facilities.

    1. International Support
    2. Language and Encoding

    Most of the content in Open Financial Exchange is language-neutral. However, some error messages, balance descriptions, and similar tags contain text meant to appear to the financial institution customers. There are also cases, such as e-mail records, where customers need to send text in other languages. To support world-wide languages, Open Financial Exchange must identify the basic text encoding, specific character set, and the specific language.

    The outer Open Financial Exchange headers specify the encoding and character set, as described in Chapter 2. Current encoding values are ASCII and UNICODE. For ASCII, character set values are code pages. Unicode ignores the character set per se although it still requires the syntax. Clients identify the language in the signon request. Open Financial Exchange specifies languages by three-letter codes as defined in ISO-639. Servers report their supported languages in the profile (see Chapter 7). If a server cannot support the requested language, it must return an error and not process the rest of the transactions.

    1. Currency <CURDEF> <CURRENCY> <ORIGCURRENCY>

    In each transaction involving amounts, responses include a default currency identification, <CURDEF>. The values are based on the ISO 4217 three-letter currency identifiers.

    Within each transaction, specific parts of the response might need to report a different currency. Where appropriate, aggregates will include an optional <CURRENCY> aggregate. The scope of a <CURRENCY> aggregate is everything within the same aggregate that the <CURRENCY> aggregate appears in, including nested aggregates, unless overridden by a nested <CURRENCY> aggregate. For example, specifying a <CURRENCY> aggregate in an investment statement detail means that the unit price, transaction total, commission, and all other amounts are in terms of the given currency, not the default currency.

    Note that there is no way for two or more individual elements that represent amounts—and are directly part of the same aggregate—to have different currencies. For example, there is no way in a statement download to have a different currency for the <LEDGERBAL> and the <AVAILBAL>, because they are both directly members of <STMTRS>. In most cases, you can overcome this limitation by using the optional <BAL> records, which do accept individual <CURRENCY> aggregates.

    The default currency for a request is the currency of the source account. For example, the currency for <BANKACCTFROM>.

    The <CURRATE> should be the one in effect throughout the scope of the <CURRENCY> aggregate. It is not necessarily the current rate. Note that the <CURRATE> needs to take into account the choice of the FI for formatting of amounts (that is, where the decimal is) in both default and overriding currency, so that a client can do math. This can mean that the rate is adjusted by orders of magnitude (up or down) from what is commonly reported in newspapers.

    Tag Description
    <CURRENCY> or <ORIGCURRENCY> Currency aggregate
    <CURSYM> ISO 4217 3-letter currency identifier, A-3
    <CURRATE> Ratio of <CURDEF> currency to <CURSYM> currency, in decimal form
    </CURRENCY> or </ORIGCURRENCY>
     

    In some cases, Open Financial Exchange will define transaction responses so that amounts have been converted to the home currency. However, Open Financial Exchange will allow FIs to optionally report the original amount and the original (foreign) currency. In these cases, transactions include a specific tag for the original amount, and then a <ORIGCURRENCY> tag to report the details of the foreign currency.

    Again, <CURRENCY> means that Open Financial Exchange has not converted amounts, whereas <ORIGCURRENCY> means that Open Financial Exchange has already converted amounts.

    1. Country-Specific Tag Values

    Some of the tags in Open Financial Exchange have values that are country-specific. For example, <USPRODUCTTYPE> is only useful within the United States. Open Financial Exchange will extend in each country as needed to provide tags that accept values useful to that country. Clients in other countries that do not know about these tags will simply skip them.

    In some cases, a tag value represents a fundamental way of identifying something, yet there does not exist a world-wide standard for such identification. Examples include bank accounts and securities. In these cases, it is important that Open Financial Exchange defines a single, extensible approach for identification. For example, CUSIPs are used within the U.S., but not in other countries. However, CUSIPs are fundamental to relating investment securities, holdings, and transactions. Thus, a security ID consists of a two-part aggregate: one to identify the naming scheme, and one to provide a value. Open Financial Exchange will define valid naming schemes as necessary for each country.


    1. Data Synchronization
    2. Overview

    Currently, some systems provide only limited support for error recovery and no support for backup files or multiple clients. The Open Financial Exchange data synchronization approach described in this chapter handles all of these situations.

    Open Financial Exchange defines a powerful form of data synchronization between clients and servers.

    Open Financial Exchange data synchronization addresses the following problems:

    This chapter first provides a brief background of error recovery problems and then presents the basic strategy used in Open Financial Exchange to perform data synchronization. Each Open Financial Exchange service includes specific details for data synchronization requests and responses.

    Most of the information in this chapter concerns data synchronization, since it is a relatively new concept. The final section in this chapter discusses alternatives to full synchronization, and summarizes the options for each.

    1. Background

    When a client begins a connection with a server for which the connection does not successfully complete, there are two main problems:

    If a client does not receive a response to work it initiates, it has no way of knowing whether the server processed the request. It also will not have any server-supplied information about the request, such as a server ID number.

    Some banking protocols allow a server to send data to the client whenever the client makes a connection. This specification assumes that the first client that calls in after the unsolicited data is available for download receives the data. If the connection fails, this information would be forever lost to the client. Examples of unsolicited data include updates in the status of a bill payment and e-mail responses.

    Unsolicited data presents problems beyond error recovery. Because the first client that connects to a server is the only one to receive unsolicited data, this situation precludes use of multiple clients without a data synchronization method. For example, if a user has a computer at work and one at home, and wants to perform online banking from both computers, a bank server could send unsolicited data to one but not the other.

    An even greater problem occurs when a user resorts to a backup copy of the client data file. This backup file will be missing recent unsolicited data with no way to retrieve it from the server once the server sends it.

    1. Data Synchronization Approach

    A simple solution is to make sure that clients can always obtain information from the server for a reasonable length of time. Clients can request recent responses-whether due to client-initiated work or other status changes on the server-by supplying the previous endpoint in the response history. Servers always supply a new endpoint whenever they supply responses.

    If a client connection fails-or a client receives a response, but crashes before updating its database-the client will not save the new endpoint. On the next synchronization request, the server sends the same information (plus any further status changes).

    If a user switches to a backup file, again the client will use the older endpoint in the synchronization request.

    If multiple clients are in use, each will send requests based on its own endpoint, so that both clients will obtain complete information from the server. This is one reason why Open Financial Exchange responses carry enough information from the request to enable them to be processed on their own. The diagram below shows the relationship between clients and servers.

    Open Financial Exchange relieves the server from maintaining any special error-recovery state information. However, Open Financial Exchange requires the server to maintain a history of individual responses it would have sent and a way to identify a position in the history. This ID could be a timestamp, or be based on its existing state information.

    NOTE: Open Financial Exchange does not require servers to store these responses based on individual connections. Also, not all requests are subject to synchronization. For example, Open Financial Exchange does not require servers to store statement-download responses separately for data synchronization.

    1. Data Synchronization Specifics

    Open Financial Exchange does synchronization separately for each type of response. In addition, a synchronization request might include further identifying information, such as a specific account number. This specification defines the additional information for each synchronization request.

    Each Open Financial Exchange service will identify the specific responses that are subject to data synchronization. For example, a bank statement-download is a read-only operation from the server. A client can request again if it fails; consequently, there is no special data synchronization for this type of response.

    The basis for synchronization is a token as defined by the <TOKEN> tag. The server is free to create a token in any way it wishes. The client simply holds the token for possible use in a future synchronization request.

    The server can derive a token from one of the following:

    NOTE: Open Financial Exchange reserves a <TOKEN> value of zero for the first time each type of response does a synchronization task.

    Clients will send a <TOKEN> of zero on their first synchronization request. Servers should send all available history, allowing a new client to know about work done by other clients. If a user's account has never been used with Open Financial Exchange, the server returns no history.

    The server can use different types of tokens for different types of responses, if suitable for the server.

    Tokens will be subject to a maximum size; see Chapter 3, "Common Aggregates, Elements, and Data Types." Tokens need to be unique only with respect to a specific type of synchronization request and the additional information in that request. For example, a bill payment synchronization request takes an account number; therefore, a token needs to be unique only within payments for a specific account.

    Servers will not have infinite history available, so synchronization responses include a <LOSTSYNC> element with a value of Y (yes) if the old token in the synchronization request was older than available history. This tag allows clients to alert users that some responses have been lost.

    NOTE: A token is unrelated to a <TRNUID>, <SRVRTID>, or <FITID>. Each of these serves a specific purpose, and has its own scope and lifetime.

    A <SRVRTID> is not appropriate as a <TOKEN> for bill payment. A single payment has a single <SRVRTID>, but it can undergo several state changes over its life and thus have several entries in the token history.

    There are three different ways a client and a server can conduct their requests and responses:

    Each request and response that requires data synchronization will define a synchronization aggregate. The aggregate tells the server which particular kind of data it should synchronize. By convention, these aggregates always have SYNC as part of their tag names, for example, <PMTSYNCRQ>. You can use these aggregates on their own to perform explicit synchronization, or as wrappers around one or more new transactions. For example, you can use <PMTSYNCRQ> aggregates to request synchronization in combination with new work. You can use the <PMTTRNRQ> by itself if you do not require synchronization.

    Some clients can choose to perform an explicit synchronization before sending any new requests (with or without synchronization). This practice allows clients to be up-to-date and possibly interact with the user before sending any new requests. Other clients can simply send new requests as part of the synchronization request.

    If a client synchronizes in one file, then sends new work inside a synchronization request in a second file, there is a small chance that additional responses become available between the two connections. There is even a smaller chance that these would be conflicting requests, such as modifications to the same object. However, some clients and some requests might require absolute control, so that the user can be certain that they are changing known data. To support this, synchronization requests can optionally specify <REJECTIFMISSING>. The tag tells a server that it should reject all enclosed requests if the supplied <TOKEN> is out of date before considering the new requests. That is, if any new responses became available, whether related to the incoming requests or not (but part of the scope of the synchronization request), the server should immediately reject the requests. It should still return the new responses. A client can then try again until it finds a stable window to submit the work. See section 6.5 for more information about conflict detection and resolution.

    The password change request and response present a special problem. See section 2.5.2 for further information.

    1. Conflict Detection and Resolution

    Conflicts arise whenever two or more users or servers modify the same data. This can happen to any object that has a <SRVRTID> that supports change or delete requests. For example, one spouse and the other might independently modify the same recurring bill payment model. From a server perspective, there is usually no way to distinguish between the same user making two intended changes and two separate users making perhaps unintended changes. Therefore, Open Financial Exchange provides enough tools to allow clients to carefully detect and resolve conflicts. Open Financial Exchange requires only that a server process atomically all requests in a single <OFX> block.

    A careful client will always synchronize before sending any new requests. If any responses come back that could affect a user's pending requests, the client can ask the user whether it should still send those pending requests. Because there is a small chance for additional server actions to occur between the initial synchronization request and sending the user's pending requests, extremely careful clients can use the <REJECTIFMISSING> option. Clients can iterate sending pending requests inside a synchronization request with <REJECTIFMISSING> and testing the responses to see if they conflict with pending requests. A client can continue to do this until a window of time exists wherein the client is the only agent trying to modify the server. In reality, this will almost always succeed on the first try.

    1. Synchronization vs. Refresh

    There are some situations, and some types of clients, where it is preferable for a client to ask a server to send everything it knows, rather than just receive a set of changes. An example of such a situation is a client that has lost synchronization because it has not connected often enough. An example of such a type of client is a completely stateless client, such as a web browser. This choice is made during client implementation. Open Financial Exchange does not require a client to refresh just because it has lost synchronization.

    Clients will mainly want to refresh lists of long-lived objects on the server; generally objects with a <SRVRTID>. For example, Open Financial Exchange Payments has both individual payments and models of recurring payments.

    A brand new client, or a client that lost synchronization, might want to learn about in-progress payments by doing a synchronization refresh of the payment requests. It would almost certainly want to do a synchronization refresh of the recurring payment models, because these often live for months or years.

    A client might not perform a synchronization refresh on e-mail responses.

    A client can request a refresh by using the <REFRESH> tag with value of Y instead of the <TOKEN> tag. Server descriptions detail the exact behavior that servers should follow. However, the general rule is that servers should send responses that emulate a client creating or adding each of the objects governed by the particular synchronization request.

    In these cases, you can set <TRNUID> to zero, the standard value for server-generated responses.

    There is no need to recreate a stream of responses that emulate the entire history of the object, just an add response that reflects the current state. For example, if you create a model and then modify it three times, even if this history would have been available for a regular synchronization, servers should only send a single add that reflects the current state.

    A client that just wants the current token, without refresh or synchronization, makes requests with <TOKENONLY> and a value of Y.

    In all cases, servers should send the current ending <TOKEN> for the synchronization request in refresh responses. This allows a client to perform regular synchronization requests in the future.

    The following table summarizes the options in a client synchronization request:

    Tag Description
    <TOKEN> Previous value of <TOKEN> received for this type of synchronization request from server; 0 for first-time requests; token
    <TOKENONLY> Request for just the current <TOKEN> without the history, Boolean
    <REFRESH> Request for refresh of current state, Boolean
    <REJECTIFMISSING> If Y, do not process requests if client <TOKEN> is out of date, Boolean

    NOTE: Open Financial Exchange requires exactly one of <TOKEN>, <TOKENONLY>, or <REFRESH>.

    1. Typical Server Architecture for Synchronization

    This section describes how an FI can approach supporting synchronization based on the assumption that modifications to an existing financial server will be kept to a minimum.

    The simplest approach is to create a history database separate from the existing server. This history could consist of the actual Open Financial Exchange transaction responses (<TRNRS> aggregates) that are available to a synchronization request. The history database could index records by token, response type, and any other identifying information for that type, such as account number.

    The diagram below shows a high-level model of the Open Financial Exchange architecture for a financial institution. Notice that the diagram shows the presence of a history journal.

    The server adds responses to the history journal for any action that takes place on the existing server. This is true whether the Open Financial Exchange requests initiate the action or, in the case of recurring payments, it happens automatically on the server. Once added to the history journal, the server can forget them.

    The areas of the Open Financial Exchange server that process synchronization requests need only search this history database for matching responses that are more recent than the incoming token.

    For a refresh request, an Open Financial Exchange server would access the actual bank server to obtain the current state rather than recent history.

    Periodically the bank server would purge the history server of older entries.

    Only requests that are subject to synchronization need to have entries in the history database. Statement downloads do not involve synchronization; therefore, the FI server should not add these responses to the history database. Since statement downloads are usually the largest in space and the most frequent, eliminating these saves much of the space a response history might otherwise require.

    More sophisticated implementations can save even more space. The history database could save responses in a coded binary form that is more compact than the full Open Financial Exchange response format. Some FIs might have much or all of the necessary data already in their servers; consequently, they would not require new data. An FI could regenerate synchronization responses rather than recall them from a database.

    1. Typical Client Processing of Synchronization Results

    The diagram below shows a general flowchart of what an Open Financial Exchange client would do with the results of a synchronization request. Most requests and responses subject to data synchronization contain both <TRNUID> and <SRVRTID>.


    1. Simultaneous Connections

    It is increasingly common that a server can get simultaneous or overlapping requests from the same user over two different computers. Open Financial Exchange requires a server to process each set of requests sent in a file as an atomic action. Servers can deal with the problems that arise with simultaneous use in two ways:

    1. Synchronization Alternatives

    Although it is RECOMMENDED that Open Financial Exchange servers implement full synchronization as described in this chapter, an alternate approach, "lite synchronization," could be easier for some servers to support. This approach focuses only on error recovery and does not provide any support for multiple clients, multiple data files, or use of backup files. The approach is to preserve the message sets while simplifying the implementation.

    In addition, some clients might prefer to use response-file based error recovery with all servers, even if the client and some server both support full synchronization. This section first describes lite synchronization, and then explains the rules that clients and servers use to decide how to communicate.

    1. Lite Synchronization

    Lite synchronization requires servers to accept all synchronization messages, but does not require them to keep any history or tokens. Responses need only be sent once and then the server can forget them. Responses to client requests, whether or not they are made inside a synchronization request, are processed normally. Responses that represent server-initiated work, such as payment responses that arise from recurring payments, are sent only in response to synchronization requests. A server does not have to hold responses in case a second client makes a synchronization request.

    Because full synchronization supports error recovery, an alternative is needed for lite synchronization. Servers using lite synchronization keep a copy of the entire response file they last sent. Clients requesting that servers prepare for error recovery generate a globally unique ID for each file they send. In the OFX headers, there are two tags associated with error recovery:

    The format of these is the same as used with <TRNUID> as documented in Chapter 3.

    Servers use the following rules:

    A server will never need to save more than one file per client data file, but because of possible multi-client or multi-datafile usage, it might need to save several files for a given user. A server should save as long as possible, but not indefinitely. A server cannot recognize an error recovery attempt if it comes after it has purged the error recovery file. A server would process it as a new request. In this case, a server should recognize duplicate transaction UIDs for client-initiated work, such as payments, and then reject them individually. Server-generated responses would be lost to the client.

    For a server accustomed to sending unsolicited responses, lite synchronization should closely match the current response-file based implementation. The only difference is that a server should hold the unsolicited responses until the client makes the first appropriate synchronization request, rather than automatically adding them to any response file. Once added, the server can mark them as delivered, relying on error recovery to ensure actual delivery.

    1. Relating Synchronization and Error Recovery

    Client and server developers should first decide whether they will support full synchronization or not. If they can, then they can support response-file error recovery as well, or they can rely on synchronization to perform error recovery. If they adopt only lite synchronization, Open Financial Exchange requires response-file error recovery. A server describes each of these choices in its server profile records. The following combinations are valid:

    Clients request response-file error recovery by including the old and new session UIDs in the header. If they are absent, servers need not save the response file for error recovery. Clients request synchronization by using those synchronization requests defined throughout this specification.

    1. Examples

    Here is an example of full synchronization using bill payment as the service. Open Financial Exchange Payments provides two different synchronization requests and responses, each with its own token: one for payment requests and one for repeating payment model requests. See Chapter 12 for full details.

    These simplified examples are shown without the outer <OFX> layer, <SIGNON>, and so forth.

    Client A requests synchronization:

    <PMTSYNCRQ>
    	<TOKEN>123
    	<BANKACCTFROM>
    		<BANKID>121000248
    		<ACCTID>123456789
    		<ACCTTYPE>CHECKING
    	</BANKACCTFROM>
    </PMTSYNCRQ>

    The server sends in response:

    <PMTSYNCRS>
    	<TOKEN>125
    	<LOSTSYNC>N
    	<BANKACCTFROM>
    		<BANKID>121000248
    		<ACCTID>123456789
    		<ACCTTYPE>CHECKING
    	</BANKACCTFROM>
    	<PMTTRNRS>
    		<STATUS>
    			... status details
    		</STATUS>
    		<TRNUID>123
    		<PMTRS>
    			... details on a payment response
    		</PMTRS>
    	</PMTTRNRS>
    	<PMTTRNRS>
    		<STATUS>
    			... status details
    		</STATUS>
    		<TRNUID>546
    		<PMTRS>
    			... details on another payment response
    		</PMTRS>
    	</PMTTRNRS>
    </PMTSYNCRS>
    

    Client A was missing two payment responses, which the server provides. At this point, client A is synchronized with the server. Client A now makes a new payment request, and includes a synchronization update as part of the request. This update avoids having to re-synchronize the expected response at a later time.

    <PMTSYNCRQ>
    	<TOKEN>125
    	<BANKACCTFROM>
    		<BANKID>121000248
    		<ACCTID>123456789
    		<ACCTTYPE>CHECKING
    	</BANKACCTFROM>
    	<PMTTRNRQ>
    		<TRNUID>12345
    		<PMTRQ>
    			... details of a new payment request
    		</PMTRQ>
    	</PMTTRNRQ>
    </PMTSYNCRQ>

    The response to this new request:
    <PMTSYNCRS>
    	<TOKEN>126
    	<LOSTSYNC>N
    	<BANKACCTFROM>
    		<BANKID>121000248
    		<ACCTID>123456789
    		<ACCTTYPE>CHECKING
    	</BANKACCTFROM>
    	<PMTTRNRS>
    		... details on a payment response to the new request
    	</PMTTRNRS>
    </PMTSYNCRS>
    

    The client now knows that the server has processed the payment request it just made, and that nothing else has happened on the server since it last synchronized with the server.

    Assume client B was synchronized with respect to payments for this account up through token 125. If it called in now and synchronized — with or without making additional requests — it would pick up the payment response associated with token 126. It records the same information that was in client A, which would give both clients a complete picture of payment status.

    1. FI Profile
    2. Overview

    Open Financial Exchange clients use the profile to learn the capabilities of an Open Financial Exchange server. This information includes general properties such as account types supported, user password requirements, specific messages supported, and how the client should batch requests and where to send the requests. A client obtains a portion of the profile when a user first selects an FI. The client obtains the remaining information prior to sending any actual requests to that FI. The server uses a timestamp to indicate whether the server has updated the profile, and the client checks periodically to see if it should obtain a new profile.

    In more detail, a profile response contains the following sections, which a client can request independently:

    The profile message is itself a message set. In files, Open Financial Exchange uses the <PROFMSGSV1> aggregate to identify this profile message set.

    The following sections describe the general use of profile information.

    1. Message Sets

    A message set is a collection of related messages. For example, Chapter 11, "Banking," defines several message sets: statement download, credit card statement download, intrabank transfers, and so forth. A server routes all of the messages in a message set to a single URL and merges their versions together.

    Clients and servers generally use message sets as the granularity to decide what functionality they will support. A "banking" server can choose to support the statement download and intrabank transfer message sets, but not the wire transfer message set. Attributes are available in many cases to further define how Open Financial Exchange supports a message set.

    Each portion of the Open Financial Exchange specification that defines messages also defines the message set to which the messages belong. This includes what additional attributes are available for those messages, and whether Open Financial Exchange requires the message set or it is optional.

    1. Version Control

    Message sets are the basis of version control. Over time there will be new versions of the message sets, and at any given time servers will likely want to support more than one version of a message set. Clients should also be capable of supporting as many versions as possible. Through the profile, clients discover which versions are supported for each message set. Taking its own capabilities into account, the client then exchanges messages at the highest common level for each message set.

    For the Open Financial Exchange-SGML data format, there is a single DTD for all message sets. Its version advances when any syntactic change is made to any of the message sets. (It is possible to make a semantic change that would not even require a change in syntax. A change in rules, for example, would change the version of the message set without changing the DTD.) A single DTD cannot have two different definitions for the same aggregate. There are limitations to how a server that uses true DTD-based parsing can handle multiple versions of a message at the same time.

    1. Batching and Routing

    To allow FIs to set up different servers for different message sets, different versions, or to directly route some messages to third party processors, message sets define the URL to which a server sends messages in that message set. Each version of a message set can have a different URL. In the common case where many or all message sets are sent to a single URL, clients will consolidate messages across compatible message sets. Clients can consolidate when:

    1. Profile Request

    A profile request indicates which profile components a client desires. It also indicates what the client's routing capability is. Profiles returned by the FI must be compatible with the requested routing style; otherwise, the FI returns an error.

    Profile requests are not subject to synchronization. Use the <PROFTRNRQ> transaction tag.

    Tag Description
    <PROFRQ> Profile-request aggregate
    <CLIENTROUTING> Identifies client routing capabilities, see table below
    <DTPROFUP> Date and time client last received a profile update
    </PROFRQ>  
    Tag Description
    NONE Client cannot perform any routing. All URLs must be the same. All message sets share a single signon realm.
    SERVICE Client can perform limited routing. See details below.
    MSGSET Client can route at the message-set level. Each message set may have a different URL and/or signon realm.

    The intent of the SERVICE option for client routing is to support clients that can route bill payment messages to a separate URL from the rest of the messages. Because the exact mapping of message sets to the general concept of bill payment can vary by client and by locale, this specification does not provide precise rules for the SERVICE option. Each client will define its requirements.

    1. Profile Response

    The response includes message set descriptions, signon information, and general contact information.

    Tag Description
    <PROFRS> Profile-response aggregate
    <MSGSETLIST> Beginning list of message set information
    <XXXMSGSET> One or more message set aggregates
    </XXXMSGSET>  
    </MSGSETLIST>  
    <SIGNONINFOLIST> Beginning of signon information
    <SIGNONINFO> One or more signon information aggregates
    </SIGNONINFO>  
    </SIGNONINFOLIST>  
    <DTPROFUP> Time this was updated on server, datetime
    <FINAME> Name of institution, A-32
    <ADDR1> FI address, line 1
    <ADDR2> FI address, line 2
    <ADDR3> FI address, line 3
    <CITY> FI address city
    <STATE> FI address state
    <POSTALCODE> FI address postal code
    <COUNTRY> FI address country
    <CSPHONE> Customer service telephone number, A-32
    <TSPHONE> Technical support telephone number, A-32
    <FAXPHONE> Fax number, A-32
    <URL> URL for general information about FI (not for sending data) URL
    <EMAIL> E-mail address for FI, A-32
    <SYNCMODE> FULL for full synchronization capability
    LITE for lite synchronization capability
    <RESPFILEER> Y if server supports response-file based error recovery, Boolean
    </PROFRS>  

    See Chapter 6 for more information on <SYNCMODE> and <RESPFILEER>.

    1. Message Set

    An aggregate describes each message set supported by an FI. Message sets in turn contain an aggregate for each version of the message set that is supported. For a message set named XXX, the convention is to name the outer aggregate <XXXMSGSET> and the tag for each version <XXXMSGSETVn>. The reason for message set-specific aggregates is that the set of attributes depends on the message set. These can change from version to version, so there are version-specific aggregates as well.

    The general form of the response is:

    Tag Description
    <XXXMSGSET> Service aggregate
    <XXXMSGSETVn> Version-of-message-set aggregate, 1 or more
    </XXXMSGSETVn>  
    </XXXMSGSET>  

    The <XXXMSGSETVn> aggregate has the following form:

    Tag Description
    <XXXMSGSETVn> Message-set-version aggregate
    <MSGSETCORE> Common message set information
    </MSGSETCORE>  
    message-set specific Zero or more attributes specific to this version of this message set, as defined by each message set
    </XXXMSGSETVn>  

    The common message set information <MSGSETCORE> is as follows:

    Tag Description
    <MSGSETCORE> Common-message-set-information aggregate
    <VER> Version number, N-5 (version 1.0 formatted as 100)
    <URL> URL where messages in this set are to be sent
    <OFXSEC> Security level required for this message set; see Chapter 4
    <TRANSPSEC> Y if transport security must be used, N if not used; Boolean
    <SIGNONREALM> Signon realm to use with this message set
    <LANGUAGE> One or more languages supported
    </MSGSETCORE>  
    1. Signon Realms

    A signon realm identifies a set of messages that can be accessed using the same password. Realms are used to disassociate signons from specific services, allowing FIs to require different signons for different message sets. In practice, FIs will want to use the absolute minimum number of realms possible to reduce the user's workload.

    Tag Description
    <SIGNONINFO> Signon-information aggregate
    <SIGNONREALM> Identifies this realm
    <MIN> Minimum number of password characters
    <MAX> Max number of password characters
    <ALPHA> Y if alphabetic characters are allowed, Boolean
    <NUMERIC> Y if numeric characters are allowed, Boolean
    <CASESEN> Y if password is case-sensitive, Boolean
    <SPECIAL> Y if special characters are allowed, Boolean
    <SPACES> Y if spaces are allowed, Boolean
    </SIGNONINFO>  
    1. Status Codes
    Value Meaning
    0 Success (INFO)
    2000 General error (ERROR)
    1. Profile Message Set Profile Information

    stadyn_image8.gif (22120 bytes)
    stadyn_image8

     

    The profile message set functions the same way as all other message sets; therefore, it contains a profile description for that message set. Because <PROFMSGSET> is always part of a message set response, it is described here. Servers that support profile information must include the <PROFMSGSET> as part of the profile response <MSGSETLIST>. There are no attributes, but the aggregate must be present to indicate support for the message set.

    Tag Description
    <PROFMSGSET> Message-set-profile-information aggregate
    <PROFMSGSETV1> Opening tag for V1 of the message set profile information
    <MSGSETCORE> Common message set information
    </MSGSETCORE>  
    </PROFMSGSETV1>  
    </PROFMSGSET>  
    1. Activation & Account Information
    2. Overview

    The Signup message set defines three messages to help users get set up with their FI:

    There is also a message to request name and address changes.

    Clients use the account information request on a regular basis to look for changes in a user's account information. A timestamp is part of the request so that a server has only to report new changes. Account activation requests are subject to data synchronization, and will allow multiple clients to learn how the other clients have been enabled.

    In Open Financial Exchange files, the <SIGNUPMSGSV1> aggregate identifies the Signup message.

    1. Approaches to User Sign-Up with Open Financial Exchange

    The message sets in this chapter are designed to allow both FIs and clients to support a variety of sign-up procedures. There are four basic steps a user needs to go through to complete the sign-up:

    1. Select the FI. Open Financial Exchange does not define this step or provide message sets to support it. Client developers and FIs can let a user browse or search this information on a web site, or might define additional message sets to do this within the client. At the conclusion of this step, the client will have some minimal profile information about the FI, including the set of services supported and the URL to use for the next step.
    2. Enrollment and password acquisition. In this step, the user identifies and authenticates itself to the FI without a password. In return, the user obtains a password (possibly temporary) to use with Open Financial Exchange. FIs can perform this entire step over the telephone, through a combination of telephone requests and a mailed response, or at the FI web site. FIs can also use the Open Financial Exchange enrollment message to do this by means of the client. The response can contain a temporary password or users can wait for a mailed welcome letter containing the password.
    3. Account Information. In this step, the user obtains a list of accounts available for use with Open Financial Exchange, and which specific services are available for each account. Even if users have enrolled over the telephone, clients will still use this message set to help users properly set up the accounts within the client. Clients periodically check back with the FI for updates.
    4. Service Activation. The last step is to activate specific services on specific accounts. The activation messages support this step. Synchronization is applied to these messages to ensure that other clients are aware of activated services.

    The combination of media-interface through which an FI accomplishes these steps can vary. FIs might wish to do steps two through four over the telephone. Clients will still use Open Financial Exchange messages in steps 3 and 4 to automatically set up the client based on the choices made by the user over the phone. Other FIs might wish to have the entire user experience occur within the client. Either way, the Open Financial Exchange sign-up messages support the process.

    1. Users and Accounts

    To support the widest possible set of FIs, Open Financial Exchange assumes that individual users and accounts are in a many-to-many relationship. Consider a household with three accounts:

    Checking 2 should be available to either spouse, and the spouse holding Checking 1 should be able to see both Checking 1 and 2.

    Open Financial Exchange expects FIs to give each user their own user ID and password. Each user will go through the enrollment step separately. A given account need only be activated once for a service; not once for each user. Clients will use the account information and activation messages to combine information about jointly-held accounts.

    If an FI prefers to have a single user ID and password per household or per master account, they will have to make this clear to users through the enrollment process. It is up to the FI to assign a single user ID and password that can access all three of the checking accounts described above.

    1. Enrollment and Password Acquisition <ENROLLRQ> <ENROLLRS>

    The main purpose of the enrollment message is to communicate a user's intent to access the FI by way of Open Financial Exchange and to acquire a password for future use with Open Financial Exchange. Some FIs might return a user ID and an initial password in the enrollment response, while others will send them by way of regular mail.

    NOTE: Because the server does not know the user ID and password when the client sends the enrollment request, the <SONRQ> will not contain a valid user ID or password. The enrollment message accepts standard user identification information.

    Enrollment requests are not subject to synchronization. If the client does not receive a response, it will simply re-request the enrollment. If a user successfully enrolls from another client before the first client obtains a response, the server should respond to subsequent requests from the first client with status code:

    13501 - user already enrolled.
    
    1. User IDs

    The Open Financial Exchange <SONRQ> requires a user ID to uniquely identify a user to an FI. Many FIs in the United States use social security numbers (SSNs) as the ID. Others create IDs that are unrelated to the users' SSNs. FIs can have existing user IDs that they use for other online activities that they wish to use for Open Financial Exchange as well. They might also create new IDs specifically for Open Financial Exchange. Finally, some FIs might assign IDs while others might allow users to create them.

    Because users do not usually know either their Open Financial Exchange sign-on user ID or their password at time of enrollment, the enrollment response is designed to return both. The enrollment request allows users to optionally provide a user ID, which an FI can interpret as their existing online ID or a suggestion for what their new user ID should be. It is recommended that the enrollment process explains ID syntax to users.

    1. Enrollment Request

    The enrollment request captures enough information to identify and authenticate a user as being legitimate and as having a relationship with the FI.

    FIs might require that an account number be entered as part of the identification process. However, this is discouraged since the account information request is designed to automatically obtain all account information, avoiding the effort and potential mistakes of a user-supplied account number.

    It is RECOMMENDED that FIs provide detailed specifications for IDs and passwords along with information about the services available when a user is choosing an FI.

    Tag Description
    <ENROLLRQ> Enrollment-request aggregate
    <FIRSTNAME> First name of user
    <MIDDLENAME> Middle name of user
    <LASTNAME> Last name of user
    <ADDR1> Address line 1
    <ADDR2> Address line 2
    <ADDR3> Address line 3
    <CITY> City
    <STATE> State or province
    <POSTALCODE> Postal code
    <COUNTRY> 3-letter country code from ISO/DIS-3166
    <DAYPHONE> Daytime telephone number
    <EVEPHONE> Evening telephone number
    <EMAIL> Electronic e-mail address
    <USERID> Actual user ID if already known, or preferred user ID if user can pick
    <TAXID> ID used for tax purposes (such as SSN), may be same as user ID
    <SECURITYNAME> Mother's maiden name or equivalent
    <DATEBIRTH> Date of birth
    <ACCTFROM> An account description aggregate for one existing account at the FI, for identification purposes only. Can be <BANKACCTFROM>, <INVACCTFROM>, etc.
    </ACCTFROM>  
    </ENROLLRQ>  

    This enrollment request is intended for use only by individuals. Business enrollment will be defined in a later release.

    1. Enrollment Response

    The main purpose of the enrollment response is to acknowledge the request. In those cases where FIs permit delivery of an ID and a temporary password, the response also provides for this. Otherwise the server will send the real response to the user by way of regular mail, electronic mail, or over the telephone. If enrollment is successful, but the server does not return the ID and password in the response, a server is REQUIRED to use status code 13000 and provide some information to the user by means of the <MESSAGE> element in the <STATUS> aggregate about what to expect next.

    Tag Description
    <ENROLLRS> Enrollment-response aggregate
    <TEMPPASS> Temporary password
    <USERID> User ID
    <DTEXPIRE> Time the temporary password expires (if <TEMPPASS> included)
    </ENROLLRS>  
    1. Enrollment Status Codes
    Code Meaning
    0 Success (INFO)
    2000 General error (ERROR)
    13000 User ID & password will be sent out-of-band (INFO)
    13500 Unable to enroll (ERROR)
    13501 User already enrolled (ERROR)
    1. Examples

    An enrollment request:
    <ENROLLTRNRQ>
    	<TRNUID>12345
    	<ENROLLRQ>
    		<FIRSTNAME>Joe
    		<MIDDLENAME>Lee
    		<LASTNAME>Smith
    		<ADDR1>21 Main St.
    		<CITY>Anytown
    		<STATE>TX
    		<POSTALCODE>87321
    		<COUNTRY>USA
    		<DAYPHONE>123-456-7890
    		<EVEPHONE>987-654-3210
    		<EMAIL>jsmith@isp.com
    		<USERID>jls
    		<TAXID>123-456-1234
    		<SECURITYNAME>jbmam
    		<DATEBIRTH>19530202
    	</ENROLLRQ>
    </ENROLLTRNRQ>

    And the reply might be:
    <ENROLLTRNRS>
    	<TRNUID>12345
    	<STATUS>
    		<CODE>0
    		<SEVERITY>INFO
    	</STATUS>
    	<ENROLLRS>
    		<TEMPPASS>changeme
    		<USERID>jls
    		<DTEXPIRE>19970105
    	</ENROLLRS>
    </ENROLLTRNRS>
    
    1. Account Information

    Account information requests ask a server to identify and describe all of the accounts accessible by the signed-on user. The definition of all is up to the FI. At a minimum, it is RECOMMENDED that a server include information about all accounts that it can activate for one or more Open Financial Exchange services. To give the user a complete picture of his relationship with an FI, FIs can give information on other accounts, even if those accounts are available only for limited Open Financial Exchange services.

    Some service providers will not have any prior knowledge of any user account information. The profile allows these servers to report this, and clients will then know to ask users for account information rather than reading it from the server.

    Clients can perform several tasks for users with this account information. First, the information helps a client set up a user for online services by giving it a precise list of its account information and available services for each. Clients can set up their own internal state as well as prepare service activation requests with no further typing by users. This can eliminate data entry mistakes in account numbers, routing transit numbers, and so forth.

    Second, FIs can provide limited information on accounts that would not ordinarily be suitable to Open Financial Exchange services. For example, a balance-only statement download would be useful for certificates of deposits even though a customer or an FI might not want or allow CDs to be used for full statement download.

    For each account, there is one <ACCTINFO> aggregate returned. The aggregate includes one service-specific account information aggregate for each available service on that account. That, in turn, provides the service-specific account identification. Common to each service-specific account information aggregate is the <SVCSTATUS> tag, which indicates the status of this service on this account.

    A server should return joint accounts (accounts for which more than one user ID can be used to access the account) for either user. Clients that wish to have a unified view will aggregate the results and remove duplicates before making specific requests involving joint accounts.

    Requests and responses include a <DTACCTUP> element. Responses contain the last time a server updated the information. Clients can OPTIONALLY send this in a subsequent request, and servers are REQUIRED to compare this to the current modification time and only send information if it is more recent. The server sends the entire account information response if the client's time is older; there is no attempt to incrementally update specific account information.

    1. Request <ACCTINFORQ>
    Tag Description
    <ACCTINFORQ> Account-information-request aggregate
    <DTACCTUP> Last <DTACCTUP> received in a response
    <INCIMAGES> Y if server should include logo in response, N if client will separately fetch them based on given URL; Boolean
    </ACCTINFORQ>  
    1. Response <ACCTINFORS>
    Tag Description
    <ACCTINFORS> Account-information-response aggregate
    <DTACCTUP> Date and time of last update to this information on the server
    <ACCTINFO> Zero or more account information aggregates
    </ACCTINFO>  
    </ACCTINFORS> End of account information response
    1. Account Information Aggregate <ACCTINFO>
    Tag Description
    <ACCTINFO> Account-information-record aggregate
    <DESC> Description of the account, A-80
    <PHONE> Telephone number for the account, A-20
    <LOGO> URL to request the logo for the account (actual logos should be included via multi-part MIME in the response file if requested), URL
    <XXXACCTINFO> Service-specific account information, defined in each service chapter, one or more allowed
    <XXXACCTFROM> Service-specific account identification
    </XXXACCTFROM>  
    <SVCSTATUS> AVAIL = Available, but not yet requested

    PEND = Requested, but not yet available

    ACTIVE = In use

    </XXXACCTINFO>  
    </ACCTINFO>  

    NOTE: A server uses the <DESC> field to convey the FI's preferred name for the account, such as "PowerChecking." It should not include the account number.

    1. Status Codes
    Code Meaning
    0 Success (INFO)
    2000 General error (ERROR)
    13001 No change since supplied <DTACCTUP> (INFO)
    1. Examples

    An account information request:
    <ACCTINFOTRNRQ>
    	<TRNUID>12345
    	<ACCTINFORQ>
    		<DTACCTUP>19960101
    		<INCIMAGES>N
    	</ACCTINFORQ>
    </ACCTINFOTRNRQ>

    And a response for a user with access to one account, supporting banking:
    <ACCTINFOTRNRS>
    	<TRNUID>12345
    	<STATUS>
    		<CODE>0
    		<SEVERITY>INFO
    	</STATUS>
    	<ACCTINFORS>
    		<DTACCTUP>19960102
    		<ACCTINFO>
    		<DESC>Power Checking
    		<PHONE>8002223333
    		<LOGO>https://www.fi.com/ofx/logos/powercheck.jpg
    			<BANKACCTINFO>
    				<BANKACCTFROM>
    					<BANKID>1234567789
    					<ACCTID>12345
    					<ACCTTYPE>CHECKING
    				</BANKACCTFROM>
    			<SUPTXDL>Y
    			<XFERSRC>Y
    			<XFERDEST>Y
    			<SVCSTATUS>ACTIVE
    			</BANKACCTINFO>
    		</ACCTINFO>
    	</ACCTINFORS>
    </ACCTINFOTRNRS>
    
    1. Service Activation

    Clients inform FIs that they wish to start, modify, or terminate a service for an account by sending service activation requests. These are subject to data synchronization, and servers should send responses to inform clients of any changes, even if the changes originated on the server.

    Clients use these records during the initial user sign-up process. Once a client learns about the available accounts and services (by using the account information request above, or by having a user directly enter the required information), it sends a series of service ADD requests.

    If a user changes any of the identifying information about an account, the client sends a service activation request containing both the old and the new account information. Servers should interpret this as a change in the account, not a request to transfer the service between two existing accounts, and all account-based information such as synchronization tokens should continue. If a user or FI is reporting that service should be moved between two existing accounts, service must be terminated for the old account and started for the new account. The new account will have reset token histories, as with any new service.

    Each service to be added, changed, or removed is contained in its own request because the same real-world account might require different <ACCTFROM> aggregates depending on the type of service.

    1. Activation Request and Response
    2. Request <ACCTRQ>
    Tag Description
    <ACCTRQ> Account-service-request aggregate
    <ACTION> Action aggregate, either <SVCADD>, <SVCCHG>, or <SVCDEL>
    </ACTION>  
    <SVC> Service to be added/changed/deleted
    </ACCTRQ>  
    1. Response <ACCTRS>
    Tag Description
    <ACCTRS> Account-service-response aggregate
    <ACTION> Action aggregate, either <SVCADD>, <SVCCHG>, or <SVCDEL>
    </ACTION>  
    <SVC> Service to be added/changed:

    BANKSVC = Banking service
    BPSVC = Payments service
    INVSVC = Investments

    </ACCTRS>  
    1. Service Add Aggregate <SVCADD>
    Tag Description
    <SVCADD> Service-add aggregate
    <ACCTTO> Service-specific-account-identification aggregate (see <BANKACCTTO>, <INVACCTTO>)
    </ACCTTO>  
    </SVCADD>  
    1. Service Change Aggregate <SVCCHG>
    Tag Description
    <SVCCHG> Service-change aggregate
    <ACCTFROM> Service-specific-account-identification aggregate (see <BANKACCTFROM>, <INVACCTFROM>)
    </ACCTFROM>  
    <ACCTTO> Service-specific-account-identification aggregate (see <BANKACCTTO>, <INVACCTTO>)
    </ACCTTO>  
    </SVCCHG>  
    1. Service Delete Aggregate <SVCDEL>
    Tag Description
    <SVCDEL> Service-deletion aggregate
    <ACCTFROM> Service-specific-account-identification aggregate (see <BANKACCTFROM>, <INVACCTFROM>)
    </ACCTFROM>  
    </SVCDEL>  
    1. Status Codes
    Code Meaning
    0 Success (INFO)
    2000 General error (ERROR)
    2002 Other account error (ERROR)
    2006 Source (from) account not found (ERROR)
    2007 Source (from) account closed (ERROR)
    2008 Source (from) account not authorized (ERROR)
    2009 Destination (to) account not found (ERROR)
    2010 Destination (to) account closed (ERROR)
    2011 Destination (to) account not authorized (ERROR)
    13502 Invalid service (ERROR)
    1. Service Activation Synchronization

    Service activation requests are subject to the standard data synchronization protocol. The scope of these requests and the <TOKEN> is the user-ID. The request and response tags are <ACCTSYNCRQ> and <ACCTSYNCRS>.

    1. Examples

    Activating a payment:
    <ACCTTRNRQ>
    	<TRNUID>12345
    	<ACCTRQ>
    		<SVCADD>
    			<BANKACCTTO>
    				<BANKID>1234567789
    				<ACCTID>12345
    				<ACCTTYPE>CHECKING
    			</BANKACCTTO>
    		</SVCADD>
    		<SVC>BPSVC
    	</ACCTRQ>
    </ACCTTRNRQ>

    A response:
    <ACCTTRNRS>
    	<TRNUID>12345
    	<STATUS>
    		<CODE>0
    		<SEVERITY>INFO
    	</STATUS>
    	<ACCTRS>
    		<SVCADD>
    			<BANKACCTTO>
    				<BANKID>1234567789
    				<ACCTID>12345
    				<ACCTTYPE>CHECKING
    			</BANKACCTTO>
    		</SVCADD>
    		<SVC>BPSVC
    	</ACCTRS>
    </ACCTTRNRS>
    
    1. Name and Address Changes <CHGUSERINFORQ> <CHGUSERINFORS>

    Users may request that an FI update the official name, address, phone, and e-mail information using the <CHGUSERINFORQ>. Only the fields that should be changed are sent. The response reports all of the current values. For security reasons, some of the fields in the <ENROLLRQ> cannot be changed online, such as tax ID.

    The transaction tags are <CHGUSERINFOTRNRQ> and <CHGUSERINFOTRNRS>. These messages are subject to synchronization, using <CHGUSERINFOSYNCRQ> and <CHGUSERINFOSYNCRS>.

    1. <CHGUSERINFORQ>
    Tag Description
    <CHGUSERINFORQ> Change-user-information-request aggregate
    <FIRSTNAME> First name of user
    <MIDDLENAME> Middle name of user
    <LASTNAME> Last name of user
    <ADDR1> Address line 1
    <ADDR2> Address line 2
    <ADDR3> Address line 3
    <CITY> City
    <STATE> State or province
    <POSTALCODE> Postal code
    <COUNTRY> 3-letter country code from ISO/DIS-3166
    <DAYPHONE> Daytime telephone number
    <EVEPHONE> Evening telephone number
    <EMAIL> Electronic e-mail address
    </CHGUSERINFORQ>  
    1. <CHGUSERINFORS>
    Tag Description
    <CHGUSERINFORS> Change-user-information-response aggregate
    <FIRSTNAME> First name of user
    <MIDDLENAME> Middle name of user
    <LASTNAME> Last name of user
    <ADDR1> Address line 1
    <ADDR2> Address line 2
    <CITY> City
    <STATE> State or province
    <POSTALCODE> Postal code
    <COUNTRY> 3-letter country code from ISO/DIS-3166
    <DAYPHONE> Daytime telephone number
    <EVEPHONE> Evening telephone number
    <EMAIL> Electronic e-mail address
    <DTINFOCHG> Date and time of update, datetime
    </CHGUSERINFORS>  
    1. Status Codes
    Code Meaning
    0 Success (INFO)
    2000 General error (ERROR)
    13503 Cannot change user information (ERROR)
    1. Signup Message Set Profile Information

    A server must include the following aggregates as part of the profile <MSGSETLIST> response, since every server must support at least the account information and service activation messages. In the <ENROLLPROF> aggregate, servers indicate how enrollment should proceed: via the client, a given web page, or a text message directing users to some other method (such as a phone call).

    Tag Description
    <SIGNUPMSGSET> Signup-message-set-profile-information aggregate
    <SIGNUPMSGSETV1> Opening tag for V1 of the message set profile information
    <MSGSETCORE> Common message set information, defined in the profile chapter
    </MSGSETCORE>  
      Enrollment options - only one of <CLIENTENROLL>, <WEBENROLL>, or <OTHERENROLL> is allowed
    <CLIENTENROLL> Client-based enrollment supported
    <ACCTREQUIRED> Y if account number is required as part of enrollment Boolean
    </CLIENTENROLL>  
    <WEBENROLL> Web-based enrollment supported
    <URL> URL to start enrollment process
    </WEBENROLL>  
    <OTHERENROLL> Some other enrollment process
    <MESSAGE> Message to give to consumer about what to do next (e.g. a phone number) A-80
    </OTHERENROLL>  
    <CHGUSERINFO> Y if server supports client-based user information changes
    <AVAILACCTS> Y if server can provide information on accounts with SVCSTATUS available, N means client should expect to ask user for specific account information Boolean
    </SIGNUPMSGSETV1>  
    </SIGNUPMSGSET>  


    1. Customer to FI Communication
    2. The E-Mail Message Set

    The e-mail message set includes two messages: generic e-mail and generic MIME requests by way of URLs. In Open Financial Exchange files, the message set name is EMAILMSGSV1.

    1. E-Mail Messages

    Open Financial Exchange allows consumers and FIs to exchange messages. The message body is in HTML so that FIs can provide some graphic structure to the message. Keep in mind that, as with regular World Wide Web browsing, an Open Financial Exchange client might not support some or all of the HTML formatting, so the text of the message must be clear on its own. Clients can request that graphics (the images referenced in an <IMG> tag) be sent as part of the response file, or clients can separately request those elements. If a server sends images, it should use the standard procedure for incorporating external data as described in Chapter 2. Servers are not required to support HTML or to send images, even if the client asks.

    A user or an FI can originate a message. E-mail messages are subject to data synchronization so that a server can send a response again if it is lost or if it is used by multiple clients.

    Because e-mail messages cannot be replied to immediately, the response should just echo back the original message (so that data synchronization will get this original e-mail message to other clients). When the FI is ready to reply, it should generate an unsolicited response (<TRNUID>0) and the client will pick this up during synchronization.

    Client Sends Server Responds
    Account information  
    From, To  
    Subject  
    Message  
      Account information
      From, To
      Subject
      Message
      Type
    1. Regular vs. Specialized E-Mail

    Several services with Open Financial Exchange define e-mail requests and responses that contain additional information specific to that service. To simplify implementation for both clients and servers, this section defines a <MAIL> aggregate that Open Financial Exchange uses in all e-mail requests and responses. For regular e-mail, the only additional information is an account from aggregate and whether to include images in the e-mail response or not.

    1. Basic <MAIL> Aggregate
    Tag Description
    <MAIL> Core e-mail aggregate
    <USERID> User ID such as SSN
    <DTCREATED> When message was created datetime
    <FROM> Customer's input for whom message is from, A-32
    <TO> Who e-mail should be delivered to, A-32
    <SUBJECT> Subject of message (plain text, not HTML), A-60
    <MSGBODY> Body of message, HTML-encoded or plain text depending on <USEHTML>, A-10000
    </MSGBODY> End of message
    <INCIMAGES> Include images in response, Boolean
    <USEHTML> Y if client wants an HTML response, N if client wants plain text, Boolean
    </MAIL>  

    If using HTML for the message body, clients and servers are REQUIRED to wrap the desired HTML in an SGML marked section to protect the HTML markup: <![ CDATA [ ... html ... ]]>. See the example.

    1. E-Mail <MAILRQ> <MAILRS>

    E-mail is subject to synchronization. The transaction tag is <MAILTRNRQ> / <MAILTRNRS> and the synchronization tag is <MAILSYNCRQ> / <MAILSYNCRS>.

    Tag Description
    <MAILRQ> E-mail-message-request aggregate
    <MAIL> Core e-mail aggregate
    </MAIL>  
    </MAILRQ>  

    In a response, the <TRNUID> is zero if this is an unsolicited message. Otherwise, it should contain the <TRNUID> of the user's original message. It is RECOMMENDED that servers include the <MESSAGE> of the user's message as part of the reply <MESSAGE>. The <MESSAGE> contents can include carriage returns to identify desired line breaks.

    Tag Description
    <MAILRS> E-mail-message-response aggregate
    <MAIL> Core e-mail aggregate
    </MAIL>  
    </MAILRS>  
    1. Status Codes
    Code Meaning
    0 Success (INFO)
    2000 General error (ERROR)
    16500 HTML not allowed (ERROR)
    16501 Unknown mail To: (ERROR)
    1. E-Mail Synchronization <MAILSYNCRQ> <MAILSYNCRS>

    Open Financial Exchange uses data synchronization to collect responses that could have been lost due to communication problems, or that the servers previously sent to a different client or data file. All messages sent to the signed-on user ID are covered by a single <TOKEN>. Note that this synchronization action expects only the basic <MAILRS> responses. Specialized e-mail is received by means of their own synchronization requests.

    Tag Description
    <MAILSYNCRQ> E-mail-synchronization-request aggregate
    <TOKEN> Client history marker
    <INCIMAGES> Include images in response, Boolean
    <USEHTML> Y if client wants an HTML response, N if client wants plain text, Boolean
    </MAILSYNCRQ>  
    Tag Description
    <MAILSYNCRS> E-mail-synchronization-response aggregate
    <TOKEN> Server history marker
    <MAILTRNRS> Missing e-mail response transactions (0 or more)
    </MAILSYNCRS>  
    1. Example

    In this example, a consumer requests information from customer service about the checking statement just downloaded. This example omits the <OFX> top level and the signon <SONRQ>. This example uses HTML for the message body, and so it must protect the HTML content in an SGML CDATA marked section. The request:

    <MAILTRNRQ>
    	<TRNUID>54321
    	<MAILRQ>
    		<MAIL>
    			<USERID>123456789
    			<FROM>James Hackleman
    			<TO>Noelani Federal Savings
    			<SUBJECT>What do I need to earn interest?
    			<DTCREATED>19960305
    			<MSGBODY><![ CDATA [<HTML><BODY>I didn't earn any interest this month. Can you please tell me what I need to do to earn interest on this account?</BODY></HTML>
    ]]></MSGBODY>
    			<INCIMAGES>N
    			<USEHTML>Y
    		</MAIL>
    	</MAILRQ>
    </MAILTRNRQ>
    The response from the FI:
    <MAILTRNRS>
    	<TRNUID>54321
    	<STATUS>
    		<CODE>0
    		<SEVERITY>INFO
    		</STATUS>
    	<MAILRS>
    		<MAIL>
    			<USERID>123456789
    			<DTCREATED>19960307
    			<FROM>Noelani Federal Savings
    			<TO>James Hackleman
    			<SUBJECT>Re: What do I need to earn interest?
    			<MSGBODY><![ CDATA [<HTML><BODY>You need to maintain $1000 in this account to earn interest. Because your balance was only $750 this month, no interest was earned. You could also switch to our new Checking Extra plan that always pays interest. Call us or check our web page http://www.fi.com/check-plans.html for more information.
    Sincerely,
    Customer Service Department
    
    Original message:
    I didn't earn any interest this month. Can you please tell me what I need to do to earn interest on this account?</BODY></HTML>
    ]]></MSGBODY>
    			<INCIMAGES>N
    			<USEHTML>Y
    		</MAIL>
    	</MAILRS>
    </MAILTRNRS>
    
    1. Example of Synchronization Involving E-Mail

    In the following example the client did not receive the reply to the message sent in the previous example, so its <TOKEN> is one less than the server's. The server replies by giving the current <TOKEN> and the missed response.

    <MAILSYNCRQ>
    	<TOKEN>101
    </MAILSYNCRQ>
    
    <MAILSYNCRS>
    	<TOKEN>102
    	<MAILTRNRS>
    		<TRNUID>54321
    		<STATUS>
    			<CODE>0
    			<SEVERITY>INFO
    		</STATUS>
    		<MAILRS>
    			... contents of e-mail message response as shown in previous example
    		</MAILRS>
    	</MAILTRNRS>
    </MAILSYNCRS>
    
    1. Get HTML Page

    Some responses contain values that are URLs, intended to be separately fetched by clients if desired. Clients can use their own HTTP libraries to perform this fetch outside of the Open Financial Exchange specification. However, to insulate clients against changes in transport technology, and to allow for fetches that require the protection of an authenticated signon by a specific user, Open Financial Exchange defines a transaction roughly equivalent to an HTTP Get. Any MIME type can be retrieved, including images as well as HTML pages.

    1. MIME Get Request and Response <GETMIMERQ> <GETMIMERS>

    The following table lists the components of a request:

    Tag Description
    <GETMIMERQ> Get-MIME-request aggregate
    <URL> URL, URL
    </GETMIMERQ>  

    The response simply echoes back the URL. The actual response, whether HTML, an image, or some other type, is always sent as a separate part of the file using multi-part MIME.

    Tag Description
    <GETMIMERS> Get-MIME-response aggregate
    <URL> URL, URL
    </GETMIMERS>  
    1. Status Codes
    Code Meaning
    0 Success (INFO)
    2000 General error (ERROR)
    2019 Duplicate transaction (ERROR)
    16502 Invalid URL (ERROR)
    16503 Unable to get URL (ERROR)
    1. Example
    A request:
    <GETMIMETRNRQ>
    	<TRNUID>54321
    	<GETMIMERQ>
    		<URL>http://www.fi.com/apage.html
    	</GETMIMERQ>
    </GETMIMETRNRQ>
    A response - the full file is shown here to illustrate the use of multi-part MIME:
    HTTP 1.0 200 OK
    Content-Type: multipart/x-mixed-replace; boundary =--boundary-
    
    --boundary--
    Content-Type: application/x-ofx
    Content-Length: 8732
    
    OFXHEADER:100
    DATA:OFXSGML
    VERSION:100
    ENCRYPTION:1
    ENCODING:USASCII
    
    <OFX>
    		<!-- signon not shown
    		message set wrappers not shown -->
    <GETMIMETRNRS>
    	<TRNUID>54321
    	<STATUS>
    		<CODE>0
    		<SEVERITY>INFO
    	</STATUS>
    	<GETMIMERS>
    		<URL>http://www.fi.com/apage.html
    	</GETMIMERS>
    </GETMIMETRNRS>
    </OFX>
    
    --boundary--
    Content-Type: text/html
    <HTML>
    	<!-- standard HTML page -->
    </HTML>
    
    --boundary--
    
    
    
    1. E-Mail Message Set Profile Information

    If either or both of the messages in the e-mail message set are supported, the following aggregate must be included in the profile <MSGSETLIST> response.

    Tag Description
    <EMAILMSGSET> E-mail-message-set-profile-information aggregate
    <EMAILMSGSETV1> Opening tag for V1 of the message set profile information
    <MSGSETCORE> Common message set information, defined in the profile chapter
    </MSGSETCORE>  
    <EMAIL> Y if server supports generic e-mail message
    <GETMIME> Y if server supports get MIME message
    </EMAILMSGSETV1>  
    </EMAILMSGSET>  



    1. Recurring Transactions

    Open Financial Exchange enables users to automate transactions that occur on a regular basis. Recurring transactions are useful when a customer has payments or transfers, for example, that repeat at regular intervals. The customer can create a "model" at the server for automatic generation of these instructions. The model in turn creates payments or transfers until it is canceled or expires. After the user creates a recurring model at the server, the server can relieve the user from the burden of creating these transactions; it generates the transactions on its own, based on the operating parameters of the model.

    1. Creating a Recurring Model

    The client must provide the following information to create a model:

    The model creates each transaction some time before its due date, usually thirty days. This allows the user to retrieve the transactions in advance of posting. This also gives the user the opportunity to modify or cancel individual transactions without changing the recurring model itself.

    When a model is created, it can generate several transactions immediately. The model does not automatically return responses for the newly created transactions. It only returns a response to the request that was made to create the model. For this reason, clients should send a synchronization request along with the request to create a model. This allows the server to return the newly created transaction responses, as well as the response to the request to set up a new model.

    1. Recurring Instructions <RECURRINST>

    The Recurring Instructions aggregate is used to specify the schedule for a repeating instruction. It is passed to the server when a recurring transfer or payment model is first created.

    Tag Description
    <RECURRINST> Recurring-Instructions aggregate
    <FREQ> Frequency, see section 10.2.1
    <NINSTS> Number of instructions

    If this tag is absent, the schedule is open-ended, N-3

    </RECURRINST>  
    1. Values for <FREQ>
    Value Description
    WEEKLY Weekly
    BIWEEKLY Biweekly
    TWICEMONTHLY Twice a month
    MONTHLY Monthly
    FOURWEEKS Every four weeks
    BIMONTHLY Bimonthly
    QUARTERLY Quarterly
    SEMIANNUALLY Semiannually
    TRIANNUALLY Triannually
    ANNUALLY Annually

    Rules for calculating recurring dates of WEEKLY, BIWEEKLY, and TWICEMONTHLY are as follows:

    Examples:

    Start date of May 2: next transaction date for WEEKLY is May 9; TWICEMONTHLY is May 17; next transfer date for BIWEEKLY is May 16.

    Start date of May 20: next date for WEEKLY is May 27; TWICEMONTHLY is June 4; next date for BIWEEKLY is June 3.

    TWICEMONTHLY recurring transactions will occur each month on those days adjusting for weekends and holidays. BIWEEKLY will occur every 14 days.

    1. Examples

    The following example illustrates the creation of a repeating payment. The payment repeats on a monthly basis for 12 months. All payments are for $395. The request:

    .
    .
    .
    <RECPMTRQ>
    	<RECURRINST>
    		<FREQ>MONTHLY
    		<NINSTS>12
    	</RECURRINST>
    	<PMTINFO>
    		<BANKACCTFROM>
    			<BANKID>555432180
    			<ACCTID>763984
    			<ACCTTYPE>CHECKING
    		</BANKACCTFROM>
    		<TRNAMT>395.00
    		<PAYEEID>77810
    		<PAYACCT>444-78-97572
    		<DTDUE>19971115
    		<MEMO>Auto loan payment
    	</PMTINFO>
    </RECPMTRQ>
    .
    .
    .
    
    The response includes the <RECSRVRTID> that the client can use to cancel or modify the model:
    .
    .
    .
    <RECPMTRS>
    	<RECSRVRTID>387687138
    	<RECURRINST>
    		<FREQ>MONTHLY
    		<NINSTS>12
    	</RECURRINST>
    	<PMTINFO>
    		<BANKACCTFROM>
    			<BANKID>555432180
    			<ACCTID>763984
    			<ACCTTYPE>CHECKING
    		</BANKACCTFROM>
    		<TRNAMT>395.00
    		<PAYEEID>77810
    		<PAYACCT>444-78-97572
    		<DTDUE>19971115
    		<MEMO>Auto loan payment
    	</PMTINFO>
    </RECPMTRS>
    .
    .
    .
    
    1. Retrieving Transactions Generated by a Recurring Model

    Once created, a recurring model independently generates instructions. Since the client has not directly generated these transactions, the client has no record of their creation. To enable users to modify and/or cancel pending instructions, the client must use data synchronization in order to retrieve these transactions.

    The client has two purposes for synchronizing state with the server with respect to recurring models:

    The client must be able to synchronize with the state of any models at the server, as well as the state of any transactions generated by the server.

    1. Modifying and Canceling Individual Transactions

    Once created and retrieved by the customer, recurring payments and transfers are almost identical to customer-created payments or transfers. As with ordinary payments or transfers, you can cancel or modify transactions individually. However, because servers generate these transfers, they are different in the following respects:

    1. Modifying and Canceling Recurring Models

    A recurring model can be modified or canceled. When a model is modified, all transactions that it generates in the future will change as well. The client can indicate whether transactions that have been generated, but have not been sent, should be modified as well. The actual elements within a transaction that can be modified differ by service. See the recurring sections within the Banking and Payments chapters for details.

    A user can cancel a model immediately or at a future date. If a user cancels the model immediately, the client cancels any transactions that it has not yet sent. If the client schedules the cancel for a future date, the client will not cancel pending transactions.

    1. Examples

    Canceling a recurring payment model requires the client to pass the <RECSRVRTID> of the model. The client requests that pending payments also be canceled. The server cancels the model immediately and notifies the client that both the model and any scheduled payments were canceled. The request:

    .
    .
    .
    	<RECPMTCANCRQ>
    		<RECSRVRTID>387687138
    		<CANPENDING>Y
    	</RECPMTCANCRQ>
    .
    .
    .
    The response:
    .
    .
    .
    	<RECPMTCANCRS>
    		<RECSRVRTID>387687138
    		<CANPENDING>Y
    	</RECPMTCANCRS>
    .
    .
    .
    
    
    stadyn_image9.gif (7739 bytes)
    stadyn_image9

    The server also cancels any payments that have been generated but not executed. In the example shown above, the client would not learn of this immediately. To receive notification that the model and all generated payments were canceled, the client would need to include a synchronization request in the file. The following example illustrates this alternate approach. The request file now includes a synchronization request:

    .
    .
    .
    	<RECPMTCANCRQ>
    		<RECSRVRTID>387687138
    		<CANPENDING>Y
    	</RECPMTCANCRQ>
    	<PMTSYNCRQ>
    		<TOKEN>12345
    		<BANKACCTFROM>
    			<BANKID>123432123
    			<ACCTID>516273
    			<ACCTTYPE>CHECKING
    		</BANKACCTFROM>
    	</PMTSYNCRQ>
    .
    .
    .
    The response file now contains two responses (assuming one payment was pending), one for the canceled model and one for the canceled payment.
    .
    .
    .
    	<RECPMTCANCRS>
    		<RECSRVRTID>387687138
    		<CANPENDING>Y
    	</RECPMTCANCRS>
    	<PMTSYNCRS>
    		<TOKEN>3247989384
    		<BANKACCTFROM>
    			<BANKID>123432123
    			<ACCTID>516273
    			<ACCTTYPE>CHECKING
    		</BANKACCTFROM>
    		<PMTTRNRS>
    			<TRNUID>10103
    			<STATUS>
    				<CODE>0
    				<SEVERITY>INFO
    			</STATUS>
    			<PMTCANCRS>
    				<SRVRTID>1030155
    			</PMTCANCRS>
    		</PMTTRNRS>
    	</PMTSYNCRS>
    .
    .
    .
    
    
    

    stadyn_image10.gif (4217 bytes)
    stadyn_image10

    ================================================ FILE: libs/megaparse/tests/test_endpoints.py ================================================ import pytest @pytest.mark.asyncio async def test_parse_file_endpoint(test_client): # Simulate a request to the parse endpoint with open("./tests/pdf/sample_pdf.pdf", "rb") as file: response = await test_client.post( "/v1/file", files={"file": ("test.pdf", file)}, data={ "method": "unstructured", "strategy": "auto", "language": "en", "check_table": False, }, ) assert response.status_code == 200 assert response.json()["message"] == "File parsed successfully" @pytest.mark.asyncio async def test_parse_url_endpoint(test_client): response = await test_client.post("/v1/url?url=https://www.quivr.com") assert response.status_code == 200 assert response.json() == { "message": "Website content parsed successfully", "result": "Fake website content", } ================================================ FILE: libs/megaparse/tests/test_import.py ================================================ import pytest from megaparse import MegaParse @pytest.mark.skip("slow test") def test_load(): megaparse = MegaParse() response = megaparse.load("./tests/data/dummy.pdf") print(response) assert response.strip("\n") == "Dummy PDF download" ================================================ FILE: libs/megaparse/tests/test_parsers.py ================================================ import os import pytest from megaparse.parser.doctr_parser import DoctrParser from megaparse.parser.llama import LlamaParser from megaparse.parser.megaparse_vision import MegaParseVision from megaparse.parser.unstructured_parser import UnstructuredParser from megaparse_sdk.schema.extensions import FileExtension PARSER_LIST = [ UnstructuredParser, # DoctrParser, ] @pytest.mark.parametrize("parser", PARSER_LIST) @pytest.mark.parametrize("extension", list(FileExtension)) def test_sync_parser(parser, extension): directory = "./tests/supported_docs" file_path = next( ( 
os.path.join(root, file) for root, _, files in os.walk(directory) for file in files if file.endswith(extension.value) ), None, ) if file_path is None: pytest.fail(f"No file with extension {extension.value} found in {directory}") myparser = parser() if extension in myparser.supported_extensions: response = myparser.convert(file_path) assert response assert len(str(response)) > 0 else: with pytest.raises(ValueError): myparser.convert(file_path) ================================================ FILE: libs/megaparse_sdk/CHANGELOG.md ================================================ # Changelog ## [0.1.12](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.11...megaparse-sdk-v0.1.12) (2025-02-13) ### Features * add layout detection ([#228](https://github.com/QuivrHQ/MegaParse/issues/228)) ([77f7040](https://github.com/QuivrHQ/MegaParse/commit/77f7040c9c221a17effce089be7ec575cdd83468)) ## [0.1.11](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.10...megaparse-sdk-v0.1.11) (2025-02-11) ### Features * add_layout_detection ([#220](https://github.com/QuivrHQ/MegaParse/issues/220)) ([2d2d0b4](https://github.com/QuivrHQ/MegaParse/commit/2d2d0b42bba4c883db423568e932eda42edd60d7)) * Text detection in auto strategy ([#209](https://github.com/QuivrHQ/MegaParse/issues/209)) ([03c7ada](https://github.com/QuivrHQ/MegaParse/commit/03c7ada1dc245e13ef41ffd6fa3a8ed869269d37)) ### Bug Fixes * Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38)) * add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228)) * Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) 
([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3)) ## [0.1.10](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.9...megaparse-sdk-v0.1.10) (2024-12-16) ### Bug Fixes * hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473)) ## [0.1.9](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.8...megaparse-sdk-v0.1.9) (2024-12-13) ### Features * small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) ([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609)) ## [0.1.8](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.7...megaparse-sdk-v0.1.8) (2024-12-12) ### Features * custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29)) * faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036)) ## [0.1.7](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.6...megaparse-sdk-v0.1.7) (2024-11-25) ### Bug Fixes * Update README.md ([#154](https://github.com/QuivrHQ/MegaParse/issues/154)) ([a103393](https://github.com/QuivrHQ/MegaParse/commit/a1033938184e20c24b0e54ee0db088b28075fd14)) ## [0.1.6](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.5...megaparse-sdk-v0.1.6) (2024-11-25) ### Features * megaparse sdk tests ([#148](https://github.com/QuivrHQ/MegaParse/issues/148)) ([e030285](https://github.com/QuivrHQ/MegaParse/commit/e0302853fc2c1526b8e912bf3ef85b970a5b89bc)) ## [0.1.5](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.4...megaparse-sdk-v0.1.5) (2024-11-21) ### Features * refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) 
([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334)) * release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925)) ================================================ FILE: libs/megaparse_sdk/README.md ================================================ ## MegaParse SDK Welcome to the MegaParse SDK! This SDK allows you to easily interact with the MegaParse API to upload URLs and files for processing. ### Installation To install the MegaParse SDK, use pip: ```sh pip install megaparse-sdk ``` ### Usage Here is an example of how to use the MegaParse SDK: #### Uploading URLs ```python import asyncio import os from megaparse.sdk import MegaParseSDK async def upload_url(): api_key = str(os.getenv("MEGAPARSE_API_KEY")) megaparse = MegaParseSDK(api_key) url = "https://www.quivr.com" # Upload a URL url_response = await megaparse.url.upload(url) print(f"\n----- URL Response : {url} -----\n") print(url_response) await megaparse.close() if __name__ == "__main__": asyncio.run(upload_url()) ``` #### Uploading Files ```python import asyncio import os from megaparse.sdk import MegaParseSDK async def upload_file(): api_key = str(os.getenv("MEGAPARSE_API_KEY")) megaparse = MegaParseSDK(api_key) file_path = "your/file/path.pdf" # Upload a file response = await megaparse.file.upload( file_path=file_path, method="unstructured", # unstructured, llama_parser, megaparse_vision strategy="auto", ) print(f"\n----- File Response : {file_path} -----\n") print(response) await megaparse.close() if __name__ == "__main__": asyncio.run(upload_file()) ``` ### Features - **Upload URLs**: Easily upload URLs for processing. - **Upload Files**: Upload files with different processing methods and strategies. ### Getting Started 1. **Set up your API key**: Make sure to set the `MEGAPARSE_API_KEY` environment variable with your MegaParse API key. 2. 
**Run the example**: Use the provided example to see how to upload URLs and files. For more details, refer to the [usage example](#file:usage_example.py-context). We hope you find the MegaParse SDK useful for your projects! Enjoy, _Quivr Team_ ! ================================================ FILE: libs/megaparse_sdk/__init__.py ================================================ ================================================ FILE: libs/megaparse_sdk/examples/usage_example.py ================================================ import asyncio import os from megaparse.sdk.megaparse_sdk import MegaParseSDK async def main(): api_key = str(os.getenv("MEGAPARSE_API_KEY")) megaparse = MegaParseSDK(api_key) # url = "https://www.quivr.com" # # Upload a URL # url_response = await megaparse.url.upload(url) # print(f"\n----- URL Response : {url} -----\n") # print(url_response) # file_path = "megaparse/sdk/pdf/MegaFake_report.pdf" file_path = ( "megaparse/sdk/examples/only_pdfs/4 The Language of Medicine 2024.07.21.pdf" ) # Upload a file response = await megaparse.file.upload( file_path=file_path, method="unstructured", # type: ignore # unstructured, llama_parser, megaparse_vision strategy="auto", # type: ignore # fast, auto, hi_res ) print(f"\n----- File Response : {file_path} -----\n") print(response) await megaparse.close() if __name__ == "__main__": asyncio.run(main()) ================================================ FILE: libs/megaparse_sdk/megaparse_sdk/__init__.py ================================================ from .client import MegaParseClient from .endpoints.file_upload import FileUpload from .endpoints.url_upload import URLUpload class MegaParseSDK: def __init__(self, api_key: str | None = None, base_url: str | None = None): self.client = MegaParseClient(api_key, base_url) self.file = FileUpload(self.client) self.url = URLUpload(self.client) async def close(self): await self.client.close() ================================================ FILE: 
libs/megaparse_sdk/megaparse_sdk/client.py ================================================ import asyncio import enum import logging import os from io import BytesIO from pathlib import Path from types import TracebackType from typing import Any, Self import httpx import nats from nats.errors import NoRespondersError, TimeoutError from megaparse_sdk.config import ClientNATSConfig, MegaParseSDKConfig from megaparse_sdk.schema.document import Document from megaparse_sdk.schema.mp_exceptions import ( DownloadError, InternalServiceError, MemoryLimitExceeded, ModelNotSupported, ParsingException, ) from megaparse_sdk.schema.mp_inputs import ( FileInput, MPInput, ParseFileConfig, ParseFileInput, ParseUrlInput, ) from megaparse_sdk.schema.mp_outputs import ( MPErrorType, MPOutput, MPOutputType, ) from megaparse_sdk.utils.load_ssl import load_ssl_cxt logger = logging.getLogger("megparse_sdk") class MegaParseClient: def __init__( self, api_key: str | None = None, base_url: str | None = None, ): config = MegaParseSDKConfig() self.base_url = base_url or config.url self.api_key = api_key or config.api_key self.max_retries = config.max_retries if self.api_key: self.session = httpx.AsyncClient( headers={"x-api-key": self.api_key}, timeout=config.timeout ) else: self.session = httpx.AsyncClient(timeout=config.timeout) async def request(self, method: str, endpoint: str, **kwargs: Any) -> Any: url = f"{self.base_url}{endpoint}" client = self.session for attempt in range(self.max_retries): try: response = await client.request(method, url, **kwargs) response.raise_for_status() return response.json() except (httpx.HTTPStatusError, httpx.RequestError): if attempt < self.max_retries - 1: await asyncio.sleep(2**attempt) # Exponential backoff raise RuntimeError(f"Can't send request to the server: {url}") async def close(self): await self.session.aclose() class ClientState(enum.Enum): # First state of the client UNOPENED = 1 # Client has either sent a request, or is within a `with` block. 
OPENED = 2 # Client has either exited the `with` block, or `close()` called. CLOSED = 3 class MegaParseNATSClient: def __init__(self, config: ClientNATSConfig): self.nc_config = config self.max_retries = self.nc_config.max_retries self.backoff = self.nc_config.backoff if self.nc_config.ssl_config: self.ssl_ctx = load_ssl_cxt(self.nc_config.ssl_config) else: self.ssl_ctx = None # Client connection self._state = ClientState.UNOPENED self._nc = None async def _get_nc(self): if self._nc is None: self._nc = await nats.connect( self.nc_config.endpoint, tls=self.ssl_ctx, connect_timeout=self.nc_config.connect_timeout, reconnect_time_wait=self.nc_config.reconnect_time_wait, max_reconnect_attempts=self.nc_config.max_reconnect_attempts, ) return self._nc return self._nc async def __aenter__(self: Self) -> Self: if self._state != ClientState.UNOPENED: msg = { ClientState.OPENED: "Cannot open a client instance more than once.", ClientState.CLOSED: ( "Cannot reopen a client instance, client was closed." 
), }[self._state] raise RuntimeError(msg) self._state = ClientState.OPENED await self._get_nc() return self async def __aexit__( self, exc_type: type[BaseException] | None = None, exc_value: BaseException | None = None, traceback: TracebackType | None = None, ) -> None: self._state = ClientState.CLOSED await self.aclose() async def parse_url(self, url: str): url_inp = ParseUrlInput(url=url) return await self._send_req(MPInput(input=url_inp)) async def parse_file( self, file: Path | BytesIO, file_name: str | None = None ) -> str | Document: if isinstance(file, Path): with open(file, "rb") as f: data = f.read() file_name = os.path.basename(file) else: file.seek(0) data = file.read() if file_name is None: raise ValueError("please provide file_name if passing ByteIO stream") file_input = ParseFileInput( file_input=FileInput(file_name=file_name, file_size=len(data), data=data), parse_config=ParseFileConfig(), ) inp = MPInput(input=file_input) return await self._send_req(inp) async def _send_req(self, inp: MPInput) -> str | Document: logger.debug(f"Sending {inp} to megaparse service.") for attempt in range(self.max_retries): try: return await self._send_req_inner(inp) except (TimeoutError, NoRespondersError) as e: logger.error(f"Sending req error: {e}. 
Retrying for {attempt} time") if attempt < self.max_retries - 1: logger.debug(f"Backoff for {2**self.backoff}s") await asyncio.sleep(2**self.backoff) raise ParsingException async def _send_req_inner(self, inp: MPInput): nc = await self._get_nc() raw_response = await nc.request( self.nc_config.subject, inp.model_dump_json().encode("utf-8"), timeout=self.nc_config.timeout, ) response = MPOutput.model_validate_json(raw_response.data.decode("utf-8")) return self._handle_mp_output(response) def _handle_mp_output(self, response: MPOutput) -> str | Document: if response.output_type == MPOutputType.PARSE_OK: assert response.result, "Parsing OK but response is None" return response.result elif response.output_type == MPOutputType.PARSE_ERR: assert response.err, "Parsing OK but response is None" match response.err.mp_err_code: case MPErrorType.MEMORY_LIMIT: raise MemoryLimitExceeded case MPErrorType.INTERNAL_SERVER_ERROR: raise InternalServiceError case MPErrorType.MODEL_NOT_SUPPORTED: raise ModelNotSupported case MPErrorType.DOWNLOAD_ERROR: raise DownloadError case MPErrorType.PARSING_ERROR: raise ParsingException raise ValueError(f"unknown service response type: {response}") async def aclose(self): nc = await self._get_nc() await nc.close() ================================================ FILE: libs/megaparse_sdk/megaparse_sdk/config.py ================================================ from pydantic import BaseModel, FilePath from pydantic_settings import BaseSettings, SettingsConfigDict class MegaParseSDKConfig(BaseSettings): """ Configuration for the Megaparse SDK. 
class ClientNATSConfig(BaseSettings):
    """
    Configuration for the NATS client (`MegaParseNATSClient`).

    Values are read from environment variables prefixed with
    `MEGAPARSE_NATS_` and from the `.env.local` / `.env` files; unknown
    entries are ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="MEGAPARSE_NATS_",
        env_file=(".env.local", ".env"),
        # Nested settings such as `ssl_config` are addressed with "__"
        # (presumably e.g. MEGAPARSE_NATS_SSL_CONFIG__SSL_KEY_FILE - verify).
        env_nested_delimiter="__",
        extra="ignore",
    )
    # Subject the client publishes its requests on.
    subject: str = "parsing"
    # Server address handed to `nats.connect`.
    endpoint: str = "https://tests@nats.tooling.quivr.app:4222"
    # Timeout passed to each NATS request (presumably seconds - confirm).
    timeout: float = 20
    # Number of attempts made for one request before giving up.
    max_retries: int = 5
    # Exponent of the pause between attempts: the client sleeps 2**backoff s.
    backoff: float = 3
    # The three values below are forwarded unchanged to `nats.connect`.
    connect_timeout: int = 5
    reconnect_time_wait: int = 1
    max_reconnect_attempts: int = 20
    # When set, a TLS context is built from it and used for the connection.
    ssl_config: SSLConfig | None = None
class URLUpload:
    """Endpoint wrapper that asks the service to parse a document by URL."""

    def __init__(self, client: MegaParseClient):
        """Bind the wrapper to the HTTP client used to send requests."""
        self.client = client

    async def upload(self, url: str, max_retries: int = 3) -> Response:
        """POST ``url`` to ``/v1/url`` and return the client's result.

        Args:
            url: Address of the document to parse.
            max_retries: Unused; kept for backward compatibility. Retries are
                governed by the client's own ``max_retries``.

        Returns:
            Whatever ``MegaParseClient.request`` returns for the call.
        """
        headers = {"accept": "application/json"}
        # Pass the target as a query parameter so it is percent-encoded;
        # splicing it into the path breaks on URLs containing '&', '#' or '?'.
        response = await self.client.request(
            "POST",
            "/v1/url",
            headers=headers,
            params={"url": url},
            data="",
        )
        return response
class TextDetection:
    """Detected block layouts of one page plus the page-level metadata."""

    __slots__ = [
        "bboxes",
        "page_index",
        "dimensions",
        "orientation",
        "origin_page_shape",
    ]

    def __init__(
        self,
        bboxes: List[BlockLayout],
        page_index: int,
        dimensions: Tuple[int, ...],
        orientation: Tuple[int, float] | Literal[0],
        origin_page_shape,
    ):
        """Store the detections and page metadata as given."""
        self.bboxes = bboxes
        self.page_index = page_index
        self.dimensions = dimensions
        self.orientation = orientation
        self.origin_page_shape = origin_page_shape

    def __repr__(self) -> str:
        return (
            f"PageLayout(bboxes={self.bboxes}, page_index={self.page_index}, "
            f"dimensions={self.dimensions}, orientation={self.orientation})"
        )

    def render(
        self, page_array: np.ndarray, output_path: Optional[str] = "page_layout.png"
    ):
        """
        Draw every bounding box in red on a copy of the page image.

        Args:
            page_array (np.ndarray): The page image as a NumPy array.
            output_path (str): Where to save the result; nothing is saved
                when it is empty or None.

        Returns:
            The PIL image with the boxes drawn on it.
        """
        canvas = Image.fromarray(page_array)
        pen = ImageDraw.Draw(canvas)

        width, height = self.dimensions

        # NOTE(review): x is scaled by the second entry of `dimensions` and y
        # by the first - confirm the ordering of `dimensions`.
        for layout in self.bboxes:
            first_corner = layout.bbox[0]
            second_corner = layout.bbox[1]
            rectangle = [
                (first_corner[0] * height, first_corner[1] * width),
                (second_corner[0] * height, second_corner[1] * width),
            ]
            pen.rectangle(rectangle, outline="red", width=2)

        if not output_path:
            return canvas

        canvas.save(output_path)
        print(f"Page layout saved to {output_path}")
        return canvas

    def get_loc_preds(self) -> np.ndarray:
        """
        Stack the coordinates of every bounding box.

        Returns:
            np.ndarray: One row of box coordinates per detected block.
        """
        rows = []
        for layout in self.bboxes:
            rows.append(layout.bbox.to_numpy())
        return np.array(rows)

    def get_objectness_scores(self) -> np.ndarray:
        """
        Collect the objectness score of every bounding box.

        Returns:
            np.ndarray: One score per detected block.
        """
        scores = []
        for layout in self.bboxes:
            scores.append(layout.objectness_score)
        return np.array(scores)

    def get_origin_page_shapes(self) -> np.ndarray:
        """
        Repeat the original page shape once per bounding box.

        Returns:
            np.ndarray: The page shape, one entry per detected block.
        """
        return np.array([self.origin_page_shape] * len(self.bboxes))

    def get_orientations(self) -> np.ndarray:
        """
        Repeat the page orientation once per bounding box.

        Returns:
            np.ndarray: The orientation, one entry per detected block.
        """
        return np.array([self.orientation] * len(self.bboxes))
class TOCItem(BaseModel):
    """
    One entry of a table of contents: a title, its nesting depth and the
    pages it covers.
    """

    title: str
    depth: int
    page_range: Tuple[int, int] = Field(...)  # (start_page, end_page)

    @field_validator("page_range")
    def validate_range(cls, value):
        """Reject ranges whose start page lies after the end page.

        A single-page entry (start == end) is valid; `__str__` renders it as
        "page N".
        """
        start, end = value
        if start > end:
            raise ValueError(
                "The first value of the page range must be less than or equal to the second value"
            )
        return value

    def __str__(self):
        """Render the entry as an indented bullet with its page information."""
        start_page, end_page = self.page_range
        page_info = (
            f"page {start_page}"
            if start_page == end_page
            else f"pages {start_page}-{end_page}"
        )
        return f"{' ' * (2 * self.depth)}* {self.title} ({page_info})"
class FileExtension(str, Enum):
    """Supported file extension enumeration.

    Each member's value is the extension (with leading dot); the associated
    MIME type is available through the `mimetype` property.
    """

    _mimetype: str

    def __new__(cls, value: str, mimetype: str):
        # Members are declared as (extension, mimetype); only the extension
        # becomes the enum value, the MIME type is kept as extra data.
        obj = str.__new__(cls, value)
        obj._value_ = value
        obj._mimetype = mimetype
        return obj

    PDF = (".pdf", "application/pdf")
    DOCX = (
        ".docx",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
    TXT = (".txt", "text/plain")
    OTF = (".odt", "application/vnd.oasis.opendocument.text")
    # Correctly spelled alias of OTF (same value, so it is the same member).
    ODT = (".odt", "application/vnd.oasis.opendocument.text")
    # The registered media type for EPUB is "application/epub+zip".
    EPUB = (".epub", "application/epub+zip")
    HTML = (".html", "text/html")
    XML = (".xml", "application/xml")
    CSV = (".csv", "text/csv")
    XLSX = (
        ".xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    XLS = (".xls", "application/vnd.ms-excel")
    PPTX = (
        ".pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )
    MD = (".md", "text/markdown")
    MARKDOWN = (".markdown", "text/markdown")

    @property
    def mimetype(self) -> str:
        """MIME type associated with this extension."""
        return self._mimetype
class InternalServiceError(Exception):
    """Raised when the service reports an internal server error."""

    def __init__(self, message: str = "Internal service error occurred"):
        super().__init__(message)
class MPOutput(BaseModel):
    """Response envelope of a parse request.

    `output_type` tells which of `result` (success) or `err` (failure) is
    expected to be populated.
    """

    output_type: MPOutputType
    # Defaults to None like `err`, so an error reply that omits `result`
    # still validates and its error code can be handled.
    result: str | Document | None = None
    err: ParseError | None = None
    metadata: Dict[str, str] = Field(default_factory=dict)
class ParserType(str, Enum):
    """Names of the available parser backends."""

    UNSTRUCTURED = "unstructured"
    LLAMA_PARSER = "llama_parser"
    MEGAPARSE_VISION = "megaparse_vision"


class StrategyEnum(str, Enum):
    """Conversion strategy selector."""

    FAST = "fast"
    AUTO = "auto"
    HI_RES = "hi_res"


class ParseFileConfig(BaseModel):
    """Options sent along with a file to parse; every field has a default."""

    llm_model_name: SupportedModel = SupportedModel.GPT_4
    method: ParserType = ParserType.UNSTRUCTURED
    strategy: StrategyEnum = StrategyEnum.AUTO
    check_table: bool = False
    language: Language = Language.ENGLISH
    parsing_instruction: Optional[str] = None


class SupportedModel(str, Enum):
    """Identifiers of the LLM models accepted by the SDK."""

    # OpenAI Models
    GPT_4 = "gpt-4"
    GPT_4_TURBO = "gpt-4-turbo"
    GPT_3_5_TURBO = "gpt-3.5-turbo"
    GPT_4O = "gpt-4o"
    GPT_4O_MINI = "gpt-4o-mini"

    # Anthropic Models
    CLAUDE_3_5_SONNET_LATEST = "claude-3-5-sonnet-latest"
    CLAUDE_3_5_SONNET = "claude-3-5-sonnet-20241022"
    CLAUDE_3_5_HAIKU = "claude-3-5-haiku-20241022"
    CLAUDE_3_5_HAIKU_LATEST = "claude-3-5-haiku-latest"
    CLAUDE_3_OPUS = "claude-3-opus-20240229"
    CLAUDE_3_OPUS_LATEST = "claude-3-opus-latest"
    CLAUDE_3_SONNET = "claude-3-sonnet-20240229"
    CLAUDE_3_HAIKU = "claude-3-haiku-20240307"

    def __str__(self):
        # Render as the raw model identifier rather than "SupportedModel.X".
        return self.value

    @classmethod
    def is_supported(cls, model_name: str) -> bool:
        """Return True when ``model_name`` equals one of the member values."""
        known_models = cls.__members__.values()
        return any(member == model_name for member in known_models)

    @classmethod
    def get_supported_models(cls) -> list[str]:
        """Return every member, in declaration order, as a list."""
        return [*cls.__members__.values()]
def load_ssl_cxt(ssl_config: SSLConfig) -> ssl.SSLContext:
    """Build a client-side TLS context from ``ssl_config``.

    Args:
        ssl_config: provides ``ca_cert_file`` (may be falsy), ``ssl_cert_file``
            and ``ssl_key_file``.

    Returns:
        An ``ssl.SSLContext`` created with ``PROTOCOL_TLS_CLIENT`` and the
        client certificate chain loaded.
    """
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    if ssl_config.ca_cert_file:
        context.load_verify_locations(cafile=ssl_config.ca_cert_file)
    else:
        # PROTOCOL_TLS_CLIENT enforces certificate verification, but a freshly
        # constructed SSLContext trusts no CA at all. Without this fallback
        # every handshake would fail when no CA file is configured.
        context.load_default_certs()
    context.load_cert_chain(
        certfile=ssl_config.ssl_cert_file, keyfile=ssl_config.ssl_key_file
    )
    return context
gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT WWVVcNfJAgMBAAGjgZowgZcwDgYDVR0PAQH/BAQDAgWgMCcGA1UdJQQgMB4GCCsG AQUFBwMCBggrBgEFBQcDAQYIKwYBBQUHAwQwHwYDVR0jBBgwFoAUV2w3gvQM5La1 2fk80tJXoM/14l4wOwYDVR0RBDQwMoIJbG9jYWxob3N0gRNtZWdhcGFyc2VAcXVp dnIuYXBwhxAAAAAAAAAAAAAAAAAAAAABMA0GCSqGSIb3DQEBCwUAA4IBgQAYq4VZ 6spwGvcqg8kCOghu6o54UPYo/NLzh3oYewJnDJ+2XD786TpTgjZMGA6Ms+det6oV HdT5s77VFgJiJloHlD0fpKkRxjzyBOk5/bQcCKkTMBVfgJbMoAfa2gq+/7zxmLcn AmNg7BkmsTtHWPsLyN3rYI4dkkDKWkxp8Sezm9WPEa9OGJDJSYf4Dq9pN1lUoP1p vxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW 0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9 ze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr drdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7 /E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw= -----END CERTIFICATE----- ================================================ FILE: libs/megaparse_sdk/tests/certs/client-key.pem ================================================ -----BEGIN PRIVATE KEY----- MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5 KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT WWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU QYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj rGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj BkEzj7CbaAGtW9vTamPJG/+wgaaBcPhplQk4cD2mjdaMLfGQXNXiYgp09kf0hJ2k 0QbsQBC85bMSfmPAsoTRLxi94S12at3SABgF0oOCy9FZs/sWsdJRI6nbfvZ3C4xo 8y+rH7Yaej7AYK+jbU3Uk/1473cuCAnNKg65UyU4+gECgYEA2/ZQYRDU3JWNHQGy dJXZRl6hSFCw9y9RUc/QjcRs+VlnXE5UK1eLwfcKh0YYRhIWSE8z3mZmK09M/FG0 
xbU4qIZbDYcAI2nCiUeT8HmTjVSPMS1oWZrt7rh00gcyoLQt2TUS3bo2tsmdPyWW OgEiYfb4MoG/KCdYlACE6O4GMMECgYEA1GIMIHM2x4B1wgLnKeI3X2wYWuYCHtFB Px56GUFTZytBsHghxtovVlLh88FNS5rthvXuE0FHE9RljKhZaNgqrPOrlAZSuv18 vK7RmG/NPJl2osbs677a/xoxNuVkfrRcxl4cvYOBL5huHo1D5sOitGFW+IlscgWY nWzXlY7AYQkCgYA6H96hp7b4CzTc42Pq1uYxaDQqTdhVmVVdzxKHQ86gHXXouHIZ eereeI95q5YifgkRVoyYSmrZKv1m95hTXk34inhpHLF2qi3T5Ow88YOCJ0QndJ5M f1o8aNXF4k0IllQ/P30axmhK6P/6fc4yybXyOTbg8dQ3oh4QDgsRGkTcgQKBgQCG qLgJpyN3cPK5FYAeJUl4nh//GlED2yekbp15/9py0pFu42x/GX3kHN8Y31oz8sJh zPKrkLsRTp0ohuFRwaWlTUZfr3arCugY9jr8jP6zSpZW9QvpGXTfRGsp5F5Im/Eq 8ScF3ih91gcUJfuEiExUVFeBdBinXvb58bXrJLzDiQKBgG+Z06uj2dWxtK4nqJvP HllTocAGVm+fEmupVsLU6ksVVrOl8O9TapMbY8pUj9J5oBYJvY+KFGoIoxYwhZrz 4NqY7iv8w+LQ7mQIwcQ4B67pDAQMJZTShR5v57FlAZldP5UpE5ASt22isBW31sYI 1OaXIqrCA/V43NydDezh0ylQ -----END PRIVATE KEY----- ================================================ FILE: libs/megaparse_sdk/tests/certs/rootCA.pem ================================================ -----BEGIN CERTIFICATE----- MIIFCzCCA3OgAwIBAgIQESt0eck2KvFrAMyiDyceujANBgkqhkiG9w0BAQsFADCB nTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMTkwNwYDVQQLDDBhbWlu ZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhvdXNzaSkxQDA+ BgNVBAMMN21rY2VydCBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFt aW5lIGRpcmhvdXNzaSkwHhcNMjQxMTE5MTAwMTA5WhcNMzQxMTE5MTAwMTA5WjCB nTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMTkwNwYDVQQLDDBhbWlu ZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhvdXNzaSkxQDA+ BgNVBAMMN21rY2VydCBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFt aW5lIGRpcmhvdXNzaSkwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCw 6TX1kvqVMb8ZUQVT/vuDsedmbYgSFn68yJRlmE9BsqG7TLQHl2Kw6VQqZBSIkeZG CypmUysX/3qrvICeArIdmmsrWUTDYPoauw/a/RY0I07rALj3YR0Y7039Hxf/UPT9 xlUtnM2NafkZyp6WRjEN0N4ETvJDIbUQiosiiPilxhwRbJURhT/JPskaw+OM2Sw5 dFAT20zkYC5VIc4wJBFLAMG0XzI6Sy/4wI1WdRBXd2UMpQU4u7TyD0RB4mnHorV6 kXjtLKD/KWSrSG1nnum9SB9eVatbRD+TUgoclwAKedrlCDEM4EsXVVuUuYCizQNb +H3BSPfj1upUW5eKfgAyB+8r4QGf2yCY9O8NMMrJ1K5Qv4vSuWAU2tZqAyE8Z4Ke 
@pytest.fixture(scope="session")
def ssl_config() -> SSLConfig:
    """Session-scoped SSLConfig built from the module's certificate path constants."""
    cert_paths = {
        "ca_cert_file": CA_CERT_FILE,
        "ssl_key_file": SSL_KEY_FILE,
        "ssl_cert_file": SSL_CERT_FILE,
    }
    return SSLConfig(**cert_paths)
@pytest.fixture(scope="session")
def nc_config(ssl_config: SSLConfig) -> ClientNATSConfig:
    """Session-scoped client config aimed at NATS_URL / NATS_SUBJECT.

    Uses short timeouts and a single retry / reconnect attempt.
    """
    config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.5,
        max_retries=1,
        # NOTE(review): negative backoff presumably means "no wait between
        # retries" - confirm against ClientNATSConfig.
        backoff=-1,
        connect_timeout=1,
        reconnect_time_wait=1,
        max_reconnect_attempts=1,
    )
    return config


@pytest_asyncio.fixture(scope="function")
async def nats_service(nc_config: ClientNATSConfig):
    """Yield a NATS connection that tests use as the worker side.

    The connection is drained once the test has finished.
    """
    # TODO: fix TLS handshake to work in CI
    # ssl_config = load_ssl_cxt(nc_config.ssl_config)
    # No `tls=` argument is passed while TLS is disabled: the name `ssl_config`
    # in this scope is the module-level fixture function, not an SSLContext,
    # so forwarding it to nats.connect() was never a valid TLS configuration.
    nc = await nats.connect(
        nc_config.endpoint,
        connect_timeout=nc_config.connect_timeout,
        reconnect_time_wait=nc_config.reconnect_time_wait,
        max_reconnect_attempts=nc_config.max_reconnect_attempts,
    )
    yield nc
    await nc.drain()


@pytest.mark.asyncio
async def test_client_state_transition(nc_config: ClientNATSConfig):
    """Client goes UNOPENED -> OPENED -> CLOSED and cannot be re-entered."""
    mpc = MegaParseNATSClient(nc_config)
    assert mpc._state == ClientState.UNOPENED
    async with mpc:
        assert mpc._state == ClientState.OPENED
    assert mpc._state == ClientState.CLOSED

    # Entering an already closed client must fail.
    with pytest.raises(RuntimeError):
        async with mpc:
            pass


@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_file(nats_service: Client, nc_config: ClientNATSConfig):
    """parse_file returns the result string published by the fake worker."""

    async def message_handler(msg):
        # The request must deserialize to the file variant of MPInput.
        parsed_input = MPInput.model_validate_json(msg.data.decode("utf-8")).input
        assert isinstance(parsed_input, ParseFileInput)
        output = MPOutput(output_type=MPOutputType.PARSE_OK, result="test")
        await nats_service.publish(msg.reply, output.model_dump_json().encode("utf-8"))

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=message_handler)

    file_path = Path("./tests/pdf/sample_table.pdf")
    async with MegaParseNATSClient(nc_config) as mp_client:
        resp = await mp_client.parse_file(file=file_path)
        assert resp == "test"
@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_url(nats_service: Client, nc_config: ClientNATSConfig):
    """parse_url returns the result string published by the fake worker."""

    async def message_handler(msg):
        # The request must deserialize to the URL variant of MPInput.
        parsed_input = MPInput.model_validate_json(msg.data.decode("utf-8")).input
        assert isinstance(parsed_input, ParseUrlInput)
        output = MPOutput(output_type=MPOutputType.PARSE_OK, result="url")
        await nats_service.publish(msg.reply, output.model_dump_json().encode("utf-8"))

    # NOTE(review): "worker" is presumably the queue group name - confirm
    # against the nats-py subscribe() signature.
    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=message_handler)

    async with MegaParseNATSClient(nc_config) as mp_client:
        resp = await mp_client.parse_url(url="this://this")
        assert resp == "url"


@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_timeout(nats_service: Client, ssl_config: SSLConfig):
    """A worker that never replies makes parse_file raise ParsingException."""
    nc_config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.1,
        max_retries=1,
        backoff=1,
    )

    async def service(msg):
        # Sleep past the client's timeout and publish no reply.
        await asyncio.sleep(2 * nc_config.timeout)

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=service)

    file_path = Path("./tests/pdf/sample_table.pdf")
    with pytest.raises(ParsingException):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)


@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_timeout_retry(nats_service: Client, ssl_config: SSLConfig):
    """With max_retries=2 the silent worker receives exactly two requests."""
    nc_config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.1,
        max_retries=2,
        backoff=-5,
    )
    # Every request seen by the worker is recorded here.
    msgs = []

    async def service(msg):
        msgs.append(msg)
        # Sleep past the client's timeout and publish no reply.
        await asyncio.sleep(2 * nc_config.timeout)

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=service)

    file_path = Path("./tests/pdf/sample_table.pdf")
    with pytest.raises(ParsingException):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)
    # One delivery per attempt.
    assert len(msgs) == 2
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.parametrize(
    "mp_error_type, exception_class",
    [
        ("MEMORY_LIMIT", MemoryLimitExceeded),
        ("INTERNAL_SERVER_ERROR", InternalServiceError),
        ("MODEL_NOT_SUPPORTED", ModelNotSupported),
        ("DOWNLOAD_ERROR", DownloadError),
        ("PARSING_ERROR", ParsingException),
    ],
)
async def test_client_parse_file_excp(
    nats_service: Client, nc_config: ClientNATSConfig, mp_error_type, exception_class
):
    """Each MPErrorType in an error reply surfaces as its paired exception class."""

    async def message_handler(msg):
        parsed_input = MPInput.model_validate_json(msg.data.decode("utf-8")).input
        assert isinstance(parsed_input, ParseFileInput)
        # Look the error code up by member name and reply with a PARSE_ERR output.
        err = ParseError(mp_err_code=MPErrorType[mp_error_type], message="")
        output = MPOutput(
            output_type=MPOutputType.PARSE_ERR,
            err=err,
            result=None,
        )
        await nats_service.publish(msg.reply, output.model_dump_json().encode("utf-8"))

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=message_handler)

    file_path = Path("./tests/pdf/sample_table.pdf")
    with pytest.raises(exception_class):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)
"B006", "E501", # line too long, handled by black "B008", # do not perform function calls in argument defaults "C901", # too complex ] [tool.ruff.lint.isort] order-by-type = true relative-imports-order = "closest-to-furthest" extra-standard-library = ["typing"] section-order = [ "future", "standard-library", "third-party", "first-party", "local-folder", ] known-first-party = [] [tool.pytest.ini_options] addopts = "--tb=short -ra -v" asyncio_default_fixture_loop_scope = "session" filterwarnings = ["ignore::DeprecationWarning"] markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", "base: these tests require quivr-core with extra `base` to be installed", "tika: these tests require a tika server to be running", "unstructured: these tests require `unstructured` dependency", ] ================================================ FILE: release-please-config.json ================================================ { "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json", "separate-pull-requests": true, "include-v-in-tag": true, "bump-patch-for-minor-pre-major": true, "include-component-in-tag": true, "packages": { "libs/megaparse": { "release-type": "python", "package-name": "megaparse", "changelog-notes-type": "github" }, "libs/megaparse_sdk": { "release-type": "python", "package-name": "megaparse-sdk", "changelog-notes-type": "github" } } }